Skip to content

Instantly share code, notes, and snippets.

@teruhisa
Last active September 2, 2018 12:34
Show Gist options
  • Save teruhisa/3bfc36c77242978001ae to your computer and use it in GitHub Desktop.
Save teruhisa/3bfc36c77242978001ae to your computer and use it in GitHub Desktop.
Nikkei Scraper
// ==UserScript==
// @name Nikkei Scraper
// @namespace http://http://teruhisa.github.io//
// @version 0.1
// @description scrape news article content and dump it into body.
// @author Teruhisa Haruguchi
// @match https://gateway.itc.u-tokyo.ac.jp/g3/,DanaInfo=t21.nikkei.co.jp+CMN0F12.do
// @match https://t21.nikkei.co.jp/g3/CMN0F12.do
// @grant none
// ==/UserScript==
// Loads jQuery from the Google CDN by appending a <script> element to the
// page body. Once that script fires its 'load' event, a second <script> is
// appended whose source stores jQuery.noConflict(true) on window.jQ and then
// invokes `callback`, serialised with toString(), as an immediately-called
// function expression.
//
// callback: function to run after jQuery has loaded. Only its source text is
//           injected, so it cannot rely on variables closed over here.
function addJQuery(callback) {
  var loader = document.createElement("script");

  var onLibraryLoaded = function () {
    var bootstrap = document.createElement("script");
    var source = "window.jQ=jQuery.noConflict(true);(" + callback.toString() + ")();";
    bootstrap.textContent = source;
    document.body.appendChild(bootstrap);
  };

  loader.setAttribute("src", "//ajax.googleapis.com/ajax/libs/jquery/1/jquery.min.js");
  loader.addEventListener('load', onLibraryLoaded, false);
  document.body.appendChild(loader);
}
function main() {
jQ(document).ready(function () {
var $ = window.jQ;
// Check if we initialized already.
var check = $('#scrapeStart');
if (check.size()) {
return;
}
// Inject custom buttons.
$('head').append('<style>#result{margin-top:30px;}#scrapeStatus{text-align:right;}.sc-item .hl1{background:yellow;}.sc-item h1{background:lightyellow;}#scrape .button {float:right;cursor:pointer; padding: 2px 10px; display:inline-block;}.scraping .nk-popup-shadow, .scraping .nk-popup{margin-top:1000000px;}</style');
$('#contentsFrame')
.css('position', 'relative')
.append('<div id="result"></div>')
.append('<div id="scrape" style="position:absolute;top:0;right:0;"></div>');
$('#scrape')
.append('<div id="scrapePdf" class="button" style="background:lime;">pdf</div>')
.append('<div id="scrapePreview" class="button" style="background:hotpink;">preview</div>')
.append('<div id="scrapeStatus">hello</div>');
$('#scrapePdf').click(scrapePdf);
$('#scrapePreview').click(scrapePreview);
/////////////////////
// Scrape handlers //
/////////////////////
var RETRY_COUNT = 40;
var TIMER_DELAY = 100;
var TIMER_DELAY_SEGMENT = 1000 / RETRY_COUNT;
// Click handler for the "pdf" button: triggers a click on every PDF link of
// the current result segment, then moves on to the next segment and repeats
// until the navigator reports that there is no further segment.
//
// The PDF links are clicked on the final segment as well; previously the
// last (or only) segment was skipped because the "no next" branch only
// logged 'done'.
function scrapePdf() {
  var isLastSegment = !hasNext();

  if (!isLastSegment) {
    $('body').addClass('scraping');
  }

  $('.nk-list-headline-title .nk-list-pdf').trigger('click');

  if (isLastSegment) {
    $('body').removeClass('scraping');
    log('done');
    return;
  }

  // Start polling for the segment change before clicking "next";
  // runNextBlock re-enters scrapePdf once the segment start index differs.
  runNextBlock(getSegmentStartIndex(), scrapePdf, RETRY_COUNT);
  $('.nk-navigator-next').first().trigger('click');
}
// Click handler for the "preview" button: scrapes every item of the current
// segment via scrapePreviewItem. While further segments exist, <body> gets
// the 'scraping' class and scrapePreview itself is passed as the
// continuation; on the last segment the continuation removes the class and
// logs 'done'.
function scrapePreview() {
  var finish = function () {
    $('body').removeClass('scraping');
    log('done');
  };

  var moreSegments = hasNext();
  if (moreSegments) {
    $('body').addClass('scraping');
  }

  var onSegmentDone = moreSegments ? scrapePreview : finish;
  scrapePreviewItem(0, getSegmentStartIndex(), getSegmentEndIndex(), onSegmentDone);
}
// Scrapes the headline at position `index` of the current segment, then
// recurses for index + 1 until start + index exceeds `end`; after that it
// clicks the first "next" navigator link and hands `nextStep` to
// runNextBlock.
//
// index:    zero-based position within the '.nk-list-headline' elements.
// start:    value of getSegmentStartIndex() when the segment was entered.
// end:      value of getSegmentEndIndex() when the segment was entered.
// nextStep: function to call once the segment has been processed.
function scrapePreviewItem(index, start, end, nextStep) {
  // log() displays a single message, so build the whole status string here.
  log('scrapePreviewItem: ' + index + ':' + start);

  if (start + index > end) {
    $('.nk-navigator-next').first().trigger('click');
    runNextBlock(getSegmentStartIndex(), nextStep, RETRY_COUNT);
    return;
  }

  var item = $('.nk-list-headline').eq(index);
  item.trigger('click');
  var headline = item.text();
  acknowledgePreviewPrompt(headline, function () {
    scrapePreviewItem(index + 1, start, end, nextStep);
  }, RETRY_COUNT);
}
// Dismisses the site's popup prompt if it is showing, then hands over to
// recordTextAndCheckNext with a fresh RETRY_COUNT budget.
//
// headline: headline text of the item that was just clicked.
// nextStep: continuation forwarded to recordTextAndCheckNext.
// attempt:  a negative value skips the popup check entirely; any other
//           value performs one check after 30 ms.
function acknowledgePreviewPrompt(headline, nextStep, attempt) {
  if (attempt < 0) {
    return recordTextAndCheckNext(headline, nextStep, RETRY_COUNT);
  }

  log('acknowledgePreviewPrompt: ' + headline);

  setTimeout(function () {
    var prompt = $('.nk-popup-msg');

    if (isHidden(prompt)) {
      recordTextAndCheckNext(headline, nextStep, RETRY_COUNT);
      return;
    }

    // Press the OK button in the sibling that follows the message, then
    // re-enter with a negative attempt so recording starts immediately.
    prompt.next().children('.nk-popup-ok').trigger('click');
    acknowledgePreviewPrompt(headline, nextStep, -1);
  }, 30);
}
// Waits for the article whose title equals `headline` to appear, appends it
// to '#result' as a '.sc-item' block and then calls `nextStep`. The lookup
// is retried with a growing delay (see delay()) until `attempt` drops below
// zero, at which point `nextStep` is called without recording anything.
//
// headline: plain text of the headline being scraped.
// nextStep: function invoked once the article is recorded or retries run out.
// attempt:  remaining retries; callers start with RETRY_COUNT.
function recordTextAndCheckNext(headline, nextStep, attempt) {
  if (attempt < 0) {
    return nextStep();
  }

  // log() displays a single message, so build the whole status string here.
  log('recordTextAndCheckNext: ' + attempt);

  // Wait till the body text is loaded.
  setTimeout(function () {
    var titles = $('.nk-gv-bodytitle');
    var matchIndex = -1;
    for (var i = 0; i < titles.length; i++) {
      if (titles.eq(i).text() === headline) {
        matchIndex = i;
        break;
      }
    }

    if (matchIndex < 0) {
      recordTextAndCheckNext(headline, nextStep, attempt - 1);
      return;
    }

    // Body and date are picked by the same position as the matching title.
    // NOTE(review): appending these jQuery sets moves the live nodes out of
    // the viewer into '#result' -- confirm that is intended.
    var body = $('.nk-gv-body-view td').eq(matchIndex);
    var date = $('.nk-gv-bodytitlesub .nk-gv-attribute').eq(matchIndex);

    // The headline is text read from the page, so insert it with .text()
    // instead of concatenating it into markup; '<' or '&' in a headline
    // would otherwise be parsed as HTML.
    $('<div class="sc-item"></div>')
      .append($('<h1></h1>').text(headline))
      .append(body)
      .append(date)
      .append('<div class="clear"></div>')
      .appendTo('#result');

    nextStep();
  }, delay(attempt));
}
// Waits for the segment to change from the one showing right now and then
// resumes scrapePreview. Not called anywhere in the visible script.
function scrapePreviewNextBlock() {
  var segmentAtCall = getSegmentStartIndex();
  runNextBlock(segmentAtCall, scrapePreview, RETRY_COUNT);
}
// Reports whether another segment can be navigated to: true unless a
// '.nk-navigator-next' element carries the 'nk-navigator-no-link' class.
function hasNext() {
  var nextLink = $('.nk-navigator-next');
  var atEnd = nextLink.hasClass('nk-navigator-no-link');
  return atEnd ? false : true;
}
// Polls until the segment start index differs from `currentSegment`, i.e.
// the next segment has been rendered, and then calls `callback`.
//
// currentSegment: segment start index captured before navigating.
// callback:       function to run once the new segment is available. It is
//                 also called straight away when hasNext() is false.
// attempt:        remaining polls; callers start with RETRY_COUNT. Each poll
//                 waits delay(attempt) * 20 milliseconds.
function runNextBlock(currentSegment, callback, attempt) {
  if (!hasNext()) {
    callback();
    return;
  }

  if (attempt < 0) {
    // Give up without calling the callback: the segment never changed. The
    // run is over, so also drop the 'scraping' class that would otherwise
    // stay on <body> and keep the site's popups displaced.
    log('timeout: increase attempt or delay?');
    $('body').removeClass('scraping');
    return;
  }

  log('runNextBlock' + currentSegment + ':' + attempt);

  setTimeout(function () {
    if (currentSegment === getSegmentStartIndex()) {
      runNextBlock(currentSegment, callback, attempt - 1);
    } else {
      callback();
    }
  }, delay(attempt) * 20);
}
// Reads the 'start' attribute of the first '.nk-list-form' element and
// returns it parsed as a base-10 integer (NaN when it cannot be parsed).
function getSegmentStartIndex() {
  var rawStart = $('.nk-list-form').first().attr('start');
  return parseInt(rawStart, 10);
}
// Reads the 'end' attribute of the first '.nk-list-view-detail' element and
// returns it parsed as a base-10 integer (NaN when it cannot be parsed).
function getSegmentEndIndex() {
  var rawEnd = $('.nk-list-view-detail').first().attr('end');
  return parseInt(rawEnd, 10);
}
// Tells whether the first element of the jQuery set `el` has a computed
// 'display' of 'none'. An empty set counts as hidden: there is nothing on
// screen, and passing undefined to getComputedStyle would throw and stop the
// timer callback that called us.
//
// el: jQuery object; only its first element is inspected.
function isHidden(el) {
  var node = el.get(0);
  if (!node) {
    return true;
  }
  return window.getComputedStyle(node).display === 'none';
}
// Shows a status message in the '#scrapeStatus' element.
//
// msg: the message. Any additional arguments are appended to it, separated
//      by single spaces, so calls such as log('label', value) display both
//      parts instead of silently dropping everything after the first.
function log(msg) {
  var text = arguments.length > 1
    ? Array.prototype.slice.call(arguments).join(' ')
    : msg;
  $('#scrapeStatus').text(text);
}
// Computes the wait in milliseconds for a retry: TIMER_DELAY plus
// TIMER_DELAY_SEGMENT for every attempt already used, so the wait grows as
// `attempt` counts down from RETRY_COUNT.
function delay(attempt) {
  var attemptsUsed = RETRY_COUNT - attempt;
  var backoff = attemptsUsed * TIMER_DELAY_SEGMENT;
  return TIMER_DELAY + backoff;
}
});
}
// Entry point: load jQuery into the page and execute main() once it is
// available. addJQuery injects main's source text into a <script> element,
// so main runs from that injected script rather than from this call.
addJQuery(main);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment