-
-
Save dinh/e1fa1ecd203806dbef4b to your computer and use it in GitHub Desktop.
Nikkei Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ==UserScript== | |
// @name Nikkei Scraper | |
// @namespace http://http://teruhisa.github.io// | |
// @version 0.1 | |
// @description scrape news article content and dump it into body. | |
// @author Teruhisa Haruguchi | |
// @match https://gateway.itc.u-tokyo.ac.jp/g3/,DanaInfo=t21.nikkei.co.jp+CMN0F12.do | |
// @match https://t21.nikkei.co.jp/g3/CMN0F12.do | |
// @grant none | |
// ==/UserScript== | |
// a function that loads jQuery and calls a callback function when jQuery has finished loading | |
/**
 * Load jQuery from the Google CDN and, once it has loaded, run `callback`
 * inside the page.
 *
 * The callback is not invoked directly: its source text (callback.toString())
 * is written into a second <script> element, prefixed by a statement that
 * stores a no-conflict jQuery instance in `window.jQ`. The callback must
 * therefore be self-contained and reach jQuery through `jQ` / `window.jQ`.
 *
 * @param {Function} callback Function whose source is injected and executed.
 */
function addJQuery(callback) {
  var loader = document.createElement("script");

  // Runs after the CDN script has loaded: inject the bootstrap snippet.
  function injectCallback() {
    var runner = document.createElement("script");
    runner.textContent = [
      "window.jQ=jQuery.noConflict(true);(",
      callback.toString(),
      ")();"
    ].join("");
    document.body.appendChild(runner);
  }

  loader.setAttribute("src", "//ajax.googleapis.com/ajax/libs/jquery/1/jquery.min.js");
  loader.addEventListener('load', injectCallback, false);
  document.body.appendChild(loader);
}
/**
 * Userscript entry point.
 *
 * Once the document is ready this injects a small control panel ("pdf" and
 * "preview" buttons plus a status line) into #contentsFrame and wires the
 * buttons to two scrape loops that walk the paginated result list:
 *   - pdf:     triggers a click on every PDF link of each page.
 *   - preview: clicks each headline, waits for its body text to appear and
 *              moves the body/date nodes into #result.
 *
 * Relies on `jQ` / `window.jQ` being a jQuery instance. All helpers live
 * inside the ready callback so the function stays self-contained when its
 * source text is injected into the page.
 */
function main() {
  jQ(document).ready(function () {
    var $ = window.jQ;

    // Bail out if the control panel was already injected by a previous run.
    if ($('#scrape').length) {
      return;
    }

    // Inject custom styles and buttons.
    $('head').append('<style>#result{margin-top:30px;}#scrapeStatus{text-align:right;}.sc-item .hl1{background:yellow;}.sc-item h1{background:lightyellow;}#scrape .button {float:right;cursor:pointer; padding: 2px 10px; display:inline-block;}.scraping .nk-popup-shadow, .scraping .nk-popup{margin-top:1000000px;}</style>');
    $('#contentsFrame')
      .css('position', 'relative')
      .append('<div id="result"></div>')
      .append('<div id="scrape" style="position:absolute;top:0;right:0;"></div>');
    $('#scrape')
      .append('<div id="scrapePdf" class="button" style="background:lime;">pdf</div>')
      .append('<div id="scrapePreview" class="button" style="background:hotpink;">preview</div>')
      .append('<div id="scrapeStatus">hello</div>');
    $('#scrapePdf').click(scrapePdf);
    $('#scrapePreview').click(scrapePreview);

    /////////////////////
    // Scrape handlers //
    /////////////////////

    // Number of polling attempts before a wait is abandoned.
    var RETRY_COUNT = 40;
    // Base polling delay in milliseconds.
    var TIMER_DELAY = 100;
    // Extra milliseconds added to the delay for every attempt already used.
    var TIMER_DELAY_SEGMENT = 1000 / RETRY_COUNT;

    /**
     * PDF loop: trigger every PDF link on the current page, then advance to
     * the next page and repeat until there is no next page.
     */
    function scrapePdf() {
      // The "scraping" class pushes the site's popups off-screen (see the
      // injected stylesheet), so set it before triggering any clicks.
      $('body').addClass('scraping');
      // Trigger the links before checking for a next page so the last (or
      // only) page is processed as well.
      $('.nk-list-headline-title .nk-list-pdf').trigger('click');
      if (hasNext()) {
        // Hasn't reached the end: start polling for the page change, then
        // request the next page.
        runNextBlock(getSegmentStartIndex(), scrapePdf, RETRY_COUNT);
        $('.nk-navigator-next').first().trigger('click');
      } else {
        $('body').removeClass('scraping');
        log('done');
      }
    }

    /**
     * Preview loop: scrape every headline on the current page, then either
     * continue with the next page or finish if this is the last one.
     */
    function scrapePreview() {
      $('body').addClass('scraping');
      var afterPage = hasNext() ? scrapePreview : function () {
        $('body').removeClass('scraping');
        log('done');
      };
      scrapePreviewItem(0, getSegmentStartIndex(), getSegmentEndIndex(), afterPage);
    }

    /**
     * Scrape the headline at position `index` of the current page, then
     * recurse to the following one. When start + index passes `end`, request
     * the next page and hand over to `nextStep`.
     *
     * @param {number} index    Zero-based position within the current page.
     * @param {number} start    Value of the list's "start" attribute.
     * @param {number} end      Value of the list's "end" attribute.
     * @param {Function} nextStep Called once the page has been processed.
     */
    function scrapePreviewItem(index, start, end, nextStep) {
      log('scrapePreviewItem: ' + index + ':' + start);
      if (start + index <= end) {
        var item = $($('.nk-list-headline').get(index));
        item.trigger('click');
        var headline = item.text();
        acknowledgePreviewPrompt(headline, function () {
          scrapePreviewItem(index + 1, start, end, nextStep);
        }, RETRY_COUNT);
      } else {
        // Remember the current segment before clicking so the poll in
        // runNextBlock compares against the page we are leaving.
        var currentSegment = getSegmentStartIndex();
        $('.nk-navigator-next').first().trigger('click');
        runNextBlock(currentSegment, nextStep, RETRY_COUNT);
      }
    }

    /**
     * After a headline click, dismiss the confirmation popup if one is
     * showing, then start waiting for the article text.
     *
     * @param {string} headline Headline text of the clicked item.
     * @param {Function} nextStep Continuation passed on to the text recorder.
     * @param {number} attempt  Negative values skip the popup check.
     */
    function acknowledgePreviewPrompt(headline, nextStep, attempt) {
      if (attempt < 0) {
        return recordTextAndCheckNext(headline, nextStep, RETRY_COUNT);
      }
      log('acknowledgePreviewPrompt: ' + headline);
      // Give the page a short moment to show the popup.
      setTimeout(function () {
        var popupMsg = $('.nk-popup-msg');
        if (!isHidden(popupMsg)) {
          popupMsg.next().children('.nk-popup-ok').trigger('click');
        }
        recordTextAndCheckNext(headline, nextStep, RETRY_COUNT);
      }, 30);
    }

    /**
     * Poll until a body title equal to `headline` appears, then move the
     * matching body and attribute nodes into a new .sc-item under #result.
     * Gives up silently (just calls `nextStep`) once attempts run out.
     *
     * @param {string} headline Headline text to look for.
     * @param {Function} nextStep Called after recording or on give-up.
     * @param {number} attempt  Remaining attempts; counts down to below zero.
     */
    function recordTextAndCheckNext(headline, nextStep, attempt) {
      if (attempt < 0) {
        return nextStep();
      }
      log('recordTextAndCheckNext: ' + attempt);
      // Wait till the body text is loaded.
      setTimeout(function () {
        var matchIndex = -1;
        var titles = $('.nk-gv-bodytitle');
        for (var i = 0; i < titles.length; i++) {
          if ($(titles.get(i)).text() === headline) {
            matchIndex = i;
            break;
          }
        }
        if (matchIndex < 0) {
          recordTextAndCheckNext(headline, nextStep, attempt - 1);
          return;
        }
        // These are the live page nodes; appending them below moves them
        // into #result rather than copying.
        var text = $($('.nk-gv-body-view td').get(matchIndex));
        var date = $($('.nk-gv-bodytitlesub .nk-gv-attribute').get(matchIndex));
        $('<div class="sc-item"></div>')
          // Insert the headline as text so markup characters in it are not
          // interpreted as HTML.
          .append($('<h1></h1>').text(headline))
          .append(text)
          .append(date)
          .append('<div class="clear"></div>')
          .appendTo('#result');
        nextStep();
      }, delay(attempt));
    }

    /**
     * @returns {boolean} true unless a "next" navigator element carries the
     *   nk-navigator-no-link class.
     */
    function hasNext() {
      return !$('.nk-navigator-next').hasClass('nk-navigator-no-link');
    }

    /**
     * Poll until the list's start index differs from `currentSegment` (i.e.
     * the next page has been rendered), then invoke `callback`. Invokes it
     * immediately when there is no next page.
     *
     * @param {number} currentSegment Start index of the page being left.
     * @param {Function} callback     Continuation to run on the new page.
     * @param {number} attempt        Remaining attempts.
     */
    function runNextBlock(currentSegment, callback, attempt) {
      if (!hasNext()) {
        callback();
        return;
      }
      if (attempt < 0) {
        log('timeout: increase attempt or delay?');
        // The loop stops here; drop the "scraping" class so the site's
        // popups are not left pushed off-screen.
        $('body').removeClass('scraping');
        return;
      }
      log('runNextBlock' + currentSegment + ':' + attempt);
      setTimeout(function () {
        if (currentSegment === getSegmentStartIndex()) {
          runNextBlock(currentSegment, callback, attempt - 1);
        } else {
          callback();
        }
      }, delay(attempt) * 20);
    }

    /** @returns {number} The "start" attribute of the first .nk-list-form, parsed as base 10 (NaN if absent). */
    function getSegmentStartIndex() {
      var form = $('.nk-list-form').first();
      return parseInt(form.attr('start'), 10);
    }

    /** @returns {number} The "end" attribute of the first .nk-list-view-detail, parsed as base 10 (NaN if absent). */
    function getSegmentEndIndex() {
      var form = $('.nk-list-view-detail').first();
      return parseInt(form.attr('end'), 10);
    }

    /**
     * @param {jQuery} el jQuery set; only its first element is inspected.
     * @returns {boolean} true when the element is absent or its computed
     *   display is "none".
     */
    function isHidden(el) {
      var node = el.get(0);
      if (!node) {
        // Nothing matched: treat as hidden instead of letting
        // getComputedStyle throw on undefined.
        return true;
      }
      return window.getComputedStyle(node).display === 'none';
    }

    /** Show `msg` in the status line of the control panel. */
    function log(msg) {
      $('#scrapeStatus').text(msg);
    }

    /**
     * Polling delay in milliseconds for the given remaining-attempt count:
     * TIMER_DELAY on the first try, growing by TIMER_DELAY_SEGMENT for each
     * attempt already consumed.
     */
    function delay(attempt) {
      return TIMER_DELAY + (RETRY_COUNT - attempt) * TIMER_DELAY_SEGMENT;
    }
  });
}
// Bootstrap: inject jQuery into the page, then run main once it has loaded.
addJQuery(main);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment