Last active
December 11, 2015 04:28
-
-
Save i09158knct/4544997 to your computer and use it in GitHub Desktop.
Using Phantom.js
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Builds a google.co.jp search URL for the given search word.
 *
 * The word is escaped with encodeURIComponent (not encodeURI) so that
 * characters with a meaning inside a query string -- "&", "=", "+", "#",
 * "?" -- become part of the search term instead of corrupting the URL.
 *
 * @param {string} word - Search term; may contain any characters.
 * @param {number} [num] - Requested number of results. Any falsy value
 *     (undefined, 0, NaN) falls back to 50.
 * @returns {string} The complete search URL.
 */
function generateGoogleSearchUrl(word, num) {
  var encodedWord = encodeURIComponent(word);
  var number = '&num=' + (num || 50);
  var prefix = 'https://www.google.co.jp/search?q=';
  var suffix = '&ie=UTF-8';
  return prefix + encodedWord + number + suffix;
}
function getWordList(fileName) { | |
var fs = require('fs'); | |
var file = fs.open(fileName, 'r'); | |
var wordList = file.read().split('\n'); | |
return wordList; | |
} | |
function writeResult(word, result) { | |
var fs = require('fs'); | |
fs.makeDirectory('outputs'); | |
var file = fs.open('outputs/' + word, 'w'); | |
file.write(result); | |
file.write('\n'); | |
file.flush(); | |
} | |
/**
 * Collects result URLs from the current document.
 *
 * This function is handed to `page.evaluate`, so it must stay
 * self-contained: it may only use names available inside the page.
 *
 * For every element with class "r" it takes the `href` of the first child
 * element. Containers without a first child, or whose first child has no
 * `href`, are skipped instead of throwing. URLs ending in ".pdf" are
 * excluded; the dot is matched literally, so a URL such as ".../notapdf" is
 * kept.
 *
 * @returns {string} JSON-encoded array of URL strings.
 */
function evalGetUrlList() {
  var containers = [].slice.call(document.getElementsByClassName('r'));
  var urls = [];
  for (var i = 0; i < containers.length; i++) {
    var anchor = containers[i].children[0];
    if (!anchor || !anchor.href) {
      continue;
    }
    if (/\.pdf$/.test(anchor.href)) {
      continue;
    }
    urls.push(anchor.href);
  }
  return JSON.stringify(urls);
}
/**
 * Script entry point.
 *
 * Usage: phantomjs <script> <wordListFile> [resultsPerWord]
 *
 * Reads the word list, then searches each word in sequence with a one
 * second pause between requests. The URLs found for a word are stored via
 * writeResult(). Exits with status 1 when no input file is given, and with
 * the default status once the list is exhausted.
 */
;(function _main() {
  // phantom.args is deprecated in favour of system.args, whose first entry
  // is the script name. Prefer the old API when present, otherwise fall back.
  var args = phantom.args || require('system').args.slice(1);
  var fileName = args[0];
  // NaN when omitted; generateGoogleSearchUrl then applies its own default.
  var urlNumber = +args[1];
  if (typeof fileName === 'undefined') {
    console.log('Error: no input file');
    phantom.exit(1);
    return;
  }

  var wordList = getWordList(fileName);
  var page = require('webpage').create();
  page.settings.userAgent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A403 Safari/8536.25';

  mainLoop(0);

  /**
   * Processes the word at index `current`, then schedules the next one.
   * @param {number} current - Index into wordList.
   */
  function mainLoop(current) {
    // Skip blank entries so an empty line in the middle of the file does
    // not end the run early.
    while (current < wordList.length && !wordList[current]) {
      current += 1;
    }
    if (current >= wordList.length) {
      phantom.exit();
      return;
    }

    var word = wordList[current];
    var searchUrl = generateGoogleSearchUrl(word, urlNumber);
    page.open(searchUrl, function(status) {
      if (status !== 'success') {
        console.log('Error: failed to load ' + searchUrl);
      } else {
        // An uncaught exception in this callback would leave PhantomJS
        // running forever, so report the failure and carry on.
        try {
          var resultJSON = page.evaluate(evalGetUrlList);
          // JSON.parse yields null if the in-page function returned nothing.
          var urls = JSON.parse(resultJSON) || [];
          writeResult(word, urls.join('\n'));
        } catch (e) {
          console.log('Error: could not process "' + word + '": ' + e);
        }
      }
      setTimeout(function() { mainLoop(current + 1); }, 1000);
    });
  }
})();
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Builds a google.co.jp search URL for the given search word.
 *
 * The word is escaped with encodeURIComponent (not encodeURI) so that
 * characters with a meaning inside a query string -- "&", "=", "+", "#",
 * "?" -- become part of the search term instead of corrupting the URL.
 *
 * @param {string} word - Search term; may contain any characters.
 * @param {number} [num] - Requested number of results. Any falsy value
 *     (undefined, 0, NaN) falls back to 50.
 * @returns {string} The complete search URL.
 */
function generateGoogleSearchUrl(word, num) {
  var encodedWord = encodeURIComponent(word);
  var number = "&num=" + (num || 50);
  var prefix = "https://www.google.co.jp/search?q=";
  var suffix = "&ie=UTF-8";
  return prefix + encodedWord + number + suffix;
}
/**
 * Script entry point.
 *
 * Usage: phantomjs <script> <word> [resultsCount] [delayMs]
 *
 * Searches a single word after an optional delay and prints the `href` of
 * every element with class "l", one per line. Exits with status 1 when no
 * word is given or the page fails to load.
 */
;(function _main() {
  // phantom.args is deprecated in favour of system.args, whose first entry
  // is the script name. Prefer the old API when present, otherwise fall back.
  var args = phantom.args || require("system").args.slice(1);
  var word = args[0];
  // NaN when omitted; generateGoogleSearchUrl then applies its own default.
  var urlNumber = +args[1];
  var delay = +args[2] || 0;
  if (typeof word === "undefined") {
    console.log("Error: no search word");
    phantom.exit(1);
    return;
  }

  var page = require("webpage").create();
  // Desktop alternative, kept for reference:
  // "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7"
  page.settings.userAgent = "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A403 Safari/8536.25";
  var searchUrl = generateGoogleSearchUrl(word, urlNumber);

  setTimeout(fetchResults, delay);

  /** Loads the search page and prints the collected URLs, then exits. */
  function fetchResults() {
    page.open(searchUrl, function(status) {
      // Without this check a failed load would throw below and, because
      // phantom.exit() is never reached, leave the process hanging.
      if (status !== "success") {
        console.log("Error: failed to load " + searchUrl);
        phantom.exit(1);
        return;
      }
      var resultJSON = page.evaluate(function() {
        var urlList = [];
        var links = window.document.getElementsByClassName("l");
        for (var i = 0, linksLength = links.length; i < linksLength; i++) {
          urlList.push(links[i].href);
        }
        return JSON.stringify(urlList);
      });
      // JSON.parse yields null if the in-page function returned nothing.
      var urls = JSON.parse(resultJSON) || [];
      console.log(urls.join("\n"));
      phantom.exit();
    });
  }
})();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment