Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Find top keywords associated with a Google search with this Node.js application.
var request = require("request"),
cheerio = require("cheerio"),
url = "https://www.google.com/search?q=data+mining",
corpus = {},
totalResults = 0,
resultsDownloaded = 0;
function callback () {
resultsDownloaded++;
if (resultsDownloaded !== totalResults) {
return;
}
var words = [];
// stick all words in an array
for (prop in corpus) {
words.push({
word: prop,
count: corpus[prop]
});
}
// sort array based on how often they occur
words.sort(function (a, b) {
return b.count - a.count;
});
// finally, log the first fifty most popular words
console.log(words.slice(0, 20));
}
request(url, function (error, response, body) {
if (error) {
console.log(“Couldn’t get page because of error:+ error);
return;
}
// load the body of the page into Cheerio so we can traverse the DOM
var $ = cheerio.load(body),
links = $(".r a");
links.each(function (i, link) {
// get the href attribute of each link
var url = $(link).attr("href");
// strip out unnecessary junk
url = url.replace("/url?q=", "").split("&")[0];
if (url.charAt(0) === "/") {
return;
}
// this link counts as a result, so increment results
totalResults++;
// download that page
request(url, function (error, response, body) {
if (error) {
console.log(“Couldn’t get page because of error:+ error);
return;
}
// load the page into cheerio
var $page = cheerio.load(body),
text = $page("body").text();
// throw away extra whitespace and non-alphanumeric characters
text = text.replace(/\s+/g, " ")
.replace(/[^a-zA-Z ]/g, "")
.toLowerCase();
// split on spaces for a list of all the words on that page and
// loop through that list
text.split(" ").forEach(function (word) {
// we don't want to include very short or long words, as they're
// probably bad data
if (word.length < 4 || word.length > 20) {
return;
}
if (corpus[word]) {
// if this word is already in our "corpus", our collection
// of terms, increase the count by one
corpus[word]++;
} else {
// otherwise, say that we've found one of that word so far
corpus[word] = 1;
}
});
// and when our request is completed, call the callback to wrap up!
callback();
});
});
});
@tim-peterson

This comment has been minimized.

Copy link

commented Aug 8, 2015

Lines 37 and 62 have improper quotes, so you get an error when running this example.

@akatrodiya

This comment has been minimized.

Copy link

commented Nov 18, 2015

Nice example!

@kristianjaeger

This comment has been minimized.

Copy link

commented Dec 3, 2015

Nice example. People could try https://github.com/jasondavies/d3-cloud too if they'd like to visualize the data. Cheers.

@chainberg

This comment has been minimized.

Copy link

commented Nov 21, 2018

Lines 37 and 62 have improper quotes, so you get an error when running this example.

Thanks! I've spent an hour on that error.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.