Last active
August 23, 2022 10:26
-
-
Save elliotbonneville/1bf694b8c83f358e0404 to your computer and use it in GitHub Desktop.
Find top keywords associated with a Google search with this Node.js application.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Third-party modules: `request` issues the HTTP downloads and `cheerio`
// parses the returned HTML so it can be queried with selectors.
var request = require("request"),
    cheerio = require("cheerio"),
    // search results page whose linked pages are mined for words
    url = "https://www.google.com/search?q=data+mining",
    // word -> number of occurrences across all downloaded pages; created
    // without a prototype so that a word such as "constructor" is not
    // mistaken for an already-counted entry inherited from Object.prototype
    corpus = Object.create(null),
    // number of result links for which a download was started
    totalResults = 0,
    // number of those downloads that have reported back
    resultsDownloaded = 0;
/**
 * Completion handler, called once for every result page that has finished.
 *
 * Increments `resultsDownloaded` and returns immediately until that counter
 * equals `totalResults`. On the final call it converts `corpus` into a list
 * of `{ word, count }` entries, sorts the list by descending count and logs
 * the first twenty entries to the console.
 */
function callback () {
    resultsDownloaded++;
    if (resultsDownloaded !== totalResults) {
        return;
    }

    // Build the list from the corpus' own keys. This avoids a loop variable
    // altogether; an undeclared `for (prop in ...)` variable would leak as an
    // implicit global and throws a ReferenceError in strict mode.
    var words = Object.keys(corpus).map(function (word) {
        return {
            word: word,
            count: corpus[word]
        };
    });

    // sort entries so the most frequent words come first
    words.sort(function (a, b) {
        return b.count - a.count;
    });

    // finally, log the twenty most popular words
    console.log(words.slice(0, 20));
}
// Download the search results page, follow every result link found on it and
// tally the words of each linked page into `corpus`. `callback` is invoked
// once per followed link, whether its download succeeded or failed.
request(url, function (error, response, body) {
    if (error) {
        console.log("Couldn’t get page because of error: " + error);
        return;
    }

    // load the body of the page into Cheerio so we can traverse the DOM
    var $ = cheerio.load(body),
        links = $(".r a");

    links.each(function (i, link) {
        // get the href attribute of each link
        var href = $(link).attr("href");

        // an anchor without an href has nothing to download
        if (!href) {
            return;
        }

        // strip the "/url?q=" prefix and everything from the first "&" on
        var target = href.replace("/url?q=", "").split("&")[0];

        // links that still start with "/" are skipped, not counted as results
        if (target.charAt(0) === "/") {
            return;
        }

        // this link counts as a result, so increment results
        totalResults++;

        // download that page
        request(target, function (error, response, body) {
            if (error) {
                console.log("Couldn’t get page because of error: " + error);
                // a failed download still has to be counted as finished,
                // otherwise resultsDownloaded never reaches totalResults and
                // the final report is never printed
                callback();
                return;
            }

            // load the page into cheerio
            var $page = cheerio.load(body),
                text = $page("body").text();

            // collapse whitespace runs, drop everything that is not an ASCII
            // letter or a space, and lower-case the rest
            text = text.replace(/\s+/g, " ")
                .replace(/[^a-zA-Z ]/g, "")
                .toLowerCase();

            // split on spaces for a list of all the words on that page and
            // loop through that list
            text.split(" ").forEach(function (word) {
                // we don't want to include very short or long words, as
                // they're probably bad data
                if (word.length < 4 || word.length > 20) {
                    return;
                }

                // test for an own property rather than truthiness, so a word
                // like "constructor" is not confused with a member inherited
                // from Object.prototype
                if (Object.prototype.hasOwnProperty.call(corpus, word)) {
                    // already in our "corpus": increase the count by one
                    corpus[word]++;
                } else {
                    // otherwise, say that we've found one of that word so far
                    corpus[word] = 1;
                }
            });

            // and when our request is completed, call the callback to wrap up!
            callback();
        });
    });
});
Nice example!
Nice example. People could try https://github.com/jasondavies/d3-cloud too if they'd like to visualize the data. Cheers.
Lines 37 and 62 have improper quotes, so you get an error when running this example.
Thanks! I've spent an hour on that error.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Lines 37 and 62 have improper quotes, so you get an error when running this example.