Skip to content

Instantly share code, notes, and snippets.

@mknepprath
Last active June 11, 2020 20:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save mknepprath/ec9689643d74c193556e71e7c2e41b56 to your computer and use it in GitHub Desktop.
Save mknepprath/ec9689643d74c193556e71e7c2e41b56 to your computer and use it in GitHub Desktop.
// Copy and paste all of this into the console on a page that contains tweets to
// start collecting. Run `stopCrawlAndGenerateList()` once you've collected enough
// tweets.
// Raw accumulator of collected tweets. Each crawl tick appends the `{ id, text }`
// objects returned by `getTweetsById()`, so the same tweet can appear many times;
// duplicates are removed by ID in `stopCrawlAndGenerateList()`.
const allTweetsById = [];
// Common "words" to filter out, based on 4 Twitter accounts' likes.
// Entries are compared to tokens by exact, case-sensitive match, which is why
// variants such as "The"/"the" or "it's"/"it’s" are listed separately. Because
// tweet text is split on single spaces only, the list also contains scrape
// artefacts (e.g. "image73", "KBLoad", "2Replying") and tokens that carry
// newlines ("\n", "\n\nThe"). The empty string is listed so that the empty
// tokens produced by consecutive spaces are dropped.
// The literal is kept on one line so no string is broken across lines.
const commonWords = ["Chris","got","of","all","my","on","just","I’ll","have","to","make","a","list","the","little","I","but","also","happy","any","about","are","take","|","for","and","is","built","with","KBLoad","·","day","here","at","long","The","in","it","or","hope","you","enjoy","(and","if","this","been","working","hard","get","when","Williams","2","","ever","not","like","Maybe","some","When","learned","never","even","No","one","other","it,","It's","better","stop","break","than","10","bad","things","be","people","asked","me","try","image73","open","give","If","loving","go","your","new","Show","years","ago","joined","A","lot","more","Just","Matt","Dave","New","It","My","latest","made","Today","first","as","can","finally","into","I'm","We're","week","we","answered","what","good","design","put","case","And","+","show","few","including","proud","that","by","around","our","being","We","these","from","no","think","use","now","learn","leave","work","need","back","making","while","most","they","pay","attention","their","7","group","beautiful","called","views3","MBLoad","school","enough","two","them.","Load","image65","kind","book","That","were","telling","he","was","has","her","white","fall","set","against","during","created","every","5","days","How","how","can't","wait","such","would","keep","saying","they're","because","So","wrote","down","everything","an","still","professional","-","idea","you’re","where","I’m","it’s","Aug","2019","views2","shit","trying","find","world","will","@","moving","content","over","2Replying","famous","last","words","done","I've","come","up","possible","home","table","today","after","months","call","forever","Tweet","wish","could","watch","time","1Replying","Justin","nothing","quite","are.","don’t","know","why","meant","out","gonna","2015","start","based","told","best","great","away","friends","already","buy","want","fucking","added","so","many","Alex","same","thing","really","hate","It’s","too","big","31Replying","remember","them","having","its","can’t","who","three","favorite","love","before","i","help","do","that’s","country","right","tell","image25","GIF1","it.","everyone","image57","Ryan","Not","image69","&","Ben","Sarah","31I","read","confirm","only","doesn’t","copy","thinking","should","minutes","\n\nThe","very","something","then","views1","Steve","amazing","likely","example","history","us","post","women","his","free","In","else","woman","image30","had","mad","man","tried","company","behavior","change","process","direct","nervous","less","much","30Replying","different","1","since","money","own","doing","going","books","hit","chat","Daniel","image36","always","share","piece","/","met","team","it's","thought","included","seems","both","He","experience","business","drink","rules","off","through","companies","written","super","story","well","couple","him","job","real","under","community","opportunity","Is","asking","Nick","helping","seem","shows","please","am","office","month.","I’ve","anything","nice","sitting","reading","listening","tool","me.","become","version","fun","excited","using","eye","across","look","feels","thread","makes","feel","Ashley","29Replying","comes","internet","image40","I’d","used","tweet","hear","someone","now.","word","year","least","once","power","talk","next","David","might","actual","means","pretty","there","currently","13","see","though","Twitter","gets","app","those","isn't","watching","Jared","way","3","sit","Staff","You","don't","place","4","meet","image77","Lee","coming","again","here:","A.","28Replying","that's","did","looking","hiring","understand","folks","may","went","This","actually","ten","month","project","early","KB1","C.","recently","King","that,","totally","support","life","6","seeing","said","live","learning","each","past","we're","another","person","original","GIF3","class","grateful","literally","I'll","you're","reach","sounds","These","often","sure","Sara","color","quickly","27Replying","opinion","let","entire","send","getting","Mark","image75","25I","say","fast","Dan","now,","number","social","media","without","says","car","yet","But","26Replying","seen","complete","works","given","living","personal","images23","Quote","Jessica","25Replying","later","buying","image54","movies","character","started","convinced","important","matter","25","weekend,","care","disagree","threadLoad","this.","Will","several","She","ask","OF","mean","didn't","far","end","day,","expect","older","weeks","body","mass","thank","name","\n","part","huge","plan","does","single","say,","until","probably","movie","Jon","worth","site","building","fix","times","KB14","must","anyone","realized","scene","online","speaking","conference","whole","behind","which","spend","old","Andrew","local","believe","came","Michael","video","straight","All","easy","speak","kids","instead","glad","KB16","almost","sad","KB11","8","write","reason","today,","image58","coffee","needs","talks","members","w/","Being","via","absolutely","top","others","night","twitter","wasn't","allow","small","tomorrow","image37","feeling","signed","needed","12","perfect","miss"];
// Crawls the webpage for all currently visible tweets.
//
// Returns an array of `{ id, text }` objects, one per tweet element that
// exposes a link containing `/status/`. Tweet elements without such a link are
// skipped instead of throwing, so one unexpected element cannot discard the
// rest of the tweets collected in the same pass. `text` is the empty string
// when no `div[lang="en"]` text element is found (e.g. media-only tweets or
// tweets whose text element carries a different `lang`).
const getTweetsById = () => {
  // All tweets include a data attribute 'testid', so we're querying based on that.
  const tweetElements = document.querySelectorAll("div[data-testid='tweet']");
  const collectedTweets = [];

  // `Array.from()` turns the NodeList (a list of DOM elements) into a real array.
  for (const tweet of Array.from(tweetElements)) {
    // Some tweets may contain links to other tweets, so we take the first link
    // whose URL includes `/status/` - the one attached to the timestamp.
    const statusLink = Array.from(tweet.querySelectorAll("a")).find(
      link => typeof link.href === "string" && link.href.includes("/status/")
    );
    if (!statusLink) {
      // No status link means there is no ID to de-duplicate on; skip it.
      continue;
    }

    // Prefer the run of digits right after `/status/`, so a query string or a
    // trailing path segment does not end up in the ID. Fall back to the last
    // path segment when the URL does not have that shape.
    const idMatch = statusLink.href.match(/\/status\/(\d+)/);
    const id = idMatch ? idMatch[1] : statusLink.href.split("/").pop();

    // Each tweet has two children, the profile picture and the content. We want
    // the latter (`lastChild`). The section that contains the text has a `lang`
    // attribute, so we're selecting it based on that. `lastChild` may be
    // missing or may not be an element, in which case there is no text.
    const contentElement = tweet.lastChild;
    const textElement =
      contentElement && typeof contentElement.querySelector === "function"
        ? contentElement.querySelector('div[lang="en"]')
        : null;

    // Keep the ID alongside the text so duplicates can be removed by ID later.
    collectedTweets.push({
      id,
      text: textElement ? textElement.innerText : ""
    });
  }

  return collectedTweets;
};
// This function stops the webpage crawl and generates the final list of words
// sorted by frequency.
//
// Reads the collected `{ id, text }` objects from `allTweetsById`, keeps the
// first occurrence of each tweet ID, splits each tweet's text on single spaces,
// and counts every resulting word across the unique tweets. Words listed in
// `commonWords` are left out of the result.
//
// Returns an array of `{ word, count }` objects sorted by `count`, highest
// first. Also logs the number of unique tweets the counts were derived from.
//
// All steps are single passes backed by `Set`/`Map`, so the cost grows
// linearly with the amount of collected text rather than quadratically, and no
// step spreads a large array into function arguments.
const stopCrawlAndGenerateList = () => {
  // Stops the interval (loop) that is crawling the page.
  clearInterval(fetchTweetsId);

  // Keep only the text of the first tweet seen for each ID.
  const seenIds = new Set();
  const uniqueTweets = [];
  for (const tweet of allTweetsById) {
    if (seenIds.has(tweet.id)) {
      continue;
    }
    seenIds.add(tweet.id);
    uniqueTweets.push(tweet.text);
  }

  // Count every word. A Map iterates in insertion order, so words stay in the
  // order they were first encountered, which keeps ties in a stable order
  // after sorting.
  const countsByWord = new Map();
  for (const text of uniqueTweets) {
    for (const word of text.split(" ")) {
      countsByWord.set(word, (countsByWord.get(word) || 0) + 1);
    }
  }

  // Filters out words from the common word list.
  const ignoredWords = new Set(commonWords);
  const wordCounts = [];
  for (const [word, count] of countsByWord) {
    if (!ignoredWords.has(word)) {
      wordCounts.push({ word, count });
    }
  }

  // Prints the number of tweets crawled and returns the words + word counts.
  console.log(`Derived from ${uniqueTweets.length} tweets.`);
  wordCounts.sort((a, b) => b.count - a.count);
  return wordCounts;
};
// Kicks off the crawl loop as soon as the script is pasted. The loop has no end
// condition of its own; call `stopCrawlAndGenerateList()` to stop it and get
// the word list once enough tweets have been collected.
const fetchTweetsId = setInterval(() => {
  // Grab whatever tweets are on the page right now and append them to the
  // running collection.
  const visibleTweets = getTweetsById();
  for (const tweet of visibleTweets) {
    allTweetsById.push(tweet);
  }
  // Jump to the bottom of the page so that more tweets get loaded.
  window.scrollTo(0, document.body.scrollHeight);
}, 1000); // Repeat every 1000 ms (once per second).
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment