Last active
June 11, 2020 20:32
-
-
Save mknepprath/ec9689643d74c193556e71e7c2e41b56 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Paste this entire script into the browser console on a page that shows tweets
// to start collecting. Once enough tweets have been gathered, run
// `stopCrawlAndGenerateList()` to stop the crawl and get the word counts.

// Raw accumulator of crawled tweets (`{ id, text }` objects). Despite the name,
// this is a plain array and is not keyed by ID: every crawl tick pushes all
// currently visible tweets again, so it holds duplicates until
// `stopCrawlAndGenerateList()` de-duplicates them by `id`.
const allTweetsById = [];
// Stop list: "words" that are too common to be interesting, gathered from the
// likes of 4 Twitter accounts. Matching against this list is exact and
// case-sensitive, which is why several capitalisations and both straight (')
// and curly (’) apostrophe spellings appear. Entries such as "2Replying",
// "image73" or "KBLoad" look like page UI text fused with tweet text rather
// than real words — NOTE(review): presumably scraped artifacts; confirm before
// pruning.
const commonWords = [
  "Chris", "got", "of", "all", "my", "on", "just", "I’ll", "have", "to", "make", "a",
  "list", "the", "little", "I", "but", "also", "happy", "any", "about", "are", "take", "|",
  "for", "and", "is", "built", "with", "KBLoad", "·", "day", "here", "at", "long", "The",
  "in", "it", "or", "hope", "you", "enjoy", "(and", "if", "this", "been", "working", "hard",
  "get", "when", "Williams", "2", "", "ever", "not", "like", "Maybe", "some", "When", "learned",
  "never", "even", "No", "one", "other", "it,", "It's", "better", "stop", "break", "than", "10",
  "bad", "things", "be", "people", "asked", "me", "try", "image73", "open", "give", "If", "loving",
  "go", "your", "new", "Show", "years", "ago", "joined", "A", "lot", "more", "Just", "Matt",
  "Dave", "New", "It", "My", "latest", "made", "Today", "first", "as", "can", "finally", "into",
  "I'm", "We're", "week", "we", "answered", "what", "good", "design", "put", "case", "And", "+",
  "show", "few", "including", "proud", "that", "by", "around", "our", "being", "We", "these", "from",
  "no", "think", "use", "now", "learn", "leave", "work", "need", "back", "making", "while", "most",
  "they", "pay", "attention", "their", "7", "group", "beautiful", "called", "views3", "MBLoad", "school", "enough",
  "two", "them.", "Load", "image65", "kind", "book", "That", "were", "telling", "he", "was", "has",
  "her", "white", "fall", "set", "against", "during", "created", "every", "5", "days", "How", "how",
  "can't", "wait", "such", "would", "keep", "saying", "they're", "because", "So", "wrote", "down", "everything",
  "an", "still", "professional", "-", "idea", "you’re", "where", "I’m", "it’s", "Aug", "2019", "views2",
  "shit", "trying", "find", "world", "will", "@", "moving", "content", "over", "2Replying", "famous", "last",
  "words", "done", "I've", "come", "up", "possible", "home", "table", "today", "after", "months", "call",
  "forever", "Tweet", "wish", "could", "watch", "time", "1Replying", "Justin", "nothing", "quite", "are.", "don’t",
  "know", "why", "meant", "out", "gonna", "2015", "start", "based", "told", "best", "great", "away",
  "friends", "already", "buy", "want", "fucking", "added", "so", "many", "Alex", "same", "thing", "really",
  "hate", "It’s", "too", "big", "31Replying", "remember", "them", "having", "its", "can’t", "who", "three",
  "favorite", "love", "before", "i", "help", "do", "that’s", "country", "right", "tell", "image25", "GIF1",
  "it.", "everyone", "image57", "Ryan", "Not", "image69", "&", "Ben", "Sarah", "31I", "read", "confirm",
  "only", "doesn’t", "copy", "thinking", "should", "minutes", "\n\nThe", "very", "something", "then", "views1", "Steve",
  "amazing", "likely", "example", "history", "us", "post", "women", "his", "free", "In", "else", "woman",
  "image30", "had", "mad", "man", "tried", "company", "behavior", "change", "process", "direct", "nervous", "less",
  "much", "30Replying", "different", "1", "since", "money", "own", "doing", "going", "books", "hit", "chat",
  "Daniel", "image36", "always", "share", "piece", "/", "met", "team", "it's", "thought", "included", "seems",
  "both", "He", "experience", "business", "drink", "rules", "off", "through", "companies", "written", "super", "story",
  "well", "couple", "him", "job", "real", "under", "community", "opportunity", "Is", "asking", "Nick", "helping",
  "seem", "shows", "please", "am", "office", "month.", "I’ve", "anything", "nice", "sitting", "reading", "listening",
  "tool", "me.", "become", "version", "fun", "excited", "using", "eye", "across", "look", "feels", "thread",
  "makes", "feel", "Ashley", "29Replying", "comes", "internet", "image40", "I’d", "used", "tweet", "hear", "someone",
  "now.", "word", "year", "least", "once", "power", "talk", "next", "David", "might", "actual", "means",
  "pretty", "there", "currently", "13", "see", "though", "Twitter", "gets", "app", "those", "isn't", "watching",
  "Jared", "way", "3", "sit", "Staff", "You", "don't", "place", "4", "meet", "image77", "Lee",
  "coming", "again", "here:", "A.", "28Replying", "that's", "did", "looking", "hiring", "understand", "folks", "may",
  "went", "This", "actually", "ten", "month", "project", "early", "KB1", "C.", "recently", "King", "that,",
  "totally", "support", "life", "6", "seeing", "said", "live", "learning", "each", "past", "we're", "another",
  "person", "original", "GIF3", "class", "grateful", "literally", "I'll", "you're", "reach", "sounds", "These", "often",
  "sure", "Sara", "color", "quickly", "27Replying", "opinion", "let", "entire", "send", "getting", "Mark", "image75",
  "25I", "say", "fast", "Dan", "now,", "number", "social", "media", "without", "says", "car", "yet",
  "But", "26Replying", "seen", "complete", "works", "given", "living", "personal", "images23", "Quote", "Jessica", "25Replying",
  "later", "buying", "image54", "movies", "character", "started", "convinced", "important", "matter", "25", "weekend,", "care",
  "disagree", "threadLoad", "this.", "Will", "several", "She", "ask", "OF", "mean", "didn't", "far", "end",
  "day,", "expect", "older", "weeks", "body", "mass", "thank", "name", "\n", "part", "huge", "plan",
  "does", "single", "say,", "until", "probably", "movie", "Jon", "worth", "site", "building", "fix", "times",
  "KB14", "must", "anyone", "realized", "scene", "online", "speaking", "conference", "whole", "behind", "which", "spend",
  "old", "Andrew", "local", "believe", "came", "Michael", "video", "straight", "All", "easy", "speak", "kids",
  "instead", "glad", "KB16", "almost", "sad", "KB11", "8", "write", "reason", "today,", "image58", "coffee",
  "needs", "talks", "members", "w/", "Being", "via", "absolutely", "top", "others", "night", "twitter", "wasn't",
  "allow", "small", "tomorrow", "image37", "feeling", "signed", "needed", "12", "perfect", "miss"
];
// Crawls the page for all currently visible tweets.
//
// Returns an array of `{ id, text }` objects, one per tweet element that
// carries a link to a tweet status page:
//   - `id`   is the numeric status ID taken from the first such link.
//   - `text` is the inner text of the tweet's English text container, or ""
//            when the tweet has no such container (e.g. image-only tweets).
// Tweet elements without any `/status/<digits>` link are skipped instead of
// raising an error, so one unusual element cannot abort a whole crawl pass.
const getTweetsById = () => {
  // Captures the numeric ID right after `/status/`. Taking the last path
  // segment instead would return the wrong value for URLs such as
  // `/status/123/photo/1` or `/status/123?s=20`.
  const statusIdPattern = /\/status\/(\d+)/;

  // All tweets include a data attribute 'testid', so we query based on that.
  // `Array.from()` turns the NodeList into a real array.
  const tweets = Array.from(
    document.querySelectorAll("div[data-testid='tweet']")
  );

  const collected = [];
  for (const tweet of tweets) {
    // A tweet may link to other tweets as well; the first status link in
    // document order is the one attached to the timestamp, i.e. the tweet's
    // own permalink.
    let id = null;
    for (const link of Array.from(tweet.querySelectorAll("a"))) {
      const match = statusIdPattern.exec(String(link.href));
      if (match) {
        id = match[1];
        break;
      }
    }
    if (id === null) {
      // No permalink found: nothing to de-duplicate on, so skip this element.
      continue;
    }

    // Each tweet has two children, the profile picture and the content; we
    // want the latter. `lastElementChild` (rather than `lastChild`) guarantees
    // an element node, which is the only kind that has `querySelector`.
    const contentColumn = tweet.lastElementChild;
    // The element holding the tweet text carries a `lang` attribute.
    const textElement = contentColumn
      ? contentColumn.querySelector('div[lang="en"]')
      : null;

    collected.push({
      id,
      text: textElement ? textElement.innerText : ""
    });
  }

  return collected;
};
// Stops the page crawl and builds the final word-frequency list.
//
// Reads the collected tweets from `allTweetsById`, keeps the first occurrence
// of each tweet ID, splits every tweet text into whitespace-separated words,
// drops the words listed in `commonWords`, and counts the rest.
//
// Returns an array of `{ word, count }` objects sorted by `count`, highest
// first; words with equal counts stay in the order they were first seen.
// Also logs how many unique tweets the list was derived from.
const stopCrawlAndGenerateList = () => {
  // Stops the interval (loop) that is crawling the page.
  clearInterval(fetchTweetsId);

  // De-duplicate by tweet ID in a single pass. A Map keeps insertion order, so
  // the first text seen for an ID wins and tweets stay in crawl order.
  const textById = new Map();
  for (const tweet of allTweetsById) {
    if (!textById.has(tweet.id)) {
      textById.set(tweet.id, tweet.text);
    }
  }
  const uniqueTweets = Array.from(textById.values());

  // Set lookup is constant-time, unlike scanning the stop list per word.
  const ignoredWords = new Set(commonWords);

  // Count every remaining word in one pass over all tweet texts.
  const countsByWord = new Map();
  for (const text of uniqueTweets) {
    // Split on any run of whitespace: tweets contain line breaks, and
    // splitting on a single space would leave "foo\nbar" fused as one word.
    for (const word of text.split(/\s+/)) {
      // Leading/trailing whitespace yields empty tokens; they are not words.
      if (word === "" || ignoredWords.has(word)) {
        continue;
      }
      countsByWord.set(word, (countsByWord.get(word) || 0) + 1);
    }
  }

  const wordCounts = Array.from(countsByWord, ([word, count]) => ({
    word,
    count
  }));

  // Prints the number of tweets crawled and returns the words + word counts.
  console.log(`Derived from ${uniqueTweets.length} tweets.`);
  wordCounts.sort((a, b) => b.count - a.count);
  return wordCounts;
};
// Performs one crawl tick: appends all currently visible tweets to
// `allTweetsById`, then scrolls to the bottom of the page so more tweets load.
const collectVisibleTweets = () => {
  try {
    // Add collected tweets to the array of previously collected tweets.
    allTweetsById.push(...getTweetsById());
  } finally {
    // Scroll even when collecting failed. Otherwise the page never advances,
    // the same tweets stay on screen, and every later tick fails the same way.
    // The error itself still propagates and shows up in the console.
    window.scrollTo(0, document.body.scrollHeight);
  }
};

// This starts the loop that crawls the page for tweets. It runs once per second
// (1000 ms) and keeps going indefinitely. `fetchTweetsId` holds the interval
// handle; run `stopCrawlAndGenerateList()` once you've collected enough tweets.
const fetchTweetsId = setInterval(collectVisibleTweets, 1000);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment