mknepprath/twitter_word_count.js

## twitter_word_count.js
// Copy and paste all of this into the console on a page that contains tweets to
// start collecting. Run `stopCrawlAndGenerateList()` once you've collected enough
// tweets.

// Raw list of every `{ id, text }` record collected so far. Each crawl tick
// appends all currently visible tweets, so the same tweet can appear many
// times; `stopCrawlAndGenerateList()` removes the duplicates by `id`.
const allTweetsById = [];

// Common "words" to filter out based on 4 Twitter accounts' likes.
// Entries are compared exactly (case-sensitive) against the space-separated
// tokens of each tweet's text, which is why the list contains tokens with
// attached punctuation, newlines, or UI residue such as "2Replying".
// The literal is wrapped only BETWEEN elements: a line break inside a quoted
// string is a syntax error in JavaScript.
const commonWords = [
  "Chris","got","of","all","my","on","just","I’ll","have","to","make","a","list","the","little","I","but",
  "also","happy","any","about","are","take","|","for","and","is","built","with","KBLoad","·","day","here",
  "at","long","The","in","it","or","hope","you","enjoy","(and","if","this","been","working","hard","get",
  "when","Williams","2","","ever","not","like","Maybe","some","When","learned","never","even","No","one",
  "other","it,","It's","better","stop","break","than","10","bad","things","be","people","asked","me","try",
  "image73","open","give","If","loving","go","your","new","Show","years","ago","joined","A","lot","more",
  "Just","Matt","Dave","New","It","My","latest","made","Today","first","as","can","finally","into","I'm",
  "We're","week","we","answered","what","good","design","put","case","And","+","show","few","including",
  "proud","that","by","around","our","being","We","these","from","no","think","use","now","learn","leave",
  "work","need","back","making","while","most","they","pay","attention","their","7","group","beautiful",
  "called","views3","MBLoad","school","enough","two","them.","Load","image65","kind","book","That","were",
  "telling","he","was","has","her","white","fall","set","against","during","created","every","5","days",
  "How","how","can't","wait","such","would","keep","saying","they're","because","So","wrote","down",
  "everything","an","still","professional","-","idea","you’re","where","I’m","it’s","Aug","2019","views2",
  "shit","trying","find","world","will","@","moving","content","over","2Replying","famous","last","words",
  "done","I've","come","up","possible","home","table","today","after","months","call","forever","Tweet",
  "wish","could","watch","time","1Replying","Justin","nothing","quite","are.","don’t","know","why","meant",
  "out","gonna","2015","start","based","told","best","great","away","friends","already","buy","want",
  "fucking","added","so","many","Alex","same","thing","really","hate","It’s","too","big","31Replying",
  "remember","them","having","its","can’t","who","three","favorite","love","before","i","help","do",
  "that’s","country","right","tell","image25","GIF1","it.","everyone","image57","Ryan","Not","image69","&",
  "Ben","Sarah","31I","read","confirm","only","doesn’t","copy","thinking","should","minutes","\n\nThe",
  "very","something","then","views1","Steve","amazing","likely","example","history","us","post","women",
  "his","free","In","else","woman","image30","had","mad","man","tried","company","behavior","change",
  "process","direct","nervous","less","much","30Replying","different","1","since","money","own","doing",
  "going","books","hit","chat","Daniel","image36","always","share","piece","/","met","team","it's",
  "thought","included","seems","both","He","experience","business","drink","rules","off","through",
  "companies","written","super","story","well","couple","him","job","real","under","community",
  "opportunity","Is","asking","Nick","helping","seem","shows","please","am","office","month.","I’ve",
  "anything","nice","sitting","reading","listening","tool","me.","become","version","fun","excited",
  "using","eye","across","look","feels","thread","makes","feel","Ashley","29Replying","comes","internet",
  "image40","I’d","used","tweet","hear","someone","now.","word","year","least","once","power","talk",
  "next","David","might","actual","means","pretty","there","currently","13","see","though","Twitter",
  "gets","app","those","isn't","watching","Jared","way","3","sit","Staff","You","don't","place","4","meet",
  "image77","Lee","coming","again","here:","A.","28Replying","that's","did","looking","hiring",
  "understand","folks","may","went","This","actually","ten","month","project","early","KB1","C.",
  "recently","King","that,","totally","support","life","6","seeing","said","live","learning","each","past",
  "we're","another","person","original","GIF3","class","grateful","literally","I'll","you're","reach",
  "sounds","These","often","sure","Sara","color","quickly","27Replying","opinion","let","entire","send",
  "getting","Mark","image75","25I","say","fast","Dan","now,","number","social","media","without","says",
  "car","yet","But","26Replying","seen","complete","works","given","living","personal","images23","Quote",
  "Jessica","25Replying","later","buying","image54","movies","character","started","convinced",
  "important","matter","25","weekend,","care","disagree","threadLoad","this.","Will","several","She","ask",
  "OF","mean","didn't","far","end","day,","expect","older","weeks","body","mass","thank","name","\n","part",
  "huge","plan","does","single","say,","until","probably","movie","Jon","worth","site","building","fix",
  "times","KB14","must","anyone","realized","scene","online","speaking","conference","whole","behind",
  "which","spend","old","Andrew","local","believe","came","Michael","video","straight","All","easy",
  "speak","kids","instead","glad","KB16","almost","sad","KB11","8","write","reason","today,","image58",
  "coffee","needs","talks","members","w/","Being","via","absolutely","top","others","night","twitter",
  "wasn't","allow","small","tomorrow","image37","feeling","signed","needed","12","perfect","miss"
];

// Crawls the webpage for all currently visible tweets.
//
// Returns an array of `{ id, text }` objects, one per tweet element
// (`div[data-testid='tweet']`) that has a link whose URL contains
// `/status/<id>`. `text` is the `innerText` of the tweet's `div[lang="en"]`
// element, or "" when the tweet has no such element.
//
// Tweet elements without a usable status link are skipped instead of throwing,
// so one unexpected element cannot discard the whole batch for a crawl tick.
const getTweetsById = () => {
  // All tweets include a data attribute 'testid', so we're querying based on that.
  const tweets = document.querySelectorAll("div[data-testid='tweet']");

  // Captures the path segment that follows `/status/`, stopping at the next
  // `/`, `?` or `#` so suffixes like `/photo/1` or `?s=20` are not mistaken
  // for (or glued onto) the ID.
  const statusIdPattern = /\/status\/([^/?#]+)/;

  // `Array.from()` turns the NodeLists into real arrays so array methods and
  // iteration work uniformly.
  const tweetsArray = [];
  for (const tweet of Array.from(tweets)) {
    // Some tweets may contain links to other tweets, so we take the first
    // status link - the one attached to the timestamp comes first.
    const firstTweetLink = Array.from(tweet.querySelectorAll("a")).find(link =>
      link.href.includes("/status/")
    );
    const idMatch = firstTweetLink
      ? statusIdPattern.exec(firstTweetLink.href)
      : null;
    if (!idMatch) {
      continue;
    }

    // Each tweet has two children, the profile picture and the content. We want
    // the latter (`lastChild`). The text section has a `lang` attribute, so
    // we're selecting it based on that. `lastChild` can be missing or a
    // non-element node, in which case the tweet is treated as having no text.
    const content = tweet.lastChild;
    const textElement =
      content && typeof content.querySelector === "function"
        ? content.querySelector('div[lang="en"]')
        : null;

    // The ID travels with the text so duplicates can be removed later.
    tweetsArray.push({
      id: idMatch[1],
      text: textElement ? textElement.innerText : ""
    });
  }

  // Return the tweets array we've generated.
  return tweetsArray;
};

// Stops the webpage crawl and generates the final list of words sorted by
// frequency.
//
// Reads three top-level bindings: `fetchTweetsId` (the interval handle),
// `allTweetsById` (collected `{ id, text }` records, duplicates included) and
// `commonWords` (words to leave out of the result).
//
// Returns an array of `{ word, count }` objects ordered by `count`, highest
// first. It also logs the number of unique tweets to the console.
const stopCrawlAndGenerateList = () => {
  // Stops the interval (loop) that is crawling the page.
  clearInterval(fetchTweetsId);

  // Keep the text of the first record seen for each tweet ID. The Set lookup
  // replaces a `findIndex` scan per tweet, which grew quadratically with the
  // number of collected records.
  const seenIds = new Set();
  const uniqueTweets = [];
  for (const tweet of allTweetsById) {
    if (!seenIds.has(tweet.id)) {
      seenIds.add(tweet.id);
      uniqueTweets.push(tweet.text);
    }
  }

  // Tally every non-common word in a single pass. A Map iterates in insertion
  // order, so words come out in order of first appearance. Counting here also
  // avoids flattening via `concat.apply`, which passes one argument per tweet
  // and can exceed the engine's argument limit on large crawls.
  // Tokenization is intentionally a plain split on single spaces: the
  // `commonWords` list was built from tokens produced that way.
  const excludedWords = new Set(commonWords);
  const countsByWord = new Map();
  for (const text of uniqueTweets) {
    for (const word of text.split(" ")) {
      if (!excludedWords.has(word)) {
        countsByWord.set(word, (countsByWord.get(word) || 0) + 1);
      }
    }
  }

  const wordCounts = Array.from(countsByWord, ([word, count]) => ({
    word,
    count
  }));

  // Prints the number of tweets crawled and returns the words + word counts.
  console.log(`Derived from ${uniqueTweets.length} tweets.`);
  wordCounts.sort((a, b) => b.count - a.count);
  return wordCounts;
};

// Kicks off the crawl. Once per second the callback records whatever tweets
// are on the page and jumps to the bottom so more get loaded. Nothing in here
// ever stops the timer - run `stopCrawlAndGenerateList()` once you've
// collected enough tweets.
const fetchTweetsId = setInterval(function crawlOnce() {
  // Append this tick's tweets to everything gathered so far.
  const visibleTweets = getTweetsById();
  allTweetsById.push(...visibleTweets);

  // Jump to the bottom of the page to load more tweets.
  const pageBottom = document.body.scrollHeight;
  window.scrollTo(0, pageBottom);
}, 1000); // 1000 ms = 1 second between ticks
	// Copy and paste all of this into the console on a page that contains tweets to
	// start collecting. Run `stopCrawlAndGenerateList()` once you've collected enough
	// tweets.

	// Raw list of every `{ id, text }` record collected so far. Each crawl tick
	// appends all currently visible tweets, so the same tweet can appear many
	// times; `stopCrawlAndGenerateList()` removes the duplicates by `id`.
	const allTweetsById = [];

	// Common "words" to filter out based on 4 Twitter accounts' likes.
	// Entries are compared exactly (case-sensitive) against the space-separated
	// tokens of each tweet's text, which is why the list contains tokens with
	// attached punctuation, newlines, or UI residue such as "2Replying".
	// The literal is wrapped only BETWEEN elements: a line break inside a quoted
	// string is a syntax error in JavaScript.
	const commonWords = [
		"Chris","got","of","all","my","on","just","I’ll","have","to","make","a","list","the","little","I","but",
		"also","happy","any","about","are","take","|","for","and","is","built","with","KBLoad","·","day","here",
		"at","long","The","in","it","or","hope","you","enjoy","(and","if","this","been","working","hard","get",
		"when","Williams","2","","ever","not","like","Maybe","some","When","learned","never","even","No","one",
		"other","it,","It's","better","stop","break","than","10","bad","things","be","people","asked","me","try",
		"image73","open","give","If","loving","go","your","new","Show","years","ago","joined","A","lot","more",
		"Just","Matt","Dave","New","It","My","latest","made","Today","first","as","can","finally","into","I'm",
		"We're","week","we","answered","what","good","design","put","case","And","+","show","few","including",
		"proud","that","by","around","our","being","We","these","from","no","think","use","now","learn","leave",
		"work","need","back","making","while","most","they","pay","attention","their","7","group","beautiful",
		"called","views3","MBLoad","school","enough","two","them.","Load","image65","kind","book","That","were",
		"telling","he","was","has","her","white","fall","set","against","during","created","every","5","days",
		"How","how","can't","wait","such","would","keep","saying","they're","because","So","wrote","down",
		"everything","an","still","professional","-","idea","you’re","where","I’m","it’s","Aug","2019","views2",
		"shit","trying","find","world","will","@","moving","content","over","2Replying","famous","last","words",
		"done","I've","come","up","possible","home","table","today","after","months","call","forever","Tweet",
		"wish","could","watch","time","1Replying","Justin","nothing","quite","are.","don’t","know","why","meant",
		"out","gonna","2015","start","based","told","best","great","away","friends","already","buy","want",
		"fucking","added","so","many","Alex","same","thing","really","hate","It’s","too","big","31Replying",
		"remember","them","having","its","can’t","who","three","favorite","love","before","i","help","do",
		"that’s","country","right","tell","image25","GIF1","it.","everyone","image57","Ryan","Not","image69","&",
		"Ben","Sarah","31I","read","confirm","only","doesn’t","copy","thinking","should","minutes","\n\nThe",
		"very","something","then","views1","Steve","amazing","likely","example","history","us","post","women",
		"his","free","In","else","woman","image30","had","mad","man","tried","company","behavior","change",
		"process","direct","nervous","less","much","30Replying","different","1","since","money","own","doing",
		"going","books","hit","chat","Daniel","image36","always","share","piece","/","met","team","it's",
		"thought","included","seems","both","He","experience","business","drink","rules","off","through",
		"companies","written","super","story","well","couple","him","job","real","under","community",
		"opportunity","Is","asking","Nick","helping","seem","shows","please","am","office","month.","I’ve",
		"anything","nice","sitting","reading","listening","tool","me.","become","version","fun","excited",
		"using","eye","across","look","feels","thread","makes","feel","Ashley","29Replying","comes","internet",
		"image40","I’d","used","tweet","hear","someone","now.","word","year","least","once","power","talk",
		"next","David","might","actual","means","pretty","there","currently","13","see","though","Twitter",
		"gets","app","those","isn't","watching","Jared","way","3","sit","Staff","You","don't","place","4","meet",
		"image77","Lee","coming","again","here:","A.","28Replying","that's","did","looking","hiring",
		"understand","folks","may","went","This","actually","ten","month","project","early","KB1","C.",
		"recently","King","that,","totally","support","life","6","seeing","said","live","learning","each","past",
		"we're","another","person","original","GIF3","class","grateful","literally","I'll","you're","reach",
		"sounds","These","often","sure","Sara","color","quickly","27Replying","opinion","let","entire","send",
		"getting","Mark","image75","25I","say","fast","Dan","now,","number","social","media","without","says",
		"car","yet","But","26Replying","seen","complete","works","given","living","personal","images23","Quote",
		"Jessica","25Replying","later","buying","image54","movies","character","started","convinced",
		"important","matter","25","weekend,","care","disagree","threadLoad","this.","Will","several","She","ask",
		"OF","mean","didn't","far","end","day,","expect","older","weeks","body","mass","thank","name","\n","part",
		"huge","plan","does","single","say,","until","probably","movie","Jon","worth","site","building","fix",
		"times","KB14","must","anyone","realized","scene","online","speaking","conference","whole","behind",
		"which","spend","old","Andrew","local","believe","came","Michael","video","straight","All","easy",
		"speak","kids","instead","glad","KB16","almost","sad","KB11","8","write","reason","today,","image58",
		"coffee","needs","talks","members","w/","Being","via","absolutely","top","others","night","twitter",
		"wasn't","allow","small","tomorrow","image37","feeling","signed","needed","12","perfect","miss"
	];

	// Crawls the webpage for all currently visible tweets.
	//
	// Returns an array of `{ id, text }` objects, one per tweet element
	// (`div[data-testid='tweet']`) that has a link whose URL contains
	// `/status/<id>`. `text` is the `innerText` of the tweet's `div[lang="en"]`
	// element, or "" when the tweet has no such element.
	//
	// Tweet elements without a usable status link are skipped instead of throwing,
	// so one unexpected element cannot discard the whole batch for a crawl tick.
	const getTweetsById = () => {
		// All tweets include a data attribute 'testid', so we're querying based on that.
		const tweets = document.querySelectorAll("div[data-testid='tweet']");

		// Captures the path segment that follows `/status/`, stopping at the next
		// `/`, `?` or `#` so suffixes like `/photo/1` or `?s=20` are not mistaken
		// for (or glued onto) the ID.
		const statusIdPattern = /\/status\/([^/?#]+)/;

		// `Array.from()` turns the NodeLists into real arrays so array methods and
		// iteration work uniformly.
		const tweetsArray = [];
		for (const tweet of Array.from(tweets)) {
			// Some tweets may contain links to other tweets, so we take the first
			// status link - the one attached to the timestamp comes first.
			const firstTweetLink = Array.from(tweet.querySelectorAll("a")).find(link =>
				link.href.includes("/status/")
			);
			const idMatch = firstTweetLink
				? statusIdPattern.exec(firstTweetLink.href)
				: null;
			if (!idMatch) {
				continue;
			}

			// Each tweet has two children, the profile picture and the content. We want
			// the latter (`lastChild`). The text section has a `lang` attribute, so
			// we're selecting it based on that. `lastChild` can be missing or a
			// non-element node, in which case the tweet is treated as having no text.
			const content = tweet.lastChild;
			const textElement =
				content && typeof content.querySelector === "function"
					? content.querySelector('div[lang="en"]')
					: null;

			// The ID travels with the text so duplicates can be removed later.
			tweetsArray.push({
				id: idMatch[1],
				text: textElement ? textElement.innerText : ""
			});
		}

		// Return the tweets array we've generated.
		return tweetsArray;
	};

	// Stops the webpage crawl and generates the final list of words sorted by
	// frequency.
	//
	// Reads three top-level bindings: `fetchTweetsId` (the interval handle),
	// `allTweetsById` (collected `{ id, text }` records, duplicates included) and
	// `commonWords` (words to leave out of the result).
	//
	// Returns an array of `{ word, count }` objects ordered by `count`, highest
	// first. It also logs the number of unique tweets to the console.
	const stopCrawlAndGenerateList = () => {
		// Stops the interval (loop) that is crawling the page.
		clearInterval(fetchTweetsId);

		// Keep the text of the first record seen for each tweet ID. The Set lookup
		// replaces a `findIndex` scan per tweet, which grew quadratically with the
		// number of collected records.
		const seenIds = new Set();
		const uniqueTweets = [];
		for (const tweet of allTweetsById) {
			if (!seenIds.has(tweet.id)) {
				seenIds.add(tweet.id);
				uniqueTweets.push(tweet.text);
			}
		}

		// Tally every non-common word in a single pass. A Map iterates in insertion
		// order, so words come out in order of first appearance. Counting here also
		// avoids flattening via `concat.apply`, which passes one argument per tweet
		// and can exceed the engine's argument limit on large crawls.
		// Tokenization is intentionally a plain split on single spaces: the
		// `commonWords` list was built from tokens produced that way.
		const excludedWords = new Set(commonWords);
		const countsByWord = new Map();
		for (const text of uniqueTweets) {
			for (const word of text.split(" ")) {
				if (!excludedWords.has(word)) {
					countsByWord.set(word, (countsByWord.get(word) || 0) + 1);
				}
			}
		}

		const wordCounts = Array.from(countsByWord, ([word, count]) => ({
			word,
			count
		}));

		// Prints the number of tweets crawled and returns the words + word counts.
		console.log(`Derived from ${uniqueTweets.length} tweets.`);
		wordCounts.sort((a, b) => b.count - a.count);
		return wordCounts;
	};

	// Kicks off the crawl. Once per second the callback records whatever tweets
	// are on the page and jumps to the bottom so more get loaded. Nothing in here
	// ever stops the timer - run `stopCrawlAndGenerateList()` once you've
	// collected enough tweets.
	const fetchTweetsId = setInterval(function crawlOnce() {
		// Append this tick's tweets to everything gathered so far.
		const visibleTweets = getTweetsById();
		allTweetsById.push(...visibleTweets);

		// Jump to the bottom of the page to load more tweets.
		const pageBottom = document.body.scrollHeight;
		window.scrollTo(0, pageBottom);
	}, 1000); // 1000 ms = 1 second between ticks