Skip to content

Instantly share code, notes, and snippets.

@vitorio
Created May 12, 2024 07:00
Show Gist options
  • Save vitorio/2c00e0a0dbc0f9abc6c2925d86ac954d to your computer and use it in GitHub Desktop.
Save vitorio/2c00e0a0dbc0f9abc6c2925d86ac954d to your computer and use it in GitHub Desktop.
Naive concordance from Twitter archive
# After you've got a list of words from sorting/filtering the generated CSV,
# copy and paste this into a running Python REPL and keep re-running the
# last line with each word you want to interrogate through the concordance
from nltk.corpus.reader.markdown import MarkdownCorpusReader
from nltk.text import TextCollection
# Folder holding the combined Markdown tweets, and the regex selecting files in it.
tweets_root = '<YOUR COMBINED MARKDOWN TWEETS FOLDER>'
markdown_pattern = r'.*\.md'

# Load the matching files as a corpus and wrap it so concordance() is available.
corpus = MarkdownCorpusReader(tweets_root, markdown_pattern)
text = TextCollection(corpus)

# Re-run this line with each word you want to see in context.
text.concordance("work")
# Pre-process your Twitter archive with https://github.com/timhutton/twitter-archive-parser
# Copy all the Markdown files from the individual `parser-output/tweets-md/YYYY` into a single combined folder
# This generates a CSV with all the words and word frequencies for sorting/filtering, but not their contexts
from nltk.corpus.reader.markdown import MarkdownCorpusReader
from nltk.text import TextCollection
from nltk import FreqDist
import csv
# Load every file in the combined tweets folder whose name matches the pattern.
corpus = MarkdownCorpusReader('<YOUR COMBINED MARKDOWN TWEETS FOLDER>', r'.*\.md')
# Not used by the CSV export below; kept so `text` is still defined afterwards.
text = TextCollection(corpus)

# Lowercase every token so case variants are counted as the same word.
words = [w.lower() for w in corpus.words()]
unique_words = sorted(set(words))
print(len(unique_words))  # number of distinct lowercased words

freqs = FreqDist(words)

# newline='' is required by the csv module (otherwise rows are separated by
# blank lines on Windows); explicit UTF-8 keeps emoji and other non-ASCII
# tokens from failing on platforms whose default encoding is not UTF-8.
with open('freqs.csv', 'w', newline='', encoding='utf-8') as csvfile:
    freqwriter = csv.writer(csvfile)
    # One row per distinct word: word, count.
    freqwriter.writerows(freqs.items())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment