Created
May 12, 2024 07:00
-
-
Save vitorio/2c00e0a0dbc0f9abc6c2925d86ac954d to your computer and use it in GitHub Desktop.
Naive concordance from Twitter archive
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# After you've got a list of words from sorting/filtering the generated CSV,
# copy and paste this into a running Python REPL and keep re-running the
# last line with each word you want to interrogate through the concordance
from nltk.corpus.reader.markdown import MarkdownCorpusReader
from nltk.text import TextCollection

# Every file in the folder whose name matches the regex (i.e. ends in ".md")
# becomes part of the corpus.
corpus = MarkdownCorpusReader('<YOUR COMBINED MARKDOWN TWEETS FOLDER>', r'.*\.md')

# Build this once per session; only the concordance() call below needs to be
# repeated for each new word.
text = TextCollection(corpus)

# Swap "work" for whichever word you want to see in context.
text.concordance("work")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pre-process your Twitter archive with https://github.com/timhutton/twitter-archive-parser
# Copy all the Markdown files from the individual `parser-output/tweets-md/YYYY` folders into a single combined folder
# This generates a CSV with all the words and word frequencies for sorting/filtering, but not their contexts
import csv

from nltk import FreqDist
from nltk.corpus.reader.markdown import MarkdownCorpusReader
from nltk.text import TextCollection

# Every file in the folder whose name matches the regex (i.e. ends in ".md")
# becomes part of the corpus.
corpus = MarkdownCorpusReader('<YOUR COMBINED MARKDOWN TWEETS FOLDER>', r'.*\.md')

# NOTE(review): `text` is not used by anything below in this script; it is
# kept so the name still exists if this is pasted into a REPL session.
text = TextCollection(corpus)

# Lower-case every token so "Work" and "work" are counted as the same word.
words = [w.lower() for w in corpus.words()]
unique_words = sorted(set(words))
print(len(unique_words))

freqs = FreqDist(words)

# newline='' is what the csv module expects of the file object (otherwise
# extra blank rows appear on Windows); the explicit UTF-8 encoding keeps
# emoji and non-Latin words from failing under a non-UTF-8 default locale.
with open('freqs.csv', 'w', newline='', encoding='utf-8') as csvfile:
    freqwriter = csv.writer(csvfile)
    # One row per distinct word: word, count.
    freqwriter.writerows(freqs.items())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment