# tweet_word_frequency.py (forked from revox/tweet_word_frequency.py)
import nltk, sys, csv
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from datetime import datetime
import pandas as pd
import numpy as np
import string
from collections import Counter
# function to tokenize a text: 1. lowercase, 2. tokenize, 3. stopwords removal, 4. digits removal
def process(text, tokenizer=TweetTokenizer(), stopwords=[]):
    text = text.lower()
    tokens = tokenizer.tokenize(text)
    return [word for word in tokens if word not in stopwords and not word.isdigit()]
# *** word frequency mining ***
# tokenizer
tweet_tokenizer = TweetTokenizer()
# punctuation list
punct = list(string.punctuation)
# download the NLTK English stop word list
nltk.download('stopwords')
# list of stop words and punctuations
stopword_list = stopwords.words('english') + punct + ['rt', 'via']
# record the number of occurrences for each word
tf = Counter()
all_dates = []
with open('brexit_data.csv', 'r', newline='') as inputfile:
    tweetreader = csv.reader(inputfile, delimiter='|')
    # get the text (column 2) and the time (column 1) from each row
    for row in tweetreader:
        message = row[2]
        tokens = process(text=message, tokenizer=tweet_tokenizer, stopwords=stopword_list)
        all_dates.append(row[1])
        # update word frequency counts with this tweet's tokens
        tf.update(tokens)
# convert the counter to a sorted list (tf_list_sorted is a list of (word, count) tuples)
tf_list_sorted = sorted(tf.items(), key = lambda pair: pair[1], reverse = True)
# print each word and its frequency, and write them to a CSV file
with open('text_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    for item in tf_list_sorted:
        print(item[0], item[1])
        csvwriter.writerow([item[0], item[1]])
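# Illustrative example (an assumption about the input layout, which is not shown
# in this gist): brexit_data.csv is pipe-delimited, with the tweet id in column 0,
# the timestamp in column 1 and the tweet text in column 2, e.g.
#
#   1234567890|2019-03-29 12:00:00|Brexit means Brexit , they said
#
# For that single row, the tokens kept after lowercasing and removing stop words,
# punctuation and digits would be ['brexit', 'means', 'brexit', 'said'], so
# text_data.csv would contain lines such as:
#
#   brexit,2
#   means,1
#   said,1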