Created
December 29, 2015 14:59
-
-
Save bonzanini/3fdc080258fc53bcd3fa to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Print most common words in a corpus collected from Twitter | |
# | |
# Full description: | |
# http://marcobonzanini.com/2015/03/02/mining-twitter-data-with-python-part-1/ | |
# http://marcobonzanini.com/2015/03/09/mining-twitter-data-with-python-part-2/ | |
# http://marcobonzanini.com/2015/03/17/mining-twitter-data-with-python-part-3-term-frequencies/ | |
# | |
# Run: | |
# python twitter_most_common_words.py <filename.jsonl> | |
import sys | |
import json | |
from collections import Counter | |
import re | |
from nltk.corpus import stopwords | |
import string | |
# Tokens to ignore downstream: ASCII punctuation, English stopwords, and
# Twitter-specific noise ('rt' = retweet marker, 'via' = attribution).
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via']
# Verbose-mode regex fragment matching simple Western emoticons, e.g. :-) ;D =(
emoticons_str = r"""
(?:
[:=;] # Eyes
[oO\-]? # Nose (optional)
[D\)\]\(\]/\\OpP] # Mouth
)"""
# Ordered list of token patterns; order matters because the combined regex
# tries alternatives left to right (emoticons before generic words, etc.).
regex_str = [
emoticons_str,
r'<[^>]+>', # HTML tags
r'(?:@[\w_]+)', # @-mentions
r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
r'(?:[\w_]+)', # other words
r'(?:\S)' # anything else
]
# tokens_re: one big alternation used by tokenize(); re.VERBOSE lets the
# emoticon fragment keep its inline comments, re.IGNORECASE makes the
# lowercase-only word patterns match uppercase text too.
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
# emoticon_re: anchored version used to test whether a whole token is an
# emoticon (so preprocess() can skip lowercasing it).
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
def tokenize(s):
    """Return the list of tokens found in the tweet text *s*.

    Relies on the module-level compiled pattern ``tokens_re``; each match
    of the alternation becomes one token, in order of appearance.
    """
    matches = tokens_re.findall(s)
    return matches
def preprocess(s, lowercase=False):
    """Tokenize tweet text *s*.

    When *lowercase* is true, every token is lowercased except tokens that
    look like emoticons (per ``emoticon_re``), which keep their case so
    e.g. ':D' is not turned into ':d'.
    """
    toks = tokenize(s)
    if not lowercase:
        return toks
    normalized = []
    for tok in toks:
        if emoticon_re.search(tok):
            normalized.append(tok)
        else:
            normalized.append(tok.lower())
    return normalized
if __name__ == '__main__':
    # Usage: python twitter_most_common_words.py <filename.jsonl>
    # The input is JSON Lines: one tweet object per line.
    fname = sys.argv[1]
    count_all = Counter()
    # Tweets routinely contain non-ASCII text, so pin the encoding instead
    # of relying on the platform's locale default.
    with open(fname, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines, which would otherwise crash json.loads().
            if not line.strip():
                continue
            tweet = json.loads(line)
            # NOTE(review): the module-level `stop` list is built but never
            # applied here, so stopwords and punctuation are counted too.
            # If filtering is intended, drop tokens with `tok not in stop`
            # — confirm against the accompanying blog posts.
            tokens = preprocess(tweet['text'])
            count_all.update(tokens)
    # Report the 5 most frequent tokens as (token, count) pairs.
    print(count_all.most_common(5))
Hello BFrost888,
I hope the following code solves your problem:
`terms_stop = [term for term in featureVector if term not in stopWords]
terms_bigram = bigrams(terms_stop)
for t in terms_bigram:
    print(t)`
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hello,
Using the additional script for bigrams:
from nltk import bigrams
terms_bigram = bigrams(terms_stop)
How can I print the results from terms_bigram?
Thank you.