Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# Print most common words in a corpus collected from Twitter
#
# Full description:
# http://marcobonzanini.com/2015/03/02/mining-twitter-data-with-python-part-1/
# http://marcobonzanini.com/2015/03/09/mining-twitter-data-with-python-part-2/
# http://marcobonzanini.com/2015/03/17/mining-twitter-data-with-python-part-3-term-frequencies/
#
# Run:
# python twitter_most_common_words.py <filename.jsonl>
import sys
import json
from collections import Counter
import re
from nltk.corpus import stopwords
import string
# Tokens that can be excluded from frequency counts: NLTK's English stop
# words, every character of string.punctuation, and the tokens 'rt' and 'via'.
# stopwords.words() reads the NLTK "stopwords" corpus.
punctuation = []
punctuation.extend(string.punctuation)  # one list entry per character
stop = (
    stopwords.words('english')
    + punctuation
    + ['rt', 'via']
)
# Emoticon pattern. It is compiled with re.VERBOSE below, so the whitespace
# and the '#' comments inside the string are ignored by the regex engine.
emoticons_str = r"""
(?:
[:=;] # Eyes
[oO\-]? # Nose (optional)
[D\)\]\(\]/\\OpP] # Mouth
)"""

# Token alternatives, joined with '|' below. The regex engine tries them left
# to right, so the specific patterns come before the generic fall-backs.
# Because of re.VERBOSE, the literal '#' in the hash-tag pattern is escaped.
regex_str = [
    emoticons_str,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    # Hash-tags. This needs at least two word characters after the '#', so a
    # one-character tag such as '#a' is split into '#' and 'a' by the
    # fall-back patterns.
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers, e.g. 1,000.50
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words containing - _ or '
    r'(?:[\w_]+)',  # any other run of word characters
    r'(?:\S)',  # any single non-whitespace character
]

# All alternatives sit inside one capturing group, so findall() returns a
# flat list of strings.
tokens_re = re.compile(
    '({0})'.format('|'.join(regex_str)),
    re.VERBOSE | re.IGNORECASE,
)

# The emoticon pattern anchored with ^ and $, for testing a single token.
emoticon_re = re.compile(
    '^{0}$'.format(emoticons_str),
    re.VERBOSE | re.IGNORECASE,
)
def tokenize(s):
    """Return the list of tokens that ``tokens_re`` finds in the string *s*.

    Whitespace between tokens is dropped; the tokens keep their original case.
    """
    matches = tokens_re.findall(s)
    return matches
def preprocess(s, lowercase=False):
    """Tokenize *s*, optionally lower-casing the tokens.

    With ``lowercase=True`` every token is lower-cased except those that
    ``emoticon_re`` recognises, so that emoticons such as ``:D`` keep their
    original form. With the default ``lowercase=False`` the tokens are
    returned exactly as ``tokenize`` produced them.
    """
    tokens = tokenize(s)
    if not lowercase:
        return tokens

    normalized = []
    for tok in tokens:
        if emoticon_re.search(tok):
            # Emoticons are case-sensitive, so leave them untouched.
            normalized.append(tok)
        else:
            normalized.append(tok.lower())
    return normalized
if __name__ == '__main__':
    # Count every token in a JSON-lines file of tweets and print the five
    # most frequent ones as (token, count) pairs.
    #
    # NOTE(review): the module-level `stop` list is not applied here, so
    # stop words and punctuation are included in the counts.
    import io  # local import; io.open accepts `encoding` on Python 2 and 3

    # Without this check a missing argument surfaces as a bare IndexError.
    if len(sys.argv) < 2:
        sys.exit('Usage: python {0} <filename.jsonl>'.format(sys.argv[0]))
    fname = sys.argv[1]

    count_all = Counter()
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # which can fail on the non-ASCII text tweets commonly contain.
    with io.open(fname, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines: json.loads() raises on an empty document.
            if not line.strip():
                continue
            tweet = json.loads(line)
            # Records without a 'text' field (presumably non-tweet stream
            # messages -- confirm against the data) are skipped rather
            # than aborting the whole run with a KeyError.
            text = tweet.get('text')
            if text is None:
                continue
            count_all.update(preprocess(text))
    print(count_all.most_common(5))
@t0in4

This comment has been minimized.

Copy link

@t0in4 t0in4 commented Dec 31, 2015

Hello, I have a problem:
Traceback (most recent call last):
File "counting.py", line 83, in <module>
fname = sys.argv[1]
IndexError: list index out of range

How can I fix this error?

@t0in4

This comment has been minimized.

Copy link

@t0in4 t0in4 commented Dec 31, 2015

Oh, I solved it! sys.argv[1] means that an argument is needed after 'python counting.py' on the command line. It will look like 'python counting.py stream_seeking.json'.

@BFrost888

This comment has been minimized.

Copy link

@BFrost888 BFrost888 commented Jan 7, 2016

Hello,

Using the additional script for bigrams:

from nltk import bigrams
terms_bigram = bigrams(terms_stop)

How can I print the results from terms_bigram?

Thank you.

@Shyamasundar

This comment has been minimized.

Copy link

@Shyamasundar Shyamasundar commented May 4, 2016

Hello BFrost888 ,
I hope the following code solves your problem

`terms_stop = [term for term in featureVector if term not in stopWords]
terms_bigram = bigrams(terms_stop)
for t in terms_bigram:
    print t `
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.