Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
# Print most common words in a corpus collected from Twitter
# Full description:
# Run:
# python <script.py> <filename.jsonl>
import sys
import json
from collections import Counter
import re
from nltk.corpus import stopwords
import string
# Terms to ignore when counting: NLTK's English stop words, every character
# in string.punctuation, and the Twitter markers 'rt' and 'via'.
punctuation = [mark for mark in string.punctuation]
stop = (
    stopwords.words('english')
    + punctuation
    + ['rt', 'via']
)
# Verbose-mode pattern for a simple emoticon such as :)  ;-D  =P.
# It is wrapped in a non-capturing group so that it can be OR-ed with the
# other token patterns below; the closing parenthesis sits on its own line so
# the trailing "# Mouth" comment (verbose mode) does not swallow it.
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""

# Token patterns joined with "|" below. Alternatives are tried left to right,
# so emoticons must come first: otherwise ":)" would fall through to the
# single-character catch-all and be split into ":" and ")".
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

# Matches one token at a time; the single capturing group makes findall()
# return plain strings.
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
# Matches only when the whole string is an emoticon.
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
def tokenize(s):
    """Return every match of ``tokens_re`` in the string *s*, in order."""
    return tokens_re.findall(s)
def preprocess(s, lowercase=False):
    """Tokenize *s*, optionally lower-casing the tokens.

    Args:
        s: Text to tokenize.
        lowercase: When true, lower-case every token except those matched by
            ``emoticon_re``, so that an emoticon such as ``:D`` keeps its case.

    Returns:
        The list of tokens produced by ``tokenize``.
    """
    tokens = tokenize(s)
    if lowercase:
        # Emoticons are case-sensitive (":D" is not ":d"), so leave them as is.
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
if __name__ == '__main__':
    # The corpus path is a required argument; fail with a usage hint instead
    # of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit('Usage: python %s <filename.jsonl>' % sys.argv[0])
    fname = sys.argv[1]
    with open(fname, 'r') as f:
        count_all = Counter()
        # One JSON document per line.
        for line in f:
            if not line.strip():
                # Skip blank lines rather than letting json.loads fail on them.
                continue
            tweet = json.loads(line)
            tokens = preprocess(tweet['text'])
            # Accumulate term frequencies across the whole corpus.
            count_all.update(tokens)
        # NOTE(review): the number of terms shown (5) is a chosen default;
        # adjust as needed.
        print(count_all.most_common(5))
Copy link

t0in4 commented Dec 31, 2015

Hello, I have a problem:
Traceback (most recent call last):
File "", line 83, in
fname = sys.argv[1]
IndexError: list index out of range

how can i fix this error?

Copy link

t0in4 commented Dec 31, 2015

Oh, I solved that! sys.argv[1] means that an argument is needed after 'python' on the command line. It will look like 'python stream_seeking.json'

Copy link


Using the additional script for bigrams:

from nltk import bigrams
terms_bigram = bigrams(terms_stop)

How can I print the results from terms_bigram?

Thank you.

Copy link

Hello BFrost888,
I hope the following code solves your problem:

`terms_stop = [term for term in featureVector if term not in stopWords]
terms_bigram = bigrams(terms_stop)
for t in terms_bigram:
    print t `

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment