Created
December 29, 2015 14:59
-
-
Save bonzanini/3fdc080258fc53bcd3fa to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Print most common words in a corpus collected from Twitter | |
# | |
# Full description: | |
# http://marcobonzanini.com/2015/03/02/mining-twitter-data-with-python-part-1/ | |
# http://marcobonzanini.com/2015/03/09/mining-twitter-data-with-python-part-2/ | |
# http://marcobonzanini.com/2015/03/17/mining-twitter-data-with-python-part-3-term-frequencies/ | |
# | |
# Run: | |
# python twitter_most_common_words.py <filename.jsonl> | |
import sys | |
import json | |
from collections import Counter | |
import re | |
from nltk.corpus import stopwords | |
import string | |
# Tokens to ignore downstream: ASCII punctuation, English stopwords, and
# Twitter-specific noise ('rt' = retweet marker, 'via' = attribution).
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via']
# Verbose-mode regex fragment matching simple Western emoticons, e.g. :-) ;D =(
emoticons_str = r"""
(?:
[:=;] # Eyes
[oO\-]? # Nose (optional)
[D\)\]\(\]/\\OpP] # Mouth
)"""
# Ordered list of token patterns; order matters because the combined regex
# tries alternatives left to right (emoticons before generic words, etc.).
regex_str = [
emoticons_str,
r'<[^>]+>', # HTML tags
r'(?:@[\w_]+)', # @-mentions
r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
r'(?:[\w_]+)', # other words
r'(?:\S)' # anything else
]
# tokens_re: one big alternation used by tokenize(); re.VERBOSE lets the
# emoticon fragment keep its inline comments, re.IGNORECASE makes the
# lowercase-only word patterns match uppercase text too.
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
# emoticon_re: anchored version used to test whether a whole token is an
# emoticon (so preprocess() can skip lowercasing it).
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
def tokenize(s):
    """Return the list of tokens found in the tweet text *s*.

    Relies on the module-level compiled pattern ``tokens_re``; each match
    of the alternation becomes one token, in order of appearance.
    """
    matches = tokens_re.findall(s)
    return matches
def preprocess(s, lowercase=False):
    """Tokenize tweet text *s*.

    When *lowercase* is true, every token is lowercased except tokens that
    look like emoticons (per ``emoticon_re``), which keep their case so
    e.g. ':D' is not turned into ':d'.
    """
    toks = tokenize(s)
    if not lowercase:
        return toks
    normalized = []
    for tok in toks:
        if emoticon_re.search(tok):
            normalized.append(tok)
        else:
            normalized.append(tok.lower())
    return normalized
if __name__ == '__main__':
    # Usage: python twitter_most_common_words.py <filename.jsonl>
    # The input is JSON Lines: one tweet object per line.
    fname = sys.argv[1]
    count_all = Counter()
    # Tweets routinely contain non-ASCII text, so pin the encoding instead
    # of relying on the platform's locale default.
    with open(fname, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines, which would otherwise crash json.loads().
            if not line.strip():
                continue
            tweet = json.loads(line)
            # NOTE(review): the module-level `stop` list is built but never
            # applied here, so stopwords and punctuation are counted too.
            # If filtering is intended, drop tokens with `tok not in stop`
            # — confirm against the accompanying blog posts.
            tokens = preprocess(tweet['text'])
            count_all.update(tokens)
    # Report the 5 most frequent tokens as (token, count) pairs.
    print(count_all.most_common(5))
Hello BFrost888,
I hope the following code solves your problem:
`terms_stop = [term for term in featureVector if term not in stopWords]
terms_bigram = bigrams(terms_stop)
for t in terms_bigram:
    print(t)`
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hello,
Using the additional script for bigrams:
from nltk import bigrams
terms_bigram = bigrams(terms_stop)
How can I print the results from terms_bigram?
Thank you.