Skip to content

Instantly share code, notes, and snippets.

@driscoll
Created October 18, 2011 00:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save driscoll/1294324 to your computer and use it in GitHub Desktop.
Save driscoll/1294324 to your computer and use it in GitHub Desktop.
Gnip JSON output parsing script (DRAFT: 17 Oct 2011)
#!/usr/bin/python
""" TODO
Need a better search strategy: plain substring matching lets the
keyword 'ows' match '#throwstrikes'. Use a blacklist?
Why are blank lines being reported with "No text field found."?
Need a progress bar of some sort; a run takes forever.
"""
import sys
import re
# Captures the contents of the first "text" field on a raw JSON line.
# Backslash escapes (for example \") are consumed as pairs, so an escaped
# quote inside the tweet does not end the capture early.
RE_TEXT_FIELD = re.compile(r'"text":"((?:[^"\\]|\\.)*)"')
# Terms known to cause false keyword hits: the keyword 'ows' is a
# substring of '#throwstrikes'.
BLACKLIST = ['throwstrikes']
def die():
    """Write the command-line usage message to stderr and exit with status 1."""
    # Usage goes to stderr so it never mixes with the frequency report
    # that the script prints on stdout.
    sys.stderr.write(
        "Usage: {0} keywordfn.txt gnipfn.txt outputfn.json\n".format(sys.argv[0]))
    sys.exit(1)
def get_keywords(filename):
    """Read search keywords from a text file, one keyword per line.

    Each line is stripped of surrounding whitespace and lower-cased.
    Blank lines are skipped: an empty keyword is a substring of every
    string, so it would make every tweet count as a match.

    filename -- path of the keyword file

    Return the list of keywords in file order.  If the file cannot be
    read, report the error and exit through die().
    """
    keywords = []
    try:
        with open(filename, 'r') as keywordf:
            for line in keywordf:
                keyword = line.strip().lower()
                if keyword:
                    keywords.append(keyword)
    except (IOError, OSError):
        sys.stderr.write("Error reading keyword file.\n")
        die()
    return keywords
def parse_tweets(keywords, gnipfn, outfn, verbose=False):
    """Scan the Gnip file for tweets that mention any keyword.

    Each non-blank line of gnipfn is searched with RE_TEXT_FIELD for its
    "text" field.  The text is lower-cased, BLACKLIST terms are masked
    out, and every keyword is then looked up as a plain substring.  A
    line that matches at least one keyword is appended to outfn
    immediately, so results can be followed while the run is in progress.

    keywords -- iterable of keyword strings (matched case-insensitively)
    gnipfn   -- path of the input file, one record per line
    outfn    -- path of the output file, opened in append mode
    verbose  -- if true, print each matched keyword and the tweet text

    Return a dict mapping each keyword to the number of lines it matched.
    If either file cannot be opened, report the error and exit via die().
    """
    # Count each distinct keyword at most once per tweet, even when the
    # keyword list contains repeats.
    frequency = {}
    needles = []
    for kw in keywords:
        if kw not in frequency:
            frequency[kw] = 0
            needles.append((kw, kw.lower()))
    masked_terms = [term.lower() for term in BLACKLIST]

    # Open the input first so a bad input path does not leave behind an
    # empty, freshly created output file.
    try:
        gnipf = open(gnipfn, 'r')
    except (IOError, OSError):
        sys.stderr.write("Error opening file with tweets from Gnip.\n")
        die()
    try:
        outputf = open(outfn, 'a')
    except (IOError, OSError):
        gnipf.close()
        sys.stderr.write("Error opening output file.\n")
        die()

    try:
        for line in gnipf:
            if not line.strip():
                # Blank lines carry no record; skip them quietly.
                continue
            # Extract the text once per line, not once per keyword.
            match = RE_TEXT_FIELD.search(line)
            if match is None:
                sys.stderr.write(line.rstrip("\n") + "\n")
                sys.stderr.write("No text field found.\n")
                continue
            text = match.group(1)
            haystack = text.lower()
            # Replace blacklisted terms with a space (not nothing) so the
            # text on either side cannot join up into a new false match.
            for term in masked_terms:
                haystack = haystack.replace(term, " ")
            found = False
            for kw, needle in needles:
                if needle in haystack:
                    frequency[kw] += 1
                    if verbose:
                        print(kw)
                    found = True
            if found:
                if verbose:
                    print(text)
                outputf.write(line)
                outputf.flush()
    finally:
        gnipf.close()
        outputf.close()
    return frequency
def report_freq(keywfreq):
    """Print a table of keyword:frequency pairs.

    keywfreq -- dict mapping keyword to its match count

    The first line shows the sum of all counts; one row per keyword
    follows, sorted by keyword so the output order is reproducible.
    """
    # NOTE: this is the sum of the per-keyword counts.  When the counts
    # come from parse_tweets, a tweet that matches several keywords
    # contributes once for each of them.
    total = sum(keywfreq.values())
    print("Found {0} total matching tweets.".format(total))
    for kw, freq in sorted(keywfreq.items()):
        print("{0:16}: {1}".format(kw, freq))
if __name__ == '__main__':
    # Three positional arguments are required: the keyword file, the
    # Gnip dump to scan, and the file that receives matching lines.
    if len(sys.argv) < 4:
        die()
    keyword_path = sys.argv[1]
    gnip_path = sys.argv[2]
    output_path = sys.argv[3]
    search_terms = get_keywords(keyword_path)
    keywfreq = parse_tweets(search_terms, gnip_path, output_path, verbose=False)
    report_freq(keywfreq)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment