Skip to content

Instantly share code, notes, and snippets.

@brendano
Last active August 29, 2015 14:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save brendano/6d5e728c2260d6313586 to your computer and use it in GitHub Desktop.
Save brendano/6d5e728c2260d6313586 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import sys,os,re,json
from hose_util import iterate, lookup
# import geodb
# country_db = geodb.GeoDB.load_geojson_files(['/home/brenocon/geocode/tm_world_borders-0.3.json'])
# One optionally signed decimal number: 1-3 integer digits, a dot, and at
# least three fractional digits (captured as a group).
OneCoord = r'([-+]?\d{1,3}\.\d{3,})'
# A comma optionally followed by one space.
Separator= r', ?'
# Two coordinates separated by Separator; the two groups are the raw strings.
LatLong = re.compile(Separator.join((OneCoord, OneCoord)), flags=re.U)
# When true, rejected tweets are printed too and accepted lines get an
# 'ACCEPT' prefix.
VERBOSE = False
def reject(reason):
    """Print a ``REJECT:<reason>`` line for the tweet currently being processed.

    The line is the reject key, a tab, and the module-level ``raw`` string
    (rebound by the main loop on every iteration). Nothing is printed unless
    the module-level VERBOSE flag is true.
    """
    # `raw` and VERBOSE are only read here, so no `global` declaration is needed.
    if VERBOSE:
        print("%s\t%s" % ("REJECT:" + reason, raw))
# Main filter loop. For every tweet read from stdin, find a coordinate pair --
# taken from the tweet's 'geo' Point when present, otherwise parsed out of the
# user's profile location text -- and print a JSON summary, a tab, and the raw
# tweet. Tweets without usable coordinates are skipped (and reported through
# reject() when VERBOSE is on).
for raw,tweet in iterate(raw=True):
    lat = None
    lon = None
    loc_type = None
    # Looked up once: used both for regex matching and for the output record.
    user_location = lookup(tweet, 'user.location')

    geo = lookup(tweet, 'geo')
    if isinstance(geo, dict) and geo.get('type') == 'Point':
        # The pair is unpacked as (lat, lon) -- assumes that is the order the
        # 'geo' field uses; confirm against the feed if in doubt.
        try:
            lat,lon = geo['coordinates']
        except (KeyError, TypeError, ValueError):
            # Missing, null, or wrong-length coordinates: skip this tweet
            # instead of crashing the whole stream.
            reject("JUNK GEO\t" + json.dumps(geo))
            continue
        loc_type = 'OFFICIAL'
    else:
        loc = user_location.strip()
        if not loc:
            reject("NO USERLOC")
            continue
        loc_bytes = loc.encode('utf8')
        m = LatLong.search(loc_bytes)
        if not m:
            reject("NO GEO REGEX ::: " + loc_bytes.replace('\t',' ').replace('\n',' '))
            continue
        lat,lon = m.groups()
        loc_type = 'REGEX'

    try:
        lat = float(lat)
        lon = float(lon)
    except (TypeError, ValueError):
        # TypeError covers non-string, non-numeric values such as None.
        reject("JUNK GEO\t" + json.dumps([lat,lon]))
        continue

    # (0, 0) is treated as a placeholder rather than a real position. The
    # chained comparisons are False for NaN, so NaN is rejected as well.
    in_range = (-90 <= lat <= 90) and (-180 <= lon <= 180)
    if (lat,lon)==(0,0) or not in_range:
        reject("JUNK GEO\t" + json.dumps([lat,lon]))
        continue

    # country = country_db.query_point(lon,lat)
    # country = country and country['properties']['ISO3']
    record = {
        'lonlat': [lon,lat],  # note: longitude first in the output
        'loc_type': loc_type,
        'user_location': user_location,
        # 'country_iso3': country,
    }
    out = [json.dumps(record), raw]
    if VERBOSE: out = ['ACCEPT'] + out
    print('\t'.join(out))
#!/usr/bin/env python
"""
hose_util.iterate() is the most useful thing here.
hose_util.lookup() might be slightly useful.
"""
import sys,time,re
from datetime import datetime
from collections import defaultdict
import json
# Try to get a faster json library, falling back to the stdlib 'json' that is
# already imported above.
# ujson is fastest .. used to have a bug with >32bit integers, but seems fixed now.
# simplejson is usually slightly faster than 'json'.
# yajl has memory leak with encoding, but decoding seems ok
try:
    # A plain import rebinds the module-level name 'json'; no exec is needed.
    import ujson as json
except ImportError:
    pass
def iterate(raw=False, filters=True):
    """
    Iterate through tweets on stdin, skipping junk.
    Yields the tweets as dicts, with a few new created_at_* keys
    (created_at_datetime, created_at_ymd, created_at_iso).

    Lines that are not valid JSON, that do not decode to a dict, or whose
    'created_at' value cannot be parsed are skipped silently.

    Options:
    - raw: if True, yield (rawtweetstring, tweetdict) pairs.  Good for
      filtering applications.  The raw string is the input line, stripped.
    - filters: if True, skip nonmessage events (deletions), i.e. objects
      lacking a 'text' or 'created_at' key.
    """
    for line in sys.stdin:
        # Only the parse itself is guarded, so unrelated errors are not hidden.
        try:
            myjson = json.loads(line)
        except ValueError:
            continue
        if not isinstance(myjson, dict):
            continue
        if filters and ('text' not in myjson or 'created_at' not in myjson):
            continue
        if 'created_at' in myjson:
            try:
                d = get_date(myjson)
                ymd = d.strftime("%Y-%m-%d")
                iso = d.strftime("%Y-%m-%dT%H:%M:%S")
            except (TypeError, ValueError):
                # ValueError: unparseable timestamp; TypeError: the value is
                # not a string at all (e.g. null).  Either way, junk.
                continue
            myjson['created_at_datetime'] = d
            myjson['created_at_ymd'] = ymd
            myjson['created_at_iso'] = iso
        if raw:
            yield line.strip(), myjson
        else:
            yield myjson
def lookup(myjson, k):
    """Fetch a value from a decoded tweet, with "" as the missing-value default.

    - A plain key returns ``myjson.get(k, "")`` unchanged (so a stored None
      or 0 is returned as-is).
    - A dotted key such as 'user.screen_name' is followed one level at a
      time.  Any falsy result, a missing key, or an intermediate value that
      is not a mapping (e.g. ``"user": null``) yields "".
    """
    if '.' not in k:
        return myjson.get(k,"")
    v = myjson
    for part in k.split('.'):
        try:
            v = v.get(part, {})
        except AttributeError:
            # Hit a non-mapping (None, string, number, list) mid-path.
            return ""
    return v or ""
def parse_date(twitter_lame_datetime_string):
    """Parse a 'created_at' style string into a ``time.struct_time``.

    Expected form: 'Wed Aug 27 13:08:45 +0000 2008'.  The offset must be the
    literal '+0000'.  Raises ValueError when the string does not match.
    """
    return time.strptime(twitter_lame_datetime_string, "%a %b %d %H:%M:%S +0000 %Y")
def get_date(myjson_object):
    """Return the tweet's 'created_at' as a naive ``datetime``, or None if absent."""
    if 'created_at' not in myjson_object: return None
    # Take only (year, month, day, hour, minute, second).  The seventh
    # struct_time field is tm_wday, which must not be passed as microseconds.
    return datetime(*parse_date(myjson_object['created_at'])[:6])
# One or more consecutive spaces, tabs, carriage returns or newlines.
WS = re.compile(r'[ \t\r\n]+', re.U)
def ws_norm(s):
    """Collapse every whitespace run in ``s`` to one space.

    The input goes through stringify() first and the result is returned
    through unicodify().
    """
    as_bytes = stringify(s)
    collapsed = WS.sub(' ', as_bytes)
    return unicodify(collapsed)
def unicodify(s, encoding='utf8', *args):
    """Return ``s`` as a ``unicode`` object (Python 2).

    unicode values pass through untouched, byte strings are decoded with
    ``encoding`` (extra positional args are forwarded to ``decode``), and
    anything else is converted with ``unicode(s)``.
    """
    if isinstance(s, unicode):
        result = s
    elif isinstance(s, str):
        result = s.decode(encoding, *args)
    else:
        result = unicode(s)
    return result
def stringify(s, encoding='utf8', *args):
    """Return ``s`` as a ``str`` (a byte string on Python 2).

    str values pass through untouched, unicode values are encoded with
    ``encoding`` (extra positional args are forwarded to ``encode``), and
    anything else is converted with ``str(s)``.
    """
    if isinstance(s, str):
        result = s
    elif isinstance(s, unicode):
        result = s.encode(encoding, *args)
    else:
        result = str(s)
    return result
def tabjoin(*args):
    """Join the arguments with tabs and return the line encoded as UTF-8.

    Every argument is passed through unicodify() before joining.
    """
    cells = [unicodify(cell) for cell in args]
    line = u'\t'.join(cells)
    return line.encode('utf-8')
def uniq_c(seq):
    """Count how often each item occurs in ``seq``.

    Returns a plain dict mapping item -> number of occurrences; an empty
    input gives an empty dict.
    """
    counts = {}
    for item in seq:
        counts[item] = counts.get(item, 0) + 1
    return counts
def commaize(num):
    """Format ``num`` with commas between groups of three digits.

    ``num`` is converted with ``str()``.  A leading sign and anything from the
    first '.' onward are left alone, so only the integer digits are grouped:
    1234567 -> '1,234,567', -1234 -> '-1,234', 1234.5 -> '1,234.5'.
    """
    text = str(num)
    sign = ""
    if text[:1] in ("-", "+"):
        # Keep the sign out of the grouping (otherwise -123 became '-,123').
        sign, text = text[0], text[1:]
    whole, dot, frac = text.partition(".")
    blocks = []
    while whole:
        blocks.append(whole[-3:])
        whole = whole[:-3]
    return sign + ",".join(reversed(blocks)) + dot + frac
if __name__=='__main__':
    # Demo / command-line use: for each tweet on stdin print a few
    # whitespace-normalized fields followed by the raw JSON line, tab-separated.
    # print>>sys.stderr, "json module is", json
    fields = ['id','created_at_iso','user.screen_name','text']
    for raw,tweet in iterate(raw=True):
        record = [ws_norm(unicodify(lookup(tweet, f))) for f in fields]
        record.append(raw)
        # tabjoin() converts every cell (including the raw byte string) to
        # unicode before joining, so a raw line containing non-ASCII bytes
        # is decoded as UTF-8 instead of failing an implicit ASCII decode.
        print(tabjoin(*record))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment