#!/usr/bin/env python
import sys, os, re, json
from hose_util import iterate, lookup

# import geodb
# country_db = geodb.GeoDB.load_geojson_files(['/home/brenocon/geocode/tm_world_borders-0.3.json'])
# A coordinate pair in free text, e.g. "40.712800, -74.006000".
# At least three decimal places are required on each number.
OneCoord = r'([-+]?\d{1,3}\.\d{3,})'
Separator = r', ?'
LatLong = re.compile(OneCoord + Separator + OneCoord, re.U)

VERBOSE = False
def reject(reason):
    """In VERBOSE mode, log a rejected tweet with the reason; otherwise a no-op."""
    global raw, tweet
    if not VERBOSE: return
    reject_key = "REJECT:" + reason
    output = "%s\t%s" % (reject_key, raw)
    print output
for raw, tweet in iterate(raw=True):
    lat = None
    lon = None
    loc_type = None
    # Prefer the official geotag when present.
    geo = lookup(tweet, 'geo')
    if geo and geo['type'] == 'Point':
        lat, lon = geo['coordinates']
        loc_type = 'OFFICIAL'
    else:
        # Fall back to regexing coordinates out of the free-text user location.
        loc = lookup(tweet, 'user.location').strip()
        if not loc:
            reject("NO USERLOC")
            continue
        m = LatLong.search(loc.encode('utf8'))
        if not m:
            reject("NO GEO REGEX ::: " + loc.encode('utf8').replace('\t', ' ').replace('\n', ' '))
            continue
        lat, lon = m.groups()
        loc_type = 'REGEX'
    try:
        lat = float(lat)
        lon = float(lon)
    except ValueError:
        # reject("JUNK GEO\t" + json.dumps([lat, lon]))
        continue
    # (0,0) and out-of-range values are junk coordinates.
    if (lat, lon) == (0, 0) or lat < -90 or lat > 90 or lon < -180 or lon > 180:
        reject("JUNK GEO\t" + json.dumps([lat, lon]))
        continue
    # country = country_db.query_point(lon, lat)
    # country = country and country['properties']['ISO3']
    record = {}
    record['lonlat'] = [lon, lat]
    record['loc_type'] = loc_type
    record['user_location'] = lookup(tweet, 'user.location')
    # record['country_iso3'] = country
    out = [
        json.dumps(record),
        raw,
    ]
    if VERBOSE: out = ['ACCEPT'] + out
    print '\t'.join(out)
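For a sense of what the REGEX fallback accepts, here is a minimal standalone sketch of the same pattern; the location strings below are made up:

import re
OneCoord = r'([-+]?\d{1,3}\.\d{3,})'
LatLong = re.compile(OneCoord + r', ?' + OneCoord, re.U)
for s in ['40.712800, -74.006000',  # full-precision pair: matches
          '40.7, -74.0',            # too few decimal places: no match
          'New York, NY']:          # no coordinates at all: no match
    m = LatLong.search(s)
    print s, '->', (m.groups() if m else None)

The filter itself reads tweets on stdin and writes accepted ones to stdout, so a typical run looks like python geofilter.py < tweets.json (filename hypothetical).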
hose_util.py
#!/usr/bin/env python
"""
hose_util.iterate() is the most useful thing here.
hose_util.lookup() might be slightly useful.
"""
import sys, time, re
from datetime import datetime
from collections import defaultdict
import json

# Try to get a faster json library.
# ujson is fastest; it used to have a bug with >32-bit integers, but that seems fixed now.
# simplejson is usually slightly faster than 'json'.
# yajl has a memory leak with encoding, but decoding seems OK.
try:
    import ujson as json
except ImportError:
    pass
def iterate(raw=False, filters=True):
    """
    Iterate through tweets on stdin, skipping junk.
    Yields the tweets as dicts, with a few new created_at_* keys.
    Options:
      - raw: if True, yield (rawtweetstring, tweetdict) pairs. Good for
        filtering applications.
      - filters: if True, skip nonmessage events (deletions)
    """
    num_valid = 0
    has_id = 0
    seen_dates = set()
    for i, line in enumerate(sys.stdin):
        try:
            myjson = json.loads(line)
            if not isinstance(myjson, dict): raise ValueError("bad line")
            num_valid += 1
            has_id += ('id' in myjson)
            if filters and ('text' not in myjson or 'created_at' not in myjson):
                continue
            if 'created_at' in myjson:
                d = get_date(myjson)
                ymd = d.strftime("%Y-%m-%d")
                seen_dates.add(ymd)
                myjson['created_at_datetime'] = d
                myjson['created_at_ymd'] = ymd
                myjson['created_at_iso'] = d.strftime("%Y-%m-%dT%H:%M:%S")
            if raw:
                yield line.strip(), myjson
            else:
                yield myjson
        except ValueError:
            pass
            # print>>sys.stderr, "bad myjson object: ", line
        # if ((i+1) % int(1e6)) == 0:
        #     print>>sys.stderr, "TWEET ITER: %d processed, %d valid, %d with id" % (i+1, num_valid, has_id)
        #     if len(seen_dates) > 10:
        #         print>>sys.stderr, "\t%d seen dates" % (len(seen_dates))
        #     else:
        #         print>>sys.stderr, "\tSeen dates: %s" % (sorted(list(seen_dates)),)
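# Typical use, from a filter script reading one JSON tweet per line on stdin
# (a sketch; 'created_at_ymd' is one of the keys iterate() adds):
#   for tweet in iterate():
#       print tweet['created_at_ymd'], lookup(tweet, 'user.screen_name')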
def lookup(myjson, k):
    # Dotted keys walk nested dicts; any missing key along the path yields "".
    if '.' in k:
        ks = k.split('.')
        v = myjson
        for k in ks: v = v.get(k, {})
        return v or ""
    return myjson.get(k, "")
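# e.g. lookup(tweet, 'user.location') returns tweet['user']['location'],
# or '' if the user object or its location field is absent.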
def parse_date(twitter_lame_datetime_string):
    # e.g. the 'created_at' field
    return time.strptime(twitter_lame_datetime_string, "%a %b %d %H:%M:%S +0000 %Y")
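# A worked example with a made-up but well-formed 'created_at' value:
#   parse_date("Wed Aug 27 13:08:45 +0000 2008")
#   -> time.struct_time(tm_year=2008, tm_mon=8, tm_mday=27, tm_hour=13, ...)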
def get_date(myjson_object):
    if 'created_at' not in myjson_object: return None
    # struct_time[:6] is (year, month, day, hour, minute, second); taking
    # [:7] would pass the weekday as datetime's microsecond argument.
    return datetime(*parse_date(myjson_object['created_at'])[:6])
WS = re.compile(r'[ \t\r\n]+', re.U)

def ws_norm(s):
    # Collapse runs of whitespace into single spaces.
    return unicodify(WS.sub(' ', stringify(s)))
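# e.g. ws_norm(u'foo\t bar\nbaz') -> u'foo bar baz'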
def unicodify(s, encoding='utf8', *args):
    if isinstance(s, unicode): return s
    if isinstance(s, str): return s.decode(encoding, *args)
    return unicode(s)

def stringify(s, encoding='utf8', *args):
    if isinstance(s, str): return s
    if isinstance(s, unicode): return s.encode(encoding, *args)
    return str(s)

def tabjoin(*args):
    return u'\t'.join(unicodify(x) for x in args).encode('utf-8')
def uniq_c(seq):
    # Count occurrences of each item, like `sort | uniq -c`.
    ret = defaultdict(lambda: 0)
    for x in seq:
        ret[x] += 1
    return dict(ret)
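# e.g. uniq_c('aabca') -> {'a': 3, 'b': 1, 'c': 1}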
def commaize(num):
    # Insert thousands separators by peeling off three digits at a time.
    num = list(str(num))
    blocks = []
    while num:
        blocks.append(num[-3:])
        num = num[:-3]
    blocks = ["".join(x) for x in reversed(blocks)]
    return ",".join(blocks)
if __name__ == '__main__':
    # Demo: print a tab-separated summary of each tweet on stdin,
    # followed by the raw JSON line.
    # print>>sys.stderr, "json module is", json
    fields = ['id', 'created_at_iso', 'user.screen_name', 'text']
    for raw, tweet in iterate(raw=True):
        record = [unicodify(lookup(tweet, f)) for f in fields]
        record = [ws_norm(x) for x in record]
        record.append(raw)
        print u'\t'.join(record).encode('utf-8')
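For quick experimentation outside a stdin pipeline, lookup() can be exercised on its own; the tweet dict below is a made-up fragment, not real Twitter data:

from hose_util import lookup
tweet = {'user': {'screen_name': 'someone', 'location': 'Amherst, MA'}, 'text': 'hello'}
print lookup(tweet, 'user.location')   # -> Amherst, MA
print lookup(tweet, 'user.lang')       # missing key -> '' (empty string)
print lookup(tweet, 'text')            # -> hello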