brendano/geo_filter.py

## geo_filter.py
#!/usr/bin/env python
import sys,os,re,json
from hose_util import iterate, lookup
# import geodb

# country_db = geodb.GeoDB.load_geojson_files(['/home/brenocon/geocode/tm_world_borders-0.3.json'])

OneCoord = r'([-+]?\d{1,3}\.\d{3,})'
Separator= r', ?'
LatLong = re.compile(OneCoord + Separator + OneCoord, re.U)

VERBOSE = False

def reject(reason):
    """Report a rejected tweet on stdout, but only when VERBOSE is set.

    The printed line is ``REJECT:<reason>``, a tab, and then the raw text of
    the tweet being processed.  That text is read from the module-level
    ``raw`` variable, which the main loop rebinds on every iteration, so
    this helper is only meaningful when called from inside that loop.
    """
    if VERBOSE:
        print("%s\t%s" % ("REJECT:" + reason, raw))

# Main filter loop: read tweets from stdin and print only those carrying a
# usable coordinate pair, each prefixed with a small JSON record.
# `raw` and `tweet` are deliberately module-level: reject() reads `raw`.
for raw,tweet in iterate(raw=True):
    coords = None
    loc_type = None

    geo = lookup(tweet, 'geo')
    if geo and geo['type'] == 'Point':
        # Coordinates attached to the tweet itself; unpacked below as (lat, lon).
        coords = geo['coordinates']
        loc_type = 'OFFICIAL'
    else:
        # Fall back to a "lat, lon" pair typed into the profile location.
        loc = lookup(tweet, 'user.location').strip()
        if not loc:
            reject("NO USERLOC")
            continue
        # Encode once; the regex is applied to the UTF-8 bytes.
        loc_bytes = loc.encode('utf8')
        m = LatLong.search(loc_bytes)
        if not m:
            reject("NO GEO REGEX ::: " + loc_bytes.replace('\t',' ').replace('\n',' '))
            continue
        coords = m.groups()
        loc_type = 'REGEX'

    try:
        # Unpacking happens inside the try so that a null or wrong-length
        # coordinate list skips this tweet instead of aborting the run.
        lat,lon = coords
        lat=float(lat)
        lon=float(lon)
    except (TypeError, ValueError):
        # Malformed coordinates are dropped silently, even in VERBOSE mode.
        continue
    # The range test is written positively so that NaN (for which every
    # comparison is false) is rejected along with out-of-range values.
    if (lat,lon)==(0,0) or not (-90 <= lat <= 90 and -180 <= lon <= 180):
        reject("JUNK GEO\t" + json.dumps([lat,lon]))
        continue

    # country = country_db.query_point(lon,lat)
    # country = country and country['properties']['ISO3']

    record = {}
    record['lonlat'] = [lon,lat]    # note: stored as lon, lat
    record['loc_type'] = loc_type
    record['user_location'] = lookup(tweet, 'user.location')
    # record['country_iso3'] = country

    out = [
            json.dumps(record),
            raw
    ]
    if VERBOSE: out = ['ACCEPT'] + out

    print('\t'.join(out))

## hose_util.py
#!/usr/bin/env python
"""
hose_util.iterate() is the most useful thing here.
hose_util.lookup() might be slightly useful.
"""

import sys,time,re
from datetime import datetime
from collections import defaultdict

import json
# Try to get a faster json library
# ujson is fastest .. used to have a bug with >32bit integers, but seems fixed now.
# simplejson is usually slightly faster than 'json'.
# yajl has memory leak with encoding, but decoding seems ok
# Use ujson when it is installed; otherwise keep the stdlib json module
# imported above.  A plain import statement binds the same name as exec'ing
# the import from a string, without executing code held in a string literal.
try:
    import ujson as json
except ImportError:
    pass


def iterate(raw=False, filters=True):
  """
  Iterate through tweets on stdin, skipping junk.
  Yields the tweets as dicts, with a few new created_at_* keys
  (created_at_datetime, created_at_ymd, created_at_iso).
  A line is skipped silently when it is not valid JSON, does not decode
  to a dict, or has a 'created_at' value that cannot be parsed.
  Options:
    - raw: if True, yield (rawtweetstring, tweetdict) pairs.  Good for
      filtering applications.  The raw string is the stripped input line.
    - filters: if True, skip nonmessage events (deletions), i.e. objects
      without a 'text' or a 'created_at' key.
  """
  for line in sys.stdin:
    # Only the parse is guarded, so the handler cannot swallow errors
    # raised by unrelated code.
    try:
      myjson = json.loads(line)
    except ValueError:
      continue
    if not isinstance(myjson,dict):
      continue
    if filters and ('text' not in myjson or 'created_at' not in myjson):
      continue
    if 'created_at' in myjson:
      try:
        d = get_date(myjson)
        ymd = d.strftime("%Y-%m-%d")
        iso = d.strftime("%Y-%m-%dT%H:%M:%S")
      except (TypeError, ValueError):
        # TypeError covers a non-string created_at (e.g. null), which
        # would otherwise abort the whole stream.
        continue
      myjson['created_at_datetime'] = d
      myjson['created_at_ymd'] = ymd
      myjson['created_at_iso'] = iso
    if raw:
      yield line.strip(), myjson
    else:
      yield myjson

def lookup(myjson, k):
  """Fetch a value from a decoded tweet, returning "" when it is absent.

  A key without a dot is a plain myjson.get(k, "").  A dotted key such as
  'user.location' is walked one component at a time; the result is "" when
  any component is missing, when an intermediate value is not a mapping
  (for example a JSON null), or when the final value is falsy.
  """
  if '.' not in k:
    return myjson.get(k,"")
  v = myjson
  for part in k.split('.'):
    try:
      v = v.get(part,{})
    except AttributeError:
      # An intermediate value such as None or a string has no .get();
      # treat the path as missing rather than crashing.
      return ""
  return v or ""

def parse_date(twitter_lame_datetime_string):
  """Parse a timestamp such as the 'created_at' field into a time.struct_time.

  The format requires the literal offset "+0000"; time.strptime raises
  ValueError for a string that does not match.
  """
  created_at_format = "%a %b %d %H:%M:%S +0000 %Y"
  return time.strptime(twitter_lame_datetime_string, created_at_format)

def get_date(myjson_object):
  """Return the object's 'created_at' time as a naive datetime.

  Returns None when there is no 'created_at' key.  Errors raised by
  parse_date for an unparseable value propagate to the caller.
  """
  if 'created_at' not in myjson_object: return None
  parsed = parse_date(myjson_object['created_at'])
  # Keep year..second only.  The 7th struct_time field is the weekday, and
  # datetime() would take it as the microsecond argument.
  return datetime(*parsed[:6])

# e.g. datetime(*t[:6]).strftime("%Y-%m-%d") for a struct_time t

# One or more spaces, tabs, carriage returns or newlines.
WS = re.compile(r'[ \t\r\n]+', re.U)
def ws_norm(s):
  """Collapse each whitespace run in s to one space and return unicode text.

  s is first passed through stringify(), the substitution is applied to
  that result, and the outcome goes through unicodify().
  """
  collapsed = WS.sub(' ', stringify(s))
  return unicodify(collapsed)

def unicodify(s, encoding='utf8', *args):
  """Return s as a text (unicode) string.

  Text is returned unchanged.  A byte string is decoded with `encoding`;
  any extra positional arguments (e.g. an errors policy) are forwarded to
  decode().  Any other object is converted with the text type.
  """
  # type(u'') is `unicode` on Python 2 and `str` on Python 3, and `bytes`
  # is an alias of `str` on Python 2, so behaviour there is unchanged while
  # the helper no longer needs the Python 2-only `unicode` name.
  text_type = type(u'')
  if isinstance(s, text_type): return s
  if isinstance(s, bytes): return s.decode(encoding, *args)
  return text_type(s)

def stringify(s, encoding='utf8', *args):
  """Return s as a byte string.

  A byte string is returned unchanged.  Text is encoded with `encoding`;
  any extra positional arguments (e.g. an errors policy) are forwarded to
  encode().  Any other object is rendered with str().
  """
  # `bytes` is `str` on Python 2 and type(u'') is `unicode` there, so the
  # Python 2 behaviour is unchanged.
  if isinstance(s, bytes): return s
  if isinstance(s, type(u'')): return s.encode(encoding, *args)
  rendered = str(s)
  # On Python 3 str() yields text, so encode it to keep the bytes contract.
  if isinstance(rendered, bytes): return rendered
  return rendered.encode(encoding, *args)


def tabjoin(*args):
  """Join the arguments with tabs and return the result encoded as UTF-8.

  Each argument is converted with unicodify() before joining.
  """
  cells = [unicodify(x) for x in args]
  return u'\t'.join(cells).encode('utf-8')

def uniq_c(seq):
  """Return a plain dict mapping each distinct item of seq to its count."""
  tally = {}
  for item in seq:
    tally[item] = tally.get(item, 0) + 1
  return tally

def commaize(num):
  """Render num with a comma between each group of three integer digits.

  num is converted with str().  A leading sign and anything from the
  decimal point onward are kept as they are and take no part in grouping,
  so -1234 gives "-1,234" and 1234.5 gives "1,234.5".
  """
  text = str(num)
  sign = ''
  if text[:1] in ('-', '+'):
    sign, text = text[0], text[1:]
  whole, point, fraction = text.partition('.')
  blocks = []
  while whole:
    blocks.append(whole[-3:])
    whole = whole[:-3]
  return sign + ",".join(reversed(blocks)) + point + fraction


if __name__=='__main__':
    # Demo/CLI: for each tweet on stdin print a few whitespace-normalised
    # fields followed by the raw line, tab-separated and UTF-8 encoded.
    # print>>sys.stderr, "json module is", json
    fields = ['id','created_at_iso','user.screen_name','text']
    for raw,tweet in iterate(raw=True):
        record = [ws_norm(unicodify(lookup(tweet, f))) for f in fields]
        record.append(raw)
        # tabjoin() decodes every cell, including the byte-string `raw`, as
        # UTF-8 before joining.  Joining unicode cells directly with `raw`
        # would trigger an implicit ASCII decode and fail on non-ASCII input.
        print(tabjoin(*record))
	#!/usr/bin/env python
	import sys,os,re,json
	from hose_util import iterate, lookup
	# import geodb

	# country_db = geodb.GeoDB.load_geojson_files(['/home/brenocon/geocode/tm_world_borders-0.3.json'])

	OneCoord = r'([-+]?\d{1,3}\.\d{3,})'
	Separator= r', ?'
	LatLong = re.compile(OneCoord + Separator + OneCoord, re.U)

	VERBOSE = False

	def reject(reason):
	    """Report a rejected tweet on stdout, but only when VERBOSE is set.

	    The printed line is ``REJECT:<reason>``, a tab, and then the raw text
	    of the tweet being processed, read from the module-level ``raw``
	    variable that the main loop rebinds on every iteration.
	    """
	    # Body indentation restored; the flattened original did not parse.
	    if VERBOSE:
	        print("%s\t%s" % ("REJECT:" + reason, raw))

	# Main filter loop: read tweets from stdin and print only those carrying a
	# usable coordinate pair, each prefixed with a small JSON record.
	# Block structure restored; the flattened original did not parse.
	for raw,tweet in iterate(raw=True):
	    coords = None
	    loc_type = None

	    geo = lookup(tweet, 'geo')
	    if geo and geo['type'] == 'Point':
	        # Coordinates attached to the tweet itself; unpacked below as (lat, lon).
	        coords = geo['coordinates']
	        loc_type = 'OFFICIAL'
	    else:
	        # Fall back to a "lat, lon" pair typed into the profile location.
	        loc = lookup(tweet, 'user.location').strip()
	        if not loc:
	            reject("NO USERLOC")
	            continue
	        # Encode once; the regex is applied to the UTF-8 bytes.
	        loc_bytes = loc.encode('utf8')
	        m = LatLong.search(loc_bytes)
	        if not m:
	            reject("NO GEO REGEX ::: " + loc_bytes.replace('\t',' ').replace('\n',' '))
	            continue
	        coords = m.groups()
	        loc_type = 'REGEX'

	    try:
	        # Unpacking happens inside the try so that a null or wrong-length
	        # coordinate list skips this tweet instead of aborting the run.
	        lat,lon = coords
	        lat=float(lat)
	        lon=float(lon)
	    except (TypeError, ValueError):
	        # Malformed coordinates are dropped silently, even in VERBOSE mode.
	        continue
	    # Positive range test so that NaN is rejected as well.
	    if (lat,lon)==(0,0) or not (-90 <= lat <= 90 and -180 <= lon <= 180):
	        reject("JUNK GEO\t" + json.dumps([lat,lon]))
	        continue

	    # country = country_db.query_point(lon,lat)
	    # country = country and country['properties']['ISO3']

	    record = {}
	    record['lonlat'] = [lon,lat]    # note: stored as lon, lat
	    record['loc_type'] = loc_type
	    record['user_location'] = lookup(tweet, 'user.location')
	    # record['country_iso3'] = country

	    out = [
	        json.dumps(record),
	        raw
	    ]
	    if VERBOSE: out = ['ACCEPT'] + out

	    print('\t'.join(out))
	#!/usr/bin/env python
	"""
	hose_util.iterate() is the most useful thing here.
	hose_util.lookup() might be slightly useful.
	"""

	import sys,time,re
	from datetime import datetime
	from collections import defaultdict

	import json
	# Try to get a faster json library
	# ujson is fastest .. used to have a bug with >32bit integers, but seems fixed now.
	# simplejson is usually slightly faster than 'json'.
	# yajl has memory leak with encoding, but decoding seems ok
	# Use ujson when it is installed; otherwise keep the stdlib json module
	# imported above.  Written as a plain import (no exec of a string) and
	# with the try/except bodies indented again so that the statement parses.
	try:
	    import ujson as json
	except ImportError:
	    pass


	def iterate(raw=False, filters=True):
	  """
	  Iterate through tweets on stdin, skipping junk.
	  Yields the tweets as dicts, with a few new created_at_* keys
	  (created_at_datetime, created_at_ymd, created_at_iso).
	  A line is skipped silently when it is not valid JSON, does not decode
	  to a dict, or has a 'created_at' value that cannot be parsed.
	  Options:
	    - raw: if True, yield (rawtweetstring, tweetdict) pairs. Good for
	      filtering applications. The raw string is the stripped input line.
	    - filters: if True, skip nonmessage events (deletions), i.e. objects
	      without a 'text' or a 'created_at' key.
	  """
	  # Block structure restored; the flattened original did not parse.
	  for line in sys.stdin:
	    try:
	      myjson = json.loads(line)
	    except ValueError:
	      continue
	    if not isinstance(myjson,dict):
	      continue
	    if filters and ('text' not in myjson or 'created_at' not in myjson):
	      continue
	    if 'created_at' in myjson:
	      try:
	        d = get_date(myjson)
	        ymd = d.strftime("%Y-%m-%d")
	        iso = d.strftime("%Y-%m-%dT%H:%M:%S")
	      except (TypeError, ValueError):
	        # TypeError covers a non-string created_at (e.g. null).
	        continue
	      myjson['created_at_datetime'] = d
	      myjson['created_at_ymd'] = ymd
	      myjson['created_at_iso'] = iso
	    if raw:
	      yield line.strip(), myjson
	    else:
	      yield myjson

	def lookup(myjson, k):
	  """Fetch a value from a decoded tweet, returning "" when it is absent.

	  A key without a dot is a plain myjson.get(k, "").  A dotted key such as
	  'user.location' is walked one component at a time; the result is "" when
	  any component is missing, when an intermediate value is not a mapping
	  (for example a JSON null), or when the final value is falsy.
	  """
	  # Block structure restored; the flattened original did not parse.
	  if '.' not in k:
	    return myjson.get(k,"")
	  v = myjson
	  for part in k.split('.'):
	    try:
	      v = v.get(part,{})
	    except AttributeError:
	      # An intermediate None or string has no .get(); treat as missing.
	      return ""
	  return v or ""

	def parse_date(twitter_lame_datetime_string):
	  """Parse a timestamp such as the 'created_at' field into a time.struct_time.

	  The format requires the literal offset "+0000"; time.strptime raises
	  ValueError for a string that does not match.
	  """
	  # Body indentation restored; the flattened original did not parse.
	  return time.strptime(twitter_lame_datetime_string, "%a %b %d %H:%M:%S +0000 %Y")

	def get_date(myjson_object):
	  """Return the object's 'created_at' time as a naive datetime.

	  Returns None when there is no 'created_at' key.  Errors raised by
	  parse_date for an unparseable value propagate to the caller.
	  """
	  # Body indentation restored; the flattened original did not parse.
	  if 'created_at' not in myjson_object: return None
	  parsed = parse_date(myjson_object['created_at'])
	  # Keep year..second only.  The 7th struct_time field is the weekday, and
	  # datetime() would take it as the microsecond argument.
	  return datetime(*parsed[:6])

	# e.g. datetime(*t[:6]).strftime("%Y-%m-%d") for a struct_time t

	# One or more spaces, tabs, carriage returns or newlines.
	WS = re.compile(r'[ \t\r\n]+', re.U)
	def ws_norm(s):
	  """Collapse each whitespace run in s to one space and return unicode text.

	  s is first passed through stringify(), the substitution is applied to
	  that result, and the outcome goes through unicodify().
	  """
	  # Body indentation restored; the flattened original did not parse.
	  return unicodify(WS.sub(' ',stringify(s)))

	def unicodify(s, encoding='utf8', *args):
	  """Return s as a text (unicode) string.

	  Text is returned unchanged.  A byte string is decoded with `encoding`;
	  any extra positional arguments (e.g. an errors policy) are forwarded to
	  decode().  Any other object is converted with the text type.
	  """
	  # Body indentation restored.  type(u'') is `unicode` on Python 2 and
	  # `str` on Python 3; `bytes` is an alias of `str` on Python 2.
	  text_type = type(u'')
	  if isinstance(s, text_type): return s
	  if isinstance(s, bytes): return s.decode(encoding, *args)
	  return text_type(s)

	def stringify(s, encoding='utf8', *args):
	  """Return s as a byte string.

	  A byte string is returned unchanged.  Text is encoded with `encoding`;
	  any extra positional arguments (e.g. an errors policy) are forwarded to
	  encode().  Any other object is rendered with str().
	  """
	  # Body indentation restored.  `bytes` is `str` on Python 2, so the
	  # Python 2 behaviour matches the original definition.
	  if isinstance(s, bytes): return s
	  if isinstance(s, type(u'')): return s.encode(encoding, *args)
	  rendered = str(s)
	  # On Python 3 str() yields text, so encode it to keep the bytes contract.
	  if isinstance(rendered, bytes): return rendered
	  return rendered.encode(encoding, *args)


	def tabjoin(*args):
	  """Join the arguments with tabs and return the result encoded as UTF-8.

	  Each argument is converted with unicodify() before joining.
	  """
	  # Body indentation restored; the flattened original did not parse.
	  return u'\t'.join(unicodify(x) for x in args).encode('utf-8')

	def uniq_c(seq):
	  """Return a plain dict mapping each distinct item of seq to its count."""
	  # Block structure restored; the flattened original did not parse.
	  ret = defaultdict(int)
	  for x in seq:
	    ret[x] += 1
	  return dict(ret)

	def commaize(num):
	  """Render num with a comma between each group of three integer digits.

	  num is converted with str().  A leading sign and anything from the
	  decimal point onward are kept as they are and take no part in grouping,
	  so -1234 gives "-1,234" and 1234.5 gives "1,234.5".
	  """
	  # Block structure restored; the flattened original did not parse.
	  text = str(num)
	  sign = ''
	  if text[:1] in ('-', '+'):
	    sign, text = text[0], text[1:]
	  whole, point, fraction = text.partition('.')
	  blocks = []
	  while whole:
	    blocks.append(whole[-3:])
	    whole = whole[:-3]
	  return sign + ",".join(reversed(blocks)) + point + fraction


	if __name__=='__main__':
	    # Demo/CLI: for each tweet on stdin print a few whitespace-normalised
	    # fields followed by the raw line, tab-separated and UTF-8 encoded.
	    # Block structure restored; the flattened original did not parse.
	    # print>>sys.stderr, "json module is", json
	    fields = ['id','created_at_iso','user.screen_name','text']
	    for raw,tweet in iterate(raw=True):
	        record = [ws_norm(unicodify(lookup(tweet, f))) for f in fields]
	        record.append(raw)
	        # tabjoin() decodes every cell, including the byte-string `raw`,
	        # as UTF-8 before joining, avoiding an implicit ASCII decode.
	        print(tabjoin(*record))