philshem/get_geo.py

## get_geo.py
# -*- coding: utf-8 -*-

import glob
import json
import requests

delim = '\t'

def get_language(tweet,lang_dict):
    """Print one tab-separated row on stdout for a geotagged tweet.

    Args:
        tweet: one raw line from the streaming API (a JSON string).
        lang_dict: mapping of language code -> language name.

    Returns:
        None. Input that is not a JSON object, tweets lacking point
        coordinates, a user language or a tweet language, and tweets whose
        reverse geocoding yields no country code are skipped.

    Columns written, in order: user id, user created_at, user language code,
    user language name, tweet id, tweet created_at, tweet language code,
    tweet language name, the two coordinate values in the order the tweet
    supplies them, country code, country name, region, postal code.
    Missing values become empty cells so the row always has 14 columns.
    """
    import sys  # local import: diagnostics go to stderr, stdout carries data only

    def text(value):
        # None becomes an empty cell; everything else is coerced to text
        if value is None:
            return u''
        return u'%s' % (value,)

    try:
        tweet = json.loads(tweet)
    except (TypeError, ValueError):
        # blank keep-alive lines and truncated payloads are not valid JSON
        return

    if not isinstance(tweet, dict):
        return

    coords = tweet.get('coordinates')
    if not isinstance(coords, dict):
        return

    user = tweet.get('user')
    if not isinstance(user, dict):
        user = {}

    latlong = coords.get('coordinates')
    user_lang = user.get('lang')
    tweet_lang = tweet.get('lang')
    if latlong is None or tweet_lang is None or user_lang is None:
        return

    # codes may carry a region suffix (e.g. "en-gb"); look up the base code
    user_full_lang = lang_dict.get(text(user_lang).split('-')[0])
    tweet_full_lang = lang_dict.get(text(tweet_lang).split('-')[0])
    for code, name in ((user_lang, user_full_lang), (tweet_lang, tweet_full_lang)):
        if name is None:
            sys.stderr.write('ERROR - missing_language %r\n' % (code,))

    short_country, long_country, region, postal_code = get_country_mapquest(latlong)
    if short_country is None:
        return

    row = [
        user.get('id'), user.get('created_at'), user_lang, user_full_lang,
        tweet.get('id'), tweet.get('created_at'), tweet_lang, tweet_full_lang,
        latlong[0], latlong[1],
        short_country, long_country, region, postal_code,
    ]

    try:
        line = delim.join([text(value) for value in row]).encode('utf-8')
    except UnicodeError as err:
        sys.stderr.write('ERROR - cannot encode row: %r\n' % (err,))
        return

    print(line)

def get_country_google(latlong):
    """Reverse-geocode a coordinate pair with the Google geocoding endpoint.

    Args:
        latlong: two-element sequence; element 1 is sent first and element 0
            second in the ``latlng`` parameter (the order is switched between
            google and twitter).

    Returns:
        Tuple ``(short_name, long_name, region, postal_code)``. Each element
        stays ``''`` when no matching address component is found. When
        several results carry the same component type, the last one wins.

    Raises:
        Whatever ``requests.get`` / ``Response.json`` raise on network or
        decoding problems.
    """
    key = None
    urlbase = 'http://maps.googleapis.com/maps/api/geocode/json?latlng='
    url = urlbase + str(latlong[1]) + ',' + str(latlong[0])
    if key is not None:
        url += '&key=' + str(key)

    # a timeout keeps a stalled connection from blocking the caller forever
    r = requests.get(url, timeout=10)
    data = r.json()

    long_name = ''
    short_name = ''
    region = ''
    postal_code = ''

    for result in data.get('results') or []:
        for elem in result.get('address_components') or []:
            types = elem.get('types') or []
            # only the 'country' component names the country; 'political'
            # also tags other components and must not overwrite it
            if 'country' in types:
                long_name = elem.get('long_name')
                short_name = elem.get('short_name')
            if 'administrative_area_level_1' in types:
                region = elem.get('long_name')
            if 'postal_code' in types:
                postal_code = elem.get('long_name')

    return short_name, long_name, region, postal_code

def get_country_mapquest(latlong):
    """Reverse-geocode a coordinate pair with the MapQuest Nominatim endpoint.

    See http://open.mapquestapi.com/nominatim/#reverse

    Args:
        latlong: two-element sequence; element 1 is sent as ``lat`` and
            element 0 as ``lon``.

    Returns:
        Tuple ``(country_code, country, state, postcode)`` taken from the
        response's ``address`` object, or ``(None, None, None, None)`` when
        the request fails, the status is not 200, the body is not JSON, or
        the response has no ``address``. Diagnostics are written to stderr
        so they never mix with data rows printed on stdout.
    """
    import sys  # local import: only needed for diagnostics

    failure = (None, None, None, None)

    url = 'http://open.mapquestapi.com/nominatim/v1/reverse.php?format=json&lat='+str(latlong[1])+'&lon='+str(latlong[0])
    try:
        # a timeout keeps a stalled connection from blocking the caller forever
        r = requests.get(url, timeout=10)
    except requests.RequestException as err:
        sys.stderr.write('%r\n' % (err,))
        return failure

    if r.status_code != 200:
        sys.stderr.write('%s\n' % (r.status_code,))
        return failure

    try:
        data = r.json()
    except ValueError as err:
        sys.stderr.write('%r\n' % (err,))
        return failure

    address = data.get('address') if isinstance(data, dict) else None
    if address is None:
        sys.stderr.write('%r\n' % (data,))
        return failure

    return (
        address.get('country_code'),
        address.get('country'),
        address.get('state'),
        address.get('postcode'),
    )

def get_country_opencage(latlong):
    """Placeholder for an OpenCage-based lookup; not implemented, returns None."""
    return None
#if __name__ == "__main__":

#	print get_country_mapquest([42.5,8.2])

## join_csv.py
# -*- coding: utf-8 -*-

import glob
import os

# Merge every file matching _data/*.csv into one tab-separated file with a
# header line, and report how many lines were copied.
output_csv="ch_tweets.csv"
lineend = '\n'
delim = '\t'

# remove a previous result; a missing file is not an error
try:
    os.remove(output_csv)
except OSError:
    pass

line_count = 0
with open(output_csv,'wb') as output:
    # header line
    header = ['user_id','user_created_at','user_lang_code','user_lang_name',
              'tweet_id','tweet_created_at','tweet_lang_code','tweet_lang_name',
              'tweet_longitude','tweet_latitude','tweet_country_code','tweet_country_name','tweet_region_name','tweet_postal_code']
    output.write(delim.join(header)+lineend)

    # sorted() makes the merge order reproducible between runs
    for csv_fyle in sorted(glob.glob('_data/*.csv')):
        tmp_line = 0

        with open(csv_fyle) as infile:
            for line in infile:
                # a last row without a newline would otherwise fuse with
                # the first row of the next input file
                if not line.endswith(lineend):
                    line += lineend
                output.write(line)
                line_count += 1
                tmp_line += 1
        print('Read %s from: %s' % (tmp_line, csv_fyle))

print('')
print('Done writing:  %s' % (output_csv,))
print('Total lines:  %s' % (line_count,))

## twitterstream_ch.py
import oauth2 as oauth
import urllib2 as urllib
import time

from get_geo import get_language

# https://raw.githubusercontent.com/philshem/datasci_course_materials/master/assignment1/twitterstream.py

# OAuth credentials used to sign every request.
# NOTE(review): these are placeholders; real secrets should be supplied from
# outside the source tree rather than committed.
access_token_key = "REDACTED"
access_token_secret = "REDACTED"

consumer_key = "REDACTED"
consumer_secret = "REDACTED"

# passed as debuglevel to the urllib2 handlers below
_debug = 0

# https://dev.twitter.com/streaming/overview/request-parameters#language
# http://tools.ietf.org/html/bcp47

from collections import defaultdict
lang_dict = defaultdict(unicode)

#from run_lang_dict import read_lang_dict  # http://tools.ietf.org/html/bcp47
#lang_dict = read_lang_dict()

# written_in.csv holds "<language name>\t<language code>" per line, preceded
# by a header row. The header and malformed lines must not become languages:
# every key of lang_dict is later sent in the stream's language filter.
with open('written_in.csv','rb') as written_in:
    for line in written_in:
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) < 2:
            continue
        lang_name, lang_code = fields[0].strip(), fields[1].strip()
        if not lang_code or (lang_name, lang_code) == ('Language Name', 'Language Code'):
            continue
        lang_dict[lang_code] = lang_name

#print lang_dict

oauth_token    = oauth.Token(key=access_token_key, secret=access_token_secret)
oauth_consumer = oauth.Consumer(key=consumer_key, secret=consumer_secret)

signature_method_hmac_sha1 = oauth.SignatureMethod_HMAC_SHA1()

http_method = "GET"


http_handler  = urllib.HTTPHandler(debuglevel=_debug)
https_handler = urllib.HTTPSHandler(debuglevel=_debug)

def twitterreq(url, method, parameters):
    """Construct, sign, and open a twitter request
    using the hard-coded credentials above.

    Args:
        url: full request URL.
        method: HTTP method; "POST" sends the signed parameters as the
            request body, anything else sends them in the URL.
        parameters: extra request parameters handed to the OAuth request.

    Returns:
        The response object returned by the urllib2 opener.
    """
    # honour the caller's method instead of the module-level http_method
    req = oauth.Request.from_consumer_and_token(
        oauth_consumer,
        token=oauth_token,
        http_method=method,
        http_url=url,
        parameters=parameters,
    )

    req.sign_request(signature_method_hmac_sha1, oauth_consumer, oauth_token)

    if method == "POST":
        encoded_post_data = req.to_postdata()
    else:
        encoded_post_data = None
        url = req.to_url()

    opener = urllib.OpenerDirector()
    opener.add_handler(http_handler)
    opener.add_handler(https_handler)

    return opener.open(url, encoded_post_data)

def fetchsamples():
    """Open the filtered stream and hand every received line to get_language.

    The filter asks for every language code in lang_dict and for one fixed
    bounding box (presumably Switzerland, judging by the file name; confirm).
    """
    language_filter = ','.join(lang_dict)
    url = ('https://stream.twitter.com/1.1/statuses/filter.json?'
           + 'language=' + language_filter
           + '&locations=5.955870,45.818020,10.492030,47.808380')

    response = twitterreq(url, "GET", [])
    for raw_line in response:
        get_language(raw_line.strip(), lang_dict)


if __name__ == '__main__':
    fetchsamples()

## written_in.csv
Language Name	Language Code
Amharic	am
Arabic	ar
Bulgarian	bg
Bengali	bn
Tibetan	bo
Cherokee	chr
Danish	da
German	de
Maldivian	dv
Greek	el
English	en
Spanish	es
Persian	fa
Finnish	fi
French	fr
Gujarati	gu
Hebrew	iw
Hindi	hi
Hungarian	hu
Armenian	hy
Indonesian	in
Icelandic	is
Italian	it
Inuktitut	iu
Japanese	ja
Georgian	ka
Khmer	km
Kannada	kn
Korean	ko
Lao	lo
Lithuanian	lt
Malayalam	ml
Myanmar	my
Nepali	ne
Dutch	nl
Norwegian	no
Oriya	or
Panjabi	pa
Polish	pl
Portuguese	pt
Russian	ru
Sinhala	si
Swedish	sv
Tamil	ta
Telugu	te
Thai	th
Tagalog	tl
Turkish	tr
Urdu	ur
Vietnamese	vi
Chinese	zh
	# -- coding: utf-8 --

	import glob
	import json
	import requests

	delim = '\t'

	def get_language(tweet,lang_dict):
	    """Print one tab-separated row on stdout for a geotagged tweet.

	    Args:
	        tweet: one raw line from the streaming API (a JSON string).
	        lang_dict: mapping of language code -> language name.

	    Returns:
	        None. Input that is not a JSON object, tweets lacking point
	        coordinates, a user language or a tweet language, and tweets whose
	        reverse geocoding yields no country code are skipped.

	    Columns written, in order: user id, user created_at, user language
	    code, user language name, tweet id, tweet created_at, tweet language
	    code, tweet language name, the two coordinate values in the order the
	    tweet supplies them, country code, country name, region, postal code.
	    Missing values become empty cells so the row always has 14 columns.
	    """
	    import sys  # local import: diagnostics go to stderr, stdout carries data only

	    def text(value):
	        # None becomes an empty cell; everything else is coerced to text
	        if value is None:
	            return u''
	        return u'%s' % (value,)

	    try:
	        tweet = json.loads(tweet)
	    except (TypeError, ValueError):
	        # blank keep-alive lines and truncated payloads are not valid JSON
	        return

	    if not isinstance(tweet, dict):
	        return

	    coords = tweet.get('coordinates')
	    if not isinstance(coords, dict):
	        return

	    user = tweet.get('user')
	    if not isinstance(user, dict):
	        user = {}

	    latlong = coords.get('coordinates')
	    user_lang = user.get('lang')
	    tweet_lang = tweet.get('lang')
	    if latlong is None or tweet_lang is None or user_lang is None:
	        return

	    # codes may carry a region suffix (e.g. "en-gb"); look up the base code
	    user_full_lang = lang_dict.get(text(user_lang).split('-')[0])
	    tweet_full_lang = lang_dict.get(text(tweet_lang).split('-')[0])
	    for code, name in ((user_lang, user_full_lang), (tweet_lang, tweet_full_lang)):
	        if name is None:
	            sys.stderr.write('ERROR - missing_language %r\n' % (code,))

	    short_country, long_country, region, postal_code = get_country_mapquest(latlong)
	    if short_country is None:
	        return

	    row = [
	        user.get('id'), user.get('created_at'), user_lang, user_full_lang,
	        tweet.get('id'), tweet.get('created_at'), tweet_lang, tweet_full_lang,
	        latlong[0], latlong[1],
	        short_country, long_country, region, postal_code,
	    ]

	    try:
	        line = delim.join([text(value) for value in row]).encode('utf-8')
	    except UnicodeError as err:
	        sys.stderr.write('ERROR - cannot encode row: %r\n' % (err,))
	        return

	    print(line)

	def get_country_google(latlong):
	    """Reverse-geocode a coordinate pair with the Google geocoding endpoint.

	    Args:
	        latlong: two-element sequence; element 1 is sent first and element
	            0 second in the ``latlng`` parameter (the order is switched
	            between google and twitter).

	    Returns:
	        Tuple ``(short_name, long_name, region, postal_code)``. Each
	        element stays ``''`` when no matching address component is found.
	        When several results carry the same component type, the last one
	        wins.
	    """
	    key = None
	    urlbase = 'http://maps.googleapis.com/maps/api/geocode/json?latlng='
	    url = urlbase + str(latlong[1]) + ',' + str(latlong[0])
	    if key is not None:
	        url += '&key=' + str(key)

	    # a timeout keeps a stalled connection from blocking the caller forever
	    r = requests.get(url, timeout=10)
	    data = r.json()

	    long_name = ''
	    short_name = ''
	    region = ''
	    postal_code = ''

	    for result in data.get('results') or []:
	        for elem in result.get('address_components') or []:
	            types = elem.get('types') or []
	            # only the 'country' component names the country; 'political'
	            # also tags other components and must not overwrite it
	            if 'country' in types:
	                long_name = elem.get('long_name')
	                short_name = elem.get('short_name')
	            if 'administrative_area_level_1' in types:
	                region = elem.get('long_name')
	            if 'postal_code' in types:
	                postal_code = elem.get('long_name')

	    return short_name, long_name, region, postal_code

	def get_country_mapquest(latlong):
	    """Reverse-geocode a coordinate pair with the MapQuest Nominatim endpoint.

	    See http://open.mapquestapi.com/nominatim/#reverse

	    Args:
	        latlong: two-element sequence; element 1 is sent as ``lat`` and
	            element 0 as ``lon``.

	    Returns:
	        Tuple ``(country_code, country, state, postcode)`` taken from the
	        response's ``address`` object, or ``(None, None, None, None)`` when
	        the request fails, the status is not 200, the body is not JSON, or
	        the response has no ``address``. Diagnostics are written to stderr
	        so they never mix with data rows printed on stdout.
	    """
	    import sys  # local import: only needed for diagnostics

	    failure = (None, None, None, None)

	    url = 'http://open.mapquestapi.com/nominatim/v1/reverse.php?format=json&lat='+str(latlong[1])+'&lon='+str(latlong[0])
	    try:
	        # a timeout keeps a stalled connection from blocking the caller forever
	        r = requests.get(url, timeout=10)
	    except requests.RequestException as err:
	        sys.stderr.write('%r\n' % (err,))
	        return failure

	    if r.status_code != 200:
	        sys.stderr.write('%s\n' % (r.status_code,))
	        return failure

	    try:
	        data = r.json()
	    except ValueError as err:
	        sys.stderr.write('%r\n' % (err,))
	        return failure

	    address = data.get('address') if isinstance(data, dict) else None
	    if address is None:
	        sys.stderr.write('%r\n' % (data,))
	        return failure

	    return (
	        address.get('country_code'),
	        address.get('country'),
	        address.get('state'),
	        address.get('postcode'),
	    )

	def get_country_opencage(latlong):
	    """Placeholder for an OpenCage-based lookup; not implemented, returns None."""
	    return None
	#if __name__ == "__main__":

	# print get_country_mapquest([42.5,8.2])
	# -- coding: utf-8 --

	import glob
	import os

	# Merge every file matching _data/*.csv into one tab-separated file with a
	# header line, and report how many lines were copied.
	output_csv="ch_tweets.csv"
	lineend = '\n'
	delim = '\t'

	# remove a previous result; a missing file is not an error
	try:
	    os.remove(output_csv)
	except OSError:
	    pass

	line_count = 0
	with open(output_csv,'wb') as output:
	    # header line
	    header = ['user_id','user_created_at','user_lang_code','user_lang_name',
	              'tweet_id','tweet_created_at','tweet_lang_code','tweet_lang_name',
	              'tweet_longitude','tweet_latitude','tweet_country_code','tweet_country_name','tweet_region_name','tweet_postal_code']
	    output.write(delim.join(header)+lineend)

	    # sorted() makes the merge order reproducible between runs
	    for csv_fyle in sorted(glob.glob('_data/*.csv')):
	        tmp_line = 0

	        with open(csv_fyle) as infile:
	            for line in infile:
	                # a last row without a newline would otherwise fuse with
	                # the first row of the next input file
	                if not line.endswith(lineend):
	                    line += lineend
	                output.write(line)
	                line_count += 1
	                tmp_line += 1
	        print('Read %s from: %s' % (tmp_line, csv_fyle))

	print('')
	print('Done writing:  %s' % (output_csv,))
	print('Total lines:  %s' % (line_count,))
	import oauth2 as oauth
	import urllib2 as urllib
	import time

	from get_geo import get_language

	# https://raw.githubusercontent.com/philshem/datasci_course_materials/master/assignment1/twitterstream.py

	# OAuth credentials used to sign every request.
	# NOTE(review): these are placeholders; real secrets should be supplied
	# from outside the source tree rather than committed.
	access_token_key = "REDACTED"
	access_token_secret = "REDACTED"

	consumer_key = "REDACTED"
	consumer_secret = "REDACTED"

	# passed as debuglevel to the urllib2 handlers below
	_debug = 0

	# https://dev.twitter.com/streaming/overview/request-parameters#language
	# http://tools.ietf.org/html/bcp47

	from collections import defaultdict
	lang_dict = defaultdict(unicode)

	#from run_lang_dict import read_lang_dict # http://tools.ietf.org/html/bcp47
	#lang_dict = read_lang_dict()

	# written_in.csv holds "<language name>\t<language code>" per line,
	# preceded by a header row. The header and malformed lines must not become
	# languages: every key of lang_dict is later sent in the language filter.
	with open('written_in.csv','rb') as written_in:
	    for line in written_in:
	        fields = line.rstrip('\r\n').split('\t')
	        if len(fields) < 2:
	            continue
	        lang_name, lang_code = fields[0].strip(), fields[1].strip()
	        if not lang_code or (lang_name, lang_code) == ('Language Name', 'Language Code'):
	            continue
	        lang_dict[lang_code] = lang_name

	#print lang_dict

	oauth_token = oauth.Token(key=access_token_key, secret=access_token_secret)
	oauth_consumer = oauth.Consumer(key=consumer_key, secret=consumer_secret)

	signature_method_hmac_sha1 = oauth.SignatureMethod_HMAC_SHA1()

	http_method = "GET"


	http_handler = urllib.HTTPHandler(debuglevel=_debug)
	https_handler = urllib.HTTPSHandler(debuglevel=_debug)

	def twitterreq(url, method, parameters):
	    """Construct, sign, and open a twitter request
	    using the hard-coded credentials above.

	    Args:
	        url: full request URL.
	        method: HTTP method; "POST" sends the signed parameters as the
	            request body, anything else sends them in the URL.
	        parameters: extra request parameters handed to the OAuth request.

	    Returns:
	        The response object returned by the urllib2 opener.
	    """
	    # honour the caller's method instead of the module-level http_method
	    req = oauth.Request.from_consumer_and_token(
	        oauth_consumer,
	        token=oauth_token,
	        http_method=method,
	        http_url=url,
	        parameters=parameters,
	    )

	    req.sign_request(signature_method_hmac_sha1, oauth_consumer, oauth_token)

	    if method == "POST":
	        encoded_post_data = req.to_postdata()
	    else:
	        encoded_post_data = None
	        url = req.to_url()

	    opener = urllib.OpenerDirector()
	    opener.add_handler(http_handler)
	    opener.add_handler(https_handler)

	    return opener.open(url, encoded_post_data)

	def fetchsamples():
	    """Open the filtered stream and hand every received line to get_language.

	    The filter asks for every language code in lang_dict and for one fixed
	    bounding box (presumably Switzerland; confirm).
	    """
	    url = 'https://stream.twitter.com/1.1/statuses/filter.json?'+'language='+','.join([key for key in lang_dict])+'&locations=5.955870,45.818020,10.492030,47.808380'

	    parameters = []

	    response = twitterreq(url, "GET", parameters)
	    for line in response:
	        tweet = line.strip()
	        get_language(tweet,lang_dict)


	if __name__ == '__main__':
	    fetchsamples()
	Language Name Language Code
	Amharic am
	Arabic ar
	Bulgarian bg
	Bengali bn
	Tibetan bo
	Cherokee chr
	Danish da
	German de
	Maldivian dv
	Greek el
	English en
	Spanish es
	Persian fa
	Finnish fi
	French fr
	Gujarati gu
	Hebrew iw
	Hindi hi
	Hungarian hu
	Armenian hy
	Indonesian in
	Icelandic is
	Italian it
	Inuktitut iu
	Japanese ja
	Georgian ka
	Khmer km
	Kannada kn
	Korean ko
	Lao lo
	Lithuanian lt
	Malayalam ml
	Myanmar my
	Nepali ne
	Dutch nl
	Norwegian no
	Oriya or
	Panjabi pa
	Polish pl
	Portuguese pt
	Russian ru
	Sinhala si
	Swedish sv
	Tamil ta
	Telugu te
	Thai th
	Tagalog tl
	Turkish tr
	Urdu ur
	Vietnamese vi
	Chinese zh