@philshem
Last active January 13, 2018 15:34
Source code for "Switzerland Tweets" dataviz
# -*- coding: utf-8 -*-
import glob
import json
import sys
import requests

delim = '\t'

def get_language(tweet, lang_dict):
    # Parse one raw tweet (a JSON string from the streaming API); if it is geotagged,
    # print a tab-separated row of user / language / location fields to stdout.
    try:
        tweet = json.loads(tweet)
    except ValueError:
        return
    print_list = []
    coords = tweet.get('coordinates', None)
    if coords is not None:  # first check if the tweet has coordinates
        latlong = coords.get('coordinates', None)  # GeoJSON order: [longitude, latitude]
        user_lang = tweet.get('user').get('lang', None)
        tweet_lang = tweet.get('lang', None)
        if latlong is not None and tweet_lang is not None and user_lang is not None:  # then check if languages are present
            # map the BCP 47 code ('de-CH' -> 'de') to a full name; fall back to the raw code
            user_full_lang = lang_dict.get(user_lang.split('-')[0])
            tweet_full_lang = lang_dict.get(tweet_lang.split('-')[0])
            if user_full_lang is None:
                print >> sys.stderr, 'ERROR - missing_language', user_lang
                user_full_lang = user_lang
            if tweet_full_lang is None:
                print >> sys.stderr, 'ERROR - missing_language', tweet_lang
                tweet_full_lang = tweet_lang
            short_country, long_country, region, postal_code = get_country_mapquest(latlong)
            if short_country is not None:
                user_id = tweet.get('user').get('id')
                print_list.append(unicode(user_id))
                user_created_at = tweet.get('user').get('created_at')
                print_list.append(unicode(user_created_at))
                print_list.append(user_lang)
                print_list.append(user_full_lang)
                tweet_id = tweet.get('id')
                print_list.append(unicode(tweet_id))
                tweet_created_at = tweet.get('created_at')
                print_list.append(unicode(tweet_created_at))
                print_list.append(tweet_lang)
                print_list.append(tweet_full_lang)
                print_list.append(unicode(latlong[0]))  # longitude
                print_list.append(unicode(latlong[1]))  # latitude
                print_list.append(short_country)
                print_list.append(long_country)
                # region and postal code complete the 14 columns named in the merge script's header
                print_list.append(unicode(region))
                print_list.append(unicode(postal_code))
                try:
                    print delim.join(print_list).encode('utf-8')
                except Exception:
                    pass  # skip rows that fail to join or encode
def get_country_google(latlong):
    # alternative reverse geocoder via the Google Geocoding API (not used by get_language)
    key = None
    urlbase = 'http://maps.googleapis.com/maps/api/geocode/json?latlng='
    url = urlbase + str(latlong[1]) + ',' + str(latlong[0])  # switched order between google and twitter
    if key is not None:
        url += '&key=' + str(key)
    # make request to google
    r = requests.get(url)
    # parse data
    data = r.json()
    # get desired country fields
    long_name = ''
    short_name = ''
    region = ''
    postal_code = ''
    for results in data.get('results'):
        #print item
        for item in results:
            if item == 'address_components':
                for elem in results.get(item):
                    if 'country' in elem.get('types') or 'political' in elem.get('types'):
                        long_name = elem.get('long_name')
                        short_name = elem.get('short_name')
                    if 'administrative_area_level_1' in elem.get('types'):
                        region = elem.get('long_name')
                    if 'postal_code' in elem.get('types'):
                        postal_code = elem.get('long_name')
    return short_name, long_name, region, postal_code
def get_country_mapquest(latlong):
    # reverse geocode via MapQuest's Nominatim endpoint
    # http://open.mapquestapi.com/nominatim/#reverse
    url = 'http://open.mapquestapi.com/nominatim/v1/reverse.php?format=json&lat=' + str(latlong[1]) + '&lon=' + str(latlong[0])
    r = requests.get(url)
    if r.status_code != 200:
        print >> sys.stderr, r.status_code  # keep error output off stdout, which carries the TSV rows
        return None, None, None, None
    else:
        data = r.json()
        if data.get('address') is not None:
            short_country = data.get('address').get('country_code')
            long_country = data.get('address').get('country')
            region = data.get('address').get('state')
            postal_code = data.get('address').get('postcode')
            return short_country, long_country, region, postal_code
        else:
            print >> sys.stderr, data
            return None, None, None, None

def get_country_opencage(latlong):
    # placeholder for a third reverse-geocoding backend
    pass

#if __name__ == "__main__":
#    print get_country_mapquest([42.5,8.2])
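A quick way to sanity-check get_geo.py is to feed it one hand-made tweet. The sketch below is not part of the gist: the sample tweet and the tiny lang_dict are made up, and the reverse-geocoding call needs network access.

# -*- coding: utf-8 -*-
# Hypothetical smoke test for get_geo.py (not part of the gist).
from collections import defaultdict
from get_geo import get_language, get_country_mapquest

lang_dict = defaultdict(unicode)
lang_dict['de'] = u'German'

# minimal geotagged tweet in the shape get_language() expects (coordinates are GeoJSON lon/lat)
sample_tweet = ('{"id": 1, "lang": "de", "created_at": "Sat Jan 13 12:00:00 +0000 2018",'
                ' "coordinates": {"coordinates": [8.54, 47.37]},'
                ' "user": {"id": 42, "lang": "de-CH", "created_at": "Sun Jan 01 00:00:00 +0000 2012"}}')

print get_country_mapquest([8.54, 47.37])  # a tuple like ('ch', <country>, <canton>, <postcode>) if the service responds
get_language(sample_tweet, lang_dict)      # prints one tab-separated row to stdout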
# -*- coding: utf-8 -*-
# Separate script: concatenate the per-run TSV files in _data/ into one file with a header.
import glob
import os

output_csv = "ch_tweets.csv"
lineend = '\n'
delim = '\t'

try:
    os.remove(output_csv)
except OSError:
    pass

line_count = 0
with open(output_csv, 'wb') as output:
    # header line
    header = ['user_id', 'user_created_at', 'user_lang_code', 'user_lang_name',
              'tweet_id', 'tweet_created_at', 'tweet_lang_code', 'tweet_lang_name',
              'tweet_longitude', 'tweet_latitude', 'tweet_country_code', 'tweet_country_name',
              'tweet_region_name', 'tweet_postal_code']
    output.write(delim.join(header) + lineend)
    for csv_fyle in glob.glob('_data/*.csv'):
        tmp_line = 0
        with open(csv_fyle) as infile:
            for line in infile:
                output.write(line)
                line_count += 1
                tmp_line += 1
        print 'Read', tmp_line, 'from:', csv_fyle

print
print 'Done writing: ', output_csv
print 'Total lines: ', line_count
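Once ch_tweets.csv exists, it can be spot-checked with the standard csv module. A minimal sketch (not part of the gist) that counts rows per tweet language name, using the column names from the header above:

# -*- coding: utf-8 -*-
# Hypothetical check of the merged file (not part of the gist).
import csv
from collections import Counter

counts = Counter()
with open('ch_tweets.csv', 'rb') as f:
    for row in csv.DictReader(f, delimiter='\t'):
        counts[row['tweet_lang_name']] += 1

for lang, n in counts.most_common(10):
    print lang, n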
# Separate script: stream geotagged tweets from the Switzerland bounding box.
# Adapted from:
# https://raw.githubusercontent.com/philshem/datasci_course_materials/master/assignment1/twitterstream.py
import oauth2 as oauth
import urllib2 as urllib
import time
from collections import defaultdict

from get_geo import get_language

access_token_key = "REDACTED"
access_token_secret = "REDACTED"
consumer_key = "REDACTED"
consumer_secret = "REDACTED"

_debug = 0

# language codes accepted by the streaming API:
# https://dev.twitter.com/streaming/overview/request-parameters#language
# http://tools.ietf.org/html/bcp47
lang_dict = defaultdict(unicode)
#from run_lang_dict import read_lang_dict # http://tools.ietf.org/html/bcp47
#lang_dict = read_lang_dict()
with open('written_in.csv', 'rb') as written_in:
    for line in written_in:
        if line.startswith('Language Name'):  # skip the header row
            continue
        lang_code = line.replace('\n', '').split('\t')[1]
        lang_name = line.replace('\n', '').split('\t')[0]
        lang_dict[lang_code] = lang_name
#print lang_dict

oauth_token = oauth.Token(key=access_token_key, secret=access_token_secret)
oauth_consumer = oauth.Consumer(key=consumer_key, secret=consumer_secret)

signature_method_hmac_sha1 = oauth.SignatureMethod_HMAC_SHA1()

http_method = "GET"

http_handler = urllib.HTTPHandler(debuglevel=_debug)
https_handler = urllib.HTTPSHandler(debuglevel=_debug)
'''
Construct, sign, and open a twitter request
using the hard-coded credentials above.
'''
def twitterreq(url, method, parameters):
    req = oauth.Request.from_consumer_and_token(oauth_consumer, token=oauth_token, http_method=http_method, http_url=url, parameters=parameters)
    req.sign_request(signature_method_hmac_sha1, oauth_consumer, oauth_token)
    headers = req.to_header()
    if http_method == "POST":
        encoded_post_data = req.to_postdata()
    else:
        encoded_post_data = None
        url = req.to_url()
    opener = urllib.OpenerDirector()
    opener.add_handler(http_handler)
    opener.add_handler(https_handler)
    response = opener.open(url, encoded_post_data)
    return response

def fetchsamples():
    # filter the public stream by language and by a bounding box covering Switzerland
    # (locations = SW longitude, SW latitude, NE longitude, NE latitude)
    url = 'https://stream.twitter.com/1.1/statuses/filter.json?' + 'language=' + ','.join([key for key in lang_dict]) + '&locations=5.955870,45.818020,10.492030,47.808380'
    #print url
    parameters = []
    response = twitterreq(url, "GET", parameters)
    for line in response:
        tweet = line.strip()
        #print tweet # debugging
        get_language(tweet, lang_dict)

if __name__ == '__main__':
    fetchsamples()
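The streamer prints TSV rows to stdout and the merge script reads from _data/*.csv. One way to connect them is sketched below; it is not part of the gist, it assumes the streaming script is saved as twitterstream.py and that the _data/ directory already exists.

# -*- coding: utf-8 -*-
# Hypothetical driver (not part of the gist): capture the streamer's printed
# TSV rows into the _data/ folder that the merge script globs over.
import sys
from datetime import datetime

from twitterstream import fetchsamples   # assumed file name for the streaming script above

outfile = '_data/ch_tweets_%s.csv' % datetime.now().strftime('%Y%m%d_%H%M%S')
sys.stdout = open(outfile, 'w')          # get_language() prints rows, so redirect stdout to a file
try:
    fetchsamples()                       # runs until the connection drops or you press Ctrl-C
finally:
    sys.stdout.close()
    sys.stdout = sys.__stdout__
    print 'Wrote', outfile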
written_in.csv (tab-separated list of Twitter language codes, read by the streaming script above)

Language Name	Language Code
Amharic am
Arabic ar
Bulgarian bg
Bengali bn
Tibetan bo
Cherokee chr
Danish da
German de
Maldivian dv
Greek el
English en
Spanish es
Persian fa
Finnish fi
French fr
Gujarati gu
Hebrew iw
Hindi hi
Hungarian hu
Armenian hy
Indonesian in
Icelandic is
Italian it
Inuktitut iu
Japanese ja
Georgian ka
Khmer km
Kannada kn
Korean ko
Lao lo
Lithuanian lt
Malaysian ml
Myanmar my
Nepali ne
Dutch nl
Norwegian no
Oriya or
Panjabi pa
Polish pl
Portuguese pt
Russian ru
Sinhala si
Swedish sv
Tamil ta
Telugu te
Thai th
Tagalog tl
Turkish tr
Urdu ur
Vietnamese vi
Chinese zh