Last active
January 13, 2018 15:34
-
-
Save philshem/685cfcf5e3acf3d3351d to your computer and use it in GitHub Desktop.
Source code for "Switzerland Tweets" dataviz
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import glob | |
import json | |
import requests | |
delim = '\t' | |
def get_language(tweet, lang_dict):
    """Print one tab-separated row describing a geotagged tweet.

    A row is printed only when the message has point coordinates, both
    the tweet and its author carry a language code, and the reverse
    geocoder returns a country code.  Every other message is skipped.

    Columns, in order: user id, user created_at, user language code,
    user language name, tweet id, tweet created_at, tweet language code,
    tweet language name, coordinates[0], coordinates[1], country code,
    country name, region, postal code.

    Args:
        tweet: raw JSON text of one message.
        lang_dict: mapping from language code to language name.  It is
            looked up with the part of the code before the first '-'.

    Returns:
        None.  The row is emitted with ``print``; diagnostics go to
        stderr so they do not end up between the data rows.
    """
    import sys  # local import: only used for diagnostics

    def as_text(value):
        # A missing value becomes an empty field instead of breaking the
        # join (None) or showing up as the literal text "None".
        return u'' if value is None else u'%s' % value

    try:
        tweet = json.loads(tweet)
    except (TypeError, ValueError):
        # Blank lines and truncated payloads are not valid JSON.
        return
    if not isinstance(tweet, dict):
        # Valid JSON that is not an object (a number, a list, ...) has
        # no fields to read.
        return

    coords = tweet.get('coordinates')
    if not isinstance(coords, dict):
        return
    latlong = coords.get('coordinates')
    user = tweet.get('user') or {}
    user_lang = user.get('lang')
    tweet_lang = tweet.get('lang')
    if latlong is None or tweet_lang is None or user_lang is None:
        return

    # dict.get never raises for an unknown code, so test for None
    # explicitly; an unknown language is reported and its name is left
    # empty rather than silently dropping the whole row.
    full_names = []
    for code in (user_lang, tweet_lang):
        name = lang_dict.get(code.split('-')[0])
        if name is None:
            sys.stderr.write('ERROR - missing_language %r\n' % (code,))
        full_names.append(name)
    user_full_lang, tweet_full_lang = full_names

    short_country, long_country, region, postal_code = \
        get_country_mapquest(latlong)
    if short_country is None:
        return

    fields = (
        user.get('id'), user.get('created_at'), user_lang, user_full_lang,
        tweet.get('id'), tweet.get('created_at'), tweet_lang, tweet_full_lang,
        latlong[0], latlong[1],
        short_country, long_country,
        # Region and postal code were fetched but never written, leaving
        # rows two columns short of the 14-column header used when the
        # data files are merged.
        region, postal_code,
    )
    try:
        print(delim.join([as_text(field) for field in fields]).encode('utf-8'))
    except UnicodeError as err:
        # Report undecodable text instead of hiding it.
        sys.stderr.write('ERROR - could not encode row: %r\n' % (err,))
def get_country_google(latlong):
    """Reverse-geocode a point through the Google Geocoding JSON endpoint.

    Args:
        latlong: sequence whose element 1 is sent first and element 0
            second in the ``latlng=`` parameter (the order is swapped
            relative to the order the coordinates arrive in).

    Returns:
        ``(short_name, long_name, region, postal_code)``.  Each value is
        an empty string when no matching address component was seen.
    """
    key = None
    urlbase = 'http://maps.googleapis.com/maps/api/geocode/json?latlng='
    # switched order between google and twitter
    url = urlbase + str(latlong[1]) + ',' + str(latlong[0])
    if key is not None:
        url += '&key=' + str(key)

    # Fetch and decode the response in one step.
    data = requests.get(url).json()

    long_name = short_name = region = postal_code = ''
    for result in data.get('results'):
        if 'address_components' not in result:
            continue
        for component in result.get('address_components'):
            # NOTE(review): 'political' also matches non-country
            # components, so the "country" fields hold whichever
            # matching component is seen last -- confirm this is wanted.
            if 'country' in component.get('types') or 'political' in component.get('types'):
                long_name = component.get('long_name')
                short_name = component.get('short_name')
            if 'administrative_area_level_1' in component.get('types'):
                region = component.get('long_name')
            if 'postal_code' in component.get('types'):
                postal_code = component.get('long_name')
    return short_name, long_name, region, postal_code
def get_country_mapquest(latlong):
    """Reverse-geocode a point with MapQuest's Open Nominatim endpoint.

    Args:
        latlong: sequence whose element 0 is sent as ``lon`` and
            element 1 as ``lat``.

    Returns:
        ``(country_code, country, state, postcode)`` read from the
        ``address`` object of the response; individual items are None
        when the response omits them.  ``(None, None, None, None)`` is
        returned when the request fails, the status is not 200, the body
        is not JSON, or the response has no ``address``.
    """
    import sys  # local import: only used for diagnostics

    not_found = (None, None, None, None)

    # http://open.mapquestapi.com/nominatim/#reverse
    url = 'http://open.mapquestapi.com/nominatim/v1/reverse.php?format=json&lat=' \
        + str(latlong[1]) + '&lon=' + str(latlong[0])

    try:
        # Without a timeout a stalled connection blocks the caller
        # forever; 10 seconds is an arbitrary but generous limit.
        r = requests.get(url, timeout=10)
    except requests.exceptions.RequestException as err:
        # A network problem for one point must not abort the caller;
        # treat it like any other failed lookup.
        sys.stderr.write('ERROR - geocoder request failed: %s\n' % (err,))
        return not_found

    # Diagnostics go to stderr: stdout is where the data rows are
    # printed, and status codes or raw payloads would corrupt them.
    if r.status_code != 200:
        sys.stderr.write('%s\n' % (r.status_code,))
        return not_found

    try:
        data = r.json()
    except ValueError:
        sys.stderr.write('ERROR - geocoder returned a non-JSON body\n')
        return not_found

    address = data.get('address') if isinstance(data, dict) else None
    if address is None:
        sys.stderr.write('%r\n' % (data,))
        return not_found

    return (
        address.get('country_code'),
        address.get('country'),
        address.get('state'),
        address.get('postcode'),
    )
def get_country_opencage(latlong):
    """Unimplemented placeholder; ignores ``latlong`` and returns None.

    Presumably reserved for an OpenCage-based lookup, by analogy with
    the sibling ``get_country_*`` functions -- TODO confirm.
    """
    return None
#if __name__ == "__main__": | |
# print get_country_mapquest([42.5,8.2]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import glob | |
import os | |
# Merge every file matching _data/*.csv into a single tab-separated file
# that starts with one header line.
output_csv = "ch_tweets.csv"
lineend = '\n'
delim = '\t'

# Remove any previous output; a missing file is not an error.
try:
    os.remove(output_csv)
except OSError:
    pass

header = [
    'user_id', 'user_created_at', 'user_lang_code', 'user_lang_name',
    'tweet_id', 'tweet_created_at', 'tweet_lang_code', 'tweet_lang_name',
    'tweet_longitude', 'tweet_latitude', 'tweet_country_code',
    'tweet_country_name', 'tweet_region_name', 'tweet_postal_code',
]

line_count = 0  # data lines copied so far (the header is not counted)
with open(output_csv, 'wb') as output:
    # header line
    output.write(delim.join(header) + lineend)
    for csv_fyle in glob.glob('_data/*.csv'):
        tmp_line = 0  # stays 0 when the input file is empty
        with open(csv_fyle) as infile:
            for tmp_line, line in enumerate(infile, 1):
                output.write(line)
        line_count += tmp_line
        # Joining with ' ' reproduces the spacing of a multi-argument print.
        print(' '.join(['Read', str(tmp_line), 'from:', csv_fyle]))

print(' '.join(['Done writing: ', output_csv]))
print(' '.join(['Total lines: ', str(line_count)]))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import oauth2 as oauth | |
import urllib2 as urllib | |
import time | |
from get_geo import get_language | |
# https://raw.githubusercontent.com/philshem/datasci_course_materials/master/assignment1/twitterstream.py | |
import os  # needed for the environment overrides below

# Twitter API credentials.  Each one can be supplied through the
# environment variable named below so real secrets need not be stored in
# this file; the literal is only the fallback used when it is unset.
access_token_key = os.environ.get("TWITTER_ACCESS_TOKEN_KEY", "REDACTED")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "REDACTED")
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "REDACTED")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "REDACTED")

# Passed as ``debuglevel`` to the urllib2 HTTP/HTTPS handlers.
_debug = 0
# https://dev.twitter.com/streaming/overview/request-parameters#language | |
# http://tools.ietf.org/html/bcp47 | |
from collections import defaultdict

# Maps a language code to its language name, loaded from the
# tab-separated file written_in.csv (column 0: name, column 1: code).
lang_dict = defaultdict(unicode)
#from run_lang_dict import read_lang_dict # http://tools.ietf.org/html/bcp47
#lang_dict = read_lang_dict()
with open('written_in.csv', 'rb') as written_in:
    for line_number, line in enumerate(written_in):
        fields = [field.strip() for field in line.split('\t')]
        if len(fields) < 2 or not fields[1]:
            # Blank or malformed line: nothing to record.
            continue
        lang_name, lang_code = fields[0], fields[1]
        if line_number == 0 and lang_code == 'Language Code':
            # Column-title row.  Loading it would add a bogus
            # 'Language Code' key, which then leaks into anything built
            # from the keys (such as the stream's language filter).
            continue
        lang_dict[lang_code] = lang_name
#print lang_dict
# Consumer and token objects built from the credentials defined above;
# both are used when signing requests.
oauth_consumer = oauth.Consumer(
    key=consumer_key,
    secret=consumer_secret,
)
oauth_token = oauth.Token(
    key=access_token_key,
    secret=access_token_secret,
)

# Signature method used when signing requests.
signature_method_hmac_sha1 = oauth.SignatureMethod_HMAC_SHA1()

# HTTP verb used for requests.
http_method = "GET"

# urllib2 handlers for plain and TLS connections; ``_debug`` sets their
# debug level.
https_handler = urllib.HTTPSHandler(debuglevel=_debug)
http_handler = urllib.HTTPHandler(debuglevel=_debug)
def twitterreq(url, method, parameters):
    """Construct, sign, and open a Twitter request.

    The request is signed with the module-level OAuth consumer, token
    and signature method.

    Args:
        url: endpoint URL, including any query string.
        method: HTTP verb.  For "POST" the signed parameters are sent as
            the request body; for anything else they are folded into the
            URL.  A falsy value falls back to the module-level
            ``http_method``.
        parameters: extra request parameters handed to the OAuth request
            builder.

    Returns:
        The response object returned by the opener.
    """
    # Honour the caller's verb; the argument used to be ignored in
    # favour of the module-level default.
    method = method or http_method

    req = oauth.Request.from_consumer_and_token(
        oauth_consumer,
        token=oauth_token,
        http_method=method,
        http_url=url,
        parameters=parameters,
    )
    req.sign_request(signature_method_hmac_sha1, oauth_consumer, oauth_token)

    if method == "POST":
        encoded_post_data = req.to_postdata()
    else:
        encoded_post_data = None
        # Non-POST requests carry the signature in the URL itself.
        url = req.to_url()

    opener = urllib.OpenerDirector()
    opener.add_handler(http_handler)
    opener.add_handler(https_handler)

    return opener.open(url, encoded_post_data)
def fetchsamples():
    """Open the filtered stream and hand every received line to get_language.

    The request filters on all language codes in ``lang_dict`` and on a
    fixed bounding box (presumably Switzerland, given the project title
    -- TODO confirm).  Runs until the response iterator is exhausted.
    """
    languages = ','.join(lang_dict)
    url = (
        'https://stream.twitter.com/1.1/statuses/filter.json?'
        + 'language=' + languages
        + '&locations=5.955870,45.818020,10.492030,47.808380'
    )
    #print url
    response = twitterreq(url, "GET", [])
    for raw_line in response:
        # Surrounding whitespace is stripped before the line is parsed.
        get_language(raw_line.strip(), lang_dict)


if __name__ == '__main__':
    fetchsamples()
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Language Name Language Code | |
Amharic am | |
Arabic ar | |
Bulgarian bg | |
Bengali bn | |
Tibetan bo | |
Cherokee chr | |
Danish da | |
German de | |
Maldivian dv | |
Greek el | |
English en | |
Spanish es | |
Persian fa | |
Finnish fi | |
French fr | |
Gujarati gu | |
Hebrew iw | |
Hindi hi | |
Hungarian hu | |
Armenian hy | |
Indonesian in | |
Icelandic is | |
Italian it | |
Inuktitut iu | |
Japanese ja | |
Georgian ka | |
Khmer km | |
Kannada kn | |
Korean ko | |
Lao lo | |
Lithuanian lt | |
Malayalam ml | |
Myanmar my | |
Nepali ne | |
Dutch nl | |
Norwegian no | |
Oriya or | |
Panjabi pa | |
Polish pl | |
Portuguese pt | |
Russian ru | |
Sinhala si | |
Swedish sv | |
Tamil ta | |
Telugu te | |
Thai th | |
Tagalog tl | |
Turkish tr | |
Urdu ur | |
Vietnamese vi | |
Chinese zh |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment