Last active
August 29, 2015 14:10
-
-
Save shivamy/39f744c2cd5d8bcf3650 to your computer and use it in GitHub Desktop.
NASA Orion Journey Names Analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/Users/shiva/anaconda/bin/python | |
# -*- coding: utf-8 -*- | |
import os | |
import sys | |
import re | |
import locale | |
import pprint | |
import scraperwiki | |
from bs4 import BeautifulSoup | |
from collections import defaultdict | |
class NasaData(object):
    """Shared configuration for the NASA Orion sign-up analysis.

    The class only carries class-level constants: the two pages that are
    scraped and the files the scraped data and the joined metrics are
    written to. It inherits from ``object`` so that it is a new-style class
    on Python 2 as well; instantiating it takes no arguments.
    """

    # Page listing Orion sign-up counts per country.
    nasa_site = "http://mars.nasa.gov/participate/send-your-name/orion-first-flight/world-participation-map/"
    # Page with the per-country reference table.
    ctry_site = "http://countrycode.org/"

    # Output of get_nasa_entries(): one "name,count" line per country.
    nasa_file_path = "/tmp/nasa_orion_reg_by_country.txt"
    # Output of get_country_details(): one "|field|field|...|" line per table row.
    ctry_file_path = "/tmp/countrycode_org_data.txt"
    # Output of join_country_data(): one ":"-separated metrics line per country.
    metrics_file_path = "/tmp/nasa_metrics_by_country.txt"
def get_nasa_entries():
    """Scrape the NASA Orion participant count per country.

    Writes one ``name,count`` line per country to
    ``NasaData.nasa_file_path``. If that file already exists and is larger
    than 10 bytes it is kept as-is: a warning is printed and the page is not
    downloaded.

    Args: None

    Returns:
        True when the page was scraped and the file written, False when the
        existing file was kept.
    """
    out_file = NasaData.nasa_file_path
    # Look at the cached file before touching the network, so that a kept
    # file really means "no scraping" rather than a discarded download.
    if os.path.exists(out_file) and os.path.getsize(out_file) > 10:
        print("Warning: " + out_file + " exists. Continuing without scraping NASA data.\n")
        return False

    html = scraperwiki.scrape(NasaData.nasa_site)
    soup = BeautifulSoup(html)
    countries = soup.find('ul', class_='countryList')

    # Build every line before opening the file so that a parsing failure
    # does not leave a truncated output file behind.
    lines = []
    for country in countries.find_all('li'):
        c_name = country.find('div', class_='countryName').text
        c_num = country.find('div', class_='countNumber').text.strip()
        lines.append(''.join([c_name, ',', c_num, '\n']))

    with open(out_file, 'wt') as fh:
        fh.writelines(lines)
    return True
# HTML table id: main_table_blue with header
# Typical row: [ <td align="left"> <a href="country_detail.cfm?countryID=1">Afghanistan</a> </td>,
#                <td align="center">AF / AFG </td>,
#                <td align="center">93 </td>,
#                <td align="center">28,396,000 <span class="rank">(43) </span></td>,
#                <td align="center">652,230 <span class="rank">(42) </span></td>,
#                <td align="center"> 22.27 Billion <span class="rank">(114) </span></td> ]
def get_country_details():
    """Scrape the countrycode.org table (population, area, gdp, etc.).

    Every ``<tr>`` of the table with id ``main_table_blue`` becomes one line
    of the form ``|field|field|...|`` in ``NasaData.ctry_file_path``. Header
    rows contribute the first child of each ``<th>``; data rows contribute
    the text of the first link in the first ``<td>`` (the country name) and
    the stripped first child of every following ``<td>``. If the output file
    already exists and is larger than 10 bytes it is kept as-is: a warning
    is printed and the page is not downloaded.

    Args: None

    Returns:
        True when the page was scraped and the file written, False when the
        existing file was kept.
    """
    out_file = NasaData.ctry_file_path
    # Look at the cached file before touching the network, so that a kept
    # file really means "no scraping" rather than a discarded download.
    if os.path.exists(out_file) and os.path.getsize(out_file) > 10:
        print("Warning: " + out_file + " exists. Continuing without scraping COUNTRY_CODE data.\n")
        return False

    html = scraperwiki.scrape(NasaData.ctry_site)
    soup = BeautifulSoup(html)
    cnty_table = soup.find('table', id='main_table_blue')

    # Build every line before opening the file so that a parsing failure
    # does not leave a truncated output file behind.
    lines = []
    for row in cnty_table.find_all('tr'):
        header_cells = row.find_all('th')
        if header_cells:
            fields = [cell.contents[0] for cell in header_cells]
        else:
            fields = []
            for ix, cell in enumerate(row.find_all('td')):
                if ix == 0:
                    # Country name is the text of the link in the first cell.
                    fields.append(cell.find_all('a')[0].string)
                else:
                    # Remaining attributes: leading text node only, which
                    # leaves out the trailing rank <span>.
                    fields.append(cell.contents[0].strip())
        # A row without any cells still yields a lone "|" line.
        lines.append('|' + ''.join(field + '|' for field in fields) + '\n')

    with open(out_file, 'wt') as fh:
        fh.writelines(lines)
    return True
def _normalize_country_name(name):
    """Return *name* trimmed, lowercased, without a leading "the " and with whitespace runs collapsed."""
    name = name.strip().lower()
    name = re.sub(r'(^[Tt]he\s+)', '', name)
    return re.sub(r'(\s{2,})', ' ', name)


def _gdp_in_millions(text):
    """Parse a GDP cell such as "22.27 Billion" into millions; None when it has no leading number."""
    text = text.strip()
    match = re.match(r'(\d+\.?\d*)', text)
    if match is None:
        return None
    gdp = float(match.group(0))
    if re.search(r'(Billion)', text, re.I):
        gdp = gdp * 1000
    elif re.search(r'(Trillion)', text, re.I):
        gdp = gdp * 1000000
    return gdp


def _group_digits(value):
    """Format an integer with the current locale's thousands grouping."""
    return locale.format_string('%d', value, grouping=True)


def join_country_data(nasa_path=None, ctry_path=None, metrics_path=None):
    """Join the two scraped data sets by country name and write the metrics file.

    For every country found in both inputs with population, area and
    (whole-million) gdp all greater than zero, one line is written:

        Name:votes:pop:votes per 10K pop:area:votes per 1K area:gdp:votes per Billion $ gdp

    Lines are sorted by normalized country name. Input lines that cannot be
    parsed are skipped, and when a name occurs more than once in an input
    the first occurrence is used.

    Args:
        nasa_path: file with ``name,count`` lines; defaults to
            ``NasaData.nasa_file_path``.
        ctry_path: file with ``|name|code|dial|pop|area|gdp|`` lines, the
            first of which is a header; defaults to ``NasaData.ctry_file_path``.
        metrics_path: output file; defaults to ``NasaData.metrics_file_path``.

    Returns:
        True once the output file has been written.
    """
    nasa_path = nasa_path or NasaData.nasa_file_path
    ctry_path = ctry_path or NasaData.ctry_file_path
    metrics_path = metrics_path or NasaData.metrics_file_path

    # normalized name -> (count as written in the file, count as float)
    votes = {}
    with open(nasa_path, 'rt') as nasa_fh:
        for line in nasa_fh:
            ln_els = line.strip('\n').split(',')
            if len(ln_els) < 2:
                continue  # blank or malformed line
            try:
                count = float(ln_els[1])
            except ValueError:
                continue
            votes.setdefault(_normalize_country_name(ln_els[0]), (ln_els[1], count))

    # normalized name -> (population, area, gdp in millions)
    details = {}
    with open(ctry_path, 'rt') as ctry_fh:
        for l_num, line in enumerate(ctry_fh):
            # line: |Afghanistan|AF / AFG|93|28,396,000|652,230|22.27 Billion|
            if l_num == 0:
                continue  # Skip header
            ln_els = line.strip('\n').split('|')
            if len(ln_els) < 7:
                continue  # row without the expected columns
            gdp = _gdp_in_millions(ln_els[6])
            if gdp is None:
                continue
            try:
                # Strip out the thousands commas in pop (element 4) and area (5).
                pop = int(ln_els[4].replace(',', ''))
                area = int(ln_els[5].replace(',', ''))
            except ValueError:
                continue
            details.setdefault(_normalize_country_name(ln_els[1]), (pop, area, gdp))

    # TODO: Some country names are not standard in NASA data. Example French Guiana is either Guiana or Guyana
    # Delete what is not found in country code data or match countries with hard coded values
    try:
        locale.setlocale(locale.LC_ALL, '')
    except locale.Error:
        # Unsupported locale in the environment: keep the current locale,
        # which only affects digit grouping in the output.
        pass

    lines = []
    for name in sorted(votes):
        if name not in details:
            continue
        votes_text, count = votes[name]
        pop, area, gdp = details[name]
        # Every divisor must be positive; gdp is compared as whole millions.
        if not (pop > 0 and area > 0 and int(gdp) > 0):
            continue
        fields = [
            name.title(),
            votes_text,
            _group_digits(pop),
            str(round(count * 10000 / pop, 5)),   # per 10K pop
            _group_digits(area),
            str(round(count * 1000 / area, 5)),   # per 1K sq mile
            _group_digits(int(gdp)),
            str(round(count * 1000 / gdp, 5)),    # per Billion $ gdp
        ]
        lines.append(":".join(fields) + "\n")

    # Written last so that a failure while reading does not truncate an
    # existing metrics file.
    with open(metrics_path, 'wt') as out_fh:
        out_fh.writelines(lines)
    return True
if __name__ == "__main__":
    # Scrape both sources (each step keeps an already existing output file),
    # then join them into the metrics file.
    get_nasa_entries()
    get_country_details()
    join_country_data()
    # sys.exit rather than the bare exit(), which is a helper installed by
    # the site module for interactive use and is not always available.
    sys.exit(0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment