|
#!/Users/shiva/anaconda/bin/python |
|
# -*- coding: utf-8 -*- |
|
|
|
import os |
|
import sys |
|
import re |
|
import locale |
|
import pprint |
|
import scraperwiki |
|
from bs4 import BeautifulSoup |
|
from collections import defaultdict |
|
|
|
|
|
class NasaData:
    """Namespace for the URLs this script scrapes and the files it reads and writes."""

    # Pages fetched by the two scraper functions.
    ctry_site = 'http://countrycode.org/'
    nasa_site = 'http://mars.nasa.gov/participate/send-your-name/orion-first-flight/world-participation-map/'

    # Text files produced by the scrapers.
    ctry_file_path = '/tmp/countrycode_org_data.txt'
    nasa_file_path = '/tmp/nasa_orion_reg_by_country.txt'

    # Result of joining the two scraped files by country name.
    metrics_file_path = '/tmp/nasa_metrics_by_country.txt'

    def __init__(self):
        """Do nothing: every value is a class attribute, no instance state is kept."""
|
|
|
|
|
def get_nasa_entries(site=None, out_file=None):
    """Scrape the NASA Orion participant count per country into a text file.

    One ``<country name>,<count>`` line is written for every ``<li>`` inside
    the page's ``<ul class="countryList">`` element.

    Args:
        site: URL of the page to scrape. Defaults to ``NasaData.nasa_site``.
        out_file: Path of the file to write. Defaults to
            ``NasaData.nasa_file_path``.

    Returns:
        True when the page was scraped and the file written; False when
        ``out_file`` already holds more than 10 bytes and scraping was skipped.

    Raises:
        RuntimeError: If the page has no ``<ul class="countryList">`` element.
    """
    if out_file is None:
        out_file = NasaData.nasa_file_path

    # Look for cached output *before* touching the network, so that
    # "continuing without scraping" really means no request is made.
    if os.path.exists(out_file) and os.path.getsize(out_file) > 10:
        print("Warning: " + out_file + " exists. Continuing without scraping NASA data.\n")
        return False

    if site is None:
        site = NasaData.nasa_site
    html = scraperwiki.scrape(site)
    soup = BeautifulSoup(html)

    countries = soup.find('ul', class_='countryList')
    if countries is None:
        # Fail with a readable message (and before creating an empty output
        # file) instead of an AttributeError on None further down.
        raise RuntimeError("No <ul class='countryList'> element found at " + site)

    with open(out_file, 'wt') as fh:
        for country in countries.find_all('li'):
            # Strip both fields: stray whitespace or newlines around the name
            # would otherwise break the one-line-per-country file format.
            c_name = country.find('div', class_='countryName').text.strip()
            c_num = country.find('div', class_='countNumber').text.strip()
            fh.write(''.join([c_name, ',', c_num, '\n']))

    return True
|
|
|
|
|
# The scraped HTML table has id "main_table_blue" and contains a header row.
# Typical data row:
#   <td align="left"> <a href="country_detail.cfm?countryID=1">Afghanistan</a> </td>
#   <td align="center">AF / AFG </td>
#   <td align="center">93 </td>
#   <td align="center">28,396,000 <span class="rank">(43) </span></td>
#   <td align="center">652,230 <span class="rank">(42) </span></td>
#   <td align="center"> 22.27 Billion <span class="rank">(114) </span></td>
def get_country_details(site=None, out_file=None):
    """Scrape the country-code table (population, area, GDP, ...) into a text file.

    Every table row that has cells becomes one line of the form
    ``|cell 1|cell 2|...|``. For data rows the first field is the text of the
    first link in the first cell (the country name); each remaining field is
    the first child of its cell with surrounding whitespace stripped.

    Args:
        site: URL of the page to scrape. Defaults to ``NasaData.ctry_site``.
        out_file: Path of the file to write. Defaults to
            ``NasaData.ctry_file_path``.

    Returns:
        True when the page was scraped and the file written; False when
        ``out_file`` already holds more than 10 bytes and scraping was skipped.

    Raises:
        RuntimeError: If the page has no table with id ``main_table_blue``.
    """
    if out_file is None:
        out_file = NasaData.ctry_file_path

    # Look for cached output *before* touching the network, so that
    # "continuing without scraping" really means no request is made.
    if os.path.exists(out_file) and os.path.getsize(out_file) > 10:
        print("Warning: " + out_file + " exists. Continuing without scraping COUNTRY_CODE data.\n")
        return False

    if site is None:
        site = NasaData.ctry_site
    html = scraperwiki.scrape(site)
    soup = BeautifulSoup(html)

    cnty_table = soup.find('table', id='main_table_blue')
    if cnty_table is None:
        # Fail with a readable message (and before creating an empty output
        # file) instead of an AttributeError on None further down.
        raise RuntimeError("No table with id 'main_table_blue' found at " + site)

    with open(out_file, 'wt') as fh:
        for row in cnty_table.find_all('tr'):
            header_cells = row.find_all('th')
            if header_cells:
                fields = [cell.contents[0] for cell in header_cells]
            else:
                data_cells = row.find_all('td')
                if not data_cells:
                    # A row without any cells used to be written as a bare
                    # "|" line, which is not a valid record for the join step.
                    continue
                # First cell: the country name is the text of its link.
                fields = [data_cells[0].find_all('a')[0].string]
                # Remaining cells: text before any <span class="rank">; an
                # empty cell becomes an empty field so columns stay aligned.
                fields.extend(
                    cell.contents[0].strip() if cell.contents else ''
                    for cell in data_cells[1:]
                )
            fh.write('|' + '|'.join(fields) + '|\n')

    return True
|
|
|
|
|
def _normalize_country_name(name):
    """Return *name* as a join key: trimmed, lowercased, no leading "the ", runs of spaces collapsed."""
    name = name.strip().lower()
    name = re.sub(r'^the\s+', '', name)
    return re.sub(r'\s{2,}', ' ', name)


def _gdp_in_millions(text):
    """Convert a GDP field such as ``22.27 Billion`` to a float in millions, or None if it has no leading number."""
    match = re.match(r'\d+\.?\d*', text)
    if match is None:
        return None
    gdp = float(match.group(0))
    if re.search(r'Billion', text, re.I):
        gdp = gdp * 1000
    elif re.search(r'Trillion', text, re.I):
        gdp = gdp * 1000000
    return gdp


def join_country_data(nasa_file_path=None, ctry_file_path=None, metrics_file_path=None):
    """Join the NASA counts with the country-code data by country name.

    Reads ``<name>,<count>`` lines from the NASA file and
    ``|name|code|dial|population|area|gdp|`` lines (after one header line)
    from the country-code file, matches them on the normalized country name
    and writes one line per matched country, sorted by name:

        Name:count:population:count per 10K population:area:
        count per 1K area units:gdp in millions:count per billion of gdp

    Population, area and GDP are written with locale-specific digit grouping.
    Countries missing from either file, or whose population, area or GDP
    (truncated to whole millions) is not positive, are left out. Lines that
    cannot be parsed are skipped. If a country occurs more than once in a
    file, its first occurrence is used.

    Args:
        nasa_file_path: NASA counts file. Defaults to
            ``NasaData.nasa_file_path``.
        ctry_file_path: Country-code file. Defaults to
            ``NasaData.ctry_file_path``.
        metrics_file_path: Output file. Defaults to
            ``NasaData.metrics_file_path``.

    Returns:
        True once the output file has been written.
    """
    if nasa_file_path is None:
        nasa_file_path = NasaData.nasa_file_path
    if ctry_file_path is None:
        ctry_file_path = NasaData.ctry_file_path
    if metrics_file_path is None:
        metrics_file_path = NasaData.metrics_file_path

    # country key -> count exactly as it appears in the NASA file
    votes_by_country = {}
    with open(nasa_file_path, 'rt') as nasa_fh:
        for line in nasa_fh:
            fields = line.strip('\n').split(',')
            if len(fields) < 2:
                continue  # blank or malformed line
            votes = fields[1].strip()
            try:
                float(votes)
            except ValueError:
                continue  # count is not a number
            votes_by_country.setdefault(_normalize_country_name(fields[0]), votes)

    # country key -> (population, area, gdp in millions)
    metrics_by_country = {}
    with open(ctry_file_path, 'rt') as ctry_fh:
        for l_num, line in enumerate(ctry_fh):
            # line: |Afghanistan|AF / AFG|93|28,396,000|652,230|22.27 Billion|
            if l_num == 0:
                continue  # Skip header
            fields = line.strip('\n').split('|')
            if len(fields) < 7:
                continue  # blank or truncated row
            try:
                # Drop the thousands separators before converting.
                pop = int(fields[4].replace(',', ''))
                area = int(fields[5].replace(',', ''))
            except ValueError:
                continue
            gdp = _gdp_in_millions(fields[6].strip())
            if gdp is None:
                continue
            metrics_by_country.setdefault(_normalize_country_name(fields[1]), (pop, area, gdp))

    # TODO: Some country names are not standard in NASA data. Example: French
    # Guiana is either Guiana or Guyana. Delete what is not found in country
    # code data or match countries with hard coded values.

    try:
        locale.setlocale(locale.LC_ALL, '')
    except locale.Error:
        # The environment names a locale that is not installed; keep the
        # current locale, which only affects digit grouping in the output.
        pass

    # Open the output only after both inputs were read, so a missing input
    # does not leave a truncated result file behind.
    with open(metrics_file_path, 'wt') as out_fh:
        for name in sorted(votes_by_country):
            if name not in metrics_by_country:
                continue
            pop, area, gdp = metrics_by_country[name]
            # Every one of these is used as a divisor below.
            if pop <= 0 or area <= 0 or int(gdp) <= 0:
                continue
            votes = float(votes_by_country[name])
            out_fh.write(':'.join([
                name.title(),
                votes_by_country[name],
                locale.format_string('%d', pop, grouping=True),
                str(round(votes * 10000 / pop, 5)),    # per 10K population
                locale.format_string('%d', area, grouping=True),
                str(round(votes * 1000 / area, 5)),    # per 1K area units
                locale.format_string('%d', int(gdp), grouping=True),
                str(round(votes * 1000 / gdp, 5)),     # per billion of gdp
            ]) + '\n')

    return True
|
|
|
|
|
|
|
def main():
    """Run the whole pipeline: scrape both sources, then join them.

    Returns:
        The process exit status, always 0.
    """
    # The scrapers' return values are deliberately ignored: False only
    # reports that an existing output file was reused.
    get_nasa_entries()
    get_country_details()
    join_country_data()
    return 0


if __name__ == "__main__":
    # sys.exit rather than the bare exit(): the latter is injected by the
    # site module for interactive use and is not guaranteed to exist.
    sys.exit(main())