#!/Users/shiva/anaconda/bin/python
# -*- coding: utf-8 -*-
import os
import sys
import re
import locale
import pprint
import scraperwiki
from bs4 import BeautifulSoup
from collections import defaultdict
class NasaData:
    """Namespace for the URLs and cache-file locations used by this script.

    The attributes are read directly from the class (``NasaData.nasa_site``
    and so on); instances carry no state of their own.
    """

    # Pages that are scraped.
    nasa_site = "http://mars.nasa.gov/participate/send-your-name/orion-first-flight/world-participation-map/"
    ctry_site = "http://countrycode.org/"

    # Files written by the scrapers and read back by the join step.
    nasa_file_path = "/tmp/nasa_orion_reg_by_country.txt"
    ctry_file_path = "/tmp/countrycode_org_data.txt"

    # File written by the join step.
    metrics_file_path = "/tmp/nasa_metrics_by_country.txt"

    def __init__(self):
        """Create an instance; there is nothing to initialise."""
def get_nasa_entries():
    """Scrape the NASA Orion participant count per country into a cache file.

    Each ``<li>`` of the page's ``countryList`` element becomes one
    ``name,count`` line in ``NasaData.nasa_file_path`` (UTF-8 encoded).

    Args: None

    Returns:
        True when the page was scraped and the file written; False when a
        cache file larger than 10 bytes already exists, in which case nothing
        is downloaded.

    Raises:
        RuntimeError: the page does not contain the ``countryList`` element.
    """
    import io  # local import so this function does not depend on other edits

    out_file = NasaData.nasa_file_path
    # Look at the cache before touching the network: a cached run needs no download.
    if os.path.exists(out_file) and os.path.getsize(out_file) > 10:
        print("Warning: " + out_file + " exists. Continuing without scraping NASA data.\n")
        return False

    soup = BeautifulSoup(scraperwiki.scrape(NasaData.nasa_site))
    countries = soup.find('ul', class_='countryList')
    if countries is None:
        raise RuntimeError("No 'countryList' element found at " + NasaData.nasa_site)

    # Build every line before opening the file, so a parsing failure cannot
    # leave a partial file behind that a later run would take for a cache.
    lines = []
    for country in countries.findAll('li'):
        c_name = country.find('div', class_='countryName').text.strip()
        c_num = country.find('div', class_='countNumber').text.strip()
        lines.append(u"{0},{1}\n".format(c_name, c_num))

    # bs4 yields unicode text; encode explicitly so non-ASCII country names
    # do not fail on an implicit ASCII encode.
    with io.open(out_file, 'w', encoding='utf-8') as fh:
        fh.writelines(lines)
    return True
# HTML table id: main_table_blue with header
# Typical row: [ <td align="left"> <a href="country_detail.cfm?countryID=1">Afghanistan</a> </td>,
# <td align="center">AF / AFG </td>,
# <td align="center">93 </td>,
# <td align="center">28,396,000 <span class="rank">(43) </span></td>,
# <td align="center">652,230 <span class="rank">(42) </span></td>,
# <td align="center"> 22.27 Billion <span class="rank">(114) </span></td> ]
def get_country_details():
    """Scrape the countrycode.org country table into a cache file.

    Every ``<tr>`` of the table with id ``main_table_blue`` becomes one
    pipe-delimited line in ``NasaData.ctry_file_path`` (UTF-8 encoded), with
    a leading and a trailing ``|``. A header row contributes the first child
    of each ``<th>``. A data row contributes the text of the first link in
    its first cell (the country name), followed by the stripped first child
    of each remaining cell.

    Args: None

    Returns:
        True when the page was scraped and the file written; False when a
        cache file larger than 10 bytes already exists, in which case nothing
        is downloaded.

    Raises:
        RuntimeError: the page does not contain the ``main_table_blue`` table.
    """
    import io  # local import so this function does not depend on other edits

    out_file = NasaData.ctry_file_path
    # Look at the cache before touching the network: a cached run needs no download.
    if os.path.exists(out_file) and os.path.getsize(out_file) > 10:
        print("Warning: " + out_file + " exists. Continuing without scraping COUNTRY_CODE data.\n")
        return False

    soup = BeautifulSoup(scraperwiki.scrape(NasaData.ctry_site))
    table = soup.find('table', id='main_table_blue')
    if table is None:
        raise RuntimeError("No 'main_table_blue' table found at " + NasaData.ctry_site)

    # Build every line before opening the file, so a parsing failure cannot
    # leave a partial file behind that a later run would take for a cache.
    lines = []
    for row in table.findAll('tr'):
        header_cells = row.findAll('th')
        if header_cells:
            fields = [cell.contents[0] for cell in header_cells]
        else:
            fields = []
            for ix, cell in enumerate(row.findAll('td')):
                if ix == 0:
                    fields.append(cell.findAll('a')[0].string)  # country name
                else:
                    fields.append(cell.contents[0].strip())     # country attribute
        # Empty strings at both ends produce the leading and trailing '|';
        # a row without cells becomes a bare '|'.
        lines.append(u'|'.join([u''] + fields + [u'']) + u'\n')

    # bs4 yields unicode text; encode explicitly so non-ASCII country names
    # do not fail on an implicit ASCII encode.
    with io.open(out_file, 'w', encoding='utf-8') as fh:
        fh.writelines(lines)
    return True
def _normalize_country_name(name):
    """Return *name* trimmed, lowercased, single-spaced and without a leading "the "."""
    name = re.sub(r'\s+', ' ', name.strip().lower())
    return re.sub(r'^the ', '', name)


def _gdp_in_millions(text):
    """Convert a GDP cell such as "22.27 Billion" to millions; None if it has no number."""
    match = re.match(r'\s*(\d[\d,]*\.?\d*)', text)
    if match is None:
        return None
    value = float(match.group(1).replace(',', ''))
    if re.search(r'Billion', text, re.I):
        value *= 1000
    elif re.search(r'Trillion', text, re.I):
        value *= 1000000
    return value


def _read_nasa_counts(path):
    """Read "name,count" lines into {normalized name: count}, summing repeated names."""
    counts = defaultdict(int)
    with open(path, 'rt') as fh:
        for line in fh:
            # The count is the trailing run of digits/commas, so a comma inside
            # the country name or inside the number does not split the record.
            match = re.match(r'^(.*?),\s*([\d,]+)\s*$', line.rstrip('\r\n'))
            if match is None:
                continue
            digits = match.group(2).replace(',', '')
            if digits:
                counts[_normalize_country_name(match.group(1))] += int(digits)
    return counts


def _read_country_stats(path):
    """Read the pipe-delimited country file into {normalized name: (pop, area, gdp_millions)}."""
    stats = {}
    with open(path, 'rt') as fh:
        for l_num, line in enumerate(fh):
            # line: |Afghanistan|AF / AFG|93|28,396,000|652,230|22.27 Billion|
            if l_num == 0:
                continue  # Skip header
            parts = line.rstrip('\r\n').split('|')
            if len(parts) < 7:
                continue  # short or malformed row
            try:
                population = int(parts[4].replace(',', ''))
                area = int(parts[5].replace(',', ''))
            except ValueError:
                continue  # missing or non-numeric population/area
            gdp = _gdp_in_millions(parts[6])
            if gdp is None:
                continue  # no GDP figure for this country
            # Keep the first row seen for a given name.
            stats.setdefault(_normalize_country_name(parts[1]), (population, area, gdp))
    return stats


def join_country_data(nasa_path=None, ctry_path=None, metrics_path=None):
    """Join the NASA counts with the country statistics by country name.

    Names from both files are matched after trimming, lowercasing, dropping a
    leading "the " and collapsing whitespace. One colon-separated line is
    written per country found in both files whose population, area and GDP
    (in whole millions) are all positive, sorted by normalized name:

        Name:count:population:count per 10K population:area:
        count per 1K area units:GDP in millions:count per billion of GDP

    Population, area and GDP use the locale's digit grouping. Rows that
    cannot be parsed are skipped.

    Args:
        nasa_path: "name,count" input file; defaults to NasaData.nasa_file_path.
        ctry_path: pipe-delimited country input file; defaults to
            NasaData.ctry_file_path.
        metrics_path: output file; defaults to NasaData.metrics_file_path.

    Returns:
        True once the output file has been written.
    """
    if nasa_path is None:
        nasa_path = NasaData.nasa_file_path
    if ctry_path is None:
        ctry_path = NasaData.ctry_file_path
    if metrics_path is None:
        metrics_path = NasaData.metrics_file_path

    counts = _read_nasa_counts(nasa_path)
    stats = _read_country_stats(ctry_path)

    # TODO: Some country names are not standard in NASA data. Example: French
    # Guiana is either Guiana or Guyana. Such names do not match and are dropped.
    try:
        locale.setlocale(locale.LC_ALL, '')
    except locale.Error:
        # The environment names a locale this system lacks; keep the current
        # one. The numbers are still written, possibly without grouping.
        pass

    with open(metrics_path, 'wt') as out:
        for name in sorted(counts):
            if name not in stats:
                continue
            population, area, gdp = stats[name]
            # Each value is used as a divisor below; area was previously
            # unguarded and a zero area raised ZeroDivisionError.
            if population <= 0 or area <= 0 or int(gdp) <= 0:
                continue
            votes = float(counts[name])
            fields = [
                name.title(),
                str(counts[name]),
                locale.format_string('%d', population, grouping=True),   # pop
                str(round(votes * 10000 / population, 5)),               # per 10K pop
                locale.format_string('%d', area, grouping=True),         # area
                str(round(votes * 1000 / area, 5)),                      # per 1K area units
                # round() so float error (e.g. 22269.999...) does not drop a unit
                locale.format_string('%d', int(round(gdp)), grouping=True),  # gdp, millions
                str(round(votes * 1000 / gdp, 5)),                       # per billion gdp
            ]
            out.write(':'.join(fields) + '\n')
    return True
if __name__ == "__main__":
    # Run both scrapers, then join their output files by country name.
    # Each scraper returns early when its cache file already exists.
    get_nasa_entries()
    get_country_details()
    join_country_data()
    # sys.exit instead of the bare exit(): the latter is injected by the
    # site module for interactive use and is absent under `python -S`.
    sys.exit(0)