# shivamy/nasa_orion_journey_names.py

## nasa_orion_journey_names.py
#!/Users/shiva/anaconda/bin/python
# -*- coding: utf-8 -*-

import os
import sys
import re
import locale
import pprint
import scraperwiki
from bs4 import BeautifulSoup
from collections import defaultdict


class NasaData():
    '''
        Namespace for the scrape source URLs and the on-disk file locations.
        Every value is a class attribute; the functions in this file read
        them as NasaData.<name> without creating an instance.
    '''

    # Pages that get scraped
    nasa_site = "http://mars.nasa.gov/participate/send-your-name/orion-first-flight/world-participation-map/"
    ctry_site = "http://countrycode.org/"

    # Files written by the two scrapers and by the join step
    nasa_file_path = "/tmp/nasa_orion_reg_by_country.txt"
    ctry_file_path = "/tmp/countrycode_org_data.txt"
    metrics_file_path = "/tmp/nasa_metrics_by_country.txt"

    def __init__(self):
        '''No per-instance state is set up.'''


def get_nasa_entries():
    '''
        Scrape the NASA Orion participant count per country and write it to
        NasaData.nasa_file_path, one "<country name>,<count>" line per country.

        If the output file already exists and is larger than 10 bytes it is
        treated as a cache: a warning is printed and the page is not fetched.

        Args: None
        Returns: True when the file was written, False when the existing
                 file was kept.
    '''

    out_file = NasaData.nasa_file_path

    # Look at the cache before touching the network: downloading and parsing
    # the page is pointless when the result is going to be discarded.
    if os.path.exists( out_file ) and os.path.getsize( out_file ) > 10:
        print( "Warning: " + out_file + " exists. Continuing without scraping NASA data.\n" )
        return False

    html = scraperwiki.scrape( NasaData.nasa_site )
    soup = BeautifulSoup( html )

    countries = soup.find( 'ul', class_='countryList' )
    with open( out_file, 'wt' ) as fh:
        for country in countries.findAll('li'):
            c_name = country.find('div', class_='countryName').text
            c_num  = country.find('div', class_='countNumber').text.strip()
            fh.write( ''.join([c_name, ',', c_num, '\n']) )

    return True


# HTML table id: main_table_blue with header
# Typical row: [ <td align="left"> <a href="country_detail.cfm?countryID=1">Afghanistan</a> </td>,
#                <td align="center">AF / AFG </td>,
#                <td align="center">93 </td>,
#                <td align="center">28,396,000 <span class="rank">(43) </span></td>,
#                <td align="center">652,230 <span class="rank">(42) </span></td>,
#                <td align="center"> 22.27 Billion <span class="rank">(114) </span></td>  ]
def get_country_details():
    '''
        Scrape the country table (id "main_table_blue") from
        NasaData.ctry_site and dump it to NasaData.ctry_file_path.

        One line is written per table row, every cell followed by "|" and
        the line itself starting with "|", e.g.
            |Afghanistan|AF / AFG|93|28,396,000|652,230|22.27 Billion|
        Header (<th>) rows are written the same way.

        If the output file already exists and is larger than 10 bytes it is
        treated as a cache: a warning is printed and the page is not fetched.

        Args: None
        Returns: True when the file was written, False when the existing
                 file was kept.
    '''

    out_file = NasaData.ctry_file_path

    # Look at the cache before touching the network: downloading and parsing
    # the page is pointless when the result is going to be discarded.
    if os.path.exists( out_file ) and os.path.getsize( out_file ) > 10:
        print( "Warning: " + out_file + " exists. Continuing without scraping COUNTRY_CODE data.\n" )
        return False

    html = scraperwiki.scrape( NasaData.ctry_site )
    soup = BeautifulSoup( html )

    cnty_table = soup.find( 'table', id='main_table_blue' )
    rows = cnty_table.findAll( 'tr' )
    with open( out_file, 'wt' ) as fh:
        for row in rows:
            headers = row.findAll( 'th' )
            if headers:
                cells = [ th.contents[0] for th in headers ]
            else:
                cells = []
                for ix, td in enumerate( row.findAll( 'td' ) ):
                    if ix == 0:
                        # Country name is the text of the link in the first cell
                        cells.append( td.findAll( 'a' )[0].string )
                    else:
                        # Only the leading text node is kept; later children
                        # of the cell (such as a rank <span>) are ignored
                        cells.append( td.contents[0].strip() )

            fh.write( '|' + ''.join( cell + '|' for cell in cells ) + "\n" )

    return True


def _normalize_country_name(name):
    '''Return the join key for a country name: trimmed, lowercased, leading "the " removed, runs of whitespace collapsed.'''
    name = name.strip().lower()
    name = re.sub(r'^the\s+', '', name)
    return re.sub(r'\s{2,}', ' ', name)


def join_country_data():
    '''
        Join the NASA counts (NasaData.nasa_file_path) with the country
        metrics (NasaData.ctry_file_path) by normalized country name and
        write the result to NasaData.metrics_file_path.

        Each output line holds, separated by ":":
            name, orion count,
            population,     count per 10K population,
            area,           count per 1K area units,
            gdp (millions), count per billion of gdp
        Only countries present in both inputs with population, area and
        gdp all greater than zero are written.

        Args: None
        Returns: True
    '''
    # key: normalized country name => [orion_votes, pop., area, gdp]
    nasa_data = defaultdict(list)

    with open( NasaData.nasa_file_path, 'rt' ) as nasa_fh:
        for line in nasa_fh:
            ln_els = line.strip('\n').split(',')
            if len(ln_els) < 2:
                continue                                    # no "name,count" pair on this line
            nasa_data[ _normalize_country_name(ln_els[0]) ].append( ln_els[1] )   # orion_vote appended

    with open( NasaData.ctry_file_path, 'rt' ) as ctry_fh:
        for l_num, line in enumerate( ctry_fh ):
            # line: |Afghanistan|AF / AFG|93|28,396,000|652,230|22.27 Billion|
            if l_num == 0:
                continue                                    # Skip header

            ln_els = line.strip('\n').split('|')
            if len(ln_els) < 7:
                continue                                    # row without the full set of cells

            gdp_match = re.match( r'(\d+\.?\d*)', ln_els[6] )
            if gdp_match is None:
                continue                                    # no numeric gdp; row could never pass the gdp > 0 filter

            key = _normalize_country_name( ln_els[1] )

            # Strip out comma in pop (element 4) and area (5)
            nasa_data[key].append( ln_els[4].replace(',', '') )   # pop appended
            nasa_data[key].append( ln_els[5].replace(',', '') )   # area appended

            # Normalize gdp to millions
            gdp = float( gdp_match.group(0) )
            if re.search( r'(Billion)', ln_els[6], re.I ):
                gdp = gdp * 1000
            elif re.search( r'(Trillion)', ln_els[6], re.I ):
                gdp = gdp * 1000000
            nasa_data[key].append( gdp )                          # gdp appended

    # TODO: Some country names are not standard in NASA data. Example French Guiana is either Guiana or Guyana
    # Delete what is not found in country code data or match countries with hard coded values

    locale.setlocale(locale.LC_ALL, '')
    with open( NasaData.metrics_file_path, 'wt' ) as fh:
        for cn in sorted(nasa_data):  # country name
            vals = nasa_data[cn]
            # Need votes, pop., area and gdp, and every divisor (pop, gdp, area) must be > 0
            if len(vals) <= 3 or int(vals[1]) <= 0 or int(vals[3]) <= 0 or int(vals[2]) <= 0:
                continue

            votes = float( vals[0] )
            pop   = int( vals[1] )
            area  = int( vals[2] )
            gdp   = vals[3]

            fields = [
                cn.title(),                                             # Capitalize name
                vals[0],
                locale.format_string( '%d', pop, grouping=True ),       # pop
                str( round( votes * 10000 / pop, 5 ) ),                 # per 10K pop
                locale.format_string( '%d', area, grouping=True ),      # area
                str( round( votes * 1000 / area, 5 ) ),                 # per 1K area units
                locale.format_string( '%d', int(gdp), grouping=True ),  # gdp
                str( round( votes * 1000 / gdp, 5 ) ),                  # per Billion gdp
            ]
            fh.write( ':'.join(fields) + "\n" )

    return True


# Script entry point: run both scrapers, then join their output files.
if __name__ == "__main__":
    get_nasa_entries()
    get_country_details()
    join_country_data()
    # sys.exit rather than the bare exit() helper, which is only injected
    # by the site module and is absent when Python runs with -S.
    sys.exit( 0 )
	#!/Users/shiva/anaconda/bin/python
	# -- coding: utf-8 --

	import os
	import sys
	import re
	import locale
	import pprint
	import scraperwiki
	from bs4 import BeautifulSoup
	from collections import defaultdict


	class NasaData():
	    '''
	        Namespace for the scrape source URLs and the on-disk file locations.
	        Every value is a class attribute; the functions in this file read
	        them as NasaData.<name> without creating an instance.
	    '''

	    # Pages that get scraped
	    nasa_site = "http://mars.nasa.gov/participate/send-your-name/orion-first-flight/world-participation-map/"
	    ctry_site = "http://countrycode.org/"

	    # Files written by the two scrapers and by the join step
	    nasa_file_path = "/tmp/nasa_orion_reg_by_country.txt"
	    ctry_file_path = "/tmp/countrycode_org_data.txt"
	    metrics_file_path = "/tmp/nasa_metrics_by_country.txt"

	    def __init__(self):
	        '''No per-instance state is set up.'''


	def get_nasa_entries():
	    '''
	        Scrape the NASA Orion participant count per country and write it to
	        NasaData.nasa_file_path, one "<country name>,<count>" line per country.

	        If the output file already exists and is larger than 10 bytes it is
	        treated as a cache: a warning is printed and the page is not fetched.

	        Args: None
	        Returns: True when the file was written, False when the existing
	                 file was kept.
	    '''

	    out_file = NasaData.nasa_file_path

	    # Look at the cache before touching the network: downloading and parsing
	    # the page is pointless when the result is going to be discarded.
	    if os.path.exists( out_file ) and os.path.getsize( out_file ) > 10:
	        print( "Warning: " + out_file + " exists. Continuing without scraping NASA data.\n" )
	        return False

	    html = scraperwiki.scrape( NasaData.nasa_site )
	    soup = BeautifulSoup( html )

	    countries = soup.find( 'ul', class_='countryList' )
	    with open( out_file, 'wt' ) as fh:
	        for country in countries.findAll('li'):
	            c_name = country.find('div', class_='countryName').text
	            c_num  = country.find('div', class_='countNumber').text.strip()
	            fh.write( ''.join([c_name, ',', c_num, '\n']) )

	    return True


	# HTML table id: main_table_blue with header
	# Typical row: [ <td align="left"> <a href="country_detail.cfm?countryID=1">Afghanistan</a> </td>,
	# <td align="center">AF / AFG </td>,
	# <td align="center">93 </td>,
	# <td align="center">28,396,000 <span class="rank">(43) </span></td>,
	# <td align="center">652,230 <span class="rank">(42) </span></td>,
	# <td align="center"> 22.27 Billion <span class="rank">(114) </span></td> ]
	def get_country_details():
	    '''
	        Scrape the country table (id "main_table_blue") from
	        NasaData.ctry_site and dump it to NasaData.ctry_file_path.

	        One line is written per table row, every cell followed by "|" and
	        the line itself starting with "|", e.g.
	            |Afghanistan|AF / AFG|93|28,396,000|652,230|22.27 Billion|
	        Header (<th>) rows are written the same way.

	        If the output file already exists and is larger than 10 bytes it is
	        treated as a cache: a warning is printed and the page is not fetched.

	        Args: None
	        Returns: True when the file was written, False when the existing
	                 file was kept.
	    '''

	    out_file = NasaData.ctry_file_path

	    # Look at the cache before touching the network: downloading and parsing
	    # the page is pointless when the result is going to be discarded.
	    if os.path.exists( out_file ) and os.path.getsize( out_file ) > 10:
	        print( "Warning: " + out_file + " exists. Continuing without scraping COUNTRY_CODE data.\n" )
	        return False

	    html = scraperwiki.scrape( NasaData.ctry_site )
	    soup = BeautifulSoup( html )

	    cnty_table = soup.find( 'table', id='main_table_blue' )
	    rows = cnty_table.findAll( 'tr' )
	    with open( out_file, 'wt' ) as fh:
	        for row in rows:
	            headers = row.findAll( 'th' )
	            if headers:
	                cells = [ th.contents[0] for th in headers ]
	            else:
	                cells = []
	                for ix, td in enumerate( row.findAll( 'td' ) ):
	                    if ix == 0:
	                        # Country name is the text of the link in the first cell
	                        cells.append( td.findAll( 'a' )[0].string )
	                    else:
	                        # Only the leading text node is kept; later children
	                        # of the cell (such as a rank <span>) are ignored
	                        cells.append( td.contents[0].strip() )

	            # Plain "|" delimiter: the "\|" form in the pasted text was a
	            # markdown escape, not part of the file format.
	            fh.write( '|' + ''.join( cell + '|' for cell in cells ) + "\n" )

	    return True


	def _normalize_country_name(name):
	    '''Return the join key for a country name: trimmed, lowercased, leading "the " removed, runs of whitespace collapsed.'''
	    name = name.strip().lower()
	    name = re.sub(r'^the\s+', '', name)
	    return re.sub(r'\s{2,}', ' ', name)


	def join_country_data():
	    '''
	        Join the NASA counts (NasaData.nasa_file_path) with the country
	        metrics (NasaData.ctry_file_path) by normalized country name and
	        write the result to NasaData.metrics_file_path.

	        Each output line holds, separated by ":":
	            name, orion count,
	            population,     count per 10K population,
	            area,           count per 1K area units,
	            gdp (millions), count per billion of gdp
	        Only countries present in both inputs with population, area and
	        gdp all greater than zero are written.

	        Args: None
	        Returns: True
	    '''
	    # key: normalized country name => [orion_votes, pop., area, gdp]
	    nasa_data = defaultdict(list)

	    with open( NasaData.nasa_file_path, 'rt' ) as nasa_fh:
	        for line in nasa_fh:
	            ln_els = line.strip('\n').split(',')
	            if len(ln_els) < 2:
	                continue                                    # no "name,count" pair on this line
	            nasa_data[ _normalize_country_name(ln_els[0]) ].append( ln_els[1] )   # orion_vote appended

	    with open( NasaData.ctry_file_path, 'rt' ) as ctry_fh:
	        for l_num, line in enumerate( ctry_fh ):
	            # line: |Afghanistan|AF / AFG|93|28,396,000|652,230|22.27 Billion|
	            if l_num == 0:
	                continue                                    # Skip header

	            # Plain "|" delimiter: the "\|" form in the pasted text was a
	            # markdown escape, not part of the file format.
	            ln_els = line.strip('\n').split('|')
	            if len(ln_els) < 7:
	                continue                                    # row without the full set of cells

	            gdp_match = re.match( r'(\d+\.?\d*)', ln_els[6] )
	            if gdp_match is None:
	                continue                                    # no numeric gdp; row could never pass the gdp > 0 filter

	            key = _normalize_country_name( ln_els[1] )

	            # Strip out comma in pop (element 4) and area (5)
	            nasa_data[key].append( ln_els[4].replace(',', '') )   # pop appended
	            nasa_data[key].append( ln_els[5].replace(',', '') )   # area appended

	            # Normalize gdp to millions
	            gdp = float( gdp_match.group(0) )
	            if re.search( r'(Billion)', ln_els[6], re.I ):
	                gdp = gdp * 1000
	            elif re.search( r'(Trillion)', ln_els[6], re.I ):
	                gdp = gdp * 1000000
	            nasa_data[key].append( gdp )                          # gdp appended

	    # TODO: Some country names are not standard in NASA data. Example French Guiana is either Guiana or Guyana
	    # Delete what is not found in country code data or match countries with hard coded values

	    locale.setlocale(locale.LC_ALL, '')
	    with open( NasaData.metrics_file_path, 'wt' ) as fh:
	        for cn in sorted(nasa_data):  # country name
	            vals = nasa_data[cn]
	            # Need votes, pop., area and gdp, and every divisor (pop, gdp, area) must be > 0
	            if len(vals) <= 3 or int(vals[1]) <= 0 or int(vals[3]) <= 0 or int(vals[2]) <= 0:
	                continue

	            votes = float( vals[0] )
	            pop   = int( vals[1] )
	            area  = int( vals[2] )
	            gdp   = vals[3]

	            fields = [
	                cn.title(),                                             # Capitalize name
	                vals[0],
	                locale.format_string( '%d', pop, grouping=True ),       # pop
	                str( round( votes * 10000 / pop, 5 ) ),                 # per 10K pop
	                locale.format_string( '%d', area, grouping=True ),      # area
	                str( round( votes * 1000 / area, 5 ) ),                 # per 1K area units
	                locale.format_string( '%d', int(gdp), grouping=True ),  # gdp
	                str( round( votes * 1000 / gdp, 5 ) ),                  # per Billion gdp
	            ]
	            fh.write( ':'.join(fields) + "\n" )

	    return True



	# Script entry point: run both scrapers, then join their output files.
	if __name__ == "__main__":
	    get_nasa_entries()
	    get_country_details()
	    join_country_data()
	    # sys.exit rather than the bare exit() helper, which is only injected
	    # by the site module and is absent when Python runs with -S.
	    sys.exit( 0 )