Skip to content

Instantly share code, notes, and snippets.

@jeremyfromearth
Created March 9, 2011 20:36
Show Gist options
  • Save jeremyfromearth/862954 to your computer and use it in GitHub Desktop.
Save jeremyfromearth/862954 to your computer and use it in GitHub Desktop.
Uses BeautifulSoup to scrape planetary data from three HTML tables on solarviews.com and converts it into a single XML file.
#!/usr/bin/env python
import urllib
from xml.etree.ElementTree import Element, ElementTree, SubElement, dump
from BeautifulSoup import BeautifulSoup
import re
import sys
# Root element of the XML document that is built up and finally saved.
root = Element('data')
# Container under the root that holds the per-body nodes.
orbitals = SubElement(root, 'orbitals')

# Translation tables: terse or obscure column headings on the source pages
# are mapped to more readable XML tag names. Headings not listed here are
# used as they appear on the page.
h_map = {
    '#': 'index',
}
o_map = {
    'O. Per.': 'orbital_period',
    'Incl.': 'orbital_inclination',
    'Tilt': 'tilt_of_axis',
    'Eccen.': 'orbital_eccentricity',
    # spelling kept as-is: this string is the tag name written to the XML
    'R. Period': 'siderial_rotational',
    'M.O.V.': 'mean_orbital_velocity',
}
p_map = {
    'Vo': 'visible_magnitude_at_opposition',
    'Distance': 'distance_from_orbited_body',
    'Escape': 'escape_velocity',
    'Albedo': 'visual_geometric_albedo',
}
# every page goes through the same parser; only the url, the table
# position and the heading translation map differ
def main():
    """Parse the three data pages in order, then write the XML file."""
    pages = (
        ('http://www.solarviews.com/eng/data.htm', 0, h_map),
        ('http://www.solarviews.com/eng/data1.htm', 2, o_map),
        ('http://www.solarviews.com/eng/data2.htm', 2, p_map),
    )
    for page_url, table_position, heading_map in pages:
        parse_page(page_url, table_position, heading_map)
    save()
# url - url to the page
# table_index - some pages have more tables than others
# field_name_map - a dictionary containing translated column names
def parse_page(url, table_index, field_name_map=None):
    """Scrape one HTML table and merge its cells into the orbitals XML tree.

    url            -- address of the page to download
    table_index    -- zero-based position of the wanted <table> on the page
    field_name_map -- optional dict translating column headings into tag
                      names; headings missing from it are used unchanged

    The <th> cells of the table's first row supply the column headings.
    In each later row the first <th> names the body and the <td> cells
    hold its values. A trailing "(...)" in a heading is removed from the
    tag name and stored, without the parentheses, in the element's
    'units' attribute. Names and values are lower-cased.

    Raises IndexError when the page has fewer tables than table_index
    implies, when a data row has no <th>, or when it has fewer <td>
    cells than there are value columns.
    """
    response = urllib.urlopen(url)
    try:
        html = response.read()
    finally:
        # release the connection even when the read fails
        response.close()

    table = BeautifulSoup(html).findAll('table')[table_index]
    rows = table.findAll('tr')
    # the first heading labels the name column, so value columns start after it
    headings = rows[0].findAll('th')[1:]
    # a parenthesised suffix at the very end of a heading, e.g. "(km)"
    unit_suffix = re.compile(r'[(].*[)]\Z')

    for row in rows[1:]:
        # len() of a BeautifulSoup tag counts its direct child nodes;
        # rows with at most one child carry no data cells
        if len(row) <= 1:
            continue
        name = row.findAll('th')[0].text.lower()
        cells = row.findAll('td')
        node = get_orbital_node_by_name(name)
        for position, heading in enumerate(headings):
            field_name = heading.text
            unit = ''
            value = cells[position].text.lower()
            match = unit_suffix.search(field_name)
            if match:
                suffix = match.group(0)
                unit = suffix.replace('(', '').replace(')', '')
                field_name = field_name.replace(suffix, '')
            if field_name_map:
                # untranslated headings fall through unchanged
                field_name = field_name_map.get(field_name, field_name)
            element = SubElement(node, field_name.lower())
            element.text = value
            element.attrib['units'] = unit
# finds the <orbital> node for a planet name in the xml tree,
# creating and attaching a fresh one when none exists yet
def get_orbital_node_by_name(name):
    """Return the child of `orbitals` whose 'name' attribute equals name.

    When there is no such child, a new <orbital name="..."> element is
    appended to `orbitals`, given a <name> sub-element holding the same
    text, and returned.
    """
    existing = next(
        (node for node in orbitals if node.get('name') == name), None)
    # compare against None explicitly: an Element without children is falsy
    if existing is not None:
        return existing
    created = SubElement(orbitals, 'orbital', {'name': name})
    label = SubElement(created, 'name')
    label.text = name
    return created
# write the generated xml to a file
def save():
    """Serialise the XML tree under `root` to orbitals.xml as UTF-8.

    The file is placed in sys.path[0], which for a script run from the
    command line is the directory containing the script. If sys.path[0]
    is empty the file goes to the current working directory.
    """
    import os
    # os.path.join avoids producing '/orbitals.xml' when sys.path[0] is ''
    path = os.path.join(sys.path[0], 'orbitals.xml')
    # binary mode: write() emits already-encoded UTF-8 bytes; the with
    # block closes the file even if serialisation raises
    with open(path, 'wb') as f:
        ElementTree(root).write(f, 'UTF-8')
# Run the scrape only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment