Created
March 9, 2011 20:36
-
-
Save jeremyfromearth/862954 to your computer and use it in GitHub Desktop.
Uses BeautifulSoup to scrape planetary data from three html tables of solarviews.com and converts it to a single xml file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import urllib | |
from xml.etree.ElementTree import Element, ElementTree, SubElement, dump | |
from BeautifulSoup import BeautifulSoup | |
import re | |
import sys | |
# Root element of the XML document that the script builds up and saves.
root = Element('data')

# Container under the root; every scraped body is added beneath it.
orbitals = SubElement(root, 'orbitals')

# Translation tables from the terse or obscure column headers used on the
# scraped pages to readable names. One table per page: h_map goes with
# data.htm, o_map with data1.htm and p_map with data2.htm (see main()).
h_map = {
    '#': 'index',
}

o_map = {
    'O. Per.': 'orbital_period',
    'Incl.': 'orbital_inclination',
    'Tilt': 'tilt_of_axis',
    'Eccen.': 'orbital_eccentricity',
    'R. Period': 'siderial_rotational',
    'M.O.V.': 'mean_orbital_velocity',
}

p_map = {
    'Vo': 'visible_magnitude_at_opposition',
    'Distance': 'distance_from_orbited_body',
    'Escape': 'escape_velocity',
    'Albedo': 'visual_geometric_albedo',
}
def main():
    """Scrape the three solarviews.com data pages, then write the XML file.

    Every page goes through the same parse_page() routine; only the URL,
    the index of the table to read and the header translation map differ.
    """
    pages = (
        ('http://www.solarviews.com/eng/data.htm', 0, h_map),
        ('http://www.solarviews.com/eng/data1.htm', 2, o_map),
        ('http://www.solarviews.com/eng/data2.htm', 2, p_map),
    )
    for page_url, which_table, header_names in pages:
        parse_page(page_url, which_table, header_names)
    save()
def parse_page(url, table_index, field_name_map=None):
    """Scrape one HTML table and merge its values into the orbitals XML tree.

    The page is downloaded with ``urllib.urlopen`` (the Python 2 API this
    file is written against) and parsed with BeautifulSoup. The ``<th>``
    cells of the table's first row are the column headers. In each later
    row the first ``<th>`` holds the body name and the ``<td>`` cells hold
    the values for the remaining columns.

    For every value a child element is appended to the body's node (see
    get_orbital_node_by_name). The tag is the lower-cased column name, the
    text is the lower-cased cell text, and the ``units`` attribute holds
    the text of a trailing ``(...)`` group in the header, or ``''`` when
    the header has none.

    Args:
        url: Address of the page to download.
        table_index: Index of the wanted table among all ``<table>``
            elements of the page (some pages have more tables than others).
        field_name_map: Optional dictionary translating column names, with
            the unit suffix removed, to more readable ones. Names missing
            from the dictionary are used unchanged.

    Raises:
        IndexError: If the page has no table at ``table_index``.
    """
    response = urllib.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()

    tables = BeautifulSoup(html).findAll('table')
    rows = tables[table_index].findAll('tr')
    if not rows:
        return

    columns = rows[0].findAll('th')
    # A parenthesised group at the very end of a header is its unit.
    unit_suffix = re.compile(r'[(].*[)]\Z')

    for row in rows[1:]:
        if len(row) <= 1:
            continue
        name_cells = row.findAll('th')
        if not name_cells:
            # Without a <th> there is no body name to file the values under.
            continue
        node = get_orbital_node_by_name(name_cells[0].text.lower())

        # The first header labels the name column, so values pair up with
        # the headers that follow it. zip() stops at the shorter sequence,
        # so a row with missing cells yields only the values it has.
        for column, cell in zip(columns[1:], row.findAll('td')):
            field_name = column.text
            unit = ''
            match = unit_suffix.search(field_name)
            if match:
                unit = match.group(0).replace('(', '').replace(')', '')
                field_name = field_name[:match.start()]
            # Whitespace left around the name would miss the translation
            # map and could never be part of a valid XML tag.
            field_name = field_name.strip()
            if field_name_map:
                field_name = field_name_map.get(field_name, field_name)

            element = SubElement(node, field_name.lower())
            element.text = cell.text.lower()
            element.attrib['units'] = unit
def get_orbital_node_by_name(name):
    """Return the child of ``orbitals`` whose ``name`` attribute is *name*.

    When no child carries that name, a new ``orbital`` element is appended
    to ``orbitals``. It gets *name* both as its ``name`` attribute and as
    the text of a ``name`` sub-element, and is then returned.
    """
    # Compare against None explicitly: an Element's truth value depends on
    # whether it has children, not on whether it was found.
    existing = next(
        (child for child in orbitals if child.get('name') == name),
        None,
    )
    if existing is not None:
        return existing

    created = SubElement(orbitals, 'orbital', {'name': name})
    SubElement(created, 'name').text = name
    return created
def save():
    """Write the generated XML tree to ``orbitals.xml``.

    The file is created in the directory named by ``sys.path[0]`` and is
    encoded as UTF-8.
    """
    path = sys.path[0] + '/orbitals.xml'
    # ElementTree emits encoded bytes when given an encoding, so the file
    # is opened in binary mode. The with-block closes it even if
    # serialisation fails part-way.
    with open(path, 'wb') as f:
        ElementTree(root).write(f, 'UTF-8')
# Run the scrape only when this file is executed as a script, so that
# importing it does not trigger the downloads.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment