Skip to content

Instantly share code, notes, and snippets.

@jcsalterego
Forked from jjjjeeffff/gist:170850
Created August 20, 2009 05:04
Show Gist options
  • Save jcsalterego/170851 to your computer and use it in GitHub Desktop.
from pprint import pprint
import re
import sys
import urllib2
# Page that main() downloads when no local cache file is available.
URL = "http://www.modifiedcartrader.com/information/newestCars.aspx"

# Maps a (depth, raw field name) pair to the field name used in the output
# records.  info_transform() looks up the pair built from its ``depth``
# argument and the trailing component of an element id; pairs that are not
# listed here keep their raw field name.
TRANSLATIONS = {
    (1, "Label1"): "mileage",
    (1, "Label2"): "hp",
    (1, "Panel1"): "make",
    (2, "Label1"): "hp",
    (2, "Label2"): "hpx",
    (2, "Panel1"): "transmission",
}
def info_transform(item, depth=1):
    """Normalize one scraped ``(element_id, text)`` pair into a record.

    Every "ctl" is removed from the element id, the id is split on "_", and
    its last two components are taken as the row number and the field name.
    The field name is replaced by ``TRANSLATIONS[(depth, field)]`` when that
    key exists.  Runs of spaces in the text are collapsed to single spaces;
    if the text consists only of digits once commas are removed, it is
    converted to ``int``.

    Args:
        item: Sequence whose first element is the element id string and
            whose second element is the element's text.
        depth: First component of the ``TRANSLATIONS`` lookup key.

    Returns:
        ``[row, field, value]``, or ``None`` when the text is empty after
        collapsing spaces or when the id does not end in
        ``<number>_<field>``.
    """
    parts = item[0].replace("ctl", "").split("_")[-2:]
    if len(parts) != 2:
        # Id has no "_" separator, so there is no row/field pair to read.
        return None
    row_text, field = parts
    try:
        row = int(row_text)
    except ValueError:
        # Not a numbered row: skip it the same way an empty cell is skipped
        # instead of aborting the whole scrape.
        return None

    field = TRANSLATIONS.get((depth, field), field)

    # Split on a literal space (not arbitrary whitespace) and drop the empty
    # pieces produced by consecutive spaces.
    value = " ".join(word for word in item[1].split(" ") if word)
    if not value:
        return None

    # Values such as "12,345" become the integer 12345.
    digits = value.replace(",", "")
    if digits.isdigit():
        value = int(digits)
    return [row, field, value]
def _collect_info(doc, pattern, ids, depth):
    """Return the records for every ``pattern`` match whose id is in ``ids``.

    ``pattern`` must capture the element id and the element text, in that
    order.  Both captures are stripped and handed to ``info_transform`` with
    the given ``depth``; pairs for which it returns ``None`` are dropped.
    """
    records = []
    for tag in re.compile(pattern).findall(doc):
        if tag[0] not in ids:
            continue
        # Transform once and reuse the result for both filtering and output.
        record = info_transform([el.strip() for el in tag], depth=depth)
        if record is not None:
            records.append(record)
    return records


def main(argv):
    """Scrape the listing page and pretty-print the extracted records.

    The page is read from ``cache.html`` in the current directory.  If that
    file cannot be read, the page is downloaded from ``URL`` and written to
    ``cache.html``.  Two extraction passes are then run over the elements
    whose id contains ``GridView1_ctl`` and each resulting list is printed
    with ``pprint``.

    Args:
        argv: Command-line arguments; currently unused.

    Returns:
        0, intended as the process exit status.
    """
    try:
        with open('cache.html') as cache:
            doc = cache.read()
    except (IOError, OSError):
        # No readable cache: fetch the page and store it for the next run.
        response = urllib2.urlopen(URL)
        try:
            doc = response.read()
        finally:
            response.close()
        with open('cache.html', 'w') as cache:
            cache.write(doc)

    # Drop line breaks so the regexes below can match across source lines.
    doc = doc.replace("\n", "").replace("\r", "")

    # A set gives constant-time membership tests in _collect_info.
    ids = set(tag for tag in re.compile(r'id="(.+?)"').findall(doc)
              if 'GridView1_ctl' in tag)

    # First pass: text directly inside the element carrying the id.
    pprint(_collect_info(doc, r'id="(.+?)".*?>([^<]+)</', ids, depth=1))

    # Second pass: text inside a tag that immediately follows the tag
    # carrying the id.
    pprint(_collect_info(doc, r'id="(.+?)".*?><.*?>([^<]+)</', ids, depth=2))
    return 0
if __name__ == '__main__':
    # Run the scraper and use main()'s return value as the exit status.
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment