Skip to content

Instantly share code, notes, and snippets.

@odedlaz
Last active December 1, 2015 18:03
Show Gist options
  • Save odedlaz/684e02aef699667b4545 to your computer and use it in GitHub Desktop.
Save odedlaz/684e02aef699667b4545 to your computer and use it in GitHub Desktop.
download all mathematicians from genealogy.math.ndsu.nodak.edu to a csv file
import csv
from collections import defaultdict

import requests
from pyquery import PyQuery
# Scrape every person page from the Mathematics Genealogy site and write one
# (person, pupil) row per pupil to a CSV file.

# URL template for a person's page; %d is replaced with the numeric id.
url_format = "http://genealogy.math.ndsu.nodak.edu/id.php?id=%d"
# Text the site returns for an id that has no record.
# NOTE: the loop below stops at the FIRST missing id, so it assumes ids are
# contiguous (i.e. not "3 exists, 4 is missing, 5 exists"). If the ids turn
# out to have gaps, the stop condition needs to be reworked.
missing_response = "You have specified an ID that does not exist in the database"
# Seconds to wait on the server before requests raises a timeout error;
# without a timeout a stalled connection would hang the scrape forever.
request_timeout = 30
# start with the first id
person_id = 1

# CSV is a format Excel knows how to open. Rows are written while scraping,
# not afterwards, to save memory. Change the path to whatever you like.
# The with-block closes (and flushes) the file even if a request or a parse
# raises part-way through.
with open('/tmp/names.csv', 'w') as csvfile:
    print("writing all data to: %s" % csvfile.name)
    # write the column headers
    writer = csv.DictWriter(csvfile, fieldnames=['person', 'pupil'])
    writer.writeheader()
    # for a quick trial run, change this to: while person_id < 10
    while True:
        print("trying to fetch details for id: %d" % person_id)
        # download the page with requests
        response = requests.get(url_format % person_id, timeout=request_timeout)
        # Unfortunately the site answers 200 instead of 404 for a missing
        # person, so the body text is the only way to detect the end.
        if missing_response in response.text:
            print("we're done! id %d doesn't exist" % person_id)
            break
        # use PyQuery to select the elements we want from the page
        q = PyQuery(response.text)
        # The person's name is the page's h2 header (found by inspecting the
        # page's HTML source).
        person_name = q("h2").text().strip()
        print("found a new person! his name is: %s" % person_name)
        # tr is a table row, td is a cell: take the text of the first cell of
        # every row. Built as a real list (not a lazy map) so the print below
        # shows the names themselves.
        pupils = [
            PyQuery(cell).text().strip()
            for cell in q("table tr td:first-child")
        ]
        print("person_name: %s | pupils: %s" % (person_name, pupils))
        # No pupils -> the loop body never runs and nothing is written.
        for pupil in pupils:
            writer.writerow({'person': person_name, 'pupil': pupil})
        # Increment the id by hand; a for-loop isn't usable because the last
        # id is not known in advance.
        person_id += 1
        # Flush buffered rows to disk once every 10 people, so a crash loses
        # at most the last 10 people's rows.
        if person_id % 10 == 0:
            csvfile.flush()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment