odedlaz/mathmatician.py

## mathmatician.py
import csv
from collections import defaultdict

import requests
from pyquery import PyQuery

# URL template for one person's page; %d is replaced with the numeric id.
url_format = "http://genealogy.math.ndsu.nodak.edu/id.php?id=%d"

# Text the site puts in the page when the requested id does not exist.
# NOTE: the loop below stops at the FIRST missing id, so it assumes the ids
# are contiguous (no gaps such as: id 3 exists, id 4 doesn't, id 5 does).
# If the site turns out to have gaps, the stop condition must be reworked.
missing_response = "You have specified an ID that does not exist in the database"

# start with the first id
person_id = 1

# Rows are written while scraping (not collected and written at the end) to
# keep memory usage flat. Change this path to whatever you like.
# - `with` guarantees the file is flushed and closed even if a request or
#   the HTML parsing raises half-way through the run.
# - newline='' is what the csv module asks for on files it writes to.
# - utf-8 is explicit because names frequently contain non-ASCII characters.
with open('/tmp/names.csv', 'w', newline='', encoding='utf-8') as csvfile:
    print("writing all data to: %s" % csvfile.name)

    # write the column headers
    writer = csv.DictWriter(csvfile, fieldnames=['person', 'pupil'])
    writer.writeheader()

    # change this to `while person_id < 10` for a quick trial run
    while True:
        print("trying to fetch details for id: %d" % person_id)

        # Download the page. The timeout keeps a stalled connection from
        # hanging the scrape forever; raise_for_status() stops the run on an
        # HTTP error instead of recording an error page as a person.
        response = requests.get(url_format % person_id, timeout=30)
        response.raise_for_status()

        # The site answers 200 (not 404) for a missing person, so the only
        # way to detect the end is to look for its "missing id" message.
        if missing_response in response.text:
            print("we're done! id %d doesn't exist" % person_id)
            break

        # use PyQuery to select the elements we want from the page
        q = PyQuery(response.text)

        # the person's name is the text of the <h2> header
        person_name = q("h2").text().strip()
        print("found a new person! his name is: %s" % person_name)

        # tr is a row, td is a cell: take the text of the first cell of every
        # table row. Built as a real list (not a lazy map) so the debug print
        # below shows the names rather than "<map object ...>".
        pupils = [
            PyQuery(cell).text().strip()
            for cell in q("table tr td:first-child")
        ]
        print("person_name: %s | pupils: %s" % (person_name, pupils))

        # one row per pupil; a person without pupils writes nothing
        for pupil in pupils:
            writer.writerow({'person': person_name, 'pupil': pupil})

        # flush to disk after every 10th person, so a crash loses the rows
        # of at most 10 people
        if person_id % 10 == 0:
            csvfile.flush()

        # move on to the next id; a plain for-loop can't be used because the
        # last id is not known in advance
        person_id += 1
	import csv
	from collections import defaultdict

	import requests
	from pyquery import PyQuery

	# URL template for one person's page; %d is replaced with the numeric id.
	url_format = "http://genealogy.math.ndsu.nodak.edu/id.php?id=%d"

	# Text the site puts in the page when the requested id does not exist.
	# NOTE: the loop below stops at the FIRST missing id, so it assumes the ids
	# are contiguous (no gaps such as: id 3 exists, id 4 doesn't, id 5 does).
	# If the site turns out to have gaps, the stop condition must be reworked.
	missing_response = "You have specified an ID that does not exist in the database"

	# start with the first id
	person_id = 1

	# Rows are written while scraping (not collected and written at the end) to
	# keep memory usage flat. Change this path to whatever you like.
	# - `with` guarantees the file is flushed and closed even if a request or
	#   the HTML parsing raises half-way through the run.
	# - newline='' is what the csv module asks for on files it writes to.
	# - utf-8 is explicit because names frequently contain non-ASCII characters.
	with open('/tmp/names.csv', 'w', newline='', encoding='utf-8') as csvfile:
	    print("writing all data to: %s" % csvfile.name)

	    # write the column headers
	    writer = csv.DictWriter(csvfile, fieldnames=['person', 'pupil'])
	    writer.writeheader()

	    # change this to `while person_id < 10` for a quick trial run
	    while True:
	        print("trying to fetch details for id: %d" % person_id)

	        # Download the page. The timeout keeps a stalled connection from
	        # hanging the scrape forever; raise_for_status() stops the run on an
	        # HTTP error instead of recording an error page as a person.
	        response = requests.get(url_format % person_id, timeout=30)
	        response.raise_for_status()

	        # The site answers 200 (not 404) for a missing person, so the only
	        # way to detect the end is to look for its "missing id" message.
	        if missing_response in response.text:
	            print("we're done! id %d doesn't exist" % person_id)
	            break

	        # use PyQuery to select the elements we want from the page
	        q = PyQuery(response.text)

	        # the person's name is the text of the <h2> header
	        person_name = q("h2").text().strip()
	        print("found a new person! his name is: %s" % person_name)

	        # tr is a row, td is a cell: take the text of the first cell of every
	        # table row. Built as a real list (not a lazy map) so the debug print
	        # below shows the names rather than "<map object ...>".
	        pupils = [
	            PyQuery(cell).text().strip()
	            for cell in q("table tr td:first-child")
	        ]
	        print("person_name: %s | pupils: %s" % (person_name, pupils))

	        # one row per pupil; a person without pupils writes nothing
	        for pupil in pupils:
	            writer.writerow({'person': person_name, 'pupil': pupil})

	        # flush to disk after every 10th person, so a crash loses the rows
	        # of at most 10 people
	        if person_id % 10 == 0:
	            csvfile.flush()

	        # move on to the next id; a plain for-loop can't be used because the
	        # last id is not known in advance
	        person_id += 1