Skip to content

Instantly share code, notes, and snippets.

@rneiss
Created February 1, 2017 16:07
Show Gist options
  • Save rneiss/bce9f2a7d31cd4b5977100f954727720 to your computer and use it in GitHub Desktop.
USHMM St. Louis Manifest Scraper Script
import io
try:
    from urllib2 import urlopen  # Python 2
except ImportError:  # Python 3 moved urlopen into urllib.request
    from urllib.request import urlopen

from bs4 import BeautifulSoup
# Scrapes the USHMM St. Louis passenger detail pages one PassengerId at a
# time and appends one comma-separated line to the output file for every
# passenger whose status reads "perished".

DETAIL_URL = 'https://www.ushmm.org/online/st-louis/detail.php?PassengerId='
FIRST_PASSENGER_ID = 1
LAST_PASSENGER_ID = 937  # inclusive upper bound of the ids that are requested
OUTPUT_PATH = "stLouis.txt"

# The status row on a detail page is the one whose title cell holds only a
# non-breaking space instead of a textual label.
STATUS_TITLE = u'\xa0'
LOCATION_TITLE = u"Last Known Location:"


def reorder_name(raw_name):
    """Turn a "Last, First" heading into "First Last".

    The comma-separated parts are reversed and joined with single spaces;
    surrounding whitespace and empty parts are dropped. A name without a
    comma is returned stripped but otherwise unchanged.
    """
    parts = [part.strip() for part in raw_name.split(',')]
    return ' '.join(part for part in reversed(parts) if part)


def format_row(pid, name, status, last_location, img_url):
    """Build one output line: ``pid,name,status,location,image-url`` + newline.

    NOTE(review): values are joined verbatim, exactly as before; a value that
    itself contains a comma would shift the columns for that row.
    """
    fields = [u"%s" % pid, name, status, last_location, img_url]
    return u",".join(fields) + u"\n"


def _tag_text(tag):
    """Return all text inside *tag*, or an empty string when *tag* is None.

    ``get_text()`` is used rather than ``.string`` because ``.string`` is
    None for a tag with several children, which would break ``.strip()``.
    """
    if tag is None:
        return u""
    return tag.get_text()


def parse_passenger(html):
    """Extract the passenger fields from one detail page.

    Returns a dict with the keys ``name``, ``status``, ``last_location`` and
    ``img_url`` (missing values are empty strings), or None when the page has
    no ``<div class="main text">`` container at all.
    """
    soup = BeautifulSoup(html, "html5lib")
    content = soup.find("div", "main text")
    if content is None:
        return None

    record = {
        "name": reorder_name(_tag_text(content.find("h1"))),
        "status": u"",
        "last_location": u"",
        "img_url": u"",
    }

    img = content.find("img")
    if img is not None:
        # .get() tolerates an <img> without a src attribute.
        record["img_url"] = img.get("src", u"")

    for detail in content.find_all("div", "detail_view"):
        # The title is compared unstripped: stripping would erase the
        # non-breaking space that identifies the status row.
        title = _tag_text(detail.find("div", "detail_title"))
        value = _tag_text(detail.find("div", "detail_content")).strip()
        if title == STATUS_TITLE:
            record["status"] = value
        elif title == LOCATION_TITLE:
            record["last_location"] = value
    return record


def fetch_page(pid):
    """Download the raw detail page for passenger *pid* and close the response."""
    response = urlopen(DETAIL_URL + str(pid))
    try:
        return response.read()
    finally:
        response.close()


def scrape(first_id=FIRST_PASSENGER_ID, last_id=LAST_PASSENGER_ID,
           output_path=OUTPUT_PATH):
    """Fetch every passenger page in ``first_id..last_id`` (inclusive).

    Prints each processed id as progress output and appends a UTF-8 encoded
    line to *output_path* for every passenger whose status is "perished".
    Pages that cannot be downloaded or that contain no passenger record are
    reported on stdout and skipped, so one bad page does not abort the run.
    """
    # io.open behaves the same on Python 2 and 3 and does the UTF-8 encoding.
    with io.open(output_path, "a", encoding="utf-8") as out:
        for pid in range(first_id, last_id + 1):
            try:
                html = fetch_page(pid)
            except IOError as err:  # URLError/HTTPError derive from IOError
                print("%d: download failed (%s), skipped" % (pid, err))
                continue

            record = parse_passenger(html)
            if record is None:
                print("%d: no passenger record on page, skipped" % pid)
                continue

            print(str(pid))
            if record["status"] == "perished":
                out.write(format_row(pid, record["name"], record["status"],
                                     record["last_location"],
                                     record["img_url"]))


if __name__ == "__main__":
    scrape()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment