Skip to content

Instantly share code, notes, and snippets.

@tim-fan
Last active May 17, 2016 01:48
Show Gist options
  • Save tim-fan/d911f8a62dc16449fb19f09e416fe0a9 to your computer and use it in GitHub Desktop.
Save tim-fan/d911f8a62dc16449fb19f09e416fe0a9 to your computer and use it in GitHub Desktop.
Quick web-scraper for Te Papa online collections. Original request: "I wanna scrape these pages: 'http://collections.tepapa.govt.nz/Object/46209'. Object 0 to 10,000."
import json
import time

import requests
from bs4 import BeautifulSoup
def unlistIfSingle(givenList):
    """
    Collapse a one-element list to its sole element.

    Returns givenList[0] when givenList holds exactly one element; any other
    list (including an empty one) is returned as-is.
    """
    return givenList[0] if len(givenList) == 1 else givenList
def parseTable(table):
    """
    Parse the object page's specification table into a dictionary.

    Each <tr> row with at least two <td> cells contributes one entry: the
    first cell supplies the key and the stripped text fragments of the second
    cell supply the value. A value made of a single fragment is stored as a
    plain string instead of a one-element list (via unlistIfSingle).

    table -- BeautifulSoup tag for the table, or None if the page has none.
    Returns a dict; empty when table is None.
    """
    if table is None:
        return {}
    tableDict = {}
    for row in table('tr'):
        cells = row('td')
        if len(cells) < 2:
            # Rows without a key cell and a value cell carry no key/value
            # pair; indexing them would raise IndexError.
            continue
        key = cells[0].string
        if key is None:
            # .string is None when the cell has several child nodes; fall back
            # to its joined text so such rows don't all collide under None.
            key = ' '.join(cells[0].stripped_strings)
        tableDict[key] = unlistIfSingle(list(cells[1].stripped_strings))
    return tableDict
def parseRelatedInfo(relatedInfoDiv):
    """
    Parse the object page's related-info section into a dictionary.

    The section is expected to contain a 'webpart' div holding 'strong' tags
    (parameter names), each followed by one or more 'a' tags (that
    parameter's values). Values are collected per parameter; a parameter with
    exactly one value is stored as that value rather than a one-element list.

    relatedInfoDiv -- BeautifulSoup tag for the section, or None if absent.
    Returns a dict; empty when the section or its 'webpart' div is missing.
    """
    if relatedInfoDiv is None:
        return {}
    webpart = relatedInfoDiv.find('div', class_="webpart")
    if webpart is None:
        # No inner container to read from; calling find_all on None would
        # raise AttributeError.
        return {}

    relatedInfoDict = {}
    currentValues = None  # value list of the most recent 'strong' tag, if any
    for element in webpart.find_all(['strong', 'a']):
        if element.name == 'strong':
            # current element is a param name: start a fresh value list
            currentValues = []
            relatedInfoDict[element.string] = currentValues
        elif currentValues is not None:
            # current element is a param value
            currentValues.append(element.string)
        # else: an 'a' tag before any 'strong' tag has no parameter to attach
        # to, so it is skipped instead of raising UnboundLocalError.

    # final processing - don't return single element lists
    return {key: unlistIfSingle(val) for key, val in relatedInfoDict.items()}
def parseObjectPage(htmlText):
    """
    Extract object info from given html, returning dictionary of parsed info.

    Two parts of the page are read: the table with class 'specifications'
    and the div with class 'related-info'. Their parsed entries are merged
    into one dict; on a key clash the related-info value wins.

    htmlText -- the page's HTML source as a string.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between machines.
    soup = BeautifulSoup(htmlText, 'html.parser')

    specDict = parseTable(soup.find('table', class_='specifications'))
    relatedInfoDict = parseRelatedInfo(soup.find('div', class_='related-info'))

    # merge parsed results to single dict
    allInfoDict = specDict.copy()
    allInfoDict.update(relatedInfoDict)
    return allInfoDict
def getObjectInfo(objectIndex, timeout=30, retryDelay=1):
    r"""
    Get and parse data for object of given index.

    The object's page is downloaded and parsed with parseObjectPage, then the
    keys 'objectIndex' and 'objectUrl' are added to the result. A request
    that fails (connection error, timeout, ...) is retried until one
    succeeds. The HTTP status code is not checked, so whatever page the
    server returns is parsed.

    objectIndex -- numeric id appended to the collection's Object URL.
    timeout     -- seconds to wait for the server before a request counts as
                   failed (and is retried).
    retryDelay  -- seconds to pause between a failed request and the retry.

    E.g:
    >>> getObjectInfo(46209)
    {'Belonged to:': 'Seddon, Richard',
    'Classification': 'swords',
    'Credit line': 'Gift of Dame Elizabeth Knox Gilmer, 1955',
    'Dimensions': ['Overall: \r\n945mm (Length) \r\n x 110mm (Width) \r\n x 60mm (Depth)',
    'Overall: \r\n830mm (Length) \r\n x 25mm (Width) \r\n x 20mm (Depth)',
    'Overall: \r\n1010mm (Length) \r\n x 170mm (Width) \r\n x 15mm (Depth)',
    'Overall: \r\n950mm (Length) \r\n x 180mm (Width) \r\n x 35mm (Depth)',
    'Overall: \r\n990mm (Length) \r\n x 110mm (Width) \r\n x 60mm (Depth)'],
    'Made by:': 'Hill Brothers',
    'Made in:': 'England (United Kingdom)',
    'Made of:': 'steel',
    'Materials': 'steel',
    'Medium summary': 'Blade is steel, hilt is gilt brass.',
    'Part of:': 'History collection',
    'Production': 'Hill Brothers (manufacturer(s)), 1897, England',
    'Refers to:': 'Hill Brothers',
    'Registration number': 'PC000760',
    'Title': 'Ceremonial sword ("Court Sword").',
    'Type of:': 'swords',
    'objectIndex': 46209,
    'objectUrl': 'http://collections.tepapa.govt.nz/Object/46209'}
    """
    url = 'http://collections.tepapa.govt.nz/Object/' + str(objectIndex)
    while True:
        try:
            r = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException:
            # Only network-level failures are retried; a bare except would
            # also swallow KeyboardInterrupt and make the loop unstoppable.
            print('Request failed. Retrying...')
            # Brief pause so a down server isn't hit in a tight loop.
            time.sleep(retryDelay)
        else:
            break
    pageResults = parseObjectPage(r.text)
    pageResults['objectIndex'] = objectIndex
    pageResults['objectUrl'] = url
    return pageResults
def runChunkedRequests(chunkSize=100, numChunks=100):
    """
    Get info for objects in consecutive groups, saving one .json file per group.

    Chunk i covers object indices i*chunkSize up to (i+1)*chunkSize - 1 and
    is written to 'chunk<i>.json' in the current working directory as a JSON
    list of the per-object dictionaries. With the defaults this is 100 groups
    of 100 objects, i.e. indices 0 to 9999.

    chunkSize -- number of objects fetched per output file.
    numChunks -- number of output files to produce.
    """
    for i in range(numChunks):
        firstIndex = i * chunkSize
        iChunkResults = [getObjectInfo(j)
                         for j in range(firstIndex, firstIndex + chunkSize)]
        outfileName = 'chunk' + str(i) + '.json'
        with open(outfileName, 'w') as outfile:
            json.dump(iChunkResults, outfile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment