# tim-fan/tepapaCollectionsScrape.py

## tepapaCollectionsScrape.py
from bs4 import BeautifulSoup
import requests
import json

def unlistIfSingle(givenList):
    """
    Collapse a one-element sequence to its sole element.

    Returns givenList[0] when givenList holds exactly one item; a sequence of
    any other length (including an empty one) is handed back unchanged.
    """
    return givenList[0] if len(givenList) == 1 else givenList


def parseTable(table):
    """
    Parse the object page's specification table into a dictionary.

    Every <tr> with at least two <td> cells contributes one entry: the key is
    the first cell's ``.string`` and the value is the stripped text fragments
    of the second cell (a bare string when there is exactly one fragment,
    otherwise a list, via unlistIfSingle).

    Rows with fewer than two <td> cells (for example header rows built from
    <th>) are skipped instead of raising IndexError.

    Returns an empty dict when table is None.
    """
    if table is None:
        return {}

    tableDict = {}
    # Calling a tag is BeautifulSoup shorthand for find_all().
    for row in table('tr'):
        cells = row('td')
        if len(cells) < 2:
            # Not a key/value row; nothing to record.
            continue
        key = cells[0].string
        tableDict[key] = unlistIfSingle(list(cells[1].stripped_strings))
    return tableDict

def parseRelatedInfo(relatedInfoDiv):
    """
    Parse the object page's related-info section into a dictionary.

    The section is expected to hold 'strong' tags (parameter names), each
    followed by one or more 'a' tags (that parameter's values). Each name maps
    to its value, or to a list of values when there is not exactly one.

    Returns an empty dict when relatedInfoDiv is None or when it contains no
    div of class "webpart". Links that appear before the first 'strong' tag
    have no parameter to belong to and are ignored.
    """
    if relatedInfoDiv is None:
        return {}

    webpart = relatedInfoDiv.find('div', class_="webpart")
    if webpart is None:
        # Section exists but lacks the expected container: nothing to parse.
        return {}

    relatedInfoDict = {}
    # Value list of the most recent 'strong' heading; None until one is seen.
    currentValues = None
    for element in webpart.find_all(['strong', 'a']):
        if element.name == 'strong':
            # Current element is a param name: start a fresh value list.
            currentValues = []
            relatedInfoDict[element.string] = currentValues
        elif currentValues is not None:
            # Current element is a param value.
            currentValues.append(element.string)

    # Don't return single-element lists.
    return {key: unlistIfSingle(val) for key, val in relatedInfoDict.items()}


def parseObjectPage(htmlText):
    """
    Extract object info from given html, returning dictionary of parsed info.

    Two parts of the page are read: the table of class 'specifications' and
    the div of class 'related-info'. Their parsed dictionaries are merged;
    on a key collision the related-info value wins.
    """
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed (and to avoid bs4's parser warning).
    soup = BeautifulSoup(htmlText, 'html.parser')

    specDict = parseTable(soup.find('table', class_='specifications'))
    relatedInfoDict = parseRelatedInfo(soup.find('div', class_='related-info'))

    # Merge parsed results into a single new dict.
    allInfoDict = dict(specDict)
    allInfoDict.update(relatedInfoDict)
    return allInfoDict

def getObjectInfo(objectIndex, timeout=30, retryDelay=1):
    r"""
    Get and parse data for object of given index.

    The object's page is fetched from collections.tepapa.govt.nz and parsed
    with parseObjectPage; 'objectIndex' and 'objectUrl' entries are then added
    to the result.

    Failed requests are retried indefinitely. Only request-level failures
    (requests.exceptions.RequestException, which includes timeouts) trigger a
    retry, so KeyboardInterrupt and programming errors still propagate.

    objectIndex: index of the object; appended to the base URL.
    timeout:     seconds passed to requests.get as its timeout.
    retryDelay:  seconds to wait before retrying after a failed request.

    E.g:
    >>> getObjectInfo(46209)  # doctest: +SKIP
    {'Belonged to:': 'Seddon, Richard',
     'Classification': 'swords',
     'Credit line': 'Gift of Dame Elizabeth Knox Gilmer, 1955',
     'Dimensions': ['Overall:  \r\n945mm  (Length) \r\n                        x 110mm  (Width) \r\n                        x 60mm  (Depth)',
      'Overall:  \r\n830mm  (Length) \r\n                        x 25mm  (Width) \r\n                        x 20mm  (Depth)',
      'Overall:  \r\n1010mm  (Length) \r\n                        x 170mm  (Width) \r\n                        x 15mm  (Depth)',
      'Overall:  \r\n950mm  (Length) \r\n                        x 180mm  (Width) \r\n                        x 35mm  (Depth)',
      'Overall:  \r\n990mm  (Length) \r\n                        x 110mm  (Width) \r\n                        x 60mm  (Depth)'],
     'Made by:': 'Hill Brothers',
     'Made in:': 'England (United Kingdom)',
     'Made of:': 'steel',
     'Materials': 'steel',
     'Medium summary': 'Blade is steel,  hilt is gilt brass.',
     'Part of:': 'History collection',
     'Production': 'Hill Brothers (manufacturer(s)), 1897, England',
     'Refers to:': 'Hill Brothers',
     'Registration number': 'PC000760',
     'Title': 'Ceremonial sword   ("Court Sword").',
     'Type of:': 'swords',
     'objectIndex': 46209,
     'objectUrl': 'http://collections.tepapa.govt.nz/Object/46209'}
    """
    # Local import keeps this block self-contained.
    import time

    url = 'http://collections.tepapa.govt.nz/Object/' + str(objectIndex)
    while True:
        try:
            response = requests.get(url, timeout=timeout)
            break
        except requests.exceptions.RequestException:
            print('Request failed. Retrying...')
            # Brief pause so a struggling server is not hammered.
            time.sleep(retryDelay)

    pageResults = parseObjectPage(response.text)
    pageResults['objectIndex'] = objectIndex
    pageResults['objectUrl'] = url
    return pageResults

def runChunkedRequests(numChunks=100, chunkSize=100):
    """
    Get info for objects in groups, saving a .json output file for each group.

    Chunk i covers object indices i*chunkSize up to (but excluding)
    (i+1)*chunkSize and is written to 'chunk<i>.json' in the current
    directory. With the defaults this is 100 groups of 100, for a total of
    10,000 objects.

    numChunks: number of groups (and output files) to produce.
    chunkSize: number of consecutive object indices per group.
    """
    for i in range(numChunks):
        start = i * chunkSize
        iChunkResults = [getObjectInfo(j) for j in range(start, start + chunkSize)]
        outfileName = 'chunk' + str(i) + '.json'
        with open(outfileName, 'w') as outfile:
            json.dump(iChunkResults, outfile)
	from bs4 import BeautifulSoup
	import requests
	import json

	def unlistIfSingle(givenList):
	    """
	    Collapse a one-element sequence to its sole element.

	    Returns givenList[0] when givenList holds exactly one item; a sequence
	    of any other length (including an empty one) is returned unchanged.
	    """
	    if len(givenList) == 1:
	        return givenList[0]
	    return givenList


	def parseTable(table):
	    """
	    Parse the object page's specification table into a dictionary.

	    Every <tr> with at least two <td> cells contributes one entry: the key
	    is the first cell's ``.string`` and the value is the stripped text
	    fragments of the second cell (a bare string when there is exactly one
	    fragment, otherwise a list, via unlistIfSingle).

	    Rows with fewer than two <td> cells are skipped instead of raising
	    IndexError. Returns an empty dict when table is None.
	    """
	    if table is None:
	        return {}

	    tableDict = {}
	    # Calling a tag is BeautifulSoup shorthand for find_all().
	    for row in table('tr'):
	        cells = row('td')
	        if len(cells) < 2:
	            # Not a key/value row; nothing to record.
	            continue
	        key = cells[0].string
	        tableDict[key] = unlistIfSingle(list(cells[1].stripped_strings))
	    return tableDict

	def parseRelatedInfo(relatedInfoDiv):
	    """
	    Parse the object page's related-info section into a dictionary.

	    The section is expected to hold 'strong' tags (parameter names), each
	    followed by one or more 'a' tags (that parameter's values). Each name
	    maps to its value, or to a list of values when there is not exactly one.

	    Returns an empty dict when relatedInfoDiv is None or when it contains
	    no div of class "webpart". Links that appear before the first 'strong'
	    tag have no parameter to belong to and are ignored.
	    """
	    if relatedInfoDiv is None:
	        return {}

	    webpart = relatedInfoDiv.find('div', class_="webpart")
	    if webpart is None:
	        # Section exists but lacks the expected container.
	        return {}

	    relatedInfoDict = {}
	    # Value list of the most recent 'strong' heading; None until one is seen.
	    currentValues = None
	    for element in webpart.find_all(['strong', 'a']):
	        if element.name == 'strong':
	            # Current element is a param name: start a fresh value list.
	            currentValues = []
	            relatedInfoDict[element.string] = currentValues
	        elif currentValues is not None:
	            # Current element is a param value.
	            currentValues.append(element.string)

	    # Don't return single-element lists.
	    return {key: unlistIfSingle(val) for key, val in relatedInfoDict.items()}


	def parseObjectPage(htmlText):
	    """
	    Extract object info from given html, returning dictionary of parsed info.

	    Two parts of the page are read: the table of class 'specifications' and
	    the div of class 'related-info'. Their parsed dictionaries are merged;
	    on a key collision the related-info value wins.
	    """
	    # Name the parser explicitly so results do not depend on which optional
	    # parsers happen to be installed (and to avoid bs4's parser warning).
	    soup = BeautifulSoup(htmlText, 'html.parser')

	    specDict = parseTable(soup.find('table', class_='specifications'))
	    relatedInfoDict = parseRelatedInfo(soup.find('div', class_='related-info'))

	    # Merge parsed results into a single new dict.
	    allInfoDict = dict(specDict)
	    allInfoDict.update(relatedInfoDict)
	    return allInfoDict

	def getObjectInfo(objectIndex, timeout=30, retryDelay=1):
	    r"""
	    Get and parse data for object of given index.

	    The object's page is fetched from collections.tepapa.govt.nz and parsed
	    with parseObjectPage; 'objectIndex' and 'objectUrl' entries are then
	    added to the result.

	    Failed requests are retried indefinitely. Only request-level failures
	    (requests.exceptions.RequestException, which includes timeouts) trigger
	    a retry, so KeyboardInterrupt and programming errors still propagate.

	    objectIndex: index of the object; appended to the base URL.
	    timeout:     seconds passed to requests.get as its timeout.
	    retryDelay:  seconds to wait before retrying after a failed request.

	    E.g:
	    >>> getObjectInfo(46209)  # doctest: +SKIP
	    {'Belonged to:': 'Seddon, Richard',
	     'Classification': 'swords',
	     'Credit line': 'Gift of Dame Elizabeth Knox Gilmer, 1955',
	     'Dimensions': ['Overall: \r\n945mm (Length) \r\n x 110mm (Width) \r\n x 60mm (Depth)',
	      'Overall: \r\n830mm (Length) \r\n x 25mm (Width) \r\n x 20mm (Depth)',
	      'Overall: \r\n1010mm (Length) \r\n x 170mm (Width) \r\n x 15mm (Depth)',
	      'Overall: \r\n950mm (Length) \r\n x 180mm (Width) \r\n x 35mm (Depth)',
	      'Overall: \r\n990mm (Length) \r\n x 110mm (Width) \r\n x 60mm (Depth)'],
	     'Made by:': 'Hill Brothers',
	     'Made in:': 'England (United Kingdom)',
	     'Made of:': 'steel',
	     'Materials': 'steel',
	     'Medium summary': 'Blade is steel, hilt is gilt brass.',
	     'Part of:': 'History collection',
	     'Production': 'Hill Brothers (manufacturer(s)), 1897, England',
	     'Refers to:': 'Hill Brothers',
	     'Registration number': 'PC000760',
	     'Title': 'Ceremonial sword ("Court Sword").',
	     'Type of:': 'swords',
	     'objectIndex': 46209,
	     'objectUrl': 'http://collections.tepapa.govt.nz/Object/46209'}
	    """
	    # Local import keeps this block self-contained.
	    import time

	    url = 'http://collections.tepapa.govt.nz/Object/' + str(objectIndex)
	    while True:
	        try:
	            response = requests.get(url, timeout=timeout)
	            break
	        except requests.exceptions.RequestException:
	            print('Request failed. Retrying...')
	            # Brief pause so a struggling server is not hammered.
	            time.sleep(retryDelay)

	    pageResults = parseObjectPage(response.text)
	    pageResults['objectIndex'] = objectIndex
	    pageResults['objectUrl'] = url
	    return pageResults

	def runChunkedRequests(numChunks=100, chunkSize=100):
	    """
	    Get info for objects in groups, saving a .json output file for each group.

	    Chunk i covers object indices i*chunkSize up to (but excluding)
	    (i+1)*chunkSize and is written to 'chunk<i>.json' in the current
	    directory. With the defaults this is 100 groups of 100, for a total of
	    10,000 objects.

	    numChunks: number of groups (and output files) to produce.
	    chunkSize: number of consecutive object indices per group.
	    """
	    for i in range(numChunks):
	        start = i * chunkSize
	        iChunkResults = [getObjectInfo(j) for j in range(start, start + chunkSize)]
	        outfileName = 'chunk' + str(i) + '.json'
	        with open(outfileName, 'w') as outfile:
	            json.dump(iChunkResults, outfile)