Skip to content

Instantly share code, notes, and snippets.

@georgevanburgh
Created May 18, 2016 11:34
Show Gist options
  • Save georgevanburgh/7d9175f85beb9d7da736c2b7c88ee754 to your computer and use it in GitHub Desktop.
Save georgevanburgh/7d9175f85beb9d7da736c2b7c88ee754 to your computer and use it in GitHub Desktop.
import microdata
from xml.dom.minidom import parse
import json
import urllib
# URL of the BBC Food sitemap; getRecipeURLs() downloads and parses it.
SITEMAP = "http://www.bbc.co.uk/food/sitemap.xml"
# Sitemap entries containing this string are treated as recipe pages.
# NOTE: despite the name this is a URL path prefix, matched as a substring.
RECIPE_SUBDOMAIN = "http://www.bbc.co.uk/food/recipes/"
def getRecipe(givenRecipeURL):
    """Download a recipe page and return the first microdata item on it.

    givenRecipeURL -- address of the page to fetch.

    The page is opened with urllib.urlopen and handed to
    microdata.get_items; only element 0 of the result is returned, so a
    page yielding no items fails on the index lookup (presumably an
    IndexError -- depends on what get_items returns).
    """
    page = urllib.urlopen(givenRecipeURL)
    items = microdata.get_items(page)
    return items[0]
def getRecipeURLs():
    """Return the recipe page URLs listed in the sitemap.

    Downloads SITEMAP, parses it as XML and collects the text of every
    <loc> element that contains RECIPE_SUBDOMAIN.

    This is best-effort: any error while fetching or parsing is printed
    and whatever was collected so far (possibly an empty list) is
    returned instead of raising.
    """
    recipeURLs = []
    try:
        response = urllib.urlopen(SITEMAP)
        try:
            dom = parse(response)
        finally:
            # Release the connection as soon as the document is parsed.
            response.close()
        for node in dom.getElementsByTagName("loc"):
            # An empty <loc/> has no text child; skip it rather than
            # aborting the whole scan with an IndexError.
            if not node.childNodes:
                continue
            url = node.childNodes[0].data
            if RECIPE_SUBDOMAIN in url:
                recipeURLs.append(url)
    except Exception as e:
        # Deliberately broad: a failed sitemap fetch should not crash the
        # caller. (The original wrote the clause as "except e as Exception",
        # which itself raised NameError when an error occurred.)
        print(e)
    return recipeURLs
if __name__ == '__main__':
    # Fetch every recipe listed in the sitemap and save each one as
    # data/<index>.json. open() does not create directories, so the
    # data/ directory must already exist.
    recipeURLs = getRecipeURLs()
    for i, recipeURL in enumerate(recipeURLs):
        # Get the recipe
        recipe = getRecipe(recipeURL).json_dict()
        # Add an ID
        recipe['id'] = recipeURL
        # Strip surrounding whitespace from every instruction. Slice
        # assignment updates the existing list in place.
        instructions = recipe['properties']['recipeInstructions']
        instructions[:] = [step.strip() for step in instructions]
        # Write the json out to a file; the with-block closes the file
        # even if serialisation or the write fails part-way.
        with open('data/' + str(i) + '.json', 'w') as f:
            json.dump(recipe, f)
        print("Saved recipe {}".format(i))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment