Skip to content

Instantly share code, notes, and snippets.

@LvanWissen
Last active November 12, 2020 17:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save LvanWissen/1aa74dee61e85923826b6372a24f9367 to your computer and use it in GitHub Desktop.
Download van metadata (csv + json) van alle affiches uit het Afficheproject van het Geheugen van Nederland (Delpher)
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Delpher / Geheugen van Nederland search query for all posters of the
# Eye Afficheproject (1000 results per page); the page number is appended
# to this string at request time.
URL = "https://geheugen.delpher.nl/nl/geheugen/results?query=Eye+AND+afficheproject&maxperpage=1000&coll=ngvn&page="
def main(destination, pages, URL=URL):
    """Download poster metadata and write it as both JSON and CSV.

    Parameters
    ----------
    destination : str
        Base filename (without extension) for the output files.
    pages : int
        Number of search-result pages to fetch.
    URL : str
        Search URL to which the page number is appended.
    """
    data = getData(pages=pages, URL=URL)
    # Explicit UTF-8 keeps the output stable across platforms (Windows would
    # otherwise use a locale-dependent codec), and ensure_ascii=False writes
    # accented characters — common in Dutch titles — readably instead of as
    # \uXXXX escapes.
    with open(f'{destination}.json', 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, indent=4, ensure_ascii=False)
    # data is a dict-of-dicts keyed by record identifier; transpose so each
    # record becomes one CSV row.
    df = pd.DataFrame(data)
    df.T.to_csv(f'{destination}.csv', sep=';')
def getData(pages, URL):
    """Collect article metadata from every search-result page.

    Parameters
    ----------
    pages : int
        Number of result pages to request (pages 1..pages, inclusive).
    URL : str
        Base search URL; the page number is appended to it.

    Returns
    -------
    dict
        Mapping of record identifier -> metadata, accumulated over all pages.
    """
    collected = dict()
    page = 1
    while page <= pages:
        response = requests.get(URL + str(page))
        collected = parseHTML(response.text, data=collected)
        page += 1
    return collected
def parseHTML(html, data):
    """Extract article metadata from one result page into *data*.

    Parameters
    ----------
    html : str
        Raw HTML of a Delpher search-results page.
    data : dict
        Accumulator mapping identifier -> metadata; extended in place.

    Returns
    -------
    dict
        The same *data* dict, with this page's records added.
    """
    # Name the parser explicitly: BeautifulSoup(html) alone guesses whichever
    # parser happens to be installed, which emits GuessedAtParserWarning and
    # can produce different trees on different machines.
    soup = BeautifulSoup(html, 'html.parser')
    for article in soup.find_all('article'):
        # Each <article> tag carries its identifier and a JSON metadata blob
        # as HTML data attributes.
        uri = article.attrs['data-identifier']
        data[uri] = json.loads(article.attrs['data-metadata'])
    return data
if __name__ == "__main__":
    # 52 result pages covered the full Afficheproject collection at the
    # time this gist was written.
    main(destination="eye_afficheproject", pages=52, URL=URL)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment