Skip to content

Instantly share code, notes, and snippets.

@DIRKMJK

DIRKMJK/delpher.md

Created May 24, 2020
Embed
What would you like to do?
Count articles on Delpher

Counting articles on Delpher

Delpher is a huge archive containing digitized historic newspapers, journals, books and other sources. Developed by the National Library of the Netherlands, it is a valuable resource for both academic and informal research. I have used it myself to analyse Dutch words for bicycle.

As far as I know, there is no API to access Delpher data. For my analysis of bicycle terms, I manually looked up the number of search results per decade. It would be rather laborious to look up results per year, especially if you wanted to do so for a number of terms.

Therefore, I wrote a Python script that will look up the number of results per year for a given query. Optionally, it will also look up metadata for the first 50 results per year (publication, date of publication, title and snippet), but this will take longer.

"""
Search for term on Delpher newspaper archive and look up
number of results per year
"""
import datetime
import re
import time
import urllib.parse

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
# User-agent string handed to Chrome by get_browser().
USER = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/80.0.3987.122 Safari/537.36"
)

# Every results URL shares the same search path and the same article-type and
# spatial facets; the first '{}' placeholder receives the URL-encoded query.
_RESULTS_PREFIX = (
    'https://www.delpher.nl/nl/kranten/results?query={}'
    '&facets%5Btype%5D%5B%5D=artikel'
    '&facets%5Bspatial%5D%5B%5D=Landelijk'
    '&facets%5Bspatial%5D%5B%5D=Regionaal%7Clokaal'
)
# Query-string key of the period facet; its value is a '%7C'-separated path
# that starts with a level digit (0, 1 or 2 in the templates below).
_PERIOD_FACET = '&facets%5Bperiode%5D%5B%5D='

# Unfiltered search: placeholder is (query).
URL_START = _RESULTS_PREFIX + '&page=1&maxperpage=50&coll=ddd'
# Placeholders are (query, century label).
URL_CENTURY = (
    _RESULTS_PREFIX + _PERIOD_FACET + '0%7C{}%7C' + '&page=1&coll=ddd'
)
# Placeholders are (query, century label, decade label).
URL_DECADE = (
    _RESULTS_PREFIX + _PERIOD_FACET + '1%7C{}%7C{}%7C' + '&page=1&coll=ddd'
)
# Placeholders are (query, century label, decade label, year label).
URL_YEAR = (
    _RESULTS_PREFIX + _PERIOD_FACET + '2%7C{}%7C{}%7C{}%7C'
    + '&page=1&maxperpage=50&coll=ddd'
)

# Seconds passed to time.sleep() between polls of the page.
SLEEP = 5
# Captures a four-digit year and the count in parentheses that follows it.
VALUES = re.compile(r'([0-9]{4}) \(([0-9]*)\)')
# CSS class used to locate facet labels on the results page.
LABEL = 'facet-value__label'
# Prefix a label's 'for' attribute must have to count as a period facet.
PERIOD = 'facet__periode'
def get_browser():
    """Start a Chrome session and return its WebDriver.

    The session is configured with a fixed set of command-line switches and
    with the module-level ``USER`` string as its user-agent. Headless mode
    is not enabled.
    """
    chrome_options = Options()
    for switch in (
        '--disable-dev-shm-usage',
        "--remote-debugging-port=9222",
        '--no-sandbox',
        "user-agent={}".format(USER),
    ):
        chrome_options.add_argument(switch)
    return webdriver.Chrome(options=chrome_options)
def go_to(url, browser, delay=None):
    """Navigate *browser* to *url* and wait until it reports that address.

    After each page load the browser's current URL is compared with *url*
    up to three times, pausing between comparisons. If it still differs,
    'try again' is printed and the page is requested again. Retries are
    unlimited, so this only returns once the browser is on *url*.

    The retry is a loop rather than a recursive call, so a page that keeps
    failing to settle cannot exhaust the interpreter's recursion limit.

    Args:
        url: Address to open; must equal ``browser.current_url`` exactly
            for the navigation to count as successful.
        browser: Object exposing ``get(url)`` and ``current_url`` (a
            Selenium WebDriver).
        delay: Seconds to pause between URL checks. Defaults to the
            module-level ``SLEEP``.

    Returns:
        The same *browser* object, now on *url*.
    """
    checks_per_load = 3
    while True:
        browser.get(url)
        for check in range(checks_per_load):
            if browser.current_url == url:
                return browser
            # No pause after the final check: the page is reloaded right away.
            if check < checks_per_load - 1:
                time.sleep(SLEEP if delay is None else delay)
        print('try again')
def parse_meta(browser):
    """Extract metadata for the articles listed on the current results page.

    Polls the page until at least one ``<article>`` element is present, or
    until an ``<h1>`` reading 'Geen resultaten gevonden' is found.

    Args:
        browser: Selenium WebDriver already showing a results page.

    Returns:
        A ``(meta, browser)`` tuple. ``meta`` holds one dict per article with
        the keys 'title', 'snippet', 'publication' and 'date'; it is empty
        when the page reports no results. 'snippet' is None when the article
        has no element of class 'snippet'. 'publication' and 'date' are the
        texts of the article's first two ``<dd>`` elements, or None where
        fewer than two are present.

    Raises:
        NoSuchElementException: If an article has no ``<h2>`` element.
    """
    articles = []
    while not articles:
        headings = [h.text for h in browser.find_elements(By.TAG_NAME, 'h1')]
        if 'Geen resultaten gevonden' in headings:
            return [], browser
        articles = browser.find_elements(By.TAG_NAME, 'article')
        # The pause follows every poll, including the one that found articles.
        time.sleep(SLEEP)

    meta = []
    for article in articles:
        title = article.find_element(By.TAG_NAME, 'h2').text
        try:
            snippet = article.find_element(By.CLASS_NAME, 'snippet').text
        except NoSuchElementException:
            snippet = None
        details = [
            dd.text for dd in article.find_elements(By.TAG_NAME, 'dd')[:2]
        ]
        # Pad to two entries so an article with missing details yields None
        # values instead of failing the unpacking below.
        details.extend([None] * (2 - len(details)))
        publication, date = details
        meta.append({
            'title': title,
            'snippet': snippet,
            'publication': publication,
            'date': date,
        })
    return meta, browser
def _year_facet_texts(browser, label_dec):
    """Return the texts of the period-facet labels other than *label_dec*'s."""
    texts = []
    for label in browser.find_elements(By.CLASS_NAME, LABEL):
        # get_attribute() gives None for a label without a 'for' attribute.
        target = label.get_attribute('for') or ''
        if not target.startswith(PERIOD):
            continue
        segments = target.split('|')
        # Presumably the second-to-last '|'-separated segment names the facet
        # value; the entry for the decade itself is not a per-year count.
        if len(segments) >= 2 and segments[-2] == label_dec:
            continue
        texts.append(label.text)
    return texts


def get_article_counts(query, get_metadata=False, exact=True):
    """Look up the number of articles matching *query*, per year.

    Opens a browser, visits the results page of every decade from 1600-1609
    up to 2090-2099 and reads the per-year counts from the period facet.
    A progress line is printed after each century. The browser is always
    shut down before returning, also when an error occurs.

    Args:
        query: Search term.
        get_metadata: When falsy, only counts are returned. When 'sample',
            metadata of the first results page of every year in each
            non-empty decade is collected as well. Any other truthy value
            returns an empty metadata list alongside the counts.
        exact: When true and *query* is non-empty, the query is wrapped in
            double quotes to search for the exact phrase.

    Returns:
        A list of ``{'year': str, 'count': str}`` dicts, or a tuple of that
        list and a list of metadata dicts (see ``parse_meta``) when
        *get_metadata* is truthy.
    """
    # Quote first, then URL-encode once: encoding a literal '%22' would turn
    # it into '%2522' and the quotes would no longer reach the search engine.
    phrase = '"{}"'.format(query) if exact and query != '' else query
    q = urllib.parse.quote(phrase)
    want_metadata = bool(get_metadata)
    article_counts = []
    metadata = []
    browser = get_browser()
    try:
        for century in range(17, 22):
            label_cent = '{}e_eeuw'.format(century)
            cent_min_one = century - 1
            for decade in range(10):
                label_dec = (
                    f'{cent_min_one}{decade}0-{cent_min_one}{decade}9'
                )
                url = URL_DECADE.format(q, label_cent, label_dec)
                browser = go_to(url, browser)
                labels = []
                # Poll until year labels appear or the page says there are
                # no results for this decade.
                while not labels:
                    time.sleep(SLEEP)
                    labels = _year_facet_texts(browser, label_dec)
                    headings = [
                        h.text
                        for h in browser.find_elements(By.TAG_NAME, 'h1')
                    ]
                    if 'Geen resultaten gevonden' in headings:
                        break
                counts = []
                for text in labels:
                    # Labels that do not read 'YYYY (N)' carry no year count.
                    match = VALUES.search(text)
                    if match:
                        year, count = match.groups()
                        counts.append({'year': year, 'count': count})
                article_counts.extend(counts)
                if not counts:
                    continue
                if get_metadata == 'sample':
                    for year in range(10):
                        label_yr = '{}{}{}'.format(
                            cent_min_one, decade, year
                        )
                        url = URL_YEAR.format(
                            q, label_cent, label_dec, label_yr
                        )
                        browser = go_to(url, browser)
                        meta, browser = parse_meta(browser)
                        metadata.extend(meta)
            current_hm = datetime.datetime.now().strftime('%H:%M')
            print(label_cent, len(article_counts), current_hm)
            if want_metadata:
                print(len(metadata))
    finally:
        # quit() ends the whole driver session; close() would only close
        # the current window.
        browser.quit()
    if want_metadata:
        return article_counts, metadata
    return article_counts
# Script entry point: prompt for a search term and print its per-year counts.
if __name__ == '__main__':
    search_term = input('query: ')
    yearly_counts = get_article_counts(search_term)
    print(yearly_counts)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.