|
""" |
|
Search for term on Delpher newspaper archive and look up |
|
number of results per year |
|
""" |
|
|
|
import time |
|
import re |
|
import datetime |
|
import urllib.parse |
|
from selenium import webdriver |
|
from selenium.webdriver.chrome.options import Options |
|
from selenium.common.exceptions import NoSuchElementException |
|
|
|
|
|
# Desktop-Chrome user-agent sent with every request so Delpher serves the
# regular (non-bot) results page.
USER = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"

# Search URL templates.  All restrict results to newspaper articles
# (type=artikel) from national + regional/local papers, 50 per page.
# Placeholders: URL_START -> (query); URL_CENTURY -> (query, century facet);
# URL_DECADE -> (query, century facet, decade facet);
# URL_YEAR -> (query, century facet, decade facet, year facet).
URL_START = 'https://www.delpher.nl/nl/kranten/results?query={}&facets%5Btype%5D%5B%5D=artikel&facets%5Bspatial%5D%5B%5D=Landelijk&facets%5Bspatial%5D%5B%5D=Regionaal%7Clokaal&page=1&maxperpage=50&coll=ddd'

URL_CENTURY = 'https://www.delpher.nl/nl/kranten/results?query={}&facets%5Btype%5D%5B%5D=artikel&facets%5Bspatial%5D%5B%5D=Landelijk&facets%5Bspatial%5D%5B%5D=Regionaal%7Clokaal&facets%5Bperiode%5D%5B%5D=0%7C{}%7C&page=1&coll=ddd'

URL_DECADE = 'https://www.delpher.nl/nl/kranten/results?query={}&facets%5Btype%5D%5B%5D=artikel&facets%5Bspatial%5D%5B%5D=Landelijk&facets%5Bspatial%5D%5B%5D=Regionaal%7Clokaal&facets%5Bperiode%5D%5B%5D=1%7C{}%7C{}%7C&page=1&coll=ddd'

URL_YEAR = 'https://www.delpher.nl/nl/kranten/results?query={}&facets%5Btype%5D%5B%5D=artikel&facets%5Bspatial%5D%5B%5D=Landelijk&facets%5Bspatial%5D%5B%5D=Regionaal%7Clokaal&facets%5Bperiode%5D%5B%5D=2%7C{}%7C{}%7C{}%7C&page=1&maxperpage=50&coll=ddd'

# Seconds to wait between polls of a page that is still loading.
SLEEP = 5

# Parses a year facet label such as '1871 (123)' into ('1871', '123').
VALUES = re.compile(r'([0-9]{4})[\s]+\(([0-9]*)\)')

# CSS class of facet labels on the results page.
LABEL = 'label--icon-text'

# Prefix of the 'for' attribute that marks period (year/decade) facets.
PERIOD = 'facet__periode'
|
|
|
|
|
def get_browser():
    """Spin up a Chrome WebDriver configured for scraping Delpher."""
    opts = Options()
    # opts.headless = True
    for flag in (
            '--disable-dev-shm-usage',
            '--remote-debugging-port=9222',
            '--no-sandbox',
            'user-agent={}'.format(USER),
    ):
        opts.add_argument(flag)
    return webdriver.Chrome(options=opts)
|
|
|
|
|
def go_to(url, browser):
    """Navigate *browser* to *url* and wait until it actually lands there.

    Re-issues the request after every three unsuccessful polls (the site
    sometimes redirects or stalls).  Returns the browser once
    ``browser.current_url`` equals *url*.
    """
    browser.get(url)
    attempts = 0
    while browser.current_url != url:
        attempts += 1
        if attempts > 2:
            # Retry iteratively.  The original recursed into go_to() here,
            # which reset the counter but grew the call stack without bound
            # for a URL that never matches.
            print('try again')
            browser.get(url)
            attempts = 0
        time.sleep(SLEEP)
    return browser
|
|
|
|
|
def parse_meta(browser):
    """Scrape title/snippet/publication/date for every article on the page.

    Polls until <article> elements appear; returns ([], browser) early when
    the page reports no results ('Geen krantenartikelen gevonden').
    """
    records = []
    articles = []
    while not articles:
        headings = [h.text for h in browser.find_elements_by_tag_name('h1')]
        if 'Geen krantenartikelen gevonden' in headings:
            return [], browser
        articles = browser.find_elements_by_tag_name('article')
        time.sleep(SLEEP)
    for item in articles:
        title = item.find_element_by_tag_name('h2').text
        try:
            snippet = item.find_element_by_class_name('snippet').text
        except NoSuchElementException:
            # Some hits render without a text snippet.
            snippet = None
        dd_texts = [dd.text for dd in item.find_elements_by_tag_name('dd')]
        publication, date = dd_texts[:2]
        records.append({
            'title': title,
            'snippet': snippet,
            'publication': publication,
            'date': date,
        })
    return records, browser
|
|
|
|
|
def get_article_counts(query, get_metadata=False, exact=True):
    """Look up the number of Delpher newspaper articles per year.

    Walks the 17th-21st century decade facets and reads the per-year
    counts from the facet labels.

    Parameters
    ----------
    query : str
        Search term.
    get_metadata : bool or str
        Falsy: return counts only.  Truthy: also collect article metadata;
        the special value 'sample' scrapes the first results page of every
        year in each non-empty decade.
    exact : bool
        Wrap the query in quote marks so Delpher matches the exact phrase.

    Returns
    -------
    list of {'year': str, 'count': str} dicts; when *get_metadata* is not
    False, a (counts, metadata) tuple instead.
    """
    browser = get_browser()
    article_counts = []
    # Percent-encode the term FIRST, then add the literal %22 quote marks.
    # Quoting after wrapping re-encodes the percent signs ('%22' becomes
    # '%2522'), so the search engine would receive literal '%22' text
    # instead of quotation marks.
    if exact and query != '':
        q = '%22{}%22'.format(urllib.parse.quote(query))
    else:
        q = urllib.parse.quote(query)
    if get_metadata:
        metadata = []
    for century in range(17, 22):
        label_cent = '{}e_eeuw'.format(century)
        for decade in range(10):
            cent_min_one = century - 1
            label_dec = f'{cent_min_one}{decade}0-{cent_min_one}{decade}9'
            url = URL_DECADE.format(q, label_cent, label_dec)
            browser = go_to(url, browser)
            labels = []
            # Poll until the year facets render, or bail out when the page
            # reports no results at all for this decade.
            while len(labels) == 0:
                time.sleep(SLEEP)
                options = browser.find_elements_by_class_name(LABEL)
                labels = [
                    o.text for o in options
                    if o.get_attribute('for').startswith(PERIOD)
                    and o.text != label_dec  # skip the decade facet itself
                ]
                els = [e.text for e in browser.find_elements_by_tag_name('h1')]
                if 'Geen krantenartikelen gevonden' in els:
                    break
            counts = []
            for label in labels:
                matched = VALUES.findall(label)
                # Ignore facet labels that are not of the 'YYYY (n)' form
                # instead of crashing on an unexpected label.
                if matched:
                    year, count = matched[0]
                    counts.append({'year': year, 'count': count})
            article_counts.extend(counts)
            if not counts:
                continue
            if get_metadata == 'sample':
                for year in range(10):
                    label_yr = '{}{}{}'.format(century - 1, decade, year)
                    url = URL_YEAR.format(q, label_cent, label_dec, label_yr)
                    browser = go_to(url, browser)
                    meta, browser = parse_meta(browser)
                    metadata.extend(meta)
        # Progress report once per century.
        current_hm = datetime.datetime.now().strftime('%H:%M')
        print(label_cent, len(article_counts), current_hm)
    if get_metadata is not False:
        print(len(metadata))
    browser.close()
    if get_metadata is not False:
        return article_counts, metadata
    return article_counts
|
|
|
|
|
if __name__ == '__main__':
    # Interactive entry point: prompt for a search term and print the
    # per-year article counts.
    term = input('query: ')
    print(get_article_counts(term))