|
""" |
|
Search for term on Delpher newspaper archive and look up |
|
number of results per year |
|
""" |
|
|
|
import time |
|
import re |
|
import datetime |
|
import urllib.parse |
|
from selenium import webdriver |
|
from selenium.webdriver.chrome.options import Options |
|
from selenium.common.exceptions import NoSuchElementException |
|
|
|
|
|
# Desktop-Chrome user-agent sent with every request so Delpher serves the
# regular (non-bot) results page.
USER = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"

# Search URL templates.  All restrict results to newspaper articles
# (type=artikel) from national + regional/local papers, 50 per page.
# Placeholders: URL_START -> (query); URL_CENTURY -> (query, century facet);
# URL_DECADE -> (query, century facet, decade facet);
# URL_YEAR -> (query, century facet, decade facet, year facet).
URL_START = 'https://www.delpher.nl/nl/kranten/results?query={}&facets%5Btype%5D%5B%5D=artikel&facets%5Bspatial%5D%5B%5D=Landelijk&facets%5Bspatial%5D%5B%5D=Regionaal%7Clokaal&page=1&maxperpage=50&coll=ddd'

URL_CENTURY = 'https://www.delpher.nl/nl/kranten/results?query={}&facets%5Btype%5D%5B%5D=artikel&facets%5Bspatial%5D%5B%5D=Landelijk&facets%5Bspatial%5D%5B%5D=Regionaal%7Clokaal&facets%5Bperiode%5D%5B%5D=0%7C{}%7C&page=1&coll=ddd'

URL_DECADE = 'https://www.delpher.nl/nl/kranten/results?query={}&facets%5Btype%5D%5B%5D=artikel&facets%5Bspatial%5D%5B%5D=Landelijk&facets%5Bspatial%5D%5B%5D=Regionaal%7Clokaal&facets%5Bperiode%5D%5B%5D=1%7C{}%7C{}%7C&page=1&coll=ddd'

URL_YEAR = 'https://www.delpher.nl/nl/kranten/results?query={}&facets%5Btype%5D%5B%5D=artikel&facets%5Bspatial%5D%5B%5D=Landelijk&facets%5Bspatial%5D%5B%5D=Regionaal%7Clokaal&facets%5Bperiode%5D%5B%5D=2%7C{}%7C{}%7C{}%7C&page=1&maxperpage=50&coll=ddd'

# Seconds to wait between polls of a page that is still loading.
SLEEP = 5

# Parses a year facet label such as '1871 (123)' into ('1871', '123').
VALUES = re.compile(r'([0-9]{4})[\s]+\(([0-9]*)\)')

# CSS class of facet labels on the results page.
LABEL = 'label--icon-text'

# Prefix of the 'for' attribute that marks period (year/decade) facets.
PERIOD = 'facet__periode'
|
|
|
|
|
def get_browser():
    """Spin up a Chrome WebDriver configured for scraping Delpher."""
    opts = Options()
    # opts.headless = True
    for flag in (
            '--disable-dev-shm-usage',
            '--remote-debugging-port=9222',
            '--no-sandbox',
            'user-agent={}'.format(USER),
    ):
        opts.add_argument(flag)
    return webdriver.Chrome(options=opts)
|
|
|
|
|
def go_to(url, browser):
    """Navigate *browser* to *url* and wait until it actually lands there.

    Re-issues the request after every three unsuccessful polls (the site
    sometimes redirects or stalls).  Returns the browser once
    ``browser.current_url`` equals *url*.
    """
    browser.get(url)
    attempts = 0
    while browser.current_url != url:
        attempts += 1
        if attempts > 2:
            # Retry iteratively.  The original recursed into go_to() here,
            # which reset the counter but grew the call stack without bound
            # for a URL that never matches.
            print('try again')
            browser.get(url)
            attempts = 0
        time.sleep(SLEEP)
    return browser
|
|
|
|
|
def parse_meta(browser):
    """Scrape title/snippet/publication/date for every article on the page.

    Polls until <article> elements appear; returns ([], browser) early when
    the page reports no results ('Geen krantenartikelen gevonden').
    """
    records = []
    articles = []
    while not articles:
        headings = [h.text for h in browser.find_elements_by_tag_name('h1')]
        if 'Geen krantenartikelen gevonden' in headings:
            return [], browser
        articles = browser.find_elements_by_tag_name('article')
        time.sleep(SLEEP)
    for item in articles:
        title = item.find_element_by_tag_name('h2').text
        try:
            snippet = item.find_element_by_class_name('snippet').text
        except NoSuchElementException:
            # Some hits render without a text snippet.
            snippet = None
        dd_texts = [dd.text for dd in item.find_elements_by_tag_name('dd')]
        publication, date = dd_texts[:2]
        records.append({
            'title': title,
            'snippet': snippet,
            'publication': publication,
            'date': date,
        })
    return records, browser
|
|
|
|
|
def get_article_counts(query, get_metadata=False, exact=True):
    """Look up the number of Delpher newspaper articles per year.

    Walks the 17th-21st century decade facets and reads the per-year
    counts from the facet labels.

    Parameters
    ----------
    query : str
        Search term.
    get_metadata : bool or str
        Falsy: return counts only.  Truthy: also collect article metadata;
        the special value 'sample' scrapes the first results page of every
        year in each non-empty decade.
    exact : bool
        Wrap the query in quote marks so Delpher matches the exact phrase.

    Returns
    -------
    list of {'year': str, 'count': str} dicts; when *get_metadata* is not
    False, a (counts, metadata) tuple instead.
    """
    browser = get_browser()
    article_counts = []
    # Percent-encode the term FIRST, then add the literal %22 quote marks.
    # Quoting after wrapping re-encodes the percent signs ('%22' becomes
    # '%2522'), so the search engine would receive literal '%22' text
    # instead of quotation marks.
    if exact and query != '':
        q = '%22{}%22'.format(urllib.parse.quote(query))
    else:
        q = urllib.parse.quote(query)
    if get_metadata:
        metadata = []
    for century in range(17, 22):
        label_cent = '{}e_eeuw'.format(century)
        for decade in range(10):
            cent_min_one = century - 1
            label_dec = f'{cent_min_one}{decade}0-{cent_min_one}{decade}9'
            url = URL_DECADE.format(q, label_cent, label_dec)
            browser = go_to(url, browser)
            labels = []
            # Poll until the year facets render, or bail out when the page
            # reports no results at all for this decade.
            while len(labels) == 0:
                time.sleep(SLEEP)
                options = browser.find_elements_by_class_name(LABEL)
                labels = [
                    o.text for o in options
                    if o.get_attribute('for').startswith(PERIOD)
                    and o.text != label_dec  # skip the decade facet itself
                ]
                els = [e.text for e in browser.find_elements_by_tag_name('h1')]
                if 'Geen krantenartikelen gevonden' in els:
                    break
            counts = []
            for label in labels:
                matched = VALUES.findall(label)
                # Ignore facet labels that are not of the 'YYYY (n)' form
                # instead of crashing on an unexpected label.
                if matched:
                    year, count = matched[0]
                    counts.append({'year': year, 'count': count})
            article_counts.extend(counts)
            if not counts:
                continue
            if get_metadata == 'sample':
                for year in range(10):
                    label_yr = '{}{}{}'.format(century - 1, decade, year)
                    url = URL_YEAR.format(q, label_cent, label_dec, label_yr)
                    browser = go_to(url, browser)
                    meta, browser = parse_meta(browser)
                    metadata.extend(meta)
        # Progress report once per century.
        current_hm = datetime.datetime.now().strftime('%H:%M')
        print(label_cent, len(article_counts), current_hm)
    if get_metadata is not False:
        print(len(metadata))
    browser.close()
    if get_metadata is not False:
        return article_counts, metadata
    return article_counts
|
|
|
|
|
if __name__ == '__main__':
    # Interactive entry point: prompt for a search term and print the
    # per-year article counts.
    term = input('query: ')
    print(get_article_counts(term))