# Created December 21, 2021 20:08
# Save wincentbalin/c650d10e63754eca2a9bb8186ba7f35b to your computer and use it in GitHub Desktop.
# ORACC cuneiform text scraper
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters.
#!/usr/bin/env python
# NOTE: the module docstring below is also used verbatim as the argparse description in main().
"""Export cuneiform corpus from ORACC
"""
import sys | |
import os | |
import argparse | |
import logging | |
import re | |
import shutil | |
from collections import defaultdict | |
from urllib.parse import urljoin | |
from selenium import webdriver | |
from selenium.webdriver.remote import webelement | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
from selenium.common.exceptions import NoSuchElementException, TimeoutException | |
# Base URL that all project and corpus pages are resolved against.
ORACC_SITE = 'http://oracc.museum.upenn.edu'  # short URL: 'http://oracc.org'
# One capital letter followed by six digits; used to pick the object number out of free-form text.
RE_OBJECT_NUMBER = re.compile(r'([A-Z][0-9]{6})')
# Target of a "Cuneified" link; group 1 is the corpus path, group 2 the object name.
RE_CUNEIFORM_HREF = re.compile(r'javascript:cuneifyPopup\(\'([\w/]+)\',\'(\w+)\'\)')
# Runs of characters that are removed from the cuneiform lines unless annotations are requested.
# NOTE(review): the last character of the class is U+FFFD (replacement character) — possibly a
# mis-encoded original character; confirm against the upstream source.
RE_REPLACE_ANNOTATION = re.compile(r'[\[\]A-Za-z0-9#⸢⸣?₅�]+')
# Matches as soon as the text contains any non-whitespace character.
RE_NON_EMPTY = re.compile(r'\S+')
# Paragraphs holding the cuneiform content of each line in the "cuneified" popup.
XPATH_CUNEIFY_LINES = \
    '//table[@class="cuneify-text"]/tbody/tr[contains(@class, "cuneify-line")]/td/p[@class="cuneify-content"]'
# Paths of all text files written during this run (filled only when a corpus file is requested).
EXPORTED_FILES = set()
# Language code -> exact detail strings (as shown on the object page) that belong to that language bin.
LANGUAGE_BINS = {'he': ['Hellenistic (281-261)'],
                 'ah': ['Achaemenid (539-530)'],
                 'na': ['Neo-Assyrian',
                        'Neo-Assyrian (668-ca. 631)',
                        'Neo-Assyrian (671-670)',
                        'Neo-Assyrian (673, 672)',
                        'Neo-Assyrian (676)',
                        'Neo-Assyrian (676-669)',
                        'Neo-Assyrian (677)',
                        'Neo-Assyrian (680-669)',
                        'Neo-Assyrian (688-681)',
                        'Neo-Assyrian (704-681)',
                        'Neo-Assyrian (744-727)',
                        'Neo-Assyrian (1068-1047)'],
                 'nb': ['Neo-Babylonian',
                        'Neo-Babylonian (562–560)',
                        'Neo-Babylonian (1025-1008)',
                        'Neo-Babylonian (1068-1047)',
                        'Neo-Babylonian (1081-1069)'],
                 'mb': ['Middle Babylonian (early 10th century)',
                        'Middle Babylonian (984-979)',
                        'Middle Babylonian (987-985)',
                        'Middle Babylonian (1004-988)',
                        'Middle Babylonian (1007-1005)',
                        'Middle Babylonian (1025-1008)',
                        'Middle Babylonian (1033-1026)',
                        'Middle Babylonian (1068-1047)',
                        'Middle Babylonian (1081-1069)',
                        'Middle Babylonian (1099-1082)',
                        'Middle Babylonian (1103-1100)',
                        'Middle Babylonian (1139-1132)']}
# Language code -> list of (corpus_name, object_name) pairs collected while exporting.
LANGUAGE_BINS_FOR_EXPORTED_FILES = defaultdict(list)
def get_language(details: list) -> str:
    """Return the language code whose bin lists one of the given details.

    The details are examined in order; for each one the bins of
    ``LANGUAGE_BINS`` are consulted in their definition order and the code
    of the first bin containing that exact string is returned.

    Raises:
        ValueError: if none of the details appears in any language bin.
    """
    for entry in details:
        codes = [code for code, labels in LANGUAGE_BINS.items() if entry in labels]
        if codes:
            return codes[0]
    raise ValueError('Unknown language')
def get_object_number(numbers: list) -> str:
    """Return the first object number found in any of the given strings.

    Each string is searched with ``RE_OBJECT_NUMBER``; the captured number
    of the first string that contains one is returned.

    Raises:
        ValueError: if no string contains an object number.
    """
    searches = (RE_OBJECT_NUMBER.search(text) for text in numbers)
    first_hit = next((hit for hit in searches if hit), None)
    if first_hit is None:
        raise ValueError('Unknown object number')
    return first_hit.group(1)
def wait_for_xpath(wd: webdriver, xpath: str, timeout=10):
    """Wait until an element matching *xpath* is located by the driver.

    Args:
        wd: Selenium WebDriver instance to poll.
        xpath: XPath expression of the element to wait for.
        timeout: value handed to ``WebDriverWait`` as its timeout.
    """
    locator = (By.XPATH, xpath)
    element_present = EC.presence_of_element_located(locator)
    waiter = WebDriverWait(wd, timeout)
    waiter.until(element_present)
def store_cuneiform(args: argparse.Namespace, lines: list, corpus_name: str, object_name: str):
    """Write the cuneiform lines of one object to ``<directory>/<corpus>/<object>.txt``.

    Args:
        args: parsed command-line options; reads ``annotations``,
            ``directory`` and ``corpus_file``.
        lines: text of each cuneiform line, in order.
        corpus_name: corpus path with ``/`` separators; becomes the
            sub-directory below ``args.directory``.
        object_name: base name of the output file.

    Nothing is written when the (possibly stripped) text contains no
    non-whitespace character.
    """
    if not args.annotations:
        # Strip annotation characters; lines that become empty are kept as empty lines
        lines = [RE_REPLACE_ANNOTATION.sub('', line) for line in lines]
    contents = '\n'.join(lines)
    if not RE_NON_EMPTY.search(contents):
        return
    # Write cuneiform characters into file
    dirname = os.path.join(args.directory, corpus_name.replace('/', os.path.sep))
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test
    os.makedirs(dirname, 0o755, exist_ok=True)
    filename = os.path.join(dirname, '{}.txt'.format(object_name))
    with open(filename, 'w', encoding='utf-8') as f:
        print(contents, file=f)
    if args.corpus_file:
        # Remember the file so that it can be merged into the corpus file later
        EXPORTED_FILES.add(filename)
def extract_cuneiform(args: argparse.Namespace, wd: webdriver, cuneified_link: webelement, window_handles: list):
    """Open the "Cuneified" popup of the current object page and store its text.

    Args:
        args: parsed command-line options; reads ``language_dir`` and is
            passed on to store_cuneiform().
        wd: Selenium WebDriver instance showing the object page.
        cuneified_link: the "Cuneified" link element on that page.
        window_handles: window handles that existed before the click; used
            to detect the newly opened popup.

    The popup window is closed before returning; the caller is expected to
    switch back to its own window afterwards.

    Raises:
        ValueError: propagated from get_language() when ``language_dir`` is
            set and the object's details match no language bin.
    """
    href = cuneified_link.get_attribute('href')
    match = RE_CUNEIFORM_HREF.match(href or '')
    if match is None:
        # An unexpected link target would otherwise crash with AttributeError on None
        logging.warning('Unexpected target of Cuneified link: {}'.format(href))
        return
    corpus_name, object_name = match.groups()
    if args.language_dir:
        xpath_details = '//div[contains(@class, "xmdoutline")]/h3[text()="Details"]/following::ul[1]/li'
        # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was removed in Selenium 4.3
        details = [el.text for el in wd.find_elements(By.XPATH, xpath_details)]
        language = get_language(details)
        LANGUAGE_BINS_FOR_EXPORTED_FILES[language].append((corpus_name, object_name))
    cuneified_link.click()
    if wd.name == 'firefox':
        WebDriverWait(wd, 10).until(EC.new_window_is_opened(window_handles))
    wd.switch_to.window('cuneified')
    xpath_table = '//table[@class="cuneify-text"]'
    try:
        # Workaround to ensure that the cuneify-text table exists
        try:
            wait_for_xpath(wd, xpath_table)
        except TimeoutException:
            try:
                wd.find_element(By.XPATH, xpath_table + '/tbody')
            except NoSuchElementException:
                return
        cuneiform_lines = [line.text for line in wd.find_elements(By.XPATH, XPATH_CUNEIFY_LINES)]
        store_cuneiform(args, cuneiform_lines, corpus_name, object_name)
    finally:
        # Close the popup on every path, including errors while reading or storing
        wd.close()
def export_corpus(args: argparse.Namespace, wd: webdriver, name: str):
    """Export the cuneiform text of every object listed in one corpus.

    Opens the ``corpus`` page of project *name*, then visits each listed
    object in turn, passes its "Cuneified" link (when present) to
    extract_cuneiform() and navigates back to the list.

    Args:
        args: parsed command-line options, passed on to extract_cuneiform().
        wd: Selenium WebDriver instance used for all navigation.
        name: project path that is joined onto ``ORACC_SITE``.

    Raises:
        ValueError: propagated from get_object_number() when an object page
            shows no recognisable object number.
    """
    logging.info('Opening corpus {}'.format(name))
    wd.get(urljoin(ORACC_SITE, '{}/{}'.format(name, 'corpus')))
    xpath_designations_table = '//div[@id="p3right"]/table[@class="xmd"]'
    xpath_outline_panel = '//div[contains(@class, "xmdoutline")]'
    xpath_object_numbers = '//div[contains(@class, "xmdoutline")]/h3[text()="Numbers"]/following::ul[1]/li'
    # find_element(By..., ...) replaces the find_element_by_* helpers, which were removed in Selenium 4.3
    try:
        wd.find_element(By.XPATH, xpath_designations_table)
    except NoSuchElementException:
        logging.warning('No objects found in the corpus {}'.format(name))
        return
    # Save all object link texts, as we go to object pages and back to object list
    object_link_texts = [link.text for link in wd.find_elements(By.XPATH, '//tr[not(@class)]/td[2]/a')]
    logging.debug('Found {} objects'.format(len(object_link_texts)))
    for object_link_text in object_link_texts:
        # Open object page; the link is looked up again because the list page was reloaded
        wd.find_element(By.LINK_TEXT, object_link_text).click()
        wait_for_xpath(wd, xpath_outline_panel)
        number_texts = [el.text for el in wd.find_elements(By.XPATH, xpath_object_numbers)]
        object_number = get_object_number(number_texts)
        logging.info('Processing object {}'.format(object_number))
        # Prepare for switching windows
        window_handles = wd.window_handles
        oracc_window_handle = wd.current_window_handle
        # Search for Cuneified link; only the lookup itself is allowed to fail quietly
        try:
            cuneified_link = wd.find_element(By.LINK_TEXT, 'Cuneified')
        except NoSuchElementException:
            logging.debug('Object {} has no Cuneified link'.format(object_number))
        else:
            # Open Cuneified link and process text
            extract_cuneiform(args, wd, cuneified_link, window_handles)
        # Return to original window
        wd.switch_to.window(oracc_window_handle)
        # Go back
        wd.back()
        wait_for_xpath(wd, xpath_designations_table)
def _concatenate_files(filenames, target_filename: str):
    """Concatenate the given text files into *target_filename*, adding a newline after each."""
    logging.info('Creating corpus file {}...'.format(target_filename))
    with open(target_filename, 'w', encoding='utf-8') as target:
        for filename in filenames:
            try:
                with open(filename, 'r', encoding='utf-8') as inp:
                    logging.debug('Processing {}...'.format(filename))
                    shutil.copyfileobj(inp, target)
                    print('', file=target)  # add newline at the end of every file
            except FileNotFoundError:
                # Objects without stored text have no file; skip them
                logging.warning('File {} not found'.format(filename))


def export_all(args: argparse.Namespace):
    """Export all selected ORACC projects and build the requested corpus files.

    Args:
        args: parsed command-line options; reads ``browser``,
            ``starting_index``, ``corpora``, ``directory``, ``language_dir``
            and ``corpus_file``.

    Raises:
        NotImplementedError: if ``args.browser`` is neither ``firefox`` nor
            ``phantomjs``.
    """
    # Initialise webdriver
    if args.browser == 'firefox':
        wd = webdriver.Firefox()
    elif args.browser == 'phantomjs':
        # NOTE(review): PhantomJS support appears to be gone from Selenium 4 — confirm the targeted version
        wd = webdriver.PhantomJS()
    else:
        raise NotImplementedError('Software not tested with browser {}'.format(args.browser))
    try:
        # Get list of projects
        logging.info('Getting list of projects')
        wd.get(urljoin(ORACC_SITE, 'projectlist.html'))
        xpath_projects = '//div[@class="projects"]/div[@class="subproject-entry"]/h2[@class="proj-head"]/a'
        wait_for_xpath(wd, xpath_projects)
        # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was removed in Selenium 4.3
        project_names = [el.get_attribute('href').replace(ORACC_SITE, '')
                         for el in wd.find_elements(By.XPATH, xpath_projects)]
        logging.info('Available projects:')
        for project_index, project_name in enumerate(project_names, 1):
            logging.info('{}. {}'.format(project_index, project_name))
        if args.starting_index:
            # The index is 1-based, matching the numbered list logged above
            del project_names[0:args.starting_index-1]
            if project_names:
                logging.info('Starting with {}'.format(project_names[0]))
            else:
                logging.warning('Starting index {} is beyond the last project'.format(args.starting_index))
        for project_name in project_names:
            # NOTE(review): args.corpora is a single string as declared in main(), so this is a
            # substring test — confirm that this is intended
            if not args.corpora or project_name in args.corpora:
                export_corpus(args, wd, project_name)
    finally:
        # Shut the browser down even when an export step fails
        wd.quit()
    if args.language_dir:
        os.makedirs(args.language_dir, 0o755, exist_ok=True)
        for language, exported_files in LANGUAGE_BINS_FOR_EXPORTED_FILES.items():
            language_corpus_filename = os.path.join(args.language_dir, 'corpus_{}.txt'.format(language))
            # Object files are written below args.directory, so they must be read from there as well
            object_files = [os.path.join(args.directory,
                                         corpus_name.replace('/', os.path.sep),
                                         '{}.txt'.format(object_name))
                            for corpus_name, object_name in exported_files]
            _concatenate_files(object_files, language_corpus_filename)
    if args.corpus_file:
        _concatenate_files(sorted(EXPORTED_FILES), args.corpus_file)
def main():
    """Parse the command line, configure logging and run the export."""
    argparser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__)
    argparser.add_argument('-d', '--directory', help='Output directory', default=os.getcwd())
    # The option is described as a switch, so it must work without a value; nargs='?'
    # still accepts the former "-a VALUE" form
    argparser.add_argument('-a', '--annotations', nargs='?', const=True, default=False,
                           help='Do not remove annotations')
    argparser.add_argument('-c', '--corpora', help='Download only these corpora')
    argparser.add_argument('-s', '--starting_index', type=int, help='Index of corpus to start with')
    argparser.add_argument('-f', '--corpus_file', help='Specify corpus file')
    argparser.add_argument('-b', '--browser', choices=['firefox', 'phantomjs'], default='firefox',
                           help='Browser to use for accessing ORACC')
    argparser.add_argument('-l', '--language_dir', help='Directory with language bins')
    args = argparser.parse_args()
    logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO)
    export_all(args)


if __name__ == '__main__':
    main()
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment.