Skip to content

Instantly share code, notes, and snippets.

@wincentbalin
Created December 21, 2021 20:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wincentbalin/c650d10e63754eca2a9bb8186ba7f35b to your computer and use it in GitHub Desktop.
Save wincentbalin/c650d10e63754eca2a9bb8186ba7f35b to your computer and use it in GitHub Desktop.
ORACC cuneiform text scraper
#!/usr/bin/env python
"""Export cuneiform corpus from ORACC
"""
import sys
import os
import argparse
import logging
import re
import shutil
from collections import defaultdict
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.remote import webelement
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
# Base URL of the ORACC site; every page address is resolved against it.
ORACC_SITE = 'http://oracc.museum.upenn.edu'  # short URL: 'http://oracc.org'

# One capital letter followed by six digits, searched inside the "Numbers" entries.
RE_OBJECT_NUMBER = re.compile(r'([A-Z][0-9]{6})')
# Captures the two quoted arguments of a javascript:cuneifyPopup('...','...') href.
RE_CUNEIFORM_HREF = re.compile(r'javascript:cuneifyPopup\(\'([\w/]+)\',\'(\w+)\'\)')
# Runs of characters that are stripped from the lines unless annotations are kept.
# NOTE(review): the class contains U+FFFD (replacement character), possibly an
# encoding artifact of some other sign -- confirm against the upstream script.
RE_REPLACE_ANNOTATION = re.compile(r'[\[\]A-Za-z0-9#⸢⸣?₅�]+')
# Matches as soon as the text has at least one non-whitespace character.
RE_NON_EMPTY = re.compile(r'\S+')

# Paragraphs holding the text lines inside the "cuneify-text" table.
XPATH_CUNEIFY_LINES = (
    '//table[@class="cuneify-text"]/tbody'
    '/tr[contains(@class, "cuneify-line")]'
    '/td/p[@class="cuneify-content"]'
)

# Paths of all text files written so far (filled only when a corpus file is requested).
EXPORTED_FILES = set()

# Short bin key -> detail strings (as shown on an object page) that select this bin.
LANGUAGE_BINS = {
    'he': [
        'Hellenistic (281-261)',
    ],
    'ah': [
        'Achaemenid (539-530)',
    ],
    'na': [
        'Neo-Assyrian',
        'Neo-Assyrian (668-ca. 631)',
        'Neo-Assyrian (671-670)',
        'Neo-Assyrian (673, 672)',
        'Neo-Assyrian (676)',
        'Neo-Assyrian (676-669)',
        'Neo-Assyrian (677)',
        'Neo-Assyrian (680-669)',
        'Neo-Assyrian (688-681)',
        'Neo-Assyrian (704-681)',
        'Neo-Assyrian (744-727)',
        'Neo-Assyrian (1068-1047)',
    ],
    'nb': [
        'Neo-Babylonian',
        'Neo-Babylonian (562–560)',
        'Neo-Babylonian (1025-1008)',
        'Neo-Babylonian (1068-1047)',
        'Neo-Babylonian (1081-1069)',
    ],
    'mb': [
        'Middle Babylonian (early 10th century)',
        'Middle Babylonian (984-979)',
        'Middle Babylonian (987-985)',
        'Middle Babylonian (1004-988)',
        'Middle Babylonian (1007-1005)',
        'Middle Babylonian (1025-1008)',
        'Middle Babylonian (1033-1026)',
        'Middle Babylonian (1068-1047)',
        'Middle Babylonian (1081-1069)',
        'Middle Babylonian (1099-1082)',
        'Middle Babylonian (1103-1100)',
        'Middle Babylonian (1139-1132)',
    ],
}

# Bin key -> list of (corpus_name, object_name) pairs collected during the export.
LANGUAGE_BINS_FOR_EXPORTED_FILES = defaultdict(list)
def get_language(details: list) -> str:
    """Return the language bin key for the first detail found in LANGUAGE_BINS.

    The details are examined in order; for each one the bins are scanned in
    their definition order.

    Raises:
        ValueError: if none of the details is listed in any bin.
    """
    for entry in details:
        bin_key = next(
            (key for key, known_names in LANGUAGE_BINS.items() if entry in known_names),
            None,
        )
        if bin_key is not None:
            return bin_key
    raise ValueError('Unknown language')
def get_object_number(numbers: list) -> str:
    """Return the first object number found in any of the given strings.

    Each string is searched with RE_OBJECT_NUMBER, so the number may be
    embedded in surrounding text.

    Raises:
        ValueError: if no string contains an object number.
    """
    for candidate in numbers:
        found = RE_OBJECT_NUMBER.search(candidate)
        if found is None:
            continue
        return found.group(1)
    raise ValueError('Unknown object number')
def wait_for_xpath(wd: webdriver, xpath: str, timeout=10):
    """Block until an element matching *xpath* is present, or *timeout* seconds pass.

    The wait is delegated to WebDriverWait(...).until(...), so whatever that
    call raises on expiry propagates to the caller.
    """
    locator = (By.XPATH, xpath)
    element_present = EC.presence_of_element_located(locator)
    WebDriverWait(wd, timeout).until(element_present)
def store_cuneiform(args: argparse.Namespace, lines: list, corpus_name: str, object_name: str):
    """Write the cuneiform lines of one object to <directory>/<corpus>/<object>.txt.

    Args:
        args: parsed command line; ``annotations``, ``directory`` and
            ``corpus_file`` are read.
        lines: text lines as extracted from the cuneified page.
        corpus_name: corpus path with '/' separators; it becomes the
            sub-directory below ``args.directory``.
        object_name: base name of the output file (without extension).

    Nothing is written when the resulting text contains only whitespace.
    When ``args.corpus_file`` is set, the written path is recorded in
    EXPORTED_FILES.
    """
    if not args.annotations:
        # Strip everything matched by RE_REPLACE_ANNOTATION from every line
        lines = [RE_REPLACE_ANNOTATION.sub('', line) for line in lines]
    contents = '\n'.join(lines)
    if not RE_NON_EMPTY.search(contents):
        return
    # Write cuneiform characters into file
    dirname = os.path.join(args.directory, corpus_name.replace('/', os.path.sep))
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(dirname, 0o755, exist_ok=True)
    filename = os.path.join(dirname, '{}.txt'.format(object_name))
    with open(filename, 'w', encoding='utf-8') as f:
        print(contents, file=f)
    if args.corpus_file:
        EXPORTED_FILES.add(filename)
def extract_cuneiform(args: argparse.Namespace, wd: webdriver, cuneified_link: webelement, window_handles: list):
    """Open the "Cuneified" popup of the current object page and store its text.

    Args:
        args: parsed command line; ``language_dir`` is read here and the
            namespace is passed on to store_cuneiform.
        wd: web driver currently showing the object page.
        cuneified_link: the "Cuneified" link element on that page.
        window_handles: window handles that existed before the click; used to
            detect the newly opened window.

    The function leaves the driver switched to the (closed) popup window; the
    caller has to switch back to its own window afterwards.

    Raises:
        ValueError: propagated from get_language when ``args.language_dir`` is
            set and none of the page details is a known language.
    """
    href = cuneified_link.get_attribute('href')
    href_match = RE_CUNEIFORM_HREF.match(href or '')
    if href_match is None:
        # Not a cuneifyPopup(...) link: skip it before any window is opened
        logging.warning('Unexpected Cuneified link target: {}'.format(href))
        return
    corpus_name, object_name = href_match.groups()

    if args.language_dir:
        xpath_details = '//div[contains(@class, "xmdoutline")]/h3[text()="Details"]/following::ul[1]/li'
        details = [el.text for el in wd.find_elements(By.XPATH, xpath_details)]
        language = get_language(details)
        LANGUAGE_BINS_FOR_EXPORTED_FILES[language].append((corpus_name, object_name))

    cuneified_link.click()
    if wd.name == 'firefox':
        WebDriverWait(wd, 10).until(EC.new_window_is_opened(window_handles))
    wd.switch_to.window('cuneified')

    # From here on the popup is the current window; always close it again
    try:
        # Workaround to ensure that the cuneify-text table exists
        try:
            wait_for_xpath(wd, '//table[@class="cuneify-text"]')
        except TimeoutException:
            try:
                wd.find_element(By.XPATH, '//table[@class="cuneify-text"]/tbody')
            except NoSuchElementException:
                return
        cuneiform_lines = [line.text for line in wd.find_elements(By.XPATH, XPATH_CUNEIFY_LINES)]
        store_cuneiform(args, cuneiform_lines, corpus_name, object_name)
    finally:
        wd.close()
def export_corpus(args: argparse.Namespace, wd: webdriver, name: str):
    """Export the cuneified text of every object listed on a corpus page.

    Args:
        args: parsed command line, passed through to extract_cuneiform.
        wd: web driver used for all navigation.
        name: project path that is joined with ORACC_SITE and '/corpus'.

    Returns early with a warning when the page has no designations table.

    Raises:
        ValueError: propagated from get_object_number when an object page
            shows no recognisable object number.
    """
    logging.info('Opening corpus {}'.format(name))
    wd.get(urljoin(ORACC_SITE, '{}/{}'.format(name, 'corpus')))
    xpath_designations_table = '//div[@id="p3right"]/table[@class="xmd"]'
    try:
        wd.find_element(By.XPATH, xpath_designations_table)
    except NoSuchElementException:
        logging.warning('No objects found in the corpus {}'.format(name))
        return

    # Save all object link texts, as we go to object pages and back to object list
    object_link_texts = [link.text for link in wd.find_elements(By.XPATH, '//tr[not(@class)]/td[2]/a')]
    logging.debug('Found {} objects'.format(len(object_link_texts)))

    xpath_outline_panel = '//div[contains(@class, "xmdoutline")]'
    xpath_object_numbers = '//div[contains(@class, "xmdoutline")]/h3[text()="Numbers"]/following::ul[1]/li'
    for object_link_text in object_link_texts:
        # Open object page; the link is looked up again by its text on every pass
        object_link = wd.find_element(By.LINK_TEXT, object_link_text)
        object_link.click()
        wait_for_xpath(wd, xpath_outline_panel)
        object_number = get_object_number([el.text for el in wd.find_elements(By.XPATH, xpath_object_numbers)])
        logging.info('Processing object {}'.format(object_number))

        # Prepare for switching windows
        window_handles = wd.window_handles
        oracc_window_handle = wd.current_window_handle

        # Search for Cuneified link
        try:
            # Open Cuneified link and process text
            cuneified_link = wd.find_element(By.LINK_TEXT, 'Cuneified')
            extract_cuneiform(args, wd, cuneified_link, window_handles)
        except NoSuchElementException:
            # Objects without cuneified text are skipped, but leave a trace
            logging.debug('No cuneified text available for object {}'.format(object_number))

        # Return to original window
        wd.switch_to.window(oracc_window_handle)
        # Go back
        wd.back()
        wait_for_xpath(wd, xpath_designations_table)
def _append_exported_file(exported_file: str, corpus) -> None:
    """Append one exported text file, plus a newline, to the open *corpus* file."""
    try:
        with open(exported_file, 'r', encoding='utf-8') as inp:
            logging.debug('Processing {}...'.format(exported_file))
            shutil.copyfileobj(inp, corpus)
            print('', file=corpus)  # add newline at the end of every file
    except FileNotFoundError:
        logging.warning('File {} not found'.format(exported_file))


def export_all(args: argparse.Namespace):
    """Export all selected ORACC projects and build the requested corpus files.

    Args:
        args: parsed command line; ``browser``, ``starting_index``,
            ``corpora``, ``directory``, ``language_dir`` and ``corpus_file``
            are read here.

    Raises:
        NotImplementedError: if ``args.browser`` is neither 'firefox' nor
            'phantomjs'.
    """
    # Initialise webdriver
    if args.browser == 'firefox':
        wd = webdriver.Firefox()
    elif args.browser == 'phantomjs':
        wd = webdriver.PhantomJS()
    else:
        raise NotImplementedError('Software not tested with browser {}'.format(args.browser))

    try:
        # Get list of projects
        logging.info('Getting list of projects')
        wd.get(urljoin(ORACC_SITE, 'projectlist.html'))
        xpath_projects = '//div[@class="projects"]/div[@class="subproject-entry"]/h2[@class="proj-head"]/a'
        wait_for_xpath(wd, xpath_projects)
        project_names = [el.get_attribute('href').replace(ORACC_SITE, '')
                         for el in wd.find_elements(By.XPATH, xpath_projects)]
        logging.info('Available projects:')
        for project_index, project_name in enumerate(project_names, 1):
            logging.info('{}. {}'.format(project_index, project_name))

        if args.starting_index:
            # The index is 1-based, matching the numbering logged above
            del project_names[0:args.starting_index-1]
            if project_names:
                logging.info('Starting with {}'.format(project_names[0]))
            else:
                logging.warning('Starting index {} is beyond the last project'.format(args.starting_index))

        for project_name in project_names:
            # Without --corpora everything is exported; otherwise only listed names
            if not args.corpora or project_name in args.corpora:
                export_corpus(args, wd, project_name)
    finally:
        # Shut the browser down even when an export step fails
        wd.quit()

    if args.language_dir:
        os.makedirs(args.language_dir, 0o755, exist_ok=True)
        for language, exported_files in LANGUAGE_BINS_FOR_EXPORTED_FILES.items():
            language_corpus_filename = os.path.join(args.language_dir, 'corpus_{}.txt'.format(language))
            logging.info('Creating corpus file {}...'.format(language_corpus_filename))
            with open(language_corpus_filename, 'w', encoding='utf-8') as language_corpus:
                for corpus_name, object_name in exported_files:
                    # Same location store_cuneiform writes to (below args.directory)
                    exported_file = os.path.join(args.directory,
                                                 corpus_name.replace('/', os.path.sep),
                                                 '{}.txt'.format(object_name))
                    _append_exported_file(exported_file, language_corpus)

    if args.corpus_file:
        logging.info('Creating corpus file {}...'.format(args.corpus_file))
        with open(args.corpus_file, 'w', encoding='utf-8') as corpus:
            for exported_file in sorted(EXPORTED_FILES):
                _append_exported_file(exported_file, corpus)
def main():
    """Parse the command line, configure logging and run the export."""
    argparser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__)
    argparser.add_argument('-d', '--directory', help='Output directory', default=os.getcwd())
    # The value is optional: a bare "-a" switches the option on, while the
    # previous "-a VALUE" form is still accepted.
    argparser.add_argument('-a', '--annotations', nargs='?', const=True, default=False,
                           help='Do not remove annotations')
    argparser.add_argument('-c', '--corpora', help='Download only these corpora')
    argparser.add_argument('-s', '--starting_index', type=int, help='Index of corpus to start with')
    argparser.add_argument('-f', '--corpus_file', help='Specify corpus file')
    argparser.add_argument('-b', '--browser', choices=['firefox', 'phantomjs'], default='firefox',
                           help='Browser to use for accessing ORACC')
    argparser.add_argument('-l', '--language_dir', help='Directory with language bins')
    args = argparser.parse_args()
    logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO)
    export_all(args)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment