wincentbalin/oracc-export.py

## oracc-export.py
#!/usr/bin/env python
"""Export cuneiform corpus from ORACC
"""

import sys
import os
import argparse
import logging
import re
import shutil
from collections import defaultdict
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.remote import webelement
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException


# Base URL of the ORACC site; project and corpus URLs are joined onto it.
ORACC_SITE = 'http://oracc.museum.upenn.edu'  # short URL: 'http://oracc.org'
# One uppercase letter followed by six digits; group 1 is the object number.
RE_OBJECT_NUMBER = re.compile(r'([A-Z][0-9]{6})')
# href of a "Cuneified" link: group 1 is the corpus name, group 2 the object name.
RE_CUNEIFORM_HREF = re.compile(r'javascript:cuneifyPopup\(\'([\w/]+)\',\'(\w+)\'\)')
# Runs of these characters are deleted from every line by store_cuneiform()
# unless annotations are to be kept.
RE_REPLACE_ANNOTATION = re.compile(r'[\[\]A-Za-z0-9#⸢⸣?₅�]+')
# Matches a run of non-whitespace characters.
RE_NON_EMPTY = re.compile(r'\S+')
# XPath of the paragraphs that hold the text lines inside the cuneify-text table.
XPATH_CUNEIFY_LINES = \
    '//table[@class="cuneify-text"]/tbody/tr[contains(@class, "cuneify-line")]/td/p[@class="cuneify-content"]'
# Paths of the text files written by store_cuneiform(); filled only when a corpus file is requested.
EXPORTED_FILES = set()
# Language code -> texts of "Details" list entries that get_language() accepts for that code.
# NOTE(review): 'Neo-Babylonian (562–560)' uses an en dash while the other ranges use a
# hyphen - confirm this matches the text shown on the site.
LANGUAGE_BINS = {'he': ['Hellenistic (281-261)'],
                 'ah': ['Achaemenid (539-530)'],
                 'na': ['Neo-Assyrian',
                        'Neo-Assyrian (668-ca. 631)',
                        'Neo-Assyrian (671-670)',
                        'Neo-Assyrian (673, 672)',
                        'Neo-Assyrian (676)',
                        'Neo-Assyrian (676-669)',
                        'Neo-Assyrian (677)',
                        'Neo-Assyrian (680-669)',
                        'Neo-Assyrian (688-681)',
                        'Neo-Assyrian (704-681)',
                        'Neo-Assyrian (744-727)',
                        'Neo-Assyrian (1068-1047)'],
                 'nb': ['Neo-Babylonian',
                        'Neo-Babylonian (562–560)',
                        'Neo-Babylonian (1025-1008)',
                        'Neo-Babylonian (1068-1047)',
                        'Neo-Babylonian (1081-1069)'],
                 'mb': ['Middle Babylonian (early 10th century)',
                        'Middle Babylonian (984-979)',
                        'Middle Babylonian (987-985)',
                        'Middle Babylonian (1004-988)',
                        'Middle Babylonian (1007-1005)',
                        'Middle Babylonian (1025-1008)',
                        'Middle Babylonian (1033-1026)',
                        'Middle Babylonian (1068-1047)',
                        'Middle Babylonian (1081-1069)',
                        'Middle Babylonian (1099-1082)',
                        'Middle Babylonian (1103-1100)',
                        'Middle Babylonian (1139-1132)']}
# Language code -> list of (corpus_name, object_name) pairs collected by extract_cuneiform().
LANGUAGE_BINS_FOR_EXPORTED_FILES = defaultdict(list)


def get_language(details: list, bins: dict = None) -> str:
    """Get language name from list of details.

    :param details: texts of the "Details" entries of an object page
    :param bins: mapping of language code to the detail texts that select it;
                 defaults to LANGUAGE_BINS
    :return: language code of the first detail that is found in any bin
    :raises ValueError: if no detail belongs to any bin
    """
    if bins is None:
        bins = LANGUAGE_BINS
    for detail in details:
        for lang, lang_bin in bins.items():
            if detail in lang_bin:
                return lang
    # Only reached when every detail was checked without a match; the former
    # for/else had no break, so its else branch was just this fall-through.
    raise ValueError('Unknown language')


def get_object_number(numbers: list, pattern=None) -> str:
    """Get object number even from complicated formats.

    :param numbers: texts of the "Numbers" entries of an object page
    :param pattern: compiled regular expression whose first group captures the
                    object number; defaults to RE_OBJECT_NUMBER
    :return: first group of the first entry in which the pattern is found
    :raises ValueError: if the pattern is found in none of the entries
    """
    if pattern is None:
        pattern = RE_OBJECT_NUMBER
    for number in numbers:
        match = pattern.search(number)
        if match:
            return match.group(1)
    # Only reached when no entry matched; the former for/else had no break,
    # so its else branch was just this fall-through.
    raise ValueError('Unknown object number')


def wait_for_xpath(wd: webdriver, xpath: str, timeout=10):
    """Wait until an element matching the XPath expression is present.

    :param wd: web driver to poll
    :param xpath: XPath expression of the awaited element
    :param timeout: value handed to WebDriverWait as its timeout
    """
    locator = (By.XPATH, xpath)
    element_present = EC.presence_of_element_located(locator)
    waiter = WebDriverWait(wd, timeout)
    waiter.until(element_present)


def store_cuneiform(args: argparse.Namespace, lines: list, corpus_name: str, object_name: str):
    """Write the text lines of one object into its own UTF-8 text file.

    The file is <args.directory>/<corpus_name>/<object_name>.txt, where the
    slashes of the corpus name become directory separators. Nothing is written
    when the text consists of whitespace only.

    :param args: parsed command line options; reads annotations, directory
                 and corpus_file
    :param lines: text lines of the object
    :param corpus_name: corpus name, possibly containing slashes
    :param object_name: name of the object, used as the file name stem
    """
    if not args.annotations:
        lines = [RE_REPLACE_ANNOTATION.sub('', line) for line in lines]
    contents = '\n'.join(lines)
    # Nothing but whitespace left: do not create an empty file
    if not contents.strip():
        return
    # Write cuneiform characters into file
    dirname = os.path.join(args.directory, corpus_name.replace('/', os.path.sep))
    # exist_ok avoids the race between a separate existence check and the creation
    os.makedirs(dirname, 0o755, exist_ok=True)
    filename = os.path.join(dirname, '{}.txt'.format(object_name))
    with open(filename, 'w', encoding='utf-8') as f:
        print(contents, file=f)
    # Remember the file only if it has to be merged into a corpus file later
    if args.corpus_file:
        EXPORTED_FILES.add(filename)


def extract_cuneiform(args: argparse.Namespace, wd: webdriver, cuneified_link: webelement, window_handles: list):
    """Open the popup behind a "Cuneified" link and store the text it shows.

    :param args: parsed command line options; reads language_dir and is passed
                 on to store_cuneiform()
    :param wd: web driver currently showing the object page
    :param cuneified_link: the "Cuneified" link element of that page
    :param window_handles: window handles that existed before the click
    :raises ValueError: from get_language() if language bins are requested and
                        no detail of the object matches a known language
    """
    href = cuneified_link.get_attribute('href')
    match = RE_CUNEIFORM_HREF.match(href or '')
    if match is None:
        # An unexpected link target used to end in an AttributeError on None
        logging.warning('Unexpected target of Cuneified link: {!r}'.format(href))
        return
    corpus_name, object_name = match.groups()
    if args.language_dir:
        xpath_details = '//div[contains(@class, "xmdoutline")]/h3[text()="Details"]/following::ul[1]/li'
        # find_elements(By.XPATH, ...) replaces find_elements_by_xpath(),
        # which is no longer available in Selenium 4
        details = [el.text for el in wd.find_elements(By.XPATH, xpath_details)]
        language = get_language(details)
        LANGUAGE_BINS_FOR_EXPORTED_FILES[language].append((corpus_name, object_name))
    cuneified_link.click()
    if wd.name == 'firefox':
        WebDriverWait(wd, 10).until(EC.new_window_is_opened(window_handles))
    wd.switch_to.window('cuneified')
    # From here on the popup is the current window; close it on every path
    try:
        # Workaround to ensure that the cuneify-text table exists
        try:
            wait_for_xpath(wd, '//table[@class="cuneify-text"]')
        except TimeoutException:
            try:
                wd.find_element(By.XPATH, '//table[@class="cuneify-text"]/tbody')
            except NoSuchElementException:
                return
        cuneiform_lines = [line.text for line in wd.find_elements(By.XPATH, XPATH_CUNEIFY_LINES)]
        store_cuneiform(args, cuneiform_lines, corpus_name, object_name)
    finally:
        wd.close()


def export_corpus(args: argparse.Namespace, wd: webdriver, name: str):
    """Export the text of every object listed on the page of one corpus.

    Opens the corpus page, then visits each listed object in turn, hands its
    "Cuneified" link to extract_cuneiform() and navigates back to the list.

    :param args: parsed command line options, passed on to extract_cuneiform()
    :param wd: web driver used for browsing
    :param name: project name; '<name>/corpus' is joined onto ORACC_SITE
    :raises ValueError: from get_object_number() if an object page shows no
                        recognisable object number
    """
    xpath_designations_table = '//div[@id="p3right"]/table[@class="xmd"]'
    xpath_outline_panel = '//div[contains(@class, "xmdoutline")]'
    xpath_object_numbers = '//div[contains(@class, "xmdoutline")]/h3[text()="Numbers"]/following::ul[1]/li'
    logging.info('Opening corpus {}'.format(name))
    wd.get(urljoin(ORACC_SITE, '{}/{}'.format(name, 'corpus')))
    # find_element(By..., ...) is used throughout: the find_element_by_*
    # shortcuts are no longer available in Selenium 4
    try:
        wd.find_element(By.XPATH, xpath_designations_table)
    except NoSuchElementException:
        logging.warning('No objects found in the corpus {}'.format(name))
        return
    # Save all object link texts, as we go to object pages and back to object list
    object_link_texts = [link.text for link in wd.find_elements(By.XPATH, '//tr[not(@class)]/td[2]/a')]
    logging.debug('Found {} objects'.format(len(object_link_texts)))
    for object_link_text in object_link_texts:
        # Open object page
        wd.find_element(By.LINK_TEXT, object_link_text).click()
        wait_for_xpath(wd, xpath_outline_panel)
        number_texts = [el.text for el in wd.find_elements(By.XPATH, xpath_object_numbers)]
        object_number = get_object_number(number_texts)
        logging.info('Processing object {}'.format(object_number))
        # Prepare for switching windows
        window_handles = wd.window_handles
        oracc_window_handle = wd.current_window_handle
        try:
            # Open Cuneified link and process text
            cuneified_link = wd.find_element(By.LINK_TEXT, 'Cuneified')
            extract_cuneiform(args, wd, cuneified_link, window_handles)
        except NoSuchElementException:
            # Objects without cuneified text are skipped, but no longer silently
            logging.debug('No cuneified text found for object {}'.format(object_number))
        # Return to original window
        wd.switch_to.window(oracc_window_handle)
        # Go back
        wd.back()
        wait_for_xpath(wd, xpath_designations_table)


def _concatenate_files(filenames, corpus):
    """Append the contents of every file in filenames to the open text file corpus."""
    for exported_file in filenames:
        try:
            with open(exported_file, 'r', encoding='utf-8') as inp:
                logging.debug('Processing {}...'.format(exported_file))
                shutil.copyfileobj(inp, corpus)
                print('', file=corpus)  # add newline at the end of every file
        except FileNotFoundError:
            logging.warning('File {} not found'.format(exported_file))


def export_all(args: argparse.Namespace):
    """Export the selected projects, then build the requested corpus files.

    :param args: parsed command line options; reads browser, starting_index,
                 corpora, directory, language_dir and corpus_file
    :raises NotImplementedError: if args.browser is neither 'firefox' nor 'phantomjs'
    """
    # Initialise webdriver
    if args.browser == 'firefox':
        wd = webdriver.Firefox()
    elif args.browser == 'phantomjs':
        # NOTE(review): PhantomJS support appears to have been dropped from
        # Selenium 4 - confirm which Selenium version this script targets.
        wd = webdriver.PhantomJS()
    else:
        raise NotImplementedError('Software not tested with browser {}'.format(args.browser))
    # The browser is shut down even if the export fails half way
    try:
        # Get list of projects
        logging.info('Getting list of projects')
        wd.get(urljoin(ORACC_SITE, 'projectlist.html'))
        xpath_projects = '//div[@class="projects"]/div[@class="subproject-entry"]/h2[@class="proj-head"]/a'
        wait_for_xpath(wd, xpath_projects)
        # find_elements(By.XPATH, ...) replaces find_elements_by_xpath(),
        # which is no longer available in Selenium 4
        project_names = [el.get_attribute('href').replace(ORACC_SITE, '')
                         for el in wd.find_elements(By.XPATH, xpath_projects)]
        logging.info('Available projects:')
        for project_index, project_name in enumerate(project_names, 1):
            logging.info('{}. {}'.format(project_index, project_name))
        if args.starting_index:
            # The index is 1-based, as in the list logged above
            del project_names[0:args.starting_index-1]
            if project_names:
                logging.info('Starting with {}'.format(project_names[0]))
            else:
                # project_names[0] used to raise an IndexError here
                logging.warning('Starting index {} is beyond the last project'.format(args.starting_index))
        for project_name in project_names:
            if not args.corpora or project_name in args.corpora:
                export_corpus(args, wd, project_name)
    finally:
        wd.quit()
    if args.language_dir:
        os.makedirs(args.language_dir, 0o755, exist_ok=True)
        for language, exported_files in LANGUAGE_BINS_FOR_EXPORTED_FILES.items():
            language_corpus_filename = os.path.join(args.language_dir, 'corpus_{}.txt'.format(language))
            logging.info('Creating corpus file {}...'.format(language_corpus_filename))
            # The object files live below args.directory, where store_cuneiform()
            # wrote them; without that prefix they were only found when the
            # output directory was the current directory
            filenames = [os.path.join(args.directory, corpus_name.replace('/', os.path.sep),
                                      '{}.txt'.format(object_name))
                         for corpus_name, object_name in exported_files]
            with open(language_corpus_filename, 'w', encoding='utf-8') as language_corpus:
                _concatenate_files(filenames, language_corpus)
    if args.corpus_file:
        logging.info('Creating corpus file {}...'.format(args.corpus_file))
        with open(args.corpus_file, 'w', encoding='utf-8') as corpus:
            _concatenate_files(sorted(EXPORTED_FILES), corpus)


def main():
    """Parse the command line, set up logging and run the export."""
    argparser = argparse.ArgumentParser(description=__doc__)
    argparser.add_argument('-d', '--directory', help='Output directory', default=os.getcwd())
    # -a is a switch; it used to demand a value. nargs='?' makes the bare flag
    # work while an explicit value, as required before, is still accepted.
    argparser.add_argument('-a', '--annotations', nargs='?', const=True, default=False,
                           help='Do not remove annotations')
    argparser.add_argument('-c', '--corpora', help='Download only these corpora')
    argparser.add_argument('-s', '--starting_index', type=int, help='Index of corpus to start with')
    argparser.add_argument('-f', '--corpus_file', help='Specify corpus file')
    argparser.add_argument('-b', '--browser', choices=['firefox', 'phantomjs'], default='firefox',
                           help='Browser to use for accessing ORACC')
    argparser.add_argument('-l', '--language_dir', help='Directory with language bins')
    args = argparser.parse_args()
    logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO)
    export_all(args)


if __name__ == '__main__':
    main()
	#!/usr/bin/env python
	"""Export cuneiform corpus from ORACC
	"""

	import sys
	import os
	import argparse
	import logging
	import re
	import shutil
	from collections import defaultdict
	from urllib.parse import urljoin
	from selenium import webdriver
	from selenium.webdriver.remote import webelement
	from selenium.webdriver.common.by import By
	from selenium.webdriver.support.ui import WebDriverWait
	from selenium.webdriver.support import expected_conditions as EC
	from selenium.common.exceptions import NoSuchElementException, TimeoutException


	# Base URL of the ORACC site; project and corpus URLs are joined onto it.
	ORACC_SITE = 'http://oracc.museum.upenn.edu' # short URL: 'http://oracc.org'
	# One uppercase letter followed by six digits; group 1 is the object number.
	RE_OBJECT_NUMBER = re.compile(r'([A-Z][0-9]{6})')
	# href of a "Cuneified" link: group 1 is the corpus name, group 2 the object name.
	RE_CUNEIFORM_HREF = re.compile(r'javascript:cuneifyPopup\(\'([\w/]+)\',\'(\w+)\'\)')
	# Runs of these characters are deleted from every line by store_cuneiform()
	# unless annotations are to be kept.
	RE_REPLACE_ANNOTATION = re.compile(r'[\[\]A-Za-z0-9#⸢⸣?₅�]+')
	# Matches a run of non-whitespace characters.
	RE_NON_EMPTY = re.compile(r'\S+')
	# XPath of the paragraphs that hold the text lines inside the cuneify-text table.
	XPATH_CUNEIFY_LINES = \
	'//table[@class="cuneify-text"]/tbody/tr[contains(@class, "cuneify-line")]/td/p[@class="cuneify-content"]'
	# Paths of the text files written by store_cuneiform(); filled only when a corpus file is requested.
	EXPORTED_FILES = set()
	# Language code -> texts of "Details" list entries that get_language() accepts for that code.
	# NOTE(review): 'Neo-Babylonian (562–560)' uses an en dash while the other ranges use a
	# hyphen - confirm this matches the text shown on the site.
	LANGUAGE_BINS = {'he': ['Hellenistic (281-261)'],
	'ah': ['Achaemenid (539-530)'],
	'na': ['Neo-Assyrian',
	'Neo-Assyrian (668-ca. 631)',
	'Neo-Assyrian (671-670)',
	'Neo-Assyrian (673, 672)',
	'Neo-Assyrian (676)',
	'Neo-Assyrian (676-669)',
	'Neo-Assyrian (677)',
	'Neo-Assyrian (680-669)',
	'Neo-Assyrian (688-681)',
	'Neo-Assyrian (704-681)',
	'Neo-Assyrian (744-727)',
	'Neo-Assyrian (1068-1047)'],
	'nb': ['Neo-Babylonian',
	'Neo-Babylonian (562–560)',
	'Neo-Babylonian (1025-1008)',
	'Neo-Babylonian (1068-1047)',
	'Neo-Babylonian (1081-1069)'],
	'mb': ['Middle Babylonian (early 10th century)',
	'Middle Babylonian (984-979)',
	'Middle Babylonian (987-985)',
	'Middle Babylonian (1004-988)',
	'Middle Babylonian (1007-1005)',
	'Middle Babylonian (1025-1008)',
	'Middle Babylonian (1033-1026)',
	'Middle Babylonian (1068-1047)',
	'Middle Babylonian (1081-1069)',
	'Middle Babylonian (1099-1082)',
	'Middle Babylonian (1103-1100)',
	'Middle Babylonian (1139-1132)']}
	# Language code -> list of (corpus_name, object_name) pairs collected by extract_cuneiform().
	LANGUAGE_BINS_FOR_EXPORTED_FILES = defaultdict(list)


	def get_language(details: list, bins: dict = None) -> str:
	    """Get language name from list of details.

	    :param details: texts of the "Details" entries of an object page
	    :param bins: mapping of language code to the detail texts that select it;
	                 defaults to LANGUAGE_BINS
	    :return: language code of the first detail that is found in any bin
	    :raises ValueError: if no detail belongs to any bin
	    """
	    if bins is None:
	        bins = LANGUAGE_BINS
	    for detail in details:
	        for lang, lang_bin in bins.items():
	            if detail in lang_bin:
	                return lang
	    # Only reached when every detail was checked without a match; the former
	    # for/else had no break, so its else branch was just this fall-through.
	    raise ValueError('Unknown language')


	def get_object_number(numbers: list, pattern=None) -> str:
	    """Get object number even from complicated formats.

	    :param numbers: texts of the "Numbers" entries of an object page
	    :param pattern: compiled regular expression whose first group captures the
	                    object number; defaults to RE_OBJECT_NUMBER
	    :return: first group of the first entry in which the pattern is found
	    :raises ValueError: if the pattern is found in none of the entries
	    """
	    if pattern is None:
	        pattern = RE_OBJECT_NUMBER
	    for number in numbers:
	        match = pattern.search(number)
	        if match:
	            return match.group(1)
	    # Only reached when no entry matched; the former for/else had no break,
	    # so its else branch was just this fall-through.
	    raise ValueError('Unknown object number')


	def wait_for_xpath(wd: webdriver, xpath: str, timeout=10):
	    """Wait until an element matching the XPath expression is present.

	    :param wd: web driver to poll
	    :param xpath: XPath expression of the awaited element
	    :param timeout: value handed to WebDriverWait as its timeout
	    """
	    locator = (By.XPATH, xpath)
	    element_present = EC.presence_of_element_located(locator)
	    waiter = WebDriverWait(wd, timeout)
	    waiter.until(element_present)


	def store_cuneiform(args: argparse.Namespace, lines: list, corpus_name: str, object_name: str):
	    """Write the text lines of one object into its own UTF-8 text file.

	    The file is <args.directory>/<corpus_name>/<object_name>.txt, where the
	    slashes of the corpus name become directory separators. Nothing is written
	    when the text consists of whitespace only.

	    :param args: parsed command line options; reads annotations, directory
	                 and corpus_file
	    :param lines: text lines of the object
	    :param corpus_name: corpus name, possibly containing slashes
	    :param object_name: name of the object, used as the file name stem
	    """
	    if not args.annotations:
	        lines = [RE_REPLACE_ANNOTATION.sub('', line) for line in lines]
	    contents = '\n'.join(lines)
	    # Nothing but whitespace left: do not create an empty file
	    if not contents.strip():
	        return
	    # Write cuneiform characters into file
	    dirname = os.path.join(args.directory, corpus_name.replace('/', os.path.sep))
	    # exist_ok avoids the race between a separate existence check and the creation
	    os.makedirs(dirname, 0o755, exist_ok=True)
	    filename = os.path.join(dirname, '{}.txt'.format(object_name))
	    with open(filename, 'w', encoding='utf-8') as f:
	        print(contents, file=f)
	    # Remember the file only if it has to be merged into a corpus file later
	    if args.corpus_file:
	        EXPORTED_FILES.add(filename)


	def extract_cuneiform(args: argparse.Namespace, wd: webdriver, cuneified_link: webelement, window_handles: list):
	    """Open the popup behind a "Cuneified" link and store the text it shows.

	    :param args: parsed command line options; reads language_dir and is passed
	                 on to store_cuneiform()
	    :param wd: web driver currently showing the object page
	    :param cuneified_link: the "Cuneified" link element of that page
	    :param window_handles: window handles that existed before the click
	    :raises ValueError: from get_language() if language bins are requested and
	                        no detail of the object matches a known language
	    """
	    href = cuneified_link.get_attribute('href')
	    match = RE_CUNEIFORM_HREF.match(href or '')
	    if match is None:
	        # An unexpected link target used to end in an AttributeError on None
	        logging.warning('Unexpected target of Cuneified link: {!r}'.format(href))
	        return
	    corpus_name, object_name = match.groups()
	    if args.language_dir:
	        xpath_details = '//div[contains(@class, "xmdoutline")]/h3[text()="Details"]/following::ul[1]/li'
	        # find_elements(By.XPATH, ...) replaces find_elements_by_xpath(),
	        # which is no longer available in Selenium 4
	        details = [el.text for el in wd.find_elements(By.XPATH, xpath_details)]
	        language = get_language(details)
	        LANGUAGE_BINS_FOR_EXPORTED_FILES[language].append((corpus_name, object_name))
	    cuneified_link.click()
	    if wd.name == 'firefox':
	        WebDriverWait(wd, 10).until(EC.new_window_is_opened(window_handles))
	    wd.switch_to.window('cuneified')
	    # From here on the popup is the current window; close it on every path
	    try:
	        # Workaround to ensure that the cuneify-text table exists
	        try:
	            wait_for_xpath(wd, '//table[@class="cuneify-text"]')
	        except TimeoutException:
	            try:
	                wd.find_element(By.XPATH, '//table[@class="cuneify-text"]/tbody')
	            except NoSuchElementException:
	                return
	        cuneiform_lines = [line.text for line in wd.find_elements(By.XPATH, XPATH_CUNEIFY_LINES)]
	        store_cuneiform(args, cuneiform_lines, corpus_name, object_name)
	    finally:
	        wd.close()


	def export_corpus(args: argparse.Namespace, wd: webdriver, name: str):
	    """Export the text of every object listed on the page of one corpus.

	    Opens the corpus page, then visits each listed object in turn, hands its
	    "Cuneified" link to extract_cuneiform() and navigates back to the list.

	    :param args: parsed command line options, passed on to extract_cuneiform()
	    :param wd: web driver used for browsing
	    :param name: project name; '<name>/corpus' is joined onto ORACC_SITE
	    :raises ValueError: from get_object_number() if an object page shows no
	                        recognisable object number
	    """
	    xpath_designations_table = '//div[@id="p3right"]/table[@class="xmd"]'
	    xpath_outline_panel = '//div[contains(@class, "xmdoutline")]'
	    xpath_object_numbers = '//div[contains(@class, "xmdoutline")]/h3[text()="Numbers"]/following::ul[1]/li'
	    logging.info('Opening corpus {}'.format(name))
	    wd.get(urljoin(ORACC_SITE, '{}/{}'.format(name, 'corpus')))
	    # find_element(By..., ...) is used throughout: the find_element_by_*
	    # shortcuts are no longer available in Selenium 4
	    try:
	        wd.find_element(By.XPATH, xpath_designations_table)
	    except NoSuchElementException:
	        logging.warning('No objects found in the corpus {}'.format(name))
	        return
	    # Save all object link texts, as we go to object pages and back to object list
	    object_link_texts = [link.text for link in wd.find_elements(By.XPATH, '//tr[not(@class)]/td[2]/a')]
	    logging.debug('Found {} objects'.format(len(object_link_texts)))
	    for object_link_text in object_link_texts:
	        # Open object page
	        wd.find_element(By.LINK_TEXT, object_link_text).click()
	        wait_for_xpath(wd, xpath_outline_panel)
	        number_texts = [el.text for el in wd.find_elements(By.XPATH, xpath_object_numbers)]
	        object_number = get_object_number(number_texts)
	        logging.info('Processing object {}'.format(object_number))
	        # Prepare for switching windows
	        window_handles = wd.window_handles
	        oracc_window_handle = wd.current_window_handle
	        try:
	            # Open Cuneified link and process text
	            cuneified_link = wd.find_element(By.LINK_TEXT, 'Cuneified')
	            extract_cuneiform(args, wd, cuneified_link, window_handles)
	        except NoSuchElementException:
	            # Objects without cuneified text are skipped, but no longer silently
	            logging.debug('No cuneified text found for object {}'.format(object_number))
	        # Return to original window
	        wd.switch_to.window(oracc_window_handle)
	        # Go back
	        wd.back()
	        wait_for_xpath(wd, xpath_designations_table)


	def _concatenate_files(filenames, corpus):
	    """Append the contents of every file in filenames to the open text file corpus."""
	    for exported_file in filenames:
	        try:
	            with open(exported_file, 'r', encoding='utf-8') as inp:
	                logging.debug('Processing {}...'.format(exported_file))
	                shutil.copyfileobj(inp, corpus)
	                print('', file=corpus)  # add newline at the end of every file
	        except FileNotFoundError:
	            logging.warning('File {} not found'.format(exported_file))


	def export_all(args: argparse.Namespace):
	    """Export the selected projects, then build the requested corpus files.

	    :param args: parsed command line options; reads browser, starting_index,
	                 corpora, directory, language_dir and corpus_file
	    :raises NotImplementedError: if args.browser is neither 'firefox' nor 'phantomjs'
	    """
	    # Initialise webdriver
	    if args.browser == 'firefox':
	        wd = webdriver.Firefox()
	    elif args.browser == 'phantomjs':
	        # NOTE(review): PhantomJS support appears to have been dropped from
	        # Selenium 4 - confirm which Selenium version this script targets.
	        wd = webdriver.PhantomJS()
	    else:
	        raise NotImplementedError('Software not tested with browser {}'.format(args.browser))
	    # The browser is shut down even if the export fails half way
	    try:
	        # Get list of projects
	        logging.info('Getting list of projects')
	        wd.get(urljoin(ORACC_SITE, 'projectlist.html'))
	        xpath_projects = '//div[@class="projects"]/div[@class="subproject-entry"]/h2[@class="proj-head"]/a'
	        wait_for_xpath(wd, xpath_projects)
	        # find_elements(By.XPATH, ...) replaces find_elements_by_xpath(),
	        # which is no longer available in Selenium 4
	        project_names = [el.get_attribute('href').replace(ORACC_SITE, '')
	                         for el in wd.find_elements(By.XPATH, xpath_projects)]
	        logging.info('Available projects:')
	        for project_index, project_name in enumerate(project_names, 1):
	            logging.info('{}. {}'.format(project_index, project_name))
	        if args.starting_index:
	            # The index is 1-based, as in the list logged above
	            del project_names[0:args.starting_index-1]
	            if project_names:
	                logging.info('Starting with {}'.format(project_names[0]))
	            else:
	                # project_names[0] used to raise an IndexError here
	                logging.warning('Starting index {} is beyond the last project'.format(args.starting_index))
	        for project_name in project_names:
	            if not args.corpora or project_name in args.corpora:
	                export_corpus(args, wd, project_name)
	    finally:
	        wd.quit()
	    if args.language_dir:
	        os.makedirs(args.language_dir, 0o755, exist_ok=True)
	        for language, exported_files in LANGUAGE_BINS_FOR_EXPORTED_FILES.items():
	            language_corpus_filename = os.path.join(args.language_dir, 'corpus_{}.txt'.format(language))
	            logging.info('Creating corpus file {}...'.format(language_corpus_filename))
	            # The object files live below args.directory, where store_cuneiform()
	            # wrote them; without that prefix they were only found when the
	            # output directory was the current directory
	            filenames = [os.path.join(args.directory, corpus_name.replace('/', os.path.sep),
	                                      '{}.txt'.format(object_name))
	                         for corpus_name, object_name in exported_files]
	            with open(language_corpus_filename, 'w', encoding='utf-8') as language_corpus:
	                _concatenate_files(filenames, language_corpus)
	    if args.corpus_file:
	        logging.info('Creating corpus file {}...'.format(args.corpus_file))
	        with open(args.corpus_file, 'w', encoding='utf-8') as corpus:
	            _concatenate_files(sorted(EXPORTED_FILES), corpus)


	def main():
	    """Parse the command line, set up logging and run the export."""
	    argparser = argparse.ArgumentParser(description=__doc__)
	    argparser.add_argument('-d', '--directory', help='Output directory', default=os.getcwd())
	    # -a is a switch; it used to demand a value. nargs='?' makes the bare flag
	    # work while an explicit value, as required before, is still accepted.
	    argparser.add_argument('-a', '--annotations', nargs='?', const=True, default=False,
	                           help='Do not remove annotations')
	    argparser.add_argument('-c', '--corpora', help='Download only these corpora')
	    argparser.add_argument('-s', '--starting_index', type=int, help='Index of corpus to start with')
	    argparser.add_argument('-f', '--corpus_file', help='Specify corpus file')
	    argparser.add_argument('-b', '--browser', choices=['firefox', 'phantomjs'], default='firefox',
	                           help='Browser to use for accessing ORACC')
	    argparser.add_argument('-l', '--language_dir', help='Directory with language bins')
	    args = argparser.parse_args()
	    logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO)
	    export_all(args)


	if __name__ == '__main__':
	    main()