# Import CIARA corpus data into Alveo
#
# GitHub gist stevecassidy/1956a6c84807efdcb10626d31bcaf2ca, created June 3, 2019 01:00.
import pyalveo | |
import re | |
import os | |
# Map file extensions (lower-case, including the leading dot, as returned by
# os.path.splitext) to the Alveo media type name recorded in each document's
# "dcterms:type" metadata.
EXT_MAP = {
    '.wav': "Audio",
    '.txt': "Text",
    '.mp4': "Video",
}
def parse_filename(fname):
    """Parse a corpus filename into its metadata fields.

    Filenames are expected to have three underscore-separated parts before
    the extension: ``<YYYYMMDD><recorder>_<session>_<speaker>``.
    For example ``20180719LSJB_01_composite.wav`` gives::

        {
            'filename': '20180719LSJB_01_composite.wav',
            'itemname': '20180719LSJB_01',
            'date': '2018-07-19',
            'recorder': 'LSJB',
            'session': '01',
            'speaker': 'composite',
        }

    Return None if the filename doesn't match the expected pattern, i.e. it
    does not have exactly three underscore-separated parts or the first part
    does not start with eight digits.
    """
    basename, _ext = os.path.splitext(fname)

    # Check the part count explicitly: unpacking a wrong-sized split would
    # raise ValueError instead of returning None as documented.
    parts = basename.split('_')
    if len(parts) != 3:
        return None
    datespkr, session, speaker = parts

    # split datespkr (20180719LSJB) into year, month, day and recorder
    match = re.match(r'(\d\d\d\d)(\d\d)(\d\d)(.*)', datespkr)
    if match is None:
        return None

    year, month, day, recorder = match.groups()
    return {
        'filename': fname,
        'itemname': datespkr + '_' + session,
        'date': year + '-' + month + '-' + day,
        'recorder': recorder,
        'speaker': speaker,
        'session': session,
    }
def corpus_items(basedir):
    """Return an iterator over the items in the corpus.

    Every sub-directory of ``basedir`` is scanned and its files are grouped
    by the item name that parse_filename() derives from the filename.
    Each item is yielded as a dictionary holding the shared base metadata
    plus:

        'dcterms:created'  the date parsed from the filename
        'dcterms:title'    the item name
        'olac:speakers'    list of distinct speaker labels found for the item
        'files'            list of paths (basedir/directory/filename)

    Entries of ``basedir`` that are not directories, and files whose names
    cannot be parsed, are skipped.
    """
    base_meta = {
        'dcterms:creator': 'Joe Blyth',
        "ausnc:mode": "spoken",
        "ausnc:communication_context": "face-to-face",
        "olac:language": "eng",
    }
    # Labels in the speaker slot of a filename that are not recorded as
    # speakers (presumably mixed-down or device tracks -- confirm).
    non_speakers = ('composite', 'X400', 'XA30')

    for directory in os.listdir(basedir):
        dirpath = os.path.join(basedir, directory)
        # stray files at the top level (e.g. .DS_Store) cannot be listed
        if not os.path.isdir(dirpath):
            continue

        meta = {}
        for fname in os.listdir(dirpath):
            # parse_filename signals a non-matching name with None; also
            # tolerate a ValueError for names with the wrong number of parts.
            try:
                info = parse_filename(fname)
            except ValueError:
                info = None
            if info is None:
                continue

            itemname = info['itemname']
            if itemname not in meta:
                entry = base_meta.copy()
                entry['dcterms:created'] = info['date']
                entry['dcterms:title'] = itemname
                entry['olac:speakers'] = []
                entry['files'] = []
                meta[itemname] = entry
            entry = meta[itemname]

            # a speaker usually has several files (audio, text, ...);
            # list each speaker only once
            speaker = info['speaker']
            if speaker not in non_speakers and speaker not in entry['olac:speakers']:
                entry['olac:speakers'].append(speaker)
            entry['files'].append(os.path.join(dirpath, fname))

        for itemname in meta:
            yield meta[itemname]
def delete_existing(client):
    """Delete any existing items in the COLLECTION_NAME collection.

    ``client`` is the pyalveo client used for the requests; the collection
    URI is built from its ``api_url`` and the module-level COLLECTION_NAME.
    """
    collection_uri = client.api_url + "catalog/" + COLLECTION_NAME
    # Fetch the item list once so that the list reported is exactly the
    # list deleted, and the collection is not queried twice.
    item_uris = list(client.get_items(collection_uri))
    print("Deleting items: ", item_uris)
    for itemuri in item_uris:
        client.delete_item(itemuri)
def process(client, basedir):
    """Upload the corpus found under ``basedir`` to the Alveo collection.

    For every item produced by corpus_items() an item is created in the
    COLLECTION_NAME collection (the item id is the lower-cased item title),
    then each of the item's files is attached to it as a document.  A
    document upload that fails with an API error is reported and skipped so
    the remaining files are still processed.
    """
    collection_uri = client.api_url + "catalog/" + COLLECTION_NAME

    for iteminfo in corpus_items(basedir):
        itemid = iteminfo['dcterms:title'].lower()
        # 'files' and 'olac:speakers' are bookkeeping for this script and
        # are not sent as item metadata
        files = iteminfo.pop('files')
        iteminfo.pop('olac:speakers', None)

        item = client.add_item(collection_uri, itemid, iteminfo)
        print("Item: ", itemid, item)

        for path in files:
            docname = os.path.basename(path)
            # compare extensions case-insensitively so e.g. ".WAV" is
            # still recognised as audio
            ext = os.path.splitext(docname)[1].lower()
            docmeta = {
                "dcterms:title": docname,
                "dcterms:type": EXT_MAP.get(ext, "Other"),
            }
            try:
                client.add_document(item, docname, docmeta, file=path)
            except pyalveo.pyalveo.APIError as e:
                print("Error: ", e)
            else:
                print("\tDocument: ", docname)
# Collection name to upload data into.  Defined at module level (not inside
# the __main__ guard) because delete_existing() and process() read it as a
# global, and would otherwise fail with NameError when this module is imported.
COLLECTION_NAME = "ciara-kimberley-english"

if __name__ == '__main__':
    # alveo.config file downloaded from Alveo
    config = "/Users/steve/alveo.config"
    # directory containing the data
    basedir = "./data"

    client = pyalveo.Client(configfile=config)

    # uncomment this to delete any existing items before upload
    # in case you want to re-do the upload of the same items
    # delete_existing(client)

    process(client, basedir)
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.