Skip to content

Instantly share code, notes, and snippets.

@stevecassidy
Created June 3, 2019 01:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stevecassidy/1956a6c84807efdcb10626d31bcaf2ca to your computer and use it in GitHub Desktop.
Save stevecassidy/1956a6c84807efdcb10626d31bcaf2ca to your computer and use it in GitHub Desktop.
Import CIARA corpus data into Alveo
import pyalveo
import re
import os
# File extension (including the leading dot) -> Alveo media type name,
# used as the "dcterms:type" value in a document's metadata.
EXT_MAP = {
    ".wav": "Audio",
    ".txt": "Text",
    ".mp4": "Video",
}
def parse_filename(fname):
    """Parse a filename into various metadata fields.

    The name (without its extension) must consist of exactly three
    underscore-separated parts, eg. 20180719LSJB_01_composite.wav:
    an eight digit date followed directly by the recorder code, then the
    session, then the speaker.

    Return a dictionary of metadata, eg.
    {
        'filename': '20180719LSJB_01_composite.wav',
        'itemname': '20180719LSJB_01',
        'date': '2018-07-19',
        'recorder': 'LSJB',
        'session': '01',
        'speaker': 'composite'
    }

    Return None if filename doesn't match expected pattern.
    """
    basename, _ext = os.path.splitext(fname)

    # Check the number of parts before unpacking: names such as
    # '.DS_Store' or 'notes.txt' must give None rather than a ValueError.
    parts = basename.split('_')
    if len(parts) != 3:
        return None
    datespkr, session, speaker = parts

    # split datespkr (20180719LSJB) into year, month, day and recorder
    match = re.match(r'(\d{4})(\d{2})(\d{2})(.*)', datespkr)
    if match is None:
        return None

    year, month, day, recorder = match.groups()
    return {
        'filename': fname,
        'itemname': datespkr + '_' + session,
        'date': '-'.join((year, month, day)),
        'recorder': recorder,
        'speaker': speaker,
        'session': session,
    }
def corpus_items(basedir):
    """Return an iterator over items in the corpus.

    Every sub-directory of basedir is scanned and its files are grouped
    into items by the item name that parse_filename() derives from each
    filename.  Each item is yielded as a dictionary holding the shared
    corpus metadata plus:

        'dcterms:created'  date parsed from the filename
        'dcterms:title'    the item name
        'olac:speakers'    list of distinct speaker codes for the item
        'files'            list of paths of the files making up the item

    Entries of basedir that are not directories, and files whose names
    parse_filename() does not recognise, are skipped.  Directories and
    files are visited in sorted order so the output is reproducible.
    """
    base_meta = {
        'dcterms:creator': 'Joe Blyth',
        "ausnc:mode": "spoken",
        "ausnc:communication_context": "face-to-face",
        "olac:language": "eng",
    }
    # values found in the speaker part of a filename that are not
    # recorded as speakers of the item
    non_speakers = ('composite', 'X400', 'XA30')

    for directory in sorted(os.listdir(basedir)):
        dirpath = os.path.join(basedir, directory)
        if not os.path.isdir(dirpath):
            # stray plain file next to the session directories
            continue

        meta = {}
        for fname in sorted(os.listdir(dirpath)):
            # An unrecognised name is reported as None; a ValueError from
            # unpacking the parts of the name is treated the same way.
            try:
                info = parse_filename(fname)
            except ValueError:
                info = None
            if info is None:
                continue

            itemname = info['itemname']
            if itemname not in meta:
                entry = base_meta.copy()
                entry['dcterms:created'] = info['date']
                entry['dcterms:title'] = itemname
                entry['olac:speakers'] = []
                entry['files'] = []
                meta[itemname] = entry
            entry = meta[itemname]

            # a speaker usually has several files; list them only once
            speaker = info['speaker']
            if speaker not in non_speakers and speaker not in entry['olac:speakers']:
                entry['olac:speakers'].append(speaker)
            entry['files'].append(os.path.join(dirpath, fname))

        for itemname in meta:
            yield meta[itemname]
def delete_existing(client):
    """Delete any existing items in this collection.

    The collection is the one named by the global COLLECTION_NAME, which
    the script assigns in its __main__ section.  client is the pyalveo
    client used both to list the items and to delete them.
    """
    collection_uri = client.api_url + "catalog/" + COLLECTION_NAME
    # Query the collection once and work from that snapshot: the same
    # list is reported and deleted, and the server is not asked twice.
    item_uris = list(client.get_items(collection_uri))
    print("Deleting items: ", item_uris)
    for itemuri in item_uris:
        client.delete_item(itemuri)
def process(client, basedir):
    """Process the files in this corpus.

    For every item produced by corpus_items(basedir) an item is added to
    the collection named by the global COLLECTION_NAME, using the
    lower-cased item title as its id.  Each of the item's files is then
    added to that item as a document whose type is looked up in EXT_MAP
    ("Other" for unknown extensions).

    A document that fails with a pyalveo APIError is reported and skipped
    so that the remaining documents are still uploaded.
    """
    collection_uri = client.api_url + "catalog/" + COLLECTION_NAME
    for iteminfo in corpus_items(basedir):
        itemid = iteminfo['dcterms:title'].lower()
        # 'files' and 'olac:speakers' are removed so they are not sent
        # as part of the item metadata
        files = iteminfo.pop('files')
        del iteminfo['olac:speakers']
        item = client.add_item(collection_uri, itemid, iteminfo)
        print("Item: ", itemid, item)
        for path in files:
            docname = os.path.basename(path)
            ext = os.path.splitext(docname)[1]
            # compare case-insensitively so that eg. '.WAV' is still Audio
            doctype = EXT_MAP.get(ext.lower(), "Other")
            docmeta = {
                "dcterms:title": docname,
                "dcterms:type": doctype,
            }
            try:
                client.add_document(item, docname, docmeta, file=path)
            except pyalveo.pyalveo.APIError as e:
                print("Error: ", e)
            else:
                print("\tDocument: ", docname)
if __name__ == '__main__':
    # Name of the collection the data is uploaded into.  This is a global
    # that delete_existing() and process() read.
    COLLECTION_NAME = "ciara-kimberley-english"

    # Location of the corpus data and of the alveo.config file that was
    # downloaded from Alveo.
    data_dir = "./data"
    config_path = "/Users/steve/alveo.config"

    alveo = pyalveo.Client(configfile=config_path)

    # To re-do an upload of the same items, uncomment the next line so
    # that any existing items are deleted first.
    # delete_existing(alveo)

    process(alveo, data_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment