Created
March 17, 2017 03:03
-
-
Save stevecassidy/86cac760298eb681f802201869b96454 to your computer and use it in GitHub Desktop.
Get audio data for hVd words for a group of speakers from the Austalk collection in the Alveo VL
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Get data from Austalk in Alveo, | |
finding speakers via metadata queries, | |
finding speaker metadata, | |
finding items for a speaker | |
downloading audio files from an item""" | |
import os
import sys
from fnmatch import fnmatch

import pyalveo
# SPARQL prologue shared by the queries in this module: each query string is
# built as PREFIXES + <query body>, so the bodies can use short names such as
# foaf:Person or austalk:prompt instead of full IRIs.
PREFIXES = """
PREFIX dc:<http://purl.org/dc/terms/>
PREFIX austalk:<http://ns.austalk.edu.au/>
PREFIX olac:<http://www.language-archives.org/OLAC/1.1/>
PREFIX ausnc:<http://ns.ausnc.org.au/schemas/ausnc_md_model/>
PREFIX foaf:<http://xmlns.com/foaf/0.1/>
PREFIX dbpedia:<http://dbpedia.org/ontology/>
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
PREFIX geo:<http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX iso639schema:<http://downlode.org/rdf/iso-639/schema#>
PREFIX austalkid:<http://id.austalk.edu.au/>
PREFIX iso639:<http://downlode.org/rdf/iso-639/languages#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX is: <http://purl.org/ontology/is/core#>
PREFIX iso: <http://purl.org/iso25964/skos-thes#>
PREFIX dada: <http://purl.org/dada/schema/0.2#>"""
def find_speakers(client, limit=10):
    """Find speakers in Austalk and return their metadata.

    Runs a SPARQL query against the 'austalk' collection through
    ``client.sparql_query``.  Every triple pattern in the query is
    mandatory, so a speaker only appears in the result when an id, a
    place-of-birth town and a place-of-birth country are all recorded.

    :param client: object providing ``sparql_query(collection, query)``
        (the script passes a ``pyalveo.Client``)
    :param limit: maximum number of result rows to request; ``None``
        removes the limit.  Defaults to 10, the value that used to be
        hard-coded in the query.
    :raises ValueError: if ``limit`` is negative or not convertible to int
    :returns: dictionary with one key per speaker (the speaker URI), each
        mapping to a dictionary with the keys 'id', 'town' and 'country'
    """
    query = PREFIXES + """
select ?spkr ?id ?town ?country {
    ?spkr a foaf:Person .
    ?spkr austalk:id ?id .
    ?spkr austalk:pob_town ?town .
    ?spkr austalk:pob_country ?country .
}
"""
    if limit is not None:
        # int() ensures that nothing but a number reaches the query text
        limit = int(limit)
        if limit < 0:
            raise ValueError("limit must be non-negative, got %d" % limit)
        query += "limit %d\n" % limit

    result = client.sparql_query('austalk', query)

    speakers = {}
    for binding in result['results']['bindings']:
        speakers[binding['spkr']['value']] = {
            'id': binding['id']['value'],
            'town': binding['town']['value'],
            'country': binding['country']['value'],
        }
    return speakers
def _prompt_pattern(words):
    """Return the regex text matching exactly any one of ``words``, escaped
    so it can be placed between double quotes in a SPARQL query."""
    alternatives = []
    for word in words:
        # 1) make regex metacharacters match literally
        literal = ''.join('\\' + ch if ch in '\\.[]{}()*+?^$|' else ch
                          for ch in word)
        # 2) escape for the enclosing SPARQL "..." string literal
        literal = literal.replace('\\', '\\\\').replace('"', '\\"')
        alternatives.append('^' + literal + '$')
    return '|'.join(alternatives)


def find_words(client, speakerid, words):
    """Find items in the Austalk corpus where a speaker reads given words.

    Builds a SPARQL query selecting the items of ``speakerid`` whose
    prompt equals one of ``words`` (case-insensitive, whole prompt) and
    runs it against the 'austalk' collection.

    :param client: object providing ``sparql_query(collection, query)``
        (the script passes a ``pyalveo.Client``)
    :param speakerid: speaker URI, as used for the keys returned by
        ``find_speakers``
    :param words: iterable of prompt words; they are matched literally,
        so regex metacharacters and quotes in a word are escaped
    :returns: a ``pyalveo.ItemGroup`` holding the matching item URLs;
        an empty group, without contacting the server, when ``words``
        is empty
    """
    words = list(words)
    if not words:
        # With no words the pattern would degenerate to "^$" and match only
        # items with an empty prompt; there is nothing to look for.
        return pyalveo.ItemGroup([], client)

    # '%' binds tighter than '+', so only the query body is a format string
    query = PREFIXES + """
SELECT distinct ?item ?prompt ?compname
WHERE {
    ?item a ausnc:AusNCObject .
    ?item olac:speaker <%s> .
    ?item austalk:prompt ?prompt .
    ?item austalk:componentName ?compname .
    FILTER regex(?prompt, "%s", "i")
}""" % (speakerid, _prompt_pattern(words))

    result = client.sparql_query('austalk', query)

    # HACK: current database has the wrong URL for items - this will be
    # fixed soon so this will be redundant
    items = [
        binding['item']['value'].replace(
            'http://id.austalk.edu.au/item/',
            'https://app.alveo.edu.au/catalog/austalk/')
        for binding in result['results']['bindings']
    ]
    return pyalveo.ItemGroup(items, client)
def download_documents(item_list, patterns, output_path):
    """Download the documents of a group of items that match a filename pattern.

    Every document of every item in ``item_list`` whose filename matches
    at least one of ``patterns`` is downloaded into ``output_path``.  A
    document is downloaded at most once, even when several patterns match
    it.  A failed download is reported on stderr and skipped, so one
    missing or inaccessible file does not stop the rest.

    :param item_list: object whose ``get_all()`` returns items that offer
        ``get_documents()`` (the script passes the ``pyalveo.ItemGroup``
        returned by ``find_words``)
    :param patterns: list of shell-style filename patterns as understood
        by ``fnmatch``; empty patterns are ignored
    :type output_path: String
    :param output_path: directory to download the documents to; created
        if it does not exist
    :returns: list of the filenames that were downloaded successfully
    """
    if not os.path.isdir(output_path):
        os.makedirs(output_path)

    patterns = [pattern for pattern in patterns if pattern]

    downloaded = []
    for item in item_list.get_all():
        for doc in item.get_documents():
            fname = doc.get_filename()
            if not any(fnmatch(fname, pattern) for pattern in patterns):
                continue
            try:
                doc.download_content(dir_path=output_path, filename=fname)
            except Exception as err:
                # maybe it doesn't exist or we have no access
                sys.stderr.write("Could not download %s: %s\n" % (fname, err))
            else:
                downloaded.append(fname)
                # same text as the original two-argument print statement
                print("Got  %s" % fname)
    return downloaded
def main():
    """Download the diphthong hVd recordings of the speakers found in Austalk.

    For each speaker returned by ``find_speakers`` the matching items are
    listed on stdout and their documents whose filenames match
    ``*speaker16.wav`` are saved under ``out/<speaker id>``.
    """
    client = pyalveo.Client()
    speakers = find_speakers(client)

    # Keys keep their original spelling; only the 'dipthongs' list is used
    # below.
    hVdWords = {
        'monopthongs': ['head', 'had', 'hud', 'heed', 'hid', 'hood', 'hod',
                        'whod', 'herd', 'haired', 'hard', 'horde'],
        'dipthongs': ['howd', 'hoyd', 'hide', 'hode', 'hade', 'heared'],
    }

    for speaker in speakers:
        # Single-argument print calls behave the same on Python 2 and 3.
        print("%s %s" % (speaker, speakers[speaker]))
        items = find_words(client, speaker, hVdWords['dipthongs'])
        for item in items:
            print("\t%s" % item)

        # download to a speaker specific directory
        outdir = os.path.join('out', speakers[speaker]['id'])
        download_documents(items, ["*speaker16.wav"], outdir)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment