Created
March 22, 2018 23:28
-
-
Save stevecassidy/84e1e8b18d9be25939d41d2c8deeeeeb to your computer and use it in GitHub Desktop.
Script to download Austalk maptask data from Alveo.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Script to identify and download maptask recordings from Austalk using
the Alveo API.

Author: Steve Cassidy

This script takes as input a spreadsheet (CSV) of participant data
created by the austalk-query app at https://austalk-query.apps.alveo.edu.au/.
On that app, use the first page to select the speakers you want via their
demographic data. On the page showing your search results, export the
participant data as a CSV file. That file then forms the input to this script.

The script queries the Austalk metadata via the Alveo API to find the
maptask items for the given list of speakers. It finds items where
the speaker is either information-giver or information-follower. This is
done using a SPARQL query over the RDF metadata.

Once the list of items is identified, the data is downloaded. The script
downloads the downsampled maptask and speaker channels (the files matching
*maptask16.wav and *speaker16.wav). This can be changed by modifying the
file patterns passed to the download_documents function.

The script can optionally (if you uncomment a line) create an item list
on the Alveo system for the list of items retrieved. This would be useful
if you want to be able to use these items again or publish a reference
to them for future use.
""" | |
from __future__ import print_function | |
import pyalveo | |
import os | |
import csv | |
from fnmatch import fnmatch | |
# SPARQL prologue prepended to every query sent to Alveo. It declares the
# namespace prefixes that the query bodies in this script may use.
PREFIXES = """
PREFIX dc:<http://purl.org/dc/terms/>
PREFIX austalk:<http://ns.austalk.edu.au/>
PREFIX olac:<http://www.language-archives.org/OLAC/1.1/>
PREFIX ausnc:<http://ns.ausnc.org.au/schemas/ausnc_md_model/>
PREFIX foaf:<http://xmlns.com/foaf/0.1/>
PREFIX dbpedia:<http://dbpedia.org/ontology/>
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
PREFIX geo:<http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX iso639schema:<http://downlode.org/rdf/iso-639/schema#>
PREFIX austalkid:<http://id.austalk.edu.au/>
PREFIX iso639:<http://downlode.org/rdf/iso-639/languages#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX is: <http://purl.org/ontology/is/core#>
PREFIX iso: <http://purl.org/iso25964/skos-thes#>
PREFIX dada: <http://purl.org/dada/schema/0.2#>"""
def read_participants(csvfile):
    """Read participant identifiers from an austalk-query CSV export.

    :type csvfile: String
    :param csvfile: path to a CSV file downloaded from the austalk-query
        participant search page; the identifiers are read from its
        'id' column

    :rtype: list of String
    :returns: the participant identifiers (eg. '1_1119') in file order,
        with surrounding whitespace removed. Rows whose 'id' cell is
        missing or blank are skipped.

    :raises KeyError: if the file has data rows but no 'id' column
    """
    participants = []
    with open(csvfile) as partcsv:
        for row in csv.DictReader(partcsv):
            # DictReader fills the cells of a short row with None and a
            # blank cell reads as ''; neither is a usable identifier, and
            # passing them on would produce a meaningless speaker query.
            ident = (row['id'] or '').strip()
            if ident:
                participants.append(ident)
    return participants
def get_maptask_items(client, participants):
    """Find the maptask items that involve any of the given participants.

    One SPARQL query is sent per participant against the 'austalk'
    collection. An item matches when the participant is its speaker,
    information giver or information follower and its component name is
    "maptask-1" or "maptask-2".

    :type client: pyalveo.Client
    :param client: an Alveo API connection; its sparql_query and
        get_item methods are used

    :type participants: list of String
    :param participants: participant identifiers (eg. '1_1119')

    :returns: a list of the objects returned by client.get_item, one per
        distinct item URL, in the order they were first found. Each item
        is also printed as it is retrieved.
    """
    query = PREFIXES + """
SELECT distinct ?item
WHERE {
    {?item olac:speaker ?id .}
    UNION {?item austalk:information_giver ?id .}
    UNION {?item austalk:information_follower ?id .}
    {?item austalk:componentName "maptask-1" .}
    UNION {?item austalk:componentName "maptask-2" .}
    VALUES ?id {<https://app.alveo.edu.au/speakers/austalk/%s>}
}"""

    items = []
    # A maptask item has a giver and a follower, so when both speakers are
    # in the participant list the same item comes back from two queries.
    # Track the URLs already fetched so each item is returned only once.
    seen_urls = set()
    for participant in participants:
        result = client.sparql_query('austalk', query % participant)
        # get the value of ?item for every binding in the query result
        for binding in result['results']['bindings']:
            item_url = binding['item']['value']
            if item_url in seen_urls:
                continue
            seen_urls.add(item_url)
            item = client.get_item(item_url)
            items.append(item)
            print(item)
    return items
def download_documents(items, patterns, output_path):
    """Download the documents of the given items that match a file pattern.

    :type items: list
    :param items: items whose get_documents() method returns the
        candidate documents (as returned by get_maptask_items)

    :type patterns: list of String
    :param patterns: fnmatch-style filename patterns (eg. "*speaker16.wav");
        a document is downloaded if its filename matches any of them.
        Empty patterns are ignored.

    :type output_path: String
    :param output_path: directory to download the documents to; it is
        created if it does not exist

    :rtype: list of String
    :returns: the filenames of the documents that were downloaded

    A document that cannot be downloaded (for example because it does not
    exist or access is denied) is reported on standard output and skipped;
    the remaining documents are still attempted.
    """
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    downloaded = []
    for item in items:
        for doc in item.get_documents():
            fname = doc.get_filename()
            # Test all patterns at once so that a file matching several
            # patterns is downloaded once rather than once per pattern.
            if not any(p and fnmatch(fname, p) for p in patterns):
                continue
            try:
                doc.download_content(dir_path=output_path, filename=fname)
            except Exception as err:
                # Best effort: report the failure and carry on with the
                # rest. Catching Exception rather than using a bare
                # except lets Ctrl-C still interrupt the run.
                print("Failed to download %s: %s" % (fname, err))
                continue
            downloaded.append(fname)
            print("Got ", fname)
    return downloaded
if __name__ == '__main__':
    import sys

    # The participant CSV exported from austalk-query is the only required
    # argument; give a usage message instead of an IndexError traceback
    # when it is missing.
    if len(sys.argv) < 2:
        sys.exit("Usage: %s PARTICIPANTS.csv" % sys.argv[0])
    csvfile = sys.argv[1]

    client = pyalveo.Client()
    participants = read_participants(csvfile)
    items = get_maptask_items(client, participants)

    # optional, make an item list on Alveo with these items
    # so that you can refer to it again later
    #itemlist = client.add_to_item_list_by_name([i.url() for i in items], "maptask-items")

    # Fetch the maptask and speaker channel files for every item found
    # into the local "maptask-data" directory.
    download_documents(items, ["*maptask16.wav", "*speaker16.wav"], "maptask-data")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment