Skip to content

Instantly share code, notes, and snippets.

@stevecassidy
Created March 22, 2018 23:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stevecassidy/84e1e8b18d9be25939d41d2c8deeeeeb to your computer and use it in GitHub Desktop.
Save stevecassidy/84e1e8b18d9be25939d41d2c8deeeeeb to your computer and use it in GitHub Desktop.
Script to download Austalk maptask data from Alveo.
"""
Script to identify and download maptask recordings from Austalk using
the Alveo API.
Author: Steve Cassidy
This script takes as input a spreadsheet (CSV) of participant data
created by the austalk-query app at https://austalk-query.apps.alveo.edu.au/.
On that app, use the first page to select the speakers you want via their
demographic data. On the page showing your search results, export the
participant data as a CSV file. That then forms the input to this script.
The script queries the Austalk metadata via the Alveo API to find the
maptask items for the given list of speakers. It finds items where
the speaker is either information-giver or information-follower. This is
done using a SPARQL query over the RDF metadata.
Once the list of items is identified, the data is downloaded. The script
downloads the maptask and speaker channels downsampled. This could be changed
by modifying the file patterns passed to the download_documents function.
The script can optionally (if you uncomment a line) create an item list
on the Alveo system for the list of items retrieved. This would be useful
if you want to be able to use these items again or publish a reference
to them for future use.
"""
from __future__ import print_function
import pyalveo
import os
import csv
from fnmatch import fnmatch
# SPARQL namespace PREFIX declarations. This text is prepended to the
# query built in get_maptask_items so the query can use short prefixed
# names (olac:speaker, austalk:componentName, ...).
PREFIXES = """
PREFIX dc:<http://purl.org/dc/terms/>
PREFIX austalk:<http://ns.austalk.edu.au/>
PREFIX olac:<http://www.language-archives.org/OLAC/1.1/>
PREFIX ausnc:<http://ns.ausnc.org.au/schemas/ausnc_md_model/>
PREFIX foaf:<http://xmlns.com/foaf/0.1/>
PREFIX dbpedia:<http://dbpedia.org/ontology/>
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
PREFIX geo:<http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX iso639schema:<http://downlode.org/rdf/iso-639/schema#>
PREFIX austalkid:<http://id.austalk.edu.au/>
PREFIX iso639:<http://downlode.org/rdf/iso-639/languages#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX is: <http://purl.org/ontology/is/core#>
PREFIX iso: <http://purl.org/iso25964/skos-thes#>
PREFIX dada: <http://purl.org/dada/schema/0.2#>"""
def read_participants(csvfile):
    """Read participant identifiers from an austalk-query CSV export.

    The file is the participant spreadsheet downloaded from the
    austalk-query participant search page. Identifiers are taken from
    its 'id' column.

    :type csvfile: String
    :param csvfile: path of the CSV file to read

    :rtype: list of String
    :returns: the participant identifiers (eg. '1_1119') in file order,
        with surrounding whitespace removed; rows whose 'id' cell is
        blank are skipped

    :raises KeyError: if a data row is read from a file that has no
        'id' column
    """
    participants = []
    with open(csvfile) as partcsv:
        for row in csv.DictReader(partcsv):
            # A blank identifier cannot name a speaker; passing it on
            # would only yield a malformed speaker URI in later queries.
            # DictReader fills short rows with None, hence the `or ''`.
            ident = (row['id'] or '').strip()
            if ident:
                participants.append(ident)
    return participants
def get_maptask_items(client, participants):
    """Find the maptask items that involve any of the given participants.

    For each participant a SPARQL query is run against the 'austalk'
    collection. It selects items whose component name is "maptask-1" or
    "maptask-2" and in which the participant appears as speaker,
    information giver or information follower. Each matching item is
    then fetched with client.get_item and printed.

    :type client: pyalveo client connection
    :param client: object providing sparql_query(collection, query)
        and get_item(url)
    :type participants: list of String
    :param participants: participant identifiers (eg. '1_1119')

    :rtype: list
    :returns: the objects returned by client.get_item, one per distinct
        item URL, in the order they were first found
    """
    query = PREFIXES + """
SELECT distinct ?item
WHERE {
{?item olac:speaker ?id .}
UNION {?item austalk:information_giver ?id .}
UNION {?item austalk:information_follower ?id .}
{?item austalk:componentName "maptask-1" .}
UNION {?item austalk:componentName "maptask-2" .}
VALUES ?id {<https://app.alveo.edu.au/speakers/austalk/%s>}
}"""
    items = []
    # A maptask item has both a giver and a follower, so two selected
    # participants can yield the same item. Track the URLs already seen
    # so each item is fetched (and later downloaded) only once.
    seen_urls = set()
    for p in participants:
        result = client.sparql_query('austalk', query % p)
        # get the value of ?item for every binding in the query result
        for binding in result['results']['bindings']:
            item_url = binding['item']['value']
            if item_url in seen_urls:
                continue
            seen_urls.add(item_url)
            item = client.get_item(item_url)
            items.append(item)
            print(item)
    return items
def download_documents(items, patterns, output_path):
    """
    Download the documents of the given items whose filenames match
    at least one of the given patterns.

    A document that matches several patterns is downloaded only once.
    A download that fails is reported on stderr and skipped, so one
    missing or inaccessible document does not stop the rest.

    :type items: list of items, each providing get_documents()
    :param items: items whose documents are candidates for download
    :type patterns: list of String
    :param patterns: shell-style filename patterns as understood by
        fnmatch (eg. "*speaker16.wav"); empty patterns are ignored
    :type output_path: String
    :param output_path: directory to download the documents to; it is
        created if it does not already exist

    :rtype: list of String
    :returns: filenames of the documents that were downloaded
    """
    import sys

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    wanted = [pattern for pattern in patterns if pattern != '']
    downloaded = []
    for item in items:
        for doc in item.get_documents():
            fname = doc.get_filename()
            if not any(fnmatch(fname, pattern) for pattern in wanted):
                continue
            try:
                doc.download_content(dir_path=output_path, filename=fname)
            except Exception as err:
                # maybe it doesn't exist or we have no access; keep going
                # with the remaining documents but tell the user about it
                print("Failed to download", fname, "-", err, file=sys.stderr)
            else:
                downloaded.append(fname)
                print("Got ", fname)
    return downloaded
if __name__ == '__main__':
    import sys

    # The participant CSV exported from austalk-query is the one
    # required argument; fail with a usage message rather than a bare
    # IndexError when it is missing.
    if len(sys.argv) < 2:
        sys.exit("Usage: %s PARTICIPANTS_CSV" % sys.argv[0])
    csvfile = sys.argv[1]

    # NOTE(review): Client() is created without arguments, so it
    # presumably takes its API settings from pyalveo's default
    # configuration - confirm.
    client = pyalveo.Client()

    participants = read_participants(csvfile)
    items = get_maptask_items(client, participants)

    # optional, make an item list on Alveo with these items
    # so that you can refer to it again later
    #itemlist = client.add_to_item_list_by_name([i.url() for i in items], "maptask-items")

    # Fetch the maptask and speaker channel files into ./maptask-data
    download_documents(items, ["*maptask16.wav", "*speaker16.wav"], "maptask-data")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment