# stevecassidy/get_maptask_data.py

## get_maptask_data.py
"""
Script to identify and download maptask recordings from Austalk using
the Alveo API.

Author: Steve Cassidy

This script takes as input a spreadsheet (CSV) of participant data
created by the austalk-query app at https://austalk-query.apps.alveo.edu.au/.

On that app, use the first page to select the speakers you want via their
demographic data. On the page showing your search results, export the
participant data as a CSV file.  That then forms the input to this script.

The script queries the Austalk metadata via the Alveo API to find the
maptask items for the given list of speakers.   It finds items where
the speaker is either information-giver or information-follower.  This is
done using a SPARQL query over the RDF metadata.

Once the list of items is identified, the data is downloaded.  The script
downloads the downsampled maptask and speaker channels. This could be changed
by modifying the file patterns passed to the download_documents function.

The script can optionally (if you uncomment a line) create an item list
on the Alveo system for the list of items retrieved.  This would be useful
if you want to be able to use these items again or publish a reference
to them for future use.

"""


from __future__ import print_function
import pyalveo
import os
import csv
from fnmatch import fnmatch


# SPARQL namespace prefix declarations; get_maptask_items prepends this
# text to the SELECT query it builds for each participant.
PREFIXES = """
PREFIX dc:<http://purl.org/dc/terms/>
PREFIX austalk:<http://ns.austalk.edu.au/>
PREFIX olac:<http://www.language-archives.org/OLAC/1.1/>
PREFIX ausnc:<http://ns.ausnc.org.au/schemas/ausnc_md_model/>
PREFIX foaf:<http://xmlns.com/foaf/0.1/>
PREFIX dbpedia:<http://dbpedia.org/ontology/>
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
PREFIX geo:<http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX iso639schema:<http://downlode.org/rdf/iso-639/schema#>
PREFIX austalkid:<http://id.austalk.edu.au/>
PREFIX iso639:<http://downlode.org/rdf/iso-639/languages#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX is: <http://purl.org/ontology/is/core#>
PREFIX iso: <http://purl.org/iso25964/skos-thes#>
PREFIX dada: <http://purl.org/dada/schema/0.2#>"""


def read_participants(csvfile):
    """Collect participant identifiers from an austalk-query CSV export.

    The file at ``csvfile`` is parsed with ``csv.DictReader``, so its first
    line must be a header row that includes an ``id`` column.  The value of
    that column is returned for every data row, in file order
    (eg. ``['1_1119', ...]``).
    """
    with open(csvfile) as handle:
        return [record['id'] for record in csv.DictReader(handle)]


def get_maptask_items(client, participants):
    """Find the maptask items that involve any of the given participants.

    For each participant a SPARQL query is run against the 'austalk'
    collection, selecting items whose componentName is "maptask-1" or
    "maptask-2" and where the participant is the olac:speaker, the
    austalk:information_giver or the austalk:information_follower.

    :param client: a pyalveo client; only its sparql_query and get_item
        methods are used here
    :param participants: iterable of participant identifiers (eg. '1_1119')
    :return: list of the objects returned by client.get_item, one per
        distinct item URL, in the order they were first found
    """

    query = PREFIXES + """
    SELECT distinct ?item
WHERE {
  {?item olac:speaker ?id .}
  UNION {?item austalk:information_giver ?id .}
  UNION {?item austalk:information_follower ?id .}

  {?item austalk:componentName "maptask-1" .}
  UNION {?item austalk:componentName "maptask-2" .}

  VALUES ?id {<https://app.alveo.edu.au/speakers/austalk/%s>}

}"""

    items = []
    # The same item can be matched for more than one participant in the list
    # (eg. one is its information_giver and another its information_follower),
    # so remember the URLs already fetched and return each item only once.
    seen_urls = set()
    for p in participants:
        result = client.sparql_query('austalk', query % p)

        # get the value of ?item for every binding in the query result
        for b in result['results']['bindings']:
            item_url = b['item']['value']
            if item_url in seen_urls:
                continue
            seen_urls.add(item_url)

            item = client.get_item(item_url)
            items.append(item)
            print(item)

    return items


def download_documents(items, patterns, output_path):
    """
    Download the documents of each item whose filename matches a pattern.

    :type items: iterable of objects providing get_documents()
    :param items: items whose documents are candidates for download

    :type patterns: list of String
    :param patterns: fnmatch-style filename patterns (eg. "*speaker16.wav");
        empty strings are ignored

    :type output_path: String
    :param output_path: directory to download the documents to; it is
        created if it does not exist

    :rtype: list of String
    :returns: filenames of the documents that were downloaded successfully
    """
    # local import: the module itself only imports sys in its __main__ block
    import sys

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    downloaded = []

    for item in items:
        for doc in item.get_documents():
            fname = doc.get_filename()
            # any() stops at the first matching pattern, so a document that
            # matches several patterns is still downloaded only once
            wanted = any(pattern != '' and fnmatch(fname, pattern)
                         for pattern in patterns)
            if not wanted:
                continue
            try:
                doc.download_content(dir_path=output_path, filename=fname)
            except Exception as err:
                # best effort: maybe it doesn't exist or we have no access,
                # so report the failure and carry on with the remaining files
                print("Failed to download %s: %s" % (fname, err),
                      file=sys.stderr)
                continue
            downloaded.append(fname)
            print("Got ", fname)
    return downloaded


if __name__ == '__main__':

    import sys

    # the participant CSV exported from austalk-query is the only argument;
    # fail with a usage message rather than an IndexError when it is missing
    if len(sys.argv) < 2:
        print("Usage: %s participants.csv" % sys.argv[0], file=sys.stderr)
        sys.exit(1)

    csvfile = sys.argv[1]

    client = pyalveo.Client()
    participants = read_participants(csvfile)
    items = get_maptask_items(client, participants)

    # optional, make an item list on Alveo with these items
    # so that you can refer to it again later
    #itemlist = client.add_to_item_list_by_name([i.url() for i in items], "maptask-items")

    # change these filename patterns to download different files of each item
    download_documents(items, ["*maptask16.wav", "*speaker16.wav"], "maptask-data")
	"""
	Script to identify and download maptask recordings from Austalk using
	the Alveo API.

	Author: Steve Cassidy

	This script takes as input a spreadsheet (CSV) of participant data
	created by the austalk-query app at https://austalk-query.apps.alveo.edu.au/.

	On that app, use the first page to select the speakers you want via their
	demographic data. On the page showing your search results, export the
	participant data as a CSV file. That then forms the input to this script.

	The script queries the Austalk metadata via the Alveo API to find the
	maptask items for the given list of speakers. It finds items where
	the speaker is either information-giver or information-follower. This is
	done using a SPARQL query over the RDF metadata.

	Once the list of items is identified, the data is downloaded. The script
	downloads the downsampled maptask and speaker channels. This could be changed
	by modifying the file patterns passed to the download_documents function.

	The script can optionally (if you uncomment a line) create an item list
	on the Alveo system for the list of items retrieved. This would be useful
	if you want to be able to use these items again or publish a reference
	to them for future use.

	"""


	from __future__ import print_function
	import pyalveo
	import os
	import csv
	from fnmatch import fnmatch


	# SPARQL namespace prefix declarations; get_maptask_items prepends this
	# text to the SELECT query it builds for each participant.
	PREFIXES = """
	PREFIX dc:<http://purl.org/dc/terms/>
	PREFIX austalk:<http://ns.austalk.edu.au/>
	PREFIX olac:<http://www.language-archives.org/OLAC/1.1/>
	PREFIX ausnc:<http://ns.ausnc.org.au/schemas/ausnc_md_model/>
	PREFIX foaf:<http://xmlns.com/foaf/0.1/>
	PREFIX dbpedia:<http://dbpedia.org/ontology/>
	PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
	PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
	PREFIX geo:<http://www.w3.org/2003/01/geo/wgs84_pos#>
	PREFIX iso639schema:<http://downlode.org/rdf/iso-639/schema#>
	PREFIX austalkid:<http://id.austalk.edu.au/>
	PREFIX iso639:<http://downlode.org/rdf/iso-639/languages#>
	PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
	PREFIX is: <http://purl.org/ontology/is/core#>
	PREFIX iso: <http://purl.org/iso25964/skos-thes#>
	PREFIX dada: <http://purl.org/dada/schema/0.2#>"""


	def read_participants(csvfile):
		"""Collect participant identifiers from an austalk-query CSV export.

		The file at ``csvfile`` is parsed with ``csv.DictReader``, so its first
		line must be a header row that includes an ``id`` column.  The value of
		that column is returned for every data row, in file order
		(eg. ``['1_1119', ...]``).
		"""
		participants = []
		with open(csvfile) as partcsv:
			reader = csv.DictReader(partcsv)
			for row in reader:
				participants.append(row['id'])
		return participants


	def get_maptask_items(client, participants):
		"""Find the maptask items that involve any of the given participants.

		For each participant a SPARQL query is run against the 'austalk'
		collection, selecting items whose componentName is "maptask-1" or
		"maptask-2" and where the participant is the olac:speaker, the
		austalk:information_giver or the austalk:information_follower.

		:param client: a pyalveo client; only its sparql_query and get_item
			methods are used here
		:param participants: iterable of participant identifiers (eg. '1_1119')
		:return: list of the objects returned by client.get_item, one per
			distinct item URL, in the order they were first found
		"""

		query = PREFIXES + """
	SELECT distinct ?item
	WHERE {
	{?item olac:speaker ?id .}
	UNION {?item austalk:information_giver ?id .}
	UNION {?item austalk:information_follower ?id .}

	{?item austalk:componentName "maptask-1" .}
	UNION {?item austalk:componentName "maptask-2" .}

	VALUES ?id {<https://app.alveo.edu.au/speakers/austalk/%s>}

	}"""

		items = []
		# The same item can be matched for more than one participant in the
		# list (eg. one is its information_giver and another its
		# information_follower), so remember the URLs already fetched and
		# return each item only once.
		seen_urls = set()
		for p in participants:
			result = client.sparql_query('austalk', query % p)

			# get the value of ?item for every binding in the query result
			for b in result['results']['bindings']:
				item_url = b['item']['value']
				if item_url in seen_urls:
					continue
				seen_urls.add(item_url)

				item = client.get_item(item_url)
				items.append(item)
				print(item)

		return items


	def download_documents(items, patterns, output_path):
		"""
		Download the documents of each item whose filename matches a pattern.

		:type items: iterable of objects providing get_documents()
		:param items: items whose documents are candidates for download

		:type patterns: list of String
		:param patterns: fnmatch-style filename patterns (eg. "*speaker16.wav");
			empty strings are ignored

		:type output_path: String
		:param output_path: directory to download the documents to; it is
			created if it does not exist

		:rtype: list of String
		:returns: filenames of the documents that were downloaded successfully
		"""
		# local import: sys is not among the imports at the top of the script
		import sys

		if not os.path.exists(output_path):
			os.makedirs(output_path)

		downloaded = []

		for item in items:
			for doc in item.get_documents():
				fname = doc.get_filename()
				# any() stops at the first matching pattern, so a document that
				# matches several patterns is still downloaded only once
				wanted = any(
					pattern != '' and fnmatch(fname, pattern)
					for pattern in patterns
				)
				if not wanted:
					continue
				try:
					doc.download_content(dir_path=output_path, filename=fname)
				except Exception as err:
					# best effort: maybe it doesn't exist or we have no access,
					# so report the failure and carry on with the remaining files
					print("Failed to download %s: %s" % (fname, err), file=sys.stderr)
					continue
				downloaded.append(fname)
				print("Got ", fname)
		return downloaded


	if __name__ == '__main__':

		import sys

		# the participant CSV exported from austalk-query is the only argument;
		# fail with a usage message rather than an IndexError when it is missing
		if len(sys.argv) < 2:
			print("Usage: %s participants.csv" % sys.argv[0], file=sys.stderr)
			sys.exit(1)

		csvfile = sys.argv[1]

		client = pyalveo.Client()
		participants = read_participants(csvfile)
		items = get_maptask_items(client, participants)

		# optional, make an item list on Alveo with these items
		# so that you can refer to it again later
		#itemlist = client.add_to_item_list_by_name([i.url() for i in items], "maptask-items")

		# the leading '*' lets each pattern match any filename ending in the
		# given suffix; without it fnmatch only accepts that exact filename
		download_documents(items, ["*maptask16.wav", "*speaker16.wav"], "maptask-data")