Skip to content

Instantly share code, notes, and snippets.

@saverkamp
Created January 24, 2020 22:45
Show Gist options
  • Save saverkamp/3af7e44f1a63e045219ef5ffe3a94a4d to your computer and use it in GitHub Desktop.
Save saverkamp/3af7e44f1a63e045219ef5ffe3a94a4d to your computer and use it in GitHub Desktop.
Get lists of LC vocabulary terms and URIs from id.loc.gov
"""This script traverses all narrower terms of a http://id.loc.gov/ thesaurus
(or all terms of a term list) starting at a given term within the tree (replace
seedterm in the main code block with your URI of choice) and adds the URI and
label to a list. Outputs in CSV and JSON as well as JSONL as patterns for use in
rule-based NER with the NLP tool SpaCy.
(More info at: https://spacy.io/usage/rule-based-matching#entityruler)
NOTE the 5-second rate limit, a courtesy to the LC servers working hard for your
controlled vocabulary needs (see the queryTerm() function). You might get away with
less, but don't be a jerk about it.
"""
import requests
import csv
import json
import time
#comment this out if you don't need a JSONL patterns file for NER in SpaCy and don't want to install jsonlines
import jsonlines
def queryTerm(request_url, delay=5, timeout=60):
    """Query the id.loc.gov service for the JSON (MADS/RDF and SKOS/RDF) representation of a given URI.

    Args:
        request_url: URI of the term (or term list) to request.
        delay: Seconds to sleep before sending the request, as a courtesy
            rate limit for the LC servers. Defaults to 5.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole crawl. Defaults to 60.

    Returns:
        The decoded JSON response, or None when the request fails, the
        server answers with a status other than 200, or the body is not
        valid JSON.
    """
    #go easy on the LC servers
    time.sleep(delay)
    headers = {'accept': 'application/json'}
    try:
        query = requests.get(request_url, headers=headers, timeout=timeout)
    except requests.RequestException as err:
        #one unreachable term should not abort a crawl that has been running for a long time
        print('Request failed for ' + request_url + ': ' + str(err))
        return None
    if query.status_code != 200:
        return None
    try:
        return json.loads(query.content)
    except ValueError as err:
        #json.JSONDecodeError is a subclass of ValueError
        print('Invalid JSON from ' + request_url + ': ' + str(err))
        return None
def addTerm(request_url, termlist, visited=None):
    """Recursively add thesaurus terms to a list by traversing the tree through narrower terms.

    Args:
        request_url: URI of the term (or term list) to start from.
        termlist: List that collected term dicts are appended to. It is
            modified in place and also returned.
        visited: Optional set of URIs that have already been requested.
            Callers normally omit it; it is threaded through the recursion
            so that a term reachable through several parents is fetched
            only once and a cycle cannot recurse forever.

    Returns:
        termlist, extended with the term at request_url (when it has an
        authoritative label) and every term reachable below it.
    """
    if visited is None:
        visited = set()
    #some terms have multiple parents; every request costs a rate-limit delay, so fetch each URI once
    if request_url in visited:
        return termlist
    visited.add(request_url)
    response = queryTerm(request_url)
    if response is None:
        return termlist
    term = getTermInfo(request_url, response)
    if term:
        termlist.append(term)
    child_keys = (
        #if there are narrower terms, parse those
        'http://www.w3.org/2004/02/skos/core#narrower',
        #this works for term lists instead of thesauri
        'http://www.loc.gov/mads/rdf/v1#hasMADSSchemeMember',
    )
    for key in child_keys:
        for r in response:
            if key in r:
                for n in r[key]:
                    termlist = addTerm(n['@id'], termlist, visited)
    return termlist
def getTermInfo(request_url, response):
    """Get the URI and label for a term. If you want to include more info about the term, you can do it here.

    Args:
        request_url: URI of the term whose entry should be picked out of
            the response.
        response: Iterable of JSON objects as returned by queryTerm().

    Returns:
        A dict with the keys 'uri' and 'prefLabel' when the response holds
        an object whose '@id' equals request_url and that carries an
        authoritative label; otherwise an empty dict. 'prefLabel' is None
        when every label is tagged with a language other than English.
    """
    label_key = 'http://www.loc.gov/mads/rdf/v1#authoritativeLabel'
    term = {}
    #iterate through objects in response to find the one with the request_url as id
    for r in response:
        #skip anything that is not a JSON object or has no '@id' instead of raising
        if not isinstance(r, dict) or '@id' not in r:
            continue
        if r['@id'] != request_url or label_key not in r:
            continue
        #add URI and label for the term to the master list
        term['uri'] = r['@id']
        term['prefLabel'] = None
        print(request_url)
        for p in r[label_key]:
            if '@language' in p:
                #labels tagged with another language are ignored
                if p['@language'] == 'en':
                    term['prefLabel'] = p['@value']
            else:
                #labels without a language tag are accepted as they are
                term['prefLabel'] = p['@value']
        print(term['prefLabel'])
    return term
def writeCsv(termlist, filestem):
    """Write output to a CSV file.

    Args:
        termlist: List of term dicts with the keys 'uri' and 'prefLabel'.
        filestem: Output path without extension; '.csv' is appended.
    """
    filename = filestem + '.csv'
    fieldnames = ['uri', 'prefLabel']
    #newline='' lets the csv module control line endings (otherwise Windows gets blank rows);
    #utf-8 keeps labels with non-ASCII characters writable on every platform
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(termlist)
def writeJson(termlist, filestem):
    """Write output to a JSON file.

    Args:
        termlist: List of term dicts to serialise.
        filestem: Output path without extension; '.json' is appended.
    """
    filename = filestem + '.json'
    #the context manager closes the file even when serialisation fails
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(termlist, f)
def writePatternsJsonl(termlist, label, folderpath=None):
    """Write output to a patterns.jsonl file to use for rule-based NER
    with SpaCy.

    Each term with a label becomes one line of the form
    {"label": label, "pattern": [{"LOWER": token}, ...]}, with one token
    per whitespace-separated word of the label, lowercased and with
    periods removed.

    Args:
        termlist: List of term dicts; only the 'prefLabel' key is read.
        label: Entity label to assign to every pattern.
        folderpath: Prefix prepended to 'patterns.jsonl'. When omitted, the
            module-level 'folderpath' variable set by the script's main
            block is used if it exists, otherwise the current directory.
    """
    if folderpath is None:
        #backward compatibility: the script defines folderpath as a global in its
        #main block; fall back to '' so the function also works when imported
        folderpath = globals().get('folderpath', '')
    patternlist = []
    for t in termlist:
        if t['prefLabel'] is None:
            continue
        #split() without an argument avoids the empty strings that split(' ')
        #yields for repeated spaces; an empty LOWER token can never match
        tokens = [s.lower().replace('.', '') for s in t['prefLabel'].split()]
        patterns = [{'LOWER': tok} for tok in tokens if tok]
        if patterns:
            patternlist.append({'label': label, 'pattern': patterns})
    filename = folderpath + 'patterns.jsonl'
    #jsonlines does not escape non-ASCII characters, so write utf-8 explicitly
    with open(filename, 'w', encoding='utf-8') as f:
        with jsonlines.Writer(f) as writer:
            writer.write_all(patternlist)
if __name__ == "__main__":
    termlist = []
    #change the seedterm to your top-level term URI
    seedterm = 'http://id.loc.gov/authorities/performanceMediums/mp2013015382'
    termlist = addTerm(seedterm, termlist)
    #dedupe list--some terms have multiple parents
    #keep the first occurrence of each term so the output follows the traversal
    #order and is the same on every run (a set would scramble it)
    seen = set()
    deduped = []
    for d in termlist:
        key = tuple(d.items())
        if key not in seen:
            seen.add(key)
            deduped.append(d)
    termlist = deduped
    #name your CSV and JSON files here (without extension)
    #folderpath stays a module-level name because writePatternsJsonl reads it
    folderpath = ''
    filestem = folderpath + 'lc_performance_mediums_pianos'
    #comment out any of the below that you don't want
    #write to CSV
    writeCsv(termlist, filestem)
    #write to JSON
    writeJson(termlist, filestem)
    #write terms only to patterns.jsonl file for use with SpaCy
    #change this label to whatever you want to call this custom entity type in SpaCy NER
    label = 'INSTRUMENT'
    writePatternsJsonl(termlist, label)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment