saverkamp/lc_vocab_term_harvester.py

## lc_vocab_term_harvester.py
"""This script traverses all narrower terms of a http://id.loc.gov/ thesaurus
(or all terms of a term list) starting at a given term within the tree (replace
seedterm in the main code block with your URI of choice) and adds the URI and
label to a list. Outputs in CSV and JSON as well as JSONL as patterns for use in
rule-based NER with the NLP tool SpaCy.
(More info at: https://spacy.io/usage/rule-based-matching#entityruler)

NOTE the 5-second rate limit, a courtesy to the LC servers working hard for your
controlled vocabulary needs (see the queryTerm() function). You might get away with
less, but don't be a jerk about it.
"""

import requests
import csv
import json
import time
#comment this out if you don't need a JSONL patterns file for NER in SpaCy and don't want to install jsonlines
import jsonlines

def queryTerm(request_url, delay=5, timeout=30):
  """Fetch the JSON (MADS/RDF and SKOS/RDF) representation of an id.loc.gov URI.

  Sleeps for `delay` seconds before every request as a courtesy rate limit,
  then requests `request_url` with an `accept: application/json` header.

  Args:
    request_url: URI of the term (or term list) to fetch.
    delay: Seconds to wait before sending the request (default 5).
    timeout: Seconds to wait for the server before giving up (default 30).

  Returns:
    The decoded JSON body, or None when the request fails, the status code
    is not 200, or the body is not valid JSON.
  """
  #go easy on the LC servers
  time.sleep(delay)
  headers = {'accept': 'application/json'}
  try:
    #without a timeout a stalled connection would hang the harvest forever
    query = requests.get(request_url, headers=headers, timeout=timeout)
  except requests.RequestException as err:
    #one unreachable term should not abort a long-running harvest
    print('Request failed for ' + str(request_url) + ': ' + str(err))
    return None
  if query.status_code != 200:
    return None
  try:
    return json.loads(query.content)
  except ValueError as err:
    #json.JSONDecodeError is a ValueError subclass
    print('Invalid JSON returned for ' + str(request_url) + ': ' + str(err))
    return None

def addTerm(request_url, termlist, visited=None):
  """Recursively add a term and all of its descendants to termlist.

  Fetches request_url with queryTerm(), appends the term returned by
  getTermInfo() when it is non-empty, then recurses into every URI listed
  under skos:narrower (thesauri) and mads:hasMADSSchemeMember (term lists)
  on any object of the response.

  Args:
    request_url: URI of the term to start from.
    termlist: List that collected terms are appended to (mutated in place).
    visited: Set of URIs already fetched during this traversal. Callers
      normally omit it; it is threaded through the recursive calls.

  Returns:
    The same termlist, for convenience.
  """
  if visited is None:
    visited = set()
  #terms with multiple parents are reachable more than once; fetching each
  #URI only once saves a rate-limited request per repeat and guards against cycles
  if request_url in visited:
    return termlist
  visited.add(request_url)

  response = queryTerm(request_url)
  if response is None:
    return termlist

  term = getTermInfo(request_url, response)
  if len(term) > 0:
    termlist.append(term)

  child_predicates = (
    #narrower terms of a thesaurus
    'http://www.w3.org/2004/02/skos/core#narrower',
    #members of a term list
    'http://www.loc.gov/mads/rdf/v1#hasMADSSchemeMember',
  )
  for predicate in child_predicates:
    for r in response:
      if predicate in r:
        for n in r[predicate]:
          termlist = addTerm(n['@id'], termlist, visited)
  return termlist

def getTermInfo(request_url, response):
  """Get the URI and label for a term. If you want to include more info about the term, you can do it here.

  Args:
    request_url: URI of the term; compared against each object's '@id'.
    response: Iterable of dict objects decoded from the JSON response.

  Returns:
    A dict with 'uri' and 'prefLabel' when an object whose '@id' equals
    request_url carries a mads:authoritativeLabel; otherwise an empty dict.
    'prefLabel' is the value of the last label that is either tagged 'en'
    or has no language tag, or None when no such label exists.
  """
  label_key = 'http://www.loc.gov/mads/rdf/v1#authoritativeLabel'
  term = {}
  #iterate through objects in response to find the one with the request_url as id
  for r in response:
    #objects without an '@id' key are skipped instead of raising KeyError
    if '@id' not in r or r['@id'] != request_url:
      continue
    if label_key not in r:
      continue
    #add URI and label for the term to the master list
    term['uri'] = r['@id']
    term['prefLabel'] = None
    print(request_url)
    for p in r[label_key]:
      #accept untagged labels and English labels; ignore other languages
      if '@language' not in p or p['@language'] == 'en':
        term['prefLabel'] = p['@value']
      print(term['prefLabel'])
  return term

def writeCsv(termlist, filestem):
  """Write output to a CSV file.

  Args:
    termlist: List of dicts with the keys 'uri' and 'prefLabel'.
    filestem: Output path without extension; '.csv' is appended.
  """
  filename = filestem + '.csv'
  fieldnames = ['uri', 'prefLabel']
  #newline='' lets the csv module control line endings (otherwise blank rows
  #appear on Windows); utf-8 keeps labels with non-ASCII characters writable
  #regardless of the platform's default encoding
  with open(filename, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(termlist)

def writeJson(termlist, filestem):
  """Write output to a JSON file.

  Args:
    termlist: JSON-serializable list of term dicts.
    filestem: Output path without extension; '.json' is appended.
  """
  filename = filestem + '.json'
  #the with-block closes the file even when json.dump raises
  with open(filename, 'w', encoding='utf-8') as f:
    json.dump(termlist, f)

def writePatternsJsonl(termlist, label, folderpath=None):
  """Write output to a patterns.jsonl file to use for rule-based NER
  with SpaCy.

  Each term with a non-None 'prefLabel' becomes one entry of the form
  {'label': label, 'pattern': [{'LOWER': token}, ...]}, where the tokens are
  the whitespace-separated words of the label, lowercased and with periods
  removed.

  Args:
    termlist: List of dicts that each have a 'prefLabel' key.
    label: Entity label to assign to every pattern.
    folderpath: Prefix for the output file name. When omitted, the
      module-level `folderpath` is used if it exists, otherwise ''.
  """
  if folderpath is None:
    #backward compatible: the script's main block sets a global folderpath;
    #fall back to '' so the function also works when imported as a module
    folderpath = globals().get('folderpath', '')
  patternlist = []
  for t in termlist:
    if t['prefLabel'] is None:
      continue
    #split() without arguments ignores repeated spaces
    tokens = [s.lower().replace('.', '') for s in t['prefLabel'].split()]
    #drop empty tokens; a {'LOWER': ''} entry would make the whole pattern unmatchable
    patterns = [{'LOWER': s} for s in tokens if s]
    if patterns:
      patternlist.append({'label': label, 'pattern': patterns})
  filename = folderpath + 'patterns.jsonl'
  #explicit utf-8 so non-ASCII labels do not depend on the platform's default encoding
  with open(filename, 'w', encoding='utf-8') as f:
    writer = jsonlines.Writer(f)
    writer.write_all(patternlist)
    writer.close()

if __name__ == "__main__":
  #harvest every term reachable from the seed term
  termlist = []
  #change the seedterm to your top-level term URI
  seedterm = 'http://id.loc.gov/authorities/performanceMediums/mp2013015382'
  termlist = addTerm(seedterm, termlist)

  #dedupe list--some terms have multiple parents
  #dict.fromkeys keeps the first occurrence of each term in traversal order,
  #so the output files come out in the same order on every run
  termlist = [dict(t) for t in dict.fromkeys(tuple(d.items()) for d in termlist)]

  #name your CSV and JSON files here (without extension)
  folderpath = ''
  filestem = folderpath + 'lc_performance_mediums_pianos'

  #comment out any of the below that you don't want

  #write to CSV
  writeCsv(termlist, filestem)

  #write to JSON
  writeJson(termlist, filestem)

  #write terms only to patterns.jsonl file for use with SpaCy
  #change this label to whatever you want to call this custom entity type in SpaCy NER
  label = 'INSTRUMENT'
  writePatternsJsonl(termlist, label)
	"""This script traverses all narrower terms of a http://id.loc.gov/ thesaurus
	(or all terms of a term list) starting at a given term within the tree (replace
	seedterm in the main code block with your URI of choice) and adds the URI and
	label to a list. Outputs in CSV and JSON as well as JSONL as patterns for use in
	rule-based NER with the NLP tool SpaCy.
	(More info at: https://spacy.io/usage/rule-based-matching#entityruler)

	NOTE the 5-second rate limit, a courtesy to the LC servers working hard for your
	controlled vocabulary needs (see the queryTerm() function). You might get away with
	less, but don't be a jerk about it.
	"""

	import requests
	import csv
	import json
	import time
	#comment this out if you don't need a JSONL patterns file for NER in SpaCy and don't want to install jsonlines
	import jsonlines

	def queryTerm(request_url, delay=5, timeout=30):
		"""Fetch the JSON (MADS/RDF and SKOS/RDF) representation of an id.loc.gov URI.

		Sleeps for `delay` seconds before every request as a courtesy rate limit,
		then requests `request_url` with an `accept: application/json` header.

		Args:
			request_url: URI of the term (or term list) to fetch.
			delay: Seconds to wait before sending the request (default 5).
			timeout: Seconds to wait for the server before giving up (default 30).

		Returns:
			The decoded JSON body, or None when the request fails, the status code
			is not 200, or the body is not valid JSON.
		"""
		#go easy on the LC servers
		time.sleep(delay)
		headers = {'accept': 'application/json'}
		try:
			#without a timeout a stalled connection would hang the harvest forever
			query = requests.get(request_url, headers=headers, timeout=timeout)
		except requests.RequestException as err:
			#one unreachable term should not abort a long-running harvest
			print('Request failed for ' + str(request_url) + ': ' + str(err))
			return None
		if query.status_code != 200:
			return None
		try:
			return json.loads(query.content)
		except ValueError as err:
			#json.JSONDecodeError is a ValueError subclass
			print('Invalid JSON returned for ' + str(request_url) + ': ' + str(err))
			return None

	def addTerm(request_url, termlist, visited=None):
		"""Recursively add a term and all of its descendants to termlist.

		Fetches request_url with queryTerm(), appends the term returned by
		getTermInfo() when it is non-empty, then recurses into every URI listed
		under skos:narrower (thesauri) and mads:hasMADSSchemeMember (term lists)
		on any object of the response.

		Args:
			request_url: URI of the term to start from.
			termlist: List that collected terms are appended to (mutated in place).
			visited: Set of URIs already fetched during this traversal. Callers
				normally omit it; it is threaded through the recursive calls.

		Returns:
			The same termlist, for convenience.
		"""
		if visited is None:
			visited = set()
		#terms with multiple parents are reachable more than once; fetching each
		#URI only once saves a rate-limited request per repeat and guards against cycles
		if request_url in visited:
			return termlist
		visited.add(request_url)

		response = queryTerm(request_url)
		if response is None:
			return termlist

		term = getTermInfo(request_url, response)
		if len(term) > 0:
			termlist.append(term)

		child_predicates = (
			#narrower terms of a thesaurus
			'http://www.w3.org/2004/02/skos/core#narrower',
			#members of a term list
			'http://www.loc.gov/mads/rdf/v1#hasMADSSchemeMember',
		)
		for predicate in child_predicates:
			for r in response:
				if predicate in r:
					for n in r[predicate]:
						termlist = addTerm(n['@id'], termlist, visited)
		return termlist

	def getTermInfo(request_url, response):
		"""Get the URI and label for a term. If you want to include more info about the term, you can do it here.

		Args:
			request_url: URI of the term; compared against each object's '@id'.
			response: Iterable of dict objects decoded from the JSON response.

		Returns:
			A dict with 'uri' and 'prefLabel' when an object whose '@id' equals
			request_url carries a mads:authoritativeLabel; otherwise an empty dict.
			'prefLabel' is the value of the last label that is either tagged 'en'
			or has no language tag, or None when no such label exists.
		"""
		label_key = 'http://www.loc.gov/mads/rdf/v1#authoritativeLabel'
		term = {}
		#iterate through objects in response to find the one with the request_url as id
		for r in response:
			#objects without an '@id' key are skipped instead of raising KeyError
			if '@id' not in r or r['@id'] != request_url:
				continue
			if label_key not in r:
				continue
			#add URI and label for the term to the master list
			term['uri'] = r['@id']
			term['prefLabel'] = None
			print(request_url)
			for p in r[label_key]:
				#accept untagged labels and English labels; ignore other languages
				if '@language' not in p or p['@language'] == 'en':
					term['prefLabel'] = p['@value']
				print(term['prefLabel'])
		return term

	def writeCsv(termlist, filestem):
		"""Write output to a CSV file.

		Args:
			termlist: List of dicts with the keys 'uri' and 'prefLabel'.
			filestem: Output path without extension; '.csv' is appended.
		"""
		filename = filestem + '.csv'
		fieldnames = ['uri', 'prefLabel']
		#newline='' lets the csv module control line endings (otherwise blank rows
		#appear on Windows); utf-8 keeps labels with non-ASCII characters writable
		#regardless of the platform's default encoding
		with open(filename, 'w', newline='', encoding='utf-8') as f:
			writer = csv.DictWriter(f, fieldnames=fieldnames)
			writer.writeheader()
			writer.writerows(termlist)

	def writeJson(termlist, filestem):
		"""Write output to a JSON file.

		Args:
			termlist: JSON-serializable list of term dicts.
			filestem: Output path without extension; '.json' is appended.
		"""
		filename = filestem + '.json'
		#the with-block closes the file even when json.dump raises
		with open(filename, 'w', encoding='utf-8') as f:
			json.dump(termlist, f)

	def writePatternsJsonl(termlist, label, folderpath=None):
		"""Write output to a patterns.jsonl file to use for rule-based NER
		with SpaCy.

		Each term with a non-None 'prefLabel' becomes one entry of the form
		{'label': label, 'pattern': [{'LOWER': token}, ...]}, where the tokens are
		the whitespace-separated words of the label, lowercased and with periods
		removed.

		Args:
			termlist: List of dicts that each have a 'prefLabel' key.
			label: Entity label to assign to every pattern.
			folderpath: Prefix for the output file name. When omitted, the
				module-level `folderpath` is used if it exists, otherwise ''.
		"""
		if folderpath is None:
			#backward compatible: the script's main block sets a global folderpath;
			#fall back to '' so the function also works when imported as a module
			folderpath = globals().get('folderpath', '')
		patternlist = []
		for t in termlist:
			if t['prefLabel'] is None:
				continue
			#split() without arguments ignores repeated spaces
			tokens = [s.lower().replace('.', '') for s in t['prefLabel'].split()]
			#drop empty tokens; a {'LOWER': ''} entry would make the whole pattern unmatchable
			patterns = [{'LOWER': s} for s in tokens if s]
			if patterns:
				patternlist.append({'label': label, 'pattern': patterns})
		filename = folderpath + 'patterns.jsonl'
		#explicit utf-8 so non-ASCII labels do not depend on the platform's default encoding
		with open(filename, 'w', encoding='utf-8') as f:
			writer = jsonlines.Writer(f)
			writer.write_all(patternlist)
			writer.close()

	if __name__ == "__main__":
		#harvest every term reachable from the seed term
		termlist = []
		#change the seedterm to your top-level term URI
		seedterm = 'http://id.loc.gov/authorities/performanceMediums/mp2013015382'
		termlist = addTerm(seedterm, termlist)

		#dedupe list--some terms have multiple parents
		#dict.fromkeys keeps the first occurrence of each term in traversal order,
		#so the output files come out in the same order on every run
		termlist = [dict(t) for t in dict.fromkeys(tuple(d.items()) for d in termlist)]

		#name your CSV and JSON files here (without extension)
		folderpath = ''
		filestem = folderpath + 'lc_performance_mediums_pianos'

		#comment out any of the below that you don't want

		#write to CSV
		writeCsv(termlist, filestem)

		#write to JSON
		writeJson(termlist, filestem)

		#write terms only to patterns.jsonl file for use with SpaCy
		#change this label to whatever you want to call this custom entity type in SpaCy NER
		label = 'INSTRUMENT'
		writePatternsJsonl(termlist, label)