Skip to content

Instantly share code, notes, and snippets.

View walkerdb's full-sized avatar

Walker Boyle walkerdb

View GitHub Profile
@walkerdb
walkerdb / makedictionary.py
Last active August 29, 2015 14:25 — forked from eckardm/makenamedictionary.py
Creates a dictionary using the Name and LC Record Link columns from OpenRefine.
# import what we need
import csv
from fuzzywuzzy import fuzz
# raise the csv parser's maximum allowed field size to 1,000,000,000
# (this replaces the csv module's current limit)
csv.field_size_limit(1000000000)
# what's coming from openrefine? (three .csv names: two persname, one corpname)
openrefine_persname_1 = 'openrefine_persname_1.csv'
openrefine_persname_2 = 'openrefine_persname_2.csv'
openrefine_corpname = 'openrefine_corpname.csv'
from urllib2 import urlopen, quote
# if you're running python 3, replace the above with the following:
# from urllib.request import urlopen
# from urllib.parse import quote
def retrieve_viaf_search_results(search_index, search_term, auth_source):
# url search template formatted to allow easy variable insertion
search_url_template = 'http://viaf.org/viaf/search/viaf?query=local.{0}+all+{1}+and+local.sources+any+{2}&sortKeys=holdingscount&httpAccept=application/xml'
# since we'll be inserting the three passed variables into the
from lxml import etree
def get_lc_auth_from_viaf_data(response):
lc_auth = ""
# parse the returned xml into an lxml etree
tree = etree.fromstring(response)
# extract a list of the VIAF search result nodes using an xpath query
results = tree.xpath("//*[local-name()='record']")
from bs4 import BeautifulSoup
def get_lc_term_name(lc_auth_number):
# create the LoC address by inserting the auth id into a template
lc_template = "http://id.loc.gov/authorities/names/{0}.html"
lc_address = lc_template.format(lc_auth_number)
# get the html for that address
response = urlopen(lc_address).read()
>>> from fuzzywuzzy import fuzz
>>> bentley_term = "Emily Dickinson (1830-1886)"
>>> lc_term = "Dickinson, Emily, 1830-1886"
>>> fuzz.ratio(bentley_term, lc_term)
70
>>> fuzz.ratio("Clark Kent", "Superman")
22
>>> fuzz.token_sort_ratio(bentley_term, lc_term)
100
def is_same_entity(local_term, lc_term, controlaccess_type):
if "geogname" in controlaccess_type:
# geognames are a simple check. Returns true if the
# similarity is > 95; else false
similarity = fuzz.token_sort_ratio(local_term, lc_term)
return similarity > 95
elif "corpname" in controlaccess_type:
# replace some common abbreviations with their full forms
# we're only going to use the etree module (short for "element tree")
from lxml import etree
# parse the XML file at the given path into an lxml element tree
tree = etree.parse("path/to/gargoyle.xml") # replace the path text with your own filesystem path
# to find all extent elements anywhere in the document (returned as a list):
extents = tree.xpath("//extent")
# to find all unitid elements whose parent is a did tag:
tree.xpath("//did/unitid")
# using an absolute path to find exact locations:
tree.xpath("/ead/archdesc/did/physdesc/extent")
## if there are multiple "extent" tags in the parent physdesc,
## you can find specific tags by designating an index
## unlike Python (and most other programming languages), xpath indexes start at 1, not zero