Walker Boyle walkerdb

## makedictionary.py
# import what we need
import csv
from fuzzywuzzy import fuzz

# raise the csv module's maximum allowed field size to 1,000,000,000
# so that rows containing very large fields can still be parsed
csv.field_size_limit(1000000000)

# what's coming from openrefine?
# (bare file names, so they are resolved relative to the working directory)
openrefine_persname_1 = 'openrefine_persname_1.csv'
openrefine_persname_2 = 'openrefine_persname_2.csv'
openrefine_corpname = 'openrefine_corpname.csv'

## viaf_snippet_1.py
from urllib2 import urlopen, quote
# if you're running python 3, replace the above with the following:
# from urllib.request import urlopen
# from urllib.parse import quote

def retrieve_viaf_search_results(search_index, search_term, auth_source):
    """Set up a VIAF search for an index, a search term and an authority source.

    NOTE(review): this snippet is truncated. Only the URL template is defined
    below, so as shown the function builds nothing further and returns None.
    The three parameters are presumably substituted into the {0}, {1} and {2}
    placeholders, in that order -- confirm against the full source.
    """
    # url search template formatted to allow easy variable insertion:
    #   {0} is appended to "local." to pick the index being searched
    #   {1} is the text matched with the "all" relation
    #   {2} is matched against local.sources with the "any" relation
    # the query string also asks for results sorted by holdings count, as XML
    search_url_template = 'http://viaf.org/viaf/search/viaf?query=local.{0}+all+{1}+and+local.sources+any+{2}&sortKeys=holdingscount&httpAccept=application/xml'

    # since we'll be inserting the three passed variables into the
    # [snippet truncated here -- the rest of this comment and of the
    # function body is not part of this file]

## viaf_snippet_2.py
from lxml import etree

def get_lc_auth_from_viaf_data(response):
    """Parse a VIAF XML response and collect its record nodes.

    response -- the XML document to parse, in a form accepted by
                lxml's etree.fromstring.

    NOTE(review): this snippet is truncated. As shown, lc_auth is only
    initialised to an empty string and nothing is returned; presumably the
    missing remainder fills lc_auth from the records -- confirm against the
    full source.
    """
    # default value, used unchanged in the part of the function shown here
    lc_auth = ""

    # parse the returned xml into an lxml etree
    tree = etree.fromstring(response)

    # extract a list of the VIAF search result nodes using an xpath query;
    # matching on local-name() selects every element named "record"
    # regardless of which XML namespace it belongs to
    results = tree.xpath("//*[local-name()='record']")

## viaf_snippet_3.py
from bs4 import BeautifulSoup

def get_lc_term_name(lc_auth_number):
    """Fetch the id.loc.gov names-authority HTML page for lc_auth_number.

    lc_auth_number -- value inserted into the {0} placeholder of the URL
                      template below.

    NOTE(review): this snippet is truncated. As shown, the page is downloaded
    into `response` but nothing is parsed or returned. urlopen is not imported
    in this snippet; it has to be in scope already (viaf_snippet_1 imports it).
    """
    # create the LoC address by inserting the auth id into a template
    lc_template = "http://id.loc.gov/authorities/names/{0}.html"
    lc_address = lc_template.format(lc_auth_number)

    # get the html for that address
    # (blocking network request; any error raised by urlopen propagates)
    response = urlopen(lc_address).read()


## fuzzy_snippet_1.py
>>> from fuzzywuzzy import fuzz

>>> bentley_term = "Emily Dickinson (1830-1886)"
>>> lc_term = "Dickinson, Emily, 1830-1886"

>>> fuzz.ratio(bentley_term, lc_term)
70

>>> fuzz.ratio("Clark Kent", "Superman")
22

## fuzzy_snippet_2.py
>>> fuzz.token_sort_ratio(bentley_term, lc_term)
100


## false_positive_check.py
def is_same_entity(local_term, lc_term, controlaccess_type):
    # Compare local_term with lc_term; which comparison is applied depends on
    # the substring found in controlaccess_type.
    # NOTE(review): this snippet is truncated -- the "corpname" branch below
    # has no body, so the fragment is not runnable as shown, and any further
    # branches are not part of this file.

    if "geogname" in controlaccess_type:
        # geognames are a simple check. Returns true if the
        # similarity is > 95; else false
        # (token_sort_ratio is the fuzzywuzzy scorer that gives 100 for the
        # reordered Dickinson example in fuzzy_snippet_2)
        similarity = fuzz.token_sort_ratio(local_term, lc_term)
        return similarity > 95

    elif "corpname" in controlaccess_type:
        # replace some common abbreviations with their full forms

## lxml_1.py
# we're only going to use the etree module (short for "element tree")
from lxml import etree

tree = etree.parse("path/to/gargoyle.xml") # replace the path text with your own filesystem path

## lxml_2.py
# "//extent" matches every extent element, at any depth in the document
extents = tree.xpath("//extent")

## lxml_3.py
# to find all unitid elements whose parent is a did tag:
tree.xpath("//did/unitid")

# using an absolute path to find exact locations:
tree.xpath("/ead/archdesc/did/physdesc/extent")

## if there are multiple "extent" tags in the parent physdesc,
## you can find specific tags by designating an index
## unlike Python (and most other programming languages), xpath indexes start at 1, not zero
	# import what we need
	import csv
	from fuzzywuzzy import fuzz

	# raise the csv module's maximum allowed field size to 1,000,000,000
	# so that rows containing very large fields can still be parsed
	csv.field_size_limit(1000000000)

	# what's coming from openrefine?
	# (bare file names, so they are resolved relative to the working directory)
	openrefine_persname_1 = 'openrefine_persname_1.csv'
	openrefine_persname_2 = 'openrefine_persname_2.csv'
	openrefine_corpname = 'openrefine_corpname.csv'
	from urllib2 import urlopen, quote
	# if you're running python 3, replace the above with the following:
	# from urllib.request import urlopen
	# from urllib.parse import quote

	def retrieve_viaf_search_results(search_index, search_term, auth_source):
	    """Set up a VIAF search for an index, a search term and an authority source.

	    NOTE(review): this snippet is truncated. Only the URL template is
	    defined below, so as shown the function returns None. The three
	    parameters are presumably substituted into the {0}, {1} and {2}
	    placeholders, in that order -- confirm against the full source.
	    """
	    # url search template formatted to allow easy variable insertion:
	    #   {0} is appended to "local." to pick the index being searched
	    #   {1} is the text matched with the "all" relation
	    #   {2} is matched against local.sources with the "any" relation
	    # the query string also asks for results sorted by holdings count, as XML
	    search_url_template = 'http://viaf.org/viaf/search/viaf?query=local.{0}+all+{1}+and+local.sources+any+{2}&sortKeys=holdingscount&httpAccept=application/xml'

	    # since we'll be inserting the three passed variables into the
	    # [snippet truncated here -- the rest of this comment and of the
	    # function body is not part of this file]
	from lxml import etree

	def get_lc_auth_from_viaf_data(response):
	    """Parse a VIAF XML response and collect its record nodes.

	    response -- the XML document to parse, in a form accepted by
	                lxml's etree.fromstring.

	    NOTE(review): this snippet is truncated. As shown, lc_auth is only
	    initialised to an empty string and nothing is returned; presumably the
	    missing remainder fills lc_auth from the records -- confirm against
	    the full source.
	    """
	    # default value, used unchanged in the part of the function shown here
	    lc_auth = ""

	    # parse the returned xml into an lxml etree
	    tree = etree.fromstring(response)

	    # extract a list of the VIAF search result nodes using an xpath query;
	    # matching on local-name() selects every element named "record"
	    # regardless of which XML namespace it belongs to
	    results = tree.xpath("//*[local-name()='record']")
	from bs4 import BeautifulSoup

	def get_lc_term_name(lc_auth_number):
	    """Fetch the id.loc.gov names-authority HTML page for lc_auth_number.

	    lc_auth_number -- value inserted into the {0} placeholder of the URL
	                      template below.

	    NOTE(review): this snippet is truncated. As shown, the page is
	    downloaded into `response` but nothing is parsed or returned. urlopen
	    is not imported in this snippet; it has to be in scope already.
	    """
	    # create the LoC address by inserting the auth id into a template
	    lc_template = "http://id.loc.gov/authorities/names/{0}.html"
	    lc_address = lc_template.format(lc_auth_number)

	    # get the html for that address
	    # (blocking network request; any error raised by urlopen propagates)
	    response = urlopen(lc_address).read()
	>>> from fuzzywuzzy import fuzz

	>>> bentley_term = "Emily Dickinson (1830-1886)"
	>>> lc_term = "Dickinson, Emily, 1830-1886"

	>>> fuzz.ratio(bentley_term, lc_term)
	70

	>>> fuzz.ratio("Clark Kent", "Superman")
	22
	def is_same_entity(local_term, lc_term, controlaccess_type):
	# NOTE(review): the indentation of this copy has been flattened (the body
	# sits at the same level as the def) and the "corpname" branch has no
	# body, so this fragment is not valid Python as shown. A copy with its
	# indentation intact appears earlier in this file under
	# "## false_positive_check.py"; it is truncated at the same point.

	if "geogname" in controlaccess_type:
	# geognames are a simple check. Returns true if the
	# similarity is > 95; else false
	similarity = fuzz.token_sort_ratio(local_term, lc_term)
	return similarity > 95

	elif "corpname" in controlaccess_type:
	# replace some common abbreviations with their full forms
	# we're only going to use the etree module (short for "element tree")
	from lxml import etree

	tree = etree.parse("path/to/gargoyle.xml") # replace the path text with your own filesystem path
	# to find all unitid elements whose parent is a did tag:
	tree.xpath("//did/unitid")

	# using an absolute path to find exact locations:
	tree.xpath("/ead/archdesc/did/physdesc/extent")

	## if there are multiple "extent" tags in the parent physdesc,
	## you can find specific tags by designating an index
	## unlike Python (and most other programming languages), xpath indexes start at 1, not zero