Skip to content

Instantly share code, notes, and snippets.

@edipretoro
Created June 21, 2020 09:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save edipretoro/37c843fc25760e2f2845f9cd45162ffb to your computer and use it in GitHub Desktop.
Save edipretoro/37c843fc25760e2f2845f9cd45162ffb to your computer and use it in GitHub Desktop.
Simple CLI to compute similarities between bibliographic records.
#!/usr/bin/env python
"""Simple CLI to compute similarities between bibliographic records."""
from typing import Callable, Iterable, Tuple
import jellyfish as jf
import pymarc
import pathlib
import argparse
def record_walker(
    record: pymarc.Record
) -> Iterable[Tuple[str, str, str]]:
    """
    Return a generator over all the subfields of a pymarc.Record.

    Every field returned by ``record.get_fields()`` is iterated, and each
    item it yields is flattened behind the tag of the field it belongs to.

    :param record: The record to process
    :type record: pymarc.Record
    :return: Iterable of tuples with three str fields; presumably
        ``(tag, subfield code, subfield value)`` -- this relies on the items
        yielded by a field being ``(code, value)`` pairs (TODO confirm
        against the installed pymarc version)
    :rtype: Iterable of tuple with three str fields
    """
    for field in record.get_fields():
        # NOTE(review): fields that yield nothing when iterated (control
        # fields, presumably) contribute no tuples at all.
        for subfield in field:
            yield (field.tag, *subfield)
def get_records_similarity(
    rec_a: pymarc.Record,
    rec_b: pymarc.Record,
    sim_method: Callable[[str, str], float] = jf.jaro_winkler_similarity
) -> float:
    """
    Compute the similarity between two records.

    Every subfield of ``rec_a`` is compared with the subfields of ``rec_b``
    that carry the same tag and the same subfield code. The best score
    among those candidates is kept (0 when ``rec_b`` has no counterpart),
    and the result is the mean of these best scores over all the subfields
    of ``rec_a``.

    Keeping only the best match per subfield means that a tag repeated in
    ``rec_b`` cannot be counted several times, so the result never exceeds
    the largest value ``sim_method`` can return.

    :param rec_a: Record serving as basis
    :type rec_a: pymarc.Record
    :param rec_b: Record to compare with the basis one
    :type rec_b: pymarc.Record
    :param sim_method: Similarity function to use
    :type sim_method: callable
    :return: The percentage of similarity between rec_a and rec_b, or 0.0
        when rec_a has no subfield to compare
    :rtype: float
    """
    total = 0.0
    compared = 0
    for tag, code, value in record_walker(rec_a):
        compared += 1
        # All the values in rec_b that could correspond to this subfield:
        # same tag, same subfield code, across repeated fields/subfields.
        candidates = [
            candidate
            for field in rec_b.get_fields(tag)
            for candidate in field.get_subfields(code)
        ]
        # A subfield without counterpart in rec_b scores 0 but still counts
        # in the denominator, which penalises missing data.
        total += max(
            (sim_method(value, candidate) for candidate in candidates),
            default=0.0,
        )
    if not compared:
        # Nothing to compare: avoid dividing by zero.
        return 0.0
    return total / compared
def _read_first_record(path: pathlib.Path) -> pymarc.Record:
    """
    Read the first MARC record stored in a file.

    :param path: The path of the MARC file to read
    :type path: pathlib.Path
    :return: The first record of the file
    :rtype: pymarc.Record
    :raises OSError: If the file cannot be opened
    :raises ValueError: If no record could be read from the file
    """
    with path.open(mode='rb') as handle:
        # A default is given to next() so that an empty file does not end in
        # a bare StopIteration.
        # NOTE(review): some pymarc versions presumably yield None for a
        # record they cannot parse; the check below covers that too.
        record = next(pymarc.MARCReader(handle, to_unicode=True), None)
    if record is None:
        raise ValueError(f"no readable MARC record found in {path}")
    return record


if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(description=__doc__)
    arg_parser.add_argument(
        '-b', '--basis',
        required=True,
        type=pathlib.Path,
        help="The path to the bibliographic to use as a basis for the "
             "computation."
    )
    arg_parser.add_argument(
        '-r', '--records',
        required=True,
        type=pathlib.Path,
        # The option may be repeated: each occurrence adds one more file.
        action="append",
        help="The path to the bibliographic records to compute similarities "
             "with."
    )
    cli = arg_parser.parse_args()
    try:
        rec_a = _read_first_record(cli.basis)
    except (OSError, ValueError) as err:
        # Report unusable input as a usage error instead of a traceback.
        arg_parser.error(str(err))
    for record in cli.records:
        try:
            rec_b = _read_first_record(record)
        except (OSError, ValueError) as err:
            arg_parser.error(str(err))
        print(f"Similarity between {cli.basis.name} and "
              f"{record.name} is "
              f"{get_records_similarity(rec_a, rec_b):.2%}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment