Created
June 21, 2020 09:11
-
-
Save edipretoro/37c843fc25760e2f2845f9cd45162ffb to your computer and use it in GitHub Desktop.
Simple CLI to compute similarities between bibliographic records.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""Simple CLI to compute similarities between bibliographic records.""" | |
from typing import Callable, Iterable, Tuple | |
import jellyfish as jf | |
import pymarc | |
import pathlib | |
import argparse | |
def record_walker(
    record: pymarc.Record
) -> Iterable[Tuple[str, str, str]]:
    """
    Lazily flatten a pymarc.Record into one tuple per subfield.

    Every field returned by ``record.get_fields()`` is iterated, and each
    item it produces is emitted prefixed with the tag of the owning field.

    :param record: The record to flatten
    :type record: pymarc.Record
    :return: Generator of tuples (field tag, followed by the subfield item)
    :rtype: Iterable of tuple with three str fields
    """
    for field in record.get_fields():
        # The tag is looked up per item, exactly when the item is consumed.
        yield from ((field.tag, *subfield) for subfield in field)
def get_records_similarity(
    rec_a: pymarc.Record,
    rec_b: pymarc.Record,
    sim_method: Callable[[str, str], float] = jf.jaro_winkler_similarity
) -> float:
    """
    Compute the similarity between two records.

    Every subfield of ``rec_a`` is compared with the subfields of ``rec_b``
    that share the same field tag and subfield code. The best score found
    is kept for that subfield (0 when ``rec_b`` has no counterpart), and the
    result is the mean of those scores over all subfields of ``rec_a``.

    Keeping only the best match means a tag that is repeated in ``rec_b``
    cannot be counted several times, so the result never exceeds the
    maximum value ``sim_method`` can return.

    :param rec_a: Record serving as basis
    :type rec_a: pymarc.Record
    :param rec_b: Record to compare with the basis one
    :type rec_b: pymarc.Record
    :param sim_method: Similarity function to use; a higher value is
        assumed to mean "more similar"
    :type sim_method: callable
    :return: The mean similarity between rec_a and rec_b, or 0.0 when
        rec_a yields no subfield to compare
    :rtype: float
    """
    total = 0.0
    compared = 0
    for tag, code, value in record_walker(rec_a):
        compared += 1
        # Best match among every same-tag/same-code value of rec_b;
        # ``default`` covers the case where rec_b has no such subfield.
        total += max(
            (
                sim_method(value, candidate)
                for field in rec_b.get_fields(tag)
                for candidate in field.get_subfields(code)
            ),
            default=0.0,
        )
    if not compared:
        # Nothing to compare: avoid dividing by zero.
        return 0.0
    return total / compared
def _read_first_record(path: pathlib.Path) -> pymarc.Record:
    """
    Return the first MARC record stored in the file at ``path``.

    :param path: Path to a binary MARC file
    :type path: pathlib.Path
    :return: The first record of the file
    :rtype: pymarc.Record
    :raises SystemExit: If the file holds no record, or the reader hands
        back ``None`` instead of a record
    """
    with path.open(mode='rb') as handle:
        # ``None`` as default turns an empty file into a value we can test
        # instead of a bare StopIteration traceback.
        record = next(pymarc.MARCReader(handle, to_unicode=True), None)
    if record is None:
        raise SystemExit(f"{path}: no readable MARC record found")
    return record


if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(description=__doc__)
    arg_parser.add_argument(
        '-b', '--basis',
        required=True,
        type=pathlib.Path,
        help="The path to the bibliographic record to use as a basis for "
             "the computation."
    )
    arg_parser.add_argument(
        '-r', '--records',
        required=True,
        type=pathlib.Path,
        action="append",
        help="The path to the bibliographic records to compute similarities "
             "with."
    )
    cli = arg_parser.parse_args()

    # The basis record is read once and reused for every comparison.
    rec_a = _read_first_record(cli.basis)
    for record in cli.records:
        rec_b = _read_first_record(record)
        print(f"Similarity between {cli.basis.name} and "
              f"{record.name} is "
              f"{get_records_similarity(rec_a, rec_b):.2%}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment