Skip to content

Instantly share code, notes, and snippets.

@bnewbold
Created April 17, 2021 00:17
Show Gist options
  • Save bnewbold/f55d616d891981c83835eb763b1419ed to your computer and use it in GitHub Desktop.
Save bnewbold/f55d616d891981c83835eb763b1419ed to your computer and use it in GitHub Desktop.
RSCVD Fatcat Fuzzycat Lookup
#!/usr/bin/env python3
"""
To run this script you need the 'fuzzycat' and 'elasticsearch' pip packages
installed (eg, 'pip install fuzzycat')
"""
import sys
import csv
import json
import elasticsearch
from fuzzycat.simple import closest_fuzzy_biblio_match
from fuzzycat.matching import public_api
def first_ia_access(release):
    """
    Return the first Internet Archive URL found on a release, or None.

    File URLs are checked first and match on either web.archive.org or
    archive.org; webcapture archive URLs are checked afterwards and only
    match on web.archive.org.

    Args:
        release: object exposing `files` and `webcaptures` attributes (in
            this script, the fatcat release fetched with both expanded).
            Either attribute, as well as each file's `urls` and each
            webcapture's `archive_urls`, may be None or empty.

    Returns:
        The matching URL string, or None if nothing points at the archive.
    """
    for f in (release.files or []):
        for u in (f.urls or []):
            if "://web.archive.org/" in u.url or "://archive.org/" in u.url:
                return u.url
    for w in (release.webcaptures or []):
        # guard against an unset list, same as for files/urls above
        for u in (w.archive_urls or []):
            if "://web.archive.org/" in u.url:
                return u.url
    return None
def run(tsv_input):
    """
    Fuzzy-match every row of an RSCVD TSV export against fatcat.

    Each row is read with a tab-delimited `csv.DictReader`, looked up with
    `closest_fuzzy_biblio_match`, annotated in place, and printed to stdout
    as one JSON object per line.

    Keys added to each row:
        fuzzycat_status: name of the match status, or "NONE" without a match
        fatcat_url: fatcat release page (only when a release was matched)
        ia_access_url: result of `first_ia_access` (only with a release)
        ia_any_sim: truthiness of the container's `extra['ia']['sim']`
            value (only with a release)

    Args:
        tsv_input: iterable of TSV lines (e.g. an open text file) with a
            header row.
    """
    api = public_api("https://api.fatcat.wiki/v0")
    es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki:443")

    reader = csv.DictReader(tsv_input, delimiter="\t")
    for row in reader:
        doi = row.get('DOI') or None
        # discard values in the DOI column that do not start with "10."
        if doi and not doi.startswith('10.'):
            doi = None

        # empty cells are normalized to None so they are not used as values
        match = closest_fuzzy_biblio_match(
            es_client=es_client,
            biblio=dict(
                title=row.get('Article') or None,
                journal=row.get('Journal') or None,
                first_author=row.get('Author Name') or None,
                year=row.get('Publication Year') or None,
                volume=row.get('Volume / Edition') or None,
                issue=row.get('Issue') or None,
                pages=row.get('Pages') or None,
                doi=doi,
                pmid=row.get('pmid') or None,
                # key spelled 'pmcid' to agree with the column it is read from
                pmcid=row.get('pmcid') or None,
            ),
        )

        if match:
            row['fuzzycat_status'] = match.status.name
            if match.release:
                # re-fetch the release so container, files and webcaptures
                # are expanded for the checks below
                match.release = api.get_release(
                    match.release.ident,
                    expand="container,files,webcaptures",
                )
                row['fatcat_url'] = f"https://fatcat.wiki/release/{match.release.ident}"
                row['ia_access_url'] = first_ia_access(match.release)
                if match.release.container and match.release.container.extra:
                    row['ia_any_sim'] = bool(
                        (match.release.container.extra.get('ia') or {}).get('sim')
                    )
                else:
                    row['ia_any_sim'] = False
        else:
            row['fuzzycat_status'] = "NONE"

        print(json.dumps(row))
# Script entry point: expects exactly one argument, the path of the TSV file.
if __name__ == "__main__":
    if len(sys.argv) != 2:
        # usage errors go to stderr so they never mix with the JSON on stdout
        print(
            "Expect a single argument: RSCVD TSV file (exported from google sheets)",
            file=sys.stderr,
        )
        sys.exit(-1)
    # newline="" is what the csv module asks for, so that line endings inside
    # quoted fields are handled by the reader; the export is assumed to be
    # UTF-8 (TODO confirm for files not produced by Google Sheets)
    with open(sys.argv[1], "r", newline="", encoding="utf-8") as tsv_input:
        run(tsv_input)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment