Skip to content

Instantly share code, notes, and snippets.

@rafaelpezzuto
Last active January 18, 2022 00:34
Show Gist options
  • Save rafaelpezzuto/c4ba708badfcdd9ee3efee645384b430 to your computer and use it in GitHub Desktop.
Match SciELO and DOAJ
import urllib.parse
# Input files, resolved relative to the current working directory.
PATH_SCL_DOAJ_DOCS = 'scl-doaj-docs.csv'
PATH_SCL_PIDS_DOIS = 'scl-pids-dois.csv'
PATH_SCL_PIDS_DOAJ_LAST = 'scl-doaj-docs-jan-22.csv'
PATH_ISSNS = 'issns.txt'


def _read_rows(path, sep):
    """Return the non-blank lines of *path*, stripped and split on *sep*.

    The file is closed before returning. Blank lines are skipped so that a
    trailing empty line does not become a one-field row that fails with
    IndexError when a column is looked up by position.
    """
    # NOTE(review): opened with the platform default encoding, as before.
    with open(path) as fin:
        return [line.strip().split(sep) for line in fin if line.strip()]


scl_doaj_docs = _read_rows(PATH_SCL_DOAJ_DOCS, '|')   # pipe-separated rows
scl_pids_dois = _read_rows(PATH_SCL_PIDS_DOIS, ',')   # comma-separated rows

# One value per line, upper-cased; blank lines are ignored.
with open(PATH_ISSNS) as fin:
    issns = {line.strip().upper() for line in fin if line.strip()}

# Column positions are kept exactly as in the original script. The tuple is
# consumed further down as (doi, issn, pid); rows with an empty column 1 are
# dropped.
existing_dois = [(row[1], row[2], row[5]) for row in scl_pids_dois if row[1] != '']

# Lower-cased, non-empty values of column 7 (used below as the set of DOIs
# to match against).
doaj_dois = {row[7].lower() for row in scl_doaj_docs if row[7] != ''}

# Column 8 may hold several links joined with '#'; each entry is the list of
# links for one row.
doaj_links = [row[8].split('#') for row in scl_doaj_docs if row[8] != '']
# Collect, upper-cased, the value of the `pid` query parameter of every link
# in `doaj_links`. Links without a `pid` parameter are ignored.
doaj_pids = set()
for links in doaj_links:
    for link in links:
        query = urllib.parse.urlparse(link).query
        # dict() keeps the last value if `pid` appears more than once.
        params = dict(urllib.parse.parse_qsl(query))
        if 'pid' in params:
            doaj_pids.add(params['pid'].upper())
# Add the first pipe-separated field of every line in PATH_SCL_PIDS_DOAJ_LAST
# to `doaj_pids`, upper-cased like the PIDs already in the set.
with open(PATH_SCL_PIDS_DOAJ_LAST) as fin:
    for line in fin:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing newline at end of file).
            continue
        # Only the first field is needed; the second field was read by the
        # original script but never used, so it is no longer required.
        pid = line.split('|')[0]
        doaj_pids.add(pid.upper())
# Write one PID per line for every (doi, issn, pid) entry that:
#   * belongs to one of the ISSNs listed in `issns`,
#   * has a DOI that is not in `doaj_dois`, and
#   * has a PID that is not in `doaj_pids`.
with open('pending-doaj-scielo.csv', 'w') as fout:
    for doi, issn, pid in existing_dois:
        # `issns` holds upper-cased values, so compare in upper case too;
        # otherwise an ISSN written with a lower-case check digit
        # (e.g. '1234-567x') would never match.
        if issn.upper() not in issns:
            continue
        # `doaj_dois` holds lower-cased values.
        if doi.lower() in doaj_dois:
            continue
        # `doaj_pids` holds upper-cased values.
        if pid.upper() in doaj_pids:
            continue
        fout.write(pid + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment