Skip to content

Instantly share code, notes, and snippets.

@rafaelpezzuto
Created January 17, 2022 22:46
Show Gist options
  • Save rafaelpezzuto/50aeaf139e109498aa051f37cd8f268d to your computer and use it in GitHub Desktop.
Save rafaelpezzuto/50aeaf139e109498aa051f37cd8f268d to your computer and use it in GitHub Desktop.
Parse SciELO Brazil Mongo dump
import csv
import json
import os
# Filter a pipe-delimited DOAJ dump down to the rows whose ISSN columns
# (issn / pissn / eissn) match at least one ISSN listed in PATH_ISSNS, and
# write those rows to 'scielo_doaj_docs.csv' using the same delimiter.

PATH_ISSNS = 'issns.txt'
PATH_DOAJ_DUMP_PARSED = 'doaj-dump-parsed.csv'

# Reference ISSNs, one per line. Values are upper-cased because the row
# ISSNs below are upper-cased before comparison (an ISSN check character may
# be written as 'x' or 'X'); blank lines are skipped. The context manager
# guarantees the file handle is closed.
with open(PATH_ISSNS) as issns_file:
    issns = {line.strip().upper() for line in issns_file if line.strip()}

# Never populated by this script; kept so the module-level name stays defined.
scielo_doaj_docs = []

# newline='' is what the csv module requires for files handed to its
# readers/writers: it keeps embedded newlines intact on read and avoids
# doubled line endings on write.
with open(PATH_DOAJ_DUMP_PARSED, newline='') as fin, \
        open('scielo_doaj_docs.csv', 'w', newline='') as fout:
    csv_reader = csv.DictReader(fin, delimiter='|')
    csv_writer = csv.DictWriter(
        fout,
        fieldnames=[
            'doaj_id', 'issn', 'pissn', 'eissn', 'link', 'czu',
            'created_date', 'doi', 'links', 'last_updated', 'publisher',
            'elocationid',
        ],
        delimiter='|',
    )
    csv_writer.writeheader()

    for row in csv_reader:
        # Collect the non-empty ISSN values of this row, upper-cased.
        candidates = (row.get('issn'), row.get('pissn'), row.get('eissn'))
        r_issns = {value.upper() for value in candidates if value}

        # Write the row once if any of its ISSNs is in the reference set.
        if not issns.isdisjoint(r_issns):
            csv_writer.writerow(row)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment