Skip to content

Instantly share code, notes, and snippets.

@rafaelpezzuto
Created January 17, 2022 22:44
Show Gist options
  • Save rafaelpezzuto/ae8456e29faff7bf026c3e873eabef19 to your computer and use it in GitHub Desktop.
Save rafaelpezzuto/ae8456e29faff7bf026c3e873eabef19 to your computer and use it in GitHub Desktop.
Parse DOAJ dump
import csv
import json
import os
# JSON files of the DOAJ dump, taken from the current working directory.
# Sorted because os.listdir() returns entries in arbitrary order, which would
# otherwise make the order of the generated rows differ between runs.
DOAJ_DUMP_FILES = sorted(f for f in os.listdir() if f.endswith('.json'))
# Identifier types expected in the DOAJ data; each one becomes an output column.
DOAJ_KEYS = ['publisher', 'doi', 'czu', 'pissn', 'issn', 'eissn', 'elocationid']
def extract_links(data):
    """Return the record's link URLs as one lowercase, '#'-separated string.

    Reads ``data['bibjson']['link']`` (a list of dicts) and collects the
    ``'url'`` value of each entry, lowercased. Entries whose URL is missing
    or empty are skipped. A missing or null ``bibjson``, ``link`` or ``url``
    is treated as absent instead of raising.

    Returns an empty string when no URL is found.
    """
    bibjson = data.get('bibjson') or {}
    links = bibjson.get('link') or []
    # ``or ''`` guards against explicit nulls, which would otherwise raise
    # AttributeError on ``.lower()``.
    urls = ((link.get('url') or '').lower() for link in links)
    return '#'.join(url for url in urls if url)
def extract(data):
    """Flatten one DOAJ record into a '|'-separated line (no trailing newline).

    Column order: the record ``id``; one column per entry of ``DOAJ_KEYS`` in
    sorted order, holding the lowercased identifier of that type (empty when
    the record has none); the '#'-joined links from ``extract_links``; then
    ``created_date`` and ``last_updated`` (empty when missing).

    Raises:
        KeyError: if ``data`` has no ``'id'`` key.
    """
    bibjson = data.get('bibjson') or {}
    identifiers = bibjson.get('identifier') or []

    # Map identifier type -> value; entries without a type are ignored.
    parsed_ids = {}
    for identifier in identifiers:
        id_type = (identifier.get('type') or '').lower()
        if id_type:
            parsed_ids[id_type] = (identifier.get('id') or '').lower()

    # str.join raises TypeError on None, so a missing date becomes an empty
    # column instead of failing the whole record.
    created_date = data.get('created_date') or ''
    last_updated = data.get('last_updated') or ''

    columns = [data['id']]
    columns.extend(parsed_ids.get(key, '') for key in sorted(DOAJ_KEYS))
    columns.append(extract_links(data))
    columns.extend([created_date, last_updated])
    return '|'.join(columns)
# Header row; its column order mirrors the line built by extract().
header = '|'.join(['doaj_id'] + sorted(DOAJ_KEYS) + ['links'] + ['created_date', 'last_updated'])

# Explicit UTF-8 so reading and writing do not depend on the platform's
# default locale encoding.
with open('doaj-dump-parsed.csv', 'w', encoding='utf-8') as fout:
    # Written once, before any data, so the header does not reappear in the
    # middle of the output for every dump file processed.
    fout.write(header + '\n')
    for f in DOAJ_DUMP_FILES:
        # Context manager closes each dump file as soon as it is parsed.
        with open(f, encoding='utf-8') as fin:
            fj = json.load(fin)
        for line in fj:
            try:
                fout.write(extract(line) + '\n')
            except Exception as e:
                # Best effort: report the offending record and its error,
                # then carry on with the next record.
                print(line)
                print(e)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment