Created
July 5, 2022 16:20
-
-
Save rafaelpezzuto/76885d2417920221a445eef089af8fd0 to your computer and use it in GitHub Desktop.
Contabiliza títulos citados não resolvidos por StatBiblio e NewBiblio
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "27528559", | |
"metadata": {}, | |
"source": [ | |
"### 1. Configurações" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "77c58bdc", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import json\n", | |
"import pandas\n", | |
"import os\n", | |
"\n", | |
"# Directories\n", | |
"DIR_DATA = '/home/rafaeljpd/Data/'\n", | |
"\n", | |
"# Output directory for generated tables; the frequency-table cell (section 3)\n", | |
"# writes its CSV here. The name was referenced there but never defined, which\n", | |
"# raised a NameError on a fresh kernel.\n", | |
"# NOTE(review): the 'results' location is an assumption - point it at the real output folder.\n", | |
"DIR_DATA_RESULTS = os.path.join(DIR_DATA, 'results')\n", | |
"os.makedirs(DIR_DATA_RESULTS, exist_ok=True)\n", | |
"\n", | |
"# Path to the enriched cited-references file\n", | |
"FILE_CITED_REFS_ENRICHED = os.path.join(DIR_DATA, 'scielo/refs/via-rm/gold/regsc_rafael_v0.6c.txt')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "d1c2f502", | |
"metadata": {}, | |
"source": [ | |
"### 2. Leitura de referências citadas" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "768eebfb", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Tally, per cited journal title, the records that contain neither the\n", | |
"# 'sb_cited_issn' key nor the 'cited_issnl' key.\n", | |
"# titles maps each title to the number of such records.\n", | |
"titles = {}\n", | |
"\n", | |
"# 1-based number of the line about to be processed; used only for progress output.\n", | |
"counter = 1\n", | |
"\n", | |
"# JSON text is UTF-8 by specification, so do not depend on the locale default.\n", | |
"with open(FILE_CITED_REFS_ENRICHED, encoding='utf-8') as fin:\n", | |
"    for line in fin:\n", | |
"        # Progress marker every 250000 input lines.\n", | |
"        if counter % 250000 == 0:\n", | |
"            print(counter)\n", | |
"        counter += 1\n", | |
"\n", | |
"        # A blank line (e.g. a trailing newline) is not a record; skip it\n", | |
"        # instead of letting json.loads abort the whole scan.\n", | |
"        if not line.strip():\n", | |
"            continue\n", | |
"\n", | |
"        record = json.loads(line)\n", | |
"\n", | |
"        if 'sb_cited_issn' not in record and 'cited_issnl' not in record:\n", | |
"            # .get(): a record without 'cited_journal' is ignored rather than\n", | |
"            # raising KeyError part-way through the file.\n", | |
"            title = record.get('cited_journal')\n", | |
"\n", | |
"            # Empty or null titles are not counted.\n", | |
"            if title:\n", | |
"                titles[title] = titles.get(title, 0) + 1" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "3c04e15e", | |
"metadata": {}, | |
"source": [ | |
"### 3. Geração de tabela de frequência de títulos não resolvidos" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "416b5773", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# One-column frequency table indexed by cited title, highest counts first.\n", | |
"dftf = pandas.DataFrame({'Frequência': titles}).sort_values('Frequência', ascending=False)\n", | |
"\n", | |
"# Export pipe-separated, naming the index column in the CSV header.\n", | |
"unmatches_freq_path = os.path.join(DIR_DATA_RESULTS, 'unmatches_freq.csv')\n", | |
"dftf.to_csv(unmatches_freq_path, sep='|', index_label='Título citado')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3.9.7 ('scielo-cited-references')", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.7" | |
}, | |
"vscode": { | |
"interpreter": { | |
"hash": "fda8a88b6a134bc64e7f47eb787831199f0bdc9bd15c443221a5978bbdc2215d" | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment