Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save rafaelpezzuto/1545d45d2abfcb84833d857d79a811a1 to your computer and use it in GitHub Desktop.
Save rafaelpezzuto/1545d45d2abfcb84833d857d79a811a1 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import pandas\n",
"import os\n",
"\n",
"from scielo_scholarly_data import standardizer"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Directory that holds one sub-directory of citation-count files per ISSN\n",
"DIR_DATA_RESULTS = 'data/results/'\n",
"\n",
"# '|'-delimited base that relates each ISSN-L to its other ISSNs and titles\n",
"FILE_ISSN_TO_ALL = 'data/bases/base_issnl2all_v0.6.csv'\n",
"\n",
"# Comma-delimited list of SciELO journals\n",
"FILE_JOURNALS = 'data/bases/journals.csv'\n",
"\n",
"# Journals added by hand to the ISSN mapping when the base does not list them\n",
"missing_journals = {\n",
"    '0379-3962': {\n",
"        'issns': {'0379-3962'},\n",
"        'title': 'TECNOLOGIA EN MARCHA',\n",
"        'collection': {'cri'},\n",
"    },\n",
"    '2683-2623': {\n",
"        'issns': {'2683-2623'},\n",
"        'title': 'VERTICE UNIVERSITARIO',\n",
"        'collection': {'mex'},\n",
"    },\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def load_issn_to_attrs(path):\n",
"    '''\n",
"    Build a dictionary that maps every ISSN code to its journal attributes.\n",
"\n",
"    The file is read as a '|'-delimited CSV whose header provides the columns\n",
"    ISSNL, ISSNs, MAIN_TITLE, MAIN_ABBREV_TITLE and OTHER_TITLEs; each of the\n",
"    last four may hold several '#'-separated values.\n",
"\n",
"    Params\n",
"    ------\n",
"    path: str\n",
"        Path of the file that holds the ISSN codes and their attributes\n",
"\n",
"    Returns\n",
"    -------\n",
"    dict\n",
"        Maps each ISSN (the ISSN-L plus every code in ISSNs) to a dict with the\n",
"        keys 'main_issn', 'main_title' and 'issns' (sorted list of codes).\n",
"        Records without any title are skipped; when a code shows up in more\n",
"        than one record, the first record is kept and a warning is printed.\n",
"    '''\n",
"    issn_to_attrs = {}\n",
"\n",
"    # newline='' is what the csv module asks for when it is given a file object\n",
"    with open(path, newline='') as fin:\n",
"        for item in csv.DictReader(fin, delimiter='|'):\n",
"            issnl = item['ISSNL']\n",
"\n",
"            # An empty ISSNs field splits into [''], so blank codes are dropped\n",
"            # instead of being registered as an ISSN\n",
"            issns = sorted({code for code in [issnl] + item['ISSNs'].split('#') if code})\n",
"\n",
"            # Title preference: main title, then abbreviated title, then other titles\n",
"            title = None\n",
"            for column in ('MAIN_TITLE', 'MAIN_ABBREV_TITLE', 'OTHER_TITLEs'):\n",
"                candidates = [ti for ti in item[column].split('#') if ti]\n",
"                if candidates:\n",
"                    title = candidates[0]\n",
"                    break\n",
"\n",
"            if title is None:\n",
"                print(f'{issnl} não possui título válido')\n",
"                continue\n",
"\n",
"            for code in issns:\n",
"                if code in issn_to_attrs:\n",
"                    # Keep the first record seen for this code, but report the clash\n",
"                    print(f'{code} já está no dicionário issn_to_attrs')\n",
"                    continue\n",
"\n",
"                issn_to_attrs[code] = {\n",
"                    'main_issn': issnl,\n",
"                    'main_title': title,\n",
"                    'issns': issns,\n",
"                }\n",
"\n",
"    return issn_to_attrs\n",
"\n",
"\n",
"def load_scielo_journals(path):\n",
"    '''\n",
"    Build the dictionary of SciELO journals.\n",
"\n",
"    The file is read as a comma-delimited CSV whose three columns are named\n",
"    issn, title and issns; only the first two are used here.\n",
"    NOTE(review): every row is treated as data - confirm the file has no header row.\n",
"\n",
"    Params\n",
"    ------\n",
"    path: str\n",
"        Path of the SciELO journals file\n",
"\n",
"    Returns\n",
"    -------\n",
"    dict\n",
"        Journals keyed by standardized ISSN, in the format:\n",
"        {\n",
"            '0000-0000': {\n",
"                'title': 'JOURNAL TITLE',\n",
"                'issns': {'0000-0000'},\n",
"            }\n",
"        }\n",
"        No 'collection' entry is produced. When an ISSN appears in more than\n",
"        one row, the first row is kept.\n",
"    '''\n",
"    scielo_journals = {}\n",
"\n",
"    # newline='' is what the csv module asks for when it is given a file object\n",
"    with open(path, newline='') as fin:\n",
"        creader = csv.DictReader(fin, delimiter=',', fieldnames=['issn', 'title', 'issns'])\n",
"        for row in creader:\n",
"            j_issn = standardizer.journal_issn(row['issn'])\n",
"            if j_issn in scielo_journals:\n",
"                continue\n",
"\n",
"            # The title is lower-cased before standardization and upper-cased afterwards\n",
"            j_title = standardizer.journal_title_for_deduplication(row['title'].lower()).upper()\n",
"\n",
"            scielo_journals[j_issn] = {\n",
"                'title': j_title,\n",
"                'issns': {j_issn},\n",
"            }\n",
"\n",
"    return scielo_journals\n",
"\n",
"\n",
"def _enrich_scielo_journals(scielo_journals, issn_to_attrs):\n",
"    '''\n",
"    Add to each SciELO journal the ISSN codes the title base lists for it.\n",
"\n",
"    The 'issns' set of every journal is updated in place; journals whose key\n",
"    is absent from issn_to_attrs are left as they are.\n",
"\n",
"    Params\n",
"    ------\n",
"    scielo_journals: dict\n",
"        Journals keyed by ISSN, each with an 'issns' set\n",
"    issn_to_attrs: dict\n",
"        ISSN mapping whose values may carry an 'issns' iterable\n",
"    '''\n",
"    for issn, journal in scielo_journals.items():\n",
"        known_issns = issn_to_attrs.get(issn, {}).get('issns', [])\n",
"        journal['issns'].update(known_issns)\n",
"\n",
"\n",
"def _extract_scielo_issns(scielo_journals):\n",
"    '''\n",
"    Collect every ISSN code that belongs to a SciELO journal.\n",
"\n",
"    Params\n",
"    ------\n",
"    scielo_journals: dict\n",
"        Journals whose values each carry an 'issns' iterable\n",
"\n",
"    Returns\n",
"    -------\n",
"    set\n",
"        Union of the 'issns' of all journals (empty when there are none)\n",
"    '''\n",
"    return {\n",
"        code\n",
"        for journal in scielo_journals.values()\n",
"        for code in journal['issns']\n",
"    }"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"scielo_journals = load_scielo_journals(FILE_JOURNALS)\n",
"issn_to_attrs = load_issn_to_attrs(FILE_ISSN_TO_ALL)\n",
"\n",
"# Patch in the hand-listed journals that the ISSN base does not contain\n",
"for mj, mj_attrs in missing_journals.items():\n",
"    if mj in issn_to_attrs:\n",
"        continue\n",
"    issn_to_attrs[mj] = {\n",
"        'main_issn': mj,\n",
"        'issns': mj_attrs['issns'],\n",
"        'main_title': mj_attrs['title'],\n",
"    }\n",
"\n",
"# Extend each SciELO journal with the ISSNs from the base, then flatten them into one set\n",
"_enrich_scielo_journals(scielo_journals, issn_to_attrs)\n",
"scielo_issns = _extract_scielo_issns(scielo_journals)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def _sum_citations_by_year(issn_dir, file_marker, count_column):\n",
"    '''\n",
"    Sum one count column per year over the matching CSV files of a directory.\n",
"\n",
"    Params\n",
"    ------\n",
"    issn_dir: str\n",
"        Directory that holds the count files of one journal\n",
"    file_marker: str\n",
"        Substring a file name must contain to be read\n",
"    count_column: str\n",
"        Header of the column whose integer values are added up\n",
"\n",
"    Returns\n",
"    -------\n",
"    dict\n",
"        Maps year (int) to the total of count_column in that year's file\n",
"    '''\n",
"    totals = {}\n",
"\n",
"    for name in os.listdir(issn_dir):\n",
"        if file_marker not in name:\n",
"            continue\n",
"\n",
"        # The year is the last '_'-separated token of the name, before the extension\n",
"        year = int(os.path.splitext(name)[0].split('_')[-1])\n",
"\n",
"        with open(os.path.join(issn_dir, name), newline='') as fin:\n",
"            totals[year] = sum(int(row[count_column]) for row in csv.DictReader(fin))\n",
"\n",
"    return totals\n",
"\n",
"\n",
"# Only sub-directories are journals: the CSV reports that other cells write into\n",
"# DIR_DATA_RESULTS must not be looked up as ISSNs when this cell is run again.\n",
"# Sorting makes the row order of the reports independent of os.listdir ordering.\n",
"issns = sorted(\n",
"    entry for entry in os.listdir(DIR_DATA_RESULTS)\n",
"    if os.path.isdir(os.path.join(DIR_DATA_RESULTS, entry))\n",
")\n",
"\n",
"received_citations = {}\n",
"granted_citations = {}\n",
"\n",
"for issn in issns:\n",
"    # Raises KeyError when the directory name is not a key of issn_to_attrs\n",
"    key = issn_title = (issn, issn_to_attrs[issn]['main_title'])\n",
"    issn_dir = os.path.join(DIR_DATA_RESULTS, issn)\n",
"\n",
"    received_citations[key] = _sum_citations_by_year(\n",
"        issn_dir, '_citations_received_count', 'Citações recebidas'\n",
"    )\n",
"    granted_citations[key] = _sum_citations_by_year(\n",
"        issn_dir, '_citations_granted_count', 'Citações concedidas'\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# One row per (ISSN, title) key, one column per year in ascending order; absent years become 0\n",
"df = pandas.DataFrame(received_citations).fillna(0).transpose().sort_index(axis=1)\n",
"\n",
"# Store the counts as integers wherever the frame ended up with float64 columns\n",
"float_columns = df.select_dtypes(include=['float64']).columns\n",
"df = df.astype({column: 'int64' for column in float_columns})\n",
"\n",
"df.to_csv(os.path.join(DIR_DATA_RESULTS, 'citations_received.csv'), index_label=['ISSN', 'Título'])"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"# One row per (ISSN, title) key, one column per year in ascending order; absent years become 0\n",
"df = pandas.DataFrame(granted_citations).fillna(0).transpose().sort_index(axis=1)\n",
"\n",
"# Store the counts as integers wherever the frame ended up with float64 columns\n",
"float_columns = df.select_dtypes(include=['float64']).columns\n",
"df = df.astype({column: 'int64' for column in float_columns})\n",
"\n",
"df.to_csv(os.path.join(DIR_DATA_RESULTS, 'citations_granted.csv'), index_label=['ISSN', 'Título'])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.7 ('scielo-bibliometrics')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "420b81cf9fd95b777b31ce10675349dd3490f0a93d23fb97ad6befdeb4be28ed"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment