rafaelpezzuto/gen-journal-citation-summary.ipynb

## gen-journal-citation-summary.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import pandas\n",
    "import os\n",
    "\n",
    "from scielo_scholarly_data import standardizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "DIR_DATA_RESULTS = 'data/results/'\n",
    "\n",
    "FILE_ISSN_TO_ALL = 'data/bases/base_issnl2all_v0.6.csv'\n",
    "\n",
    "FILE_JOURNALS = 'data/bases/journals.csv'\n",
    "\n",
    "missing_journals = {\n",
    "    '0379-3962': {'issns': set(['0379-3962']), 'title': 'TECNOLOGIA EN MARCHA', 'collection': set(['cri'])},\n",
    "    '2683-2623': {'issns': set(['2683-2623']), 'title': 'VERTICE UNIVERSITARIO', 'collection': set(['mex'])}\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_issn_to_attrs(path):\n",
    "    '''\n",
    "    Gera um dicionário que mapeia código ISSN a diversos atributos\n",
    "\n",
    "    Params\n",
    "    ------\n",
    "    path: str\n",
    "        Caminho do arquivo que contém códigos ISSN e diversos atributos\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict\n",
    "        Dicionário que mapeia código ISSN a título e outros atributos\n",
    "    '''\n",
    "    issn_to_attrs = {}\n",
    "\n",
    "    with open(path) as fin:\n",
    "        csv_reader = csv.DictReader(fin, delimiter='|')\n",
    "\n",
    "        for item in csv_reader:\n",
    "            issnl = item['ISSNL']\n",
    "            issns = sorted(set([issnl] + item['ISSNs'].split('#')))\n",
    "\n",
    "            tmp_main_title = [ti for ti in item['MAIN_TITLE'].split('#') if len(ti) > 0]\n",
    "            tmp_abbrev_title = [ti for ti in item['MAIN_ABBREV_TITLE'].split('#') if len(ti) > 0]\n",
    "            tmp_other_titles = [ti for ti in item['OTHER_TITLEs'].split('#') if len(ti) > 0]\n",
    "\n",
    "            if len(tmp_main_title) > 0:\n",
    "                title = tmp_main_title[0]\n",
    "            elif len(tmp_abbrev_title) > 0:\n",
    "                title = tmp_abbrev_title[0]\n",
    "            elif len(tmp_other_titles) > 0:\n",
    "                title = tmp_other_titles[0]\n",
    "            else:\n",
    "                print(f'{issnl} não possui título válido')\n",
    "                continue\n",
    "\n",
    "            for i in issns:\n",
    "                if i not in issn_to_attrs:\n",
    "                    if i in issn_to_attrs:\n",
    "                        print(f'{i} já está no dicionário issn_to_attrs')\n",
    "                        continue\n",
    "\n",
    "                    issn_to_attrs[i] = {\n",
    "                        'main_issn': issnl,\n",
    "                        'main_title': title,\n",
    "                        'issns': issns,\n",
    "                    }\n",
    "\n",
    "    return issn_to_attrs\n",
    "\n",
    "\n",
    "def load_scielo_journals(path):\n",
    "    '''\n",
    "    Gera dicionário de periódicos SciELO\n",
    "\n",
    "    Params\n",
    "    ------\n",
    "    path: str\n",
    "        Caminho do arquivo de periódicos SciELO\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict\n",
    "        Dicionário contendo os periódicos SciELO no formato:\n",
    "            {\n",
    "                '0000-0000': {\n",
    "                    'title': 'Nome do periódico',\n",
    "                    'issns': {\n",
    "                        '0000-0000',\n",
    "                        '0001-0001',\n",
    "                    },\n",
    "                    'collection': {\n",
    "                        'scl',\n",
    "                        'ssp',\n",
    "                    }\n",
    "                }\n",
    "            }\n",
    "    '''\n",
    "    scielo_journals = {}\n",
    " \n",
    "    with open(path) as fin:\n",
    "        creader = csv.DictReader(fin, delimiter=',', fieldnames=['issn', 'title', 'issns'])\n",
    "        for row in creader:\n",
    "            j_issn = standardizer.journal_issn(row['issn'])\n",
    "            j_title = standardizer.journal_title_for_deduplication(row['title'].lower()).upper()\n",
    "            \n",
    "            if j_issn not in scielo_journals:\n",
    "                scielo_journals[j_issn] = {\n",
    "                    'title': j_title,\n",
    "                    'issns': set([j_issn]),\n",
    "                }\n",
    "\n",
    "    return scielo_journals\n",
    "\n",
    "\n",
    "def _enrich_scielo_journals(scielo_journals, issn_to_attrs):\n",
    "    '''\n",
    "    Adiciona códigos ISSN em periódicos SciELO usando dados da base de títulos\n",
    "    \n",
    "    Params\n",
    "    ------\n",
    "    scielo_journals: dict\n",
    "    issn_to_attrs: dict\n",
    "    '''\n",
    "    for i in scielo_journals:\n",
    "        for ei in issn_to_attrs.get(i, {}).get('issns', []):\n",
    "            scielo_journals[i]['issns'].add(ei)\n",
    "\n",
    "\n",
    "def _extract_scielo_issns(scielo_journals):\n",
    "    '''\n",
    "    Extrai set de códigos ISSN pertencentes a periódicos SciELO\n",
    "\n",
    "    Params\n",
    "    ------\n",
    "    scielo_journals: dict\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    scielo_issns: Set\n",
    "    '''\n",
    "    scielo_issns = set()\n",
    "    \n",
    "    for j_attrs in scielo_journals.values():\n",
    "        for i in j_attrs['issns']:\n",
    "            scielo_issns.add(i)\n",
    "\n",
    "    return scielo_issns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "scielo_journals = load_scielo_journals(FILE_JOURNALS)\n",
    "issn_to_attrs = load_issn_to_attrs(FILE_ISSN_TO_ALL)\n",
    "for mj in missing_journals:\n",
    "    if mj not in issn_to_attrs:\n",
    "        issn_to_attrs[mj] = {'main_issn': mj, 'issns': missing_journals[mj]['issns'], 'main_title': missing_journals[mj]['title']}\n",
    "_enrich_scielo_journals(scielo_journals, issn_to_attrs)\n",
    "scielo_issns = _extract_scielo_issns(scielo_journals)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "issns = os.listdir(DIR_DATA_RESULTS)\n",
    "\n",
    "received_citations = {}\n",
    "granted_citations = {}\n",
    "\n",
    "for issn in issns:\n",
    "    key = issn_title = (issn, issn_to_attrs[issn]['main_title'])\n",
    "\n",
    "    received_citations[key] = {}\n",
    "    \n",
    "    for f in [f for f in os.listdir(os.path.join(DIR_DATA_RESULTS, issn)) if '_citations_received_count' in f]:\n",
    "        f_path = os.path.join(DIR_DATA_RESULTS, issn, f)\n",
    "        f_year = int(f_path.split('.')[-2].split('_')[-1])\n",
    "        \n",
    "        with open(f_path) as fin:\n",
    "            received_citations[key][f_year] = 0\n",
    "\n",
    "            for row in csv.DictReader(fin):\n",
    "                received_citations[key][f_year] += int(row['Citações recebidas'])\n",
    "\n",
    "    granted_citations[key] = {}\n",
    "\n",
    "    for f in [f for f in os.listdir(os.path.join(DIR_DATA_RESULTS, issn)) if '_citations_granted_count' in f]:\n",
    "        f_path = os.path.join(DIR_DATA_RESULTS, issn, f)\n",
    "        f_year = int(f_path.split('.')[-2].split('_')[-1])\n",
    "        \n",
    "        with open(f_path) as fin:\n",
    "            granted_citations[key][f_year] = 0\n",
    "\n",
    "            for row in csv.DictReader(fin):\n",
    "                granted_citations[key][f_year] += int(row['Citações concedidas'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pandas.DataFrame(received_citations)\n",
    "df.fillna(0, inplace=True)\n",
    "df = df.transpose()\n",
    "df.sort_index(axis=1, inplace=True)\n",
    "\n",
    "for col in df.select_dtypes(include=['float64']):\n",
    "    df[col] = df[col].astype('int64')\n",
    "\n",
    "df.to_csv(os.path.join(DIR_DATA_RESULTS, 'citations_received.csv'), index_label=['ISSN', 'Título'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pandas.DataFrame(granted_citations)\n",
    "df.fillna(0, inplace=True)\n",
    "df = df.transpose()\n",
    "df.sort_index(axis=1, inplace=True)\n",
    "\n",
    "for col in df.select_dtypes(include=['float64']):\n",
    "    df[col] = df[col].astype('int64')\n",
    "\n",
    "df.to_csv(os.path.join(DIR_DATA_RESULTS, 'citations_granted.csv'), index_label=['ISSN', 'Título'])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.7 ('scielo-bibliometrics')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "420b81cf9fd95b777b31ce10675349dd3490f0a93d23fb97ad6befdeb4be28ed"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import csv\n",
	"import pandas\n",
	"import os\n",
	"\n",
	"from scielo_scholarly_data import standardizer"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"DIR_DATA_RESULTS = 'data/results/'\n",
	"\n",
	"FILE_ISSN_TO_ALL = 'data/bases/base_issnl2all_v0.6.csv'\n",
	"\n",
	"FILE_JOURNALS = 'data/bases/journals.csv'\n",
	"\n",
	"missing_journals = {\n",
	" '0379-3962': {'issns': set(['0379-3962']), 'title': 'TECNOLOGIA EN MARCHA', 'collection': set(['cri'])},\n",
	" '2683-2623': {'issns': set(['2683-2623']), 'title': 'VERTICE UNIVERSITARIO', 'collection': set(['mex'])}\n",
	"}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"def load_issn_to_attrs(path):\n",
	" '''\n",
	" Gera um dicionário que mapeia código ISSN a diversos atributos\n",
	"\n",
	" Params\n",
	" ------\n",
	" path: str\n",
	" Caminho do arquivo que contém códigos ISSN e diversos atributos\n",
	"\n",
	" Returns\n",
	" -------\n",
	" dict\n",
	" Dicionário que mapeia código ISSN a título e outros atributos\n",
	" '''\n",
	" issn_to_attrs = {}\n",
	"\n",
	" with open(path) as fin:\n",
	" csv_reader = csv.DictReader(fin, delimiter='\|')\n",
	"\n",
	" for item in csv_reader:\n",
	" issnl = item['ISSNL']\n",
	" issns = sorted(set([issnl] + item['ISSNs'].split('#')))\n",
	"\n",
	" tmp_main_title = [ti for ti in item['MAIN_TITLE'].split('#') if len(ti) > 0]\n",
	" tmp_abbrev_title = [ti for ti in item['MAIN_ABBREV_TITLE'].split('#') if len(ti) > 0]\n",
	" tmp_other_titles = [ti for ti in item['OTHER_TITLEs'].split('#') if len(ti) > 0]\n",
	"\n",
	" if len(tmp_main_title) > 0:\n",
	" title = tmp_main_title[0]\n",
	" elif len(tmp_abbrev_title) > 0:\n",
	" title = tmp_abbrev_title[0]\n",
	" elif len(tmp_other_titles) > 0:\n",
	" title = tmp_other_titles[0]\n",
	" else:\n",
	" print(f'{issnl} não possui título válido')\n",
	" continue\n",
	"\n",
	" for i in issns:\n",
	" if i not in issn_to_attrs:\n",
	" if i in issn_to_attrs:\n",
	" print(f'{i} já está no dicionário issn_to_attrs')\n",
	" continue\n",
	"\n",
	" issn_to_attrs[i] = {\n",
	" 'main_issn': issnl,\n",
	" 'main_title': title,\n",
	" 'issns': issns,\n",
	" }\n",
	"\n",
	" return issn_to_attrs\n",
	"\n",
	"\n",
	"def load_scielo_journals(path):\n",
	" '''\n",
	" Gera dicionário de periódicos SciELO\n",
	"\n",
	" Params\n",
	" ------\n",
	" path: str\n",
	" Caminho do arquivo de periódicos SciELO\n",
	"\n",
	" Returns\n",
	" -------\n",
	" dict\n",
	" Dicionário contendo os periódicos SciELO no formato:\n",
	" {\n",
	" '0000-0000': {\n",
	" 'title': 'Nome do periódico',\n",
	" 'issns': {\n",
	" '0000-0000',\n",
	" '0001-0001',\n",
	" },\n",
	" 'collection': {\n",
	" 'scl',\n",
	" 'ssp',\n",
	" }\n",
	" }\n",
	" }\n",
	" '''\n",
	" scielo_journals = {}\n",
	" \n",
	" with open(path) as fin:\n",
	" creader = csv.DictReader(fin, delimiter=',', fieldnames=['issn', 'title', 'issns'])\n",
	" for row in creader:\n",
	" j_issn = standardizer.journal_issn(row['issn'])\n",
	" j_title = standardizer.journal_title_for_deduplication(row['title'].lower()).upper()\n",
	" \n",
	" if j_issn not in scielo_journals:\n",
	" scielo_journals[j_issn] = {\n",
	" 'title': j_title,\n",
	" 'issns': set([j_issn]),\n",
	" }\n",
	"\n",
	" return scielo_journals\n",
	"\n",
	"\n",
	"def _enrich_scielo_journals(scielo_journals, issn_to_attrs):\n",
	" '''\n",
	" Adiciona códigos ISSN em periódicos SciELO usando dados da base de títulos\n",
	" \n",
	" Params\n",
	" ------\n",
	" scielo_journals: dict\n",
	" issn_to_attrs: dict\n",
	" '''\n",
	" for i in scielo_journals:\n",
	" for ei in issn_to_attrs.get(i, {}).get('issns', []):\n",
	" scielo_journals[i]['issns'].add(ei)\n",
	"\n",
	"\n",
	"def _extract_scielo_issns(scielo_journals):\n",
	" '''\n",
	" Extrai set de códigos ISSN pertencentes a periódicos SciELO\n",
	"\n",
	" Params\n",
	" ------\n",
	" scielo_journals: dict\n",
	"\n",
	" Returns\n",
	" -------\n",
	" scielo_issns: Set\n",
	" '''\n",
	" scielo_issns = set()\n",
	" \n",
	" for j_attrs in scielo_journals.values():\n",
	" for i in j_attrs['issns']:\n",
	" scielo_issns.add(i)\n",
	"\n",
	" return scielo_issns"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"scielo_journals = load_scielo_journals(FILE_JOURNALS)\n",
	"issn_to_attrs = load_issn_to_attrs(FILE_ISSN_TO_ALL)\n",
	"for mj in missing_journals:\n",
	" if mj not in issn_to_attrs:\n",
	" issn_to_attrs[mj] = {'main_issn': mj, 'issns': missing_journals[mj]['issns'], 'main_title': missing_journals[mj]['title']}\n",
	"_enrich_scielo_journals(scielo_journals, issn_to_attrs)\n",
	"scielo_issns = _extract_scielo_issns(scielo_journals)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [],
	"source": [
	"issns = os.listdir(DIR_DATA_RESULTS)\n",
	"\n",
	"received_citations = {}\n",
	"granted_citations = {}\n",
	"\n",
	"for issn in issns:\n",
	" key = issn_title = (issn, issn_to_attrs[issn]['main_title'])\n",
	"\n",
	" received_citations[key] = {}\n",
	" \n",
	" for f in [f for f in os.listdir(os.path.join(DIR_DATA_RESULTS, issn)) if '_citations_received_count' in f]:\n",
	" f_path = os.path.join(DIR_DATA_RESULTS, issn, f)\n",
	" f_year = int(f_path.split('.')[-2].split('_')[-1])\n",
	" \n",
	" with open(f_path) as fin:\n",
	" received_citations[key][f_year] = 0\n",
	"\n",
	" for row in csv.DictReader(fin):\n",
	" received_citations[key][f_year] += int(row['Citações recebidas'])\n",
	"\n",
	" granted_citations[key] = {}\n",
	"\n",
	" for f in [f for f in os.listdir(os.path.join(DIR_DATA_RESULTS, issn)) if '_citations_granted_count' in f]:\n",
	" f_path = os.path.join(DIR_DATA_RESULTS, issn, f)\n",
	" f_year = int(f_path.split('.')[-2].split('_')[-1])\n",
	" \n",
	" with open(f_path) as fin:\n",
	" granted_citations[key][f_year] = 0\n",
	"\n",
	" for row in csv.DictReader(fin):\n",
	" granted_citations[key][f_year] += int(row['Citações concedidas'])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [],
	"source": [
	"df = pandas.DataFrame(received_citations)\n",
	"df.fillna(0, inplace=True)\n",
	"df = df.transpose()\n",
	"df.sort_index(axis=1, inplace=True)\n",
	"\n",
	"for col in df.select_dtypes(include=['float64']):\n",
	" df[col] = df[col].astype('int64')\n",
	"\n",
	"df.to_csv(os.path.join(DIR_DATA_RESULTS, 'citations_received.csv'), index_label=['ISSN', 'Título'])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 83,
	"metadata": {},
	"outputs": [],
	"source": [
	"df = pandas.DataFrame(granted_citations)\n",
	"df.fillna(0, inplace=True)\n",
	"df = df.transpose()\n",
	"df.sort_index(axis=1, inplace=True)\n",
	"\n",
	"for col in df.select_dtypes(include=['float64']):\n",
	" df[col] = df[col].astype('int64')\n",
	"\n",
	"df.to_csv(os.path.join(DIR_DATA_RESULTS, 'citations_granted.csv'), index_label=['ISSN', 'Título'])"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3.9.7 ('scielo-bibliometrics')",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.9.7"
	},
	"orig_nbformat": 4,
	"vscode": {
	"interpreter": {
	"hash": "420b81cf9fd95b777b31ce10675349dd3490f0a93d23fb97ad6befdeb4be28ed"
	}
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}