Skip to content

Instantly share code, notes, and snippets.

@joffilyfe
Last active November 5, 2019 17:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joffilyfe/6359c01d5321e2f6abf180bc406cb0ad to your computer and use it in GitHub Desktop.
Save joffilyfe/6359c01d5321e2f6abf180bc406cb0ad to your computer and use it in GitHub Desktop.
S1517-106X2013000200012
S1517-106X2012000100007
S1414-753X2003000300008
S1414-753X2001000800007
S1414-753X1999000200004
S1414-753X2013000400006
S1414-753X2009000100004
S1414-753X2014000300005
S1414-753X2014000300013
S1414-753X2013000100006
S1414-753X2012000200002
S0365-05962007000600013
S0365-05962007000400002
S0365-05962007000400012
S0365-05962006000300011
S0365-05962004000100003
S0365-05962003000100004
S0365-05962008000400001
S0365-05962011000200034
S0365-05962011000200035
S0365-05962011000200036
S0365-05962011000200037
S0365-05962011000500030
S0365-05962011000500031
S0365-05962011000500032
S0365-05962011000500033
S0365-05962011000400046
S0365-05962011000100002
S0365-05962011000100003
S0365-05962011000100004
S0365-05962011000100005
S0365-05962011000100006
S0365-05962011000100007
S0365-05962011000100008
S0365-05962011000100009
S0365-05962011000100010
S0365-05962011000100011
S0365-05962011000100012
S0365-05962011000100013
S0365-05962011000100014
S0365-05962011000100015
S0365-05962011000100016
S0365-05962011000100017
S0365-05962011000100018
S0365-05962011000100019
S0365-05962011000100020
S0365-05962011000100021
S0365-05962011000100022
S0365-05962011000100023
S0365-05962011000100024
S0365-05962011000100025
S0365-05962011000100026
S0365-05962011000100027
S0365-05962011000100028
S0365-05962011000100029
S0365-05962011000100030
S0365-05962011000100031
S0365-05962011000100032
S0365-05962011000100033
S0365-05962011000100034
S0365-05962011000100035
S0365-05962011000100036
S0365-05962011000700001
S0365-05962011000700002
S0365-05962011000700003
S0365-05962011000700004
S0365-05962011000700005
S0365-05962011000700006
S0365-05962011000700007
S0365-05962011000700008
S0365-05962011000700009
S0365-05962011000700010
S0365-05962011000700011
S0365-05962011000700012
S0365-05962011000700013
S0365-05962011000700014
S0365-05962011000700015
S0365-05962011000700016
S0365-05962011000700017
S0365-05962011000700018
S0365-05962011000700019
S0365-05962011000700020
S0365-05962011000700021
S0365-05962011000700022
S0365-05962011000700023
S0365-05962011000700024
S0365-05962011000700025
S0365-05962011000700026
S0365-05962011000700027
S0365-05962011000700028
S0365-05962011000700029
S0365-05962011000700030
S0365-05962011000700031
S0365-05962011000700032
S0365-05962011000700033
S0365-05962011000700034
S0365-05962011000700035
S0365-05962011000700036
S0365-05962011000700037
S0365-05962011000700038
S0365-05962011000700039
S0365-05962011000700040
S0365-05962011000700041
S0365-05962011000700042
S0365-05962011000700043
S0365-05962011000700044
S0365-05962011000700045
S0365-05962011000700046
S0365-05962011000700047
S0365-05962011000700048
S0365-05962011000700049
S0365-05962011000700050
S0365-05962012000100014
S0365-05962012000100025
S0365-05962012000100009
S0365-05962012000500011
S0365-05962012000500024
S0001-37652000000200009
S0001-37652002000100002
S0001-37652002000300012
S0001-37652004000200004
S0001-37652006000400007
S0001-37652007000200015
S0001-37652009000100016
S0001-37652010000200020
S0001-37652010000400028
S0001-37652011000100021
S0001-37652011000300021
S0001-37652012000100011
S0001-37652012000300022
S0001-37652012000400029
S0301-80592000000400001
S0301-80592000000400002
S0301-80592000000400003
S0301-80592000000400004
S0301-80592000000400005
S0301-80592000000400006
S0301-80592000000400007
S0301-80592000000400008
S0301-80592000000400009
S0301-80592000000400010
S0301-80592000000400011
S0301-80592000000400012
S0301-80592000000400013
S0301-80592000000400014
S0301-80592000000400015
S0301-80592000000400016
S0301-80592000000400017
S0301-80592000000400018
S0301-80592000000400019
S0301-80592000000400020
S0301-80592000000400021
S0301-80592000000400022
S0301-80592000000400023
S0301-80592000000400024
S0301-80592000000400025
S0301-80592000000400026
S0301-80592000000400027
S0301-80592000000400028
S0301-80592000000400029
S0301-80592000000400030
S0301-80592000000300001
S0301-80592000000300002
S0301-80592000000300003
S0301-80592000000300004
S0301-80592000000300005
S0301-80592000000300006
S0301-80592000000300007
S0301-80592000000300008
S0301-80592000000300009
S0301-80592000000300010
S0301-80592000000300011
S0301-80592000000300012
S0301-80592000000300013
S0301-80592000000300014
S0301-80592000000300015
S0301-80592000000300016
S0301-80592000000300017
S0301-80592000000300018
S0301-80592000000300019
S0301-80592000000300020
S0301-80592000000300021
S0301-80592000000300022
S0301-80592000000300023
S0301-80592000000300024
S0301-80592000000300025
S0301-80592000000200001
S0301-80592000000200002
S0301-80592000000200003
S0301-80592000000200004
S0301-80592000000200005
S0301-80592000000200006
S0301-80592000000200007
S0301-80592000000200008
S0301-80592000000200009
S0301-80592000000200010
S0301-80592000000200011
S0301-80592000000200012
S0301-80592000000200013
S0301-80592000000200014
S0301-80592000000200015
S0301-80592000000200016
S0301-80592000000200017
S0301-80592000000200018
S0301-80592000000200019
S0301-80592000000200020
S0301-80592000000200021
S0301-80592000000200022
S0301-80592000000100001
S0301-80592000000100002
S0301-80592000000100003
S0301-80592000000100004
S0301-80592000000100005
S0301-80592000000100006
S0301-80592000000100007
S0301-80592000000100008
S0301-80592000000100009
S0301-80592000000100010
S0301-80592000000100011
S0301-80592000000100012
S0301-80592000000100013
S0301-80592000000100014
S0301-80592000000100015
S0301-80592000000100016
S0301-80592000000100017
S0301-80592000000100018
S0301-80592000000100019
S0301-80592000000100020
S0301-80592000000100021
S0301-80592000000100022
S0301-80592000000100023
S0301-80591999000300001
S0301-80591999000300002
S0301-80591999000300003
S0301-80591999000300004
S0301-80591999000300005
S0301-80591999000300006
S0301-80591999000300007
S0301-80591999000300008
S0301-80591999000300009
S0301-80591999000300010
S0301-80591999000300011
S0301-80591999000300012
S0301-80591999000300013
S0301-80591999000300014
S0301-80591999000300015
S0301-80591999000300016
S0301-80591999000300017
S0301-80591999000300018
S0301-80591999000300019
S0301-80591999000300020
S0301-80591999000300021
S0301-80591999000300022
S0301-80591999000300023
S0301-80591999000300024
S0301-80591999000300025
S0301-80591999000300026
S0301-80591999000300027
S0301-80591999000400001
S0301-80591999000400002
S0301-80591999000400003
S0301-80591999000400004
S0301-80591999000400005
S0301-80591999000400006
S0301-80591999000400007
S0301-80591999000400008
S0301-80591999000400009
S0301-80591999000400010
S0301-80591999000400011
S0301-80591999000400012
S0301-80591999000400013
S0301-80591999000400014
S0301-80591999000400015
S0301-80591999000400016
S0301-80591999000400017
S0301-80591999000400018
S0301-80591999000400019
S0301-80591999000400020
S0301-80591999000400021
S0301-80591999000100001
S0301-80591999000100002
S0301-80591999000100003
S0301-80591999000100004
S0301-80591999000100005
S0301-80591999000100006
S0301-80591999000100007
S0301-80591999000100008
S0301-80591999000100009
S0301-80591999000100010
S0301-80591999000100011
S0301-80591999000100012
S0301-80591999000100013
S0301-80591999000100014
S0301-80591999000100015
S0301-80591999000100016
S0301-80591999000100017
S0301-80591999000100018
S0301-80591999000100019
S0301-80591999000100020
S0301-80591999000100021
S0301-80591999000200001
S0301-80591999000200002
S0301-80591999000200003
S0301-80591999000200004
S0301-80591999000200005
S0301-80591999000200006
S0301-80591999000200007
import re
import html
import requests
import json
import argparse
import logging
from typing import List
from lxml import etree
from itertools import chain
from copy import deepcopy
from io import StringIO, BytesIO
# URL template for the ArticleMeta article endpoint. The trailing ``%s`` is
# filled with an article PID (``ARTICLE_META_URL % pid``) before requesting.
ARTICLE_META_URL = (
"http://articlemeta.scielo.org/api/v1/article/?format=json&body=true&code=%s"
)
# Single lxml HTML parser instance, passed to ``etree.parse`` when the
# reference fragments cut out of the article body are re-parsed.
html_parser = etree.HTMLParser()
# Module-level logger; its level and handler are set up by _config_logging().
logger = logging.getLogger(__name__)
def _config_logging(logging_level="INFO", logging_file=None):
    """Attach a formatted handler to the module logger and set its level.

    Args:
        logging_level: One of ``DEBUG``, ``INFO``, ``WARNING``, ``ERROR`` or
            ``CRITICAL``. Any other value falls back to ``"INFO"``.
        logging_file: Path of a file to append log records to. When falsy,
            records go to a ``logging.StreamHandler`` instead.

    Returns:
        The configured module-level logger.
    """
    level = {
        "DEBUG": logging.DEBUG,
        "INFO": logging.INFO,
        "WARNING": logging.WARNING,
        "ERROR": logging.ERROR,
        "CRITICAL": logging.CRITICAL,
    }.get(logging_level, "INFO")

    record_format = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )

    # The logger level is set before the handler is created, so it is
    # already applied even if opening the log file fails.
    logger.setLevel(level)

    handler = (
        logging.FileHandler(logging_file, mode="a")
        if logging_file
        else logging.StreamHandler()
    )
    handler.setFormatter(record_format)
    handler.setLevel(level)
    logger.addHandler(handler)

    return logger
def get_nested(node, *path, default=""):
    """Walk ``node`` along ``path`` and return the value found there.

    Each element of ``path`` is applied in turn as a subscript
    (``node = node[step]``), so dict keys and list indexes can be mixed.

    Args:
        node: Root container to start from.
        *path: Keys or indexes to follow, outermost first.
        default: Value returned when the path cannot be followed.

    Returns:
        The value at the end of the path, or ``default`` when a step is
        missing (``KeyError`` / ``IndexError``) or lands on a value that
        cannot be subscripted that way (``TypeError``), such as ``None`` or
        a plain string indexed by a key.
    """
    try:
        for step in path:
            node = node[step]
    except (IndexError, KeyError, TypeError):
        return default
    return node
def get_citations_nodes_by_references_comments(tree):
    """Return the citation nodes delimited by reference comments.

    The article body is serialized and scanned for spans enclosed by the
    ``<!-- ref -->`` / ``<!-- end-ref -->`` comment pair. Each span is
    re-parsed with the module-level HTML parser and reduced to one element:
    the first ``<font>`` found or, failing that, the ``<body>`` renamed to
    ``font``. Comments inside that element are removed.

    Idea taken from the XIS transformer:
    https://github.com/scieloorg/Web/blob/5c55a8596ac581fb9246a68d9a84501fd7ed8f66/cgi-bin/ScieloXML/paragraphs.xis#L265

    Args:
        tree: lxml element holding the article body.

    Returns:
        list: One element per usable reference span, in document order.
    """

    def _get_references(body: str) -> list:
        """Slice ``body`` into its raw ``ref`` ... ``end-ref`` spans."""
        start_tag = "<!-- ref -->"
        end_tag = "<!-- end-ref -->"
        references = []

        start_index = body.find(start_tag)
        while start_index >= 0:
            # Search for the closing marker only *after* the opening one, so
            # a stray earlier ``end-ref`` cannot produce an empty slice.
            end_index = body.find(end_tag, start_index + len(start_tag))
            if end_index < 0:
                # Unterminated reference: stop here. Continuing with
                # ``end_index == -1`` would reset the cursor to a constant
                # offset and could loop forever.
                break
            stop = end_index + len(end_tag)
            references.append(body[start_index:stop])
            start_index = body.find(start_tag, stop)

        return references

    parents = []
    body = etree.tostring(tree).decode()
    body = body.replace("<br/>", "")

    for reference in _get_references(body):
        node = etree.parse(StringIO(reference), parser=html_parser)
        root = node.find(".//font")

        if root is None:
            root = node.find(".//body")
            if root is None:
                # Nothing usable was parsed from this fragment; skip it
                # instead of failing on ``None.tag``.
                continue
            root.tag = "font"

        for comment in root.xpath(".//comment()"):
            # NOTE(review): in lxml, removing a node also discards its tail
            # text - confirm that dropping text after a comment is intended.
            comment.getparent().remove(comment)

        parents.append(root)

    return parents
def wrap_citations_into_dict(raw_citations):
    """Return the citations as plain Python dictionaries.

    Every resulting dictionary has three keys:

    * ``number``: digits captured at the start of the citation content; when
      absent there, digits that directly follow a ``>`` in the serialized
      node and are followed by ``". "``; otherwise ``None``.
    * ``citation``: the rest of the citation content (leading text,
      serialized descendant elements and tail) with runs of whitespace
      collapsed to one space and HTML entities unescaped.
    * ``text``: the node serialized after all descendant tags were stripped,
      with HTML entities unescaped.

    Note that the nodes in ``raw_citations`` are modified in place: their
    descendant tags are stripped while ``text`` is produced.

    Args:
        raw_citations (List[etree.Element]): Raw citations, each wrapped in
            its own element (``<font>``, ``<*>``).

    Returns:
        List[dict]: One dictionary per citation, for example
        ``[{"number": "1", "text": "<font>1. Citation text</font>",
        "citation": "Citation text"}]``.
    """
    citation_regex = re.compile(r"(?P<number>\d+)?\.?\s?(?P<citation>.*)")
    citation_number_regex = re.compile(
        r".*>(?P<number>\d+?)\.\s.*", re.MULTILINE
    )

    def _get_full_content(node) -> str:
        """Return the HTML content of an element as one normalized string."""
        inner_font = node.find(".//font")
        if inner_font is not None:
            node = inner_font

        # NOTE(review): ``.//*`` selects every descendant, so nested elements
        # are serialized once on their own and once inside their ancestor -
        # confirm whether direct children (``./*``) were intended.
        children = (
            etree.tostring(child).decode() for child in node.xpath(".//*")
        )
        content = "".join(
            chain([node.text or ""], children, [node.tail or ""])
        )
        content = re.sub(r"[\s\n]+", " ", content).strip()
        return html.unescape(content)  # turns HTML entities into characters

    def _get_citation_number(node):
        """Return the reference number as a string, or ``None`` if absent."""
        match = citation_number_regex.match(etree.tostring(node).decode())
        if match:
            return match.groupdict()["number"]
        return None

    def _get_citation_text_only(node) -> str:
        """Strip all descendant tags from ``node`` and return it serialized."""
        etree.strip_tags(node, "*")
        return html.unescape(etree.tostring(node).decode())

    citations = []

    for raw_citation in raw_citations:
        full_content = _get_full_content(raw_citation)
        number = _get_citation_number(raw_citation)
        # Tag stripping mutates the node, so it must come after the two
        # reads above, which rely on the original markup.
        text_content = _get_citation_text_only(raw_citation)

        match = citation_regex.match(full_content)
        if match is None:
            continue

        groups = match.groupdict()
        groups["text"] = text_content

        if groups.get("number") is None and number is not None:
            groups["number"] = number

        citations.append(groups)

    return citations
def _get_article_body_as_etree(bodies: dict):
"""Retorna o corpo do artigo como árvore etree"""
for body in bodies.values():
try:
return etree.fromstring("<body>" + body + "</body>")
except etree.XMLSyntaxError:
pass
return None
def _text_has_all_citation_words(words, text) -> bool:
"""Verifica se todas as palavras estão em um determinado texto"""
if len(words) == 0:
return False
for word in words:
if word not in text:
return False
return True
def _get_citation_words(citation: dict) -> list:
    """Return the lowercase words of the citation title.

    The title is the first non-empty value among the ``v12``, ``v18``,
    ``v30`` and ``v801`` fields (first occurrence, ``"_"`` subfield). It is
    lowercased and split on runs of non-word characters.

    Empty tokens produced by the split (title starting or ending with
    punctuation, or made only of punctuation) are discarded. Otherwise a
    title such as ``"..."`` would yield ``["", ""]``, and since the empty
    string is contained in every text it would match any citation.

    Returns:
        list: The title words, or an empty list when there is no title.
    """
    text = (
        get_nested(citation, "v12", 0, "_")
        or get_nested(citation, "v18", 0, "_")
        or get_nested(citation, "v30", 0, "_")  # type article
        or get_nested(citation, "v801", 0, "_")
    ).lower()

    return [word for word in re.split(r"[\W\s]+", text) if word]
def _citation_and_text_authors_matches(citation: dict, citation_text: str) -> bool:
"""Verifica se os autores de uma citação estão no texto informado"""
authors = citation.get("v10", []) or citation.get("v16", [])
matches = 0
for author in authors:
surname = author.get("s", "").lower()
name = author.get("n", "").lower()
if surname in citation_text or name in citation_text:
matches += 1
if matches >= (len(authors) // 2) + 1:
return True
return False
def get_mixed_citations(
    citations: List[dict],
    bodies: dict,
    pid: str,
    collection: str = "scl",
    debug_mode=False,
) -> List[dict]:
    """Return the ``mixed_citation`` content for the citations of an article.

    The citations of the article (``isis2json`` format) are compared with
    the reference texts extracted from its HTML body. Only citations that
    could be paired with an extracted text are returned. A citation is
    paired with the first extracted text that contains all of its title
    words or most of its authors.

    Args:
        citations (List[dict]): Citations in ``isis2json`` format.
        bodies (dict): Article bodies in HTML, one per language (pt, es,
            en, ...).
        pid (str): Article identifier in the ISIS/ArticleMeta database.
        collection (str): Acronym of the collection being processed.
        debug_mode (bool): When true, the indexes of the citations that
            could not be paired are logged at DEBUG level.

    Returns:
        List[dict]: One ``{"mixed", "pid", "collection", "order"}`` entry
        per paired citation, where ``order`` is the 1-based position of the
        citation. Empty when there are no citations, no body (``None``,
        ``""``, ``{}``) or no parseable body.
    """
    # Falsy inputs cannot produce matches; bailing out here also avoids
    # handing a non-mapping body (e.g. "") to the body parser.
    if not citations or not bodies:
        return []

    body_etree = _get_article_body_as_etree(bodies)
    if body_etree is None:
        return []

    raw_citations = get_citations_nodes_by_references_comments(body_etree)
    wrapped_citations = wrap_citations_into_dict(raw_citations)

    # Lowercase every extracted text once instead of once per citation.
    candidates = [
        (wrapped_citation, wrapped_citation["text"].lower())
        for wrapped_citation in wrapped_citations
    ]

    mixed_citations = []
    unmatched_indexes = []

    for citation_index, citation in enumerate(citations):
        words = _get_citation_words(citation)

        for wrapped_citation, wrapped_text in candidates:
            matched = _text_has_all_citation_words(
                words, wrapped_text
            ) or _citation_and_text_authors_matches(citation, wrapped_text)

            if matched:
                mixed_citations.append(
                    {
                        "mixed": wrapped_citation["citation"],
                        "pid": pid,
                        "collection": collection,
                        "order": citation_index + 1,
                    }
                )
                break
        else:
            # No extracted text matched this citation.
            unmatched_indexes.append(citation_index)

    if debug_mode and unmatched_indexes:
        logger.debug(
            "Índices das citações não preenchidas: %s, PID: %s",
            unmatched_indexes,
            pid,
        )

    return mixed_citations
def main():
    """Command-line entry point: build mixed citations for a list of PIDs.

    Reads one PID per line from the ``input`` file, fetches each article
    from ArticleMeta and writes every mixed citation found as one JSON
    object per line to the ``output`` file. A failure while processing one
    PID is logged and does not stop the remaining PIDs.
    """
    parser = argparse.ArgumentParser(description="Gerador de citações mixadas")
    parser.add_argument(
        "input",
        type=argparse.FileType("r"),
        help="Arquivo com um PID por linha representando os artigos a serem processados",
    )
    parser.add_argument(
        "output",
        type=argparse.FileType("w"),
        help="Arquivo onde as citações serão armazenadas",
    )
    parser.add_argument(
        "--collection", "-c", default="scl", help="Coleção SciELO a ser processada"
    )
    parser.add_argument(
        "--logging_level",
        "-l",
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        help="Logggin level",
    )
    args = parser.parse_args()

    # Read everything up front and close the input file right away.
    with args.input as pids_file:
        pids = pids_file.readlines()

    collection = args.collection
    _config_logging(logging_level=args.logging_level)
    debug_mode = args.logging_level == "DEBUG"

    with args.output as output:
        for line in pids:
            pid = line.strip()
            if not pid:
                # Blank lines would only trigger a request with an empty code.
                continue

            # Broad catch on purpose: this is the per-item boundary and one
            # bad article must not abort the whole batch.
            try:
                # NOTE(review): the URL template has no collection parameter,
                # so ``collection`` only labels the output - confirm.
                article = requests.get(ARTICLE_META_URL % pid, timeout=10).json()

                if article is None:
                    logger.info("Could not fetch %s.", pid)
                    continue

                for mixed_citation in get_mixed_citations(
                    article.get("citations", []),
                    article.get("body", ""),
                    pid,
                    collection,
                    debug_mode=debug_mode,
                ):
                    output.write(json.dumps(mixed_citation) + "\n")
            except Exception as e:
                logger.error("%s. %s", pid, e)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment