Skip to content

Instantly share code, notes, and snippets.

@joffilyfe
Last active April 14, 2020 20:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joffilyfe/bfc656e878dee6376234e6b1edc668be to your computer and use it in GitHub Desktop.
Save joffilyfe/bfc656e878dee6376234e6b1edc668be to your computer and use it in GitHub Desktop.
import os
import json
import logging
import argparse
# Configure the root logger once at import time: everything from DEBUG up is
# emitted, each record prefixed with its timestamp.
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(message)s")

# Module-level logger used for the diagnostics emitted by this script.
logger = logging.getLogger(__name__)
def _first_subfield(record: dict, tag: str):
    """Return the ``"_"`` value of the first occurrence of *tag* in *record*.

    Returns ``None`` when the tag is missing, when it has no occurrences
    (``None`` or an empty list), or when the first occurrence has no ``"_"``
    key.
    """
    occurrences = record.get(tag) or [{}]
    return occurrences[0].get("_")


def _build_mixed_citation(paragraph):
    """Build the output record for one parsed paragraph line.

    The record is produced only when the paragraph carries an article pid
    (``v880``), a mixed citation (``v704``) and a citation index (``v888``)
    made exclusively of decimal digits.

    Returns:
        A dict with the keys ``mixed``, ``order`` and ``collection``, or
        ``None`` when the paragraph must be skipped.
    """
    if not isinstance(paragraph, dict):
        return None

    article_pid = _first_subfield(paragraph, "v880")
    raw_mixed_citation = _first_subfield(paragraph, "v704")
    raw_index = _first_subfield(paragraph, "v888")

    if article_pid is None or raw_mixed_citation is None:
        return None

    # isdecimal() instead of isdigit(): isdigit() also accepts characters such
    # as "²", which int() rejects with ValueError and would abort the run.
    if not isinstance(raw_index, str) or not raw_index.isdecimal():
        return None

    return {
        "mixed": raw_mixed_citation,
        # Stored as index minus one (presumably 1-based -> 0-based; confirm
        # against the consumer of the output file).
        "order": int(raw_index) - 1,
        "collection": "scl",
    }


def main() -> None:
    """Collect mixed citations from per-pid paragraph files.

    Command-line arguments:
        pids: text file with one pid per line.
        paragraphs: directory holding one ``<pid>.json`` file per pid, each
            line of which is a JSON document.
        output: file that receives one JSON document per extracted citation.

    Pids without a paragraph file and lines that are not valid JSON are
    logged and skipped; blank lines are ignored.
    """
    parser = argparse.ArgumentParser(
        description="Programa para agrupar as mixed citations a partir"
        " de arquivos de parágrafos"
    )
    parser.add_argument(
        "pids",
        help="Lista de pids que serão utilizados para extrair os parágrafos"
        " de arquivos JSON",
        type=argparse.FileType("r"),
    )
    parser.add_argument(
        "paragraphs",
        help="Diretório contendo os arquivos de parágrafos no formatos JSON.",
    )
    parser.add_argument(
        "output",
        help="Arquivo com o resultado do processamento.",
        type=argparse.FileType("w"),
    )
    args = parser.parse_args()

    # argparse.FileType hands back files that are already open; close them
    # deterministically so the output is flushed when main() returns.
    with args.pids, args.output:
        for raw_pid in args.pids:
            pid = raw_pid.strip()
            if not pid:
                # Blank line in the pid list: nothing to look up.
                continue

            paragraph_file = os.path.abspath(
                os.path.join(args.paragraphs, pid + ".json")
            )
            if not os.path.exists(paragraph_file):
                logger.error("Paragraph file '%s' does not exist.", paragraph_file)
                continue

            with open(paragraph_file, "r") as f:
                # Iterate the file lazily instead of loading every line.
                for line_number, line in enumerate(f, start=1):
                    if not line.strip():
                        continue
                    try:
                        paragraph = json.loads(line)
                    except json.JSONDecodeError:
                        logger.error(
                            "Invalid JSON in '%s' (line %d).",
                            paragraph_file,
                            line_number,
                        )
                        continue

                    mixed_citation = _build_mixed_citation(paragraph)
                    if mixed_citation is not None:
                        args.output.write(json.dumps(mixed_citation) + "\n")
# Entry point: run the CLI only when this file is executed as a script, so
# importing the module has no side effect beyond the logging setup.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment