Skip to content

Instantly share code, notes, and snippets.

@joffilyfe
Last active April 25, 2019 20:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joffilyfe/dfc4b4629dbd4f7a99b0108ce4eb0ef5 to your computer and use it in GitHub Desktop.
Save joffilyfe/dfc4b4629dbd4f7a99b0108ce4eb0ef5 to your computer and use it in GitHub Desktop.
import os
import itertools
from lxml import etree
from documentstore_migracao.config import BASE_PATH
def find_assets_nodes(xmltree: etree.ElementTree) -> iter:
    """Locate every element of the document that references an asset.

    The tree is searched once per XPath expression below; each expression
    selects a tag, at any depth, that carries an ``xlink:href`` attribute.
    The returned iterator yields the matching nodes of the first expression,
    then those of the second, and so on.
    """
    xlink_paths = (
        ".//ext-link[@xlink:href]",
        ".//graphic[@xlink:href]",
        ".//inline-graphic[@xlink:href]",
        ".//inline-supplementary-material[@xlink:href]",
        ".//media[@xlink:href]",
        ".//supplementary-material[@xlink:href]",
    )

    # Every search is started before anything is returned, so all six
    # ``iterfind`` calls happen during this function call.
    searches = []
    for xpath in xlink_paths:
        matches = xmltree.iterfind(
            xpath, namespaces={"xlink": "http://www.w3.org/1999/xlink"}
        )
        searches.append(matches)

    return itertools.chain.from_iterable(searches)
def get_assets_list(assets_nodes: iter) -> list:
    """Describe each asset node as a ``{"id": ..., "url": ...}`` dictionary.

    ``url`` is the node's ``xlink:href`` value as-is. ``id`` is the last
    ``/``-separated segment of that value with its final file extension
    removed. The result keeps the order of ``assets_nodes``.
    """
    href_attribute = "{http://www.w3.org/1999/xlink}href"

    def describe(node):
        url = node.get(href_attribute)
        file_name = url.split("/")[-1]
        stem, _extension = os.path.splitext(file_name)
        return {"id": stem, "url": url}

    return [describe(node) for node in assets_nodes]
# Example input: the path of one XML document inside an SPS package directory
# located under BASE_PATH.
path = os.path.join(
BASE_PATH, "xml/sps_packages/S0066-782X2004001200001/1678-4170-abc-82-s06-1-14.xml"
)
# Parse the document; the tree is bound to the module-level name `xml`, which
# the later get_document_bundle_manifest(...) call also receives.
xml = etree.parse(path)
# Iterator over the asset nodes of the parsed document. It is not consumed in
# the code shown here.
assets_iterator = find_assets_nodes(xml)
from documentstore_migracao.utils.xylose_converter import parse_date
from uuid import uuid4
def get_document_bundle_manifest(document: etree.ElementTree, document_url: str) -> dict:
    """Build a Kernel-format manifest from an XML document.

    The manifest has the shape
    ``{"id": ..., "versions": [{"data": ..., "assets": {...}, "timestamp": ...}]}``
    where ``data`` is ``document_url``, ``timestamp`` is the creation date and
    ``assets`` maps each asset id to ``[[creation_date, asset_url]]``.

    Args:
        document: parsed XML tree of the document.
        document_url: URL stored as the ``data`` of the single version.

    Returns:
        The manifest dictionary.

    Raises:
        ValueError: if the document has no ``pub-date`` element with
            ``pub-type='epub-ppub'``, or if that element lacks a non-empty
            ``year`` or ``month``.
    """
    id_node = document.find(".//article-id[@pub-id-type='scielo-id']")
    if id_node is not None and id_node.text:
        _id = id_node.text
    else:
        # A missing or empty scielo-id is tolerated: a random identifier is
        # generated instead of failing.
        _id = str(uuid4())

    # TODO: decide which date is the appropriate one to use as "creation_date".
    date = document.find(".//pub-date[@pub-type='epub-ppub']")
    if date is None:
        raise ValueError("A creation date is required")

    year_node = date.find(".//year")
    month_node = date.find(".//month")
    year = year_node.text if year_node is not None else None
    month = month_node.text if month_node is not None else None
    if not year or not month:
        # Fail with a clear error instead of an AttributeError on ``None.text``
        # or a date string containing the literal text "None".
        raise ValueError("A creation date with year and month is required")

    _creation_date = parse_date("%s-%s" % (year, month))

    # NOTE: assets that resolve to the same id overwrite one another; the
    # last one found is the one kept.
    assets = {
        asset["id"]: [[_creation_date, asset["url"]]]
        for asset in get_assets_list(find_assets_nodes(document))
    }

    _version = {"data": document_url, "assets": assets, "timestamp": _creation_date}
    return {"id": _id, "versions": [_version]}
# Build the manifest for the document parsed earlier (`xml`), using a fixed
# document URL. The returned dictionary is not bound to a name here.
get_document_bundle_manifest(xml, "http://scielo-bucket.scielo.br/document.xml")
 +-----------------------------+      +------------------------------+
 |                             |      |                              |
 |                             |      |                              |
 |                             |      |        Converte XMLS         |
 |         Extrai XMLS         +------>          Extraídos           |
 |                             |      |                              |
 |                             |      |                              |
 |                             |      |                              |
 +-----------------------------+      +---------------+--------------+
                                                      |
                                                      |
                                                      |
+------------------------------+      +---------------v--------------+
|                              |      |                              |
|                              |      |                              |
|       Gera pacotes SPS       |      |                              |
|       dos XMLS válidos       <------+  Valida os XMLS convertidos  |
|                              |      |                              |
|                              |      |                              |
|                              |      |                              |
+---------------+--------------+      +------------------------------+
                |
                |
                |
+---------------v--------------+      +------------------------------+
|                              |      |                              |
|                              |      |                              |
|         Aplica o XML         |      |    Envia pacotes SPS para    |
|         Constructor          +------>    o Object Storage          |
|                              |      |                              |
|                              |      |                              |
|                              |      |                              |
+------------------------------+      +--------------+---------------+
                                                     |
                                                     v
                                      +--------------+----------------+
                                      |                               |
                                      |                               |
                                      |    Gera o manifesto do XML    |
                                      |    e registra no Banco        |
                                      |    de dados                   |
                                      |                               |
                                      |                               |
                                      +-------------------------------+
{
"id": "e8ec3b28-825d-4b60-9711-7dbc106ecfbf",
"versions": [
{
"data": "http://scielo-bucket.scielo.br/document.xml",
"assets": {
"rad": [
[
"2004-04-01T00:00:00.000000Z",
"http://www.vtmed.org/vascular/rad.htm"
]
],
"1678-4170-abc-82-s06-1-14-glogo": [
[
"2004-04-01T00:00:00.000000Z",
"1678-4170-abc-82-s06-1-14-glogo.gif"
]
],
"1678-4170-abc-82-s06-1-14-ga01img01": [
[
"2004-04-01T00:00:00.000000Z",
"1678-4170-abc-82-s06-1-14-ga01img01.gif"
]
],
"1678-4170-abc-82-s06-1-14-ga01img02": [
[
"2004-04-01T00:00:00.000000Z",
"1678-4170-abc-82-s06-1-14-ga01img02.gif"
]
],
"1678-4170-abc-82-s06-1-14-ga01img03": [
[
"2004-04-01T00:00:00.000000Z",
"1678-4170-abc-82-s06-1-14-ga01img03.gif"
]
],
"1678-4170-abc-82-s06-1-14-ga01img04": [
[
"2004-04-01T00:00:00.000000Z",
"1678-4170-abc-82-s06-1-14-ga01img04.gif"
]
],
"1678-4170-abc-82-s06-1-14-ga01img05": [
[
"2004-04-01T00:00:00.000000Z",
"1678-4170-abc-82-s06-1-14-ga01img05.gif"
]
],
"1678-4170-abc-82-s06-1-14-ga01img06": [
[
"2004-04-01T00:00:00.000000Z",
"1678-4170-abc-82-s06-1-14-ga01img06.gif"
]
],
"1678-4170-abc-82-s06-1-14-ga01img07": [
[
"2004-04-01T00:00:00.000000Z",
"1678-4170-abc-82-s06-1-14-ga01img07.gif"
]
]
},
"timestamp": "2004-04-01T00:00:00.000000Z"
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment