Skip to content

Instantly share code, notes, and snippets.

@fabiobatalha
Last active December 18, 2015 00:59
Show Gist options
  • Save fabiobatalha/5700577 to your computer and use it in GitHub Desktop.
Save fabiobatalha/5700577 to your computer and use it in GitHub Desktop.
load_remarks.py
import os
import re
from lxml import etree
# Maps XML namespace prefixes to their namespace URIs.
ns = {
    "xs": "http://www.w3.org/2001/XMLSchema",
    "mml": "http://www.w3.org/1998/Math/MathML",
    "xsi": "http://www.w3.org/2001/XMLSchema-instance",
    "xlink": "http://www.w3.org/1999/xlink",
}
def list_htmls(directory="jats-publishing-documentation-1.0"):
    """Return the names of the entries in *directory* that contain "html".

    The filter is a plain substring test on the entry name, not an extension
    check, so a name such as ``page.xhtml`` is selected as well. Names are
    returned without the directory prefix, in whatever order ``os.listdir``
    produces them.

    directory -- folder to scan; defaults to
                 "jats-publishing-documentation-1.0".
    """
    return [name for name in os.listdir(directory) if "html" in name]
def cleaner(html_string):
    """Strip wrapper markup from a serialized HTML fragment.

    Removes, in this order:
      * the literal ``<h2 class="header">Remarks</h2>`` heading,
      * every `` class="para"`` attribute,
      * every newline character,
      * every opening and closing ``a``, ``div`` and ``span`` tag (the text
        between the tags is kept).

    html_string -- the markup as a text string.

    Returns the cleaned string.
    """
    cleaned = html_string.replace('<h2 class="header">Remarks</h2>', '')
    cleaned = cleaned.replace(' class="para"', '')
    cleaned = cleaned.replace('\n', '')
    # The \b after the tag name matters: without it "<a" is also a prefix of
    # <abbr>, <address>, <article>, ... whose opening tag would be deleted
    # while the closing tag stayed behind, leaving unbalanced markup.
    return re.sub(r'</?(?:a|div|span)\b[^>]*>', '', cleaned)
def clean_key(key):
    """Return *key* with every '<', '>', '%' and ';' character removed."""
    return re.sub(r'[<>%;]', '', key)
def load_key(body):
    """Build the initial annotation entry for one parsed page.

    body -- sequence of elements; only ``body[0]`` is queried.

    Collects the ``<h1>`` elements found under ``div[@class='header']``.
    The text of the first one, passed through ``clean_key``, becomes the
    entry key; the ``class`` attribute of the second one is stored as the
    entry's ``key_type``.

    Returns ``{key: {"key_type": css_class}}``, or ``None`` when no such
    ``<h1>`` exists. Raises IndexError when only one ``<h1>`` is present and
    KeyError when the second one carries no ``class`` attribute.
    """
    headings = body[0].xpath("div[@class='header']/h1")
    if not headings:
        return None
    name = clean_key(headings[0].text)
    return {name: {"key_type": headings[1].attrib['class']}}
def load_content(body, class_name):
    """Return the cleaned markup of the first ``div`` with the given class.

    body       -- sequence of elements; only ``body[0]`` is queried.
    class_name -- exact value the div's ``class`` attribute must have.

    The first matching div is serialized, passed through ``cleaner`` and
    returned as text. Returns ``None`` when no div matches.
    """
    matches = body[0].xpath("div[@class='{0}']".format(class_name))
    if not matches:
        return None
    # tostring() hands back ASCII-encoded bytes by default (non-ASCII
    # characters become numeric character references). Decode them so the
    # text patterns in cleaner() can be applied on Python 3 as well.
    markup = etree.tostring(matches[0]).decode("ascii")
    return cleaner(markup)
def parse_html(html, directory="jats-publishing-documentation-1.0"):
    """Extract the annotation entry described by one HTML page.

    html      -- file name of the page, relative to *directory*.
    directory -- folder holding the page; defaults to
                 "jats-publishing-documentation-1.0".

    Returns ``{key: {"key_type": ..., "definition": ..., "remarks": ...}}``
    where the key and ``key_type`` come from ``load_key`` and the other two
    values from ``load_content`` (each may be ``None``). Returns an empty
    dict when the page has no ``body`` or ``load_key`` finds no heading.
    """
    path = os.path.join(directory, html)
    # Read bytes rather than text so lxml works out the page encoding itself
    # instead of depending on the platform's default text encoding; the
    # context manager also closes the file, which the bare open() never did.
    with open(path, "rb") as source:
        tree = etree.HTML(source.read())

    # NOTE(review): defensive guard - assumes etree.HTML may hand back None
    # for input that contains no markup; confirm against the lxml version used.
    if tree is None:
        return {}

    body = tree.xpath("body")
    if not body:
        return {}

    entry = load_key(body)
    if not entry:
        return {}

    # load_key returns a single-item dict; next(iter(...)) fetches its key on
    # both Python 2 and 3 (dict.keys() is not indexable on Python 3).
    key = next(iter(entry))
    entry[key]['definition'] = load_content(body, 'definition')
    entry[key]['remarks'] = load_content(body, 'remarks')
    return entry
def load_remarks(htmls):
    """Merge the entries parsed from every page in *htmls* into one dict.

    htmls -- iterable of page file names, each handed to ``parse_html``.

    Entries are merged with ``dict.update``, so when two pages produce the
    same key the page that comes later in *htmls* wins. Returns an empty
    dict for an empty iterable.
    """
    remarks = {}
    for html in htmls:
        remarks.update(parse_html(html))
    return remarks
def parse_schema(xsd):
    """Unimplemented placeholder: ignores *xsd* and returns ``None``."""
def insert_annotations(annotations, schema_path="SciELO-journalpublishing1.xsd"):
    """Print the XSD schema with documentation annotations inserted.

    annotations -- mapping of element name to a dict whose 'definition' and
                   'remarks' values are markup strings (either may be None
                   or absent).
    schema_path -- schema file to read; defaults to
                   "SciELO-journalpublishing1.xsd".

    Every node in the schema that has a ``name`` attribute, has no ``type``
    attribute, and whose name is a key of *annotations* receives a new first
    child holding the definition followed by the remarks. Entries with no
    text at all are skipped. The resulting document is pretty-printed to
    standard output; nothing is returned.
    """
    # Read bytes so lxml applies the encoding named in the XML declaration;
    # the context manager closes the file, which the bare open() never did.
    with open(schema_path, "rb") as source:
        schema = etree.XML(source.read())

    for action, elem in etree.iterwalk(schema, events=("start", "end")):
        if action != "start":
            continue
        if 'name' not in elem.attrib or 'type' in elem.attrib:
            continue
        elem_name = elem.attrib['name']
        if elem_name not in annotations:
            continue

        entry = annotations[elem_name]
        # "or ''" drops a missing part. The previous str(...).replace('None', '')
        # also deleted the word "None" wherever it occurred in real text.
        text = (entry.get('definition') or '') + (entry.get('remarks') or '')
        if not text:
            # An empty placeholder would serialize as <annotation/> and
            # escape the textual rename below.
            continue
        placeholder = etree.fromstring("<annotation>{0}</annotation>".format(text))
        elem.insert(0, placeholder)

    # tostring() returns ASCII bytes by default; decode before the textual
    # substitutions so they also work on Python 3.
    serialized = etree.tostring(schema, pretty_print=True).decode("ascii")
    # The placeholder tags are renamed textually to the schema's annotation
    # elements. NOTE(review): assumes the schema binds the "xs" prefix.
    serialized = serialized.replace('<annotation>', "<xs:annotation><xs:documentation>")
    serialized = serialized.replace('</annotation>', "</xs:documentation></xs:annotation>")
    print(serialized)
# Script body (runs at import time): gather the entries parsed from every
# listed HTML page, then print the schema with those entries inserted.
annotations = load_remarks(list_htmls())
insert_annotations(annotations)
# Disabled debug dump (Python 2 print statement): one pipe-separated line
# per entry with its key, key_type, definition and remarks.
#for key, info in annotations.items():
#print '{0}|{1}|{2}|{3}'.format(key, info['key_type'], info['definition'], info['remarks'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment