Last active
December 18, 2015 00:59
-
-
Save fabiobatalha/5700577 to your computer and use it in GitHub Desktop.
load_remarks.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
from lxml import etree | |
# Namespace prefix -> URI table, in the mapping shape lxml accepts for the
# ``namespaces=`` argument of its XPath calls.
ns = dict(
    xs='http://www.w3.org/2001/XMLSchema',
    mml='http://www.w3.org/1998/Math/MathML',
    xsi='http://www.w3.org/2001/XMLSchema-instance',
    xlink='http://www.w3.org/1999/xlink',
)
def list_htmls(directory="jats-publishing-documentation-1.0"):
    """Return the names of the entries in *directory* that contain "html".

    The check is a plain substring test on the entry name, not an extension
    check, so any name with "html" anywhere in it is kept.

    Args:
        directory: Folder to scan. Defaults to the JATS documentation folder
            expected in the current working directory.

    Returns:
        A sorted list of matching entry names (names only, no directory
        prefix). ``os.listdir`` gives no ordering guarantee, so the names are
        sorted to make repeated runs process pages in the same order.

    Raises:
        OSError: If *directory* cannot be listed.
    """
    return sorted(name for name in os.listdir(directory) if "html" in name)
def cleaner(html_string):
    """Strip presentational wrappers from a serialized documentation fragment.

    Removes, in this order:
      * the literal ``<h2 class="header">Remarks</h2>`` heading,
      * every `` class="para"`` attribute,
      * every newline (deleted, not replaced by a space),
      * the opening and closing tags of ``a``, ``div`` and ``span`` elements;
        the text between those tags is kept.

    Args:
        html_string: Markup to clean.

    Returns:
        The cleaned markup as a string.
    """
    clean_html = html_string.replace('<h2 class="header">Remarks</h2>', '')
    clean_html = clean_html.replace(' class="para"', '')
    clean_html = clean_html.replace('\n', '')
    for tag in ('a', 'div', 'span'):
        # The lookahead forces the tag name to end right after ``tag``.
        # Without it "<a" also matches "<abbr ...>", "<address>", etc.,
        # removing their opening tag while leaving the closing tag behind,
        # which yields markup that is no longer well-formed.
        opening = r'<{0}(?=[\s/>])[^>]*>'.format(tag)
        clean_html = re.sub(opening, '', clean_html)
        clean_html = clean_html.replace('</{0}>'.format(tag), '')
    return clean_html
def clean_key(key):
    """Return *key* with every ``<``, ``>``, ``%`` and ``;`` removed.

    All other characters are left untouched and keep their order.
    """
    return re.sub(r'[<>%;]', '', key)
def load_key(body):
    """Build the skeleton record for the item documented on a page.

    Looks up the ``<h1>`` elements under ``div[@class='header']`` of
    ``body[0]``. The first heading's text, passed through ``clean_key``,
    becomes the key; the ``class`` attribute of the second heading is stored
    as ``key_type``.

    Args:
        body: Non-empty sequence whose first item supports ``xpath()``.

    Returns:
        ``{key: {"key_type": <class of second h1>}}``, or ``None`` when the
        page does not have two header headings or the first one has no text.

    Raises:
        KeyError: If the second heading has no ``class`` attribute.
    """
    header = body[0].xpath("div[@class='header']/h1")
    # Both headings are needed. Pages without the pair are reported as
    # "no key" (None) instead of aborting the whole run with an IndexError;
    # the caller already skips pages for which this returns a falsy value.
    if len(header) < 2 or header[0].text is None:
        return None
    skey = clean_key(header[0].text)
    return {skey: {"key_type": header[1].attrib['class']}}
def load_content(body, class_name):
    """Return the cleaned markup of the first ``<div class=class_name>``.

    Only direct ``div`` children of ``body[0]`` whose ``class`` attribute
    equals *class_name* exactly are considered.

    Args:
        body: Non-empty sequence whose first item supports ``xpath()``.
        class_name: Value of the ``class`` attribute to look for.

    Returns:
        The serialized element after ``cleaner`` has been applied, or
        ``None`` when no such ``div`` exists.
    """
    content = body[0].xpath("div[@class='{0}']".format(class_name))
    if not content:
        return None
    # etree.tostring() returns bytes on Python 3, which the str patterns in
    # cleaner() cannot process. Its default output is pure ASCII (other
    # characters are written as character references), so decoding as ASCII
    # is lossless and gives text on both Python 2 and Python 3.
    markup = etree.tostring(content[0]).decode('ascii')
    return cleaner(markup)
def parse_html(html, directory="jats-publishing-documentation-1.0"):
    """Extract the key, definition and remarks from one documentation page.

    Args:
        html: File name of the page inside *directory*.
        directory: Folder holding the documentation pages. Defaults to the
            JATS documentation folder in the current working directory.

    Returns:
        ``{key: {"key_type": ..., "definition": ..., "remarks": ...}}`` as
        assembled from ``load_key`` and ``load_content``; an empty dict when
        the page has no ``body`` or ``load_key`` finds no key.
        ``definition`` / ``remarks`` are ``None`` when the page lacks the
        corresponding section.

    Raises:
        IOError/OSError: If the page cannot be opened.
    """
    path = "{0}/{1}".format(directory, html)
    # Read raw bytes and close the handle deterministically. Handing bytes to
    # lxml leaves decoding to the parser instead of the locale default that
    # Python 3 text mode would apply.
    with open(path, 'rb') as page:
        tree = etree.HTML(page.read())

    doc = {}
    body = tree.xpath("body")
    if body:
        header = load_key(body)
        if header:
            # load_key returns a single-entry dict; dict.keys() is not
            # indexable on Python 3, so take the key through an iterator.
            key = next(iter(header))
            header[key]['definition'] = load_content(body, 'definition')
            header[key]['remarks'] = load_content(body, 'remarks')
            doc.update(header)
    return doc
def load_remarks(htmls):
    """Parse every page in *htmls* and merge the results into one dict.

    Pages are processed in the order given; when two pages yield the same
    key, the later page's entry replaces the earlier one.
    """
    merged = {}
    for page in htmls:
        for name, info in parse_html(page).items():
            merged[name] = info
    return merged
def parse_schema(xsd):
    """Placeholder: accepts *xsd*, performs no work and returns ``None``."""
    return None
def insert_annotations(annotations, schema_path="SciELO-journalpublishing1.xsd"):
    """Print the XSD with documentation annotations added to its elements.

    Every node in the schema that has a ``name`` attribute but no ``type``
    attribute, and whose name is a key of *annotations*, receives a new first
    child holding that entry's definition followed by its remarks. The
    resulting schema is pretty-printed to standard output.

    Args:
        annotations: Mapping of name -> dict with ``'definition'`` and
            ``'remarks'`` entries (markup strings or ``None``).
        schema_path: Path of the XSD to annotate. Defaults to
            ``SciELO-journalpublishing1.xsd`` in the working directory.

    Raises:
        IOError/OSError: If the schema cannot be opened.
        lxml.etree.XMLSyntaxError: If the schema, or the markup of an
            annotation, is not well-formed XML.
    """
    # Bytes, not text: lxml rejects str input that carries an XML encoding
    # declaration, and the context manager closes the handle.
    with open(schema_path, 'rb') as xsd:
        schema = etree.XML(xsd.read())

    for _, elem in etree.iterwalk(schema, events=("start",)):
        if 'name' not in elem.attrib or 'type' in elem.attrib:
            continue
        elem_name = elem.attrib['name']
        if elem_name not in annotations:
            continue
        info = annotations[elem_name]
        # Either part may be None when the page lacked that section. Treat it
        # as empty instead of stringifying it and deleting the word "None",
        # which also removed legitimate occurrences of that word.
        text = (info['definition'] or '') + (info['remarks'] or '')
        # A temporary un-prefixed wrapper is inserted here and renamed to
        # xs:annotation/xs:documentation in the serialized text below.
        elem.insert(0, etree.fromstring("<annotation>{0}</annotation>".format(text)))

    # tostring() yields ASCII bytes by default; decode so the str-based
    # replacements work on Python 3 as well as Python 2.
    serialized = etree.tostring(schema, pretty_print=True).decode('ascii')
    print(serialized.replace('<annotation>', "<xs:annotation><xs:documentation>").replace('</annotation>', "</xs:documentation></xs:annotation>"))
# Script body: collect the documentation of every page, then print the
# annotated schema.
annotations = load_remarks(htmls=list_htmls())
insert_annotations(annotations=annotations)

# Debug aid (disabled): dump one pipe-separated line per collected entry.
# for key, info in annotations.items():
#     print '{0}|{1}|{2}|{3}'.format(key, info['key_type'], info['definition'], info['remarks'])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment