fabiobatalha/load_remarks.py

## load_remarks.py
import os
import re
from lxml import etree

# XML namespace prefixes mapped to their namespace URIs.
ns = {
    "xs": "http://www.w3.org/2001/XMLSchema",
    "mml": "http://www.w3.org/1998/Math/MathML",
    "xsi": "http://www.w3.org/2001/XMLSchema-instance",
    "xlink": "http://www.w3.org/1999/xlink",
}


def list_htmls():
    """Return the documentation entries whose name contains ``html``.

    Lists ``jats-publishing-documentation-1.0`` (resolved against the
    current working directory) and keeps every entry name in which the
    substring ``html`` occurs. Bare names are returned, without the
    directory prefix, in the order ``os.listdir`` yields them.
    """
    entries = os.listdir("jats-publishing-documentation-1.0")
    return [entry for entry in entries if "html" in entry]


def cleaner(html_string):
    """Strip documentation-page markup from an HTML fragment.

    Removes the ``Remarks`` heading, every ``class="para"`` attribute, all
    newlines, and the opening/closing tags of ``<a>``, ``<div>`` and
    ``<span>`` elements (the content between those tags is kept).

    :param html_string: HTML fragment as text.
    :returns: the fragment with the markup listed above removed.
    """
    clean_html = html_string
    clean_html = re.sub(r'<h2 class="header">Remarks</h2>', '', clean_html)
    clean_html = re.sub(r' class="para"', '', clean_html)
    # Newlines go first so a tag split over several lines is matched below.
    clean_html = re.sub(r'\n', '', clean_html)
    # Unwrap <a>, <div> and <span>. The lookahead requires the tag name to
    # end right there; the previous pattern `<a(.*?)>` also deleted the
    # opening tag of <abbr>, <address>, <article>, ... and left their
    # closing tags orphaned, producing malformed markup.
    clean_html = re.sub(r'<(?:a|div|span)(?=[\s/>])[^>]*>', '', clean_html)
    clean_html = re.sub(r'</(?:a|div|span)>', '', clean_html)
    return clean_html


def clean_key(key):
    """Return *key* with every ``<``, ``>``, ``%`` and ``;`` removed."""
    return re.sub(r'[<>%;]', '', key)


def load_key(body):
    """Build the entry skeleton for the heading found under ``body[0]``.

    Queries the ``h1`` children of ``div[@class='header']``. The cleaned
    text of the first ``h1`` becomes the key; the ``class`` attribute of the
    second ``h1`` is stored under ``key_type``.

    :param body: sequence whose first item supports ``.xpath``.
    :returns: ``{key: {"key_type": <class>}}``, or ``None`` when the query
        matches nothing.
    :raises IndexError: when only one ``h1`` is matched.
    """
    headings = body[0].xpath("div[@class='header']/h1")
    if not headings:
        return None

    name = clean_key(headings[0].text)
    return {name: {"key_type": headings[1].attrib['class']}}


def load_content(body, class_name):
    """Return the cleaned markup of the first ``div`` of class *class_name*.

    :param body: sequence whose first item supports ``.xpath``.
    :param class_name: value of the ``class`` attribute to look for; it is
        interpolated verbatim into the XPath expression.
    :returns: the serialized ``div`` after ``cleaner`` has run over it, or
        ``None`` when ``body[0]`` has no such ``div`` child.
    """
    matches = body[0].xpath("div[@class='{0}']".format(class_name))
    if not matches:
        return None

    # tostring() yields an ASCII byte string by default. Decode it so the
    # text patterns used by cleaner() apply on Python 3 as well, where
    # matching a str pattern against bytes raises TypeError.
    markup = etree.tostring(matches[0]).decode('ascii')
    return cleaner(markup)


def parse_html(html):
    """Extract the key, definition and remarks of one documentation page.

    :param html: file name inside ``jats-publishing-documentation-1.0``.
    :returns: ``{key: {"key_type": ..., "definition": ..., "remarks": ...}}``
        or an empty dict when the page has no ``body`` or ``load_key``
        finds no heading.
    """
    path = "jats-publishing-documentation-1.0/{0}".format(html)
    # Hand the parser the raw bytes (no locale-dependent decoding) and make
    # sure the handle is closed even if parsing fails.
    with open(path, "rb") as handle:
        tree = etree.HTML(handle.read())

    body = tree.xpath("body")
    if not body:
        return {}

    entry = load_key(body)
    if not entry:
        return {}

    # load_key returns a single-key dict; next(iter(...)) fetches that key
    # on both Python 2 and 3 (dict.keys() is not indexable on Python 3).
    key = next(iter(entry))
    entry[key]['definition'] = load_content(body, 'definition')
    entry[key]['remarks'] = load_content(body, 'remarks')
    return entry


def load_remarks(htmls):
    """Merge the parsed entries of every page in *htmls* into one dict.

    Pages are processed in order; when two pages yield the same key the
    later page's entry replaces the earlier one.
    """
    merged = {}
    for page in htmls:
        for key, entry in parse_html(page).items():
            merged[key] = entry
    return merged


def parse_schema(xsd):
    """Placeholder: accepts *xsd*, does nothing and returns ``None``."""
    return None


def insert_annotations(annotations):
    """Print ``SciELO-journalpublishing1.xsd`` with documentation added.

    Every node of the schema that carries a ``name`` attribute, has no
    ``type`` attribute and whose name is a key of *annotations* receives a
    new first child holding the entry's definition followed by its remarks.
    The child is inserted as a plain ``<annotation>`` element and rewritten
    to ``<xs:annotation><xs:documentation>`` in the serialized output.

    :param annotations: mapping of name to a dict with ``definition`` and
        ``remarks`` entries (markup text or ``None``).
    :returns: ``None``; the annotated schema is written to stdout.
    """
    # Parse from bytes: the handle is closed deterministically, and lxml
    # rejects already-decoded text that carries an encoding declaration.
    with open("SciELO-journalpublishing1.xsd", "rb") as handle:
        schema = etree.XML(handle.read())

    for _, elem in etree.iterwalk(schema, events=("start",)):
        if 'name' not in elem.attrib or 'type' in elem.attrib:
            continue
        elem_name = elem.attrib['name']
        if elem_name not in annotations:
            continue

        entry = annotations[elem_name]
        # A missing part contributes nothing. The former
        # str(...).replace('None', '') also deleted the word "None" from
        # genuine remarks and crashed when the definition was None.
        content = (entry['definition'] or '') + (entry['remarks'] or '')
        doc = "<annotation>{0}</annotation>".format(content)
        elem.insert(0, etree.fromstring(doc))

    # tostring() returns ASCII bytes by default; decode before the textual
    # substitutions so they also work on Python 3.
    serialized = etree.tostring(schema, pretty_print=True).decode('ascii')
    serialized = serialized.replace(
        '<annotation>', "<xs:annotation><xs:documentation>")
    serialized = serialized.replace(
        '</annotation>', "</xs:documentation></xs:annotation>")
    print(serialized)

# Executed at module load: collect the entries from the HTML pages returned
# by list_htmls(), then hand them to insert_annotations().
annotations = load_remarks(list_htmls())
insert_annotations(annotations)

#for key, info in annotations.items():
    #print '{0}|{1}|{2}|{3}'.format(key, info['key_type'], info['definition'], info['remarks'])
	import os
	import re
	from lxml import etree

	# XML namespace prefixes mapped to their namespace URIs.
	ns = {
		"xs": "http://www.w3.org/2001/XMLSchema",
		"mml": "http://www.w3.org/1998/Math/MathML",
		"xsi": "http://www.w3.org/2001/XMLSchema-instance",
		"xlink": "http://www.w3.org/1999/xlink",
	}


	def list_htmls():
		"""Return the documentation entries whose name contains ``html``.

		Lists ``jats-publishing-documentation-1.0`` (resolved against the
		current working directory) and keeps every entry name in which the
		substring ``html`` occurs. Bare names are returned, without the
		directory prefix, in the order ``os.listdir`` yields them.
		"""
		# Body re-indented under the def; the flattened original could not parse.
		entries = os.listdir("jats-publishing-documentation-1.0")
		return [entry for entry in entries if "html" in entry]


	def cleaner(html_string):
		"""Strip documentation-page markup from an HTML fragment.

		Removes the ``Remarks`` heading, every ``class="para"`` attribute, all
		newlines, and the opening/closing tags of ``<a>``, ``<div>`` and
		``<span>`` elements (the content between those tags is kept).

		:param html_string: HTML fragment as text.
		:returns: the fragment with the markup listed above removed.
		"""
		clean_html = html_string
		clean_html = re.sub(r'<h2 class="header">Remarks</h2>', '', clean_html)
		clean_html = re.sub(r' class="para"', '', clean_html)
		# Newlines go first so a tag split over several lines is matched below.
		clean_html = re.sub(r'\n', '', clean_html)
		# Unwrap <a>, <div> and <span>. The lookahead requires the tag name to
		# end right there; the previous pattern `<a(.*?)>` also deleted the
		# opening tag of <abbr>, <address>, <article>, ... and left their
		# closing tags orphaned, producing malformed markup.
		clean_html = re.sub(r'<(?:a|div|span)(?=[\s/>])[^>]*>', '', clean_html)
		clean_html = re.sub(r'</(?:a|div|span)>', '', clean_html)
		return clean_html


	def clean_key(key):
		"""Return *key* with every ``<``, ``>``, ``%`` and ``;`` removed."""
		# Body re-indented under the def; the flattened original could not parse.
		return re.sub(r'[<>%;]', '', key)


	def load_key(body):
		"""Build the entry skeleton for the heading found under ``body[0]``.

		Queries the ``h1`` children of ``div[@class='header']``. The cleaned
		text of the first ``h1`` becomes the key; the ``class`` attribute of the
		second ``h1`` is stored under ``key_type``.

		:param body: sequence whose first item supports ``.xpath``.
		:returns: ``{key: {"key_type": <class>}}``, or ``None`` when the query
			matches nothing.
		:raises IndexError: when only one ``h1`` is matched.
		"""
		# Body re-indented under the def; the flattened original could not parse.
		headings = body[0].xpath("div[@class='header']/h1")
		if not headings:
			return None

		name = clean_key(headings[0].text)
		return {name: {"key_type": headings[1].attrib['class']}}


	def load_content(body, class_name):
		"""Return the cleaned markup of the first ``div`` of class *class_name*.

		:param body: sequence whose first item supports ``.xpath``.
		:param class_name: value of the ``class`` attribute to look for; it is
			interpolated verbatim into the XPath expression.
		:returns: the serialized ``div`` after ``cleaner`` has run over it, or
			``None`` when ``body[0]`` has no such ``div`` child.
		"""
		matches = body[0].xpath("div[@class='{0}']".format(class_name))
		if not matches:
			return None

		# tostring() yields an ASCII byte string by default. Decode it so the
		# text patterns used by cleaner() apply on Python 3 as well, where
		# matching a str pattern against bytes raises TypeError.
		markup = etree.tostring(matches[0]).decode('ascii')
		return cleaner(markup)


	def parse_html(html):
		"""Extract the key, definition and remarks of one documentation page.

		:param html: file name inside ``jats-publishing-documentation-1.0``.
		:returns: ``{key: {"key_type": ..., "definition": ..., "remarks": ...}}``
			or an empty dict when the page has no ``body`` or ``load_key``
			finds no heading.
		"""
		path = "jats-publishing-documentation-1.0/{0}".format(html)
		# Hand the parser the raw bytes (no locale-dependent decoding) and make
		# sure the handle is closed even if parsing fails.
		with open(path, "rb") as handle:
			tree = etree.HTML(handle.read())

		body = tree.xpath("body")
		if not body:
			return {}

		entry = load_key(body)
		if not entry:
			return {}

		# load_key returns a single-key dict; next(iter(...)) fetches that key
		# on both Python 2 and 3 (dict.keys() is not indexable on Python 3).
		key = next(iter(entry))
		entry[key]['definition'] = load_content(body, 'definition')
		entry[key]['remarks'] = load_content(body, 'remarks')
		return entry


	def load_remarks(htmls):
		"""Merge the parsed entries of every page in *htmls* into one dict.

		Pages are processed in order; when two pages yield the same key the
		later page's entry replaces the earlier one.
		"""
		# Body re-indented under the def; the flattened original could not parse.
		merged = {}
		for page in htmls:
			merged.update(parse_html(page))
		return merged


	def parse_schema(xsd):
		"""Placeholder: accepts *xsd*, does nothing and returns ``None``."""
		# Body re-indented under the def; the flattened original could not parse.
		return None


	def insert_annotations(annotations):
		"""Print ``SciELO-journalpublishing1.xsd`` with documentation added.

		Every node of the schema that carries a ``name`` attribute, has no
		``type`` attribute and whose name is a key of *annotations* receives a
		new first child holding the entry's definition followed by its remarks.
		The child is inserted as a plain ``<annotation>`` element and rewritten
		to ``<xs:annotation><xs:documentation>`` in the serialized output.

		:param annotations: mapping of name to a dict with ``definition`` and
			``remarks`` entries (markup text or ``None``).
		:returns: ``None``; the annotated schema is written to stdout.
		"""
		# Parse from bytes: the handle is closed deterministically, and lxml
		# rejects already-decoded text that carries an encoding declaration.
		with open("SciELO-journalpublishing1.xsd", "rb") as handle:
			schema = etree.XML(handle.read())

		for _, elem in etree.iterwalk(schema, events=("start",)):
			if 'name' not in elem.attrib or 'type' in elem.attrib:
				continue
			elem_name = elem.attrib['name']
			if elem_name not in annotations:
				continue

			entry = annotations[elem_name]
			# A missing part contributes nothing. The former
			# str(...).replace('None', '') also deleted the word "None" from
			# genuine remarks and crashed when the definition was None.
			content = (entry['definition'] or '') + (entry['remarks'] or '')
			doc = "<annotation>{0}</annotation>".format(content)
			elem.insert(0, etree.fromstring(doc))

		# tostring() returns ASCII bytes by default; decode before the textual
		# substitutions so they also work on Python 3.
		serialized = etree.tostring(schema, pretty_print=True).decode('ascii')
		serialized = serialized.replace(
			'<annotation>', "<xs:annotation><xs:documentation>")
		serialized = serialized.replace(
			'</annotation>', "</xs:documentation></xs:annotation>")
		print(serialized)

	# Executed at module load: collect the entries from the HTML pages returned
	# by list_htmls(), then hand them to insert_annotations().
	annotations = load_remarks(list_htmls())
	insert_annotations(annotations)

	#for key, info in annotations.items():
	#print '{0}\|{1}\|{2}\|{3}'.format(key, info['key_type'], info['definition'], info['remarks'])