Skip to content

Instantly share code, notes, and snippets.

@So-Cool So-Cool/extractor.py
Last active Oct 20, 2017

Embed
What would you like to do?
Prettify ugly MS Word HTML
import os
import sys
from bs4 import BeautifulSoup

# Pull every <div class="extract ..."> block out of an MS-Word-exported HTML
# file and save each one as its own file in the output directory, named
# "<second-class>_<id>.html".
if len(sys.argv) != 3:
    sys.exit("First argument is HTML second is an output directory.")

with open(sys.argv[1], "r") as html_file:
    soup = BeautifulSoup(html_file.read(), "html.parser")

# swish, figure, exercise, infobox
extract_divs = soup.findAll("div", {"class": "extract"})
for block in extract_divs:
    out_name = block["class"][1] + "_" + block["id"] + ".html"
    with open(os.path.join(sys.argv[2], out_name), 'w') as out_file:
        out_file.write(str(block))
from bs4 import BeautifulSoup, Comment, NavigableString
# from pprint import pprint
# import sys
# Template for generated CSS class names, e.g. "AutoStyle07".
style_name = "AutoStyle%02d"
# Next free number to hand out for style_name.
style_counter = 0
# style string -> list of "tag.class" selectors that carry that style.
stylesCSS = {}
# style string -> the AutoStyle number assigned to it.
stylesCSSno = {}
def writeCSStoFile(CSS, CSSno):
    """Dump the collected auto-generated styles to common_style.css.

    CSS   -- dict mapping a style string to the list of "tag.class"
             selectors that carry it.
    CSSno -- dict mapping the same style strings to their AutoStyle number.

    Uses the module-level `style_name` template for the class names.
    NOTE(review): Python 2 code -- `encode` yields `str` here.
    """
    s = ""
    for style in CSS:
        # One selector per tag, all sharing the generated AutoStyleNN class.
        # Joining with ',' (instead of appending commas and chopping the last
        # character) avoids eating the previous block's newline when
        # CSS[style] happens to be an empty list.
        selectors = [tag.encode("utf-8") + '.' + style_name % CSSno[style]
                     for tag in CSS[style]]
        s += ','.join(selectors) + '{\n'
        # Re-emit every non-empty declaration on its own indented line.
        defs = [d for d in style.split(';') if d.strip()]
        for d in defs:
            s += ' ' + d.encode("utf-8") + ';\n'
        s += '}\n'
    with open('common_style.css', 'w') as CSSfile:
        CSSfile.write(s)
# Accumulated across all input files: every tag.class / tag.style seen.
tag_list = []
style_list = []
# Word-exported HTML sources to clean; output drops the ".dat" suffix.
soupNames = ["../simply-logical/Part I.htm.dat",
             "../simply-logical/Part II.htm.dat",
             "../simply-logical/Part III.htm.dat",
             "../simply-logical/Appendix.htm.dat"]
# Main clean-up pass: for every exported Word HTML file, scrub the MS Office
# markup, normalise whitespace, replace inline styles with generated classes,
# and write the cleaned document back next to the original (".dat" stripped).
# NOTE(review): Python 2 code (`unicode`, print statements) -- run with py2.
for soupName in soupNames:
    # NOTE(review): file handle from open() is never closed explicitly.
    soup = BeautifulSoup(open(soupName), "html.parser")
    # Replace \n with blank in tag parameters (attributes).
    # Don't do pre tags which are swish blocks.
    for tag in soup.findAll(True):
        if tag.name == 'pre':
            continue
        for attr in tag.attrs:
            if type(tag[attr]) == str or type(tag[attr]) == unicode:
                # Single-valued attribute: join its lines, stripping edges.
                x = tag[attr].encode("utf-8").split('\n')
                x = [xx.strip() for xx in x]
                x = ''.join(x)
                tag[attr] = unicode(x, 'utf-8')
            else:
                # Multi-valued attribute (e.g. a class list): clean each value.
                for i in range(len(tag[attr])):
                    x = tag[attr][i].encode("utf-8").split('\n')
                    x = [xx.strip() for xx in x]
                    x = ''.join(x)
                    tag[attr][i] = unicode(x, 'utf-8')
    # Remove HTML comments.
    for element in soup(text=lambda text: isinstance(text, Comment)):
        element.extract()
    # Remove empty tags; loop until a pass removes nothing, since extracting
    # a child can leave its parent empty.
    et = [1]
    while et:
        empty_tags = soup.findAll(lambda tag: not tag.name == 'img' and not 'br' in tag.name and not tag.contents and (tag.string is None or not tag.string.strip()))
        et = [empty_tag.extract() for empty_tag in empty_tags]
    # Remove some attrs form tags (Word-only bidi hints, default language).
    for tag in soup.findAll(True):
        try:
            if tag['style'] == 'mso-bidi-font-weight:normal' or tag['style'] == 'mso-bidi-font-style:normal':
                del tag['style']
        except:
            pass
        try:
            if tag['lang'] == 'EN-US':
                del tag['lang']
        except:
            pass
    # <span style='font-family:Courier'></span> into <tt></tt>
    for a in soup.findAll('span'):
        try:
            if a['style'] == 'font-family:Courier' and len(a.attrs) == 1:
                del a['style']
                a.name = 'tt'
        except:
            pass
    # Drop tags hidden via display:none.
    for tag in soup.findAll():
        try:
            style = tag['style']
            style = style.split(';')
            for s in style:
                if 'display:none' in s:
                    tag.extract()
                    break
        except:
            pass
    # Remove all *mso* `style` declarations and other Word-only noise;
    # delete the style attribute entirely when nothing survives.
    for tag in soup.findAll():
        try:
            style = tag['style']
            style = style.split(';')
            style = [s for s in style if 'mso' not in s and
                     'tab-stops' not in s and
                     'page-break' not in s and
                     'font-family:YuTimes' not in s and
                     'font-family:Extra' not in s and
                     'Avant Garde' not in s]
            style = ';'.join(style)
            if style:
                tag['style'] = style
            else:
                del tag['style']
        except:
            pass
    # Unwrap attribute-less <span>s and Word's <o:p> wrappers.
    for tag in soup.findAll('span'):
        if not tag.attrs:
            tag.replaceWithChildren()
    for tag in soup.findAll('o:p'):
        tag.replaceWithChildren()
    # Remove tags but leave their content only if the specified parameter is
    # the only one in the tag (Word bookmark anchors / spacer runs).
    bks = ['bk'+str(i) for i in range(10)]
    bks_style = ['mso-bookmark:'+i for i in bks]
    invalid_tags = {
        'a': {'name': bks},
        'span': {'style': bks_style+["mso-spacerun:yes"]}
    }
    for tag in invalid_tags:
        for match in soup.findAll(tag):
            # Each inner dict holds exactly one attribute, so keys()[-1]
            # simply fetches it (py2: keys() is a list).
            attr = invalid_tags[match.name].keys()[-1]
            try:
                if len(match.attrs) == 1 and match[attr] in invalid_tags[match.name][attr]:
                    match.replaceWithChildren()
            except:
                pass
    # Remove <span> without attributes (again, after the unwrapping above).
    for tag in ['span']:
        for match in soup.findAll(tag):
            if len(match.attrs) == 0:
                match.replaceWithChildren()
    # Remove empty tags - AGAIN; the unwrapping may have created new ones.
    et = [1]
    while et:
        empty_tags = soup.findAll(lambda tag: not tag.name == 'img' and not 'br' in tag.name and not tag.contents and (tag.string is None or not tag.string.strip()))
        et = [empty_tag.extract() for empty_tag in empty_tags]
    # Replace newlines inside tag content with spaces (never inside <pre>).
    for tag in soup.findAll(True):
        if tag.name == 'pre' or not tag.contents:
            continue
        i = tag.contents[0]
        while i:
            # Remember the successor first: i may be extracted below.
            j = i.next_sibling
            if isinstance(i, NavigableString) and i.parent.name != 'pre':
                x = i.encode("utf-8").strip().split('\n')
                x = ' '.join(x)
                if x:
                    if isinstance(i.previous_sibling, NavigableString) and i.parent.name != 'pre':
                        # Merge this text node into the preceding one.
                        prev = i.previous_sibling.encode("utf-8")
                        this = soup.new_string(unicode(prev.strip()+x.strip(), 'utf-8'))
                        i.previous_sibling.replace_with(this)
                        i.extract()
                        i = j
                        continue
                    this = soup.new_string(unicode(x, 'utf-8'))
                    i.replace_with(this)
                else:
                    # Whitespace-only node: drop it.
                    i.extract()
            i = j
    # Remove empty tags once more, reporting any attributed tag that is
    # about to be dropped.
    eto = [1]
    while eto:
        eto = []
        empty_tags = soup.findAll(lambda tag: not tag.contents and (tag.string is None or not tag.string.strip()))
        for et in empty_tags:
            if not et.attrs and et.name != 'br' and et.name != 'img':
                eto.append(et.extract())
            elif et.name != 'br' and et.name != 'img':
                print "Removeing: ", et
                eto.append(et.extract())
    # Replace footnote stylesheets.
    def getNameClass(node):
        # "name.class" strings for a node (just the name if it has no class).
        n = [node.name]
        try:
            n = [node.name + '.' + c for c in node['class']]
        except:
            pass
        return n
    def ancestralSet(node):
        # "name.class" strings for every ancestor of node (root stops the
        # recursion by raising inside getNameClass).
        try:
            return getNameClass(node.parent) + ancestralSet(node.parent)
        except:
            return []
    # Unwrap footnote references nested inside another footnote reference.
    for s in soup.findAll('span', {"class": "MsoFootnoteReference"}):
        if 'span.MsoFootnoteReference' in ancestralSet(s):
            s.replaceWithChildren()
    # Replace <abar> with <span class="CustomFootnote"> (unwrap it when it
    # already sits inside footnote text).
    for s in soup.findAll('abar'):
        if 'p.MsoFootnoteText' in ancestralSet(s):
            s.replaceWithChildren()
        else:
            s.name = 'span'
            s['class'] = "CustomFootnote"
    # Generate classes instead of styles: every distinct style string gets an
    # AutoStyleNN class; records go into the module-level stylesCSS /
    # stylesCSSno for writeCSStoFile.
    for tag in soup.findAll():
        try:
            if tag['style'] in stylesCSS:
                # Known style: reuse its AutoStyle class.
                try:
                    tn = [tag.name + '.' + c for c in tag['class']]
                    tag['class'].append(style_name % stylesCSSno[tag['style']])
                except:
                    tn = [tag.name]
                    tag['class'] = [style_name % stylesCSSno[tag['style']]]
                for t in tn:
                    if t not in stylesCSS[tag['style']]:
                        stylesCSS[tag['style']] += [t]
                del tag['style']
            else:
                # New style: remove style and add a freshly numbered class.
                stylesCSSno[tag['style']] = style_counter
                try:
                    stylesCSS[tag['style']] = [tag.name + '.' + c for c in tag['class']]
                    tag['class'].append(style_name % stylesCSSno[tag['style']])
                except:
                    stylesCSS[tag['style']] = [tag.name]
                    tag['class'] = [style_name % stylesCSSno[tag['style']]]
                del tag['style']
                style_counter += 1
        except:
            pass
    # Write the cleaned tree out, then re-read it as plain text for the
    # string-level passes below.
    with open(soupName[:-4], 'w') as wfile:
        wfile.write(soup.prettify(encoding='utf-8', formatter='html'))
    with open(soupName[:-4], 'r') as wfile:
        clean = wfile.read()
    def cleanUp(txt, ls):
        # Remove every marker in ls and re-join the surviving fragments,
        # one per line.
        for i in ls:
            txt = txt.split(i)
            txt = [tt.strip() for tt in txt]
            txt = filter(None, txt)
            txt = '\n'.join(txt)
        return txt
    clean = cleanUp(clean, ['if !vml', 'if !supportFootnotes', 'endif'])
    soup = BeautifulSoup(clean, "html.parser")
    puretxt = soup.prettify(encoding='utf-8', formatter='html')
    puretxt = cleanUp(puretxt, ['</br>'])
    # <br> cleanup and preserve indent.
    puretxt = puretxt.replace('<br/>', '<br>')
    brs = puretxt.split('<br>')
    brsprs = brs[0].rstrip()
    for br_i in range(1, len(brs)):
        # Indent the continuation to match the current last line's indent.
        space_count = brsprs.rfind('\n') + 1
        space_count = brsprs[space_count:]
        space_count = len(space_count) - len(space_count.lstrip(' '))
        brsprs += '<br>\n' + space_count*' ' + brs[br_i].strip()
    puretxt = brsprs
    # NOTE(review): cleanUp is redefined here -- from this point on the
    # tag-joining variant below is the one in effect.
    def cleanUp(txt, ls):
        # Re-join text around inline tags (<tt>, <i>, <b>, ...) so that
        # prettify's line breaks do not introduce spurious spaces around
        # punctuation or between adjacent identical tags.
        for i in ls:
            txt = txt.split(i)
            txt = [tt for tt in txt]
            txt = filter(None, txt)
            # my join implementation
            if txt == []:
                txt = ""
            else:
                s = txt[0]
                for j in txt[1:]:
                    if '/' in i:
                        # punctuation mark afterwards
                        if j.strip()[0] in ['.', ',', '?', '!', ':', ';', ')', ']', '\'', '"', '/', '&rsquo;', '&gt;']:
                            s = s.rstrip() + i + j.lstrip()
                        elif j.strip()[0:6] == '&nbsp;':
                            s = s.rstrip() + i + j.lstrip()
                        # two adjacent identical tags: join the words,
                        # like <i>text</i><i>another text</i>
                        elif j.strip()[:len(i.replace('/', ''))] == i.replace('/', ''):
                            # remove the tag from the beginning of j and merge
                            s = s.rstrip() + j.strip()[len(i.replace('/', '')):].lstrip()
                        # another tag which is not in the list: do not remove the newline
                        elif j.strip()[0] == '<' and (j.strip()[:j.strip().find(' ')]+'>') not in ls:
                            # if the previous symbol is a tag as well
                            if s.strip()[-1] == '>' and s.rstrip()[s.strip().rfind('<'):] not in ls:
                                s += i + j
                            else:
                                s = s.rstrip() + i + j
                        else:  # ['(', '[', '-']
                            s = s.rstrip() + i + ' ' + j.lstrip()
                    else:  # tag before: if same merge, if different keep indent
                        if s.rstrip()[-1] in ['(', '[', '\'', '"', '&lsquo;', '&lt;']:
                            s = s.rstrip() + i + j.lstrip()
                        # non breaking space before
                        elif s.strip()[-6:] == '&nbsp;':
                            s = s.rstrip() + i + j.lstrip()
                        # if a different tag comes before, do not remove indent
                        elif s.strip()[-1] == '>' and s.rstrip()[s.strip().rfind('<'):] not in ls:
                            # if the next is a tag as well
                            if j.strip()[0] == '<' and (j.strip()[:j.strip().find(' ')]+'>') not in ls:
                                s += i + j
                            else:
                                s += i + j.lstrip()
                        # first thing inside a bracket or parenthesis
                        else:
                            s = s.rstrip() + ' ' + i + j.lstrip()
                txt = s
        return txt
    puretxt = cleanUp(puretxt, ['<tt>', '</tt>', '<i>', '</i>', '<b>', '</b>'])
    with open(soupName[:-4], 'w') as wfile:
        wfile.write(puretxt)
    # Collect every tag.class (and tag.style) combination present in the
    # cleaned document, for the summary printed after the loop.
    tagSoup = BeautifulSoup(puretxt, "html.parser")
    for tag in tagSoup.findAll():
        try:
            classes = tag['class']
            if classes:
                for c in classes:
                    if 'AutoStyle' not in c:
                        tc = tag.name + '.' + c
                    else:
                        tc = tag.name
                    if tc not in tag_list:
                        tag_list.append(tc)
            else:
                if tag.name not in tag_list:
                    tag_list.append(tag.name)
        except:
            if tag.name not in tag_list:
                tag_list.append(tag.name)
        try:
            tc = tag.name + '.' + tag['style']
            if tc not in style_list:
                style_list.append(tc)
        except:
            if tag.name not in style_list:
                style_list.append(tag.name)
# Print every tag.class combination that survived clean-up, sorted.
tag_list = [t.encode("utf-8") for t in tag_list]; tag_list.sort()
style_list = [t.encode("utf-8") for t in style_list]; style_list.sort()
print "Tag.Class"
print tag_list
# Save new styles: list the font-family declarations that ended up in the
# generated style map, then dump it all to common_style.css.
font_styles = []
for f in stylesCSS.keys():
    ff = f.split(';')
    for ffi in ff:
        if 'font-family' in ffi:
            if ffi not in font_styles:
                font_styles.append(ffi.encode('utf-8'))
print font_styles
writeCSStoFile(stylesCSS, stylesCSSno)
import pyparsing, sys
def getFontFace(styles):
    """Extract the font name from the first 'font-family' declaration
    found in a semicolon-separated CSS style string.

    Quotes, colons and surrounding whitespace are peeled off the value.
    Exits the program if no font-family declaration is present.
    """
    for declaration in styles.split(';'):
        if 'font-family' not in declaration:
            continue
        face = declaration.strip().replace('font-family', '')
        face = face.strip().strip(':').strip()
        return face.strip('\'').strip('\"').strip()
    sys.exit('Font-family not found')
def setOverlap(styleStr, listStyleStr):
    """Return True when no selector in the comma-separated styleStr occurs
    in any of the comma-separated strings in listStyleStr; False otherwise.

    (Despite the name, True means the sets are disjoint.)
    """
    candidates = set(part.strip() for part in styleStr.split(','))
    known = set()
    for entry in listStyleStr:
        known.update(part.strip() for part in entry.split(','))
    return not (candidates & known)
# Parsed CSS collected from the per-part stylesheets.
STYLES = {}     # selector string -> formatted declaration block
FONT_FACE = {}  # font name -> @font-face declaration block
PAGE = {}       # "@page ..." selector -> declaration block
# Selectors and fonts actually used by the cleaned HTML; everything else
# is filtered out before writing mso.css.
used_styles = ['a', 'b', 'br', 'div', 'div.WordSection1', 'div.WordSection2', 'div.WordSection3', 'div.WordSection4', 'h1', 'h2', 'h3', 'hr', 'i', 'img', 'p', 'p.Caption1', 'p.MsoFootnoteText', 'p.MsoNormal', 'p.answer', 'p.caption', 'p.cijfer', 'p.citaat', 'p.exercise', 'p.figure', 'p.formule', 'p.grammar', 'p.inter-title', 'p.intermezzo', 'p.med-caption', 'p.med-figure', 'p.oms', 'p.oms-eerst', 'p.opsomming', 'p.p-eerst', 'p.p-el', 'p.p-laatst', 'p.pi', 'p.pi-eerst', 'p.pi-el', 'p.pi-laatst', 'p.programma', 'p.query', 'p.referenties', 'p.romeinscijfer', 'p.sektie', 'p.sektie1', 'p.small-caption', 'p.small-figure', 'p.tekst', 'pre', 'pre.inherit', 'pre.query', 'pre.source', 'pre.swish', 'pre.temp', 'script', 'span', 'span.CustomFootnote', 'span.MsoFootnoteReference', 'span.query', 'span.swish', 'table', 'tbody', 'td', 'tr', 'tt']
used_fonts = ['font-family:Symbol', 'font-family:Times', 'font-family:Helvetica', 'font-family:Courier']
# Destination for the filtered stylesheet.
out_file = "mso.css"
CSS_files = ["../simply-logical/bootstrap/css/" + i for i in ["Part_I.css", "Part_II.css", "Part_III.css", "Appendix.css"]]
# Read each stylesheet into memory.
CSSs = []
for i in CSS_files:
    with open(i, 'r') as i_file:
        CSSs.append(i_file.read())
# Remove all /* ... */ comments before parsing.
comment = pyparsing.nestedExpr("/*", "*/").suppress()
CSSs = [comment.transformString(i) for i in CSSs]
# Parse each stylesheet into (selector, declarations) pairs and merge them
# into the STYLES / FONT_FACE / PAGE maps, complaining about conflicts.
for css in CSSs:
    # Splitting on '{' then '}' yields alternating selector / body chunks.
    css_split = []
    for c in css.split('{'):
        css_split += [i.strip() for i in c.split('}') if i.strip()]
    # Even indices: selector lists (normalised to "a,b,c").
    names = []
    for n in css_split[0:][::2]:
        name = n.split(',')
        name = [nam.strip() for nam in name if nam.strip()]
        name = ','.join(name)
        names.append(name)
    # Odd indices: declaration bodies, with Word-only properties dropped and
    # each surviving declaration on its own indented line.
    tags = []
    for n in css_split[1:][::2]:
        tag = n.split(';')
        tag = [t.strip() for t in tag if t.strip()]
        tag = [t for t in tag if 'mso-' not in t and 'tab-stop' not in t]
        tag_formatted = ""
        for t in tag:
            tag_formatted += ' ' + t + ';\n'
        tags.append(tag_formatted)
    if len(tags) != len(names):
        sys.exit("Tags and names do not agree")
    # Remove rules whose body became empty after filtering (iterate in
    # reverse so pop() indices stay valid).
    for i in range(len(tags))[::-1]:
        if not tags[i]:
            tags.pop(i)
            names.pop(i)
    # Divide the rules into the right placeholders.
    for i in range(len(tags)):
        if '@font-face' in names[i]:
            if getFontFace(tags[i]) in FONT_FACE:
                # Same font declared twice: warn if the bodies differ.
                if FONT_FACE[getFontFace(tags[i])] != tags[i]:
                    print('Font face does not agree\n' + tags[i] +'\n dic:\n'+FONT_FACE[getFontFace(tags[i])])
            else:
                FONT_FACE[getFontFace(tags[i])] = tags[i]
        elif '@page' in names[i]:
            if names[i] in PAGE:
                if PAGE[names[i]] != tags[i]:
                    sys.exit('@Page does not agree\n ' + tags[i])
            else:
                PAGE[names[i]] = tags[i]
        else:
            if names[i] in STYLES:
                # Conflicting redefinition: keep both, second one commented.
                if STYLES[names[i]] != tags[i]:
                    print('Styles do not agree:'+names[i]+'\n' + tags[i] +'\nand\n' + STYLES[names[i]])
                    STYLES[names[i]] += '/*' + tags[i] + '*/'
            else:
                # Refuse selector lists that share a selector with an
                # already-stored rule under a different key.
                if setOverlap(names[i], STYLES.keys()):
                    STYLES[names[i]] = tags[i]
                else:
                    sys.exit("Keys set is overlapping")
# Filter @page rules: keep only those whose page name appears in some used
# selector. NOTE(review): deleting from a dict while looping over .keys()
# is safe in Python 2 only, where keys() returns a list snapshot.
delMe = True
for i in PAGE.keys():
    delMe = True
    # NOTE(review): strip('@page') strips a *character set* ('@','p','a',
    # 'g','e'), not the literal prefix -- confirm this is intended.
    ni = i.strip().strip('@page').strip()
    if not ni:
        # Unnamed @page rule: always kept.
        continue
    else:
        for j in used_styles:
            if ni in j:
                delMe = False
                break
    if delMe:
        del PAGE[i]
# Filter STYLES: drop every rule sharing no selector with used_styles
# (setOverlap returns True when the sets are disjoint).
delMe = True
for i in STYLES.keys():
    delMe = True
    if setOverlap(i, used_styles):
        del STYLES[i]
# Filter FONT_FACE: keep only fonts listed in used_fonts.
delMe = True
for i in FONT_FACE.keys():
    delMe = True
    if 'font-family:'+i in used_fonts:
        delMe = False
    if delMe:
        del FONT_FACE[i]
# Emit the surviving rules: fonts first, then @page, then ordinary styles.
with open(out_file, 'w') as of:
    s = ""
    for i in FONT_FACE:
        s += '@font-face' + '{\n' + FONT_FACE[i] + '}\n'
    for i in PAGE:
        s += i + '{\n' + PAGE[i] + '}\n'
    for i in STYLES:
        s += i + '{\n' + STYLES[i] + '}\n'
    of.write(s)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.