Skip to content

Instantly share code, notes, and snippets.

@So-Cool So-Cool/extractor.py
Last active Oct 20, 2017

Embed
What would you like to do?
Prettify ugly MS Word HTML
import os
import sys
from bs4 import BeautifulSoup

# Pull every <div class="extract ..."> block out of an MS-Word-exported HTML
# file and save each one as its own file in the output directory, named
# "<second-class>_<id>.html".
if len(sys.argv) != 3:
    sys.exit("First argument is HTML second is an output directory.")

with open(sys.argv[1], "r") as html_file:
    soup = BeautifulSoup(html_file.read(), "html.parser")

# swish, figure, exercise, infobox
extract_divs = soup.findAll("div", {"class": "extract"})
for block in extract_divs:
    out_name = block["class"][1] + "_" + block["id"] + ".html"
    with open(os.path.join(sys.argv[2], out_name), 'w') as out_file:
        out_file.write(str(block))
from bs4 import BeautifulSoup, Comment, NavigableString
# from pprint import pprint
# import sys
# Template for generated CSS class names, e.g. "AutoStyle07".
style_name = "AutoStyle%02d"
# Next free number to hand out for style_name.
style_counter = 0
# style string -> list of "tag.class" selectors that carry that style.
stylesCSS = {}
# style string -> the AutoStyle number assigned to it.
stylesCSSno = {}
def writeCSStoFile(CSS, CSSno):
    """Dump the collected auto-generated styles to common_style.css.

    CSS   -- dict mapping a style string to the list of "tag.class"
             selectors that carry it.
    CSSno -- dict mapping the same style strings to their AutoStyle number.

    Uses the module-level `style_name` template for the class names.
    NOTE(review): Python 2 code -- `encode` yields `str` here.
    """
    s = ""
    for style in CSS:
        # One selector per tag, all sharing the generated AutoStyleNN class.
        # Joining with ',' (instead of appending commas and chopping the last
        # character) avoids eating the previous block's newline when
        # CSS[style] happens to be an empty list.
        selectors = [tag.encode("utf-8") + '.' + style_name % CSSno[style]
                     for tag in CSS[style]]
        s += ','.join(selectors) + '{\n'
        # Re-emit every non-empty declaration on its own indented line.
        defs = [d for d in style.split(';') if d.strip()]
        for d in defs:
            s += ' ' + d.encode("utf-8") + ';\n'
        s += '}\n'
    with open('common_style.css', 'w') as CSSfile:
        CSSfile.write(s)
# Accumulated across all input files: every tag.class / tag.style seen.
tag_list = []
style_list = []
# Word-exported HTML sources to clean; output drops the ".dat" suffix.
soupNames = ["../simply-logical/Part I.htm.dat",
             "../simply-logical/Part II.htm.dat",
             "../simply-logical/Part III.htm.dat",
             "../simply-logical/Appendix.htm.dat"]
# Main clean-up pass: for every exported Word HTML file, scrub the MS Office
# markup, normalise whitespace, replace inline styles with generated classes,
# and write the cleaned document back next to the original (".dat" stripped).
# NOTE(review): Python 2 code (`unicode`, print statements) -- run with py2.
for soupName in soupNames:
    # NOTE(review): file handle from open() is never closed explicitly.
    soup = BeautifulSoup(open(soupName), "html.parser")
    # Replace \n with blank in tag parameters (attributes).
    # Don't do pre tags which are swish blocks.
    for tag in soup.findAll(True):
        if tag.name == 'pre':
            continue
        for attr in tag.attrs:
            if type(tag[attr]) == str or type(tag[attr]) == unicode:
                # Single-valued attribute: join its lines, stripping edges.
                x = tag[attr].encode("utf-8").split('\n')
                x = [xx.strip() for xx in x]
                x = ''.join(x)
                tag[attr] = unicode(x, 'utf-8')
            else:
                # Multi-valued attribute (e.g. a class list): clean each value.
                for i in range(len(tag[attr])):
                    x = tag[attr][i].encode("utf-8").split('\n')
                    x = [xx.strip() for xx in x]
                    x = ''.join(x)
                    tag[attr][i] = unicode(x, 'utf-8')
    # Remove HTML comments.
    for element in soup(text=lambda text: isinstance(text, Comment)):
        element.extract()
    # Remove empty tags; loop until a pass removes nothing, since extracting
    # a child can leave its parent empty.
    et = [1]
    while et:
        empty_tags = soup.findAll(lambda tag: not tag.name == 'img' and not 'br' in tag.name and not tag.contents and (tag.string is None or not tag.string.strip()))
        et = [empty_tag.extract() for empty_tag in empty_tags]
    # Remove some attrs form tags (Word-only bidi hints, default language).
    for tag in soup.findAll(True):
        try:
            if tag['style'] == 'mso-bidi-font-weight:normal' or tag['style'] == 'mso-bidi-font-style:normal':
                del tag['style']
        except:
            pass
        try:
            if tag['lang'] == 'EN-US':
                del tag['lang']
        except:
            pass
    # <span style='font-family:Courier'></span> into <tt></tt>
    for a in soup.findAll('span'):
        try:
            if a['style'] == 'font-family:Courier' and len(a.attrs) == 1:
                del a['style']
                a.name = 'tt'
        except:
            pass
    # Drop tags hidden via display:none.
    for tag in soup.findAll():
        try:
            style = tag['style']
            style = style.split(';')
            for s in style:
                if 'display:none' in s:
                    tag.extract()
                    break
        except:
            pass
    # Remove all *mso* `style` declarations and other Word-only noise;
    # delete the style attribute entirely when nothing survives.
    for tag in soup.findAll():
        try:
            style = tag['style']
            style = style.split(';')
            style = [s for s in style if 'mso' not in s and
                     'tab-stops' not in s and
                     'page-break' not in s and
                     'font-family:YuTimes' not in s and
                     'font-family:Extra' not in s and
                     'Avant Garde' not in s]
            style = ';'.join(style)
            if style:
                tag['style'] = style
            else:
                del tag['style']
        except:
            pass
    # Unwrap attribute-less <span>s and Word's <o:p> wrappers.
    for tag in soup.findAll('span'):
        if not tag.attrs:
            tag.replaceWithChildren()
    for tag in soup.findAll('o:p'):
        tag.replaceWithChildren()
    # Remove tags but leave their content only if the specified parameter is
    # the only one in the tag (Word bookmark anchors / spacer runs).
    bks = ['bk'+str(i) for i in range(10)]
    bks_style = ['mso-bookmark:'+i for i in bks]
    invalid_tags = {
        'a': {'name': bks},
        'span': {'style': bks_style+["mso-spacerun:yes"]}
    }
    for tag in invalid_tags:
        for match in soup.findAll(tag):
            # Each inner dict holds exactly one attribute, so keys()[-1]
            # simply fetches it (py2: keys() is a list).
            attr = invalid_tags[match.name].keys()[-1]
            try:
                if len(match.attrs) == 1 and match[attr] in invalid_tags[match.name][attr]:
                    match.replaceWithChildren()
            except:
                pass
    # Remove <span> without attributes (again, after the unwrapping above).
    for tag in ['span']:
        for match in soup.findAll(tag):
            if len(match.attrs) == 0:
                match.replaceWithChildren()
    # Remove empty tags - AGAIN; the unwrapping may have created new ones.
    et = [1]
    while et:
        empty_tags = soup.findAll(lambda tag: not tag.name == 'img' and not 'br' in tag.name and not tag.contents and (tag.string is None or not tag.string.strip()))
        et = [empty_tag.extract() for empty_tag in empty_tags]
    # Replace newlines inside tag content with spaces (never inside <pre>).
    for tag in soup.findAll(True):
        if tag.name == 'pre' or not tag.contents:
            continue
        i = tag.contents[0]
        while i:
            # Remember the successor first: i may be extracted below.
            j = i.next_sibling
            if isinstance(i, NavigableString) and i.parent.name != 'pre':
                x = i.encode("utf-8").strip().split('\n')
                x = ' '.join(x)
                if x:
                    if isinstance(i.previous_sibling, NavigableString) and i.parent.name != 'pre':
                        # Merge this text node into the preceding one.
                        prev = i.previous_sibling.encode("utf-8")
                        this = soup.new_string(unicode(prev.strip()+x.strip(), 'utf-8'))
                        i.previous_sibling.replace_with(this)
                        i.extract()
                        i = j
                        continue
                    this = soup.new_string(unicode(x, 'utf-8'))
                    i.replace_with(this)
                else:
                    # Whitespace-only node: drop it.
                    i.extract()
            i = j
    # Remove empty tags once more, reporting any attributed tag that is
    # about to be dropped.
    eto = [1]
    while eto:
        eto = []
        empty_tags = soup.findAll(lambda tag: not tag.contents and (tag.string is None or not tag.string.strip()))
        for et in empty_tags:
            if not et.attrs and et.name != 'br' and et.name != 'img':
                eto.append(et.extract())
            elif et.name != 'br' and et.name != 'img':
                print "Removeing: ", et
                eto.append(et.extract())
    # Replace footnote stylesheets.
    def getNameClass(node):
        # "name.class" strings for a node (just the name if it has no class).
        n = [node.name]
        try:
            n = [node.name + '.' + c for c in node['class']]
        except:
            pass
        return n
    def ancestralSet(node):
        # "name.class" strings for every ancestor of node (root stops the
        # recursion by raising inside getNameClass).
        try:
            return getNameClass(node.parent) + ancestralSet(node.parent)
        except:
            return []
    # Unwrap footnote references nested inside another footnote reference.
    for s in soup.findAll('span', {"class": "MsoFootnoteReference"}):
        if 'span.MsoFootnoteReference' in ancestralSet(s):
            s.replaceWithChildren()
    # Replace <abar> with <span class="CustomFootnote"> (unwrap it when it
    # already sits inside footnote text).
    for s in soup.findAll('abar'):
        if 'p.MsoFootnoteText' in ancestralSet(s):
            s.replaceWithChildren()
        else:
            s.name = 'span'
            s['class'] = "CustomFootnote"
    # Generate classes instead of styles: every distinct style string gets an
    # AutoStyleNN class; records go into the module-level stylesCSS /
    # stylesCSSno for writeCSStoFile.
    for tag in soup.findAll():
        try:
            if tag['style'] in stylesCSS:
                # Known style: reuse its AutoStyle class.
                try:
                    tn = [tag.name + '.' + c for c in tag['class']]
                    tag['class'].append(style_name % stylesCSSno[tag['style']])
                except:
                    tn = [tag.name]
                    tag['class'] = [style_name % stylesCSSno[tag['style']]]
                for t in tn:
                    if t not in stylesCSS[tag['style']]:
                        stylesCSS[tag['style']] += [t]
                del tag['style']
            else:
                # New style: remove style and add a freshly numbered class.
                stylesCSSno[tag['style']] = style_counter
                try:
                    stylesCSS[tag['style']] = [tag.name + '.' + c for c in tag['class']]
                    tag['class'].append(style_name % stylesCSSno[tag['style']])
                except:
                    stylesCSS[tag['style']] = [tag.name]
                    tag['class'] = [style_name % stylesCSSno[tag['style']]]
                del tag['style']
                style_counter += 1
        except:
            pass
    # Write the cleaned tree out, then re-read it as plain text for the
    # string-level passes below.
    with open(soupName[:-4], 'w') as wfile:
        wfile.write(soup.prettify(encoding='utf-8', formatter='html'))
    with open(soupName[:-4], 'r') as wfile:
        clean = wfile.read()
    def cleanUp(txt, ls):
        # Remove every marker in ls and re-join the surviving fragments,
        # one per line.
        for i in ls:
            txt = txt.split(i)
            txt = [tt.strip() for tt in txt]
            txt = filter(None, txt)
            txt = '\n'.join(txt)
        return txt
    clean = cleanUp(clean, ['if !vml', 'if !supportFootnotes', 'endif'])
    soup = BeautifulSoup(clean, "html.parser")
    puretxt = soup.prettify(encoding='utf-8', formatter='html')
    puretxt = cleanUp(puretxt, ['</br>'])
    # <br> cleanup and preserve indent.
    puretxt = puretxt.replace('<br/>', '<br>')
    brs = puretxt.split('<br>')
    brsprs = brs[0].rstrip()
    for br_i in range(1, len(brs)):
        # Indent the continuation to match the current last line's indent.
        space_count = brsprs.rfind('\n') + 1
        space_count = brsprs[space_count:]
        space_count = len(space_count) - len(space_count.lstrip(' '))
        brsprs += '<br>\n' + space_count*' ' + brs[br_i].strip()
    puretxt = brsprs
    # NOTE(review): cleanUp is redefined here -- from this point on the
    # tag-joining variant below is the one in effect.
    def cleanUp(txt, ls):
        # Re-join text around inline tags (<tt>, <i>, <b>, ...) so that
        # prettify's line breaks do not introduce spurious spaces around
        # punctuation or between adjacent identical tags.
        for i in ls:
            txt = txt.split(i)
            txt = [tt for tt in txt]
            txt = filter(None, txt)
            # my join implementation
            if txt == []:
                txt = ""
            else:
                s = txt[0]
                for j in txt[1:]:
                    if '/' in i:
                        # punctuation mark afterwards
                        if j.strip()[0] in ['.', ',', '?', '!', ':', ';', ')', ']', '\'', '"', '/', '&rsquo;', '&gt;']:
                            s = s.rstrip() + i + j.lstrip()
                        elif j.strip()[0:6] == '&nbsp;':
                            s = s.rstrip() + i + j.lstrip()
                        # two adjacent identical tags: join the words,
                        # like <i>text</i><i>another text</i>
                        elif j.strip()[:len(i.replace('/', ''))] == i.replace('/', ''):
                            # remove the tag from the beginning of j and merge
                            s = s.rstrip() + j.strip()[len(i.replace('/', '')):].lstrip()
                        # another tag which is not in the list: do not remove the newline
                        elif j.strip()[0] == '<' and (j.strip()[:j.strip().find(' ')]+'>') not in ls:
                            # if the previous symbol is a tag as well
                            if s.strip()[-1] == '>' and s.rstrip()[s.strip().rfind('<'):] not in ls:
                                s += i + j
                            else:
                                s = s.rstrip() + i + j
                        else:  # ['(', '[', '-']
                            s = s.rstrip() + i + ' ' + j.lstrip()
                    else:  # tag before: if same merge, if different keep indent
                        if s.rstrip()[-1] in ['(', '[', '\'', '"', '&lsquo;', '&lt;']:
                            s = s.rstrip() + i + j.lstrip()
                        # non breaking space before
                        elif s.strip()[-6:] == '&nbsp;':
                            s = s.rstrip() + i + j.lstrip()
                        # if a different tag comes before, do not remove indent
                        elif s.strip()[-1] == '>' and s.rstrip()[s.strip().rfind('<'):] not in ls:
                            # if the next is a tag as well
                            if j.strip()[0] == '<' and (j.strip()[:j.strip().find(' ')]+'>') not in ls:
                                s += i + j
                            else:
                                s += i + j.lstrip()
                        # first thing inside a bracket or parenthesis
                        else:
                            s = s.rstrip() + ' ' + i + j.lstrip()
                txt = s
        return txt
    puretxt = cleanUp(puretxt, ['<tt>', '</tt>', '<i>', '</i>', '<b>', '</b>'])
    with open(soupName[:-4], 'w') as wfile:
        wfile.write(puretxt)
    # Collect every tag.class (and tag.style) combination present in the
    # cleaned document, for the summary printed after the loop.
    tagSoup = BeautifulSoup(puretxt, "html.parser")
    for tag in tagSoup.findAll():
        try:
            classes = tag['class']
            if classes:
                for c in classes:
                    if 'AutoStyle' not in c:
                        tc = tag.name + '.' + c
                    else:
                        tc = tag.name
                    if tc not in tag_list:
                        tag_list.append(tc)
            else:
                if tag.name not in tag_list:
                    tag_list.append(tag.name)
        except:
            if tag.name not in tag_list:
                tag_list.append(tag.name)
        try:
            tc = tag.name + '.' + tag['style']
            if tc not in style_list:
                style_list.append(tc)
        except:
            if tag.name not in style_list:
                style_list.append(tag.name)
# Print every tag.class combination that survived clean-up, sorted.
tag_list = [t.encode("utf-8") for t in tag_list]; tag_list.sort()
style_list = [t.encode("utf-8") for t in style_list]; style_list.sort()
print "Tag.Class"
print tag_list
# Save new styles: list the font-family declarations that ended up in the
# generated style map, then dump it all to common_style.css.
font_styles = []
for f in stylesCSS.keys():
    ff = f.split(';')
    for ffi in ff:
        if 'font-family' in ffi:
            if ffi not in font_styles:
                font_styles.append(ffi.encode('utf-8'))
print font_styles
writeCSStoFile(stylesCSS, stylesCSSno)
import pyparsing, sys
def getFontFace(styles):
    """Extract the font name from the first 'font-family' declaration
    found in a semicolon-separated CSS style string.

    Quotes, colons and surrounding whitespace are peeled off the value.
    Exits the program if no font-family declaration is present.
    """
    for declaration in styles.split(';'):
        if 'font-family' not in declaration:
            continue
        face = declaration.strip().replace('font-family', '')
        face = face.strip().strip(':').strip()
        return face.strip('\'').strip('\"').strip()
    sys.exit('Font-family not found')
def setOverlap(styleStr, listStyleStr):
    """Return True when no selector in the comma-separated styleStr occurs
    in any of the comma-separated strings in listStyleStr; False otherwise.

    (Despite the name, True means the sets are disjoint.)
    """
    candidates = set(part.strip() for part in styleStr.split(','))
    known = set()
    for entry in listStyleStr:
        known.update(part.strip() for part in entry.split(','))
    return not (candidates & known)
# Parsed CSS collected from the per-part stylesheets.
STYLES = {}     # selector string -> formatted declaration block
FONT_FACE = {}  # font name -> @font-face declaration block
PAGE = {}       # "@page ..." selector -> declaration block
# Selectors and fonts actually used by the cleaned HTML; everything else
# is filtered out before writing mso.css.
used_styles = ['a', 'b', 'br', 'div', 'div.WordSection1', 'div.WordSection2', 'div.WordSection3', 'div.WordSection4', 'h1', 'h2', 'h3', 'hr', 'i', 'img', 'p', 'p.Caption1', 'p.MsoFootnoteText', 'p.MsoNormal', 'p.answer', 'p.caption', 'p.cijfer', 'p.citaat', 'p.exercise', 'p.figure', 'p.formule', 'p.grammar', 'p.inter-title', 'p.intermezzo', 'p.med-caption', 'p.med-figure', 'p.oms', 'p.oms-eerst', 'p.opsomming', 'p.p-eerst', 'p.p-el', 'p.p-laatst', 'p.pi', 'p.pi-eerst', 'p.pi-el', 'p.pi-laatst', 'p.programma', 'p.query', 'p.referenties', 'p.romeinscijfer', 'p.sektie', 'p.sektie1', 'p.small-caption', 'p.small-figure', 'p.tekst', 'pre', 'pre.inherit', 'pre.query', 'pre.source', 'pre.swish', 'pre.temp', 'script', 'span', 'span.CustomFootnote', 'span.MsoFootnoteReference', 'span.query', 'span.swish', 'table', 'tbody', 'td', 'tr', 'tt']
used_fonts = ['font-family:Symbol', 'font-family:Times', 'font-family:Helvetica', 'font-family:Courier']
# Destination for the filtered stylesheet.
out_file = "mso.css"
CSS_files = ["../simply-logical/bootstrap/css/" + i for i in ["Part_I.css", "Part_II.css", "Part_III.css", "Appendix.css"]]
# Read each stylesheet into memory.
CSSs = []
for i in CSS_files:
    with open(i, 'r') as i_file:
        CSSs.append(i_file.read())
# Remove all /* ... */ comments before parsing.
comment = pyparsing.nestedExpr("/*", "*/").suppress()
CSSs = [comment.transformString(i) for i in CSSs]
# Parse each stylesheet into (selector, declarations) pairs and merge them
# into the STYLES / FONT_FACE / PAGE maps, complaining about conflicts.
for css in CSSs:
    # Splitting on '{' then '}' yields alternating selector / body chunks.
    css_split = []
    for c in css.split('{'):
        css_split += [i.strip() for i in c.split('}') if i.strip()]
    # Even indices: selector lists (normalised to "a,b,c").
    names = []
    for n in css_split[0:][::2]:
        name = n.split(',')
        name = [nam.strip() for nam in name if nam.strip()]
        name = ','.join(name)
        names.append(name)
    # Odd indices: declaration bodies, with Word-only properties dropped and
    # each surviving declaration on its own indented line.
    tags = []
    for n in css_split[1:][::2]:
        tag = n.split(';')
        tag = [t.strip() for t in tag if t.strip()]
        tag = [t for t in tag if 'mso-' not in t and 'tab-stop' not in t]
        tag_formatted = ""
        for t in tag:
            tag_formatted += ' ' + t + ';\n'
        tags.append(tag_formatted)
    if len(tags) != len(names):
        sys.exit("Tags and names do not agree")
    # Remove rules whose body became empty after filtering (iterate in
    # reverse so pop() indices stay valid).
    for i in range(len(tags))[::-1]:
        if not tags[i]:
            tags.pop(i)
            names.pop(i)
    # Divide the rules into the right placeholders.
    for i in range(len(tags)):
        if '@font-face' in names[i]:
            if getFontFace(tags[i]) in FONT_FACE:
                # Same font declared twice: warn if the bodies differ.
                if FONT_FACE[getFontFace(tags[i])] != tags[i]:
                    print('Font face does not agree\n' + tags[i] +'\n dic:\n'+FONT_FACE[getFontFace(tags[i])])
            else:
                FONT_FACE[getFontFace(tags[i])] = tags[i]
        elif '@page' in names[i]:
            if names[i] in PAGE:
                if PAGE[names[i]] != tags[i]:
                    sys.exit('@Page does not agree\n ' + tags[i])
            else:
                PAGE[names[i]] = tags[i]
        else:
            if names[i] in STYLES:
                # Conflicting redefinition: keep both, second one commented.
                if STYLES[names[i]] != tags[i]:
                    print('Styles do not agree:'+names[i]+'\n' + tags[i] +'\nand\n' + STYLES[names[i]])
                    STYLES[names[i]] += '/*' + tags[i] + '*/'
            else:
                # Refuse selector lists that share a selector with an
                # already-stored rule under a different key.
                if setOverlap(names[i], STYLES.keys()):
                    STYLES[names[i]] = tags[i]
                else:
                    sys.exit("Keys set is overlapping")
# Filter @page rules: keep only those whose page name appears in some used
# selector. NOTE(review): deleting from a dict while looping over .keys()
# is safe in Python 2 only, where keys() returns a list snapshot.
delMe = True
for i in PAGE.keys():
    delMe = True
    # NOTE(review): strip('@page') strips a *character set* ('@','p','a',
    # 'g','e'), not the literal prefix -- confirm this is intended.
    ni = i.strip().strip('@page').strip()
    if not ni:
        # Unnamed @page rule: always kept.
        continue
    else:
        for j in used_styles:
            if ni in j:
                delMe = False
                break
    if delMe:
        del PAGE[i]
# Filter STYLES: drop every rule sharing no selector with used_styles
# (setOverlap returns True when the sets are disjoint).
delMe = True
for i in STYLES.keys():
    delMe = True
    if setOverlap(i, used_styles):
        del STYLES[i]
# Filter FONT_FACE: keep only fonts listed in used_fonts.
delMe = True
for i in FONT_FACE.keys():
    delMe = True
    if 'font-family:'+i in used_fonts:
        delMe = False
    if delMe:
        del FONT_FACE[i]
# Emit the surviving rules: fonts first, then @page, then ordinary styles.
with open(out_file, 'w') as of:
    s = ""
    for i in FONT_FACE:
        s += '@font-face' + '{\n' + FONT_FACE[i] + '}\n'
    for i in PAGE:
        s += i + '{\n' + PAGE[i] + '}\n'
    for i in STYLES:
        s += i + '{\n' + STYLES[i] + '}\n'
    of.write(s)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.