Skip to content

Instantly share code, notes, and snippets.

@gartenfeld
Last active August 29, 2015 14:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gartenfeld/74f9cfdbc8faf1d98537 to your computer and use it in GitHub Desktop.
Save gartenfeld/74f9cfdbc8faf1d98537 to your computer and use it in GitHub Desktop.
Making XML more semantic using BeautifulSoup.
from bs4 import BeautifulSoup
from bs4.element import Tag
import re # Regular Expressions
import collections # Data Types
import sys # File operations
import codecs # UniCode support
import os
import locale
def is_tag(tag):
    """Return True when *tag* is a BeautifulSoup ``Tag`` instance.

    Any other object (including ``None``) yields False, so callers can use
    this to guard attribute access such as ``.name`` on a parse-tree node.
    """
    result = isinstance(tag, Tag)
    return result
def _new_div(soup, css_class):
    """Create an empty ``<div class="css_class">`` tag using *soup* as factory."""
    # 'class' is a Python keyword, so the attribute is passed through a dict.
    return soup.new_tag("div", **{'class': css_class})


def _rebuild_gloss_block(soup, gloss_block):
    """Build the replacement ``div.gloss-block`` for one original gloss block.

    The new block contains, in order: an optional ``sense-number`` div, an
    optional ``lexical-class`` div, a ``summary-gloss`` div holding one
    ``en-gloss`` div per English span, and a ``full-gloss`` div with the
    remaining text. *gloss_block* is modified in the process (extracted
    elements are removed from it, <font> contents are rewritten).
    """
    new_block = _new_div(soup, 'gloss-block')

    # Sense number: a leading <b> element whose text consists of digits.
    # An empty block has no first element at all.
    first_element = gloss_block.contents[0] if gloss_block.contents else None
    remove_sn = False
    if is_tag(first_element) and first_element.name == 'b':
        sn_text = first_element.string
        # .string is None when the <b> has mixed/nested content; isdigit
        # must actually be *called*, otherwise the test is always true.
        if sn_text is not None and sn_text.strip().isdigit():
            sense_number = _new_div(soup, 'sense-number')
            sense_number.string = sn_text
            new_block.append(sense_number)
            remove_sn = True

    # Lexical class: the first <i>, provided it directly follows a <b> or
    # <font> sibling.
    first_i = gloss_block.find('i')
    if first_i is not None:
        sib = first_i.find_previous_sibling()
        if sib is not None and sib.name in ('b', 'font'):
            lex_class = _new_div(soup, 'lexical-class')
            # get_text() also copes with an <i> that has nested children,
            # where .string would be None.
            lex_class.string = first_i.get_text()
            new_block.append(lex_class)
            first_i.decompose()

    # The sense number may only be removed now: the lexical-class check
    # above looks at the previous sibling of <i>, which can be that <b>.
    if remove_sn:
        first_element.decompose()

    # Enclose every domain label (<font>) in dot symbols.
    for domain_tag in gloss_block.find_all('font'):
        domain_tag.string = "⋅" + domain_tag.get_text().strip() + "⋅"

    # Summary gloss: one en-gloss per English span.
    summary_wrapper = _new_div(soup, 'summary-gloss')
    for en_span in gloss_block.find_all('span', lang="en"):
        en_string = "".join(en_span.find_all(text=True)).strip()
        if en_string.endswith(","):
            en_string = en_string[:-1]
        # 'ks' / 'ks.' spans are redirecting links, not glosses.
        if en_string not in ('ks', 'ks.'):
            en_gloss = _new_div(soup, 'en-gloss')
            en_gloss.string = en_string
            summary_wrapper.append(en_gloss)
    new_block.append(summary_wrapper)

    # Full gloss: all text still left in the block, runs of spaces collapsed.
    full_gloss = _new_div(soup, 'full-gloss')
    full_string = "".join(gloss_block.find_all(text=True)).strip()
    full_gloss.string = re.sub(r' +', ' ', full_string)
    new_block.append(full_gloss)
    return new_block


def inspect_file(file_name):
    """Rewrite the gloss blocks of one entry file in place.

    The file ``source_path + file_name`` is parsed, every ``div.gloss-block``
    inside the first ``div.entry`` is replaced by a restructured block (see
    ``_rebuild_gloss_block``), and the file is overwritten with
    ``file_header`` + the entry markup + ``file_footer``.

    Reads the module-level names ``source_path``, ``file_header`` and
    ``file_footer``. A file without a ``div.entry`` is reported on stderr
    and left untouched.

    Args:
        file_name: Name of the file, relative to ``source_path``.

    Returns:
        None.
    """
    raw_file = source_path + file_name
    # Read bytes so that from_encoding is actually honoured, and make sure
    # the handle is closed before the same path is reopened for writing.
    with open(raw_file, 'rb') as f_input:
        raw_soup = BeautifulSoup(f_input, from_encoding="utf-8")

    entry = raw_soup.find('div', class_="entry")
    if entry is None:
        sys.stderr.write("No entry found in " + file_name + "; file left unchanged.\n")
        return

    for gloss_block in entry.find_all('div', class_="gloss-block"):
        gloss_block.replace_with(_rebuild_gloss_block(raw_soup, gloss_block))

    # The output contains non-ASCII characters and the header declares
    # UTF-8, so do not rely on the locale-dependent default encoding.
    with open(raw_file, 'w', encoding="utf-8") as f_output:
        f_output.write(file_header + str(entry) + file_footer)
    return
def check_all(files_list):
    """Run ``inspect_file`` on every name in *files_list*, in order.

    A "Progress: N%" line is printed before the first file and before every
    5000th file after it.
    """
    for position, file_name in enumerate(files_list):
        at_checkpoint = position % 5000 == 0
        if at_checkpoint:
            percent_done = int(position * 100 / len(files_list))
            print("Progress: " + str(percent_done) + "%")
        inspect_file(file_name)
    return
def load_directory(source_path):
    """Return the names of the ``.html`` files directly inside *source_path*.

    Also prints "<count> files loaded." with the count grouped by thousands.

    Args:
        source_path: Directory to list.

    Returns:
        A list of file names (not full paths) ending in ``.html``, in the
        order produced by ``os.listdir``.

    Raises:
        OSError: If *source_path* cannot be listed.
    """
    # str.endswith cannot raise for the names os.listdir returns, so the
    # filter needs no exception handling.
    files_list = [
        file_name
        for file_name in os.listdir(source_path)
        if file_name.endswith(".html")
    ]

    count = len(files_list)
    try:
        locale.setlocale(locale.LC_ALL, 'en_AU')
        # locale.format() was removed in Python 3.12; format_string is the
        # supported equivalent.
        nr_loaded = locale.format_string("%d", count, grouping=True)
    except locale.Error:
        # The en_AU locale is not installed everywhere; a purely cosmetic
        # message must not abort the run, so fall back to comma grouping.
        nr_loaded = "{:,}".format(count)
    print(nr_loaded + " files loaded.")
    return files_list
if __name__ == '__main__':
    # Module-level settings; inspect_file() reads all three of them.
    source_path = "Data-FI-EN/"

    # Markup written before and after each rewritten entry.
    file_header = (
        "<html>\n"
        "<head>\n"
        "<meta charset=\"utf-8\">\n"
        "</head>\n"
        "<body>\n"
    )
    file_footer = (
        "\n"
        "</body>\n"
        "</html>"
    )

    # Step 1: collect the names of the raw files.
    print("Loading files...")
    files_list = load_directory(source_path)

    # Step 2: rewrite every file in place.
    print("Checking all files...")
    check_all(files_list)

    # "Valmis" is Finnish for "done".
    print("Valmis!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment