Re-formatting some XML using BeautifulSoup
from bs4 import BeautifulSoup
import re           # Regular expressions (not used below)
import collections  # Data types (not used below)
import sys          # Used for sys.stderr
import codecs       # Unicode support (not used below)
import os
import locale

def extract_index_word(entry):
    first_head = entry.find('div', class_="head-block")
    first_word = first_head.find('font', class_="large")
    first_str_list = first_word.find_all(text=True)
    index_word = "".join(first_str_list).strip()
    return index_word

def inspect_file(file_name):
    raw_file = source_path + file_name
    # Read the whole file into a soup (open as UTF-8 and name the parser explicitly)
    with open(raw_file, encoding="utf-8") as raw:
        raw_soup = BeautifulSoup(raw, "html.parser")
    # Load all content, since each file has only 1 entry DIV
    entry = raw_soup.find('div', class_="entry")
    # Find the first headword, save it as the index word
    primary_index = extract_index_word(entry)
    # Load headword block(s), there may be multiple
    head_blocks = entry.find_all('div', class_="head-block")
    for head_block in head_blocks:
        # Prepare new head-block tag
        new_block = raw_soup.new_tag("div", attrs={'class': 'head-block'})
        # Extract all headword elements
        headwords = head_block.find_all('font', class_="large")
        # Save every component as an index word
        # Concatenate all components into the headword string
        head_phrase = ""
        index_words = []
        for sub_head in headwords:
            sub_word = "".join(sub_head.find_all(text=True))  # Component with spaces
            head_phrase += sub_word  # Concatenate headword phrase
            sub_word_trim = sub_word.strip().replace(", ", "")
            # Save component as an index word
            if sub_word_trim != primary_index:
                index_words.append(sub_word_trim)
        # Make headword tag with BeautifulSoup
        headword_soup = raw_soup.new_tag("div", attrs={'class': 'headword'})
        headword_soup.string = head_phrase
        new_block.append(headword_soup)
        # Make index-word tags with BeautifulSoup
        indices_soup = raw_soup.new_tag("div", attrs={'class': 'indices'})
        primary_tag = raw_soup.new_tag("div", attrs={'class': 'primary'})
        primary_tag.string = primary_index
        indices_soup.append(primary_tag)
        for i_word in index_words:
            component_tag = raw_soup.new_tag("div", attrs={'class': 'factor'})
            component_tag.string = i_word
            indices_soup.append(component_tag)
        new_block.append(indices_soup)
        # Remove headword elements from the soup
        for h in headwords:
            h.decompose()
        # Extract all domain blocks (only 2 or 3 files have multiple,
        # but many have multiple domains, comma-separated, within the same domain-block)
        domain_blocks = head_block.find_all('font', class_="xsmall")
        if len(domain_blocks) > 0:
            # Make an enclosure tag
            domains_soup = raw_soup.new_tag("div", attrs={'class': 'domains'})
            for domain_block in domain_blocks:
                domain_text = "".join(domain_block.find_all(text=True))
                split_set = domain_text.split(", ")
                for domain_item in split_set:
                    # Write each domain into a tag
                    domain_tag = raw_soup.new_tag("div", attrs={'class': 'domain'})
                    domain_tag.string = domain_item
                    # Save each domain into the enclosure soup
                    domains_soup.append(domain_tag)
            new_block.append(domains_soup)
            # Remove domain elements from the soup
            for d in domain_blocks:
                if d is not None:
                    d.decompose()
        # Extract homograph number (max. one per block)
        sup = head_block.find('sup')
        if sup is not None:
            sup_text = "".join(sup.find_all(text=True))
            sup_tag = raw_soup.new_tag("div", attrs={'class': 'hg-number'})
            sup_tag.string = sup_text
            new_block.append(sup_tag)
            # Remove the SUP element
            sup.decompose()
        # Find all remaining text in the head block
        all_string = head_block.find_all(text=True)
        # Concatenate notes into one string
        notes = "".join(all_string).strip()
        if notes != "":
            notes_tag = raw_soup.new_tag("div", attrs={'class': 'notes'})
            notes_tag.string = notes
            new_block.append(notes_tag)
        # Replace the original head-block with the rebuilt one
        head_block.replace_with(new_block)
    # Write the re-formatted entry back into the same file
    with open(raw_file, 'w', encoding="utf-8") as f_output:
        f_output.write(file_header + str(entry) + file_footer)
    return

def check_all(files_list):
    for i, file_name in enumerate(files_list):
        # if i % 5000 == 0: print("Progress: " + str(int(i * 100 / len(files_list))) + "%")
        inspect_file(file_name)
    return

def load_directory(source_path):
    files_list = []
    for file_name in os.listdir(source_path):
        try:
            if file_name.endswith(".html"):
                files_list.append(file_name)
        except IndexError:
            sys.stderr.write("Something went wrong with " + file_name + ".")
            continue
    return files_list

if __name__ == '__main__':
    source_path = "Data-FI-EN/"
    file_header = """<html>
<head>
<meta charset="utf-8">
</head>
<body>
"""
    file_footer = """
</body>
</html>"""
    print("Loading files...")
    files_list = load_directory(source_path)  # Load list of raw files
    locale.setlocale(locale.LC_ALL, '')  # Use the system locale so grouping takes effect
    nr_loaded = locale.format_string("%d", len(files_list), grouping=True)
    print(nr_loaded + " files loaded.")
    print("Checking all files...")
    check_all(files_list)
    print("Valmis!")  # Finnish for "Done!"