Re-formatting some XML using BeautifulSoup
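The script below walks a directory of single-entry HTML files (a Finnish-English dictionary export, judging by the Data-FI-EN/ path), rebuilds each entry's head-block div into explicit headword, indices, domains, hg-number, and notes elements, and writes the result back in place.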
from bs4 import BeautifulSoup  # HTML parsing
import os      # Directory listing
import locale  # Locale-aware number formatting
def extract_index_word(entry):
    first_head = entry.find('div', class_="head-block")
    first_word = first_head.find('font', class_="large")
    first_str_list = first_word.find_all(text=True)
    index_word = "".join(first_str_list).strip()
    return index_word
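# A minimal sketch of the expected behaviour on a made-up fragment (the markup
# and the word "kissa" are hypothetical, inferred from the selectors above):
#   >>> entry = BeautifulSoup('<div class="entry"><div class="head-block">'
#   ...                       '<font class="large"> kissa </font></div></div>',
#   ...                       "html.parser")
#   >>> extract_index_word(entry)
#   'kissa'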
def inspect_file(file_name):
    # source_path, file_header and file_footer are module-level globals set in __main__
    raw_file = source_path + file_name
    # Read the whole file into a soup
    with open(raw_file, "rb") as fh:
        raw_soup = BeautifulSoup(fh, "html.parser", from_encoding="utf-8")
    # Load all content, since each file has only 1 entry DIV
    entry = raw_soup.find('div', class_="entry")
    # Find the first headword, save it as the index word
    primary_index = extract_index_word(entry)
    # Load headword block(s), there may be multiple
    head_blocks = entry.find_all('div', class_="head-block")
    for head_block in head_blocks:
        # Prepare new head-block tag
        new_block = raw_soup.new_tag("div", **{'class': 'head-block'})
        # Extract all headword elements
        headwords = head_block.find_all('font', class_="large")
        # Save every component as an index word and
        # concatenate all components into the headword string
        head_phrase = ""
        index_words = []
        for sub_head in headwords:
            sub_word = "".join(sub_head.find_all(text=True))  # Component with spaces
            head_phrase += sub_word  # Concatenate headword phrase
            sub_word_trim = sub_word.strip().replace(", ", "")
            # Save component as an index word
            if sub_word_trim != primary_index:
                index_words.append(sub_word_trim)
        # Make headword tag with BeautifulSoup
        headword_soup = raw_soup.new_tag("div", **{'class': 'headword'})
        headword_soup.string = head_phrase
        new_block.append(headword_soup)
        # Make index-word tags with BeautifulSoup
        indices_soup = raw_soup.new_tag("div", **{'class': 'indices'})
        primary_tag = raw_soup.new_tag("div", **{'class': 'primary'})
        primary_tag.string = primary_index
        indices_soup.append(primary_tag)
        for i_word in index_words:
            component_tag = raw_soup.new_tag("div", **{'class': 'factor'})
            component_tag.string = i_word
            indices_soup.append(component_tag)
        new_block.append(indices_soup)
        # Remove headword elements from the soup
        for h in headwords:
            h.decompose()
        # Extract all domain blocks (only 2 or 3 files have multiple),
        # but many have multiple comma-separated domains within the same domain block
        domain_blocks = head_block.find_all('font', class_="xsmall")
        if len(domain_blocks) > 0:
            # Make an enclosure tag
            domains_soup = raw_soup.new_tag("div", **{'class': 'domains'})
            for domain_block in domain_blocks:
                domain_text = "".join(domain_block.find_all(text=True))
                split_set = domain_text.split(", ")
                for domain_item in split_set:
                    # Write each domain into a tag
                    domain_tag = raw_soup.new_tag("div", **{'class': 'domain'})
                    domain_tag.string = domain_item
                    # Save each domain into the enclosure soup
                    domains_soup.append(domain_tag)
            new_block.append(domains_soup)
            # Remove domain elements from the soup
            for d in domain_blocks:
                d.decompose()
        # Extract homograph number (at most one per block)
        sup = head_block.find('sup')
        if sup is not None:
            sup_text = "".join(sup.find_all(text=True))
            sup_tag = raw_soup.new_tag("div", **{'class': 'hg-number'})
            sup_tag.string = sup_text
            new_block.append(sup_tag)
            # Remove SUP element
            sup.decompose()
        # Find all remaining text in the head block
        all_string = head_block.find_all(text=True)
        # Concatenate notes into one string
        notes = "".join(all_string).strip()
        if notes != "":
            notes_tag = raw_soup.new_tag("div", **{'class': 'notes'})
            notes_tag.string = notes
            new_block.append(notes_tag)
        # Replace head-block
        head_block.replace_with(new_block)
    # Write entry back into the same file
    with open(raw_file, 'w', encoding="utf-8") as f_output:
        f_output.write(file_header + str(entry) + file_footer)
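# Sketch of the rewrite inspect_file() performs on one head-block
# (hypothetical markup, inferred from the selectors and tags used above):
#
#   <div class="head-block"><sup>1</sup><font class="large">kuusi</font>
#       <font class="xsmall">bot., num.</font> a note</div>
#
# becomes
#
#   <div class="head-block">
#     <div class="headword">kuusi</div>
#     <div class="indices"><div class="primary">kuusi</div></div>
#     <div class="domains"><div class="domain">bot.</div><div class="domain">num.</div></div>
#     <div class="hg-number">1</div>
#     <div class="notes">a note</div>
#   </div>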
def check_all(files_list):
    for i, file_name in enumerate(files_list):
        # if i % 5000 == 0: print("Progress: " + str(int(i * 100 / len(files_list))) + "%")
        inspect_file(file_name)
def load_directory(source_path):
    files_list = []
    for file_name in os.listdir(source_path):
        if file_name.endswith(".html"):
            files_list.append(file_name)
    return files_list
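# Example, with hypothetical file names (os.listdir order is arbitrary):
#   load_directory("Data-FI-EN/")  ->  ['aalto.html', 'kissa.html', ...]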
if __name__ == '__main__':
    source_path = "Data-FI-EN/"
    file_header = """<html>
<head>
<meta charset="utf-8">
</head>
<body>
"""
    file_footer = """
</body>
</html>"""
    print("Loading files...")
    files_list = load_directory(source_path)  # Load list of raw files
    locale.setlocale(locale.LC_ALL, '')  # Enable grouped number formatting for the user's locale
    nr_loaded = locale.format_string("%d", len(files_list), grouping=True)
    print(nr_loaded + " files loaded.")
    print("Checking all files...")
    check_all(files_list)
    print("Valmis!")  # Finnish for "Done!"