Re-formatting some XML using BeautifulSoup
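The script below walks a directory of single-entry HTML files (a Finnish-English dictionary export, judging by the Data-FI-EN/ path), rebuilds each entry's head-block div into explicit headword, indices, domains, hg-number, and notes elements, and writes the result back in place.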
from bs4 import BeautifulSoup  # HTML parsing
import os      # Directory listing
import locale  # Locale-aware number formatting
def extract_index_word(entry):
    first_head = entry.find('div', class_="head-block")
    first_word = first_head.find('font', class_="large")
    first_str_list = first_word.find_all(text=True)
    index_word = "".join(first_str_list).strip()
    return index_word
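# A minimal sketch of the expected behaviour on a made-up fragment (the markup
# and the word "kissa" are hypothetical, inferred from the selectors above):
#   >>> entry = BeautifulSoup('<div class="entry"><div class="head-block">'
#   ...                       '<font class="large"> kissa </font></div></div>',
#   ...                       "html.parser")
#   >>> extract_index_word(entry)
#   'kissa'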
def inspect_file(file_name):
    # source_path, file_header and file_footer are module-level globals set in __main__
    raw_file = source_path + file_name
    # Read the whole file into a soup
    with open(raw_file, "rb") as fh:
        raw_soup = BeautifulSoup(fh, "html.parser", from_encoding="utf-8")
    # Load all content, since each file has only 1 entry DIV
    entry = raw_soup.find('div', class_="entry")
    # Find the first headword, save it as the index word
    primary_index = extract_index_word(entry)
    # Load headword block(s), there may be multiple
    head_blocks = entry.find_all('div', class_="head-block")
    for head_block in head_blocks:
        # Prepare new head-block tag
        new_block = raw_soup.new_tag("div", **{'class': 'head-block'})
        # Extract all headword elements
        headwords = head_block.find_all('font', class_="large")
        # Save every component as an index word and
        # concatenate all components into the headword string
        head_phrase = ""
        index_words = []
        for sub_head in headwords:
            sub_word = "".join(sub_head.find_all(text=True))  # Component with spaces
            head_phrase += sub_word  # Concatenate headword phrase
            sub_word_trim = sub_word.strip().replace(", ", "")
            # Save component as an index word
            if sub_word_trim != primary_index:
                index_words.append(sub_word_trim)
        # Make headword tag with BeautifulSoup
        headword_soup = raw_soup.new_tag("div", **{'class': 'headword'})
        headword_soup.string = head_phrase
        new_block.append(headword_soup)
        # Make index-word tags with BeautifulSoup
        indices_soup = raw_soup.new_tag("div", **{'class': 'indices'})
        primary_tag = raw_soup.new_tag("div", **{'class': 'primary'})
        primary_tag.string = primary_index
        indices_soup.append(primary_tag)
        for i_word in index_words:
            component_tag = raw_soup.new_tag("div", **{'class': 'factor'})
            component_tag.string = i_word
            indices_soup.append(component_tag)
        new_block.append(indices_soup)
        # Remove headword elements from the soup
        for h in headwords:
            h.decompose()
        # Extract all domain blocks (only 2 or 3 files have multiple),
        # but many have multiple comma-separated domains within the same domain block
        domain_blocks = head_block.find_all('font', class_="xsmall")
        if len(domain_blocks) > 0:
            # Make an enclosure tag
            domains_soup = raw_soup.new_tag("div", **{'class': 'domains'})
            for domain_block in domain_blocks:
                domain_text = "".join(domain_block.find_all(text=True))
                split_set = domain_text.split(", ")
                for domain_item in split_set:
                    # Write each domain into a tag
                    domain_tag = raw_soup.new_tag("div", **{'class': 'domain'})
                    domain_tag.string = domain_item
                    # Save each domain into the enclosure soup
                    domains_soup.append(domain_tag)
            new_block.append(domains_soup)
            # Remove domain elements from the soup
            for d in domain_blocks:
                d.decompose()
        # Extract homograph number (at most one per block)
        sup = head_block.find('sup')
        if sup is not None:
            sup_text = "".join(sup.find_all(text=True))
            sup_tag = raw_soup.new_tag("div", **{'class': 'hg-number'})
            sup_tag.string = sup_text
            new_block.append(sup_tag)
            # Remove SUP element
            sup.decompose()
        # Find all remaining text in the head block
        all_string = head_block.find_all(text=True)
        # Concatenate notes into one string
        notes = "".join(all_string).strip()
        if notes != "":
            notes_tag = raw_soup.new_tag("div", **{'class': 'notes'})
            notes_tag.string = notes
            new_block.append(notes_tag)
        # Replace head-block
        head_block.replace_with(new_block)
    # Write entry back into the same file
    with open(raw_file, 'w', encoding="utf-8") as f_output:
        f_output.write(file_header + str(entry) + file_footer)
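# Sketch of the rewrite inspect_file() performs on one head-block
# (hypothetical markup, inferred from the selectors and tags used above):
#
#   <div class="head-block"><sup>1</sup><font class="large">kuusi</font>
#       <font class="xsmall">bot., num.</font> a note</div>
#
# becomes
#
#   <div class="head-block">
#     <div class="headword">kuusi</div>
#     <div class="indices"><div class="primary">kuusi</div></div>
#     <div class="domains"><div class="domain">bot.</div><div class="domain">num.</div></div>
#     <div class="hg-number">1</div>
#     <div class="notes">a note</div>
#   </div>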
def check_all(files_list):
    for i, file_name in enumerate(files_list):
        # if i % 5000 == 0: print("Progress: " + str(int(i * 100 / len(files_list))) + "%")
        inspect_file(file_name)
def load_directory(source_path):
    files_list = []
    for file_name in os.listdir(source_path):
        if file_name.endswith(".html"):
            files_list.append(file_name)
    return files_list
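# Example, with hypothetical file names (os.listdir order is arbitrary):
#   load_directory("Data-FI-EN/")  ->  ['aalto.html', 'kissa.html', ...]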
if __name__ == '__main__':
    source_path = "Data-FI-EN/"
    file_header = """<html>
<head>
<meta charset="utf-8">
</head>
<body>
"""
    file_footer = """
</body>
</html>"""
    print("Loading files...")
    files_list = load_directory(source_path)  # Load list of raw files
    locale.setlocale(locale.LC_ALL, '')  # Enable grouped number formatting for the user's locale
    nr_loaded = locale.format_string("%d", len(files_list), grouping=True)
    print(nr_loaded + " files loaded.")
    print("Checking all files...")
    check_all(files_list)
    print("Valmis!")  # Finnish for "Done!"