Skip to content

Instantly share code, notes, and snippets.

@gartenfeld
Created November 1, 2014 11:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gartenfeld/bdcc93f166dd1fc60dfb to your computer and use it in GitHub Desktop.
Save gartenfeld/bdcc93f166dd1fc60dfb to your computer and use it in GitHub Desktop.
Cleaning up and separating lexical data files.
from bs4 import BeautifulSoup
import re # Regular Expressions
import collections # Data Types
import sys # File operations
import codecs # UniCode support
import os
def clear_output_file(out_file):
    """Create or truncate ``out_file`` and write the opening HTML boilerplate.

    After the call the file contains only the opening ``<html>``, ``<head>``
    (with a UTF-8 charset declaration) and ``<body>`` tags, so that content
    can be appended afterwards.

    Args:
        out_file: Path of the output file to (re)initialise.

    Returns:
        None.
    """
    file_header = (
        "<html>\n"
        "<head>\n"
        "<meta charset=\"utf-8\">\n"
        "</head>\n"
        "<body>\n"
    )
    # The context manager closes the handle even if the write fails; the
    # explicit encoding matches the charset declared in the header itself.
    with open(out_file, 'w', encoding='utf-8') as f_output:
        f_output.write(file_header)
def add_footer(out_file):
    """Append the closing ``</body>`` and ``</html>`` tags to ``out_file``.

    The file is opened in append mode, so existing content is preserved and
    the file is created if it does not exist yet.

    Args:
        out_file: Path of the output file to finish.

    Returns:
        None.
    """
    file_footer = (
        "\n"
        "</body>\n"
        "</html>"
    )
    # The context manager closes the handle even if the write fails.
    with open(out_file, 'a', encoding='utf-8') as f_output:
        f_output.write(file_footer)
def has_lang(tag):
    """Tell whether ``tag`` carries a ``lang`` attribute.

    Passed as a filter function to ``find`` in ``triage_source_files`` to
    locate the first element that declares a language.

    Args:
        tag: An object exposing ``has_attr(name)``.

    Returns:
        Whatever ``tag.has_attr('lang')`` returns.
    """
    return tag.has_attr('lang')
def triage_source_files(files_list):
    """Split the entries of each raw HTML file into per-language output files.

    For every name in ``files_list`` the file is read from ``source_path``.
    ``<div class="entry">`` elements that contain a
    ``<font class="pieniharmaa">`` marker are discarded. Each remaining entry
    is routed by the value of the first ``lang`` attribute found inside it:
    ``"fi"`` goes to ``out_fi_path``, any other value to ``out_en_path``.
    The output file has the same name as the input file and consists of the
    HTML header, every entry routed to that directory, and the HTML footer.

    All entries of one language are written to the same file; the file is
    initialised once per input file rather than once per entry, so earlier
    entries are no longer overwritten by later ones.

    Reads the module-level names ``source_path``, ``out_fi_path`` and
    ``out_en_path``; the paths are joined by plain concatenation, so they
    must end with a path separator.

    Args:
        files_list: Iterable of file names relative to ``source_path``.

    Returns:
        None.
    """
    for file_name in files_list:  # For each raw file
        raw_file = source_path + file_name
        # Binary mode hands BeautifulSoup raw bytes so that from_encoding is
        # actually applied; the context manager closes the input file.
        with open(raw_file, 'rb') as f_input:
            raw_soup = BeautifulSoup(f_input, from_encoding="utf-8")

        entries = raw_soup.find_all('div', class_="entry")
        for entry in entries:
            mark = entry.find('font', class_="pieniharmaa")
            if mark is not None:
                entry.decompose()  # Remove non-relevant content

        # Re-parse the serialised result so that only surviving entries
        # are picked up by the second search.
        re_soup = BeautifulSoup(str(entries))
        lexical_entries = re_soup.find_all('div', class_="entry")

        # Group entries by destination directory so that each output file
        # is written exactly once with all of its entries.
        entries_by_dir = {}
        for entry in lexical_entries:
            # Locate the very first 'lang' attribute in the entry
            lang_tag = entry.find(has_lang)
            if lang_tag is None:
                # Without a language the entry cannot be routed.
                sys.stderr.write(
                    "No 'lang' attribute found in an entry of "
                    + file_name + "; entry skipped.\n"
                )
                continue
            if lang_tag['lang'] == "fi":
                out_path = out_fi_path
            else:
                out_path = out_en_path
            entries_by_dir.setdefault(out_path, []).append(entry)

        # Files with no valid content produce no output at all.
        for out_path, routed_entries in entries_by_dir.items():
            out_file = out_path + file_name
            clear_output_file(out_file)
            with open(out_file, 'a', encoding='utf-8') as f_output:
                f_output.writelines(str(entry) for entry in routed_entries)
            add_footer(out_file)
def load_directory(source_path):
    """Return the names of the ``.html`` entries found in ``source_path``.

    Only the directory itself is listed (no recursion), and the returned
    names carry no directory component. The order is whatever
    ``os.listdir`` yields.

    Args:
        source_path: Directory to scan.

    Returns:
        A list of names ending in ``.html``; empty if there are none.

    Raises:
        OSError: If ``source_path`` cannot be listed.
    """
    # str.endswith cannot raise IndexError, so the former try/except around
    # it was dead code and has been dropped.
    return [
        file_name
        for file_name in os.listdir(source_path)
        if file_name.endswith(".html")
    ]
if __name__ == '__main__':
    # These three names are read as module-level globals by
    # triage_source_files; each must end with a path separator because
    # file names are appended by plain string concatenation.
    source_path = "Files-Raw/"
    out_fi_path = "Data-FI-EN/"
    out_en_path = "Data-EN-FI/"

    # Create the destinations up front so that opening an output file
    # cannot fail merely because its directory is missing.
    for out_dir in (out_fi_path, out_en_path):
        os.makedirs(out_dir, exist_ok=True)

    print("Loading files...")
    files_list = load_directory(source_path)  # Load list of raw files

    print("Triaging files...")
    triage_source_files(files_list)  # Separate FI-EN and EN-FI entries.

    print("Valmis!")  # Finnish for "Done!"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment