Last active August 3, 2018 15:26
[WORK IN PROGRESS] little recipes I use in processing (mostly Greek) texts
### strip specific accents
def strip_accents(w, accents=("\u0300", "\u0301", "\u0342")):
    """Return `w` with the given combining accent marks removed.

    The text is decomposed (NFD) so that every accent becomes a separate
    combining code point, the unwanted marks are dropped, and the result is
    recomposed (NFC). Marks that are not listed in `accents` (for example
    breathings, diaeresis or iota subscript) are kept.

    Args:
        w: the string to process.
        accents: combining code points to remove. Defaults to the combining
            grave (U+0300), combining acute (U+0301) and combining Greek
            perispomeni (U+0342).

    Returns:
        The NFC-normalized string without the listed accents.
    """
    decomposed = unicodedata.normalize("NFD", w)
    kept = "".join(ch for ch in decomposed if ch not in accents)
    # recompose so that remaining marks (e.g. breathings) rejoin their base letter
    return unicodedata.normalize("NFC", kept)
### strip all diacritics
def strip_diacritics(w):
    """Return `w` with every nonspacing combining mark removed.

    The text is decomposed (NFD) so that diacritics become separate code
    points, every character whose Unicode general category is "Mn"
    (Mark, nonspacing) is dropped, and the result is recomposed (NFC).

    Args:
        w: the string to process.

    Returns:
        The NFC-normalized string containing only the non-"Mn" characters.
    """
    decomposed = unicodedata.normalize("NFD", w)
    base_chars = "".join(
        ch for ch in decomposed if unicodedata.category(ch) != "Mn"
    )
    return unicodedata.normalize("NFC", base_chars)
### iterate over a file line by line, stripping Python-style comments (everything from # onwards) and skipping lines that are then blank
# An explicit encoding keeps non-ASCII (e.g. Greek) text from depending on the
# platform's default encoding.
with open(filename, encoding="utf-8") as f:
    for line in f:
        # keep only the text before the first "#", then trim surrounding whitespace
        row = line.split("#")[0].strip()
        if not row:
            # blank line, or a line that held nothing but a comment
            continue
        ...  # process `row` here
### override certain values with a dictionary
# when OVERRIDES has an entry for `s`, use that entry instead; any other value is left as it is
if s in OVERRIDES:
    s = OVERRIDES[s]
### process the lines of two files pair-wise
# Both files are managed by one `with`, so both are closed on exit.
# An explicit encoding keeps non-ASCII (e.g. Greek) text from depending on the
# platform's default encoding.
with open(filename1, encoding="utf-8") as file1, \
        open(filename2, encoding="utf-8") as file2:
    # NOTE: zip() stops at the end of the shorter file, so surplus lines in the
    # longer file are silently ignored.
    for line1, line2 in zip(file1, file2):
        ...  # process the pair (`line1`, `line2`) here
