Last active
August 3, 2018 15:26
-
-
Save jtauber/8cd283a64dd14f978b0ef6e5d1bfa312 to your computer and use it in GitHub Desktop.
[WORK IN PROGRESS] little recipes I use in processing (mostly Greek) texts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### strip specific accents
def strip_accents(w, accents=("\u0300", "\u0301", "\u0342")):
    """Return *w* with the given combining accents removed.

    The string is decomposed (NFD) so that every accent becomes its own
    code point, the unwanted marks are dropped, and the result is
    recomposed (NFC). Combining marks that are not listed in *accents*
    (e.g. breathings, diaeresis, iota subscript) are left in place.

    Args:
        w: the string to process.
        accents: iterable of combining characters to remove. Defaults to
            combining grave (U+0300), combining acute (U+0301) and
            combining Greek perispomeni (U+0342).

    Returns:
        The NFC-normalized string without the listed accents.
    """
    unwanted = frozenset(accents)
    decomposed = unicodedata.normalize("NFD", w)
    kept = "".join(ch for ch in decomposed if ch not in unwanted)
    return unicodedata.normalize("NFC", kept)
### strip all diacritics
def strip_diacritics(w):
    """Return *w* with every non-spacing combining mark removed.

    The string is decomposed (NFD), all characters whose Unicode general
    category is "Mn" (Mark, nonspacing) are dropped, and the remainder is
    recomposed (NFC).

    Args:
        w: the string to process.

    Returns:
        The NFC-normalized string without any "Mn" characters.
    """
    decomposed = unicodedata.normalize("NFD", w)
    base = "".join(
        ch for ch in decomposed if unicodedata.category(ch) != "Mn"
    )
    return unicodedata.normalize("NFC", base)
### iterate over a file line-by-line, skipping blank lines and Python-style comments starting with #
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open(filename, encoding="utf-8") as f:
    for line in f:
        # keep only the text before the first "#", minus surrounding whitespace
        row = line.partition("#")[0].strip()
        if not row:
            # blank line, or a line holding nothing but a comment
            continue
        ...
### override certain values with a dictionary
# If `s` is a key in OVERRIDES, it is replaced with the corresponding value;
# otherwise it is passed through unchanged.
s = OVERRIDES.get(s, s)
### process the lines of two files pair-wise
# NOTE: zip() stops at the end of the shorter file, so any extra trailing
# lines in the longer file are silently ignored.
with open(filename1, encoding="utf-8") as file1, \
        open(filename2, encoding="utf-8") as file2:
    for line1, line2 in zip(file1, file2):
        ...
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment