Skip to content

Instantly share code, notes, and snippets.

@ivanvoid
Created November 9, 2021 02:27
Show Gist options
  • Save ivanvoid/0a0cb734e5b36a8dbce42c2d3cee6daa to your computer and use it in GitHub Desktop.
Save ivanvoid/0a0cb734e5b36a8dbce42c2d3cee6daa to your computer and use it in GitHub Desktop.
loading data
# Load the lines of every data file into a dict keyed by category name
import os
import glob
import string
import unicodedata
# Characters kept by the ASCII conversion: every ASCII letter plus a space
# and a handful of punctuation marks.
all_letters = string.ascii_letters + " .,;'-"
# Category names, appended in the order their files are loaded.
all_categories = []
# Maps each category name to the list of lines read from its file.
category_lines = {}
def findFiles(path):
    """Return the paths matching the glob pattern *path*, in sorted order.

    ``glob.glob`` gives no guarantee about the order of its results (it
    follows whatever order the file system reports), so the matches are
    sorted here to give callers the same sequence on every run and platform.

    Args:
        path: A glob pattern, e.g. ``'data/names/*.txt'``.

    Returns:
        A list of matching path strings in ascending order; empty when
        nothing matches.
    """
    return sorted(glob.glob(path))
# Read a file and split into lines
def readLines(filename):
    """Read *filename* as UTF-8 and return its lines converted to ASCII.

    Each line has its surrounding whitespace stripped and is then passed
    through ``unicodeToAscii``. One list entry is produced per line of the
    file, in file order.
    """
    ascii_lines = []
    with open(filename, encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            ascii_lines.append(unicodeToAscii(stripped))
    return ascii_lines
# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return *s* reduced to the characters listed in ``all_letters``.

    The string is first decomposed with NFD normalization so that accented
    characters split into a base character followed by combining marks.
    Nonspacing marks (Unicode category ``Mn``) are then dropped, as is any
    character that does not appear in ``all_letters``.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        # Skip the combining accents produced by the decomposition.
        if unicodedata.category(ch) == 'Mn':
            continue
        # Skip anything outside the allowed alphabet.
        if ch not in all_letters:
            continue
        kept.append(ch)
    return ''.join(kept)
# Build the category_lines dictionary, a list of lines per category
for filename in findFiles('data/names/*.txt'):
    # The category is the file's base name with its extension removed.
    category = os.path.basename(filename)
    category = os.path.splitext(category)[0]
    # Record the category before reading, then store that file's lines.
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment