Skip to content

Instantly share code, notes, and snippets.

@mstaflex
Last active August 29, 2015 14:07
Show Gist options
  • Save mstaflex/daa51ac2c658867a634c to your computer and use it in GitHub Desktop.
Save mstaflex/daa51ac2c658867a634c to your computer and use it in GitHub Desktop.
Gist to select names from a database file according to a regex scheme and a country. Mainly for games like "my daughter will be called a name with an a and a u" :P The database file can be found here: https://gist.github.com/mstaflex/161edf0c61a764a3345f
import re
# Column index constant; it is not referenced by any code visible in this file.
EAST_WEST_BORDER_COLUMN = 44

# Accepts a name whose leading run of word characters contains a lowercase "n"
# followed (anywhere later) by a lowercase "u", or a "u" followed by an "n".
# Written as a raw string: "\w" inside a plain literal is an invalid escape
# sequence and triggers a warning on recent Python versions. The pattern text
# itself is unchanged. The alias ``p`` is kept for backward compatibility.
REGEX_COMPILED = p = re.compile(r'([\w]*n[\w]*u[\w]*)|[\w]*u[\w]*n[\w]*')

# File names: raw name database, intermediate filtered list, final unique list.
input_file_data_base = "nam_dict.txt"
output_file_filtered = "name_database_filtered.txt"
output_file = "possible_names_selection.txt"

# Maps a character index within a database record to the country whose value
# is stored at that index. Labels are compared by exact string against
# ``selected_countries``, so their spelling is kept exactly as it was.
country_column = {
    30: "Great Britain",
    31: "Ireland",
    32: "U.S.A.",
    33: "Italy",
    34: "Malta",
    35: "Portugal",
    36: "Spain",
    37: "France",
    38: "Belgium",
    39: "Luxembourg",
    40: "the Netherlands",
    41: "East Frisia",
    42: "Germany",
    43: "Austria",
    44: "Swiss",
    45: "Iceland",
    46: "Denmark",
    47: "Norway",
    48: "Sweden",
    49: "Finland",
    50: "Estonia",
    51: "Latvia",
    52: "Lithuania",
    53: "Poland",
    54: "Czech Republic",
    55: "Slovakia",
    56: "Hungary",
    57: "Romania",
    58: "Bulgaria",
    59: "Bosnia and Herzegovina",
    60: "Croatia",
    61: "Kosovo",
    62: "Macedonia",
    63: "Montenegro",
    64: "Serbia",
    65: "Slovenia",
    66: "Albania",
    67: "Greece",
    68: "Russia",
    69: "Belarus",
    70: "Moldova",
    71: "Ukraine",
    72: "Armenia",
    73: "Azerbaijan",
    74: "Georgia",
    75: "Kazakhstan/Uzbekistan,etc.",
    76: "Turkey",
    77: "Arabia/Persia",
    78: "Israel",
    79: "Chine",
    80: "India/Sri Lanka",
    81: "Japan",
    82: "Korea",
    83: "Vietnam",
    84: "others",
}

# Countries whose values count in favour of a name; every other country in
# ``country_column`` counts against it.
selected_countries = ["Germany", "U.S.A.", "France", "Spain", "Austria", "Swiss"]
def regex_filter(name):
    """Tell whether ``name`` is accepted by the module-level ``REGEX_COMPILED``.

    The pattern is applied with ``match``, so it is anchored at the start of
    ``name`` only; text following a matching prefix does not cause rejection.

    Returns:
        True when the pattern matches, False otherwise.
    """
    return REGEX_COMPILED.match(name) is not None
def pre_filter(input, output, select_weight=2, gender_filter="M"):
    """Write the names from ``input`` that pass every filter to ``output``.

    Each line of ``input`` is read as a fixed-width record: characters 3-26
    hold the name, and every index listed in ``country_column`` holds either
    one hexadecimal digit or a placeholder (blank, "+" or "-") for that
    country.

    A record is dropped when any of the following holds:
      * it starts with "#";
      * it starts with ``gender_filter``, "?" + ``gender_filter`` or "? ";
      * its name contains "+" or is rejected by ``regex_filter``;
      * ``select_weight`` times the summed digits of the countries NOT in
        ``selected_countries`` exceeds the summed digits of the countries
        that are in ``selected_countries``.

    Args:
        input: path of the record file to read.
        output: path of the file to (over)write, one surviving name per line.
        select_weight: factor applied to the non-selected countries' sum
            before it is compared with the selected countries' sum; larger
            values reject more names.
        gender_filter: record prefix to exclude. Despite the name, records
            starting with this prefix are removed, not kept.

    Raises:
        ValueError: if a country cell holds a character that is neither a
            placeholder nor a hexadecimal digit.
    """
    excluded_prefixes = (gender_filter, "?" + gender_filter, "? ")
    with open(input, "r") as f, open(output, "w") as fo:
        # Iterate the file directly. The previous read-ahead loop skipped the
        # first line unconditionally and also ran the body once on the empty
        # end-of-file sentinel.
        for raw_line in f:
            line = raw_line.rstrip("\r\n")
            if line.startswith("#") or line.startswith(excluded_prefixes):
                continue
            name = line[3:27].strip()
            if "+" in name:
                continue
            if not regex_filter(name):
                continue
            pro_cnt = 0
            counter_cnt = 0
            for column, country in country_column.items():
                # Slicing yields "" past the end of a short record, which is
                # handled like a blank cell instead of raising IndexError.
                cell = line[column:column + 1]
                if cell in ("", " ", "+", "-"):
                    continue
                val = int(cell, 16)
                if country in selected_countries:
                    pro_cnt += val
                else:
                    counter_cnt += val
            if select_weight * counter_cnt > pro_cnt:
                continue
            fo.write("%s\n" % name)
def uniquify(input, output):
    """Copy the names listed in ``input`` to ``output`` without duplicates.

    Every line of ``input`` is stripped of surrounding whitespace; blank
    lines are ignored. Each distinct name is written once, one per line.
    On interpreters where ``dict`` preserves insertion order (Python 3.7+),
    names appear in the order of their first occurrence.

    Args:
        input: path of the file holding one name per line.
        output: path of the file to (over)write with the unique names.
    """
    # The previous loop called readline() more than once per iteration, which
    # silently skipped names and could store the empty end-of-file sentinel
    # as a name. Reading each line exactly once avoids both problems.
    seen = {}
    with open(input, "r") as f:
        for raw_line in f:
            name = raw_line.strip()
            if name:
                seen[name] = 1
    with open(output, "w") as fo:
        for name in seen:
            fo.write("%s\n" % name)
# Step 1: reduce the raw database to the names that pass the filters, using a
# weight of 1 instead of pre_filter's default of 2.
pre_filter(
    input=input_file_data_base,
    output=output_file_filtered,
    select_weight=1,
)

# Step 2: remove duplicate names from the intermediate file.
uniquify(input=output_file_filtered, output=output_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment