# mstaflex/name_filter.py

## name_filter.py
import re

# Not referenced anywhere else in this file.
# NOTE(review): presumably the index of the last "western" country column
# (44 maps to "Swiss" in country_column) -- confirm or remove.
EAST_WEST_BORDER_COLUMN = 44
# Accepts text that starts with a run of word characters containing a lowercase
# "n" followed later by a "u", or a "u" followed later by an "n".
# Written as a raw string so "\w" reaches the regex engine without relying on
# Python passing an unrecognised string escape through (which emits a warning).
# The alias `p` is kept so that any existing reference to it keeps working.
REGEX_COMPILED = p = re.compile(r'([\w]*n[\w]*u[\w]*)|[\w]*u[\w]*n[\w]*')

# Source database, intermediate result written by pre_filter() and final
# result written by uniquify().
input_file_data_base = "nam_dict.txt"
output_file_filtered = "name_database_filtered.txt"
output_file = "possible_names_selection.txt"

# Character index inside a database line -> country whose value is stored at
# that index.
# NOTE(review): "Chine" looks like a typo for "China"; left untouched because
# the value is runtime data.
country_column = {
	30: "Great Britain", 31: "Ireland", 32: "U.S.A.", 33: "Italy",
	34: "Malta", 35: "Portugal", 36: "Spain", 37: "France",
	38: "Belgium", 39: "Luxembourg", 40: "the Netherlands", 41: "East Frisia",
	42: "Germany", 43: "Austria", 44: "Swiss", 45: "Iceland",
	46: "Denmark", 47: "Norway", 48: "Sweden", 49: "Finland",
	50: "Estonia", 51: "Latvia", 52: "Lithuania", 53: "Poland",
	54: "Czech Republic", 55: "Slovakia", 56: "Hungary", 57: "Romania",
	58: "Bulgaria", 59: "Bosnia and Herzegovina", 60: "Croatia", 61: "Kosovo",
	62: "Macedonia", 63: "Montenegro", 64: "Serbia", 65: "Slovenia",
	66: "Albania", 67: "Greece", 68: "Russia", 69: "Belarus",
	70: "Moldova", 71: "Ukraine", 72: "Armenia", 73: "Azerbaijan",
	74: "Georgia", 75: "Kazakhstan/Uzbekistan,etc.", 76: "Turkey", 77: "Arabia/Persia",
	78: "Israel", 79: "Chine", 80: "India/Sri Lanka", 81: "Japan",
	82: "Korea", 83: "Vietnam", 84: "others",
}
# Countries whose values count in favour of a name; every other country in
# country_column counts against it.
selected_countries = ["Germany", "U.S.A.", "France", "Spain", "Austria", "Swiss"]

def regex_filter(name):
	"""Tell whether ``name`` is accepted by ``REGEX_COMPILED``.

	The pattern is applied with ``match``, i.e. anchored at the start of
	``name``.

	Returns:
		``True`` when the pattern matches, ``False`` otherwise.
	"""
	return REGEX_COMPILED.match(name) is not None

def pre_filter(input, output, select_weight=2, gender_filter="M"):
	"""Filter the name database ``input`` and write surviving names to ``output``.

	Every line of ``input`` is examined, the first one included. A line is
	dropped when

	* it starts with ``#``;
	* it starts with ``gender_filter``, ``"?" + gender_filter`` or ``"? "``
	  (``gender_filter`` therefore names the prefix that is *excluded*);
	* the name taken from ``line[3:27]`` contains ``+`` or is rejected by
	  ``regex_filter``;
	* ``select_weight`` times the summed values of the non-selected countries
	  is greater than the summed values of the ``selected_countries``.

	Country values are single hexadecimal digits read at the indices listed in
	``country_column``. A space, ``+`` or ``-`` means "no value", and so does a
	column lying beyond the end of a short line.

	Args:
		input: Path of the database file to read.
		output: Path of the file to (over)write; one name per line.
		select_weight: Factor applied to the non-selected total before it is
			compared with the selected total.
		gender_filter: Line prefix marking the records to exclude.

	Raises:
		ValueError: If a country column holds a character that is neither a
			"no value" marker nor a hexadecimal digit.
	"""
	skipped_prefixes = (gender_filter, "?" + gender_filter, "? ")
	favoured = frozenset(selected_countries)
	no_value_marks = ("", " ", "+", "-")

	with open(input, "r") as f, open(output, "w") as fo:
		# Iterating the file object visits each line exactly once; the former
		# readline() priming read threw the first line away.
		for raw_line in f:
			# Drop the line terminator so it can never be taken for a value.
			line = raw_line.rstrip("\r\n")
			if line.startswith("#"):
				continue
			if line.startswith(skipped_prefixes):
				continue

			name = line[3:27].strip()
			if "+" in name:
				continue
			if not regex_filter(name):
				continue

			pro_cnt = 0
			counter_cnt = 0
			for column, country in country_column.items():
				# Slicing yields "" past the end of the line instead of raising.
				mark = line[column:column + 1]
				if mark in no_value_marks:
					continue
				val = int(mark, 16)
				if country in favoured:
					pro_cnt += val
				else:
					counter_cnt += val

			if select_weight * counter_cnt > pro_cnt:
				continue
			fo.write("%s\n" % (name))


def uniquify(input, output):
	"""Copy the distinct, non-empty lines of ``input`` to ``output``.

	Each line is stripped of surrounding whitespace; lines that are empty
	after stripping are ignored. Every remaining name is written once,
	followed by a newline. On Python 3.7+ the names keep the order of their
	first occurrence.

	Args:
		input: Path of the file holding one name per line.
		output: Path of the file to (over)write with the unique names.
	"""
	with open(input, "r") as f:
		# Visit every line: the earlier two-reads-per-iteration loop stored
		# only every second line and could record an empty name at EOF.
		stripped = (line.strip() for line in f)
		unique_names = dict.fromkeys(name for name in stripped if name)
	with open(output, "w") as fo:
		fo.writelines("%s\n" % (name) for name in unique_names)


# Step 1: reduce the raw database to the names that pass every filter.
pre_filter(
	input=input_file_data_base,
	output=output_file_filtered,
	select_weight=1,
)
# Step 2: remove duplicates from the intermediate file to get the final list.
uniquify(input=output_file_filtered, output=output_file)
	import re

	# Not referenced anywhere else in this file.
	# NOTE(review): presumably the index of the last "western" country column
	# (44 maps to "Swiss" in country_column) -- confirm or remove.
	EAST_WEST_BORDER_COLUMN = 44
	# Accepts text that starts with a run of word characters containing a
	# lowercase "n" followed later by a "u", or a "u" followed later by an "n".
	# The "*" quantifiers and the unescaped "|" alternation are restored here:
	# without them the pattern only accepted a literal "|" between two fixed
	# five-character shapes. A raw string keeps "\w" intact for the regex engine.
	# The alias `p` is kept so that any existing reference to it keeps working.
	REGEX_COMPILED = p = re.compile(r'([\w]*n[\w]*u[\w]*)|[\w]*u[\w]*n[\w]*')

	# Source database, intermediate result written by pre_filter() and final
	# result written by uniquify().
	input_file_data_base = "nam_dict.txt"
	output_file_filtered = "name_database_filtered.txt"
	output_file = "possible_names_selection.txt"

	# Character index inside a database line -> country whose value is stored
	# at that index.
	# NOTE(review): "Chine" looks like a typo for "China"; left untouched
	# because the value is runtime data.
	country_column = {
		30: "Great Britain", 31: "Ireland", 32: "U.S.A.", 33: "Italy",
		34: "Malta", 35: "Portugal", 36: "Spain", 37: "France",
		38: "Belgium", 39: "Luxembourg", 40: "the Netherlands", 41: "East Frisia",
		42: "Germany", 43: "Austria", 44: "Swiss", 45: "Iceland",
		46: "Denmark", 47: "Norway", 48: "Sweden", 49: "Finland",
		50: "Estonia", 51: "Latvia", 52: "Lithuania", 53: "Poland",
		54: "Czech Republic", 55: "Slovakia", 56: "Hungary", 57: "Romania",
		58: "Bulgaria", 59: "Bosnia and Herzegovina", 60: "Croatia", 61: "Kosovo",
		62: "Macedonia", 63: "Montenegro", 64: "Serbia", 65: "Slovenia",
		66: "Albania", 67: "Greece", 68: "Russia", 69: "Belarus",
		70: "Moldova", 71: "Ukraine", 72: "Armenia", 73: "Azerbaijan",
		74: "Georgia", 75: "Kazakhstan/Uzbekistan,etc.", 76: "Turkey", 77: "Arabia/Persia",
		78: "Israel", 79: "Chine", 80: "India/Sri Lanka", 81: "Japan",
		82: "Korea", 83: "Vietnam", 84: "others",
	}
	# Countries whose values count in favour of a name; every other country in
	# country_column counts against it.
	selected_countries = ["Germany", "U.S.A.", "France", "Spain", "Austria", "Swiss"]

	def regex_filter(name):
		"""Tell whether ``name`` is accepted by ``REGEX_COMPILED``.

		The pattern is applied with ``match``, i.e. anchored at the start of
		``name``.

		Returns:
			``True`` when the pattern matches, ``False`` otherwise.
		"""
		return REGEX_COMPILED.match(name) is not None

	def pre_filter(input, output, select_weight=2, gender_filter="M"):
		"""Filter the name database ``input`` and write surviving names to ``output``.

		Every line of ``input`` is examined, the first one included. A line is
		dropped when

		* it starts with ``#``;
		* it starts with ``gender_filter``, ``"?" + gender_filter`` or ``"? "``
		  (``gender_filter`` therefore names the prefix that is *excluded*);
		* the name taken from ``line[3:27]`` contains ``+`` or is rejected by
		  ``regex_filter``;
		* ``select_weight`` times the summed values of the non-selected
		  countries is greater than the summed values of the
		  ``selected_countries``.

		Country values are single hexadecimal digits read at the indices listed
		in ``country_column``. A space, ``+`` or ``-`` means "no value", and so
		does a column lying beyond the end of a short line.

		Args:
			input: Path of the database file to read.
			output: Path of the file to (over)write; one name per line.
			select_weight: Factor applied to the non-selected total before it
				is compared with the selected total.
			gender_filter: Line prefix marking the records to exclude.

		Raises:
			ValueError: If a country column holds a character that is neither
				a "no value" marker nor a hexadecimal digit.
		"""
		skipped_prefixes = (gender_filter, "?" + gender_filter, "? ")
		favoured = frozenset(selected_countries)
		no_value_marks = ("", " ", "+", "-")

		with open(input, "r") as f, open(output, "w") as fo:
			# Iterating the file object visits each line exactly once; a
			# readline() priming read would throw the first line away.
			for raw_line in f:
				# Drop the line terminator so it is never taken for a value.
				line = raw_line.rstrip("\r\n")
				if line.startswith("#"):
					continue
				if line.startswith(skipped_prefixes):
					continue

				name = line[3:27].strip()
				if "+" in name:
					continue
				if not regex_filter(name):
					continue

				pro_cnt = 0
				counter_cnt = 0
				for column, country in country_column.items():
					# Slicing yields "" past the end of the line, no IndexError.
					mark = line[column:column + 1]
					if mark in no_value_marks:
						continue
					val = int(mark, 16)
					if country in favoured:
						pro_cnt += val
					else:
						counter_cnt += val

				if select_weight * counter_cnt > pro_cnt:
					continue
				fo.write("%s\n" % (name))


	def uniquify(input, output):
		"""Copy the distinct, non-empty lines of ``input`` to ``output``.

		Each line is stripped of surrounding whitespace; lines that are empty
		after stripping are ignored. Every remaining name is written once,
		followed by a newline. On Python 3.7+ the names keep the order of
		their first occurrence.

		Args:
			input: Path of the file holding one name per line.
			output: Path of the file to (over)write with the unique names.
		"""
		with open(input, "r") as f:
			# Visit every line: reading twice per iteration would store only
			# every second line and could record an empty name at EOF.
			stripped = (line.strip() for line in f)
			unique_names = dict.fromkeys(name for name in stripped if name)
		with open(output, "w") as fo:
			fo.writelines("%s\n" % (name) for name in unique_names)


	# Step 1: reduce the raw database to the names that pass every filter.
	pre_filter(
		input=input_file_data_base,
		output=output_file_filtered,
		select_weight=1,
	)
	# Step 2: remove duplicates from the intermediate file to get the final list.
	uniquify(input=output_file_filtered, output=output_file)