# raghavrv/adult_snippet_2.py

## adult_snippet_2.py
# Load categories of categorical features from descr


descr = """@attribute Age real [17.0, 90.0]
@attribute Workclass {Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked}
@attribute Fnlwgt real [12285.0, 1490400.0]
@attribute Education {Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool}
@attribute Education-num real [1.0, 16.0]
@attribute Marital-status {Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse}
@attribute Occupation {Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces}
@attribute Relationship {Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried}
@attribute Race {White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black}
@attribute Sex {Female, Male}
@attribute Capital-gain real [0.0, 99999.0]
@attribute Capital-loss real [0.0, 4356.0]
@attribute Hours-per-week real [1.0, 99.0]
@attribute Native-country {United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands}"""

ATTRIBUTE_PREFIX = '@attribute '


def parse_attribute(line):
    """Split one ``@attribute`` line into ``(name, categories)``.

    ``categories`` is the sorted tuple of the values listed between braces,
    or an empty tuple when the line has no ``{...}`` part (the ``real``
    attributes in ``descr``).
    """
    line = line.strip()
    # Remove the literal prefix.  ``str.strip('@attribute ')`` would instead
    # strip any of those *characters* from both ends, eating the start of
    # names that begin with e.g. 'a', 'e' or 't'.
    if line.startswith(ATTRIBUTE_PREFIX):
        line = line[len(ATTRIBUTE_PREFIX):]

    if '{' not in line:
        # Numeric attribute: keep only what precedes the " real " marker.
        return line.split(' real ')[0], ()

    name, _, values = line.partition('{')
    categories = [value.strip() for value in values.strip(' }').split(',')]
    # Sorted so that a category's position in the tuple is a stable code.
    return name.strip(), tuple(sorted(categories))


# cat_feats[i] holds the sorted categories of feature i (empty tuple for a
# numeric feature); feat_names[i] holds the name of feature i.
cat_feats = []
feat_names = []

for feat in descr.splitlines():
    if not feat.strip():
        continue
    f_i_name, f_i_vals = parse_attribute(feat)
    feat_names.append(f_i_name)
    cat_feats.append(f_i_vals)

## adult_snippet_3.py
# Load data from adult dataset
# data[i] is the list of encoded feature values of record i;
# target[i] is 1 when the record's last field equals '<=50K', else 0.
data = []
target = []

# Column indices below are the *shifted* ones: every raw column index greater
# than 3 is decremented by one before the lookup.
# NOTE(review): this assumes index 3 has already been removed from
# feat_names and cat_feats before this section runs -- that removal is not
# visible in this file; TODO confirm.
# NOTE(review): raw columns 3 and 4 both map to shifted index 3, which is in
# neither tuple, so both are left out of `data` -- confirm that dropping raw
# column 4 is intended.
numeric_cols = (0, 2, 9, 10, 11)
categorical_cols = (1, 4, 5, 6, 7, 8, 12)

# Marker for a missing ("?") categorical value.  A plain float NaN, so no
# numpy import is required here.
missing = float('nan')

with open('adult.dat') as adult:
    for line in adult:
        line = line.strip('\r\n ')
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        fields = [field.strip() for field in line.split(',')]
        data_i = []
        for raw_j, d in enumerate(fields):
            j = raw_j - 1 if raw_j > 3 else raw_j
            if j in numeric_cols:
                data_i.append(float(d))
            elif j in categorical_cols:
                if d == "?":
                    data_i.append(missing)
                    continue
                try:
                    # Encode the category as its position in the sorted tuple.
                    data_i.append(float(cat_feats[j].index(d)))
                except ValueError:
                    # Report the offending value before failing.
                    print("%s %s %s" % (d, j, feat_names[j]))
                    raise
            # Any other shifted index is skipped on purpose.
        data.append(data_i)
        target.append(int(fields[-1] == '<=50K'))
	# Load categories of categorical features from descr
# NOTE(review): this section repeats the attribute-parsing section earlier in
# this file; its indentation had been flattened to single tabs (which also
# leaked tabs into the description string).  It is restored here as regular
# top-level code -- consider removing one of the two copies.

descr = """@attribute Age real [17.0, 90.0]
@attribute Workclass {Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked}
@attribute Fnlwgt real [12285.0, 1490400.0]
@attribute Education {Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool}
@attribute Education-num real [1.0, 16.0]
@attribute Marital-status {Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse}
@attribute Occupation {Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces}
@attribute Relationship {Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried}
@attribute Race {White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black}
@attribute Sex {Female, Male}
@attribute Capital-gain real [0.0, 99999.0]
@attribute Capital-loss real [0.0, 4356.0]
@attribute Hours-per-week real [1.0, 99.0]
@attribute Native-country {United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands}"""

ATTRIBUTE_PREFIX = '@attribute '


def parse_attribute(line):
    """Split one ``@attribute`` line into ``(name, categories)``.

    ``categories`` is the sorted tuple of the values listed between braces,
    or an empty tuple when the line has no ``{...}`` part (the ``real``
    attributes in ``descr``).
    """
    line = line.strip()
    # Remove the literal prefix.  ``str.strip('@attribute ')`` would instead
    # strip any of those *characters* from both ends, eating the start of
    # names that begin with e.g. 'a', 'e' or 't'.
    if line.startswith(ATTRIBUTE_PREFIX):
        line = line[len(ATTRIBUTE_PREFIX):]

    if '{' not in line:
        # Numeric attribute: keep only what precedes the " real " marker.
        return line.split(' real ')[0], ()

    name, _, values = line.partition('{')
    categories = [value.strip() for value in values.strip(' }').split(',')]
    # Sorted so that a category's position in the tuple is a stable code.
    return name.strip(), tuple(sorted(categories))


# cat_feats[i] holds the sorted categories of feature i (empty tuple for a
# numeric feature); feat_names[i] holds the name of feature i.
cat_feats = []
feat_names = []

for feat in descr.splitlines():
    if not feat.strip():
        continue
    f_i_name, f_i_vals = parse_attribute(feat)
    feat_names.append(f_i_name)
    cat_feats.append(f_i_vals)
	# Load data from adult dataset
# NOTE(review): this section repeats the data-loading section earlier in this
# file; its indentation had been flattened to single tabs.  It is restored
# here as regular top-level code -- consider removing one of the two copies.

# data[i] is the list of encoded feature values of record i;
# target[i] is 1 when the record's last field equals '<=50K', else 0.
data = []
target = []

# Column indices below are the *shifted* ones: every raw column index greater
# than 3 is decremented by one before the lookup.
# NOTE(review): this assumes index 3 has already been removed from
# feat_names and cat_feats before this section runs -- that removal is not
# visible in this file; TODO confirm.
# NOTE(review): raw columns 3 and 4 both map to shifted index 3, which is in
# neither tuple, so both are left out of `data` -- confirm that dropping raw
# column 4 is intended.
numeric_cols = (0, 2, 9, 10, 11)
categorical_cols = (1, 4, 5, 6, 7, 8, 12)

# Marker for a missing ("?") categorical value.  A plain float NaN, so no
# numpy import is required here.
missing = float('nan')

with open('adult.dat') as adult:
    for line in adult:
        line = line.strip('\r\n ')
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        fields = [field.strip() for field in line.split(',')]
        data_i = []
        for raw_j, d in enumerate(fields):
            j = raw_j - 1 if raw_j > 3 else raw_j
            if j in numeric_cols:
                data_i.append(float(d))
            elif j in categorical_cols:
                if d == "?":
                    data_i.append(missing)
                    continue
                try:
                    # Encode the category as its position in the sorted tuple.
                    data_i.append(float(cat_feats[j].index(d)))
                except ValueError:
                    # Report the offending value before failing.
                    print("%s %s %s" % (d, j, feat_names[j]))
                    raise
            # Any other shifted index is skipped on purpose.
        data.append(data_i)
        target.append(int(fields[-1] == '<=50K'))