Skip to content

Instantly share code, notes, and snippets.

@raghavrv
Last active February 17, 2016 15:34
Show Gist options
  • Save raghavrv/8ba99007ebb0c96d81cc to your computer and use it in GitHub Desktop.
Save raghavrv/8ba99007ebb0c96d81cc to your computer and use it in GitHub Desktop.
Snippet to load and vectorize the adult dataset with missing values - https://archive.ics.uci.edu/ml/datasets/Adult
# Load categories of categorical features from descr
#
# descr holds one "@attribute ..." line per feature. Parsing it yields two
# parallel lists with one entry per line, in the order the lines appear:
#   feat_names[i] -- the attribute name
#   cat_feats[i]  -- sorted tuple of its category labels, or the empty
#                    tuple () when the attribute is a "real" (numeric) one
descr = """@attribute Age real [17.0, 90.0]
@attribute Workclass {Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked}
@attribute Fnlwgt real [12285.0, 1490400.0]
@attribute Education {Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool}
@attribute Education-num real [1.0, 16.0]
@attribute Marital-status {Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse}
@attribute Occupation {Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces}
@attribute Relationship {Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried}
@attribute Race {White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black}
@attribute Sex {Female, Male}
@attribute Capital-gain real [0.0, 99999.0]
@attribute Capital-loss real [0.0, 4356.0]
@attribute Hours-per-week real [1.0, 99.0]
@attribute Native-country {United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands}"""
cat_feats = []
feat_names = []
for feat in descr.splitlines():
    # Remove the keyword as a *prefix*. str.strip('@attribute ') would treat
    # its argument as a set of characters and eat any of them from both ends
    # of the line, silently truncating a name such as "attr-1" or "education".
    if feat.startswith('@attribute '):
        feat = feat[len('@attribute '):]
    if '{' not in feat:
        # Numeric attribute, "<name> real [lo, hi]": it has no categories.
        cat_feats.append(())
        feat_names.append(feat.split(' real ')[0])
        continue
    # Categorical attribute, "<name> {v1, v2, ...}".
    f_i_name, f_i_vals = feat.split('{', 1)
    f_i_vals = f_i_vals.strip()
    if f_i_vals.endswith('}'):
        # Drop exactly the closing brace rather than a character set.
        f_i_vals = f_i_vals[:-1]
    feat_names.append(f_i_name.strip())
    # Sorting fixes the label -> integer code mapping (code == position in
    # the tuple) independently of the order the labels are listed in descr.
    cat_feats.append(tuple(sorted(v.strip() for v in f_i_vals.split(','))))
# Load data from adult dataset
#
# Reads 'adult.dat' and builds two parallel lists:
#   data[i]   -- list of floats for record i: the numeric columns as-is, and
#                each categorical column as the index of its label in the
#                matching cat_feats tuple (NaN when the field is "?")
#   target[i] -- 1 if the record's last field is '<=50K', else 0
#
# Each record is expected to be one comma-separated line whose columns follow
# the attribute order of descr, followed by the label. cat_feats and
# feat_names hold one entry per attribute in that same order, so both are
# indexed with the raw column number; no index shifting is needed.
#
# Raises ValueError if a numeric field is not a number or a categorical field
# holds a label that is not listed for that attribute.
numeric_cols = (0, 2, 10, 11, 12)
categorical_cols = (1, 5, 6, 7, 8, 9, 13)
# Columns 3 (Education) and 4 (Education-num) are in neither tuple, so both
# are left out of data, as is the trailing label column.
# NOTE(review): Education-num looks like the numeric stand-in for Education;
# confirm whether column 4 was meant to be kept as a numeric feature.
data = []
target = []
with open('adult.dat') as adult:
    for line in adult:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # Trim each field so "a, b" and "a,b" parse the same way; this also
        # removes the line terminator from the last field.
        line = [field.strip() for field in line.split(',')]
        data_i = []
        for j, d in enumerate(line):
            if j in numeric_cols:
                data_i.append(float(d))
            elif j in categorical_cols:
                if d == "?":
                    # Missing value marker. float('nan') needs no import.
                    data_i.append(float('nan'))
                    continue
                try:
                    data_i.append(float(cat_feats[j].index(d)))
                except ValueError:
                    # Show the offending value, column and attribute name
                    # before propagating the error.
                    print('%s %s %s' % (d, j, feat_names[j]))
                    raise
        data.append(data_i)
        target.append(int(line[-1] == '<=50K'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment