Skip to content

Instantly share code, notes, and snippets.

@mosdragon
Last active March 7, 2016 20:07
Show Gist options
  • Save mosdragon/ad893f877a631260e3e8 to your computer and use it in GitHub Desktop.
Save mosdragon/ad893f877a631260e3e8 to your computer and use it in GitHub Desktop.
A utility for replacing discrete-valued attributes with numerical values
import numpy as np
import pandas as pd
def clean_data(df, fill_method='ffill'):
    '''
    Encode text and boolean columns as integers, then fill missing values.

    Every object-dtype column is replaced by integer codes numbered in order
    of first appearance of each distinct value. Boolean columns become 0/1.
    Finally, missing values are filled down (or up) each column.

    The work is done on a copy; the DataFrame passed in is left unmodified.

    Parameters:
    - df: the pandas DataFrame to clean.
    - fill_method: how missing values are filled. Can be:
        - 'ffill' (alias 'pad'): forward fill
        - 'bfill' (alias 'backfill'): backward fill

    Returns a tuple (cleaned DataFrame, mappings), where mappings has the
    form {column name: {original value: integer code}} with one entry per
    encoded text column.

    Raises ValueError if fill_method is not one of the names listed above.
    '''
    # Validate up front so a bad argument fails before any work is done.
    if fill_method in ('ffill', 'pad'):
        forward = True
    elif fill_method in ('bfill', 'backfill'):
        forward = False
    else:
        raise ValueError(
            "fill_method must be 'ffill' or 'bfill', got %r" % (fill_method,))

    # Never modify the caller's frame.
    df = df.copy()

    # Find all columns where data type is string
    text_cols = list(df.select_dtypes(include=['object']).columns)
    mappings = dict()
    for colname in text_cols:
        column = df[colname]
        distinct = column.unique()
        colname_mapping = {text: num for num, text in enumerate(distinct)}
        mappings[colname] = colname_mapping
        # Vectorised replacement. This must not go through map() on the
        # index: map() is lazy on Python 3, so a side-effecting callback
        # passed to it would never run.
        df[colname] = column.map(colname_mapping)

    # Replace True/False with 0 or 1
    bool_cols = list(df.select_dtypes(include=['bool']).columns)
    for colname in bool_cols:
        df[colname] = df[colname].astype(int)

    # Fill missing values. ffill()/bfill() are used directly because the
    # `method` argument of fillna() is deprecated in current pandas.
    df = df.ffill() if forward else df.bfill()
    return (df, mappings)
def process_dataset(filename, outname):
    '''
    Read a CSV file, numerically encode its attribute columns, and write
    the result to a new CSV file.

    The last column of the input is treated as the output column and is
    passed through as read; every other column is treated as an attribute
    and run through clean_data() with its default fill method.

    Parameters:
    - filename: path of the CSV file to read.
    - outname: path of the CSV file to write (overwritten if it exists).

    The combined DataFrame is printed to stdout before being written.
    The row index is not written to the output file.
    '''
    df = pd.read_csv(filename)

    # Split by position: everything but the last column is an attribute.
    attributes = df.iloc[:, :-1].copy()
    outputs = df.iloc[:, -1]

    # The value mappings returned by clean_data() are not needed here.
    attributes, _ = clean_data(attributes)

    df = pd.concat([attributes, outputs], axis=1)
    # print() as a function call: the Python 2 statement form is a
    # SyntaxError on Python 3.
    print(df)
    df.to_csv(outname, index=False)
if __name__ == '__main__':
    # filename: the CSV file to read from.
    # outname: the CSV file to write to (erased first if it already exists).
    filename, outname = 'original.csv', 'dataset.csv'
    process_dataset(filename, outname)
@mosdragon
Copy link
Author

You'll need pandas and numpy

pip install pandas numpy

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment