Skip to content

Instantly share code, notes, and snippets.

@mmmayo13
Last active August 18, 2018 19:08
  • Star 1 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save mmmayo13/9859a457760db10ec4842be3aa1a2334 to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
'''
Some functions for loading a dataset and performing simple data preparation
'''
def load_dataset(filename, filetype='csv', header=True):
    '''
    Loads a delimited text dataset from file into a pandas DataFrame

    Lines beginning with "#" are treated as comments and skipped, as are
    blank lines. Every value is kept as the string read from the file; no
    type conversion is performed here.

    Parameters:
    -----------
    filename: str
        Name of data file
    filetype: str
        The type of data file ('csv' for comma-separated, 'tsv' for
        tab-separated)
    header: bool
        Whether the first non-comment line holds the column names. When
        False, every line is treated as data and the columns get the
        default integer labels.

    Returns:
    --------
    DataFrame
        Dataset as pandas DataFrame

    Raises:
    -------
    ValueError
        If filetype is neither 'csv' nor 'tsv'
    '''
    delimiters = {'csv': ',', 'tsv': '\t'}
    # Validate before touching the file so a bad argument fails fast and
    # never terminates the interpreter from inside library code.
    if filetype not in delimiters:
        raise ValueError('Invalid file type: %r' % (filetype,))
    delimiter = delimiters[filetype]

    data = []
    # None (rather than '') lets pandas assign default column labels when
    # the file has no header row.
    header_row = None

    # The context manager guarantees the handle is closed, even on error.
    with open(filename) as in_file:
        for line in in_file:
            # Skip comments
            if line.startswith('#'):
                continue
            # NOTE: the whole line is stripped before splitting, so an empty
            # leading/trailing field in a TSV row is dropped.
            stripped = line.strip()
            # Skip blank lines so they do not become bogus one-cell rows
            if not stripped:
                continue
            fields = stripped.split(delimiter)
            if header and header_row is None:
                # First usable line is the header
                header_row = fields
            else:
                data.append(fields)

    # Build a new dataframe from the list of row lists and return it
    return pd.DataFrame(data, columns=header_row)
def to_numeric(dataset, attr_name):
    '''
    Performs a simple categorical to numeric attribute value transformation

    Each distinct value in the column is assigned an integer code in order
    of first appearance (0, 1, 2, ...), and the column is overwritten with
    those codes. The passed DataFrame is modified in place and also
    returned.

    Parameters:
    -----------
    dataset: DataFrame
        Dataset on which to perform transformation
    attr_name: str
        Dataset attribute name to convert from nominal to numeric values

    Returns:
    --------
    DataFrame
        The same DataFrame, with the data transformation performed
    dict
        Python dictionary of attribute value to integer mappings
    '''
    # unique() already yields each value once, in order of appearance
    unique_vals = dataset[attr_name].unique()

    # Map every distinct value to its position in that order
    val_dict = {val: code for code, val in enumerate(unique_vals)}

    # Assign the mapped column back explicitly: an in-place replace on the
    # selected column may act on a temporary copy and leave the DataFrame
    # untouched (e.g. under pandas Copy-on-Write).
    dataset[attr_name] = dataset[attr_name].map(val_dict)

    # Return dataset and value dictionary
    return dataset, val_dict
def to_matrix(dataset):
    '''
    Converts a pandas DataFrame dataset to a numpy matrix representation

    Parameters:
    -----------
    dataset: DataFrame
        Dataset to convert to matrix representation

    Returns:
    --------
    ndarray
        numpy ndarray representation of dataset
    '''
    # DataFrame.as_matrix() was deprecated and later removed from pandas;
    # .values returns the same ndarray and exists in old and new versions.
    return dataset.values
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment