Last active
August 18, 2018 19:08
Star
You must be signed in to star a gist
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
''' | |
Some functions for loading a dataset and performing simple data preparation | |
''' | |
def load_dataset(filename, filetype='csv', header=True):
    '''
    Loads a dataset from file

    Lines starting with "#" are treated as comments and skipped. Every other
    line is stripped of surrounding whitespace and split on the delimiter
    implied by `filetype`. No quoting rules or type conversion are applied,
    so every value in the resulting DataFrame is a string.

    Parameters:
    -----------
    filename: str
        Name of data file
    filetype: str
        The type of data file (csv, tsv)
    header: bool
        If true, the first non-comment line supplies the column names.
        If false, every non-comment line is data and the DataFrame gets
        pandas' default column labels.

    Returns:
    --------
    DataFrame
        Dataset as pandas DataFrame

    Raises:
    -------
    ValueError
        If `filetype` is neither 'csv' nor 'tsv'
    '''
    # Validate the file type before touching the file, so that a bad
    # argument is reported to the caller instead of ending the interpreter.
    delimiters = {'csv': ',', 'tsv': '\t'}
    if filetype not in delimiters:
        raise ValueError('Invalid file type')
    delimiter = delimiters[filetype]

    # None (rather than an empty string) lets pandas fall back to its
    # default column labels when the file has no header row.
    header_row = None
    data = []
    expect_header = header

    # The context manager guarantees the file is closed, even on error.
    with open(filename) as in_file:
        for line in in_file:
            # Skip comments
            if line.startswith('#'):
                continue
            fields = line.strip().split(delimiter)
            if expect_header:
                # Only the first non-comment line is the header
                header_row = fields
                expect_header = False
            else:
                data.append(fields)

    # Build a new dataframe from the list of row lists and return
    return pd.DataFrame(data, columns=header_row)
def to_numeric(dataset, attr_name):
    '''
    Performs a simple categorical to numeric attribute value transformation

    Each distinct value in the column is assigned an integer code, starting
    at 0, in order of first appearance. The column is overwritten on the
    DataFrame that was passed in, and that same DataFrame is returned.

    Parameters:
    -----------
    dataset: DataFrame
        Dataset on which to perform transformation (modified in place)
    attr_name: str
        Dataset attribute name to convert from nominal to numeric values

    Returns:
    --------
    DataFrame
        The input DataFrame with the data transformation performed
    dict
        Python dictionary of attribute value to integer mappings
    '''
    # Assign codes in order of first appearance; setdefault keeps the codes
    # contiguous even if two entries compare equal as dict keys.
    val_dict = {}
    for val in dataset[attr_name].unique():
        val_dict.setdefault(val, len(val_dict))

    # Assign the mapped column back explicitly. Calling replace(...,
    # inplace=True) on the column selection is a chained in-place update,
    # which pandas does not guarantee will reach the parent DataFrame.
    dataset[attr_name] = dataset[attr_name].map(val_dict)

    # Return dataset and value dictionary
    return dataset, val_dict
def to_matrix(dataset):
    '''
    Converts a pandas DataFrame dataset to a numpy matrix representation

    Parameters:
    -----------
    dataset: DataFrame
        Dataset to convert to matrix representation

    Returns:
    --------
    ndarray
        numpy ndarray representation of dataset
    '''
    # DataFrame.as_matrix() was deprecated and later removed from pandas;
    # .values is the replacement named by the deprecation notice.
    return dataset.values
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment