Skip to content

Instantly share code, notes, and snippets.

@leonaburime
Created July 22, 2014 22:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save leonaburime/c9fc192b5c215367236e to your computer and use it in GitHub Desktop.
Save leonaburime/c9fc192b5c215367236e to your computer and use it in GitHub Desktop.
File used to read in data (possibly into pandas DataFrames) for various sample Python programs.
import urllib2, pandas as pd
# Registry of the sample datasets known to this module, keyed by dataset name.
# Each entry carries:
#   'features'        - ordered column names of the data file
#   'file'            - path of the data file (relative paths, as written)
#   'outcome_name'    - name of the column that holds the outcome/label
#   'outcome_options' - the values the outcome column can take
d = {
    'mushroom': {
        # Column order of Data/agaricus-lepiota.data; the first column is the
        # outcome ('class').
        'features': [
            'class',
            'cap-shape',
            'cap-surface',
            'cap-color',
            'bruises?',
            'odor',
            'gill-attachment',
            'gill-spacing',
            'gill-size',
            'gill-color',
            'stalk-shape',
            'stalk-root',
            'stalk-surface-above-ring',
            'stalk-surface-below-ring',
            'stalk-color-above-ring',
            'stalk-color-below-ring',
            'veil-type',
            'veil-color',
            'ring-number',
            'ring-type',
            'spore-print-color',
            'population',
            'habitat',
        ],
        'file': 'Data/agaricus-lepiota.data',
        # NOTE(review): presumably 'e' = edible and 'p' = poisonous -- confirm
        # against the dataset description.
        'outcome_options': {
            'good': 'e',
            'bad': 'p',
            'default': 'p',
        },
        'outcome_name': 'class',
        # Upstream copy of the data file (not fetched by this module):
        # https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data
    },
    'play': {
        'features': ['outlook', 'temperature', 'humidity', 'wind', 'play'],
        # Values each feature is allowed to take.
        'features_attributes': {
            'outlook': ['sunny', 'overcast', 'rain'],
            'temperature': ['hot', 'mild', 'cool'],
            'humidity': ['high', 'normal', 'low'],
            'wind': ['weak', 'strong'],
            'play': ['yes', 'no'],
        },
        'file': 'Data/play.txt',
        'outcome_name': 'play',
        # NOTE(review): this is a set, whereas the 'mushroom' entry uses a
        # dict for 'outcome_options' -- confirm consumers handle both shapes.
        'outcome_options': {'yes', 'no'},
    },
}
def get(name, d=d):
    """Return the entry stored under ``name`` in ``d``.

    Args:
        name: Key to look up.
        d: Mapping to search. Defaults to the module-level ``d`` registry as
            it was bound when this function was defined.

    Returns:
        The value stored under ``name``.

    Raises:
        KeyError: If ``name`` is not a key of ``d``.
    """
    entry = d[name]
    return entry
def createDataFrame( column_names, datafile, testSize=0):
    """Read a comma-separated data file into train/test pandas DataFrames.

    Each line of ``datafile`` is stripped of surrounding whitespace and split
    on commas to form one row. Lines containing a ``?`` (incomplete records)
    and blank lines are skipped.

    Args:
        column_names: Column labels applied to the resulting DataFrames.
        datafile: Path of the text file to read.
        testSize: Number of leading rows to place in the test set. With the
            default of 0 no test set is built.

    Returns:
        A ``(train, test)`` tuple. ``train`` holds the rows from position
        ``testSize`` onward. ``test`` holds the first ``testSize`` rows, or is
        ``None`` when ``testSize`` is not greater than 0.
    """
    rows = []
    # The context manager guarantees the file handle is closed, even if
    # reading fails part-way through.
    with open(datafile) as handle:
        # Iterate the handle directly instead of readlines() so the whole
        # file is not held in memory twice.
        for line in handle:
            if '?' in line:
                continue  # Incomplete record: drop it.
            record = line.strip()
            if not record:
                # A blank line would otherwise become a bogus [''] row.
                continue
            rows.append(record.split(','))

    # Rows after the first `testSize` form the training set.
    train = pd.DataFrame(rows[testSize:], columns=column_names)
    test = None
    if testSize > 0:
        test = pd.DataFrame(rows[:testSize], columns=column_names)
    return train, test
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment