# leonaburime/datafile.py

## datafile.py
import urllib2, pandas as pd


# Registry of the data sets this module knows about, keyed by short name.
# Each entry carries the column names ('features'), the path of the data
# file ('file') and the name of the outcome column ('outcome_name').
d = {}

# Mushroom data set: 'class' (the outcome column) comes first, followed by
# the 22 descriptive attributes.
d['mushroom'] = {
	'features': [
		'class',
		'cap-shape', 'cap-surface', 'cap-color',
		'bruises?', 'odor',
		'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
		'stalk-shape', 'stalk-root',
		'stalk-surface-above-ring', 'stalk-surface-below-ring',
		'stalk-color-above-ring', 'stalk-color-below-ring',
		'veil-type', 'veil-color',
		'ring-number', 'ring-type',
		'spore-print-color', 'population', 'habitat',
	],
	'file': 'Data/agaricus-lepiota.data',
	# Presumably the raw codes that appear in the 'class' column -- verify
	# against the data file.
	'outcome_options': {'good': 'e', 'bad': 'p', 'default': 'p'},
	'outcome_name': 'class',
	# Remote copy of the data file; fetching is intentionally left disabled:
	#'url': urllib2.urlopen('https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'),
}

# Small "play" data set; 'features_attributes' lists the values each column
# may take.
d['play'] = {
	'features': ['outlook', 'temperature', 'humidity', 'wind', 'play'],
	'features_attributes': {
		'outlook': ['sunny', 'overcast', 'rain'],
		'temperature': ['hot', 'mild', 'cool'],
		'humidity': ['high', 'normal', 'low'],
		'wind': ['weak', 'strong'],
		'play': ['yes', 'no'],
	},
	'file': 'Data/play.txt',
	'outcome_name': 'play',
	# NOTE(review): this is a set, while the mushroom entry uses a dict with
	# good/bad/default keys -- confirm callers cope with both shapes.
	'outcome_options': {'yes', 'no'},
}

def get(name, d=d):
	"""Return the entry stored under ``name`` in ``d``.

	``d`` defaults to the module-level ``d`` table, captured when this
	function is defined. A missing ``name`` raises whatever ``d[name]``
	raises (``KeyError`` for a plain dict).
	"""
	entry = d[name]
	return entry

def createDataFrame(column_names, datafile, testSize=0):
	"""Load a comma-separated text file into train/test pandas DataFrames.

	Every line of ``datafile`` is stripped of surrounding whitespace and
	split on ','. Lines containing a '?' anywhere (treated here as the marker
	for an incomplete record) and blank lines are dropped.

	Args:
		column_names: column labels for the resulting DataFrame(s).
		datafile: path of the text file to read.
		testSize: number of leading kept rows to hold out as the test set.

	Returns:
		A ``(train, test)`` tuple. ``train`` holds the kept rows from index
		``testSize`` onward. ``test`` holds the first ``testSize`` kept rows,
		or is ``None`` when ``testSize`` is not greater than 0.
	"""
	# The with-block guarantees the file is closed even if reading fails.
	# Blank lines are skipped because they would otherwise become one-field
	# rows that do not match ``column_names``.
	with open(datafile) as source:
		data = [
			line.strip().split(',')
			for line in source
			if '?' not in line and line.strip()
		]

	train = pd.DataFrame(data[testSize:], columns=column_names)
	test = None
	if testSize > 0:
		test = pd.DataFrame(data[:testSize], columns=column_names)
	return train, test
	import urllib2, pandas as pd




	# Registry of the data sets this module knows about, keyed by short name.
	# Each entry carries the column names ('features'), the path of the data
	# file ('file') and the name of the outcome column ('outcome_name').
	d = {}

	# Mushroom data set: 'class' (the outcome column) comes first, followed by
	# the 22 descriptive attributes.
	d['mushroom'] = {
		'features': [
			'class',
			'cap-shape', 'cap-surface', 'cap-color',
			'bruises?', 'odor',
			'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
			'stalk-shape', 'stalk-root',
			'stalk-surface-above-ring', 'stalk-surface-below-ring',
			'stalk-color-above-ring', 'stalk-color-below-ring',
			'veil-type', 'veil-color',
			'ring-number', 'ring-type',
			'spore-print-color', 'population', 'habitat',
		],
		'file': 'Data/agaricus-lepiota.data',
		# Presumably the raw codes that appear in the 'class' column -- verify
		# against the data file.
		'outcome_options': {'good': 'e', 'bad': 'p', 'default': 'p'},
		'outcome_name': 'class',
		# Remote copy of the data file; fetching is intentionally left disabled:
		#'url': urllib2.urlopen('https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'),
	}

	# Small "play" data set; 'features_attributes' lists the values each
	# column may take.
	d['play'] = {
		'features': ['outlook', 'temperature', 'humidity', 'wind', 'play'],
		'features_attributes': {
			'outlook': ['sunny', 'overcast', 'rain'],
			'temperature': ['hot', 'mild', 'cool'],
			'humidity': ['high', 'normal', 'low'],
			'wind': ['weak', 'strong'],
			'play': ['yes', 'no'],
		},
		'file': 'Data/play.txt',
		'outcome_name': 'play',
		# NOTE(review): this is a set, while the mushroom entry uses a dict
		# with good/bad/default keys -- confirm callers cope with both shapes.
		'outcome_options': {'yes', 'no'},
	}

	def get(name, d=d):
		"""Return the entry stored under ``name`` in ``d``.

		``d`` defaults to the ``d`` table in scope when this function is
		defined. A missing ``name`` raises whatever ``d[name]`` raises
		(``KeyError`` for a plain dict).
		"""
		# The body is indented under the def; in the pasted copy it had been
		# flattened to the def's own level, which does not parse.
		return d[name]

	def createDataFrame(column_names, datafile, testSize=0):
		"""Load a comma-separated text file into train/test pandas DataFrames.

		Every line of ``datafile`` is stripped of surrounding whitespace and
		split on ','. Lines containing a '?' anywhere (treated here as the
		marker for an incomplete record) and blank lines are dropped.

		Args:
			column_names: column labels for the resulting DataFrame(s).
			datafile: path of the text file to read.
			testSize: number of leading kept rows to hold out as the test set.

		Returns:
			A ``(train, test)`` tuple. ``train`` holds the kept rows from
			index ``testSize`` onward. ``test`` holds the first ``testSize``
			kept rows, or is ``None`` when ``testSize`` is not greater than 0.
		"""
		# The with-block guarantees the file is closed even if reading fails.
		# Blank lines are skipped because they would otherwise become
		# one-field rows that do not match ``column_names``.
		with open(datafile) as source:
			data = [
				line.strip().split(',')
				for line in source
				if '?' not in line and line.strip()
			]

		train = pd.DataFrame(data[testSize:], columns=column_names)
		test = None
		if testSize > 0:
			test = pd.DataFrame(data[:testSize], columns=column_names)
		return train, test