jef5ez/extraction.py Secret

## extraction.py
"""In this code block, you must define the function `extract_main_dataset`.
`extract_main_dataset` must take no arguments and return a tuple (X, y), where X is a
Numpy array with shape (n_samples, n_features) corresponding to the features of your
main dataset and y is the Numpy array corresponding to the ground truth labels of each
sample.
"""
import numpy as np
import pandas as pd
# data available through kaggle https://www.kaggle.com/c/zillow-prize-1/data
def extract_main_dataset():
    """Load the Zillow 2016 training data as a numeric feature matrix.

    Reads ``../data/train_2016.csv`` and ``../data/properties_2016.csv``
    (both relative to the current working directory), left-joins the
    property attributes onto the transactions by ``parcelid`` and splits
    the ``logerror`` target off from the features.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a ``float32`` NumPy array of shape
        ``(n_samples, n_features)`` and ``y`` is a NumPy array with the
        ``logerror`` of every sample. Missing feature values are ``NaN``.
    """
    train = pd.read_csv('../data/train_2016.csv')
    # NOTE(review): the log captured alongside this code shows a pandas
    # DtypeWarning (mixed column types) while loading the data; pinning
    # `dtype=` for the affected columns would silence it -- confirm which
    # file triggers it before changing the call.
    prop = pd.read_csv('../data/properties_2016.csv')

    # float32 needs half the bytes of float64; downcast before merging.
    for col in prop.select_dtypes(include=[np.float64]).columns:
        prop[col] = prop[col].astype(np.float32)

    merged = train.merge(prop, how='left', on='parcelid')
    y_train = merged['logerror'].values

    # Drop the join key, the target, the date and the two free-text columns.
    x_train = merged.drop(
        ['parcelid', 'logerror', 'transactiondate',
         'propertyzoningdesc', 'propertycountylandusecode'],
        axis=1,
    )

    # Every remaining non-numeric column is reduced to a flag: set only where
    # the cell equals True; missing values and any other content become False.
    for col in x_train.select_dtypes(exclude=[np.number, 'bool']).columns:
        x_train[col] = (x_train[col] == True)  # noqa: E712 (element-wise)

    # The contract asks for a NumPy array. Plain `.values` on a frame that
    # mixes bool and float columns yields an object array, so cast first.
    return x_train.astype(np.float32).values, y_train

## learner.py
from xgboost import XGBRegressor

# Module-level XGBoost regressor. Every hyperparameter is left at its default
# except the seed, which is pinned to 8 (presumably for reproducible runs).
base_learner = XGBRegressor(seed=8)

## serialization.log
::1 - - [2017-05-30 23:26:29] "POST /ensemble/base-learner-origins/1/verify/?path=/home/joseph/zillow/xcessivModel/xgboost HTTP/1.1" 200 1059 0.149282
::1 - - [2017-05-30 23:28:25] "GET /ensemble/base-learner-origins/1/confirm/?path=/home/joseph/zillow/xcessivModel/xgboost HTTP/1.1" 200 1058 0.182894
::1 - - [2017-05-30 23:29:34] "POST /ensemble/base-learner-origins/1/create-base-learner/?path=/home/joseph/zillow/xcessivModel/xgboost HTTP/1.1" 200 739 0.018246
23:29:34 default: xcessiv.rqtasks.generate_meta_features('/home/joseph/zillow/xcessivModel/xgboost', 1) (96755f8c-5f4a-4520-973d-73edd73e7203)
[2017-05-30 23:29:34,749] ERROR in app: Exception on /ensemble/base-learners/ [GET]
Traceback (most recent call last):
  File "/home/joseph/environments/tf-env/lib/python3.5/site-packages/flask/app.py", line 1982, in wsgi_app
    response = self.full_dispatch_request()
  File "/home/joseph/environments/tf-env/lib/python3.5/site-packages/flask/app.py", line 1614, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/home/joseph/environments/tf-env/lib/python3.5/site-packages/flask/app.py", line 1517, in handle_user_exception
    reraise(exc_type, exc_value, tb)
  File "/home/joseph/environments/tf-env/lib/python3.5/site-packages/flask/_compat.py", line 33, in reraise
    raise value
  File "/home/joseph/environments/tf-env/lib/python3.5/site-packages/flask/app.py", line 1612, in full_dispatch_request
    rv = self.dispatch_request()
  File "/home/joseph/environments/tf-env/lib/python3.5/site-packages/flask/app.py", line 1598, in dispatch_request
    return self.view_functions[rule.endpoint](**req.view_args)
  File "/home/joseph/environments/tf-env/lib/python3.5/site-packages/xcessiv/views.py", line 414, in get_base_learners
    return jsonify(map(lambda x: x.serialize, base_learners))
  File "/home/joseph/environments/tf-env/lib/python3.5/site-packages/flask/json.py", line 263, in jsonify
    (dumps(data, indent=indent, separators=separators), '\n'),
  File "/home/joseph/environments/tf-env/lib/python3.5/site-packages/flask/json.py", line 123, in dumps
    rv = _json.dumps(obj, **kwargs)
  File "/usr/lib/python3.5/json/__init__.py", line 237, in dumps
    **kw).encode(obj)
  File "/usr/lib/python3.5/json/encoder.py", line 200, in encode
    chunks = list(chunks)
  File "/usr/lib/python3.5/json/encoder.py", line 436, in _iterencode
    o = _default(o)
  File "/home/joseph/environments/tf-env/lib/python3.5/site-packages/flask/json.py", line 80, in default
    return _json.JSONEncoder.default(self, o)
  File "/usr/lib/python3.5/json/encoder.py", line 179, in default
    raise TypeError(repr(o) + " is not JSON serializable")
TypeError: <map object at 0x7fab23a226a0> is not JSON serializable
::1 - - [2017-05-30 23:29:34] "GET /ensemble/base-learners/?path=/home/joseph/zillow/xcessivModel/xgboost HTTP/1.1" 500 412 0.004038
/home/joseph/environments/tf-env/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
/home/joseph/environments/tf-env/lib/python3.5/site-packages/xcessiv/rqtasks.py:121: DtypeWarning: Columns (22,32,34,49,55) have mixed types. Specify dtype option on import or set low_memory=False.
  X, y = extraction.return_train_dataset()
23:29:55 default: Job OK (96755f8c-5f4a-4520-973d-73edd73e7203)
23:29:55 Result is kept for 500 seconds
23:29:55
23:29:55 *** Listening on default...
	"""In this code block, you must define the function `extract_main_dataset`.
	`extract_main_dataset` must take no arguments and return a tuple (X, y), where X is a
	Numpy array with shape (n_samples, n_features) corresponding to the features of your
	main dataset and y is the Numpy array corresponding to the ground truth labels of each
	sample.
	"""
	import numpy as np
	import pandas as pd
	# data available through kaggle https://www.kaggle.com/c/zillow-prize-1/data
	def extract_main_dataset():
	    """Load the Zillow 2016 training data as a numeric feature matrix.
	
	    Reads ``../data/train_2016.csv`` and ``../data/properties_2016.csv``
	    (both relative to the current working directory), left-joins the
	    property attributes onto the transactions by ``parcelid`` and splits
	    the ``logerror`` target off from the features.
	
	    Returns:
	        tuple: ``(X, y)`` where ``X`` is a ``float32`` NumPy array of shape
	        ``(n_samples, n_features)`` and ``y`` is a NumPy array with the
	        ``logerror`` of every sample. Missing feature values are ``NaN``.
	    """
	    train = pd.read_csv('../data/train_2016.csv')
	    # NOTE(review): the log captured alongside this code shows a pandas
	    # DtypeWarning (mixed column types) while loading the data; pinning
	    # `dtype=` for the affected columns would silence it -- confirm which
	    # file triggers it before changing the call.
	    prop = pd.read_csv('../data/properties_2016.csv')
	    # float32 needs half the bytes of float64; downcast before merging.
	    for col in prop.select_dtypes(include=[np.float64]).columns:
	        prop[col] = prop[col].astype(np.float32)
	    merged = train.merge(prop, how='left', on='parcelid')
	    y_train = merged['logerror'].values
	    # Drop the join key, the target, the date and the two free-text columns.
	    x_train = merged.drop(
	        ['parcelid', 'logerror', 'transactiondate',
	         'propertyzoningdesc', 'propertycountylandusecode'],
	        axis=1,
	    )
	    # Every remaining non-numeric column is reduced to a flag: set only where
	    # the cell equals True; missing values and any other content become False.
	    for col in x_train.select_dtypes(exclude=[np.number, 'bool']).columns:
	        x_train[col] = (x_train[col] == True)  # noqa: E712 (element-wise)
	    # The contract asks for a NumPy array. Plain `.values` on a frame that
	    # mixes bool and float columns yields an object array, so cast first.
	    return x_train.astype(np.float32).values, y_train
	from xgboost import XGBRegressor

	# Module-level XGBoost regressor. Every hyperparameter is left at its default
	# except the seed, which is pinned to 8 (presumably for reproducible runs).
	base_learner = XGBRegressor(seed=8)
	::1 - - [2017-05-30 23:26:29] "POST /ensemble/base-learner-origins/1/verify/?path=/home/joseph/zillow/xcessivModel/xgboost HTTP/1.1" 200 1059 0.149282
	::1 - - [2017-05-30 23:28:25] "GET /ensemble/base-learner-origins/1/confirm/?path=/home/joseph/zillow/xcessivModel/xgboost HTTP/1.1" 200 1058 0.182894
	::1 - - [2017-05-30 23:29:34] "POST /ensemble/base-learner-origins/1/create-base-learner/?path=/home/joseph/zillow/xcessivModel/xgboost HTTP/1.1" 200 739 0.018246
	23:29:34 default: xcessiv.rqtasks.generate_meta_features('/home/joseph/zillow/xcessivModel/xgboost', 1) (96755f8c-5f4a-4520-973d-73edd73e7203)
	[2017-05-30 23:29:34,749] ERROR in app: Exception on /ensemble/base-learners/ [GET]
	Traceback (most recent call last):
	File "/home/joseph/environments/tf-env/lib/python3.5/site-packages/flask/app.py", line 1982, in wsgi_app
	response = self.full_dispatch_request()
	File "/home/joseph/environments/tf-env/lib/python3.5/site-packages/flask/app.py", line 1614, in full_dispatch_request
	rv = self.handle_user_exception(e)
	File "/home/joseph/environments/tf-env/lib/python3.5/site-packages/flask/app.py", line 1517, in handle_user_exception
	reraise(exc_type, exc_value, tb)
	File "/home/joseph/environments/tf-env/lib/python3.5/site-packages/flask/_compat.py", line 33, in reraise
	raise value
	File "/home/joseph/environments/tf-env/lib/python3.5/site-packages/flask/app.py", line 1612, in full_dispatch_request
	rv = self.dispatch_request()
	File "/home/joseph/environments/tf-env/lib/python3.5/site-packages/flask/app.py", line 1598, in dispatch_request
	return self.view_functions[rule.endpoint](**req.view_args)
	File "/home/joseph/environments/tf-env/lib/python3.5/site-packages/xcessiv/views.py", line 414, in get_base_learners
	return jsonify(map(lambda x: x.serialize, base_learners))
	File "/home/joseph/environments/tf-env/lib/python3.5/site-packages/flask/json.py", line 263, in jsonify
	(dumps(data, indent=indent, separators=separators), '\n'),
	File "/home/joseph/environments/tf-env/lib/python3.5/site-packages/flask/json.py", line 123, in dumps
	rv = _json.dumps(obj, **kwargs)
	File "/usr/lib/python3.5/json/__init__.py", line 237, in dumps
	**kw).encode(obj)
	File "/usr/lib/python3.5/json/encoder.py", line 200, in encode
	chunks = list(chunks)
	File "/usr/lib/python3.5/json/encoder.py", line 436, in _iterencode
	o = _default(o)
	File "/home/joseph/environments/tf-env/lib/python3.5/site-packages/flask/json.py", line 80, in default
	return _json.JSONEncoder.default(self, o)
	File "/usr/lib/python3.5/json/encoder.py", line 179, in default
	raise TypeError(repr(o) + " is not JSON serializable")
	TypeError: <map object at 0x7fab23a226a0> is not JSON serializable
	::1 - - [2017-05-30 23:29:34] "GET /ensemble/base-learners/?path=/home/joseph/zillow/xcessivModel/xgboost HTTP/1.1" 500 412 0.004038
	/home/joseph/environments/tf-env/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
	"This module will be removed in 0.20.", DeprecationWarning)
	/home/joseph/environments/tf-env/lib/python3.5/site-packages/xcessiv/rqtasks.py:121: DtypeWarning: Columns (22,32,34,49,55) have mixed types. Specify dtype option on import or set low_memory=False.
	X, y = extraction.return_train_dataset()
	23:29:55 default: Job OK (96755f8c-5f4a-4520-973d-73edd73e7203)
	23:29:55 Result is kept for 500 seconds
	23:29:55
	23:29:55 *** Listening on default...