Skip to content

Instantly share code, notes, and snippets.

@erogol
Last active February 8, 2018 20:28
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save erogol/7267900 to your computer and use it in GitHub Desktop.
Save erogol/7267900 to your computer and use it in GitHub Desktop.
Logistic-regression ensemble with per-fold feature selection. Requires the scikit-learn Python library.
def linear_model_ensemble(X, y, X_test, fold_num, fold_num_sec, grid_search_range, oobe=True, x_val=True):
    """Train an ensemble of logistic-regression classifiers with per-fold feature selection.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training set.
    y : array-like
        Training labels: +1 for positive instances, -1 for negative instances.
    X_test : array-like
        Test set to produce ensemble predictions for.
    fold_num : int
        Fold count for the first-stage cross-validation that trains one
        classifier (and one feature selector) per fold.
    fold_num_sec : int
        Fold count for the second-stage cross-validation that estimates the
        generalization performance of the whole ensemble.
    grid_search_range : list
        Candidate values for the regularization parameter C in grid search.
    oobe : bool
        If True, weight each member's prediction by its held-out fold score.
    x_val : bool
        If True, run the second-stage cross-validation and print its result.

    Returns
    -------
    pred_all : ndarray
        Averaged (optionally score-weighted) ensemble prediction on X_test.
    clfs : list
        The trained classifiers, one per first-stage fold.

    NOTE(review): relies on a module-level ``find_best_parameters`` helper
    (not visible in this chunk) expected to return ``(fitted_clf, score)``.
    """
    # Imports kept function-local, matching the original gist's style.
    import numpy as np
    from sklearn.model_selection import KFold, StratifiedKFold
    from sklearn.metrics import roc_auc_score
    import sklearn.linear_model as lm
    from sklearn.feature_selection import SelectPercentile, chi2

    # dual=True requires the liblinear solver in current scikit-learn;
    # this matches the solver the original (pre-0.18) sklearn used by default.
    rd = lm.LogisticRegression(dual=True, tol=1e-5,
                               fit_intercept=True, intercept_scaling=1.0,
                               class_weight=None, random_state=None,
                               solver='liblinear')

    scores = np.zeros((0,))  # held-out score of each ensemble member
    clfs = []                # classifier trained on each train fold
    feat_selects = []        # matching feature selector for each fold

    # First-stage CV: each fold trains a separate classifier whose
    # hyper-parameters are optimized and whose discriminative features are
    # selected for that partition of examples.
    for train, test in KFold(n_splits=fold_num).split(X):
        train_fold, test_fold = X[train], X[test]
        train_y, test_y = y[train], y[test]
        # Keep the 16% most discriminative features by chi2 score.
        feat_select = SelectPercentile(score_func=chi2, percentile=16).fit(
            train_fold, train_y.astype(float))
        feat_selects.append(feat_select)
        train_fold = feat_select.transform(train_fold)
        test_fold = feat_select.transform(test_fold)
        tuned_parameters = [{'C': grid_search_range}]
        # Hyper-parameter optimization (module-level helper, defined elsewhere).
        rd_fitted, score = find_best_parameters(train_fold, train_y,
                                                test_fold, test_y,
                                                rd, tuned_parameters)
        clfs.append(rd_fitted)
        scores = np.append(scores, score)

    # Second-stage CV: estimate the generalization performance of the ensemble.
    if x_val:
        clf_scores = np.array(())
        for train, test in StratifiedKFold(n_splits=fold_num_sec).split(X, y):
            test_fold, test_y = X[test], y[test]
            pred_vals = None
            for counter, clf in enumerate(clfs):
                test_fold_transed = feat_selects[counter].transform(test_fold)
                pred_val = clf.predict(test_fold_transed)
                if oobe:
                    # Weight each member's vote by its held-out score.
                    pred_val = pred_val * scores[counter]
                pred_vals = pred_val if pred_vals is None else pred_vals + pred_val
            # Average the member votes and score this fold with ROC AUC.
            pred = pred_vals / len(clfs)
            clf_scores = np.append(clf_scores,
                                   roc_auc_score(test_y.astype(float), pred))
        print("Final X-val result", clf_scores.mean())

    # Final pass: refit every member on the full training data (full feature
    # set — the per-fold selectors are not applied here, as in the original)
    # and accumulate its (optionally weighted) prediction on X_test.
    print("training on full data")
    pred_all = None
    for counter, clf in enumerate(clfs):
        clf.fit(X, y)
        pred = clf.predict(X_test)
        if oobe:
            pred = pred * scores[counter]
        pred_all = pred if pred_all is None else pred_all + pred
    pred_all = pred_all / len(clfs)
    return pred_all, clfs
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment