Skip to content

Instantly share code, notes, and snippets.

View dylanjf's full-sized avatar

Dylan Friedmann dylanjf

  • Chegg
  • Brooklyn, NY
View GitHub Profile
# Requires the Matrix package, which provides sparse.model.matrix().
library(Matrix)
# One-hot encode every column into a sparse design matrix; the formula
# "~ . - 1" uses all columns and drops the intercept so each factor level
# becomes its own indicator column.
amazon_train = sparse.model.matrix(~. - 1, data = amazon_train)
amazon_test$ACTION1 = 1
#this is really just a placeholder variable so that the training examples won't be removed when I call match.
#also, make sure you put this in the same column as it is in the training set...
# Encode the test set the same way so train/test columns line up.
amazon_test = sparse.model.matrix(~. - 1, data = amazon_test)
@dylanjf
dylanjf / gist:5832136
Last active December 18, 2015 19:19
greedy selection R
#read and separate data

# Load the raw training file, then drop column 10
# (presumably redundant/unused — TODO confirm against the CSV header).
amazon_train = read.csv("C:/Users/dylanjf/Desktop/amazon/train.csv")
amazon_train = amazon_train[,-10]
# Column 1 holds the target; keep it as a matrix, then remove it so
# amazon_train contains only predictor columns.
amazon_Ytrain = as.matrix(amazon_train[,1])
amazon_train = amazon_train[,-1]
# Test set: keep only the first 8 columns (the predictor columns).
amazon_test = read.csv("C:/Users/dylanjf/Desktop/amazon/test.csv")
amazon_test = amazon_test[,c(1:8)]
###"naive bayesian" approach... if matched in training to test, return the
###probability that the person was accepted over all training examples###
# (the original second comment line was missing its leading '#', which is a
#  syntax error in R — fixed here)

#concatenate all base variables except for RESOURCE into one key string per row
#(columns 2:8; column 1 is assumed to be RESOURCE — TODO confirm upstream)
train_person = paste(amazon_Xtrain[,2], amazon_Xtrain[,3], amazon_Xtrain[,4],
                     amazon_Xtrain[,5], amazon_Xtrain[,6], amazon_Xtrain[,7],
                     amazon_Xtrain[,8], sep = "")
test_person = paste(amazon_Xtest[,2], amazon_Xtest[,3], amazon_Xtest[,4],
                    amazon_Xtest[,5], amazon_Xtest[,6], amazon_Xtest[,7],
                    amazon_Xtest[,8], sep = "")
@dylanjf
dylanjf / gist:7011219
Last active December 25, 2015 17:09
3 rep 10 fold CV
########3 rep 10 fold CV to determine feature sparsity percentage via RFE#########
#X = concatenated text features for training set (title, body, url) transformed via TfIdfVectorizer
#y = training set classification (0, 1)
import numpy as np
import pandas as pd
import sklearn.linear_model as lm
from sklearn.cross_validation import KFold
from sklearn import metrics
import numpy as np
class CorrMatrix():
"""
creates and displays the correlation matrix for a data set in a
memory efficient manner.
additionally, allows the option to enable feature selection, cutting off
highly correlated features at a given threshold.
@dylanjf
dylanjf / gist:389f7b56ef73ad6ae75b
Last active August 29, 2015 14:06
rare count by column
from csv import DictReader
import numpy as np
import pandas as pd
class NestedDictCreator(dict):
    """
    Implementation of perl's autovivification feature.
    Used to generate nested dictionaries on the fly: a lookup of a
    missing key creates, stores and returns a fresh NestedDictCreator,
    so chained assignments like ``d["a"]["b"]["c"] = 1`` work without
    building the intermediate levels first.
    """

    def __missing__(self, key):
        # dict invokes __missing__ when d[key] fails; materialize the key
        # with a new nested mapping so deeper lookups autovivify too.
        # (The visible original declared this behavior in its docstring but
        # contained no implementation.)
        value = self[key] = type(self)()
        return value
from abc import ABCMeta, abstractmethod
from typing import Dict
class Base(metaclass=ABCMeta):
    """Abstract base class holding a single value ``a``.

    Cannot be instantiated directly; concrete subclasses must
    implement :meth:`sub_thing`.
    """

    def __init__(self, a):
        # Store the constructor argument verbatim; no validation is done.
        self.a = a

    @abstractmethod
    def sub_thing(self, **data) -> Dict:
        """Build and return a dict from arbitrary keyword arguments.

        Abstract: subclasses supply the actual behavior. Calling this
        implementation via ``super()`` returns ``None``, as before.
        """
from abc import ABCMeta, abstractmethod
from typing import Dict
# NOTE(review): byte-identical duplicate of the Base class defined earlier
# in this listing — verify whether both copies are really needed.
class Base(metaclass=ABCMeta):
    """Abstract base class that stores a single value ``a``.

    Subclasses must override :meth:`sub_thing` before instantiation.
    """

    def __init__(self, a):
        # Held as-is for subclass use; no validation is performed.
        self.a = a

    @abstractmethod
    def sub_thing(self, **data) -> Dict:
        """Return a dict built from keyword arguments (per the annotation)."""
        pass