Skip to content

Instantly share code, notes, and snippets.

@jeongmincha
Last active June 14, 2017 07:28
Show Gist options
  • Save jeongmincha/4f94aadf08b16b525dbc8e93a2a3ffee to your computer and use it in GitHub Desktop.
Save jeongmincha/4f94aadf08b16b525dbc8e93a2a3ffee to your computer and use it in GitHub Desktop.
Ridge linear regression for the Kickstarter dataset
import glob
import pandas as pd
# read files
# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the row order (and therefore the positional train/test split performed later
# in this script) is reproducible from run to run.
lst = []
for file in sorted(glob.glob('../../../../Downloads/data_feature/*.csv')):
    lst.append(pd.read_csv(file, index_col=None, header=0))
df = pd.concat(lst)

# Derived columns: pledged amount relative to the goal, and the span between
# launch and deadline.
# NOTE(review): a zero `goal` would yield inf/NaN in `percentage` -- confirm
# the dataset contains no such rows.
df['percentage'] = df['usd_pledged'] / df['goal']
df['duration'] = df['deadline'] - df['launched_at']
def boolean_to_number(x):
    """Convert a boolean flag to an integer.

    Args:
        x: The value to convert.

    Returns:
        1 for a true boolean, 0 for a false boolean, and None for anything
        that is not a boolean (including the integers 1 and 0, strings, NaN
        and None).
    """
    if isinstance(x, bool):
        return int(x)
    # numpy boolean scalars are not the `True`/`False` singletons, so an
    # identity check (`x is True`) silently misses them. Detect them by their
    # boolean dtype instead; `ndim == 0` restricts this to scalars.
    dtype = getattr(x, 'dtype', None)
    if getattr(dtype, 'kind', None) == 'b' and getattr(x, 'ndim', None) == 0:
        return int(x)
    return None
from sklearn import linear_model

# Numeric copy of the `spotlight` flag, produced by boolean_to_number.
df['spotlight_num'] = df['spotlight'].apply(boolean_to_number)

# Regression targets and inputs. The column slice must be taken *after*
# `spotlight_num` has been added, because `-3` is counted from the end of the
# current column list (presumably skipping the three derived columns --
# TODO confirm against the CSV schema).
target_features = ['usd_pledged', 'backers_count', 'percentage']
input_features = (
    ['goal', 'deadline', 'launched_at', 'duration', 'spotlight_num']
    + list(df.columns[33:-3])
)

x = df[input_features]
y = df[target_features]

# Positional split: the first 75% of rows train the model, the rest test it.
ratio = 0.75
dev_split = int(len(x) * ratio)
train_x, test_x = x[:dev_split], x[dev_split:]
train_y, test_y = y[:dev_split], y[dev_split:]
from sklearn import preprocessing

print("... Normalize features")


def normalize_features(x, scaler=None):
    """Min-max scale the columns of ``x`` and return them as a new DataFrame.

    Args:
        x: Table of numeric features (anything ``MinMaxScaler`` accepts).
        scaler: Optional, already *fitted* scaler. When given, ``x`` is only
            transformed with it, so the statistics learned elsewhere (for
            example on the training split) are reused. When omitted, a fresh
            ``MinMaxScaler`` is fitted on ``x`` itself.

    Returns:
        A ``pandas.DataFrame`` of the scaled values with default integer
        row and column labels.
    """
    if scaler is None:
        return pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(x))
    return pd.DataFrame(scaler.transform(x))


# Fit the scalers on the training split only and reuse them for the test
# split. Fitting a separate scaler on the test data would put train and test
# on different scales and leak test statistics into the evaluation. Test
# values outside the training range simply fall outside [0, 1].
x_scaler = preprocessing.MinMaxScaler().fit(train_x)
y_scaler = preprocessing.MinMaxScaler().fit(train_y)
train_x_scaled = normalize_features(train_x, x_scaler)
train_y_scaled = normalize_features(train_y, y_scaler)
test_x_scaled = normalize_features(test_x, x_scaler)
test_y_scaled = normalize_features(test_y, y_scaler)
print("... Training")
# Ridge regression on the scaled training split; fit() returns the estimator.
reg = linear_model.Ridge(alpha=0.5).fit(train_x_scaled, train_y_scaled)

#### print r2 score!
print(reg.score(test_x_scaled, test_y_scaled))

### coefficients
# Fitted coefficients, with one column per input feature.
coef = pd.DataFrame(data=reg.coef_, columns=input_features)
### make word list
# Concatenate the unigram, bigram and trigram vocabularies, in that order.
# Each file is opened in a `with` block so its handle is closed promptly.
# NOTE: entries keep their trailing newline, exactly as readlines() returns
# them.
word_list = []
for ngram_file in ('unigram_list.txt', 'bigram_list.txt', 'trigram_list.txt'):
    with open('../../../../Downloads/data_feature/' + ngram_file) as handle:
        word_list.extend(handle.readlines())
def list_view(top_idxs, lst):
    """Translate feature labels into readable names.

    Args:
        top_idxs: Iterable of feature labels. A label made up only of decimal
            digits (``"12"`` or the integer ``12``) is treated as a position
            in ``lst``; any other label is kept unchanged.
        lst: Sequence to look positional labels up in.

    Returns:
        A list with one entry per label, in the same order as ``top_idxs``.

    Raises:
        IndexError: If a positional label is out of range for ``lst``.
    """
    top_n_lst = []
    for idx in top_idxs:
        # str() lets integer labels through instead of failing on a missing
        # `.isdigit`; isdecimal() only accepts characters that int() can
        # parse (isdigit() also accepts e.g. superscript digits).
        if str(idx).isdecimal():
            top_n_lst.append(lst[int(idx)])
        else:
            top_n_lst.append(idx)
    return top_n_lst
import pickle
target_idx = 0 # 0 = usd_pledged, 1 = 'backers_count' 2 = 'percentage'

# `target_idx` may be a single index or a collection of indices. Iterating
# over a bare int raises TypeError, so normalise it to a list first.
selected_targets = [target_idx] if isinstance(target_idx, int) else list(target_idx)

top_n = 50
# Transposed once, outside the loop: rows are input features and columns are
# the target positions.
coef_by_feature = coef.transpose()

for target_idx in selected_targets:
    target_name = target_features[target_idx]
    print("target variable: ", target_name)

    # Labels of the features with the largest / smallest coefficients.
    top_n_idxs1 = coef_by_feature.nlargest(top_n, target_idx)[target_idx].index
    top_n_idxs2 = coef_by_feature.nsmallest(top_n, target_idx)[target_idx].index
    largest = list_view(top_n_idxs1, word_list)
    smallest = list_view(top_n_idxs2, word_list)

    # File names follow `top_n`, so they stay truthful if it is changed.
    with open('{}_largest_{}.pickle'.format(target_name, top_n), 'wb') as handle:
        pickle.dump(largest, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('{}_smallest_{}.pickle'.format(target_name, top_n), 'wb') as handle:
        pickle.dump(smallest, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print(largest)
    print(smallest)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment