Last active
June 14, 2017 07:28
-
-
Save jeongmincha/4f94aadf08b16b525dbc8e93a2a3ffee to your computer and use it in GitHub Desktop.
ridge linear regression for Kickstarter dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob | |
import pandas as pd | |
# Read every feature CSV and stack them into a single frame.
# glob.glob() returns paths in arbitrary, filesystem-dependent order; sorting
# keeps the row order (and therefore the positional train/test split made
# further down) reproducible between runs.
csv_paths = sorted(glob.glob('../../../../Downloads/data_feature/*.csv'))
if not csv_paths:
    # pd.concat() on an empty list raises a ValueError that does not mention
    # the path, so fail with the same exception type but a useful message.
    raise ValueError('no CSV files matched ../../../../Downloads/data_feature/*.csv')
lst = [pd.read_csv(path, index_col=None, header=0) for path in csv_paths]
# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own row numbers.
df = pd.concat(lst, ignore_index=True)

# Derived columns: pledged amount as a fraction of the goal, and the span
# between launch and deadline.
# NOTE(review): a goal of 0 yields inf/NaN in 'percentage' -- confirm the data
# never contains one, since the scaler used later rejects non-finite values.
df['percentage'] = df['usd_pledged'] / df['goal']
df['duration'] = df['deadline'] - df['launched_at']
def boolean_to_number(x):
    """Map a boolean to 1 or 0; anything that is not a boolean maps to None.

    Args:
        x: A single value, typically one cell of a boolean column.

    Returns:
        1 for a true boolean, 0 for a false boolean, and None for every
        non-boolean value (for example NaN, a string, or a plain integer).
    """
    # pd.api.types.is_bool() recognises both Python bools and NumPy booleans.
    # Identity checks against the True/False singletons only match Python
    # bools, so a numpy.bool_ would fall through and silently become None.
    if not pd.api.types.is_bool(x):
        return None
    return 1 if x else 0
from sklearn import linear_model

# Numeric (1/0) copy of the boolean 'spotlight' column so it can be used as a
# regression input.
df['spotlight_num'] = df['spotlight'].apply(boolean_to_number)

# Columns the model predicts.
target_features = ['usd_pledged', 'backers_count', 'percentage']

# Columns the model learns from: five hand-picked ones plus every column from
# position 33 up to (but excluding) the last three.
# NOTE(review): the last three columns are presumably the derived
# 'percentage', 'duration' and 'spotlight_num' added by this script, and
# position 33 presumably marks where the n-gram feature columns start --
# confirm against the CSV layout.
input_features = ['goal', 'deadline', 'launched_at', 'duration', 'spotlight_num']
input_features.extend(df.columns[33:-3])

x = df[input_features]
y = df[target_features]

# Positional split: the first 75% of the rows train the model, the remainder
# is held out for scoring.
ratio = 0.75
dev_split = int(ratio * len(x))
train_x, test_x = x[:dev_split], x[dev_split:]
train_y, test_y = y[:dev_split], y[dev_split:]

print("... Normalize features")
def normalize_features(x):
    """Rescale the columns of ``x`` with a freshly fitted MinMaxScaler.

    A new scaler with default settings is created and fitted on ``x`` itself
    on every call, so the scaling parameters are never shared between calls.

    Args:
        x: The data to scale; passed straight to ``MinMaxScaler.fit_transform``.

    Returns:
        A new pandas DataFrame built from the scaler's output. The labels of
        ``x`` are not carried over: the frame is constructed from the
        transformed values alone.
    """
    from sklearn.preprocessing import MinMaxScaler

    scaled_values = MinMaxScaler().fit_transform(x)
    return pd.DataFrame(scaled_values)
from sklearn import preprocessing

# Fit the min-max scalers on the training split only and reuse them for the
# test split. Scaling the test split with its own minimum/maximum would put
# train and test values on different scales, so the fitted coefficients would
# not apply to the test inputs and the reported R^2 would be misleading.
x_scaler = preprocessing.MinMaxScaler()
y_scaler = preprocessing.MinMaxScaler()
train_x_scaled = pd.DataFrame(x_scaler.fit_transform(train_x))
train_y_scaled = pd.DataFrame(y_scaler.fit_transform(train_y))
test_x_scaled = pd.DataFrame(x_scaler.transform(test_x))
test_y_scaled = pd.DataFrame(y_scaler.transform(test_y))

print("... Training")
reg = linear_model.Ridge(alpha=.5)
reg.fit(train_x_scaled, train_y_scaled)

#### print r2 score!
print(reg.score(test_x_scaled, test_y_scaled))

### coefficients
# One column per input feature; the rows follow the order of the target columns.
coef = pd.DataFrame(reg.coef_, columns=input_features)

### make word list
# The three n-gram files are concatenated in this fixed order. Each file is
# closed as soon as it has been read, and only the trailing newline is
# stripped so that every line still maps to exactly one list position.
word_list = []
for ngram_path in (
    '../../../../Downloads/data_feature/unigram_list.txt',
    '../../../../Downloads/data_feature/bigram_list.txt',
    '../../../../Downloads/data_feature/trigram_list.txt',
):
    with open(ngram_path) as ngram_file:
        word_list.extend(line.rstrip('\n') for line in ngram_file)
def list_view(top_idxs, lst):
    """Turn a sequence of string labels into display names.

    Args:
        top_idxs: Iterable of string labels.
        lst: Sequence consulted for labels made up solely of digits.

    Returns:
        A list with one entry per label, in the same order: ``lst[int(label)]``
        when the label consists only of digits, otherwise the label itself.
    """
    return [
        lst[int(label)] if label.isdigit() else label
        for label in top_idxs
    ]
import pickle

# Positions in target_features to report on:
# 0 = usd_pledged, 1 = 'backers_count' 2 = 'percentage'
# This must be a list: looping over a bare integer raises TypeError.
target_idxs = [0]
top_n = 50

# After transposing, every row is an input feature and every column (0, 1, 2)
# holds that feature's coefficient for one target.
coef_by_feature = coef.transpose()

for target_idx in target_idxs:
    target_name = target_features[target_idx]
    print("target variable: ", target_name)

    # Labels of the features with the most positive / most negative weights.
    top_n_idxs1 = coef_by_feature.nlargest(top_n, target_idx)[target_idx].index
    top_n_idxs2 = coef_by_feature.nsmallest(top_n, target_idx)[target_idx].index

    # Resolve the labels once and reuse the result for both pickling and
    # printing.
    largest = list_view(top_n_idxs1, word_list)
    smallest = list_view(top_n_idxs2, word_list)

    # The file names follow top_n, so changing it cannot leave a stale "50"
    # in the name.
    with open('%s_largest_%d.pickle' % (target_name, top_n), 'wb') as handle:
        pickle.dump(largest, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('%s_smallest_%d.pickle' % (target_name, top_n), 'wb') as handle:
        pickle.dump(smallest, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print(largest)
    print(smallest)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment