Lucas Allen lgallen

## gist:f513fe2d24c4b407382a
cat mtcars.csv


"","mpg","cyl","disp","hp","drat","wt","qsec","vs","am","gear","carb"
"Mazda RX4",21,6,160,110,3.9,2.62,16.46,0,1,4,4
"Mazda RX4 Wag",21,6,160,110,3.9,2.875,17.02,0,1,4,4
"Datsun 710",22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
"Hornet 4 Drive",21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
"Hornet Sportabout",18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
"Valiant",18.1,6,225,105,2.76,3.46,20.22,1,0,3,1

## tweet_dumper.py
#!/usr/bin/env python
# encoding: utf-8

import tweepy #https://github.com/tweepy/tweepy
import csv

# Twitter API credentials -- left as empty-string placeholders in this snippet.
# NOTE(review): presumably these are filled in and handed to tweepy's
# authentication setup further down; that code is not visible here.
consumer_key = ""
consumer_secret = ""
access_key = ""

## doc2vec_hyperparameters.txt
#doc2vec parameters
vector_size = 300
window_size = 15
min_count = 1
sampling_threshold = 1e-5
negative_size = 5
train_epoch = 100
dm = 0 #0 = dbow; 1 = dmpv
worker_count = 1 #number of parallel processes

## cosine_similarity_vectorized.R
# Cosine similarity between one row of `df` and every other row.
#   row_index: index of the reference row in `df` (used below for both
#              positive and negative row indexing).
#   df:        table with one observation per row
#              (NOTE(review): presumably an all-numeric matrix or data frame;
#              confirm against the caller).
# Builds a data frame of (game_numbers, similarities) ordered from most to
# least similar.
# NOTE(review): the definition is cut off in this listing -- no return
# statement or closing brace is visible.
cosine_similarity_vec <- function(row_index, df){
  row <- df[row_index,]   # the reference row
  mat <- df[-row_index,]  # every remaining row
  # Dot product of each remaining row with the reference row: sweep()
  # multiplies each column of `mat` by the matching element of `row`, and
  # rowSums() adds the products up per row.
  numerator <- rowSums(sweep(mat, MARGIN=2, row, "*"))
  # Product of the two Euclidean norms (`**` is R's alternative spelling of `^`).
  denominator <- sqrt(sum(row**2)) * sqrt(rowSums(mat**2))
  similarities <- numerator/denominator
  # Original row numbers of the rows kept in `mat`, so each similarity can be
  # traced back to its row in `df`.
  game_numbers <- 1:dim(df)[1]
  game_numbers <- game_numbers[! game_numbers %in% row_index]
  df_similarity <- data.frame(game_numbers, similarities)
  # `%>%`, arrange() and desc() are not defined in this snippet -- presumably
  # dplyr is attached by the calling script.
  df_similarity <- df_similarity %>% arrange(desc(similarities))

## remove_leading.sh
# Strip the first N characters from the name of every matching file in a directory.
# Replace each <placeholder> with a real value before running. ${file:N} is bash
# substring expansion: the file name starting at character offset N.
# `--` ends option parsing so a file name beginning with '-' is not read as an mv flag.
cd <path_to_directory_containing_files> && for file in *<file_type>; do mv -- "$file" "${file:<number_of_leading_characters_to_remove>}"; done

## unique_pairs.py
# Generated as example for Springboard mentees
import pandas as pd
# Example frame built in a single call; 'extracolumn' is deliberately irrelevant
# to the result.
df = pd.DataFrame({
    'code': ['1', '1', '2', '3', '3', '3', '3', '4', '4'],
    'country': ['usa', '', 'france', 'japan', 'japan', '', 'japan', 'brazil', 'brazil'],
    'extracolumn': ['i', 'do', 'not', 'need', 'the', 'stuff', 'in', 'this', 'column'],
})
# Keep one row per distinct (code, country) pair, then discard the pairs whose
# country is the empty string.
new_df = (
    df.loc[:, ['code', 'country']]
    .drop_duplicates()
    .loc[lambda pairs: pairs['country'].ne('')]
)
new_df

## examine_coef.py
# Grabbing the preprocessor step from the fitted pipeline.
# NOTE(review): `fit_model` and `categorical_features` are defined outside this
# snippet; presumably a fitted scikit-learn Pipeline and the list of categorical
# column names it was built with -- confirm.
pre = fit_model.named_steps['preprocessor']

# Getting the numerical and categorical features from the pipeline:
# element [2] of the first transformer entry, and the 'onehot' step of the
# second entry's estimator.
num_feats = pre.transformers_[0][2]
onehot_step = pre.transformers_[1][1]['onehot']
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older versions.
if hasattr(onehot_step, 'get_feature_names_out'):
    cat_feats = onehot_step.get_feature_names_out(categorical_features)
else:
    cat_feats = onehot_step.get_feature_names(categorical_features)
all_feats = num_feats+list(cat_feats)

# Dataframe for visual examination of coefficients
	cat mtcars.csv


	"","mpg","cyl","disp","hp","drat","wt","qsec","vs","am","gear","carb"
	"Mazda RX4",21,6,160,110,3.9,2.62,16.46,0,1,4,4
	"Mazda RX4 Wag",21,6,160,110,3.9,2.875,17.02,0,1,4,4
	"Datsun 710",22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
	"Hornet 4 Drive",21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
	"Hornet Sportabout",18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
	"Valiant",18.1,6,225,105,2.76,3.46,20.22,1,0,3,1
	#!/usr/bin/env python
	# encoding: utf-8

	import tweepy #https://github.com/tweepy/tweepy
	import csv

	#Twitter API credentials
	consumer_key = ""
	consumer_secret = ""
	access_key = ""
	#doc2vec parameters
	vector_size = 300
	window_size = 15
	min_count = 1
	sampling_threshold = 1e-5
	negative_size = 5
	train_epoch = 100
	dm = 0 #0 = dbow; 1 = dmpv
	worker_count = 1 #number of parallel processes
	cosine_similarity_vec <- function(row_index, df){
	row <- df[row_index,]
	mat <- df[-row_index,]
	numerator <- rowSums(sweep(mat, MARGIN=2, row, "*"))
	denominator <- sqrt(sum(row**2)) * sqrt(rowSums(mat**2))
	similarities <- numerator/denominator
	game_numbers <- 1:dim(df)[1]
	game_numbers <- game_numbers[! game_numbers %in% row_index]
	df_similarity <- data.frame(game_numbers, similarities)
	df_similarity <- df_similarity %>% arrange(desc(similarities))
	# Generated as example for Springboard mentees
	import pandas as pd
	df = pd.DataFrame()
	df['code'] = ['1', '1', '2', '3', '3', '3', '3', '4', '4']
	df['country'] = ['usa', '', 'france', 'japan', 'japan', '', 'japan', 'brazil', 'brazil']
	df['extracolumn'] = ['i', 'do', 'not', 'need', 'the', 'stuff', 'in', 'this', 'column']
	new_df = df[['code', 'country']].drop_duplicates()
	new_df = new_df[new_df['country'] != '']
	new_df
	# Grabbing the preprocessor
	pre = fit_model.named_steps['preprocessor']

	# Getting the numerical and categorical features from the pipeline
	num_feats = pre.transformers_[0][2]
	cat_feats = pre.transformers_[1][1]['onehot']\
	.get_feature_names(categorical_features)
	all_feats = num_feats+list(cat_feats)

	# Dataframe for visual examination of coefficients