Srishti Saha srishtis

## remove_short_words.m
% This function removes words that have length shorter than thresh_len (in this snippet: 3)
% It also removes extra spaces

function words_new = remove_small_words(words)
    % REMOVE_SMALL_WORDS  Beginning of a short-word filter for a string.
    %   words     - input text; it is split on the space character below.
    %   words_new - output string, initialised to the empty string here.
    %
    % NOTE(review): this snippet is cut off inside the loop, so the step that
    % compares each token length with thresh_len and rebuilds words_new, as
    % well as the closing `end` statements, are not visible here.
    % NOTE(review): the header above names the file remove_short_words.m
    % while the function is remove_small_words - confirm which is intended.

    % Tokens of the input, separated on ' '.
    temp1 = strsplit(words, ' ');
    words_new='';
    for i = 1:numel(temp1)
        % Current token and its length in characters.
        k = temp1{i};
        a=length(k);
        % Length threshold; it is loop-invariant but reassigned on every pass.
        thresh_len= 3;

## hw3_solutions.m
%Q1: Write a function called circle that takes a scalar input r. It needs to return an output called
%area that is the area of a circle with radius r and a second output, cf that is the circumference of
%the same circle. You are allowed to use the built-in function pi. In fact, you need to use it to get
%the value of π as accurately as possible.

% function to compute the area and circumference of a circle
function [area, cf]= circle (r,pi)
    % CIRCLE  Area and circumference of a circle with radius r.
    %   [area, cf] = circle(r) uses the built-in value of pi.
    %   [area, cf] = circle(r, pi) uses the supplied value instead; this
    %   form is kept so existing two-argument calls still work.
    %
    %   area - pi*r^2
    %   cf   - 2*pi*r (also printed to the command window)

    % The second parameter shadows the built-in pi inside this function, so
    % when the caller omits it the built-in has to be requested explicitly.
    if nargin < 2
        pi = builtin('pi');
    end
    area= pi*r^2;
    cf=2*pi*r;
    fprintf('the circumference of the circle is %f\n',cf);

## matrix_element_power.m
%simple/ easy-to-understand solution
function total = matrix_pow(a)
  % MATRIX_POW  Sum of the elements of a, each raised to its linear index.
  %   total = matrix_pow(a) returns a(1)^1 + a(2)^2 + ... + a(n)^n with
  %   n = numel(a); elements are taken in linear (column-major) order.
  %   An empty input gives 0.
  %
  % Every statement ends with a semicolon so nothing is echoed to the
  % command window, and the result is returned through a named output
  % rather than through the implicit ans variable.
  exponents = 1:numel(a);
  % Flatten to a row so the element-wise power lines up with the exponents.
  total = sum(reshape(a, 1, []) .^ exponents);
end


## kickstarter_cat_conversion.py
# Walk over every column of kick_projects and look at its dtype.
for c in kick_projects.columns:
    #dtype of the current column
    col_type = kick_projects[c].dtype
    #only columns whose dtype compares equal to 'object' are handled below
    #(presumably the categorical/string columns - confirm against the data)
    if col_type == 'object' :
        #distinct values of this column
        a=kick_projects[c].unique()
        #integers 0 .. (number of distinct values - 1); not used in the visible part
        keys= range(a.shape[0])
        #start from an empty dictionary for this column
        diction={}
        # NOTE(review): the snippet is cut off here - the body of this loop is
        # not shown, so what is stored in diction cannot be told from this view.
        for idx,val in enumerate(a):

## kickstarter_feature_engineering.py

# Goal and duration percentile buckets, computed separately per category.

# Goal amount: quantile bands cut at the 35th and 70th percentiles of each
# category, labelled 1, 2 and 3.
goal_by_category = kick_projects.groupby(['category'])['goal']
kick_projects['goal_cat_perc'] = goal_by_category.transform(
    lambda goals: pd.qcut(goals, [0, .35, .70, 1.0], labels=[1, 2, 3])
)

# Duration: same quantile edges per category, but returning integer bin
# codes (labels=False) and dropping duplicate bin edges instead of raising.
duration_by_category = kick_projects.groupby(['category'])['duration']
kick_projects['duration_cat_perc'] = duration_by_category.transform(
    lambda durations: pd.qcut(
        durations, [0, .35, .70, 1.0], labels=False, duplicates='drop'
    )
)

## Kickstarter_LGBM_gain.py
class LGBMClassifier_GainFE(lgb.LGBMClassifier):
    """LGBMClassifier variant whose ``feature_importances_`` is gain-based."""

    @property
    def feature_importances_(self):
        """Return the booster's feature importances with ``importance_type='gain'``.

        Raises:
            LGBMNotFittedError: if ``self._n_features`` is ``None``.
        """
        has_features = self._n_features is not None
        if has_features:
            return self.booster_.feature_importance(importance_type='gain')
        raise LGBMNotFittedError('No feature_importances found. Need to call fit beforehand.')

## Kickstarter_name_feature_engg.py
# Features derived from the project name.
project_names = kick_projects['name']

# Number of characters in the name.
kick_projects['name_len'] = project_names.str.len()

# 1 when the last character of the name is '!', otherwise 0.
kick_projects['name_exclaim'] = project_names.str[-1].eq('!').astype(int)

# 1 when the last character of the name is '?', otherwise 0.
kick_projects['name_question'] = project_names.str[-1].eq('?').astype(int)

## Kickstarter_EDA.py
#################################################################################
############ EDA and basic tests on Kickstarter Data ############################
#################################################################################

# Summary of the kickstarter data.
# Dimensions of the data set: (rows, columns)
print(kickstarters_2017.shape)
# Columns and data types. DataFrame.info() writes its report itself and
# returns None, so it is called directly; wrapping it in print() only added
# a stray "None" line after the report.
kickstarters_2017.info()
#basic stats of columns

## correlation_house_price_pred.py
# Plot the correlation map to see how features are correlated with the target: SalePrice.
# Correlation is only defined for numeric data, so the numeric/bool columns
# are selected explicitly: recent pandas versions raise on non-numeric columns
# instead of silently dropping them, while older versions dropped them.
corr_matrix = train_df.select_dtypes(include=['number', 'bool']).corr()
plt.subplots(figsize=(12,9))
sns.heatmap(corr_matrix, vmax=0.9, square=True)

## scatter_house_price_pred.py
# Pairwise scatterplots for the selected columns (listed in cols).
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars','GarageArea' ,'TotalBsmtSF', 'FullBath', 'YearBuilt','TotRmsAbvGrd']
# `height` is the current name of the per-facet size argument; the former
# `size` keyword was renamed and is rejected by current seaborn releases.
sns.pairplot(train_df[cols], height=2.5)
plt.show()
	% This function removes words that have length shorter than thresh_len (in this snippet: 3)
	% It also removes extra spaces

	function words_new = remove_small_words(words)
	% REMOVE_SMALL_WORDS  Beginning of a short-word filter for a string.
	%   words     - input text; it is split on the space character below.
	%   words_new - output string, initialised to the empty string here.
	%
	% NOTE(review): this copy has lost its indentation and is cut off inside
	% the loop; the step that compares each token length with thresh_len and
	% the closing `end` statements are not visible here.

	% Tokens of the input, separated on ' '.
	temp1 = strsplit(words, ' ');
	words_new='';
	for i = 1:numel(temp1)
	% Current token and its length in characters.
	k = temp1{i};
	a=length(k);
	% Length threshold; it is loop-invariant but reassigned on every pass.
	thresh_len= 3;
	%Q1: Write a function called circle that takes a scalar input r. It needs to return an output called
	%area that is the area of a circle with radius r and a second output, cf that is the circumference of
	%the same circle. You are allowed to use the built-in function pi. In fact, you need to use it to get
	%the value of π as accurately as possible.

	% function to compute the area and circumference of a circle
	function [area, cf]= circle (r,pi)
		% CIRCLE  Area and circumference of a circle with radius r.
		%   [area, cf] = circle(r) uses the built-in value of pi.
		%   [area, cf] = circle(r, pi) uses the supplied value instead; this
		%   form is kept so existing two-argument calls still work.
		%
		%   area - pi*r^2
		%   cf   - 2*pi*r (also printed to the command window)

		% The second parameter shadows the built-in pi inside this function,
		% so when the caller omits it the built-in is requested explicitly.
		if nargin < 2
			pi = builtin('pi');
		end
		area= pi*r^2;
		% The multiplication operators were missing here ("2pir"), which is
		% not valid MATLAB.
		cf=2*pi*r;
		fprintf('the circumference of the circle is %f\n',cf);
	%simple/ easy-to-understand solution
	function total = matrix_pow(a)
		% MATRIX_POW  Sum of the elements of a, each raised to its linear index.
		%   total = matrix_pow(a) returns a(1)^1 + a(2)^2 + ... + a(n)^n with
		%   n = numel(a); elements are taken in linear (column-major) order.
		%   An empty input gives 0.
		%
		% Every statement ends with a semicolon so nothing is echoed to the
		% command window, and the result is returned through a named output
		% rather than through the implicit ans variable.
		exponents = 1:numel(a);
		% Flatten to a row so the element-wise power lines up with the exponents.
		total = sum(reshape(a, 1, []) .^ exponents);
	end
	# NOTE(review): this copy has lost its nesting (every line sits at the same
	# indent) and is cut off at the inner loop, so it is not runnable as shown.
	# Walk over every column of kick_projects and look at its dtype.
	for c in kick_projects.columns:
	#dtype of the current column
	col_type = kick_projects[c].dtype
	#only columns whose dtype compares equal to 'object' are handled below
	if col_type == 'object' :
	#distinct values of this column
	a=kick_projects[c].unique()
	#integers 0 .. (number of distinct values - 1); not used in the visible part
	keys= range(a.shape[0])
	#start from an empty dictionary for this column
	diction={}
	# the body of the following loop is not shown in this snippet
	for idx,val in enumerate(a):

	# Goal and duration percentile buckets, computed separately per category.

	# Goal amount: quantile bands cut at the 35th and 70th percentiles of each
	# category, labelled 1, 2 and 3.
	goal_by_category = kick_projects.groupby(['category'])['goal']
	kick_projects['goal_cat_perc'] = goal_by_category.transform(
		lambda goals: pd.qcut(goals, [0, .35, .70, 1.0], labels=[1, 2, 3])
	)

	# Duration: same quantile edges per category, but returning integer bin
	# codes (labels=False) and dropping duplicate bin edges instead of raising.
	duration_by_category = kick_projects.groupby(['category'])['duration']
	kick_projects['duration_cat_perc'] = duration_by_category.transform(
		lambda durations: pd.qcut(
			durations, [0, .35, .70, 1.0], labels=False, duplicates='drop'
		)
	)
	class LGBMClassifier_GainFE(lgb.LGBMClassifier):
		"""LGBMClassifier variant whose ``feature_importances_`` is gain-based."""

		@property
		def feature_importances_(self):
			"""Return the booster's feature importances with ``importance_type='gain'``.

			Raises:
				LGBMNotFittedError: if ``self._n_features`` is ``None``.
			"""
			has_features = self._n_features is not None
			if has_features:
				return self.booster_.feature_importance(importance_type='gain')
			raise LGBMNotFittedError('No feature_importances found. Need to call fit beforehand.')
	# Features derived from the project name.
	project_names = kick_projects['name']

	# Number of characters in the name.
	kick_projects['name_len'] = project_names.str.len()

	# 1 when the last character of the name is '!', otherwise 0.
	kick_projects['name_exclaim'] = project_names.str[-1].eq('!').astype(int)

	# 1 when the last character of the name is '?', otherwise 0.
	kick_projects['name_question'] = project_names.str[-1].eq('?').astype(int)
	#################################################################################
	############ EDA and basic tests on Kickstarter Data ############################
	#################################################################################

	# Summary of the kickstarter data.
	# Dimensions of the data set: (rows, columns)
	print(kickstarters_2017.shape)
	# Columns and data types. DataFrame.info() writes its report itself and
	# returns None, so it is called directly; wrapping it in print() only added
	# a stray "None" line after the report.
	kickstarters_2017.info()
	#basic stats of columns
	# Plot the correlation map to see how features are correlated with the target: SalePrice.
	# Correlation is only defined for numeric data, so the numeric/bool columns
	# are selected explicitly: recent pandas versions raise on non-numeric columns
	# instead of silently dropping them, while older versions dropped them.
	corr_matrix = train_df.select_dtypes(include=['number', 'bool']).corr()
	plt.subplots(figsize=(12,9))
	sns.heatmap(corr_matrix, vmax=0.9, square=True)
	# Pairwise scatterplots for the selected columns (listed in cols).
	sns.set()
	cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars','GarageArea' ,'TotalBsmtSF', 'FullBath', 'YearBuilt','TotRmsAbvGrd']
	# `height` is the current name of the per-facet size argument; the former
	# `size` keyword was renamed and is rejected by current seaborn releases.
	sns.pairplot(train_df[cols], height=2.5)
	plt.show()