Harshit Tyagi harshitcodes

## data_extractor.sql
-- (Re)build the working table from the raw sessions table, keeping only the
-- rows whose `date` column equals the string '20170101'.
CREATE OR REPLACE TABLE ecommerce_analytics.subset_user_sessions
OPTIONS (
  description = "Obfuscated data from Google merchandise store"
)
AS
SELECT
  *
FROM
  `next-marketing-analytics.ecommerce.all_sessions_raw`
WHERE
  date = '20170101';

## nerualnet_class.py
class NeuralNet:
    """Hold the data, two random weight matrices and an output buffer.

    Only construction is defined here: the instance keeps references to the
    inputs and targets, draws the initial weights, and allocates a zeroed
    output array.
    """

    def __init__(self, x, y, hidden_size=4):
        """Store the data and initialise the weights.

        Args:
            x: NumPy array with at least two dimensions; ``x.shape[1]`` sets
                the number of rows of ``weights1``.
            y: NumPy array of targets; ``output`` is allocated with the
                same shape.
            hidden_size: Number of columns of ``weights1`` and rows of
                ``weights2``. Defaults to 4, the previously hard-coded width.
        """
        self.input = x
        self.y = y
        # weights1 is drawn before weights2, as before, so a seeded run with
        # the default hidden_size reproduces the same initial values.
        self.weights1 = np.random.rand(self.input.shape[1], hidden_size)
        self.weights2 = np.random.rand(hidden_size, 1)
        self.output = np.zeros(self.y.shape)

## insert_rows.py
# One pair of random integers in [0, 10000) per row: shape (rows, 2).
ran_int = np.random.randint(0, 10000, size=(rows, 2))

# Fill the table row by row through the row pointer.
for i in range(rows):
  # Column 0 goes to Num1, column 1 to Num2.
  pointer['Num1'], pointer['Num2'] = ran_int[i]
  # append() stores the current values and advances the pointer by one row.
  pointer.append()
tab.flush()

## pytable.py
# Location of the HDF5 file; it is opened in 'w' (write) mode.
filename = '<path to your folder>' + 'tab.h5'
h5 = tb.open_file(filename, 'w')

# Number of rows intended for the table: two million.
rows = 2 * 10 ** 6
# Column layout: two integer columns, Num1 positioned before Num2.
table_model = dict(
    Num1=tb.IntCol(pos=1),
    Num2=tb.IntCol(pos=2),
)
# complevel=0 means no compression.
filters = tb.Filters(complevel=0)

## pickle.py
import numpy as np
from random import gauss
path = '<enter the path where you want to write the file>'

a1 = [gauss(1.5, 2) for i in range(1000000)]
import pickle
pkl_file = open(path + "serialized_data.pkl", 'wb')

%time pickle.dump(a1, pkl_file)

## methods_1.py
def record_comp_time(func_list, data_list, rep=3, number=1):
    ''' Function to compare the performance of different functions.
    Args:
    func_list : list
      list with function names as strings
    data_list : list
      list with data set names as strings
    rep : int
        number of repetitions of the whole comparison
    number : int

## seaborn_visualisation.py
plt.subplots(figsize=(12, 10))

# Flatten the per-movie genre collections of the profitable movies into one list.
list1 = [genre for movie_genres in profit_data['genres'] for genre in movie_genres]

# Count each genre, keep the first ten counts (value_counts sorts in
# descending order by default), then order those ten ascending.
genre_count_series = (
    pd.Series(list1)
    .value_counts()[:10]
    .sort_values(ascending=True)
)
# output looks like

## genre.py
# Normalise the 'genres' column: trim leading/trailing square brackets, drop
# spaces and single quotes, then split each string on commas into a list.
movies_df['genres'] = (
    movies_df['genres']
    .str.strip('[]')
    .str.replace(' ', '')
    .str.replace("'", '')
    .str.split(',')
)
movies_df.head()

## profit_data.py
# Keep only the movies whose profit is at least $50M.
profit_data = movies_df.loc[movies_df['profit'] >= 50000000]

# Renumber the rows so the index runs from 1 to len(profit_data) rather than
# starting at 0.
profit_data.index = range(1, len(profit_data) + 1)

## answer_8.py
# Most profitable year from the given dataset.
# idxmax() returns the index label at which profits_year is largest;
# presumably profits_year is indexed by year -- confirm where it is built.

profits_year.idxmax()
	-- (Re)build the working table from the raw sessions table, keeping only the
	-- rows whose `date` column equals the string '20170101'.
	CREATE OR REPLACE TABLE ecommerce_analytics.subset_user_sessions
	OPTIONS (
	  description = "Obfuscated data from Google merchandise store"
	)
	AS
	SELECT
	  *
	FROM
	  `next-marketing-analytics.ecommerce.all_sessions_raw`
	WHERE
	  date = '20170101';
	class NeuralNet:
		"""Hold the data, two random weight matrices and an output buffer.

		Only construction is defined here: the instance keeps references to
		the inputs and targets, draws the initial weights, and allocates a
		zeroed output array.
		"""

		def __init__(self, x, y, hidden_size=4):
			"""Store the data and initialise the weights.

			Args:
				x: NumPy array with at least two dimensions; ``x.shape[1]``
					sets the number of rows of ``weights1``.
				y: NumPy array of targets; ``output`` is allocated with the
					same shape.
				hidden_size: Number of columns of ``weights1`` and rows of
					``weights2``. Defaults to 4, the previously hard-coded
					width.
			"""
			self.input = x
			self.y = y
			# weights1 is drawn before weights2, so a seeded run with the
			# default hidden_size reproduces the same initial values.
			self.weights1 = np.random.rand(self.input.shape[1], hidden_size)
			self.weights2 = np.random.rand(hidden_size, 1)
			self.output = np.zeros(self.y.shape)
	# One pair of random integers in [0, 10000) per row: shape (rows, 2).
	ran_int = np.random.randint(0, 10000, size=(rows, 2))

	# Fill the table row by row through the row pointer.
	for i in range(rows):
		# Column 0 goes to Num1, column 1 to Num2.
		pointer['Num1'] = ran_int[i, 0]
		pointer['Num2'] = ran_int[i, 1]
		# append() stores the current values and advances the pointer by
		# one row.
		pointer.append()
	# Flush once, after the loop has written every row.
	tab.flush()
	# Location of the HDF5 file; it is opened in 'w' (write) mode.
	filename = '<path to your folder>' + 'tab.h5'
	h5 = tb.open_file(filename, 'w')

	# Number of rows intended for the table: two million.
	rows = 2 * 10 ** 6
	# Column layout: two integer columns, Num1 positioned before Num2.
	table_model = dict(
		Num1=tb.IntCol(pos=1),
		Num2=tb.IntCol(pos=2),
	)
	# complevel=0 means no compression.
	filters = tb.Filters(complevel=0)
	import numpy as np
	from random import gauss
	path = '<enter the path where you want to write the file>'

	a1 = [gauss(1.5, 2) for i in range(1000000)]
	import pickle
	pkl_file = open(path + "serialized_data.pkl", 'wb')

	%time pickle.dump(a1, pkl_file)
	def record_comp_time(func_list, data_list, rep=3, number=1):
	''' Function to compare the performance of different functions.
	Args:
	func_list : list
	list with function names as strings
	data_list : list
	list with data set names as strings
	rep : int
	number of repetitions of the whole comparison
	number : int
	plt.subplots(figsize=(12, 10))

	# Flatten the per-movie genre collections of the profitable movies into
	# one list. A comprehension replaces the extend loop, whose body had lost
	# its indentation.
	list1 = [genre for movie_genres in profit_data['genres'] for genre in movie_genres]

	# Count each genre, keep the first ten counts (value_counts sorts in
	# descending order by default), then order those ten ascending.
	genre_count_series = (
		pd.Series(list1)
		.value_counts()[:10]
		.sort_values(ascending=True)
	)
	# output looks like
	# Normalise the 'genres' column: trim leading/trailing square brackets,
	# drop spaces and single quotes, then split each string on commas.
	movies_df['genres'] = (
		movies_df['genres']
		.str.strip('[]')
		.str.replace(' ', '')
		.str.replace("'", '')
		.str.split(',')
	)
	movies_df.head()
	# Keep only the movies whose profit is at least $50M.
	profit_data = movies_df.loc[movies_df['profit'] >= 50000000]

	# Renumber the rows so the index runs from 1 to len(profit_data) rather
	# than starting at 0.
	profit_data.index = range(1, len(profit_data) + 1)
	# Most profitable year from the given dataset.
	# idxmax() returns the index label at which profits_year is largest;
	# presumably profits_year is indexed by year -- confirm where it is built.

	profits_year.idxmax()