Skip to content

Instantly share code, notes, and snippets.

View harshitcodes's full-sized avatar

Harshit Tyagi harshitcodes

View GitHub Profile
-- Build (or rebuild) ecommerce_analytics.subset_user_sessions as a copy of
-- every column of the raw sessions table, restricted to a single date value.
CREATE OR REPLACE TABLE ecommerce_analytics.subset_user_sessions
OPTIONS (
  description = "Obfuscated data from Google merchandise store"
) AS
SELECT
  *
FROM
  `next-marketing-analytics.ecommerce.all_sessions_raw`
WHERE
  -- date is compared against a string literal, not a DATE value
  date = '20170101';
class NeuralNet:
    """Container for the inputs, targets and randomly initialised weights.

    Only the constructor is visible in this excerpt.
    """

    def __init__(self, x, y):
        """Store the data and create the weight and output arrays.

        Args:
            x: 2-D array-like exposing ``.shape``; its second dimension
                sets the number of rows of ``weights1``.
            y: array-like exposing ``.shape``; ``output`` is allocated with
                the same shape.
        """
        self.input = x
        self.y = y
        # Uniform random values in [0, 1).
        # NOTE(review): presumably input->hidden weights with 4 hidden
        # units and hidden->output weights with 1 output unit; confirm.
        self.weights1 = np.random.rand(self.input.shape[1], 4)
        self.weights2 = np.random.rand(4, 1)
        # Zero-filled placeholder matching the shape of the targets.
        self.output = np.zeros(y.shape)
# Fill the table with pairs of random integers, one row at a time.
# `rows`, `pointer` and `tab` are not defined in this snippet and must
# already exist when it runs.
# ran_int holds `rows` pairs of integers drawn from [0, 10000).
ran_int = np.random.randint(0, 10000, size=(rows, 2))
# rows to be written one by one
for i in range(rows):
    pointer['Num1'] = ran_int[i, 0]
    pointer['Num2'] = ran_int[i, 1]
    # this appends the data and
    # moves the pointer one row forward
    pointer.append()
# Flush once, after every row has been appended.
# NOTE(review): presumably `tab` is the PyTables table that `pointer`
# belongs to, and flush() writes the buffered rows to the file; confirm.
tab.flush()
# Settings for the table that will be created in the HDF5 file.
# creating 2million rows for the database
rows = 2 * 10**6
# Two integer columns; `pos` fixes their order within a row.
table_model = dict(
    Num1=tb.IntCol(pos=1),
    Num2=tb.IntCol(pos=2),
)
# complevel=0 -> no compression
filters = tb.Filters(complevel=0)

# Open the target file in write mode.
# The file name is appended directly to the folder placeholder.
filename = '<path to your folder>' + 'tab.h5'
h5 = tb.open_file(filename, 'w')
import numpy as np
from random import gauss
# Folder the pickle file is written to. The file name is appended to it
# directly, so the value needs to end with a path separator.
path = '<enter the path where you want to write the file>'
# One million floats drawn from a normal distribution (mu=1.5, sigma=2).
a1 = [gauss(1.5, 2) for i in range(1000000)]
import pickle
# Opened in binary write mode, which pickle.dump requires.
# NOTE(review): no close() is visible in this excerpt; confirm the handle
# is closed further down.
pkl_file = open(path + "serialized_data.pkl", 'wb')
# %time is an IPython magic, so this line only runs under IPython/Jupyter;
# it reports how long serialising a1 takes.
%time pickle.dump(a1, pkl_file)
def record_comp_time(func_list, data_list, rep=3, number=1):
''' Function to compare the performance of different functions.
Args:
func_list : list
list with function names as strings
data_list : list
list with data set names as strings
rep : int
number of repetitions of the whole comparison
number : int
# Count how often each genre occurs among the profitable movies.
# `plt`, `pd` and `profit_data` are not defined in this snippet and must
# already exist when it runs (plt is presumably matplotlib.pyplot).
# Create a figure with figsize (12, 10).
plt.subplots(figsize=(12,10))
list1=[]
# extending the list of genres to collect all the genres of all the profitable movies
# (each entry of profit_data['genres'] has to be an iterable of genres)
for i in profit_data['genres']:
    list1.extend(i)
# value_counts() orders by frequency, most common first, so [:10] keeps the
# ten most common genres; they are then re-sorted in ascending order.
genre_count_series = pd.Series(list1).value_counts()[:10].sort_values(ascending=True)
# output looks like
# Turn the text in the 'genres' column into a list of genre names:
# drop the enclosing square brackets, then remove every space and
# single quote, and finally split on commas.
genres_text = movies_df['genres'].str.strip('[]')
genres_text = genres_text.str.replace(' ','').str.replace("'",'')
movies_df['genres'] = genres_text.str.split(',')
# Show the first rows to check the result.
movies_df.head()
# keep only the movies whose profit is $50M or more
is_profitable = movies_df['profit'] >= 50000000
profit_data = movies_df[is_profitable]
# renumber the remaining rows so the index runs 1..N instead of 0..N-1
profit_data.index = range(1, len(profit_data) + 1)
# Most profitable year from the given dataset.
# idxmax() returns the index label at which profits_year is largest.
# NOTE(review): profits_year is defined outside this excerpt; presumably it
# is indexed by year. Confirm.
profits_year.idxmax()