This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def extract_year_from_title(title): | |
# function to find the year in the given list | |
# if not found assigns zero as year | |
# ASSUMPTION: There is no NA values | |
# in the title feature | |
int_list = [] | |
now = datetime.datetime.now() | |
for item in title: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def extract_features_from_description(df, | |
column_name, | |
new_feature_name, | |
extract_words): | |
# function to extract features from the column_name | |
# searches column_name feature for a given list | |
# ASSUMPTION: There is no NA values | |
# in the description feature | |
check_regex = (r'\b(?:{})\b' | |
.format('|' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def plot_distribution(df, target, column_values, column_name): | |
# function to print distribution of a continuous variable | |
# for categorical data | |
for value in column_values: | |
subset = df[df[column_name] == value] | |
g = sns.kdeplot(subset[target], | |
label=value, | |
linewidth=3) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def plot_histogram(df, column, b=None): | |
# function to print histogram | |
# with mean and median | |
# using distplot | |
# set the histogram, mean and median | |
g = sns.distplot(df[column], kde=False, bins=b) | |
plt.axvline(x=df[column].mean(), | |
linewidth=3, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def CleanData(df, drop_columns, target_name): | |
# this function drops not used features | |
# and duplicate rows | |
# and empty rows of target (points) | |
# returns cleaned df | |
interim_df = df.drop(columns=drop_columns) | |
interim_df_2 = (interim_df | |
.drop_duplicates(ignore_index=True)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def missing_values_table(df): | |
# Total missing values | |
mis_val = df.isnull().sum() | |
# Percentage of missing values | |
mis_val_percent = 100 * df.isnull().sum() / len(df) | |
# Make a table with the results | |
mis_val_table = pd.concat([mis_val, mis_val_percent], | |
axis=1) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# data manipulation libraries | |
import pandas as pd | |
import numpy as np | |
pd.set_option('display.max_columns', 60) | |
# data visualization libraries | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
from IPython.core.pylabtools import figsize |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# source https://towardsdatascience.com/debugging-jupyter-notebooks-will-boost-your-productivity-a33387f4fa62 | |
import contextlib | |
with contextlib.redirect_stdout(None): | |
import pixiedust | |
# New Cell | |
def find_max (values): | |
max = 0 | |
import pdb; pdb.set_trace() | |
for val in values: | |
if val > max: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
np.random.seed(42) | |
import plotly.offline as pyo | |
import plotly.graph_objs as go | |
# create data | |
x = np.random.randint(1, 101, 100) | |
y = np.random.randint(1, 101, 100) | |
# data object to be used in figure object |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def encode_column(column):
    """Binary-encode a numeric value by its sign.

    Args:
        column: A single scalar number. Presumably one cell of a DataFrame
            column, applied element-wise -- confirm against the caller.

    Returns:
        1 if the value is greater than zero, 0 if it is zero or negative,
        and None when neither comparison holds (for example NaN).
    """
    if column > 0:
        return 1
    if column <= 0:
        return 0
    # Neither comparison was true (e.g. NaN). The original fell off the end
    # of the function here; keep that result but state it explicitly.
    return None
def aggregate_by_ordered_quantity(dataframe, column_list): | |
'''this function: | |
1. aggregates a given dataframe by column list, |