Timo Böhm timo-boehm

## load_ext_gist.py
# Load the autoreload extension. This is an IPython line magic, so the cell
# has to run in IPython/Jupyter rather than in plain Python.
%load_ext autoreload
# Mode 2: set the extension to reload modules every time before executing code
%autoreload 2

from helperfunctions import complicated_function_to_return_a_number

# Call the imported helper; the value below is the output shown for this cell.
complicated_function_to_return_a_number()
# Output: 123

## matplotlib_gist.py
from numpy.random import randint
import matplotlib.pyplot as plt

# Sample 100 random integers per axis to create a scatterplot. `high` is
# exclusive in randint, so the values fall in the range 1..999.
x = randint(low=1, high=1000, size=100)
y = randint(low=1, high=1000, size=100)

# This will show nothing in a Jupyter Notebook
# NOTE(review): presumably this only holds while no inline plotting backend
# is active (e.g. before `%matplotlib inline` has been run) — confirm.
plt.scatter(x, y)
plt.show()

## system_gist.py
# Easy to read version: the `%system` line magic runs the shell command
# `date` from within the notebook
%system date

# Shorthand with "!!" instead of "%system" works equally well
!!date

## time_gist.py
import numpy as np
from numpy.random import randint

# Simulate one million throws of a six-sided die.
def one_million_dice():
    """Return an array holding 1,000,000 random integers from 1 to 6."""
    # randint excludes the upper bound, so 7 is needed to include 6.
    return randint(1, 7, size=1_000_000)

# Let's try %time first: as a line magic it times the single statement that
# follows it on the same line, so each of the two steps is timed separately.
%time throws = one_million_dice()
%time mean = np.mean(throws)

## who_ls_gist.py
# Outputs a list of the names of all interactive variables in your environment
%who_ls

# Passing a type name as argument reduces the output to interactive
# variables of that type, here "function"
%who_ls function

## read_movie_data.py
import pandas as pd

# Read only the four required columns, index the rows by "imdbId", then drop
# rows with missing values and every row whose "Poster" value occurs more
# than once (keep=False removes all copies, not just the repeats).
df = (
    pd.read_csv(
        path,
        encoding="ISO-8859-1",
        usecols=["imdbId", "Title", "Genre", "Poster"],
    )
    .set_index(["imdbId"])
    .dropna()
    .drop_duplicates(subset="Poster", keep=False)
)

## extract_year.py
import re

# Matches a four-digit number in parentheses, e.g. "(1995)", capturing the
# digits. Written as a raw string so the backslashes reach the regex engine
# without relying on invalid (deprecated) string escapes.
re_year = re.compile(r"\((\d{4})\)")


def _extract_year(title):
    """Return the first parenthesised four-digit year in *title* as an int.

    Returns None when the title contains no such year.
    """
    # One search per title instead of running findall twice.
    match = re_year.search(title)
    return int(match.group(1)) if match else None


df["year"] = df.Title.map(_extract_year)

## extract_genres.py
# Turn each pipe-separated Genre string into a list of genre names.
df["Genre"] = df.Genre.map(lambda genres: genres.split("|"))
# Every distinct genre name that occurs in any row.
all_genres = {name for genres in df.Genre for name in genres}
# Iterate in sorted order so the indicator columns are added in the same
# order on every run; plain set iteration order can differ between runs.
for genre in sorted(all_genres):
    # Column name: "is_" + lower-cased genre with non-word characters removed.
    new_var = "is_" + re.sub(r'\W+', '', genre.lower())
    # True for the rows whose genre list contains this genre.
    df[new_var] = df.Genre.map(lambda genres: genre in genres)
# The list column is dropped once the per-genre indicator columns exist.
df.drop(["Genre"], axis=1, inplace=True)

## distribution_decades.py
def _decade_label(year):
    """Build a label such as "90s" from the third digit of the integer year."""
    return f"{str(int(year))[2]}0s"


# Label every movie with its decade, then report how many fall into each one.
df_range["decade"] = df_range.year.apply(_decade_label)
print(f"Movies per decade in the dataset:\n{df_range.decade.value_counts()}\n")

# Outputs:
# 10s    1381
# 00s    1327
# 90s     855
# 80s     582
# 70s     469

## sample_decades.py
# Size of the smallest decade group.
min_number = df.decade.value_counts().min()
# Draw that many random rows from each decade so all decades are equally
# represented in the sample.
df_sample = df.groupby("decade").apply(
    lambda decade_rows: decade_rows.sample(min_number)
)
	# Load the autoreload extension. This is an IPython line magic, so the cell
	# has to run in IPython/Jupyter rather than in plain Python.
	%load_ext autoreload
	# Mode 2: set the extension to reload modules every time before executing code
	%autoreload 2

	from helperfunctions import complicated_function_to_return_a_number

	# Call the imported helper; the value below is the output shown for this cell.
	complicated_function_to_return_a_number()
	# Output: 123
	from numpy.random import randint
	import matplotlib.pyplot as plt

	# Sample 100 random integers per axis to create a scatterplot. `high` is
	# exclusive in randint, so the values fall in the range 1..999.
	x = randint(low=1, high=1000, size=100)
	y = randint(low=1, high=1000, size=100)

	# This will show nothing in a Jupyter Notebook
	# NOTE(review): presumably this only holds while no inline plotting backend
	# is active (e.g. before `%matplotlib inline` has been run) — confirm.
	plt.scatter(x, y)
	plt.show()
	# Easy to read version: the `%system` line magic runs the shell command
	# `date` from within the notebook
	%system date

	# Shorthand with "!!" instead of "%system" works equally well
	!!date
	import numpy as np
	from numpy.random import randint

	# Simulate one million throws of a six-sided die.
	def one_million_dice():
		"""Return an array holding 1,000,000 random integers from 1 to 6."""
		# The body must be indented under the def to be valid Python.
		# randint excludes `high`, so 7 is needed to include 6.
		return randint(low=1, high=7, size=1000000)

	# Let's try %time first: as a line magic it times the single statement that
	# follows it on the same line, so each of the two steps is timed separately.
	%time throws = one_million_dice()
	%time mean = np.mean(throws)
	# Outputs a list of the names of all interactive variables in your environment
	%who_ls

	# Passing a type name as argument reduces the output to interactive
	# variables of that type, here "function"
	%who_ls function
	import pandas as pd

	# Read only the four required columns, index the rows by "imdbId", then drop
	# rows with missing values and every row whose "Poster" value occurs more
	# than once (keep=False removes all copies, not just the repeats).
	df = (
		pd.read_csv(
			path,
			encoding="ISO-8859-1",
			usecols=["imdbId", "Title", "Genre", "Poster"],
		)
		.set_index(["imdbId"])
		.dropna()
		.drop_duplicates(subset="Poster", keep=False)
	)
	import re

	# Matches a four-digit number in parentheses, e.g. "(1995)", capturing the
	# digits. Written as a raw string so the backslashes reach the regex engine
	# without relying on invalid (deprecated) string escapes.
	re_year = re.compile(r"\((\d{4})\)")


	def _extract_year(title):
		"""Return the first parenthesised four-digit year in *title* as an int.

		Returns None when the title contains no such year.
		"""
		# One search per title instead of running findall twice.
		match = re_year.search(title)
		return int(match.group(1)) if match else None


	df["year"] = df.Title.map(_extract_year)
	# Turn each pipe-separated Genre string into a list of genre names.
	# str.split takes a literal separator (not a regex), so the pipe must not
	# be backslash-escaped: "\|" would only split on a backslash plus a pipe.
	df["Genre"] = df.Genre.map(lambda genres: genres.split("|"))
	# Every distinct genre name that occurs in any row.
	all_genres = {name for genres in df.Genre for name in genres}
	# Iterate in sorted order so the indicator columns are added in the same
	# order on every run; plain set iteration order can differ between runs.
	for genre in sorted(all_genres):
		# Column name: "is_" + lower-cased genre with non-word characters removed.
		new_var = "is_" + re.sub(r'\W+', '', genre.lower())
		# True for the rows whose genre list contains this genre.
		df[new_var] = df.Genre.map(lambda genres: genre in genres)
	# The list column is dropped once the per-genre indicator columns exist.
	df.drop(["Genre"], axis=1, inplace=True)
	# Label every movie with its decade, built from the third digit of the
	# integer year (e.g. 1994 -> "90s"), then report the count per decade.
	df_range["decade"] = df_range.year.apply(
		lambda year: f"{str(int(year))[2]}0s"
	)
	print(f"Movies per decade in the dataset:\n{df_range.decade.value_counts()}\n")

	# Outputs:
	# 10s 1381
	# 00s 1327
	# 90s 855
	# 80s 582
	# 70s 469
	# Size of the smallest decade group.
	min_number = df.decade.value_counts().min()
	# Draw that many random rows from each decade so all decades are equally
	# represented in the sample.
	df_sample = df.groupby("decade").apply(
		lambda decade_rows: decade_rows.sample(min_number)
	)