Skip to content

Instantly share code, notes, and snippets.

View timo-boehm's full-sized avatar

Timo Böhm timo-boehm

  • Cologne, Germany
View GitHub Profile
@timo-boehm
timo-boehm / load_ext_gist.py
Created August 17, 2018 09:52
load_ext_gist
# NOTE: lines starting with "%" are IPython magic commands; they are only
# valid inside IPython/Jupyter, not in a plain Python script.
# Load the autoreload extension
%load_ext autoreload
# Mode 2: reload modules every time before executing code, so the import
# below always reflects the current state of the module on disk
%autoreload 2
from helperfunctions import complicated_function_to_return_a_number
complicated_function_to_return_a_number()
# Output: 123
from numpy.random import randint
import matplotlib.pyplot as plt
# Sample 100 random integers per axis to create a scatterplot
# (each value lies in [1, 1000): `high` is exclusive in numpy's randint)
x = randint(low=1, high=1000, size=100)
y = randint(low=1, high=1000, size=100)
# Original note: this will show nothing in a Jupyter Notebook.
# NOTE(review): presumably this refers to a notebook without an inline
# plotting backend enabled (e.g. via `%matplotlib inline`) - confirm.
plt.scatter(x, y)
plt.show()
# Run the shell command `date` from within IPython/Jupyter.
# Easy to read version, using the %system magic
%system date
# Shorthand with "!!" instead of "%system" works equally well
!!date
import numpy as np
from numpy.random import randint
# A function to simulate one million dice throws.
def one_million_dice():
    """Simulate one million throws of a six-sided die.

    Returns:
        A NumPy array of 1,000,000 random integers, each between 1 and 6
        inclusive (``high=7`` is exclusive in ``randint``).
    """
    return randint(low=1, high=7, size=1000000)
# Let's try %time first: the IPython magic times a single statement.
# Time the simulation of the throws ...
%time throws = one_million_dice()
# ... and, separately, the computation of their mean.
%time mean = np.mean(throws)
# %who_ls is an IPython magic (only valid inside IPython/Jupyter).
# Outputs a list of all interactive variables in your environment
%who_ls
# Reduces the output to interactive variables of type "function"
%who_ls function
import pandas as pd
# Load only the columns that are used later, index the rows by "imdbId",
# discard rows with missing values, and remove every row whose "Poster"
# value occurs more than once (keep=False drops all copies, not just the
# repeats).
# NOTE(review): `path` must be defined before this snippet runs.
df = (
    pd.read_csv(
        path,
        encoding="ISO-8859-1",
        usecols=["imdbId", "Title", "Genre", "Poster"],
    )
    .set_index(["imdbId"])
    .dropna()
    .drop_duplicates(subset="Poster", keep=False)
)
import re
# Matches a four-digit year in parentheses, e.g. "(1995)", capturing the
# digits. Written as a raw string so that "\(" and "\d" reach the regex
# engine instead of being parsed as (invalid) string escape sequences.
re_year = re.compile(r"\((\d{4})\)")


def _extract_year(title):
    """Return the first parenthesised four-digit year in *title* as an int.

    Returns None when the title contains no such year.
    """
    match = re_year.search(title)
    return int(match.group(1)) if match else None


# Year of each movie, parsed from its title (None where no year is found).
df["year"] = df.Title.map(_extract_year)

# Turn the pipe-separated genre string into a list of genre names.
df["Genre"] = df.Genre.map(lambda x: x.split("|"))

# Every distinct genre that occurs anywhere in the dataset.
all_genres = {item for genres in df.Genre for item in genres}

# Add one boolean indicator column per genre, named "is_<genre>" with the
# genre lower-cased and stripped of non-word characters.
for genre in all_genres:
    new_var = "is_" + re.sub(r'\W+', '', genre.lower())
    # Bind `genre` as a default argument so the lambda does not depend on
    # the loop variable's later values.
    df[new_var] = df.Genre.map(lambda genres, genre=genre: genre in genres)

# The list-valued column is no longer needed once the indicators exist.
df.drop(["Genre"], axis=1, inplace=True)
# Label every movie with its decade, built from the third character of the
# integer year (e.g. 1995 -> "90s"), then report how many movies fall into
# each decade.
# NOTE(review): `df_range` is not defined in this snippet - confirm where it
# is derived from.
df_range["decade"] = df_range["year"].apply(
    lambda year: str(int(year))[2] + "0s"
)
decade_counts = df_range["decade"].value_counts()
print(f"Movies per decade in the dataset:\n{decade_counts}\n")
# Outputs:
# 10s 1381
# 00s 1327
# 90s 855
# 80s 582
# 70s 469
# Size of the smallest decade group; drawing that many rows from every
# decade produces a sample with equally sized groups.
# NOTE(review): this reads `df.decade` - confirm that `df` is the frame that
# actually carries the "decade" column.
min_number = df["decade"].value_counts().min()
df_sample = df.groupby("decade").apply(
    lambda group: group.sample(n=min_number)
)