Download the HTML source of the IMDb list page and save it as data.html, then run parse_imdb.py; this generates the CSV file (all_seasons.csv).
Then you can run plot.py.
Remember to use Python 3.
actor;season_1;season_2;season_3;season_4;season_5;season_6 | |
Tyrion Lannister;52:45;65:0;50:0;47:45;44:0;34:0 | |
Jon Snow;54:45;33:45;36:0;42:30;49:15;52:0 | |
Daenerys Targaryen;54:0;38:45;30:45;27:30;41:45;28:45 | |
Cersei Lannister;28:0;36:15;22:0;37:15;49:45;28:30 | |
Sansa Stark;24:0;34:0;33:30;32:45;34:45;40:30 | |
Arya Stark;33:30;38:45;26:30;25:45;34:45;30:0 | |
Jaime Lannister;21:0;18:15;34:45;36:0;21:45;30:45 | |
Theon Greyjoy;16:45;40:15;22:30;12:15;15:30;16:30 | |
Samwell Tarly;21:15;11:15;24:0;26:0;23:30;16:0 | |
Jorah Mormont;27:30;17:30;18:45;13:30;29:45;10:30 | |
Petyr 'Littlefinger' Baelish;25:45;21:15;8:45;19:0;18:30;9:0 | |
Eddard 'Ned' Stark;92:30;0:0;0:0;0:0;0:0;7:15 | |
Brienne of Tarth;0:0;18:0;21:15;20:0;14:45;15:30 | |
Davos Seaworth;0:0;19:15;19:0;11:30;12:30;27:15 | |
Bran Stark;25:45;13:15;16:45;12:0;0:0;18:15 | |
Catelyn Stark;42:15;24:0;16:30;0:0;0:0;0:0 | |
Lord Varys;19:0;14:0;15:45;6:30;10:30;16:0 | |
Tywin Lannister;10:15;17:15;21:15;28:30;1:0;0:0 | |
Margaery Tyrell;0:0;9:30;21:15;17:45;11:15;18:15 | |
Robb Stark;24:45;27:30;25:30;0:0;0:0;0:0 | |
Stannis Baratheon;0:0;19:45;14:15;10:0;29:15;0:0 | |
Sandor 'The Hound' Clegane;6:45;11:45;16:0;28:30;0:0;9:45 | |
Joffrey Baratheon;17:45;17:30;20:15;14:45;0:0;0:0 | |
Ramsay Bolton;0:0;0:0;12:45;13:30;18:30;21:15 | |
Melisandre;0:0;12:45;18:0;5:15;11:45;18:0 | |
Bronn;15:0;14:30;6:45;8:45;13:30;5:30 | |
Gilly;0:0;3:15;13:30;9:15;11:45;15:15 | |
Ygritte;0:0;17:45;25:0;8:15;0:0;0:0 | |
Shae;8:0;16:0;14:30;8:45;0:0;0:0 | |
Daario Naharis;0:0;0:0;6:30;7:30;19:15;12:45 | |
Missandei;0:0;0:0;11:30;9:30;13:15;11:30 | |
Tommen Baratheon;0:45;4:15;0:0;9:45;11:30;17:0 | |
Tormund Giantsbane;0:0;0:0;8:45;7:15;12:15;13:30 | |
Podrick Payne;0:0;3:45;5:30;16:0;9:0;7:0 | |
Olenna Tyrell;0:0;0:0;13:45;8:0;8:0;8:30 | |
High Sparrow;0:0;0:0;0:0;0:0;17:0;20:30 | |
Barristan Selmy;8:45;0:0;13:30;7:45;7:15;0:0 | |
Grand Maester Pycelle;13:45;6:15;2:45;5:15;2:30;4:45 | |
Grey Worm;0:0;0:0;5:30;6:30;10:15;10:0 | |
Loras Tyrell;5:45;8:0;6:15;1:45;4:30;5:45 | |
Talisa Maegyr;0:0;14:45;16:0;0:0;0:0;0:0 | |
Robert Baratheon;30:30;0:0;0:0;0:0;0:0;0:0 | |
Roose Bolton;0:0;3:30;8:45;5:45;9:0;3:15 | |
Osha;8:0;9:30;8:45;0:0;0:0;3:30 | |
Hodor;3:0;6:0;7:0;7:0;0:0;6:30 | |
Gendry;2:45;9:0;17:0;0:0;0:0;0:0 | |
Oberyn Martell;0:0;0:0;0:0;28:30;0:0;0:0 | |
Eddison Tollett;0:0;5:0;3:45;5:45;4:15;9:30 | |
Yara Greyjoy;0:0;10:0;2:45;2:15;0:0;12:0 | |
Meera Reed;0:0;0:0;8:15;9:15;0:0;9:30 | |
Jaqen H'ghar;0:0;8:0;0:0;0:0;11:15;7:15 | |
Alliser Thorne;6:45;0:0;0:0;9:15;5:45;4:15 | |
Khal Drogo;22:45;2:15;0:0;0:0;0:0;0:0 | |
Renly Baratheon;9:30;14:30;0:0;0:0;0:0;0:0 | |
Maester Luwin;6:45;16:45;0:0;0:0;0:0;0:0 | |
Ros;8:15;10:15;3:15;0:0;0:0;0:0 | |
Grenn;6:15;5:0;3:0;7:0;0:0;0:0 | |
Mance Rayder;0:0;0:0;6:45;7:0;7:15;0:0 | |
Jeor Mormont;10:0;7:0;4:0;0:0;0:0;0:0 | |
Viserys Targaryen;20:30;0:0;0:0;0:0;0:0;0:0 | |
Qyburn;0:0;0:0;5:45;2:15;4:15;7:0 | |
Jojen Reed;0:0;0:0;9:45;9:15;0:0;0:0 | |
Maester Aemon;5:30;0:0;1:15;6:0;6:15;0:0 | |
Gregor 'The Mountain' Clegane;2:30;1:45;0:0;4:45;0:30;9:15 | |
Ellaria Sand;0:0;0:0;0:0;6:45;8:45;3:0 | |
Lancel Lannister;3:15;6:30;0:0;0:0;4:0;5:15 | |
Shireen Baratheon;0:0;0:0;4:0;2:45;11:30;0:0 | |
Edmure Tully;0:0;0:0;10:30;0:0;0:0;7:15 | |
Lysa Arryn;6:0;0:0;0:0;10:30;0:0;0:0 | |
Meryn Trant;3:0;2:45;1:15;1:45;7:15;0:0 | |
Brynden 'Blackfish' Tully;0:0;0:0;9:30;0:0;0:0;6:30 | |
Walder Frey;3:0;0:0;7:0;0:0;0:0;5:45 | |
Thoros of Myr;0:0;0:0;12:45;0:0;0:0;2:15 | |
Janos Slynt;1:0;4:15;0:0;5:15;3:45;0:0 | |
Locke;0:0;0:0;7:0;6:0;0:0;0:0 | |
Myranda;0:0;0:0;3:30;1:30;7:15;0:45 | |
Rickon Stark;1:15;4:15;3:45;0:0;0:0;3:15 | |
Rodrik Cassel;10:0;2:0;0:0;0:0;0:0;0:45 | |
Waif;0:0;0:0;0:0;0:0;4:30;8:15 | |
Hot Pie;0:45;5:30;4:30;2:0;0:0;0:0 | |
Rast;4:15;0:0;4:0;4:30;0:0;0:0 | |
Septa Unella;0:0;0:0;0:0;0:0;7:30;5:0 | |
Olly;0:0;0:0;0:0;3:15;7:15;2:0 | |
Doreah;9:30;3:0;0:0;0:0;0:0;0:0 | |
Balon Greyjoy;0:0;6:30;2:30;0:0;0:0;3:15 | |
Benjen Stark;6:15;0:0;0:0;0:0;0:0;5:45 | |
Pypar;7:0;0:0;0:45;4:15;0:0;0:0 | |
Yoren;6:0;6:0;0:0;0:0;0:0;0:0 | |
Myrcella Baratheon;0:45;1:30;0:0;0:0;8:0;1:30 | |
Hizdahr zo Loraq;0:0;0:0;0:0;2:0;9:45;0:0 | |
Mace Tyrell;0:0;0:0;0:0;3:45;3:30;4:15 | |
Robin Arryn;3:45;0:0;0:0;4:45;0:30;2:15 | |
Beric Dondarrion;0:30;0:0;8:15;0:0;0:0;2:30 | |
Karl Tanner;0:0;0:0;0:0;2:30;8:45;0:0 | |
Selyse Baratheon;0:0;0:15;2:0;5:15;3:45;0:0 | |
Xaro Xhoan Daxos;0:0;10:30;0:0;0:0;0:0;0:0 | |
Irri;7:0;3:0;0:0;0:0;0:0;0:0 | |
Lady Crane;0:0;0:0;0:0;0:0;0:0;10:0 | |
Qhorin Halfhand;0:0;9:30;0:0;0:0;0:0;0:0 | |
Orell;0:0;0:0;9:0;0:0;0:0;0:0 |
from bs4 import BeautifulSoup | |
from html import unescape | |
import re | |
import requests as r | |
def time_to_float(time):
    """Turn a ``"<whole>:<sixtieths>"`` string into a single number.

    A string without a colon is returned as a plain ``int``.  Otherwise the
    field after the colon is divided by 60 and added to the field before
    it; an empty leading field (as in ``":45"``) counts as zero.

    Args:
        time: Text such as ``"52:45"``, ``":45"`` or ``"7"``.

    Returns:
        An ``int`` when there is no colon, otherwise a ``float``.

    Raises:
        ValueError: If a field is not an integer or there is more than one
            colon.
    """
    fields = time.split(":")
    if len(fields) != 1:
        whole, sixtieths = fields
        # ``or 0`` covers the empty leading field of inputs like ":45".
        return int(whole or 0) + int(sixtieths) / 60
    return int(fields[0])
def float_to_time(value):
    """Format a fractional duration as a ``"<whole>:<sixtieths>"`` string.

    This is the inverse of ``time_to_float``: the integer part of *value*
    becomes the field before the colon and the fractional part, scaled by
    60, becomes the field after it.  Neither field is zero-padded, so
    ``1.5`` becomes ``"1:30"`` and ``0`` becomes ``"0:0"``.

    The value is rounded to the nearest sixtieth before being split.
    Truncating instead loses a unit whenever the float sits just below an
    exact value (``1 + 20/60`` would otherwise be rendered as ``"1:19"``).

    Args:
        value: Non-negative duration as an ``int`` or ``float``.

    Returns:
        The formatted ``"<whole>:<sixtieths>"`` string.
    """
    # Work in whole sixtieths so that rounding can carry into the leading
    # field (59.999... sixtieths becomes the next whole unit, not ":59").
    whole, sixtieths = divmod(round(value * 60), 60)
    return "{0}:{1}".format(whole, sixtieths)
# The page was downloaded by hand and saved as data.html.
# Use a context manager so the file handle is closed once it is parsed.
with open("data.html") as page:
    soup = BeautifulSoup(page, "html.parser")

# Every <div> with class "info" or "description", minus the first match.
mydivs = soup.find_all("div", class_=["info", "description"])[1:]
name_regex = re.compile(r">(.*?)</a>")
season_regex = re.compile(r"\* [Ss]eason (\d):.*<(.*)>")

# Maps a character name to six per-season totals (index 0 is season 1).
characters = dict()
# Name taken from the most recent div that contained a link; the season
# lines found in the following link-less divs are credited to it.
character = None
for div in mydivs:
    if div.a:
        # unescape() operates on text, so serialise the tag first;
        # handing it the Tag object leaves HTML entities undecoded.
        text = unescape(str(div.a))
        character = name_regex.findall(text)[0]
        characters[character] = [0, 0, 0, 0, 0, 0]
    elif character is not None:
        # A link-less div seen before any name has nobody to be credited
        # to, hence the guard above.
        for children in div.children:
            # Only bare text nodes are scanned for "* Season N: ... <time>".
            if "NavigableString" in type(children).__name__:
                duration = season_regex.findall(str(children))
                if duration:
                    season, time = duration[0]
                    characters[character][int(season) - 1] += time_to_float(time)

with open("all_seasons.csv", "w") as db:
    db.write("actor;season_1;season_2;season_3;season_4;season_5;season_6\n")
    for character, totals in characters.items():
        db.write("{0};{1}\n".format(
            character,
            ";".join(map(float_to_time, totals))))
#!/anaconda/bin/python3 | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
import numpy as np | |
plt.style.use("ggplot") | |
# Durations are written as minutes:seconds; the minutes field is allowed to
# exceed 59 instead of rolling over into hours.
def time_to_float(value):
    """Convert a ``"minutes:seconds"`` string into fractional minutes.

    Args:
        value: Text such as ``"52:45"``; the literal ``"0"`` is also
            accepted and means zero.

    Returns:
        ``0`` for the literal ``"0"``, otherwise minutes plus seconds / 60.

    Raises:
        ValueError: If the text is not exactly two integer fields separated
            by a colon.
    """
    if value == "0":
        return 0
    minutes, seconds = [int(field) for field in value.split(":")]
    return minutes + seconds / 60
def plot_by_time(df):
    """Plot one stacked horizontal bar per character and save it as a PNG.

    Each bar is split into the per-season columns of *df*, and a dashed
    vertical line marks the median of the ``total`` column.  The figure is
    written to ``all_actors.png``.

    Args:
        df: DataFrame whose first column is ``actor``, whose last column is
            ``total`` and whose columns in between are the per-season
            values to stack.
    """
    fig = plt.figure(figsize=(8, 20))
    ax = fig.add_subplot(111)
    # Everything between the leading 'actor' and trailing 'total' columns.
    season_columns = df.columns[1:-1]
    df.plot.barh(ax=ax, x='actor', y=season_columns,
                 stacked=True, width=.6)
    # Take the median of the numeric column directly: DataFrame.median()
    # would also try to reduce the string 'actor' column, which raises on
    # pandas >= 2.0.
    ax.axvline(df['total'].median(), color='black', linestyle="dashed")
    ax.set_title("Screen time of GOT characters")
    ax.set_ylabel("")
    ax.set_xlabel("Time in minutes")
    fig.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
    # One explicit legend is enough; a preceding automatic plt.legend()
    # would simply be replaced by this call.
    ax.legend(["Median", "Season 1", "Season 2", "Season 3", "Season 4", "Season 5", "Season 6"])
    fig.text(.02, .005, "Source: http://imdb.com/list/ls076752033/")
    fig.savefig("all_actors.png", dpi=300, format="png")
def plot_by_time_by_season(df):
    """Plot the ten highest-ranked characters of every season in one figure.

    One subplot is drawn per season column on a 2 x 3 grid, each in its own
    colour sampled from the ``Set1`` colormap.  The figure is written to
    ``char_by_season.png``.

    Args:
        df: DataFrame whose first column is ``actor``, whose last column is
            skipped, and whose columns in between are the per-season
            values.
    """
    fig = plt.figure(figsize=(16, 12))
    palette = plt.get_cmap('Set1')
    colors = [palette(stop) for stop in np.linspace(0, 1, 6)]
    season_columns = df.columns[1:len(df.columns) - 1]
    for position, season in enumerate(season_columns, start=1):
        ax = fig.add_subplot(2, 3, position)
        ax.set_title(season.replace("_", " ").title())
        # Ascending sort, so the last ten rows are the season's top ten.
        ranked = df.sort_values(by=season)
        top_ten = ranked.iloc[-10:]
        top_ten.plot.barh(ax=ax, x='actor', y=season,
                          stacked=True, width=.6, legend=False,
                          color=colors[position - 1])
        ax.set_ylabel("")
        ax.set_xlabel("Time in minutes")
    fig.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
    fig.text(.02, .005, "Source: http://imdb.com/list/ls076752033/")
    fig.savefig("char_by_season.png", dpi=300, format="png")
def by_house(df):
    """Plot stacked per-season totals for the ten largest houses.

    The house is taken to be the last word of the ``actor`` value, with
    "Jon Snow" assigned to "Stark" by hand.  Rows whose name is a single
    word are dropped.  The figure is written to ``by_house.png``.

    Args:
        df: DataFrame with an ``actor`` column first, the per-season
            columns next and a ``total`` column after them.  It is not
            modified.
    """
    # Work on a copy: adding 'house' to the caller's frame would shift the
    # positional column slices that the other plotting code relies on.
    frame = df.copy()
    # in most cases, last name is the house.
    frame['house'] = frame['actor'].map(lambda name: name.split(" ")[-1])
    # fix for snow :(
    frame.loc[frame['actor'] == "Jon Snow", 'house'] = 'Stark'
    # delete people without last name
    frame = frame.loc[frame['actor'].map(lambda name: len(name.split(" "))) > 1]
    # Skip the leading 'actor' column and the trailing 'total' and 'house'.
    season_columns = frame.columns[1:-2]
    # numeric_only keeps the string 'actor' column out of the sum.
    grouped = frame.groupby(by='house').sum(numeric_only=True)
    grouped = grouped.reset_index().sort_values(by='total')
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Ascending sort, so the last ten rows are the ten largest totals.
    grouped.iloc[-10:].plot.barh(ax=ax, x='house', y=season_columns,
                                 stacked=True, width=.6)
    ax.legend(["Season 1", "Season 2", "Season 3", "Season 4", "Season 5", "Season 6"])
    ax.set_title("Screen time by house")
    ax.set_xlabel("Minutes")
    fig.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
    fig.savefig("by_house.png", dpi=300, format="png")
def by_season(df):
    """Plot the ten largest houses of each season on a 2 x 3 grid.

    The house is taken to be the last word of the ``actor`` value, with
    "Jon Snow" assigned to "Stark" by hand.  Rows whose name is a single
    word are dropped.  Columns ``season_1`` to ``season_6`` are summed per
    house and the figure is written to ``by_season.png``.

    Args:
        df: DataFrame with an ``actor`` column and numeric columns
            ``season_1`` through ``season_6``.  It is not modified.
    """
    # Work on a copy: adding 'house' to the caller's frame would shift the
    # positional column slices that the other plotting code relies on.
    frame = df.copy()
    # in most cases, last name is the house.
    frame['house'] = frame['actor'].map(lambda name: name.split(" ")[-1])
    fig = plt.figure(figsize=(19, 10))
    # fix for snow :(
    frame.loc[frame['actor'] == "Jon Snow", 'house'] = 'Stark'
    # delete people without last name
    frame = frame.loc[frame['actor'].map(lambda name: len(name.split(" "))) > 1]
    # numeric_only keeps the string 'actor' column out of the sum.
    grouped = frame.groupby(by='house').sum(numeric_only=True).reset_index()
    cmap = plt.get_cmap('Set1')
    colors = [cmap(stop) for stop in np.linspace(0, 1, 6)]
    # Subplot positions are 1-based and fill the grid row by row, so the
    # n-th season simply goes into slot n.
    for position in range(1, 7):
        season = "season_%d" % position
        ax = fig.add_subplot(2, 3, position)
        # Ascending sort, so the last ten rows are the season's top ten.
        top_houses = grouped.sort_values(by=season).iloc[-10:]
        top_houses.plot.barh(ax=ax, x='house', y=season, legend=False,
                             color=colors[position - 1])
        ax.set_ylabel("House")
        ax.set_xlabel("Minutes")
        ax.set_title(season.replace("_", " ").title())
    fig.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
    fig.text(.02, .005, "Source: http://imdb.com/list/ls076752033/")
    fig.savefig("by_season.png", dpi=300, format="png")
# Load the CSV written by the parser; fields are separated by ';'.
df = pd.read_csv("all_seasons.csv", sep=";")

# Append the running total first so that the slice below (everything between
# the leading 'actor' column and the trailing 'total' column) picks up
# exactly the per-season columns.
df['total'] = 0
season_columns = df.columns[1:-1]
for column in season_columns:
    # Replace the "minutes:seconds" text with a number, then accumulate it.
    df[column] = df[column].map(time_to_float)
    df['total'] = df['total'] + df[column]

# Ascending order, so the largest totals end up in the last rows.
df = df.sort_values(by="total")

# Print each season's column sum divided by 60.
for column in season_columns:
    print(df[column].sum() / 60)

# Just uncomment the plot you want
#plot_by_time(df)
#plot_by_time_by_season(df)
#by_house(df)
#by_season(df)
#plt.show()