Skip to content

Instantly share code, notes, and snippets.

@etra0
Created July 13, 2017 22:50
Show Gist options
  • Save etra0/8f4ce594fbdbb1b2a2c98052a619c7de to your computer and use it in GitHub Desktop.
Save etra0/8f4ce594fbdbb1b2a2c98052a619c7de to your computer and use it in GitHub Desktop.
Game of Thrones Screen Time

Download the HTML source of the IMDb list page and save it as data.html, then run parse_imdb.py; this generates the CSV file (all_seasons.csv).

Then you can run plot.py.

Remember to use python3

We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
actor;season_1;season_2;season_3;season_4;season_5;season_6
Tyrion Lannister;52:45;65:0;50:0;47:45;44:0;34:0
Jon Snow;54:45;33:45;36:0;42:30;49:15;52:0
Daenerys Targaryen;54:0;38:45;30:45;27:30;41:45;28:45
Cersei Lannister;28:0;36:15;22:0;37:15;49:45;28:30
Sansa Stark;24:0;34:0;33:30;32:45;34:45;40:30
Arya Stark;33:30;38:45;26:30;25:45;34:45;30:0
Jaime Lannister;21:0;18:15;34:45;36:0;21:45;30:45
Theon Greyjoy;16:45;40:15;22:30;12:15;15:30;16:30
Samwell Tarly;21:15;11:15;24:0;26:0;23:30;16:0
Jorah Mormont;27:30;17:30;18:45;13:30;29:45;10:30
Petyr 'Littlefinger' Baelish;25:45;21:15;8:45;19:0;18:30;9:0
Eddard 'Ned' Stark;92:30;0:0;0:0;0:0;0:0;7:15
Brienne of Tarth;0:0;18:0;21:15;20:0;14:45;15:30
Davos Seaworth;0:0;19:15;19:0;11:30;12:30;27:15
Bran Stark;25:45;13:15;16:45;12:0;0:0;18:15
Catelyn Stark;42:15;24:0;16:30;0:0;0:0;0:0
Lord Varys;19:0;14:0;15:45;6:30;10:30;16:0
Tywin Lannister;10:15;17:15;21:15;28:30;1:0;0:0
Margaery Tyrell;0:0;9:30;21:15;17:45;11:15;18:15
Robb Stark;24:45;27:30;25:30;0:0;0:0;0:0
Stannis Baratheon;0:0;19:45;14:15;10:0;29:15;0:0
Sandor 'The Hound' Clegane;6:45;11:45;16:0;28:30;0:0;9:45
Joffrey Baratheon;17:45;17:30;20:15;14:45;0:0;0:0
Ramsay Bolton;0:0;0:0;12:45;13:30;18:30;21:15
Melisandre;0:0;12:45;18:0;5:15;11:45;18:0
Bronn;15:0;14:30;6:45;8:45;13:30;5:30
Gilly;0:0;3:15;13:30;9:15;11:45;15:15
Ygritte;0:0;17:45;25:0;8:15;0:0;0:0
Shae;8:0;16:0;14:30;8:45;0:0;0:0
Daario Naharis;0:0;0:0;6:30;7:30;19:15;12:45
Missandei;0:0;0:0;11:30;9:30;13:15;11:30
Tommen Baratheon;0:45;4:15;0:0;9:45;11:30;17:0
Tormund Giantsbane;0:0;0:0;8:45;7:15;12:15;13:30
Podrick Payne;0:0;3:45;5:30;16:0;9:0;7:0
Olenna Tyrell;0:0;0:0;13:45;8:0;8:0;8:30
High Sparrow;0:0;0:0;0:0;0:0;17:0;20:30
Barristan Selmy;8:45;0:0;13:30;7:45;7:15;0:0
Grand Maester Pycelle;13:45;6:15;2:45;5:15;2:30;4:45
Grey Worm;0:0;0:0;5:30;6:30;10:15;10:0
Loras Tyrell;5:45;8:0;6:15;1:45;4:30;5:45
Talisa Maegyr;0:0;14:45;16:0;0:0;0:0;0:0
Robert Baratheon;30:30;0:0;0:0;0:0;0:0;0:0
Roose Bolton;0:0;3:30;8:45;5:45;9:0;3:15
Osha;8:0;9:30;8:45;0:0;0:0;3:30
Hodor;3:0;6:0;7:0;7:0;0:0;6:30
Gendry;2:45;9:0;17:0;0:0;0:0;0:0
Oberyn Martell;0:0;0:0;0:0;28:30;0:0;0:0
Eddison Tollett;0:0;5:0;3:45;5:45;4:15;9:30
Yara Greyjoy;0:0;10:0;2:45;2:15;0:0;12:0
Meera Reed;0:0;0:0;8:15;9:15;0:0;9:30
Jaqen H'ghar;0:0;8:0;0:0;0:0;11:15;7:15
Alliser Thorne;6:45;0:0;0:0;9:15;5:45;4:15
Khal Drogo;22:45;2:15;0:0;0:0;0:0;0:0
Renly Baratheon;9:30;14:30;0:0;0:0;0:0;0:0
Maester Luwin;6:45;16:45;0:0;0:0;0:0;0:0
Ros;8:15;10:15;3:15;0:0;0:0;0:0
Grenn;6:15;5:0;3:0;7:0;0:0;0:0
Mance Rayder;0:0;0:0;6:45;7:0;7:15;0:0
Jeor Mormont;10:0;7:0;4:0;0:0;0:0;0:0
Viserys Targaryen;20:30;0:0;0:0;0:0;0:0;0:0
Qyburn;0:0;0:0;5:45;2:15;4:15;7:0
Jojen Reed;0:0;0:0;9:45;9:15;0:0;0:0
Maester Aemon;5:30;0:0;1:15;6:0;6:15;0:0
Gregor 'The Mountain' Clegane;2:30;1:45;0:0;4:45;0:30;9:15
Ellaria Sand;0:0;0:0;0:0;6:45;8:45;3:0
Lancel Lannister;3:15;6:30;0:0;0:0;4:0;5:15
Shireen Baratheon;0:0;0:0;4:0;2:45;11:30;0:0
Edmure Tully;0:0;0:0;10:30;0:0;0:0;7:15
Lysa Arryn;6:0;0:0;0:0;10:30;0:0;0:0
Meryn Trant;3:0;2:45;1:15;1:45;7:15;0:0
Brynden 'Blackfish' Tully;0:0;0:0;9:30;0:0;0:0;6:30
Walder Frey;3:0;0:0;7:0;0:0;0:0;5:45
Thoros of Myr;0:0;0:0;12:45;0:0;0:0;2:15
Janos Slynt;1:0;4:15;0:0;5:15;3:45;0:0
Locke;0:0;0:0;7:0;6:0;0:0;0:0
Myranda;0:0;0:0;3:30;1:30;7:15;0:45
Rickon Stark;1:15;4:15;3:45;0:0;0:0;3:15
Rodrik Cassel;10:0;2:0;0:0;0:0;0:0;0:45
Waif;0:0;0:0;0:0;0:0;4:30;8:15
Hot Pie;0:45;5:30;4:30;2:0;0:0;0:0
Rast;4:15;0:0;4:0;4:30;0:0;0:0
Septa Unella;0:0;0:0;0:0;0:0;7:30;5:0
Olly;0:0;0:0;0:0;3:15;7:15;2:0
Doreah;9:30;3:0;0:0;0:0;0:0;0:0
Balon Greyjoy;0:0;6:30;2:30;0:0;0:0;3:15
Benjen Stark;6:15;0:0;0:0;0:0;0:0;5:45
Pypar;7:0;0:0;0:45;4:15;0:0;0:0
Yoren;6:0;6:0;0:0;0:0;0:0;0:0
Myrcella Baratheon;0:45;1:30;0:0;0:0;8:0;1:30
Hizdahr zo Loraq;0:0;0:0;0:0;2:0;9:45;0:0
Mace Tyrell;0:0;0:0;0:0;3:45;3:30;4:15
Robin Arryn;3:45;0:0;0:0;4:45;0:30;2:15
Beric Dondarrion;0:30;0:0;8:15;0:0;0:0;2:30
Karl Tanner;0:0;0:0;0:0;2:30;8:45;0:0
Selyse Baratheon;0:0;0:15;2:0;5:15;3:45;0:0
Xaro Xhoan Daxos;0:0;10:30;0:0;0:0;0:0;0:0
Irri;7:0;3:0;0:0;0:0;0:0;0:0
Lady Crane;0:0;0:0;0:0;0:0;0:0;10:0
Qhorin Halfhand;0:0;9:30;0:0;0:0;0:0;0:0
Orell;0:0;0:0;9:0;0:0;0:0;0:0
from bs4 import BeautifulSoup
from html import unescape
import re
import requests as r
def time_to_float(time):
    """Convert a scraped time string into a number.

    A string without a colon ("15") is returned as that integer. A string
    of the form "a:b" is returned as ``a + b/60`` (a float); an empty part
    before the colon (":45") counts as zero.

    Raises ValueError when a part is not an integer or when there is more
    than one colon.
    """
    parts = time.split(":")
    if len(parts) != 1:
        whole, sixtieths = parts
        # An empty leading part means "no whole units".
        return int(whole or 0) + int(sixtieths) / 60
    return int(parts[0])
def float_to_time(value):
    """Format a non-negative number of units as "whole:sixtieths".

    This is the inverse of time_to_float: 52.75 becomes "52:45". The
    second field is not zero-padded ("65:0"), which is the format the
    generated CSV uses.

    The value is rounded to the nearest sixtieth before it is split.
    Truncating instead (as int() does) loses one sixtieth whenever float
    round-off leaves the product a hair below the exact value, e.g. when
    a total was accumulated from several x/60 terms.
    """
    total_sixtieths = round(value * 60)
    whole, sixtieths = divmod(total_sixtieths, 60)
    return "{0}:{1}".format(whole, sixtieths)
# I downloaded the page and saved it as data.html.
# The context manager closes the file as soon as it has been parsed.
with open("data.html") as page:
    soup = BeautifulSoup(page, "html.parser")

# Every "info"/"description" div except the first one is processed.
mydivs = soup.find_all("div", class_=["info", "description"])[1:]
name_regex = re.compile(r">(.*?)</a>")
season_regex = re.compile(r"\* [Ss]eason (\d):.*<(.*)>")

# Maps a character name to six per-season totals (index 0 is season 1).
characters = dict()
for div in mydivs:
    if div.a:
        # A div with a link starts a new character. Serialise the tag
        # before unescaping: html.unescape() works on strings, so handing
        # it the tag object left entities such as &amp; in the name.
        text = unescape(str(div.a))
        character = name_regex.findall(text)[0]
        characters[character] = [0, 0, 0, 0, 0, 0]
    else:
        # A div without a link lists "* Season N: ... <time>" lines for
        # the most recently seen character.
        for children in div.children:
            if "NavigableString" in type(children).__name__:
                text = str(children)
                duration = season_regex.findall(text)
                if duration:
                    season, time = duration[0]
                    characters[character][int(season) - 1] += time_to_float(time)

with open("all_seasons.csv", "w") as db:
    db.write("actor;season_1;season_2;season_3;season_4;season_5;season_6\n")
    for character, seasons in characters.items():
        db.write("{0};{1}\n".format(
            character,
            ";".join(map(float_to_time, seasons))))
#!/anaconda/bin/python3
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Select matplotlib's "ggplot" style sheet for the figures created below.
plt.style.use("ggplot")
# The time is stored as minutes:seconds; the minutes are not folded into
# hours even when they add up to more than one hour.
def time_to_float(value):
    """Convert a "minutes:seconds" string to minutes.

    "52:45" becomes 52.75 and the literal "0" becomes 0. A value with no
    colon ("15") is treated as whole minutes instead of raising, so the
    function accepts the same shapes as the parser that produced the file.

    Raises ValueError when a part is not an integer.
    """
    if value == "0":
        return 0
    minute, colon, second = value.partition(":")
    if not colon:
        # No seconds field at all: the value is a plain minute count.
        return int(minute)
    return int(minute) + int(second) / 60
def plot_by_time(df):
    """Save a stacked horizontal bar chart of screen time per actor.

    df is expected to have 'actor' as its first column and 'total' as its
    last one; every column in between is drawn as one stacked segment.
    A dashed vertical line marks the median of 'total'. The figure is
    written to all_actors.png.
    """
    fig = plt.figure(figsize=(8, 20))
    ax = fig.add_subplot(111)
    season_columns = df.columns[1:-1]
    df.plot.barh(ax=ax, x='actor', y=season_columns,
                 stacked=True, width=.6)
    # Take the median of the numeric 'total' column only: DataFrame.median()
    # over the whole frame fails on the text 'actor' column in pandas >= 2.0,
    # where numeric_only defaults to False.
    ax.axvline(df['total'].median(), color='black', linestyle="dashed")
    ax.set_title("Screen time of GOT characters")
    ax.set_ylabel("")
    ax.set_xlabel("Time in minutes")
    fig.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
    # A single legend call is enough; it names the median line and the
    # six stacked segments.
    ax.legend(["Median", "Season 1", "Season 2", "Season 3", "Season 4", "Season 5", "Season 6"])
    fig.text(.02, .005, "Source: http://imdb.com/list/ls076752033/")
    fig.savefig("all_actors.png", dpi=300, format="png")
def plot_by_time_by_season(df):
    """Save a 2x3 grid of bar charts, one per season column.

    Each panel shows the ten rows with the largest value in that season.
    The season columns are taken to be everything between the first and
    the last column of df. The figure is written to char_by_season.png.
    """
    fig = plt.figure(figsize=(16, 12))
    palette = plt.get_cmap('Set1')
    colors = [palette(stop) for stop in np.linspace(0, 1, 6)]
    season_columns = df.columns[1:-1]
    for position, season in enumerate(season_columns, start=1):
        panel = fig.add_subplot(2, 3, position)
        panel.set_title(season.replace("_", " ").title())
        # Ascending sort, so the last ten rows are the ten largest.
        top_ten = df.sort_values(by=season).tail(10)
        top_ten.plot.barh(ax=panel, x='actor', y=season,
                          stacked=True, width=.6, legend=False,
                          color=colors[position - 1])
        panel.set_ylabel("")
        panel.set_xlabel("Time in minutes")
    fig.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
    fig.text(.02, .005, "Source: http://imdb.com/list/ls076752033/")
    fig.savefig("char_by_season.png", dpi=300, format="png")
def by_house(df):
    """Save a stacked bar chart of the ten houses with the most screen time.

    A row's house is the last word of its 'actor' value (Jon Snow is
    counted as Stark); rows whose name is a single word are left out.
    df is expected to start with 'actor', followed by the season columns
    and 'total'. The figure is written to by_house.png.
    """
    # Work on a copy: adding 'house' to the caller's frame would shift the
    # positional column slices the other plotting functions rely on.
    df = df.copy()
    # In most cases, the last name is the house.
    df['house'] = df['actor'].map(lambda x: x.split(" ")[-1])
    # Fix for Snow :(
    df.loc[df['actor'] == "Jon Snow", 'house'] = 'Stark'
    # Drop people without a last name.
    df = df.loc[df['actor'].map(lambda x: len(x.split(" "))) > 1]
    # Everything between 'actor' and the trailing 'total'/'house' pair.
    season_columns = list(df.columns[1:len(df.columns) - 2])
    # Sum only the numeric columns; the text 'actor' column is not needed.
    grouped = df.groupby(by='house')[season_columns + ['total']].sum()
    grouped = grouped.reset_index().sort_values(by='total')
    fig = plt.figure()
    ax = fig.add_subplot(111)
    grouped.iloc[-10:].plot.barh(ax=ax, x='house', y=season_columns,
                                 stacked=True, width=.6)
    ax.legend(["Season 1", "Season 2", "Season 3", "Season 4", "Season 5", "Season 6"])
    ax.set_title("Screen time by house")
    ax.set_xlabel("Minutes")
    fig.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
    fig.savefig("by_house.png", dpi=300, format="png")
def by_season(df):
    """Save a 2x3 grid of bar charts with the top ten houses per season.

    A row's house is the last word of its 'actor' value (Jon Snow is
    counted as Stark); rows whose name is a single word are left out.
    df must contain 'actor' and the columns season_1 .. season_6.
    The figure is written to by_season.png.
    """
    # Work on a copy: adding 'house' to the caller's frame would shift the
    # positional column slices the other plotting functions rely on.
    df = df.copy()
    # In most cases, the last name is the house.
    df['house'] = df['actor'].map(lambda x: x.split(" ")[-1])
    # Fix for Snow :(
    df.loc[df['actor'] == "Jon Snow", 'house'] = 'Stark'
    # Drop people without a last name.
    df = df.loc[df['actor'].map(lambda x: len(x.split(" "))) > 1]
    season_names = ["season_%d" % number for number in range(1, 7)]
    # Sum only the season columns; the text 'actor' column is not needed.
    grouped = df.groupby(by='house')[season_names].sum().reset_index()
    fig = plt.figure(figsize=(19, 10))
    cmap = plt.get_cmap('Set1')
    colors = [cmap(i) for i in np.linspace(0, 1, 6)]
    # Subplot positions run 1..6, row by row, in season order.
    for position, season in enumerate(season_names, start=1):
        ax = fig.add_subplot(2, 3, position)
        # Ascending sort, so the last ten rows are the ten largest.
        temp = grouped.sort_values(by=season).iloc[-10:]
        temp.plot.barh(ax=ax, x='house', y=season, legend=False,
                       color=colors[position - 1])
        ax.set_ylabel("House")
        ax.set_xlabel("Minutes")
        ax.set_title(season.replace("_", " ").title())
    fig.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
    fig.text(.02, .005, "Source: http://imdb.com/list/ls076752033/")
    fig.savefig("by_season.png", dpi=300, format="png")
# Load the CSV written by the parser and convert every season column from
# its "minutes:seconds" text into minutes, accumulating a per-row total.
df = pd.read_csv("all_seasons.csv", sep=";")
df['total'] = 0
# With 'total' appended, the season columns are everything between the
# first column and the last one.
season_columns = df.columns[1:-1]
for col in season_columns:
    df[col] = df[col].map(time_to_float)
    df['total'] = df['total'] + df[col]
df = df.sort_values(by="total")
# Print each season's summed screen time divided by 60 (minutes to hours).
for season in season_columns:
    print(df[season].sum() / 60)
# Just uncomment the plot you want
#plot_by_time(df)
#plot_by_time_by_season(df)
#by_house(df)
#by_season(df)
#plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment