@JoeyFaulkner
Last active April 23, 2020 13:10
Finding the longest connected chain of film titles using IMDb and Python
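The idea: a title of the form "{X} and {Y}" can chain into another one whose first word matches its last word, for example 'love and death' -> 'death and taxes' (illustrative titles, not necessarily in the dump). The script below builds a directed graph of every such title in IMDb's title.basics.tsv dataset and brute-forces the longest path through it; it assumes a copy of title.basics.tsv is present in the working directory.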
import pandas as pd
import os
import re
from collections import defaultdict
from tqdm import tqdm
cached_filename = 'title.basics-trimmed.tsv'
if not os.path.exists(cached_filename):
    # this is a heavy operation so it would be ideal to just do it once and save the result
    df = pd.read_csv('title.basics.tsv', delimiter='\t')
    df = df[df['titleType'].isin(['movie', 'video', 'tvMovie', 'tvSeries', 'tvMiniSeries', 'short'])]
    df.to_csv(cached_filename, sep='\t')
else:
    df = pd.read_csv(cached_filename, sep='\t')
df['primaryTitle'] = df.primaryTitle.map(lambda l: l.lower().replace('&', 'and'))  # normalise: lowercase and treat '&' as 'and'
regex_pattern = '[A-Za-z0-9]* and [A-Za-z0-9]*'  # titles of the exact form "{X} and {Y}", one word on either side
pattern = re.compile(regex_pattern)
contains = df.primaryTitle.map(pattern.fullmatch)  # a Match object where the whole title fits, else None
and_df = df[~contains.isnull()].copy()  # dataframe of imdb titles which are written "{X} and {Y}"
and_df['start'] = and_df.primaryTitle.map(lambda l: l.split(' ')[0])
and_df['end'] = and_df.primaryTitle.map(lambda l: l.split(' ')[-1])
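# e.g. (hypothetical rows) primaryTitle 'love and death' -> start 'love', end 'death';
# 'death and taxes' -> start 'death', end 'taxes'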
# Now we have a dataframe of these films, and we know the start and end word of them. We then follow the outline in
# https://stackoverflow.com/a/29321323 to create a directed graph and brute force find the longest path
graph_dict = defaultdict(list)
for _, row in and_df.iterrows():
    graph_dict[row['start']].append(row['end'])
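# With the hypothetical rows above, graph_dict would look like
# {'love': ['death'], 'death': ['taxes']}: an edge from each start word to each end word.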
def DFS(graph_dict, vertex, seen=None, path=None):
    if seen is None: seen = []
    if path is None: path = [vertex]
    seen.append(vertex)  # this means we only go to each word once which is kind of sad but :shrug:
    paths = []
    for next_vertex in graph_dict[vertex]:
        if next_vertex not in seen:
            t_path = path + [next_vertex]
            paths.append(tuple(t_path))
            paths.extend(DFS(graph_dict, next_vertex, seen, t_path))
    return paths
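# Quick sanity check on a toy graph (hypothetical example, not part of the original gist):
# DFS(defaultdict(list, {'love': ['death'], 'death': ['taxes']}), 'love')
# returns [('love', 'death'), ('love', 'death', 'taxes')] -- every path reachable from
# 'love' as a tuple of words, so the longest chain is simply the longest tuple found.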
all_biggest = []
for start in tqdm(list(graph_dict.keys())):
    # keep the five longest paths found from each starting word
    all_biggest.extend(sorted(DFS(graph_dict, start), key=len, reverse=True)[:5])
all_biggest = sorted(all_biggest, key=len, reverse=True)
longest_chain = all_biggest[0]
longest_chain = map(lambda l: l.title(), longest_chain)  # title-case each word for printing
print(' and '.join(longest_chain))
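# The printed result is the chain title-cased and re-joined with ' and ', i.e. something of
# the form 'Love and Death and Taxes and ...' (the actual chain depends on the IMDb dump used).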