Finding the longest connected set of film titles using IMDB and Python

import pandas as pd
import os
import re
from collections import defaultdict
from tqdm import tqdm
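
# The script expects the raw IMDB dump 'title.basics.tsv' next to it. Below is a minimal,
# illustrative sketch for fetching it when missing; it assumes the gzipped dump is still
# published at https://datasets.imdbws.com/, so adjust the URL if that has moved.
import gzip
import shutil
import urllib.request

if not os.path.exists('title.basics.tsv'):
    urllib.request.urlretrieve('https://datasets.imdbws.com/title.basics.tsv.gz',
                               'title.basics.tsv.gz')
    with gzip.open('title.basics.tsv.gz', 'rb') as f_in, open('title.basics.tsv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)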

cached_filename = 'title.basics-trimmed.tsv'
if not os.path.exists(cached_filename):
    # filtering the full dump is a heavy operation, so do it once and cache the result
    df = pd.read_csv('title.basics.tsv', delimiter='\t')
    df = df[df['titleType'].isin(['movie', 'video', 'tvMovie', 'tvSeries', 'tvMiniSeries', 'short'])]
    df.to_csv(cached_filename, sep='\t', index=False)  # index=False so the cache round-trips cleanly
else:
    df = pd.read_csv(cached_filename, sep='\t')

df['primaryTitle'] = df.primaryTitle.map(lambda l: l.lower().replace('&', 'and'))
regex_pattern = '[A-Za-z0-9]* and [A-Za-z0-9]*'
pattern = re.compile(regex_pattern)
contains = df.primaryTitle.map(pattern.fullmatch)
and_df = df[contains.notnull()].copy()  # imdb titles written "{X} and {Y}"; .copy() avoids SettingWithCopyWarning below
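# note: fullmatch means only titles that are exactly "<word> and <word>" survive;
# e.g. "war and peace" matches, but "the old man and the sea" does not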
and_df['start'] = and_df.primaryTitle.map(lambda l: l.split(' ')[0])
and_df['end'] = and_df.primaryTitle.map(lambda l: l.split(' ')[-1])

# Now we have a dataframe of these films and we know their start and end words. We follow the outline in
# https://stackoverflow.com/a/29321323 to build a directed graph and brute-force the longest path.
graph_dict = defaultdict(list)
for _, row in and_df.iterrows():
    graph_dict[row['start']].append(row['end'])
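# e.g. a title like "pride and prejudice" contributes the edge 'pride' -> 'prejudice',
# i.e. 'prejudice' is appended to graph_dict['pride']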

def DFS(graph_dict, vertex, seen=None, path=None):
    if seen is None:
        seen = []
    if path is None:
        path = [vertex]
    seen.append(vertex)  # each word is only ever visited once, across all branches, which is kind of sad but :shrug:
    paths = []
    for next_vertex in graph_dict[vertex]:
        if next_vertex not in seen:
            t_path = path + [next_vertex]
            paths.append(tuple(t_path))
            paths.extend(DFS(graph_dict, next_vertex, seen, t_path))
    return paths
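
# illustrative sanity check on a tiny hypothetical graph (not part of the IMDB data): because
# `seen` is shared across branches, the path ('love', 'peace') is never produced once 'peace'
# has already been reached via 'war', which is the trade-off noted in the comment above
_toy = defaultdict(list, {'love': ['war', 'peace'], 'war': ['peace']})
assert DFS(_toy, 'love') == [('love', 'war'), ('love', 'war', 'peace')]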

all_biggest = []
for start in tqdm(list(graph_dict.keys())):
    # keep only the five longest paths found from each starting word
    all_biggest.extend(sorted(DFS(graph_dict, start), key=len, reverse=True)[:5])
all_biggest = sorted(all_biggest, key=len, reverse=True)
longest_chain = all_biggest[0]
longest_chain = map(lambda l: l.title(), longest_chain)  # title-case each word again
print(' and '.join(longest_chain))