Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Cross-referencing the IMDb interface TSV files (title.basics and title.ratings) using Python pandas
import pandas as pd
import numpy as np
import csv
# Local paths to the (already decompressed) IMDb dataset dumps.
basics_file_loc = 'title.basics.tsv'  # https://datasets.imdbws.com/title.basics.tsv.gz
ratings_file_loc = 'title.ratings.tsv'  # https://datasets.imdbws.com/title.ratings.tsv.gz

# Columns of title.basics that are actually loaded; all others are skipped.
basics_fields = ['tconst', 'titleType', 'primaryTitle',
                 'originalTitle', 'startYear', 'genres']

# Every selected column is read as text (startYear included), so no numeric
# inference is attempted on them.
#
# quoting=csv.QUOTE_NONE: the dumps are plain tab-separated text without any
# quoting convention, and titles may contain literal double quotes. With the
# parser's default quote handling an unbalanced '"' makes it swallow the
# following tabs/newlines and silently merge rows.
df_basics = pd.read_table(
    basics_file_loc,
    sep='\t',
    engine='c',
    skipinitialspace=True,
    usecols=basics_fields,
    dtype=dict.fromkeys(basics_fields, str),
    quoting=csv.QUOTE_NONE,
)

# Ratings are loaded in full with inferred dtypes; the same quoting rule is
# applied so both files are parsed consistently.
df_ratings = pd.read_table(
    ratings_file_loc,
    sep='\t',
    engine='c',
    skipinitialspace=True,
    quoting=csv.QUOTE_NONE,
)
# Keep only movies whose primary title starts with "Iron Man" and whose genre
# list mentions Sci-Fi.
#
# na=False: rows with a missing title/genre would otherwise put NaN into the
# masks; state explicitly that such rows do not match instead of relying on
# implicit NaN handling when the masks are combined.
# regex=False: 'Sci-Fi' is a literal substring, not a regular expression.
title_matches = df_basics['primaryTitle'].str.startswith('Iron Man', na=False)
is_movie = df_basics['titleType'] == 'movie'
is_scifi = df_basics['genres'].str.contains('Sci-Fi', regex=False, na=False)

df_filter = df_basics[title_matches & is_movie & is_scifi]
# Attach the rating columns to each filtered title by looking its tconst up in
# the ratings table. This is a left join: every filtered row is kept, and the
# original row index of df_filter is preserved.
ratings_by_tconst = df_ratings.set_index('tconst')
df_joined = df_filter.join(ratings_by_tconst, on='tconst', how='left')

print(df_joined)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment