Skip to content

Instantly share code, notes, and snippets.

@sillasgonzaga
Last active February 24, 2019 02:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sillasgonzaga/565d69234f53b3aeac9e22ea7ed692b4 to your computer and use it in GitHub Desktop.
Save sillasgonzaga/565d69234f53b3aeac9e22ea7ed692b4 to your computer and use it in GitHub Desktop.
library(tidyverse)
library(jsonlite)
library(UpSetR)
#### imdb data ####
# Source: https://www.imdb.com/interfaces/

# Read one gzipped, tab-separated IMDb dump.
#   path: path to the .tsv.gz file.
# Returns a tibble in which empty fields and the literal \N marker are NA.
read_imdb_tsv <- function(path) {
  read_delim(
    path,
    delim = "\t",
    na = c("", "\\N"),
    # The dumps are plain TSV with no quoting convention. With readr's default
    # quote character, a stray `"` inside a title opens a "quoted" field that
    # swallows the following tabs/newlines and silently corrupts rows.
    quote = ""
  )
}

imdb_title_akas <- read_imdb_tsv("data/imdb/title.akas.tsv.gz")
imdb_title_basics <- read_imdb_tsv("data/imdb/title.basics.tsv.gz") %>%
  # drop individual TV episodes
  filter(titleType != "tvEpisode")
imdb_title_ratings <- read_imdb_tsv("data/imdb/title.ratings.tsv.gz")
imdb_episodes <- read_imdb_tsv("data/imdb/title.episode.tsv.gz")
# De-duplicate titles: drop the alternate-title rows and keep only the rows
# flagged as the original title.
imdb_title_akas <- filter(imdb_title_akas, isOriginalTitle == 1)
# Weighted rating following the formula IMDb publishes for its charts.
# Source: https://www.quora.com/How-does-IMDbs-rating-system-work/answer/Mayank-Bhushan-1
#   v: number of votes received by the title
#   m: vote-count weight given to the prior mean C
#   R: mean rating of the title
#   C: prior mean rating the result is pulled towards
# Returns the weighted rating rounded to 2 decimal places (vectorised).
imdb_weighted_rating <- function(v, m, R, C) {
  total_weight <- v + m
  own_share <- v / total_weight
  prior_share <- m / total_weight
  round(own_share * R + prior_share * C, 2)
}
# Add the weighted rating column: each title's average is pulled towards a
# prior mean of 7 that carries the weight of 25000 votes.
imdb_title_ratings <- mutate(
  imdb_title_ratings,
  official_rating = imdb_weighted_rating(
    v = numVotes,
    m = 25000,
    R = averageRating,
    C = 7
  )
)
# Reduce title.akas to the join key plus the title text.
imdb_title_akas <- imdb_title_akas %>%
  select(titleId, title) %>%
  rename(tconst = titleId)

# Build one combined dataset out of the three tables.
# Some titles are present in title_akas but not in title_basics and vice versa,
# hence the full join between those two.
imdb <- imdb_title_akas %>%
  full_join(imdb_title_basics, by = "tconst") %>%
  # attach the ratings; titles without a rating are dropped
  inner_join(imdb_title_ratings, by = "tconst") %>%
  # drop anything listed as a series episode
  anti_join(imdb_episodes, by = "tconst") %>%
  # drop columns that are not needed downstream
  select(-primaryTitle, -isAdult, -runtimeMinutes)
# Build a single title column. After the full join a row may carry `title`
# (from title.akas), `originalTitle` (from title.basics), or both.
imdb <- imdb %>%
  # Prefer `title` and fall back to `originalTitle`; the result is NA only when
  # both are missing. (Previously rows that had BOTH columns filled were set to
  # NA and then removed by the filter below.)
  mutate(full_title = coalesce(title, originalTitle)) %>%
  # discard rows that have no usable title at all
  filter(!is.na(full_title)) %>%
  # best-rated titles first
  arrange(desc(official_rating)) %>%
  select(-c(title, originalTitle)) %>%
  # put the identifying columns first
  select(tconst, full_title, everything())
# Keep the 5000 best-rated titles (imdb is already sorted by rating).
top_imdb <- head(imdb, 5000)

# Save the id/title pairs to disk.
write_csv(
  select(top_imdb, tconst, full_title),
  "data/imdb/lista_filmes_imdb.csv"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment