library(tidyverse)
library(jsonlite)
library(UpSetR)
# IMDb data ----
# Source: https://www.imdb.com/interfaces/

# Read one IMDb dump. The files are tab-separated and mark missing values with
# "\N". Quote handling is switched off (`quote = ""`) because fields are not
# quoted in these dumps: a literal `"` inside a title would otherwise be taken
# as the start of a quoted field and swallow the following columns/rows.
read_imdb_tsv <- function(path) {
  read_delim(path, delim = "\t", na = c("", "\\N"), quote = "")
}

imdb_title_akas <- read_imdb_tsv("data/imdb/title.akas.tsv.gz")

imdb_title_basics <- read_imdb_tsv("data/imdb/title.basics.tsv.gz") %>%
  # Exclude individual TV episodes.
  filter(titleType != "tvEpisode")

imdb_title_ratings <- read_imdb_tsv("data/imdb/title.ratings.tsv.gz")

imdb_episodes <- read_imdb_tsv("data/imdb/title.episode.tsv.gz")
# Remove the duplicated entries of each title: keep only the rows flagged as
# the original title, then make sure a single row per title id remains so the
# joins on this table cannot multiply rows.
imdb_title_akas <- imdb_title_akas %>%
  filter(isOriginalTitle == 1) %>%
  distinct(titleId, .keep_all = TRUE)
#' Weighted rating of a title
#'
#' Blends the observed mean rating `R` with a reference rating `C`, weighting
#' `R` by the number of votes `v` and `C` by `m`:
#' `(v / (v + m)) * R + (m / (v + m)) * C`.
#' Formula as described in
#' https://www.quora.com/How-does-IMDbs-rating-system-work/answer/Mayank-Bhushan-1
#'
#' All arguments are used in vectorized arithmetic, so they may be vectors
#' (recycled by the usual R rules).
#'
#' @param v Number of votes received by the title.
#' @param m Weight, in votes, given to the reference rating `C`.
#' @param R Mean rating of the title.
#' @param C Reference rating the result is pulled towards when `v` is small.
#' @return The weighted rating, rounded to 2 decimal places.
imdb_weighted_rating <- function(v, m, R, C) {
  total_weight <- v + m
  weighted <- (v / total_weight) * R + (m / total_weight) * C
  round(weighted, 2)
}
# Add the weighted rating of every title, pulling titles with few votes
# towards a reference rating of 7 with a weight of 25000 votes.
imdb_title_ratings <- imdb_title_ratings %>%
  mutate(
    official_rating = imdb_weighted_rating(
      v = numVotes,
      m = 25000,
      R = averageRating,
      C = 7
    )
  )
# Build a single dataset ----
# Only the id and the title are needed from the akas table; the id is renamed
# to `tconst` so it matches the key of the other tables.
imdb_title_akas <- imdb_title_akas %>%
  select(tconst = titleId, title)

# Combine the three tables. Some titles are present in title_akas but not in
# title_basics and vice versa, hence the full join.
imdb <- imdb_title_akas %>%
  full_join(imdb_title_basics, by = "tconst") %>%
  # Attach the ratings; titles without a rating are dropped.
  inner_join(imdb_title_ratings, by = "tconst") %>%
  # Remove columns that are not needed.
  select(-c(primaryTitle, isAdult, runtimeMinutes)) %>%
  # Remove series episodes, i.e. every title listed in the episode table.
  anti_join(imdb_episodes, by = "tconst")
# Create a single title column: after the full join the title may be present
# in only one of `title` (akas table) or `originalTitle` (basics table).
# `coalesce()` takes `title` when available and falls back to `originalTitle`.
# The previous `case_when()` returned NA when BOTH columns were filled, so the
# filter below discarded every title with complete data.
imdb <- imdb %>%
  mutate(full_title = coalesce(title, originalTitle)) %>%
  # Drop entries that have no title in either column.
  filter(!is.na(full_title)) %>%
  # Best weighted rating first.
  arrange(desc(official_rating)) %>%
  select(-c(title, originalTitle)) %>%
  select(tconst, full_title, everything())
# Select the first 5000 rows of `imdb`.
top_imdb <- imdb %>%
  head(5000)

# Save the id and title of the selected entries.
top_imdb %>%
  select(tconst, full_title) %>%
  write_csv("data/imdb/lista_filmes_imdb.csv")
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment