Last active
February 24, 2019 02:11
-
-
Save sillasgonzaga/565d69234f53b3aeac9e22ea7ed692b4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(jsonlite) | |
library(UpSetR) | |
#### imdb data ####
# Source: https://www.imdb.com/interfaces/

# Read one of the gzipped, tab-separated IMDb dump files.
#   path: location of the .tsv.gz file.
# Returns the tibble produced by read_delim().
# Empty strings and the literal "\N" are both read as NA.
# quote = "" switches quote handling off: a double quote inside a title is
# then kept as ordinary text instead of opening a quoted field that could
# swallow the following tabs/newlines and merge rows.
read_imdb_tsv <- function(path) {
  read_delim(path, delim = "\t", na = c("", "\\N"), quote = "")
}

imdb_title_akas <- read_imdb_tsv("data/imdb/title.akas.tsv.gz")

imdb_title_basics <- read_imdb_tsv("data/imdb/title.basics.tsv.gz") %>%
  # drop TV episodes
  filter(titleType != "tvEpisode")

imdb_title_ratings <- read_imdb_tsv("data/imdb/title.ratings.tsv.gz")

imdb_episodes <- read_imdb_tsv("data/imdb/title.episode.tsv.gz")
# Drop the alternative-title rows, keeping only those flagged as the
# original title (intended to remove duplicated entries per title).
imdb_title_akas <- filter(imdb_title_akas, isOriginalTitle == 1)
# Weighted rating, following the IMDb formula described at:
# https://www.quora.com/How-does-IMDbs-rating-system-work/answer/Mayank-Bhushan-1
#
# The result is a weighted average of R and C, with weights v and m:
#   v: number of votes received by the title
#   m: weight (expressed in votes) given to the baseline rating C
#   R: mean rating of the title
#   C: baseline rating the result is pulled towards when v is small
# All arguments are vectorised; the result is rounded to 2 decimal places.
imdb_weighted_rating <- function(v, m, R, C) {
  total_weight <- v + m
  share_votes <- v / total_weight
  share_baseline <- m / total_weight
  round(share_votes * R + share_baseline * C, digits = 2)
}
# Add the weighted rating of every rated title as `official_rating`,
# using a baseline rating of 7 weighted as 25000 votes.
imdb_title_ratings <- mutate(
  imdb_title_ratings,
  official_rating = imdb_weighted_rating(
    v = numVotes,
    m = 25000,
    R = averageRating,
    C = 7
  )
)
# Keep only the title id (renamed to `tconst`, the join key used below)
# and the title text.
imdb_title_akas <- select(imdb_title_akas, tconst = titleId, title)

# Build a single dataset out of the three tables.
# For some reason a few titles are present in title_akas but not in
# title_basics, and vice versa, hence the full join.
imdb <- imdb_title_akas %>%
  full_join(imdb_title_basics, by = "tconst") %>%
  # attach the ratings; titles without a rating are dropped
  inner_join(imdb_title_ratings, by = "tconst") %>%
  # drop columns that are not needed
  select(-primaryTitle, -isAdult, -runtimeMinutes) %>%
  # remove TV-series episodes
  anti_join(imdb_episodes, by = "tconst")
# Build a single title column. The title may be present in `title`, in
# `originalTitle`, or in both.
# coalesce() takes `title` when it is available and falls back to
# `originalTitle` otherwise, so rows that carry both are kept; they must not
# be mapped to NA, or the filter below would discard them.
imdb <- imdb %>%
  mutate(full_title = coalesce(title, originalTitle)) %>%
  # drop rows that have no title in either column
  filter(!is.na(full_title)) %>%
  # best weighted rating first
  arrange(desc(official_rating)) %>%
  # the two source columns are now redundant
  select(-c(title, originalTitle)) %>%
  # move the id and the title to the front
  select(tconst, full_title, everything())
# Take the first 5000 rows; this yields the 5000 highest ratings only if
# `imdb` is already sorted by descending official_rating.
top_imdb <- head(imdb, 5000)

# Save the ids and titles of the selected rows to disk.
write_csv(
  select(top_imdb, tconst, full_title),
  "data/imdb/lista_filmes_imdb.csv"
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment