Code to fetch playlists from a Facebook group and video metadata from YouTube.
list.of.packages <- c("ggplot2", "devtools")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
library(devtools)
install_github("pablobarbera/Rfacebook/Rfacebook", force=T)
require(Rfacebook)
# http://thinktostart.com/analyzing-facebook-with-r/
# https://github.com/pablobarbera/Rfacebook/issues/180
token <- '' # A token obtained from the Facebook API: https://developers.facebook.com/tools/accesstoken/
fm <- getGroup("280420788956517", token=token, n=20000) # 280420788956517 is the id of the FM group.
write.csv(fm, file="fm.csv")
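# Optional sanity check: getGroup() returns one row per post, with columns
# such as message, created_time, type, link, likes_count, comments_count and
# shares_count, which are the ones used below.
# str(fm)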
# Facebook stopped allowing user information to be retrieved in some cases,
# so we won't have statistics on the people posting on FM.
require(dplyr)
require(stringr)
require(lubridate)
# Transformations
tbl_fm <- tbl_df(fm)
tbl_fm <- tbl_fm %>% filter(!is.na(link))
tbl_fm <- tbl_fm %>% mutate(parsed_created_time=as.POSIXct(as_datetime(strptime(created_time, format="%Y-%m-%dT%H:%M:%S+0000"))))
tbl_fm <- tbl_fm %>% select(message, parsed_created_time, type, link, likes_count, comments_count, shares_count)
messages <- tbl_fm$message
# Now extract the hashtags, separated by spaces
# ([A-Za-zÀ-ú] also matches the accented characters common in Spanish)
hashtags <- unlist(lapply(messages, function(x)
  paste(unlist(str_extract_all(x, pattern="#[A-Za-zÀ-ú]+")), collapse=' ')))
# paste() turns NA messages into the literal string "NA"; set those (and
# empty strings) back to real NAs:
hashtags[hashtags == "NA"] <- NA
hashtags[hashtags == ""] <- NA
hashtags <- tolower(hashtags)
tbl_fm$hashtags <- hashtags
# The following function extracts the id from a YouTube video url. Based on this solution:
# https://stackoverflow.com/questions/45441896/extract-youtube-video-id-from-url-with-r-stringr-regex
get_id = function(link) {
  if (stringr::str_detect(link, 'youtu')) {
    if (stringr::str_detect(link, '/watch\\?')) {
      rgx = '(?<=\\?v=|&v=)[\\w\\-]+'
    } else if (stringr::str_detect(link, '/attribution_link\\?')) {
      rgx = '(?<=\\?a=|&a=)[\\w\\-]+'
    } else {
      rgx = '(?<=/)[\\w\\-]+/?(?:$|\\?)'
    }
    stringr::str_extract(link, rgx)
  } else {
    NA
  }
}
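# Illustrative check of get_id() on typical link shapes (example urls, not
# taken from the data):
# get_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ") # "dQw4w9WgXcQ"
# get_id("https://youtu.be/dQw4w9WgXcQ")                # "dQw4w9WgXcQ"
# get_id("https://example.com/page")                    # NA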
ids = unlist(unname(sapply(tbl_fm$link, get_id)))
tbl_fm$video_id <- ids
require(tuber)
if (exists("video_data")) rm(video_data) # clear results from any previous run
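# Note: tuber needs to authenticate against the YouTube Data API before
# get_video_details() can be called. A minimal sketch, assuming you have
# created OAuth credentials in the Google API console (placeholder values):
# yt_oauth(app_id = "YOUR_APP_ID", app_secret = "YOUR_APP_SECRET")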
# Fetch snippet metadata for each video id; failed lookups (deleted videos,
# missing ids) become NA.
video_data <- lapply(ids, function(x)
  tryCatch(get_video_details(x),
           warning=function(w) NA,
           error=function(e) NA)
)
# Pull the localized title out of each API response; NA responses stay NA.
video_titles <- vapply(video_data, function(v) {
  if (length(v) == 1 && is.na(v)) NA_character_
  else v$items[[1]]$snippet$localized$title
}, character(1))
tbl_fm$video_titles <- video_titles
# New iteration: keep only posts whose video title could be retrieved.
yt_fm <- tbl_fm %>%
  filter(!is.na(video_titles)) %>%
  mutate(youtube_link=paste0("https://www.youtube.com/watch?v=", video_id)) %>%
  select(parsed_created_time, hashtags, video_id, video_titles, likes_count, comments_count, link, youtube_link)
# https://www.rdocumentation.org/packages/tidyr/versions/0.8.0/topics/separate_rows
# I split the hashtags into separate observations so they can be grouped.
require(tidyr)
yt_fm_sep <- yt_fm %>% separate_rows(hashtags, sep=" ")
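# What separate_rows() does here, on an invented row: a post tagged
# "#cumbia #salsa" becomes two rows, one per hashtag, with every other
# column duplicated.
# separate_rows(tibble(id=1, hashtags="#cumbia #salsa"), hashtags, sep=" ")
# #> id hashtags
# #>  1 #cumbia
# #>  1 #salsa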
# Ranking of the most used hashtags. I'll take the top 60.
cuenta_hashtags <- yt_fm_sep %>%
  group_by(hashtags) %>%
  summarize(count=n()) %>%
  arrange(desc(count))
hashtags_populares <- head(cuenta_hashtags, 60)
videos_populares <- tbl_df(merge(hashtags_populares, yt_fm_sep, by="hashtags"))
videos_populares <- videos_populares %>%
  mutate(hashtags = tolower(hashtags)) %>%
  group_by(hashtags) %>%
  mutate(numero = row_number()) %>%
  # chunk each hashtag's videos into playlists of at most 50
  mutate(playlist = paste0(hashtags, ceiling(numero/50)))
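# Illustration of the chunking above: a hashtag with 120 videos gets
# ceiling(1:120 / 50), i.e. 1 for rows 1-50, 2 for rows 51-100 and 3 for
# rows 101-120, producing playlists "<hashtag>1", "<hashtag>2", "<hashtag>3".
# 50 is reportedly the maximum number of ids the watch_videos url (used
# below) accepts.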
# This is the popular hashtags list, but with the new playlist labels
playlists <- unique(videos_populares$playlist)
# For each playlist, collapse its video ids into a comma-separated string,
# skipping ids that came through as the string "NA".
e <- lapply(playlists, function(x) paste(videos_populares[videos_populares$playlist==x,]$video_id))
l <- tbl_df(cbind(playlists, unlist(lapply(e, function(x) paste(x[which(x!="NA")], collapse=",")))))
colnames(l) <- c("hashtag","ids")
# Here the YouTube playlists are generated from the url structure, using the video ids.
# Based on this solution: https://webapps.stackexchange.com/questions/72787/how-to-create-a-youtube-playlist-from-a-list-of-links
l <- l %>% mutate(url=paste0("http://www.youtube.com/watch_videos?video_ids=", ids))
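# The resulting urls look like this (illustrative ids); per the solution
# linked above, opening one makes YouTube build a temporary anonymous playlist:
# http://www.youtube.com/watch_videos?video_ids=dQw4w9WgXcQ,9bZkp7q19f0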
# Write out a csv file. It was then converted into an HTML table: http://www.convertcsv.com/csv-to-html.htm
write.csv(l, "playlists2.csv")