## Last active February 22, 2019 09:16
## Get a tidy data frame of information about all of your GitHub stars (the repos you've starred)
## ---- setup: package dependencies ----
## install {remotes} pkg
## NOTE(review): in this copy the `if` below has no body and no closing "}" --
## the install call it guarded appears to have been lost. Restore it before
## running; as written the brace swallows everything that follows.
if (!requireNamespace("remotes", quietly = TRUE)) {
## install {tfse} from github
## NOTE(review): no statement follows this comment -- presumably an install
## call was dropped; confirm against the original script.
## load {tfse}
## NOTE(review): no library() call is visible. Later code calls tfse:: with an
## explicit namespace but also uses `%>%`, which nothing visible here attaches
## -- confirm what was loaded at this point.
## packages to install
pkgs <- c("rvest", "tbltools", "dplyr")
## install if not already
## NOTE(review): `pkgs` is not referenced anywhere else in the visible code;
## the install step that presumably consumed it is missing.
## function to convert html page into stars data frame
##
## Argument:
##   url - passed straight to xml2::read_html().
## Builds per-repo vectors (user, repo, lang, description, stars, forks, url),
## hands them to tfse::data_set(), and stores the next-page link(s) found in
## the page as the "next_link" attribute of `d`.
##
## NOTE(review): this copy of the function is damaged -- several CSS selectors
## and string literals are empty, two ifelse() conditions are missing, and the
## closing ")" of data_set(), the return value and the closing "}" are absent.
## Each spot is flagged below; restore from the original before running.
parse_stars_page <- function(url) {
## read as xml
s <- xml2::read_html(url)
## each strs node
## NOTE(review): the selector is an empty string here -- presumably the CSS
## selector for one starred-repo entry was stripped.
strs <- rvest::html_nodes(s, "")
## user/repo
## (href of the first "h3 a" inside each node, with the leading "/" removed)
repo <- rvest::html_node(strs, "h3 a") %>%
rvest::html_attr("href") %>%
sub("^/", "", .)
## store full gh repo URL
## NOTE(review): the prefix is "" so gh_url currently equals `repo`;
## presumably a base URL was stripped from this literal.
gh_url <- paste0("", repo)
## user/account name
## (drops everything from the first "/" onward)
user <- sub("/.*", "", repo)
## simplify to name of repo
## (tfse helper; presumably returns the first match of the pattern, i.e. the
## text after the first "/" -- confirm against tfse docs)
repo <- tfse::regmatches_first(repo, "(?<=/).*")
## description of repo
## NOTE(review): the selector starts with a space -- looks like a leading
## class/element name was stripped.
description <- rvest::html_node(strs, " p") %>%
rvest::html_text(trim = TRUE)
## repo language
## NOTE(review): empty selector -- presumably stripped.
lang <- rvest::html_node(strs, "") %>%
rvest::html_text(trim = TRUE)
## The four pipelines below share one shape: take the n-th child <a> of each
## node, read its trimmed text, remove commas, and coerce to integer with
## coercion warnings suppressed (so non-numeric text becomes NA).
## NOTE(review): all four selectors start with a space -- the parent part of
## each selector appears to be missing.
## for repos that don't have langs
wout_lang_stars <- strs %>%
rvest::html_node(" a:nth-child(1)") %>%
rvest::html_text(trim = TRUE) %>%
gsub("\\,", "", .) %>%
(function(.) suppressWarnings(as.integer(.)))
wout_lang_forks <- strs %>%
rvest::html_node(" a:nth-child(2)") %>%
rvest::html_text(trim = TRUE) %>%
gsub("\\,", "", .) %>%
(function(.) suppressWarnings(as.integer(.)))
## for repos that do have langs
with_lang_stars <- strs %>%
rvest::html_node(" a:nth-child(3)") %>%
rvest::html_text(trim = TRUE) %>%
gsub("\\,", "", .) %>%
(function(.) suppressWarnings(as.integer(.)))
with_lang_forks <- strs %>%
rvest::html_node(" a:nth-child(4)") %>%
rvest::html_text(trim = TRUE) %>%
gsub("\\,", "", .) %>%
(function(.) suppressWarnings(as.integer(.)))
## star count and fork count
## NOTE(review): the ifelse() tests are missing their operand (`!` is followed
## directly by a comma), so these two lines do not parse. Presumably the test
## was "this repo has a language" -- confirm and restore.
star_count <- ifelse(!, with_lang_stars, wout_lang_stars)
fork_count <- ifelse(!, with_lang_forks, wout_lang_forks)
## NOTE(review): with an empty index this assigns 0L to EVERY element of
## fork_count; presumably the index (e.g. a missing-value mask) was stripped.
fork_count[] <- 0L
## store as data set
## NOTE(review): the closing ")" of this call is missing in this copy.
d <- tfse::data_set(
user = user,
repo = repo,
lang = lang,
description = description,
stars = star_count,
forks = fork_count,
url = gh_url
## parse next link
## (serialises the page to text, pulls out href values matching a github.com
## URL containing "?after", then drops any that contain "direction=")
next_link <- s %>%
as.character() %>%
tfse::regmatches_('(?<=href=")https://github\\.com/\\S+\\?after[^"]+(?=")') %>%
unlist() %>%
grep("direction=", ., invert = TRUE, value = TRUE)
## store as attribute
attr(d, "next_link") <- next_link
## return data
## NOTE(review): no return expression and no closing "}" follow this comment
## in this copy -- presumably `d` is returned here.
## function to return the "next_link" attribute of x (NULL when x has none)
## exact = TRUE because attr() partially matches attribute names by default,
## which could silently return a different attribute whose name merely starts
## with "next_link"
next_link <- function(x) attr(x, "next_link", exact = TRUE)
## build an empty list of length n (every element NULL); n defaults to 0,
## which gives list()
init_list <- function(n = 0) {
  vector(mode = "list", length = n)
}
## function to convert GH count format to numeric
##
## x: character vector of counts as displayed on the page, e.g. "87", "1,234"
##    or "1.2k".
## Returns a numeric vector the same length as x; values carrying a trailing
## "k" are multiplied by 1000. Zero-length input is returned unchanged. Text
## that is not a number becomes NA (with the usual coercion warning).
detruncnum <- function(x) {
  if (length(x) == 0) return(x)
  ## remember which entries were abbreviated before the suffix is removed
  k <- grepl("k$", x)
  ## drop thousands separators and the "k" suffix, then coerce
  x <- as.numeric(sub("k$", "", gsub(",", "", x, fixed = TRUE)))
  x[k] <- x[k] * 1000
  x
}
## function to bind rows
##
## x: a list of data frames; zero-length elements (e.g. NULL placeholders left
##    by pages that were never fetched) are dropped before binding.
## Returns the result of rbind() over the remaining elements (NULL when none
## remain).
do_call_rbind <- function(x) {
  ## quote = TRUE passes the data frames to rbind() as-is rather than
  ## evaluating them as arguments
  do.call("rbind", x[lengths(x) > 0], quote = TRUE)
}
## big function to get all stars data
##
## Argument:
##   username - interpolated into the stars URL via sprintf().
## Reads the first page to estimate how many pages of stars exist (30 per
## page), pre-allocates a list of that length, then walks the pages, following
## the "next_link" attribute of each parsed page and stopping on the first
## failed page or when no further link is found.
##
## NOTE(review): this copy of the function is damaged -- the URL format string
## is empty, one pipeline is left dangling, the tryCatch() has lost its
## expression and closing ")", and the closing braces of the loop and of the
## function are absent. Each spot is flagged below.
get_stars_data <- function(username) {
## get the number of starred repos
## NOTE(review): the format string is "" so `username` is never interpolated;
## presumably the stars-page URL template was stripped.
stars_url <- sprintf("", username)
s <- xml2::read_html(stars_url)
num_stars <- rvest::html_nodes(s, "span.Counter") %>%
rvest::html_text(trim = TRUE) %>%
detruncnum() %>%
## NOTE(review): the pipe above has no right-hand side in this copy, so it
## runs into the `n_pages` assignment below; a final step (presumably one that
## reduces the counters to a single number) appears to be missing.
## divide by 30 to get estimated number of pages
n_pages <- ceiling(num_stars / 30)
## initialize output vector
stars_data <- init_list(n_pages)
tfse::print_start("Looping through ", n_pages, " pages (about ", num_stars, " stars) of repos...")
## for loop through the pages, breaking on error
for (i in seq_along(stars_data)) {
## fetch page and convert to tbl_df
## NOTE(review): the expression to evaluate and the closing ")" of tryCatch()
## are missing; presumably it parsed `stars_url` with parse_stars_page().
## The handler turns any error into NULL, which the next line treats as
## "stop".
stars_data[[i]] <- tryCatch(
error = function(e) NULL
## break on error
if (is.null(stars_data[[i]])) break
tfse::print_complete("Page ", i, "/", n_pages, " complete!")
## update stars_url (pagination)
stars_url <- next_link(stars_data[[i]])
## if there's not a next_link then break
if (length(stars_url) == 0) break
## NOTE(review): the closing "}" of the for loop is missing here.
## return stars data (as data frame if possible)
## NOTE(review): the line below is an orphaned handler -- the tryCatch( call
## it belonged to (presumably one that row-binds `stars_data` and falls back
## to the raw list on error) and the function's closing "}" are missing.
error = function(e) stars_data
## replace my Github username with yours to get your starred repo data
stars_data <- get_stars_data("mkearney")

## number of starred repos by user
stars_data %>%
  tbltools::tabsort(user) %>%
  print(n = 20)

## number of starred repos by lang
stars_data %>%
  tbltools::tabsort(lang) %>%
  print(n = 20)

## number of starred repos by user and lang
stars_data %>%
  tbltools::tabsort(lang, user)

## most common repo names
## (this pipeline previously ended on a bare `%>%` and ran into the next
## statement; completed with a tabulation of `repo`, mirroring the summaries
## above)
stars_data %>%
  tbltools::tabsort(repo)

## average number of stars and forks by user
## (na.rm = TRUE so one count that failed to parse does not turn a user's
## mean into NA)
stars_data %>%
  dplyr::group_by(user) %>%
  dplyr::summarise(
    stars = mean(stars, na.rm = TRUE),
    forks = mean(forks, na.rm = TRUE),
    n = dplyr::n()
  ) %>%
  tbltools::arrange_rows(stars, forks) %>%
  dplyr::filter(n > 2) %>%
  print(n = 20)
