Skip to content

Instantly share code, notes, and snippets.

@mkearney
Last active February 22, 2019 09:16
Show Gist options
  • Save mkearney/be9e7a7f69adb0768b720cc64d9b35f6 to your computer and use it in GitHub Desktop.
Save mkearney/be9e7a7f69adb0768b720cc64d9b35f6 to your computer and use it in GitHub Desktop.
Get a tidy data frame of information about all of your GitHub stars (the repos you've starred)
## make sure {remotes} is available for the GitHub install below
if (!requireNamespace("remotes", quietly = TRUE)) {
  utils::install.packages("remotes")
}
## {tfse} is installed from GitHub and then attached
remotes::install_github(repo = "mkearney/tfse")
library("tfse")
## remaining packages the script relies on; install any that are missing
pkgs <- c("rvest", "tbltools", "dplyr")
install_if(pkgs)
## Scrape one page of a user's GitHub stars into a data set.
##
## @param url URL of a stars page; handed to xml2::read_html().
## @return The object built by tfse::data_set() from the per-repo vectors
##   user, repo, lang, description, stars, forks and url. The link(s) to the
##   following page are attached as the "next_link" attribute (zero-length
##   when no such link is found in the page source).
parse_stars_page <- function(url) {
  page <- xml2::read_html(url)
  ## one node per starred repo
  repo_nodes <- rvest::html_nodes(
    page, ".col-12.d-block.width-full.py-4.border-bottom"
  )

  ## trimmed text of the first match of `css` inside every repo node
  node_text <- function(css) {
    rvest::html_text(rvest::html_node(repo_nodes, css), trim = TRUE)
  }
  ## same text parsed as an integer count once the commas are dropped;
  ## anything that is not a plain number quietly becomes NA
  node_count <- function(css) {
    suppressWarnings(as.integer(gsub("\\,", "", node_text(css))))
  }

  ## "user/repo" slug taken from the heading link
  href <- rvest::html_attr(rvest::html_node(repo_nodes, "h3 a"), "href")
  slug <- sub("^/", "", href)
  gh_url <- paste0("https://github.com/", slug)
  user <- sub("/.*", "", slug)
  repo <- tfse::regmatches_first(slug, "(?<=/).*")

  description <- node_text("div.py-1 p")
  lang <- node_text("span.mr-3")

  ## the star/fork links sit at positions 1 and 2 when there is no language
  ## entry and at positions 3 and 4 when there is one
  stars_no_lang <- node_count(".f6.text-gray.mt-2 a:nth-child(1)")
  forks_no_lang <- node_count(".f6.text-gray.mt-2 a:nth-child(2)")
  stars_lang <- node_count(".f6.text-gray.mt-2 a:nth-child(3)")
  forks_lang <- node_count(".f6.text-gray.mt-2 a:nth-child(4)")

  has_lang <- !is.na(lang)
  star_count <- ifelse(has_lang, stars_lang, stars_no_lang)
  fork_count <- ifelse(has_lang, forks_lang, forks_no_lang)
  ## a missing fork count is reported as zero
  fork_count[is.na(fork_count)] <- 0L

  d <- tfse::data_set(
    user = user,
    repo = repo,
    lang = lang,
    description = description,
    stars = star_count,
    forks = fork_count,
    url = gh_url
  )

  ## pagination: pull "?after" hrefs out of the raw page source and drop the
  ## ones that carry a "direction=" parameter
  page_html <- as.character(page)
  after_hrefs <- unlist(
    tfse::regmatches_(page_html, '(?<=href=")https://github\\.com/\\S+\\?after[^"]+(?=")')
  )
  attr(d, "next_link") <- grep(
    "direction=", after_hrefs,
    invert = TRUE, value = TRUE
  )

  d
}
## accessor for the "next_link" attribute that parse_stars_page() attaches
## to its result; NULL when the attribute is not set
next_link <- function(x) {
  attr(x, which = "next_link")
}
## preallocate a list with `n` empty (NULL) slots; defaults to an empty list
init_list <- function(n = 0) {
  vector(mode = "list", length = n)
}
## Convert GitHub's abbreviated count strings (e.g. "1.2k") to numbers.
##
## @param x Character vector of counts as shown on the page. Entries may
##   carry surrounding whitespace, comma thousands separators ("1,234") or a
##   trailing "k"/"K" meaning "times one thousand".
## @return Numeric vector the same length as `x`. Entries that still cannot
##   be parsed become NA (with the usual coercion warning). Empty input gives
##   `numeric(0)`.
detruncnum <- function(x) {
  ## always hand back a numeric so callers can safely do arithmetic on it
  if (length(x) == 0) {
    return(numeric(0))
  }
  cleaned <- gsub(",", "", trimws(x), fixed = TRUE)
  is_thousands <- grepl("k$", cleaned, ignore.case = TRUE)
  value <- as.numeric(sub("k$", "", cleaned, ignore.case = TRUE))
  value[is_thousands] <- value[is_thousands] * 1000
  value
}
## row-bind the elements of a list, ignoring zero-length entries (such as
## NULL placeholders left in a preallocated list)
do_call_rbind <- function(x) {
  keep <- lengths(x) > 0
  non_empty <- x[keep]
  do.call("rbind", non_empty, quote = TRUE)
}
## Collect every starred repo of a GitHub user into one data set.
##
## Pages are fetched one after another by following the "next_link" of each
## parsed page until no further link is offered, a page fails to parse, or a
## link repeats.
##
## @param username GitHub account name whose stars page is scraped.
## @return The row-bound result of all successfully parsed pages (via
##   do_call_rbind()); if binding fails, the raw list of per-page results.
##   NULL when the page reports zero stars.
get_stars_data <- function(username) {
  stars_url <- sprintf("https://github.com/%s?tab=stars", username)
  landing <- xml2::read_html(stars_url)
  counters <- rvest::html_text(
    rvest::html_nodes(landing, "span.Counter"),
    trim = TRUE
  )
  ## the second counter on the page is taken to be the star count; it is NA
  ## when the page has fewer than two counters
  num_stars <- as.numeric(detruncnum(counters)[2])
  ## 30 repos per page; only an estimate because the counter can be abbreviated
  n_pages <- ceiling(num_stars / 30)
  ## preallocate from the estimate when there is one
  stars_data <- init_list(if (is.na(n_pages)) 0 else n_pages)
  tfse::print_start("Looping through ", n_pages, " pages (about ", num_stars, " stars) of repos...")

  ## nothing to scrape when the counter says there are no stars
  if (!is.na(n_pages) && n_pages == 0) {
    stars_url <- character(0)
  }

  ## Follow the pagination links rather than trusting the page estimate, so
  ## trailing pages are not dropped when the counter was rounded down.
  ## `visited` guards against a link cycle.
  visited <- character(0)
  page_no <- 0L
  while (length(stars_url) > 0 && !any(stars_url %in% visited)) {
    visited <- c(visited, stars_url)
    page_no <- page_no + 1L
    page <- tryCatch(
      parse_stars_page(stars_url),
      error = function(e) {
        ## best effort: keep what was collected, but say why we stopped
        warning(
          "Stopped at page ", page_no, " (",
          paste(stars_url, collapse = ", "), "): ",
          conditionMessage(e),
          call. = FALSE
        )
        NULL
      }
    )
    if (is.null(page)) break
    ## only store real results: assigning NULL via [[<- would delete the slot
    stars_data[[page_no]] <- page
    tfse::print_complete("Page ", page_no, "/", n_pages, " complete!")
    ## pagination: continue from the link found on the page just parsed
    stars_url <- next_link(page)
  }

  ## return stars data (as one data set if the pages can be bound)
  tryCatch(
    do_call_rbind(stars_data),
    error = function(e) stars_data
  )
}
##----------------------------------------------------------------------------##
## ENTER YOUR USERNAME HERE ##
##----------------------------------------------------------------------------##
## Example usage: everything below runs at top level when the file is sourced.
## Replace my GitHub username with yours to get your starred repo data.
## This call scrapes github.com, so it needs network access.
stars_data <- get_stars_data("mkearney")
## NOTE(review): the pipelines below need `%>%` on the search path; the only
## package attached in this file is {tfse}, so presumably it re-exports the
## pipe -- confirm, or attach {magrittr}/{dplyr} explicitly.
## number of starred repos by user (print called with n = 20)
stars_data %>%
tbltools::tabsort(user) %>%
print(n = 20)
## number of starred repos by lang (print called with n = 20)
stars_data %>%
tbltools::tabsort(lang) %>%
print(n = 20)
## number of starred repos by user and lang
stars_data %>%
tbltools::tabsort(lang, user)
## most common repo names
stars_data %>%
tbltools::tabsort(repo)
## average number of stars and forks by user, together with the number of
## starred repos per user (n); only users with more than two starred repos
## are kept
## NOTE(review): arrange_rows(stars, forks) presumably orders the summary by
## those two columns -- confirm the sort direction against {tbltools}.
stars_data %>%
dplyr::group_by(user) %>%
dplyr::summarise(stars = mean(stars), forks = mean(forks), n = dplyr::n()) %>%
tbltools::arrange_rows(stars, forks) %>%
dplyr::filter(n > 2) %>%
print(n = 20)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment