# file interdependence query
# For this script to run, I have to use my local build of rgithub: open the
# rgithub RStudio project and "Build and Reload" the package to get the modified
# get.commit() function, which allows setting the "git" argument to NULL.
# library(devtools)
# library(roxygen2)
setwd("/Users/Aron/Dropbox/Thesis/3-Variance/Journal/Computational Analysis/compute/")
library(github) # the locally built rgithub (see note above)
# library(rgithub, lib.loc = "/Users/Aron/github/local/")
library(httpuv)
library(jsonlite)
library(dplyr)
library(plyr)
library(stringr)
library(igraph)
# 0. Set up the query (substitute a real OAuth client id and secret)
ctx = interactive.login("client_id", "client_secret")
# This function parses the Link response header so that pagination comes out right
digest_header_links <- function(x) {
  y <- x$headers$link
  if (is.null(y)) {
    # No links found in the header: return a placeholder with page = 0
    m <- matrix(0, ncol = 3, nrow = 4)
    links <- as.data.frame(m)
    names(links) <- c("rel", "per_page", "page")
    return(links)
  }
  y %>%
    str_split(", ") %>% unlist %>% # split into e.g. next, last, first, prev
    str_split_fixed("; ", 2) %>%   # separate the URL from the relation
    plyr::alply(2) %>%             # workaround: make into a list
    as.data.frame() %>%            # convert to data.frame, no factors!
    setNames(c("URL", "rel")) %>%  # sane names
    dplyr::mutate_(rel = ~ str_match(rel, "next|last|first|prev"),
                   per_page = ~ str_match(URL, "per_page=([0-9]+)") %>%
                     `[`( , 2) %>% as.integer,
                   page = ~ str_match(URL, "&page=([0-9]+)") %>%
                     `[`( , 2) %>% as.integer,
                   URL = ~ str_replace_all(URL, "<|>", ""))
}
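# Quick sanity check on a synthetic Link header (the URLs below are made up for
# illustration; real values come back from the GitHub API):
# fake_header <- paste0(
#   '<https://api.github.com/repos/o/r/pulls/1/commits?per_page=100&page=2>; rel="next", ',
#   '<https://api.github.com/repos/o/r/pulls/1/commits?per_page=100&page=3>; rel="last"')
# digest_header_links(list(headers = list(link = fake_header)))
# # should give a data frame with URL, rel ("next"/"last"), per_page (100), page (2/3)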
modularization_query <- function(owner, repo){
  # This function pulls down all commits of one pull request, following
  # pagination and backing off when the rate limit is nearly exhausted.
  pull <- function(i){
    commits <- get.pull.request.commits(owner = owner, repo = repo, id = i,
                                        ctx = get.github.context(), per_page = 100)
    links <- digest_header_links(commits)
    number_of_pages <- links[2,]$page
    if (number_of_pages != 0) {
      # Fetch the remaining pages and append their content to the first page
      try_default(for (n in 2:number_of_pages){
        if (as.integer(commits$headers$`x-ratelimit-remaining`) < 5)
          Sys.sleep(as.integer(commits$headers$`x-ratelimit-reset`) -
                      (as.POSIXct(Sys.time()) %>% as.integer()))
        next_page <- get.pull.request.commits(owner = owner, repo = repo, id = i,
                                              ctx = get.github.context(),
                                              per_page = 100, page = n)
        commits$content <- c(commits$content, next_page$content)
        commits$headers <- next_page$headers # keep the rate-limit counters current
      }, default = NULL)
    }
    commits
  }
  # The *_include.csv file lists the ids of the pull requests to query (column V1)
  list <- read.csv(paste0("/Users/Aron/dropbox/Thesis/3-Variance/Journal/Computational Analysis/compute/", repo, "_include.csv"), header = FALSE)
  pull_lists <- lapply(list$V1, pull)
  # Get the SHA of each PR's first commit (this ignores parent and tree SHAs)
  sha_list <- vector("list", length(pull_lists))
  for (i in 1:length(pull_lists)){
    try_default(sha_list[[i]] <- pull_lists[[i]]$content[[1]]$sha, default = NULL)
  }
  # this would remove all the NULL values
  # sha_list_clean <- sha_list[ ! sapply(sha_list, is.null) ]
  # Fetch the full commit (including its changed files) for each SHA; this uses
  # the locally modified get.commit() that accepts git = NULL (see header note)
  get_commits <- function(sha){
    get.commit(git = NULL, ctx = get.github.context(), owner = owner, repo = repo, sha = sha)
  }
  commit_lists0 <- lapply(sha_list, get_commits)
  file_list <- vector("list", length(commit_lists0))
  for (i in 1:length(file_list)){
    try_default(file_list[[i]] <- commit_lists0[[i]]$content$files, default = NULL)
  }
  # Then pull out all the filenames from the nested file records using grepl
  grep_filenames <- function(input){
    unlist(input, use.names = FALSE)[grepl("filename", names(unlist(input)))]
  }
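  # For instance (hypothetical input mimicking the API's file records):
  # grep_filenames(list(list(filename = "README.md", additions = 1),
  #                     list(filename = "R/utils.R", additions = 3)))
  # should return c("README.md", "R/utils.R")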
  filename_lists <- lapply(file_list, grep_filenames)
  # Drop PRs that touched any file with "test" in its path
  filename_lists <- filename_lists[!grepl("test", filename_lists)]
  # 1. Iterate across the list of PR_ids and build a combination edgelist per PR:
  # every pair of files changed together in a PR becomes an edge
  combine_edge_lists <- function(filename_lists){
    try_default(t(combn(filename_lists, 2)), default = NULL)
  }
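  # e.g. (hypothetical filenames) t(combn(c("a.R", "b.R", "c.R"), 2)) gives the
  # edges (a.R, b.R), (a.R, c.R), (b.R, c.R); single-file PRs make combn() error,
  # so try_default() turns them into NULL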
  file_lists_merged <- lapply(filename_lists, combine_edge_lists)
  # 2. Merge all combination edgelists (rbind drops the NULL entries)
  edge_list_final <- do.call(rbind, file_lists_merged)
  # 3. Build the file-interdependence graph; node-level sna_metrics are
  # calculated in the post-processing step below
  g <- graph.edgelist(edge_list_final)
  (g)
}
django_graph18 <- modularization_query("django", "django")
rubinius_graph18 <- modularization_query("rubinius", "rubinius")
bootstrap_graph18 <- modularization_query("twbs", "bootstrap") # I may need to redo this as I may have hit the rate limit
rails_graph18 <- modularization_query("rails", "rails")
# POST PROCESSING
# NB: this block uses objects created inside modularization_query() (g,
# file_lists_merged, list, repo), so run it with those in scope, e.g. by
# stepping through the function body interactively for one repository.
degree_list <- degree(g)
names <- names(degree_list)
names(degree_list) <- NULL
output <- as.data.frame(cbind(names, degree_list), stringsAsFactors = FALSE)
output$degree_list <- degree_list # restore numeric degrees (cbind coerced them to character)
# 4. Calculate the average sna_metric (here: degree) across the files of each PR
mean_f <- function(entry){
  mean(output$degree_list[output$names %in% entry])
}
final_output <- sapply(file_lists_merged, mean_f)
names(final_output) <- as.character(list$V1)
final_output[is.nan(final_output)] <- 0 # PRs with no matching files score 0
hist(final_output)
write.csv(final_output, file = paste0(repo, "_modularization.csv"))
(final_output)
hist(degree(rails_graph18), breaks = 50, ylim = c(0, 500), xlim = c(0, 750))
hist(degree(rubinius_graph18), breaks = 50, ylim = c(0, 500), xlim = c(0, 750))
hist(degree(django_graph18), breaks = 50, ylim = c(0, 500), xlim = c(0, 750))
hist(degree(bootstrap_graph18), breaks = 50, ylim = c(0, 500), xlim = c(0, 750))