file interdependence query
# For this script to run I have to run my local version of rgithub. Open the rgithub RStudio project and build and reload the package to use the modified get.commit() function that allows for setting the "git" argument to NULL.
# library(github)
# library(devtools)
# library(roxygen2)
setwd("/Users/Aron/Dropbox/Thesis/3-Variance/Journal/Computational Analysis/compute/")
# library(github)
# library(rgithub, lib.loc = "/Users/Aron/github/local/")
library(httpuv)
library(jsonlite)
library(dplyr)
library(plyr)
library(stringr)
library(igraph)
# 0. Set up the query
# "client_id"/"client_secret" are placeholders for the credentials of a registered GitHub OAuth application.
ctx = interactive.login("client_id", "client_secret")
# This function makes sure I get the pagination right
digest_header_links <- function(x) {
  y <- x$headers$link
  if (is.null(y)) {
    # message("No links found in header.")
    m <- matrix(0, ncol = 3, nrow = 4)
    links <- as.data.frame(m)
    names(links) <- c("rel", "per_page", "page")
    return(links)
  }
  y %>%
    str_split(", ") %>% unlist %>%   # split into e.g. next, last, first, prev
    str_split_fixed("; ", 2) %>%     # separate URL from the relation
    plyr::alply(2) %>%               # workaround: make into a list
    as.data.frame() %>%              # convert to data.frame, no factors!
    setNames(c("URL", "rel")) %>%    # sane names
    dplyr::mutate_(rel = ~ str_match(rel, "next|last|first|prev"),
                   per_page = ~ str_match(URL, "per_page=([0-9]+)") %>%
                     `[`( , 2) %>% as.integer,
                   page = ~ str_match(URL, "&page=([0-9]+)") %>%
                     `[`( , 2) %>% as.integer,
                   URL = ~ str_replace_all(URL, "<|>", ""))
}
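# A small illustration of digest_header_links() on a made-up response object;
# the Link header mimics what GitHub returns for paginated endpoints, so the
# parsed data frame should have columns URL, rel, per_page and page, with the
# "last" row giving the total number of pages.
example_response <- list(headers = list(link = paste0(
  '<https://api.github.com/repos/django/django/pulls/1/commits?per_page=100&page=2>; rel="next", ',
  '<https://api.github.com/repos/django/django/pulls/1/commits?per_page=100&page=5>; rel="last"')))
digest_header_links(example_response)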
modularization_query <- function(owner, repo){
  # This function pulls down data on all the pull requests.
  pull <- function(i){
    commits <- get.pull.request.commits(owner = owner, repo = repo, id = i, ctx = get.github.context(), per_page = 100)
    links <- digest_header_links(commits)
    number_of_pages <- links[2, ]$page
    if (number_of_pages != 0)
      try_default(for (n in 1:number_of_pages){
        # back off when the remaining rate limit is nearly exhausted; otherwise fetch the next page
        if (as.integer(commits$headers$`x-ratelimit-remaining`) < 5)
          Sys.sleep(as.integer(commits$headers$`x-ratelimit-reset`) - as.POSIXct(Sys.time()) %>% as.integer())
        else
          get.pull.request.commits(owner = owner, repo = repo, id = i, ctx = get.github.context(), per_page = 100, page = n)
      }, default = NULL)
    # return the first page in every case; only the first commit SHA per pull request is used downstream
    return(commits)
  }
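  # A commented-out sketch of the rate-limit guard inside pull(), with made-up
  # header values: x-ratelimit-reset is a Unix timestamp, so the wait is simply
  # the reset time minus "now", floored at zero.
  # reset_epoch <- 1427300000  # hypothetical x-ratelimit-reset value
  # remaining <- 3             # hypothetical x-ratelimit-remaining value
  # if (remaining < 5) Sys.sleep(max(reset_epoch - as.integer(as.POSIXct(Sys.time())), 0))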
  list <- read.csv(paste0("/Users/Aron/dropbox/Thesis/3-Variance/Journal/Computational Analysis/compute/", repo, "_include.csv"), header = FALSE)
  pull_lists <- lapply(list$V1, pull)
  # This loop collects the correct commit SHAs (ignoring parent and tree SHAs)
  sha_list <- vector("list", length(pull_lists))
  for (i in 1:length(pull_lists)){
    try_default(sha_list[[i]] <- pull_lists[[i]]$content[[1]]$sha, default = NULL)
  }
  # this removes all the NULL values
  # sha_list_clean <- sha_list[ ! sapply(sha_list, is.null) ]
  get_commits <- function(sha){
    get.commit(git = NULL, ctx = get.github.context(), owner = owner, repo = repo, sha = sha)
  }
  commit_lists0 <- lapply(sha_list, get_commits)
  file_list <- vector("list", length(commit_lists0))
  for (i in 1:length(file_list)){
    try_default(file_list[[i]] <- commit_lists0[[i]]$content$files, default = NULL)
  }
  # Then find all the filenames using grepl
  grep_filenames <- function(input){
    unlist(input, use.names = FALSE)[grepl("filename", names(unlist(input)))]
  }
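  # Commented-out toy input for grep_filenames(): each element of file_list is
  # a nested list of changed files whose "filename" fields survive unlist(),
  # so matching on the names extracts just the file paths. The entries below
  # are invented for illustration.
  # toy_files <- list(list(filename = "django/db/models/base.py", additions = 10),
  #                   list(filename = "tests/model_tests.py", additions = 2))
  # grep_filenames(toy_files)  # "django/db/models/base.py" "tests/model_tests.py"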
  filename_lists <- lapply(file_list, grep_filenames)
  # drop pull requests where any changed filename contains "test"
  filename_lists <- filename_lists[!grepl("test", filename_lists)]
  # 1. Iterate across the list of PR_ids & create combination edgelists
  combine_edge_lists <- function(filename_lists){
    try_default(t(combn(filename_lists, 2)), default = NULL)
  }
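  # Commented-out illustration of combine_edge_lists(): combn(x, 2) enumerates
  # every unordered pair of files changed in one pull request, and t() turns
  # that into a two-column edge list (filenames invented).
  # combine_edge_lists(c("urls.py", "views.py", "models.py"))
  # #      [,1]       [,2]
  # # [1,] "urls.py"  "views.py"
  # # [2,] "urls.py"  "models.py"
  # # [3,] "views.py" "models.py"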
  file_lists_merged <- lapply(filename_lists, combine_edge_lists)
  # 2. Merge all combination edgelists
  edge_list_final <- do.call(rbind, file_lists_merged)
  # 3. Calculate sna_metrics for each node
  g <- graph.edgelist(edge_list_final)
  (g)
}
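# A small, self-contained illustration (invented filenames) of what
# modularization_query() returns: graph.edgelist() turns the stacked file-pair
# edge list into a directed igraph object, and degree() then counts how many
# co-change edges touch each file.
toy_edges <- rbind(c("models.py", "views.py"),
                   c("models.py", "tests.py"),
                   c("views.py", "urls.py"))
toy_graph <- graph.edgelist(toy_edges)
degree(toy_graph)  # models.py 2, views.py 2, tests.py 1, urls.py 1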
django_graph18 <- modularization_query("django", "django")
rubinius_graph18 <- modularization_query("rubinius", "rubinius")
bootstrap_graph18 <- modularization_query("twbs", "bootstrap") # I may need to redo this as I may have hit the rate limit
rails_graph18 <- modularization_query("rails", "rails")
# POST PROCESSING
# These steps use g, file_lists_merged, list, and repo, which exist inside
# modularization_query(); they are meant to be run with those objects still in
# the workspace (e.g. when stepping through the function for one repository).
degree_list <- degree(g)
names <- names(degree_list)
names(degree_list) <- NULL
output <- as.data.frame(cbind(names, degree_list), stringsAsFactors = FALSE)
output$degree_list <- degree_list
# 4. Calculate average sna_metric for each PR
mean_f <- function(entry){
  mean(output$degree_list[output$names %in% entry])
}
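# Toy illustration of the averaging step with invented degree scores: a pull
# request whose edge list mentions a.R and b.R gets the mean of those two
# files' degree centralities.
toy_degrees <- data.frame(names = c("a.R", "b.R", "c.R"),
                          degree_list = c(4, 2, 10),
                          stringsAsFactors = FALSE)
toy_pr_files <- c("a.R", "b.R")
mean(toy_degrees$degree_list[toy_degrees$names %in% toy_pr_files])  # (4 + 2) / 2 = 3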
final_output <- sapply(file_lists_merged, mean_f)
names(final_output) <- as.character(list$V1)
final_output[is.nan(final_output)] <- 0
hist(final_output)
write.csv(final_output, file = paste0(repo, "_modularization.csv"))
(final_output)
hist(degree(rails_graph18), breaks = 50, ylim = c(0, 500), xlim = c(0, 750))
hist(degree(rubinius_graph18), breaks = 50, ylim = c(0, 500), xlim = c(0, 750))
hist(degree(django_graph18), breaks = 50, ylim = c(0, 500), xlim = c(0, 750))
hist(degree(bootstrap_graph18), breaks = 50, ylim = c(0, 500), xlim = c(0, 750))