file interdependence query
# For this script to run I have to run my local version of rgithub. Open the rgithub RStudio project and build and reload the package to use the modified get.commit() function that allows for setting the "git" argument to NULL.
# library(github)
# library(devtools)
# library(roxygen2)
setwd("/Users/Aron/Dropbox/Thesis/3-Variance/Journal/Computational Analysis/compute/")
# library(github)
# library(rgithub, lib.loc = "/Users/Aron/github/local/")
library(httpuv)
library(jsonlite)
library(dplyr)
library(plyr)
library(stringr)
library(igraph)
# 0. Set up the query
# "client_id"/"client_secret" are placeholders for the credentials of a registered GitHub OAuth application.
ctx = interactive.login("client_id", "client_secret")
# This function makes sure I get the pagination right
digest_header_links <- function(x) {
  y <- x$headers$link
  if (is.null(y)) {
    # message("No links found in header.")
    m <- matrix(0, ncol = 3, nrow = 4)
    links <- as.data.frame(m)
    names(links) <- c("rel", "per_page", "page")
    return(links)
  }
  y %>%
    str_split(", ") %>% unlist %>%   # split into e.g. next, last, first, prev
    str_split_fixed("; ", 2) %>%     # separate URL from the relation
    plyr::alply(2) %>%               # workaround: make into a list
    as.data.frame() %>%              # convert to data.frame, no factors!
    setNames(c("URL", "rel")) %>%    # sane names
    dplyr::mutate_(rel = ~ str_match(rel, "next|last|first|prev"),
                   per_page = ~ str_match(URL, "per_page=([0-9]+)") %>%
                     `[`( , 2) %>% as.integer,
                   page = ~ str_match(URL, "&page=([0-9]+)") %>%
                     `[`( , 2) %>% as.integer,
                   URL = ~ str_replace_all(URL, "<|>", ""))
}
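# A small illustration of digest_header_links() on a made-up response object;
# the Link header mimics what GitHub returns for paginated endpoints, so the
# parsed data frame should have columns URL, rel, per_page and page, with the
# "last" row giving the total number of pages.
example_response <- list(headers = list(link = paste0(
  '<https://api.github.com/repos/django/django/pulls/1/commits?per_page=100&page=2>; rel="next", ',
  '<https://api.github.com/repos/django/django/pulls/1/commits?per_page=100&page=5>; rel="last"')))
digest_header_links(example_response)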
modularization_query <- function(owner, repo){
  # This function pulls down data on all the pull requests.
  pull <- function(i){
    commits <- get.pull.request.commits(owner = owner, repo = repo, id = i, ctx = get.github.context(), per_page = 100)
    links <- digest_header_links(commits)
    number_of_pages <- links[2, ]$page
    if (number_of_pages != 0)
      try_default(for (n in 1:number_of_pages){
        # back off when the remaining rate limit is nearly exhausted; otherwise fetch the next page
        if (as.integer(commits$headers$`x-ratelimit-remaining`) < 5)
          Sys.sleep(as.integer(commits$headers$`x-ratelimit-reset`) - as.POSIXct(Sys.time()) %>% as.integer())
        else
          get.pull.request.commits(owner = owner, repo = repo, id = i, ctx = get.github.context(), per_page = 100, page = n)
      }, default = NULL)
    # return the first page in every case; only the first commit SHA per pull request is used downstream
    return(commits)
  }
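  # A commented-out sketch of the rate-limit guard inside pull(), with made-up
  # header values: x-ratelimit-reset is a Unix timestamp, so the wait is simply
  # the reset time minus "now", floored at zero.
  # reset_epoch <- 1427300000  # hypothetical x-ratelimit-reset value
  # remaining <- 3             # hypothetical x-ratelimit-remaining value
  # if (remaining < 5) Sys.sleep(max(reset_epoch - as.integer(as.POSIXct(Sys.time())), 0))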
  list <- read.csv(paste0("/Users/Aron/dropbox/Thesis/3-Variance/Journal/Computational Analysis/compute/", repo, "_include.csv"), header = FALSE)
  pull_lists <- lapply(list$V1, pull)
  # This loop collects the correct commit SHAs (ignoring parent and tree SHAs)
  sha_list <- vector("list", length(pull_lists))
  for (i in 1:length(pull_lists)){
    try_default(sha_list[[i]] <- pull_lists[[i]]$content[[1]]$sha, default = NULL)
  }
  # this removes all the NULL values
  # sha_list_clean <- sha_list[ ! sapply(sha_list, is.null) ]
  get_commits <- function(sha){
    get.commit(git = NULL, ctx = get.github.context(), owner = owner, repo = repo, sha = sha)
  }
  commit_lists0 <- lapply(sha_list, get_commits)
  file_list <- vector("list", length(commit_lists0))
  for (i in 1:length(file_list)){
    try_default(file_list[[i]] <- commit_lists0[[i]]$content$files, default = NULL)
  }
  # Then find all the filenames using grepl
  grep_filenames <- function(input){
    unlist(input, use.names = FALSE)[grepl("filename", names(unlist(input)))]
  }
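  # Commented-out toy input for grep_filenames(): each element of file_list is
  # a nested list of changed files whose "filename" fields survive unlist(),
  # so matching on the names extracts just the file paths. The entries below
  # are invented for illustration.
  # toy_files <- list(list(filename = "django/db/models/base.py", additions = 10),
  #                   list(filename = "tests/model_tests.py", additions = 2))
  # grep_filenames(toy_files)  # "django/db/models/base.py" "tests/model_tests.py"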
  filename_lists <- lapply(file_list, grep_filenames)
  # drop pull requests where any changed filename contains "test"
  filename_lists <- filename_lists[!grepl("test", filename_lists)]
  # 1. Iterate across the list of PR_ids & create combination edgelists
  combine_edge_lists <- function(filename_lists){
    try_default(t(combn(filename_lists, 2)), default = NULL)
  }
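  # Commented-out illustration of combine_edge_lists(): combn(x, 2) enumerates
  # every unordered pair of files changed in one pull request, and t() turns
  # that into a two-column edge list (filenames invented).
  # combine_edge_lists(c("urls.py", "views.py", "models.py"))
  # #      [,1]       [,2]
  # # [1,] "urls.py"  "views.py"
  # # [2,] "urls.py"  "models.py"
  # # [3,] "views.py" "models.py"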
  file_lists_merged <- lapply(filename_lists, combine_edge_lists)
  # 2. Merge all combination edgelists
  edge_list_final <- do.call(rbind, file_lists_merged)
  # 3. Calculate sna_metrics for each node
  g <- graph.edgelist(edge_list_final)
  (g)
}
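# A small, self-contained illustration (invented filenames) of what
# modularization_query() returns: graph.edgelist() turns the stacked file-pair
# edge list into a directed igraph object, and degree() then counts how many
# co-change edges touch each file.
toy_edges <- rbind(c("models.py", "views.py"),
                   c("models.py", "tests.py"),
                   c("views.py", "urls.py"))
toy_graph <- graph.edgelist(toy_edges)
degree(toy_graph)  # models.py 2, views.py 2, tests.py 1, urls.py 1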
django_graph18 <- modularization_query("django", "django")
rubinius_graph18 <- modularization_query("rubinius", "rubinius")
bootstrap_graph18 <- modularization_query("twbs", "bootstrap") # I may need to redo this as I may have hit the rate limit
rails_graph18 <- modularization_query("rails", "rails")
# POST PROCESSING
# These steps use g, file_lists_merged, list, and repo, which exist inside
# modularization_query(); they are meant to be run with those objects still in
# the workspace (e.g. when stepping through the function for one repository).
degree_list <- degree(g)
names <- names(degree_list)
names(degree_list) <- NULL
output <- as.data.frame(cbind(names, degree_list), stringsAsFactors = FALSE)
output$degree_list <- degree_list
# 4. Calculate average sna_metric for each PR
mean_f <- function(entry){
  mean(output$degree_list[output$names %in% entry])
}
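# Toy illustration of the averaging step with invented degree scores: a pull
# request whose edge list mentions a.R and b.R gets the mean of those two
# files' degree centralities.
toy_degrees <- data.frame(names = c("a.R", "b.R", "c.R"),
                          degree_list = c(4, 2, 10),
                          stringsAsFactors = FALSE)
toy_pr_files <- c("a.R", "b.R")
mean(toy_degrees$degree_list[toy_degrees$names %in% toy_pr_files])  # (4 + 2) / 2 = 3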
final_output <- sapply(file_lists_merged, mean_f)
names(final_output) <- as.character(list$V1)
final_output[is.nan(final_output)] <- 0
hist(final_output)
write.csv(final_output, file = paste0(repo, "_modularization.csv"))
(final_output)
hist(degree(rails_graph18), breaks = 50, ylim = c(0, 500), xlim = c(0, 750))
hist(degree(rubinius_graph18), breaks = 50, ylim = c(0, 500), xlim = c(0, 750))
hist(degree(django_graph18), breaks = 50, ylim = c(0, 500), xlim = c(0, 750))
hist(degree(bootstrap_graph18), breaks = 50, ylim = c(0, 500), xlim = c(0, 750))