Created
October 29, 2015 19:26
-
-
Save sauljackman/fad0f19415a47a8bac03 to your computer and use it in GitHub Desktop.
Use this code to turn a list of SQL queries into a visual of how your database is used.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Install packages if you don't already have them | |
install.packages(c("stringr", "tm", "igraph"), dependencies = TRUE) | |
## Load the packages | |
library(stringr) | |
library(tm) | |
library(igraph) | |
## Read in the data | |
queries <- read.csv("~/Downloads/queries.csv") | |
tables <- read.csv("~/Downloads/tables.csv") | |
# See what tables look like | |
head(queries,1) | |
head(tables) | |
## Create an empty data frame, with rows equal to number of queries being analyzed | |
x <- data.frame(rep(0,length(queries$query))) | |
## Populate data frame | |
# In the data frame x, each row is a query and each column is the name of a table from your database; each cell is an indicator for whether a particular table name appears in a particular query | |
for (i in 1:length(tables$tables)) { | |
x[,i] <- str_extract(queries$query, as.character(tables$tables[i])) | |
} | |
## Name columns and replace NAs | |
colnames(x) <- tables$tables | |
x[is.na(x)] <- 0 | |
##Concatenate each row into a single string | |
x_args <- c(x, sep = " ") | |
x$list <- do.call(paste, x_args) | |
## Clean out zeroes and whitespace | |
x$list <- str_trim(gsub(0, "", x$list)) | |
x$list <- gsub("[ ]+", " ", x$list) | |
## Check out data frame | |
x$list[520:524] | |
## Convert x$list into a corpus of documents | |
corpus <- Corpus(VectorSource(x$list)) | |
## Create term-document matrix | |
tdm <- TermDocumentMatrix(corpus) | |
inspect(tdm) | |
termDocMatrix <- as.matrix(tdm) | |
## Create term-term matrix | |
termMatrix <- termDocMatrix %*% t(termDocMatrix) | |
## Create graph-adjacency matrix | |
g <- graph.adjacency(termMatrix, weighted=T, mode = "undirected") | |
## Remove loops | |
g <- simplify(g) | |
## Set labels and degrees of vertices | |
V(g)$label <- V(g)$name | |
V(g)$degree <- degree(g) | |
## Plot a Graph | |
plot(g, layout=layout_in_circle(g)) | |
## Plot a prettier graph | |
V(g)$label.cex <- 3 * (0.06125 * V(g)$degree / max(V(g)$degree) + .2) | |
V(g)$label.color <- rgb(0, 0, .2, .49 * V(g)$degree / max(V(g)$degree) + .5) | |
V(g)$frame.color <- rgb(0, 0, .2, .39 * V(g)$degree / max(V(g)$degree) + .6) | |
egam <- (log(E(g)$weight)+.4) / max(log(E(g)$weight)+.4) | |
#E(g)$color <- rgb(0, .3, .6, egam) | |
E(g)$color <- rgb((colorRamp(c("blue", "yellow", "red"))(E(g)$weight/max(E(g)$weight)))/255, alpha=egam) | |
E(g)$width <- egam | |
plot(g, layout=layout_in_circle(g), vertex.color = rgb((colorRamp(c("blue", "yellow", "red"))(degree(g)/max(degree(g))))/255), vertex.size = ((V(g)$degree)*2/3)+2, edge.width = 6 * E(g)$weight / max(E(g)$weight)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment