Skip to content

Instantly share code, notes, and snippets.

@sauljackman
Created October 29, 2015 19:26
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save sauljackman/fad0f19415a47a8bac03 to your computer and use it in GitHub Desktop.
Save sauljackman/fad0f19415a47a8bac03 to your computer and use it in GitHub Desktop.
Use this code to turn a list of SQL queries into a visual of how your database is used.
## Install packages if you don't already have them
install.packages(c("stringr", "tm", "igraph"), dependencies = TRUE)
## Load the packages
library(stringr)
library(tm)
library(igraph)
## Read in the data
queries <- read.csv("~/Downloads/queries.csv")
tables <- read.csv("~/Downloads/tables.csv")
# See what tables look like
head(queries,1)
head(tables)
## Create an empty data frame, with rows equal to number of queries being analyzed
x <- data.frame(rep(0,length(queries$query)))
## Populate data frame
# In the data frame x, each row is a query and each column is the name of a table from your database; each cell is an indicator for whether a particular table name appears in a particular query
for (i in 1:length(tables$tables)) {
x[,i] <- str_extract(queries$query, as.character(tables$tables[i]))
}
## Name columns and replace NAs
colnames(x) <- tables$tables
x[is.na(x)] <- 0
##Concatenate each row into a single string
x_args <- c(x, sep = " ")
x$list <- do.call(paste, x_args)
## Clean out zeroes and whitespace
x$list <- str_trim(gsub(0, "", x$list))
x$list <- gsub("[ ]+", " ", x$list)
## Check out data frame
x$list[520:524]
## Convert x$list into a corpus of documents
corpus <- Corpus(VectorSource(x$list))
## Create term-document matrix
tdm <- TermDocumentMatrix(corpus)
inspect(tdm)
termDocMatrix <- as.matrix(tdm)
## Create term-term matrix
termMatrix <- termDocMatrix %*% t(termDocMatrix)
## Create graph-adjacency matrix
g <- graph.adjacency(termMatrix, weighted=T, mode = "undirected")
## Remove loops
g <- simplify(g)
## Set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
## Plot a Graph
plot(g, layout=layout_in_circle(g))
## Plot a prettier graph
V(g)$label.cex <- 3 * (0.06125 * V(g)$degree / max(V(g)$degree) + .2)
V(g)$label.color <- rgb(0, 0, .2, .49 * V(g)$degree / max(V(g)$degree) + .5)
V(g)$frame.color <- rgb(0, 0, .2, .39 * V(g)$degree / max(V(g)$degree) + .6)
egam <- (log(E(g)$weight)+.4) / max(log(E(g)$weight)+.4)
#E(g)$color <- rgb(0, .3, .6, egam)
E(g)$color <- rgb((colorRamp(c("blue", "yellow", "red"))(E(g)$weight/max(E(g)$weight)))/255, alpha=egam)
E(g)$width <- egam
plot(g, layout=layout_in_circle(g), vertex.color = rgb((colorRamp(c("blue", "yellow", "red"))(degree(g)/max(degree(g))))/255), vertex.size = ((V(g)$degree)*2/3)+2, edge.width = 6 * E(g)$weight / max(E(g)$weight))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment