Skip to content

Instantly share code, notes, and snippets.

@voltek62
Created October 26, 2018 20:49
Show Gist options
  • Save voltek62/04ec0836f1922a9d6802aabd068f0298 to your computer and use it in GitHub Desktop.
Save voltek62/04ec0836f1922a9d6802aabd068f0298 to your computer and use it in GitHub Desktop.
Visualizing your website’s Internal Linking
# Auto-install: fetch whichever of the required packages are not yet present
# in the local library, so the library() calls further down cannot fail.
packages <- c("igraph", "dplyr", "ggplot2", "magrittr")
local({
  # Work out the missing set once and reuse it for both the check and the call.
  absent <- setdiff(packages, rownames(installed.packages()))
  if (length(absent) > 0) {
    install.packages(absent)
  }
})
# Enjoy learning ? https://dataseolabs.com
library(igraph)
library(dplyr)
library(ggplot2)
library(magrittr)
#' Linearly rescale numeric values into a target range
#'
#' @param x Numeric vector or matrix to rescale.
#' @param range Length-2 numeric target interval. When `range[1] > range[2]`
#'   the mapping is reversed (the largest input maps to the smallest output).
#' @param from.range Length-2 numeric source interval. When it contains `NA`
#'   (the default), the observed range of `x` is used, ignoring `NA`s.
#' @return An object shaped like `x` holding the rescaled values. Values that
#'   land outside `range` (possible when `from.range` is narrower than the
#'   data) are set to `NA`. When the source interval has zero width, every
#'   element is set to the midpoint of `range`.
map <- function(x, range = c(0, 1), from.range = NA) {
  if (any(is.na(from.range))) {
    from.range <- range(x, na.rm = TRUE)
  }
  span <- diff(from.range)

  ## All values are the same: there is nothing to scale, so return the middle
  ## of the target range. Plain vectors have no dim(), so they must not go
  ## through matrix() (ncol(x)/nrow(x) are NULL there and matrix() errors).
  if (span == 0) {
    midpoint <- mean(range)
    if (is.null(dim(x))) {
      out <- rep(midpoint, length(x))
      names(out) <- names(x)
      return(out)
    }
    return(
      matrix(midpoint, ncol = ncol(x), nrow = nrow(x), dimnames = dimnames(x))
    )
  }

  ## map to [0,1]
  x <- (x - from.range[1]) / span
  ## a descending target range flips the direction of the mapping
  if (range[1] > range[2]) {
    x <- 1 - x
  }
  ## map from [0,1] to [range]
  x <- x * abs(diff(range)) + min(range)
  ## anything outside the target interval is treated as missing
  x[x < min(range) | x > max(range)] <- NA
  x
}
# Import the Screaming Frog crawl: "Bulk Export" then "All Outlinks".
## skip = 1 drops the first line of the export; the column header is read
## from the line that follows it.
## read.csv (sep = ",", dec = ".") is the reader that matches a
## comma-separated file; read.csv2 would keep "," as the decimal mark.
DF <- read.csv("all_outlinks.csv", header = TRUE, stringsAsFactors = FALSE,
               skip = 1)
## Fail early, with a readable message, if the export does not have the
## columns this script relies on.
requiredCols <- c("Type", "Source", "Destination")
missingCols <- setdiff(requiredCols, colnames(DF))
if (length(missingCols) > 0) {
  stop("all_outlinks.csv is missing expected column(s): ",
       paste(missingCols, collapse = ", "), call. = FALSE)
}
## Keep only hyperlinks. which() discards rows whose Type is NA; a bare
## logical index would turn each of them into an all-NA row.
DF <- DF[which(DF$Type == "AHREF"), ]
DF <- select(DF, Source, Destination)
## adapt colnames and rownames: edge list columns become From/To and the
## row names left over from the filtering are reset.
colnames(DF) <- c("From", "To")
rownames(DF) <- NULL
# Build the link graph from the edge list: one vertex per URL and one
# directed edge per link (From -> To).
graphObject <- graph.data.frame(DF, directed = TRUE)
# Collapse repeated links between the same two pages and drop self-links,
# but keep the graph directed: PageRank is defined on link direction, and
# page.rank() ignores `directed = TRUE` when handed an undirected graph.
graphObject <- simplify(graphObject)
# Compute PageRank over the directed links; pr$vector holds one score per
# vertex.
pr <- page.rank(graphObject, directed = TRUE, damping = 0.85)
# Draw the whole link graph with a Fruchterman-Reingold layout. Each vertex
# is sized by its PageRank rescaled to 1-20, and vertex labels are hidden.
plot(
  graphObject,
  vertex.label = NA,
  vertex.label.color = "black",
  vertex.size = map(pr$vector, c(1, 20)),
  edge.arrow.size = 0.2,
  layout = layout.fruchterman.reingold
)
# Turn the PageRank scores into a data.frame, highest score first.
# The vertex names arrive as row names of the one-column frame.
urls_pagerank <- as.data.frame(sort(pr$vector, decreasing = TRUE))
colnames(urls_pagerank) <- "raw.internal.pagerank"
# Move the URL from the row names into a regular column.
urls_pagerank$Address <- rownames(urls_pagerank)
rownames(urls_pagerank) <- NULL
# Add a score rescaled to the 1-10 range next to the raw value.
urls_pagerank$internal.pagerank <- map(
  urls_pagerank$raw.internal.pagerank,
  c(1, 10)
)
# Plot only the top URLs: keep the nbUrl vertices with the highest PageRank
# and the links between them.
nbUrl <- 800
# head() copes with graphs that have fewer than nbUrl vertices.
topVertices <- head(order(pr$vector, decreasing = TRUE), nbUrl)
graphObjectTopUrl <- induced_subgraph(graphObject, topVertices)
# Vertex sizes use the same 1-20 scale as the full plot. They are looked up
# by vertex name so each size belongs to the vertex it is drawn on.
vertexSizes <- as.numeric(map(pr$vector, c(1, 20)))
prLimit <- vertexSizes[match(V(graphObjectTopUrl)$name, names(pr$vector))]
# use tkplot for interactive graph
tkplot(graphObjectTopUrl
,layout=layout.fruchterman.reingold
,vertex.size = prLimit
,vertex.label = NA
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment