# Pulls SERP data from SEMRush, scrapes the HTML of each ranking page, and checks that HTML for your URL/link.
# install these once if you don't already have them
#install.packages("tidyverse")
#install.packages("rvest")
#install.packages("rebus")
#install.packages("lubridate")
# General-purpose data wrangling
library(tidyverse)
# HTTP requests (for the SEMRush API)
library(httr)
# Parsing of HTML/XML files
library(rvest)
# String manipulation
library(stringr)
# Verbose regular expressions
library(rebus)
# Eases DateTime manipulation
library(lubridate)
# we're going to first choose which link we want to check our results against. Change this to your
# preferred URL (in the link_check variable)
link_check <- "hubspot.com/products/"
link_check_rgx <- paste0(".+", link_check, ".+")
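# Side note (optional): link_check is used as a regex pattern later on, so the "." in "hubspot.com"
# matches any character while "/" is taken literally. That's usually harmless here, but if you ever
# want a strictly literal match, one alternative sketch (not used by the rest of this script) is
# fixed matching inside the loop below:
# links_check <- grepl(link_check, text, fixed = TRUE)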
# here are our other inputs, the keyword and the number of search results we want to check
keyword <- readline(prompt="Enter search query: ")
n_serp_results <- readline(prompt="Enter number of SERP results: ")
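# Note: readline() always returns a character string, and httr passes display_limit through to the
# query string as-is, so that's fine for the API call. If you'd rather treat it as a number locally,
# an optional conversion would be:
# n_serp_results <- as.integer(n_serp_results)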
# alright, now the SEMRush API stuff. Make sure to change the API key to your own
api_key <- "YOUR_SEMRUSH_API_KEY"
base_url <- "https://api.semrush.com/"
query_params <- list(type = "phrase_organic", key = api_key, phrase = keyword,
                     database = "us", display_limit = n_serp_results)
query_params_2 <- list(type = "phrase_all", key = api_key, phrase = keyword,
                       database = "us")
request <- GET(base_url, query = query_params)
request_2 <- GET(base_url, query = query_params_2)
# get the keyword volume
kw_volume <- strsplit(content(request_2, as = "text"), ";")
kw_volume <- as.data.frame(kw_volume, col.names = "volume", stringsAsFactors = FALSE)
kw_volume <- kw_volume$volume[9]
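# Caveat: this assumes the search-volume field lands at position 9 once the whole phrase_all
# response (header row included) is split on ";". If SEMRush changes the columns it returns for
# this report, the index will need adjusting.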
# split the SERP response into lines - one per result - and put them into a list
url_list <- strsplit(content(request, as = "text"), "\r\n")
# and then change it into a dataframe
split_df <- as.data.frame(url_list, col.names = "urls", stringsAsFactors = FALSE)
# then clean it up by splitting each row on ";" to separate the domain from the URL
splitup <- str_split_fixed(split_df$urls, ";", 2)
# finally turn it into a dataframe; note that as.data.frame() ignores col.names for a matrix, so the
# columns come through as V1 (domain) and V2 (URL), which is what we refer to below
splitup_df <- as.data.frame(splitup, col.names = splitup[1,], stringsAsFactors = FALSE)
# check that shit
head(splitup_df)
splitup_df <- splitup_df[-1,]
# the second column holds the URLs; the loop below reads each of those pages and checks them for links
url_list_2 <- as.list(splitup_df$V2)
head(splitup_df$V2)
# start empty lists to capture, for each URL: the link check (TRUE/FALSE), the matched links,
# the keyword, and the number of matching links
datalist <- list()
urllist <- list()
keyword_list <- list()
yes_link_count <- list()
# Loop over each URL in splitup_df - our list of URLs from the SERP
for(i in seq(nrow(splitup_df))) {
  keyword_list[[i]] <- keyword
  # reset text each pass so a failed fetch can't silently reuse the previous page's links
  text <- character(0)
  try(text <- read_html(splitup_df$V2[i]) %>%
        html_nodes("a") %>%
        html_attr("href"))
  links_check <- grepl(link_check, text) # TRUE for every href that contains our link, FALSE otherwise
  if(TRUE %in% links_check) {
    print(TRUE)
    links_check <- TRUE
    listy <- str_extract_all(text, link_check_rgx)
    listy <- listy[lapply(listy, length) > 0]
    listy <- as.character(listy[!is.na(listy)])
    urllist[[i]] <- listy
    yes_link_count[[i]] <- length(listy)
  } else {
    print(FALSE)
    links_check <- FALSE
    urllist[[i]] <- "None"
    yes_link_count[[i]] <- 0
  }
  datalist[[i]] <- links_check # add TRUE or FALSE to the results list
}
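# (optional, hedged sketch) if you want the fetch step to be more defensive than try() above, one
# alternative is a small helper - call it fetch_hrefs here - built on tryCatch() that returns an
# empty vector when a page can't be fetched or parsed. Nothing below depends on it; it's just a
# pattern you could swap into the loop:
fetch_hrefs <- function(page_url) {
  tryCatch(
    read_html(page_url) %>% html_nodes("a") %>% html_attr("href"),
    error = function(e) character(0)
  )
}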
#change link count to character class
yes_link_count <- as.character(yes_link_count)
# check that shit (all the data)
head(datalist)
# combine the google SERP URLs with the data list (TRUE or FALSE link check)
urls_plus_link_check <- list(splitup_df, datalist, keyword, yes_link_count, urllist)
# check that shit
head(urls_plus_link_check)
# make it into a data frame with readable column names; pages with several matching links get them
# collapsed into one semicolon-separated string so there's one value per SERP row
new_1 <- data.frame(Domain = splitup_df$V1,
                    URL = splitup_df$V2,
                    Link = unlist(datalist),
                    Keyword = keyword,
                    N_Links = yes_link_count,
                    Full.Link = sapply(urllist, paste, collapse = "; "),
                    stringsAsFactors = FALSE)
# change the true/false link check to a character vector before saving to csv
new_1$Link <- as.character(new_1$Link)
new_1$Full.Link <- as.character(new_1$Full.Link)
write.csv(new_1, "serp.csv")
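# Tip (optional): write.csv() also writes R's row numbers as an unnamed first column by default;
# add row.names = FALSE to either write.csv() call if you don't want that column in the spreadsheet,
# e.g. write.csv(new_1, "serp.csv", row.names = FALSE)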
# why not check that it worked too?
head(new_1)
# count the number of links, number of results, and number of non links
link_count_new <- sum(new_1$Link == "TRUE", na.rm = TRUE)
n_of_results <- nrow(new_1)
no_mention <- n_of_results - link_count_new
# put all that into a data frame with the date of the pull attached
yes <- data.frame(Sys.Date(), link_count_new, no_mention, n_of_results)
# might as well load the percentage of the serp too
percentage_of_serp <- (link_count_new / n_of_results * 100)
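# (optional) if you want that percentage in the summary spreadsheet too, you could add it to the
# data frame built above before it gets written out further down:
# yes$percentage_of_serp <- percentage_of_serp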
# okay, now let's make a pie chart. First we need to build a dataframe with the stuff we want, links and no links
df <- data.frame(
  group = c("Link", "No Link"),
  value = c(link_count_new, no_mention))
# load ggplot2 of course
library(ggplot2)
# now make a simple barplot
bp <- ggplot(df, aes(x = "", y = value, fill = group)) +
  geom_bar(width = 1, stat = "identity")
# flip it into a pie chart!
pie <- bp + coord_polar("y", start=0)
# feast your eyes on this beautiful pie chart
pie
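# (optional, cosmetic) a slightly more readable take on the same chart: percentage labels on each
# slice and the polar axis stripped with theme_void(). This builds a separate plot object, so
# nothing later in the script depends on it.
pie_labeled <- bp + coord_polar("y", start = 0) +
  geom_text(aes(label = paste0(round(value / sum(value) * 100), "%")),
            position = position_stack(vjust = 0.5)) +
  theme_void()
pie_labeled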
# now write the link, no link, and total results summary data to a spreadsheet.
write.csv(yes, "link_count.csv")
read.csv("link_count.csv")
# now we have two csv files - serp.csv (the table with the URLs) and link_count.csv (the summary data). Nice!