Downloading GSC data with R
# Description: The aim of this Gist is to show a simple way of downloading Google Search Console API data through R
# Created: 2020-04-28
# Author: Leszek Sieminski
#
# Goals:
# 1. Complete - no data inconsistencies allowed
# 2. Stable - does not exceed Google API limits
# 3. Functional - based on functions that encapsulate specific tasks
#
# The idea: download all the data for every single day and loop this process over all dates in a date range
#
# The execution: there are two custom functions:
# 1) gsc_date_range(), which creates a date range (optional but handy)
# 2) gsc_download(), a wrapper for the searchConsoleR package's search_analytics() function,
#    which downloads up to 100k rows for a single date
# These two functions are combined in the purrr package's map_dfr() loop, which repeats the download for every date
# and returns a data frame with all the results.
# To ensure that the API limits are not abused, R waits a specified amount of time between the calls.
# libraries ------------------------------------------------------------------------------------------------
library("purrr")
library("searchConsoleR")
# authentication of Google Search Console ------------------------------------------------------------------
# depends on your searchConsoleR and gargle packages' versions, so find out what works for you
# searchConsoleR::scr_auth() # simple example
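# a hedged sketch: newer gargle-based versions of scr_auth() can reportedly cache and reuse credentials
# by email (assumption - check ?scr_auth for the arguments your installed version supports)
# searchConsoleR::scr_auth(email = "you@example.com")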
# declare a function that creates a date range -------------------------------------------------------------
# start - first date in a date range
# end - last date of a date range
# returns a vector of dates
gsc_date_range <- function(start, end) {
  seq.Date(from = as.Date(start),
           to   = as.Date(end),
           by   = "day")
}
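# quick illustration (hypothetical dates): the helper returns one Date element per day in the range
# gsc_date_range(start = "2020-01-01", end = "2020-01-03")
# [1] "2020-01-01" "2020-01-02" "2020-01-03"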
# declare a function that downloads GSC data for a single date with chosen dimensions ----------------------
# single_date - a single date, e.g. as.Date("2020-01-01")
# url - the URL of the website with a GSC API project
# dim1 - first (mandatory) dimension: page, query etc. The date & device dimensions are always added by the function
# dim2 - second (optional) dimension: page, query etc.
# returns a data frame with GSC API data for a single day
gsc_download <- function(single_date, url, dim1, dim2 = NULL) {
  gsc_data <- searchConsoleR::search_analytics(
    siteURL    = url,
    startDate  = single_date,
    endDate    = single_date,
    dimensions = c("date",
                   "device",
                   dim1,
                   dim2),
    walk_data  = "byBatch",
    searchType = "web",
    rowLimit   = 100000)
  Sys.sleep(3) # we're going to make a lot of API calls, so let's not annoy the API limits
               # 3 seconds of waiting between the calls seems pretty reasonable
  return(gsc_data)
}
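# minimal single-day sketch (assumes authentication succeeded and `site` holds a verified property URL):
# one_day <- gsc_download(single_date = Sys.Date() - 3, url = site, dim1 = "query")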
# download the data -----------------------------------------------------------------------------------------
# now we use map_dfr: we create a vector of dates for which we want to download the GSC API data (the .x parameter)
# and download all the data for each day
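# `site` must be a property you can access in Search Console; the URL below is only a placeholder
site <- "https://www.example.com/" # replace with your own verified property URL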
df_page_query <- purrr::map_dfr(
  .x = gsc_date_range(
    start = Sys.Date() - 93,
    end   = Sys.Date() - 3),
  .f = gsc_download,
  url = site,
  dim1 = "page",
  dim2 = "query")
# finish!
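# optional sanity checks on the combined result (column names depend on the dimensions requested):
# str(df_page_query)
# range(df_page_query$date)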