Skip to content

Instantly share code, notes, and snippets.

@CorradoLanera
Last active July 9, 2020 18:09
Show Gist options
  • Save CorradoLanera/1f8e6807a5a32652475413375600f504 to your computer and use it in GitHub Desktop.
Save CorradoLanera/1f8e6807a5a32652475413375600f504 to your computer and use it in GitHub Desktop.
Small vectorized function to extract domains in base R
#' Get domain
#'
#' @param url (chr) url from which extract the domain
#' @param include_suffix (lgl, default TRUE) would you like to get the
#' suffix too (dot included)?
#'
#' @seealso https://stackoverflow.com/questions/19020749/function-to-extract-domain-name-from-url-in-r
#' @export
#' @examples
#' get_domain("www.example.com")
#' get_domain("http://www.example.com")
#' get_domain("https://www.example.com")
#' get_domain("example.com")
#' get_domain("https://www.example.com/foo/bar?tar")
#' get_domain("https://www.foo.example.com/bar?tar")
get_domain <- function(url, include_suffix = TRUE) {
res <- paste0("\\3", c("\\4\\5")[include_suffix])
sub(url_regexpr(), res, url)
}
url_regexpr <- function() {
protocol <- "([^/]+://)*" # could be
sub <- "([^\\.\\?/]+\\.)*" # could be
domain <- "([^\\.\\?/]+)" # must be
dot <- "(\\.)" # must be
suffix <- "([^/]+)" # must be
folders <- "(/[^\\?]*)*" # could be
args <- "(\\?.*)*" #could be
paste0(
"^",
protocol, sub, domain, dot, suffix, folders, args,
"$"
)
}
library(testthat)
test_that("get_domain works", {
expect_equal(get_domain("https://www.example.com"), "example.com")
expect_equal(get_domain("http://www.example.com"), "example.com")
expect_equal(get_domain("www.example.com"), "example.com")
expect_equal(get_domain("www.example.net"), "example.net")
expect_equal(get_domain("www.example.net/baz"), "example.net")
expect_equal(get_domain("https://www.example.net/baz"), "example.net")
expect_equal(get_domain("https://www.example.net/baz/tar"), "example.net")
expect_equal(get_domain("https://foo.example.net"), "example.net")
expect_equal(get_domain("https://www.foo.example.net"), "example.net")
})
test_that("get_domain is vectorized", {
urls <- c("www.example.com", "www.example.net")
expect_equal(get_domain(urls), c("example.com", "example.net"))
})
test_that("can remove suffix", {
expect_equal(
get_domain("https://www.example.com", include_suffix = FALSE),
"example"
)
})
test_that("works with file extensions", {
expect_equal(
get_domain("https://www.example.com/foo.php"),
"example.com"
)
})
test_that("works against args after slash", {
expect_equal(
get_domain("http://example.com/?"),
"example.com"
)
})
test_that("works against multiple dots after slash", {
expect_equal(
get_domain("http://example.com/foo.net.bar"),
"example.com"
)
})
test_that("generalized protocols", {
expect_equal(
get_domain("android-app://example.com/foo.net.bar"),
"example.com"
)
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment