CorradoLanera/get_domain.R

## get_domain.R
#' Get domain
#'
#' @param url (chr) url from which extract the domain
#' @param include_suffix (lgl, default TRUE) would you like to get the
#'   suffix too (dot included)?
#'
#' @seealso https://stackoverflow.com/questions/19020749/function-to-extract-domain-name-from-url-in-r
#' @export
#' @examples
#' get_domain("www.example.com")
#' get_domain("http://www.example.com")
#' get_domain("https://www.example.com")
#' get_domain("example.com")
#' get_domain("https://www.example.com/foo/bar?tar")
#' get_domain("https://www.foo.example.com/bar?tar")
get_domain <- function(url, include_suffix = TRUE) {
  res <- paste0("\\3", c("\\4\\5")[include_suffix])
  sub(url_regexpr(), res, url)
}

url_regexpr <- function() {
  protocol <- "([^/]+://)*"  # could be
  sub <- "([^\\.\\?/]+\\.)*"  # could be
  domain <- "([^\\.\\?/]+)"  # must be
  dot <- "(\\.)"  # must be
  suffix <- "([^/]+)"  # must be
  folders <- "(/[^\\?]*)*"  # could be
  args <- "(\\?.*)*"  #could be

  paste0(
    "^",
    protocol, sub, domain, dot, suffix, folders, args,
    "$"
  )
}


## test-get_domain.R
library(testthat)

test_that("get_domain works", {
  expect_equal(get_domain("https://www.example.com"), "example.com")
  expect_equal(get_domain("http://www.example.com"), "example.com")
  expect_equal(get_domain("www.example.com"), "example.com")

  expect_equal(get_domain("www.example.net"), "example.net")
  expect_equal(get_domain("www.example.net/baz"), "example.net")

  expect_equal(get_domain("https://www.example.net/baz"), "example.net")
  expect_equal(get_domain("https://www.example.net/baz/tar"), "example.net")

  expect_equal(get_domain("https://foo.example.net"), "example.net")
  expect_equal(get_domain("https://www.foo.example.net"), "example.net")
})

test_that("get_domain is vectorized", {
  urls <- c("www.example.com", "www.example.net")
  expect_equal(get_domain(urls), c("example.com", "example.net"))
})


test_that("can remove suffix", {
  expect_equal(
    get_domain("https://www.example.com", include_suffix = FALSE),
    "example"
  )
})

test_that("works with file extensions", {
  expect_equal(
    get_domain("https://www.example.com/foo.php"),
    "example.com"
  )
})

test_that("works against args after slash", {
  expect_equal(
    get_domain("http://example.com/?"),
    "example.com"
  )
})

test_that("works against multiple dots after slash", {
  expect_equal(
    get_domain("http://example.com/foo.net.bar"),
    "example.com"
  )
})

test_that("generalized protocols", {
  expect_equal(
    get_domain("android-app://example.com/foo.net.bar"),
    "example.com"
  )
})
	#' Get domain
	#'
	#' @param url (chr) url from which extract the domain
	#' @param include_suffix (lgl, default TRUE) would you like to get the
	#' suffix too (dot included)?
	#'
	#' @seealso https://stackoverflow.com/questions/19020749/function-to-extract-domain-name-from-url-in-r
	#' @export
	#' @examples
	#' get_domain("www.example.com")
	#' get_domain("http://www.example.com")
	#' get_domain("https://www.example.com")
	#' get_domain("example.com")
	#' get_domain("https://www.example.com/foo/bar?tar")
	#' get_domain("https://www.foo.example.com/bar?tar")
	get_domain <- function(url, include_suffix = TRUE) {
	res <- paste0("\\3", c("\\4\\5")[include_suffix])
	sub(url_regexpr(), res, url)
	}

	url_regexpr <- function() {
	protocol <- "([^/]+://)*" # could be
	sub <- "([^\\.\\?/]+\\.)*" # could be
	domain <- "([^\\.\\?/]+)" # must be
	dot <- "(\\.)" # must be
	suffix <- "([^/]+)" # must be
	folders <- "(/[^\\?])" # could be
	args <- "(\\?.)" #could be

	paste0(
	"^",
	protocol, sub, domain, dot, suffix, folders, args,
	"$"
	)
	}
	library(testthat)

	test_that("get_domain works", {
	expect_equal(get_domain("https://www.example.com"), "example.com")
	expect_equal(get_domain("http://www.example.com"), "example.com")
	expect_equal(get_domain("www.example.com"), "example.com")

	expect_equal(get_domain("www.example.net"), "example.net")
	expect_equal(get_domain("www.example.net/baz"), "example.net")

	expect_equal(get_domain("https://www.example.net/baz"), "example.net")
	expect_equal(get_domain("https://www.example.net/baz/tar"), "example.net")

	expect_equal(get_domain("https://foo.example.net"), "example.net")
	expect_equal(get_domain("https://www.foo.example.net"), "example.net")
	})

	test_that("get_domain is vectorized", {
	urls <- c("www.example.com", "www.example.net")
	expect_equal(get_domain(urls), c("example.com", "example.net"))
	})


	test_that("can remove suffix", {
	expect_equal(
	get_domain("https://www.example.com", include_suffix = FALSE),
	"example"
	)
	})

	test_that("works with file extensions", {
	expect_equal(
	get_domain("https://www.example.com/foo.php"),
	"example.com"
	)
	})

	test_that("works against args after slash", {
	expect_equal(
	get_domain("http://example.com/?"),
	"example.com"
	)
	})

	test_that("works against multiple dots after slash", {
	expect_equal(
	get_domain("http://example.com/foo.net.bar"),
	"example.com"
	)
	})

	test_that("generalized protocols", {
	expect_equal(
	get_domain("android-app://example.com/foo.net.bar"),
	"example.com"
	)
	})