Skip to content

Instantly share code, notes, and snippets.

@aravindhebbali
Last active September 24, 2017 08:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aravindhebbali/5d1799744e55dc76cdf1af6b1cc03c82 to your computer and use it in GitHub Desktop.
Save aravindhebbali/5d1799744e55dc76cdf1af6b1cc03c82 to your computer and use it in GitHub Desktop.
Working with strings in R
# install
# Install only the packages that are not available yet, so re-running the
# script does not download and reinstall everything each time.
required_pkgs <- c('stringr', 'readr', 'tibble', 'magrittr', 'purrr', 'dplyr')
is_installed <- vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
missing_pkgs <- required_pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
# library
library(stringr)
library(readr)
library(tibble)
library(magrittr)
library(purrr)
library(dplyr)
# import data
# Download the mock strings dataset and keep it as `mockstring`; every
# section below reads its columns from this object.
mock_strings_url <- 'https://raw.githubusercontent.com/rsquaredacademy/datasets/master/mock_strings.csv'
mockstring <- readr::read_csv(file = mock_strings_url)
# Evaluate the object so it is displayed when run interactively.
mockstring
# extract domain name from random email ids
# The domain name is the run of non-dot characters that directly follows the
# '@'. Matching it with a look-behind keeps the result aligned with the input
# (one value per email id, NA when there is no '@'), and it is not confused
# by dots in the part before the '@' (e.g. first.last@example.com).
mockstring$email[1:2] %>%
  str_extract(pattern = '(?<=@)[^.]+')
# extract image type from url
# Both approaches start from the third '.'-separated piece of the url, so it
# is computed once and reused.
third_piece <- mockstring$imageurl[1:5] %>%
  str_split(pattern = '\\.') %>%
  map_chr(3)
# approach 1: keep the first three characters of that piece
str_sub(third_piece, start = 1, end = 3)
# approach 2: keep everything before the first '/' of that piece
third_piece %>%
  str_split(pattern = '/') %>%
  map_chr(1)
# extract image dimension from url
# The dimension is taken to start at the first digit of the url and to end
# just before the next '.'.
pattern <- "[0-9]"
# Locate the first digit of every url (matrix with `start` and `end` columns).
dimension_pos <- mockstring$imageurl[1:5] %>%
  str_locate(pattern = pattern)
dimension_pos
# Cut each url at its own first digit instead of a hard-coded offset, then
# keep the piece before the first '.'.
mockstring$imageurl[1:5] %>%
  str_sub(start = dimension_pos[, "start"]) %>%
  str_split(pattern = '\\.') %>%
  map_chr(1)
# extract protocol
# approach 1: split on '://' and keep the first piece
mockstring$url[1:5] %>%
  str_split(pattern = '://') %>%
  map_chr(1)
# approach 2: locate '://' and cut the url just before it.
# k is the position of the last character of the protocol; the domain name
# section further down reuses it.
k <- str_locate(mockstring$url[1:5], pattern = '://')[, 'start'] - 1
str_sub(mockstring$url[1:5], start = 1, end = k)
# extract domain name
# n is the position of the third '/' in every url, i.e. the first '/' after
# the two in '://'. A url with fewer than three slashes has no such position,
# so -1 (end of string) is used instead of picking an unrelated cell of the
# match matrix. n is reused by the extension section below.
n <- mockstring$url[1:5] %>%
  str_locate_all(pattern = "/") %>%
  map_int(function(slashes) {
    if (nrow(slashes) >= 3) slashes[[3, "start"]] else -1L
  })
# Keep the url up to n, take the piece before the first '.', then drop the
# protocol: k (from the protocol section) is the last protocol character, so
# k + 4 is the first character after '://'.
mockstring$url[1:5] %>%
  str_sub(end = n) %>%
  str_split(pattern = '\\.') %>%
  map_chr(1) %>%
  str_sub(start = k + 4)
# extract extension from url
# Work on the url truncated at n (computed in the domain name section).
url_head <- str_sub(mockstring$url[1:5], end = n)
# Second '.'-separated piece of the truncated url ...
after_first_dot <- map_chr(str_split(url_head, pattern = '\\.'), 2)
# ... with anything from the first '/' onwards removed.
map_chr(str_split(after_first_dot, pattern = '/'), 1)
# extract file type from url
# Sanity check: number of urls that contain more than 2 dots.
dot_counts <- map_int(str_locate_all(mockstring$url[1:3], pattern = '\\.'), nrow)
sum(dot_counts > 2)
# Sanity check: number of urls that contain more than 1 question mark.
question_mark_counts <- map_int(str_locate_all(mockstring$url[1:3], pattern = "[?]"), nrow)
sum(question_mark_counts > 1)
# location of second dot and 1 question mark
# d: one past the second dot. Position 2 of the match matrix is the start of
# the second match when a url has at least two dots.
d <- map_int(str_locate_all(mockstring$url[1:3], pattern = '\\.'), 2) + 1
# q: one before the first question mark.
q <- map_int(str_locate_all(mockstring$url[1:3], pattern = "[?]"), 1) - 1
# The file type is the text between those two positions.
str_sub(mockstring$url[1:3], start = d, end = q)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment