Last active
September 24, 2017 08:26
-
-
Save aravindhebbali/5d1799744e55dc76cdf1af6b1cc03c82 to your computer and use it in GitHub Desktop.
Working with strings in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# install
# Install only the packages that are not already available, so re-running the
# script does not re-download and reinstall everything each time.
pkgs <- c("stringr", "readr", "tibble", "magrittr", "purrr", "dplyr")
is_installed <- vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)
missing_pkgs <- pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
# library | |
library(stringr) | |
library(readr) | |
library(tibble) | |
library(magrittr) | |
library(purrr) | |
library(dplyr) | |
# import data
# Read the mock strings CSV straight from GitHub and print the result.
data_url <- "https://raw.githubusercontent.com/rsquaredacademy/datasets/master/mock_strings.csv"
mockstring <- readr::read_csv(file = data_url)
mockstring
# extract domain name from random email ids
# Split each address once at the '@' and keep the second piece (the host),
# then take the text before the host's first dot. Selecting the host by
# position, rather than by "the piece that contains a dot", keeps addresses
# with a dotted local part (e.g. 'first.last@host.com') from yielding the
# local part as well.
mockstring$email[1:2] %>%
  str_split_fixed(pattern = "@", n = 2) %>%
  extract(, 2) %>%
  str_split(pattern = "\\.") %>%
  purrr::map_chr(1)
# extract image type from url
# Both approaches start from the third dot-separated piece of each url.
# An integer passed to map_chr() plucks the element at that position.
image_tail <- mockstring$imageurl[1:5] %>%
  str_split(pattern = "\\.") %>%
  purrr::map_chr(3)
# approach 1: keep the first three characters of that piece
str_sub(image_tail, start = 1, end = 3)
# approach 2: keep everything before the first '/' in that piece
image_tail %>%
  str_split(pattern = "/") %>%
  purrr::map_chr(1)
# extract image dimension from url
# POSIX character classes must be spelled '[:digit:]' and wrapped in an outer
# bracket expression to be usable as a regular expression.
pattern <- "[[:digit:]]"
# locate the first digit in each url (matrix with 'start' and 'end' columns)
digit_loc <- mockstring$imageurl[1:5] %>%
  str_locate(pattern = pattern)
digit_loc
# Take the text from the first digit onwards and keep what precedes the next
# dot. The start position comes from the lookup above instead of a hard-coded
# character offset, so it follows the data.
mockstring$imageurl[1:5] %>%
  str_sub(start = digit_loc[, "start"]) %>%
  str_split(pattern = "\\.") %>%
  purrr::map_chr(1)
# extract protocol
url_head <- mockstring$url[1:5]
# approach 1: split at '://' and keep the first piece
url_head %>%
  str_split(pattern = "://") %>%
  map_chr(1)
# approach 2: find where '://' starts; k is the position of the last
# character before it, i.e. the end of the protocol
k <- str_locate(url_head, pattern = "://")[, "start"] - 1
str_sub(url_head, start = 1, end = k)
# extract domain name
# n: third value of each '/' location matrix, which is the start of the third
# '/' when a url contains at least three of them.
n <- str_locate_all(mockstring$url[1:5], pattern = "/") %>%
  map_int(3)
# Cut each url at n and keep the text before the first dot.
first_label <- str_sub(mockstring$url[1:5], end = n) %>%
  str_split(pattern = "\\.") %>%
  map_chr(1)
# k (from the protocol step) is the last character before '://', so k + 4 is
# the first character after it.
str_sub(first_label, start = k + 4)
# extract extension from url
# Cut each url at n (computed in the domain name step), take the second
# dot-separated piece and drop anything from the first '/' onwards.
url_prefix <- str_sub(mockstring$url[1:5], end = n)
url_prefix %>%
  str_split(pattern = "\\.") %>%
  map_chr(2) %>%
  str_split(pattern = "/") %>%
  map_chr(1)
# extract file type from url
file_urls <- mockstring$url[1:3]
dot_hits <- str_locate_all(file_urls, pattern = "\\.")
qmark_hits <- str_locate_all(file_urls, pattern = "[?]")
# only 2 dots in the url: count the urls with more than 2 dots
sum(map_int(dot_hits, nrow) > 2)
# only 1 question mark in the url: count the urls with more than 1
sum(map_int(qmark_hits, nrow) > 1)
# location of second dot and 1 question mark
# d: one past the second value of each dot location matrix (the start of the
#    second dot when there are at least two dots)
# q: one before the first '?'
d <- map_int(dot_hits, 2) + 1
q <- map_int(qmark_hits, 1) - 1
# the file type is the text between those two positions
str_sub(file_urls, start = d, end = q)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment