Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
library(tidyverse)
# Build a small four-row tibble. The name `t` is also the name of base R's
# transpose function; both the object and the function are used below.
t <- tibble(
  x = 1:4,
  name = c("mary", "sue", "jane", "pete")
)

# Assign a length-2 vector as a new column of the four-row tibble.
# NOTE(review): recent tibble versions only recycle length-1 values, so this
# is expected to raise an error -- confirm against the installed version.
t$x2 <- c(1, 9)

# Same values, repeated explicitly so the vector has four elements.
t %>%
  mutate(x3 = rep(c(1, 9), 2))

# Call the function t() on the second row of the object t. When evaluating a
# call, R looks for a function, so the tibble named `t` does not get in the way.
t(t[2, ])

# A plain numeric vector: inspect its length and class.
x <- c(1, 2, 3)
length(x)
class(x)

# Print the tibble, then pull column `x` out as a vector and check its class.
t
x <- t$x
class(x)

# Mixing numbers and a string in c() produces a character vector.
x <- c(1, 2, "Three")
x

# NOTE(review): no column `y` is created anywhere in the visible code, so
# t$y is presumably NULL and this call fails -- confirm the intended column.
cor.test(t$x, t$y)
# Read the same semicolon-separated file three ways: base read.csv2(), base
# read.csv2() without string-to-factor conversion, and readr's read_csv2().
# Each assignment overwrites the previous d1, so only the last result is kept.
d1 <- read.csv2("/tmp/test.csv")
d1 <- read.csv2("/tmp/test.csv", stringsAsFactors = FALSE)
d1 <- read_csv2("/tmp/test.csv")
d1

# Inspect `group` as a factor.
# NOTE(review): read_csv2() does not create factors by default, so levels()
# is presumably NULL for the version of d1 kept above -- confirm.
levels(d1$group)
reorder(d1$group, d1$id)

# Two equivalent ways to coerce `text` to character: base assignment and
# dplyr::mutate().
d1$text <- as.character(d1$text)
d1 <- d1 %>%
  mutate(text = as.character(text))

# Replace `age` with strings, then convert back to numeric via character.
# NOTE(review): the three-element vector assumes the file has exactly three
# rows -- confirm.
class(d1$age)
d1$age <- c("1", "2", "4")
d1 %>%
  mutate(age_str = as.character(age)) %>%
  mutate(age2 = as.numeric(age_str))
# Same conversion in a single mutate(); later arguments can use earlier ones.
d1 %>%
  mutate(
    age_str = as.character(age),
    age2 = as.numeric(age_str)
  )
# Same conversion as one nested expression.
d1 %>%
  mutate(age_str = as.numeric(as.character(age)))
levels(d1$age)
class(d1$age)

# Keep only `id` and `text`, then try basic stringr helpers on `text`.
d1 <- d1 %>%
  select(id, text)
d1 %>%
  mutate(len = str_length(text))
d1 %>%
  mutate(subset = str_sub(text, start = 5, end = 10))

# Add a constant timestamp column (the single string is recycled to all rows)
# and extract its last eight characters in two equivalent ways: a start
# position computed from the length, and a negative start index.
d1$timestamps <- "2019-01-03 12:32:56"
d1 %>%
  mutate(time = str_sub(timestamps, start = str_length(timestamps) - 7))
d1 %>%
  mutate(time = str_sub(timestamps, start = -8))
d1

# Replacement form of str_sub(): overwrite everything from position 11 on.
str_sub(d1$timestamps, start = 11) <- "xxxxx"
# regular expressions

# Simple e-mail pattern: local part, "@", one domain label, ".", remainder.
regex_email <- regex("[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+")
# Two sample addresses; the second has a "+" in the domain, which the domain
# part of the pattern does not allow.
email <- c(
  "vanatteveldt+spamme@gmail.com",
  "vanatteveldt+spamme@gmail+fake.com"
)
str_view_all(email, regex_email)

# A looser e-mail pattern built from word characters only.
str_view_all(d1$text, "\\w+@\\w+\\.\\w+")
# Sample string that the looser pattern matches: a@b.c
# (Kept as a comment: as bare code it would be evaluated as a slot access on
# an undefined object `a`.)

# Reload the file; d1 now holds whatever the file contains.
d1 <- read_csv2("/tmp/test.csv")
d1

# Hashtags: "#" followed by word characters.
str_view_all(d1$text, "\\#\\w+")
# Mentions delimited by a space or by the start/end of the string.
str_view_all(d1$text, "( |^)\\@\\w+( |$)")
# Dates written as four digits, two digits, two digits separated by "-".
# NOTE(review): d1 was just re-read, so `timestamps` only exists if the file
# itself has that column -- confirm.
str_view_all(d1$timestamps, "\\d\\d\\d\\d-\\d\\d-\\d\\d")

# Successive attempts at matching dot-separated dates in `text`:
# 1-2 digits, 1-2 digits, then 2-4 digits.
str_view_all(d1$text, "\\d?\\d\\.\\d?\\d\\.\\d\\d\\d?\\d?")
# Any three runs of one or more digits.
str_view_all(d1$text, "\\d+\\.\\d+\\.\\d+")
# Digits made optional everywhere (so two bare dots also match).
str_view_all(d1$text, "\\d*\\.\\d*\\.\\d*")
# Last part must be exactly two or exactly four digits.
str_view_all(d1$text, "\\d?\\d\\.\\d?\\d\\.\\d\\d(\\d\\d)?")
# Additionally restrict the optional leading digit of the first two parts and
# the optional two-digit prefix of the last part.
str_view_all(d1$text, "[0123]?\\d\\.[012]?\\d\\.([12]\\d)?\\d\\d")
# str_detect / str_count

# str_detect() returns one logical per element, so it can be used directly or
# as a filter() condition.
str_detect(d1$text, regex_email)
d1 %>%
  filter(str_detect(text, regex_email))
d1 %>%
  filter(str_detect(text, "#\\w+"))
# \\b marks word boundaries, so only the standalone word "Bob" matches.
d1 %>%
  filter(str_detect(text, "\\bBob\\b"))

# str_count() returns the number of matches per element.
d1 %>%
  mutate(n = str_count(text, regex_email))

# NOTE(review): `d` is not defined in the visible code; it is used below as a
# data frame with `text` and `President` columns -- confirm where it is loaded.
# Total occurrences of "liberty" per president.
d %>%
  mutate(n = str_count(text, "liberty")) %>%
  group_by(President) %>%
  summarize(n = sum(n))
d %>%
  filter(str_detect(text, "terror"))

# Case-insensitive patterns for religion-related and terror-related terms.
religion <- regex(
  "relig|christ|muslim|islam|hindu|hindi|buddh",
  ignore_case = TRUE
)
terror <- regex("terror", ignore_case = TRUE)

# Count both patterns per row. The new column `terror` has the same name as
# the pattern object; assuming `d` has no `terror` column of its own, the
# right-hand `terror` refers to the pattern. The column `muslim` holds the
# count for the broader `religion` pattern.
x <- d %>%
  mutate(
    terror = str_count(text, terror),
    muslim = str_count(text, religion)
  )

# Relate the two counts: cross-tabulation, overall correlation test, and
# per-president correlation sorted from highest to lowest.
table(x$terror, x$muslim)
cor.test(x$terror, x$muslim)
x %>%
  group_by(President) %>%
  summarize(cor = cor(terror, muslim)) %>%
  arrange(-cor)
# str_replace

# Clean `text` in three steps: turn every non-letter into a space, collapse
# runs of spaces into one, then strip leading and trailing spaces.
d1 %>%
  mutate(
    text_cleaned = str_replace_all(text, "[^A-Za-z]", " "),
    text_cleaned2 = str_replace_all(text_cleaned, " +", " "),
    text_cleaned3 = str_replace_all(text_cleaned2, "^ +| +$", "")
  )
# Alternative for the last step:
# text_cleaned3 = str_trim(text_cleaned2))

# Redact the part before "@": first by matching the "@" and writing it back,
# then with a lookahead so the "@" is not part of the match.
d1 %>%
  mutate(redacted = str_replace_all(text, "\\w+@", "***@"))
d1 %>%
  mutate(redacted = str_replace_all(text, "\\w+(?=@)", "***"))
# Replace @mentions and #tags with a space; lookbehind/lookahead keep the
# surrounding spaces out of the match.
d1 %>%
  mutate(redacted = str_replace_all(text, "(?<= |^)(@|#)\\w+(?= |$)", " "))

# gsub() === str_replace_all
# grep === str_detect

# Compare what the two tag/mention patterns match: with the delimiters
# captured, and with the delimiters only asserted via lookaround.
# str_view_all() takes no replacement string; its third argument is `match`,
# so only the text and the pattern are passed here.
str_view_all(d1$text, "( |^)(@|#)\\w+( |$)")
str_view_all(d1$text, "(?<= |^)(@|#)\\w+(?= |$)")
# str_extract

# Patterns: dot-separated dates, @mentions or #tags, and #tags only. The tag
# patterns use lookaround so the delimiting spaces are not part of the match.
regex_date <- regex("[0123]?\\d\\.[012]?\\d\\.([12]\\d)?\\d\\d")
tagsnmentions <- regex("(?<= |^)(@|#)\\w+(?= |$)")
tags <- regex("(?<= |^)(#)\\w+(?= |$)")

# str_extract() returns the first match per element (NA when there is none).
d1 %>%
  mutate(date = str_extract(text, regex_date))
d1 %>%
  mutate(email = str_extract(text, regex_email))
# The new column `tags` shares its name with the pattern object; assuming d1
# has no `tags` column of its own, the right-hand `tags` is the pattern.
d1 %>%
  mutate(tags = str_extract(text, tags))
str_extract(d1$text, tags)

# str_extract_all() with simplify = TRUE returns a character matrix with one
# row per input string, padded with "" where a string has fewer matches.
x <- str_extract_all(d1$text, tags, simplify = TRUE)
# Bind the match columns to the data and inspect the type of the first one.
x <- cbind(d1, x)
class(x$`1`)

# Reshape the match columns to long format, drop the "" padding, and count
# how often each tag occurs.
# NOTE(review): `-id:-group` assumes d1's columns run from `id` to `group`
# -- confirm against the file. gather() is superseded by pivot_longer() in
# current tidyr but is kept here so the result is unchanged.
x %>%
  gather("hitn", "tag", -id:-group) %>%
  filter(tag != "") %>%
  group_by(tag) %>%
  summarize(n = n())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.