# Basics: tibbles, vectors and type coercion ----
library(tidyverse)

# Small example tibble. The variable name `t` shadows base::t(), but calls
# written as t(...) still reach the function: R skips non-function bindings
# when it looks up the name of a function call.
t = tibble(x = 1:4, name = c("mary", "sue", "jane", "pete"))

# Add a column that repeats c(1, 9) over the four rows. The repetition is
# spelled out with rep() because current tibble versions only recycle
# length-1 vectors on assignment; a bare length-2 vector is rejected.
t$x2 = rep(c(1, 9), 2)

# Same idea with dplyr; the result is printed, not stored back into `t`.
t %>% mutate(x3 = rep(c(1, 9), 2))

# Transpose the second row with base::t().
t(t[2, ])

# A plain numeric vector: inspect its length and class.
x = c(1, 2, 3)
length(x)
class(x)

# Print the tibble, then pull one column out as a vector and check its class.
t
x = t$x
class(x)

# Mixing numbers and a string in c() coerces every element to character.
x = c(1, 2, "Three")
x

# Correlation test between two numeric columns of `t`.
# NOTE(review): the original referenced t$y, but `t` has no column `y`
# (t$y is NULL, so cor.test() fails). x2 is the only other numeric column
# created above -- confirm that this is the intended pairing.
cor.test(t$x, t$x2)
# Reading data and converting column types ----

# Three ways to read the same file. Each call overwrites d1, so the code
# below works on the result of readr's read_csv2().
d1 = read.csv2("/tmp/test.csv")
d1 = read.csv2("/tmp/test.csv", stringsAsFactors = FALSE)
d1 = read_csv2("/tmp/test.csv")
d1

# Inspect the levels of `group` and reorder it by `id`.
# NOTE(review): levels() returns NULL unless `group` is a factor; whether it
# is depends on the reader used above and on the file -- confirm.
levels(d1$group)
reorder(d1$group, d1$id)

# Convert `text` to character: base R assignment, then the dplyr equivalent.
d1$text = as.character(d1$text)
d1 = d1 %>% mutate(text = as.character(text))

# Replace `age` with strings so the conversions below have something to do.
# NOTE(review): a length-3 vector assumes the file has exactly three rows.
class(d1$age)
d1$age = c("1", "2", "4")

# Three spellings of the same conversion (to character, then to numeric):
# two chained mutate() calls, one mutate() with two columns, one nested call.
d1 %>%
  mutate(age_str = as.character(age)) %>%
  mutate(age2 = as.numeric(age_str))
d1 %>%
  mutate(age_str = as.character(age), age2 = as.numeric(age_str))
d1 %>%
  mutate(age_str = as.numeric(as.character(age)))
levels(d1$age)
class(d1$age)

# String length and substrings ----

# Keep only the columns used in the string examples.
d1 = d1 %>% select(id, text)
d1 %>% mutate(len = str_length(text))
d1 %>% mutate(subset = str_sub(text, start = 5, end = 10))

# Give every row the same example timestamp (a length-1 value is recycled).
d1$timestamps = "2019-01-03 12:32:56"

# Two ways to take the last eight characters (the time part): compute the
# start position from the string length, or use a negative start index.
d1 %>%
  mutate(time = str_sub(timestamps, start = str_length(timestamps) - 7))
d1 %>% mutate(time = str_sub(timestamps, start = -8))
d1

# Replacement form of str_sub(): overwrite everything from position 11 on.
str_sub(d1$timestamps, start = 11) = "xxxxx"
# Regular expressions ----

# Email pattern: local part, "@", domain label, ".", rest of the domain.
regex_email = regex("[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+")
email = c(
  "vanatteveldt+spamme@gmail.com",
  "vanatteveldt+spamme@gmail+fake.com"
)
str_view_all(email, regex_email)

# A much simpler email pattern: word characters around "@" and ".".
# The shortest kind of string it matches looks like: a@b.c
# (The original had `a@b.c` as a bare line; R evaluates that as slot access
# on an undefined object `a`, so it is kept here as a comment instead.)
str_view_all(d1$text, "\\w+@\\w+\\.\\w+")

# Read the file again so d1 holds the columns as stored in the file.
d1 = read_csv2("/tmp/test.csv")
d1

# Hashtags: "#" followed by word characters.
str_view_all(d1$text, "\\#\\w+")
# Mentions: "@word" delimited by a space or the start/end of the string.
str_view_all(d1$text, "( |^)\\@\\w+( |$)")

# ISO-style dates (dddd-dd-dd).
# NOTE(review): d1 was just re-read, so this needs a `timestamps` column in
# the file itself -- confirm.
str_view_all(d1$timestamps, "\\d\\d\\d\\d-\\d\\d-\\d\\d")

# Dotted dates, from loose to strict:
# one or two digits, ".", one or two digits, ".", two to four digits
str_view_all(d1$text, "\\d?\\d\\.\\d?\\d\\.\\d\\d\\d?\\d?")
# digit runs of any length (at least one digit each)
str_view_all(d1$text, "\\d+\\.\\d+\\.\\d+")
# digit runs that may be empty, so a bare ".." matches as well
str_view_all(d1$text, "\\d*\\.\\d*\\.\\d*")
# the last part is exactly two or exactly four digits
str_view_all(d1$text, "\\d?\\d\\.\\d?\\d\\.\\d\\d(\\d\\d)?")
# additionally restrict the optional leading digits: 0-3 for the first part,
# 0-2 for the second, and an optional two-digit prefix starting with 1 or 2
str_view_all(d1$text, "[0123]?\\d\\.[012]?\\d\\.([12]\\d)?\\d\\d")
# str_detect / str_count ----

# str_detect() returns one logical per string; use it directly or in filter().
str_detect(d1$text, regex_email)
d1 %>% filter(str_detect(text, regex_email))
d1 %>% filter(str_detect(text, "#\\w+"))
# \\b is a word boundary, so only the whole word "Bob" matches.
d1 %>% filter(str_detect(text, "\\bBob\\b"))

# str_count() returns the number of matches per string.
d1 %>% mutate(n = str_count(text, regex_email))

# NOTE(review): `d` is not created anywhere in this script; the code below
# needs a data frame with at least `text` and `President` columns to be
# loaded beforehand.

# Total number of "liberty" matches per president.
d %>%
  mutate(n = str_count(text, "liberty")) %>%
  group_by(President) %>%
  summarize(n = sum(n))
d %>% filter(str_detect(text, "terror"))

# Case-insensitive patterns for religion-related terms and for "terror".
religion = regex(
  "relig|christ|muslim|islam|hindu|hindi|buddh",
  ignore_case = TRUE
)
terror = regex("terror", ignore_case = TRUE)

# Count both patterns per text. Inside mutate(), `terror` on the right-hand
# side is the regex object defined above (unless `d` already has a column of
# that name); the new column of the same name is only created by this call.
x = d %>%
  mutate(
    terror = str_count(text, terror),
    muslim = str_count(text, religion)
  )

# Cross-tabulate the two counts and test their correlation overall ...
table(x$terror, x$muslim)
cor.test(x$terror, x$muslim)
# ... and per president, strongest correlation first.
x %>%
  group_by(President) %>%
  summarize(cor = cor(terror, muslim)) %>%
  arrange(desc(cor))
# str_replace ----

# Clean text in three steps: turn every non-letter into a space, collapse
# runs of spaces, then strip leading and trailing spaces.
# (The last step could also be written as str_trim(text_cleaned2).)
d1 %>%
  mutate(
    text_cleaned = str_replace_all(text, "[^A-Za-z]", " "),
    text_cleaned2 = str_replace_all(text_cleaned, " +", " "),
    text_cleaned3 = str_replace_all(text_cleaned2, "^ +| +$", "")
  )

# Redact the part of an email address before the "@": first by matching the
# "@" and putting it back, then with a look-ahead so the "@" itself is not
# part of the match.
d1 %>% mutate(redacted = str_replace_all(text, "\\w+@", "***@"))
d1 %>% mutate(redacted = str_replace_all(text, "\\w+(?=@)", "***"))

# Replace @mentions and #tags with a space. Look-behind / look-ahead keep the
# surrounding spaces out of the match.
d1 %>%
  mutate(redacted = str_replace_all(text, "(?<= |^)(@|#)\\w+(?= |$)", " "))

# Base R counterparts: gsub() for str_replace_all(), grepl() for
# str_detect() (grep() returns the matching indices instead).

# Compare what the two patterns match: with plain groups the surrounding
# spaces are included, with look-arounds they are not.
# (The original passed " " as a third argument, copied from
# str_replace_all(); in str_view_all() that position is `match`, which
# expects a logical, so the argument is dropped.)
str_view_all(d1$text, "( |^)(@|#)\\w+( |$)")
str_view_all(d1$text, "(?<= |^)(@|#)\\w+(?= |$)")
# str_extract ----

# Patterns: dotted dates, @mentions or #tags, and #tags only.
regex_date = regex("[0123]?\\d\\.[012]?\\d\\.([12]\\d)?\\d\\d")
tagsnmentions = regex("(?<= |^)(@|#)\\w+(?= |$)")
tags = regex("(?<= |^)(#)\\w+(?= |$)")

# str_extract() returns the first match per string (NA when there is none).
d1 %>% mutate(date = str_extract(text, regex_date))
d1 %>% mutate(email = str_extract(text, regex_email))
d1 %>% mutate(tags = str_extract(text, tags))
str_extract(d1$text, tags)

# str_extract_all(simplify = TRUE) returns a character matrix with one row
# per string and one column per match position, padded with "".
x = str_extract_all(d1$text, tags, simplify = TRUE)
# Bind the match columns to the data; they are then addressed as `1`, `2`, ...
x = cbind(d1, x)
class(x$`1`)

# Reshape the match columns to long format, drop the "" padding and count
# how often each tag occurs.
# NOTE(review): assumes `id` through `group` are exactly the original
# columns of d1, so everything outside that range is a match column.
x %>%
  pivot_longer(cols = -(id:group), names_to = "hitn", values_to = "tag") %>%
  filter(tag != "") %>%
  group_by(tag) %>%
  summarize(n = n())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment