Skip to content

Instantly share code, notes, and snippets.

@sastoudt
Last active November 3, 2023 16:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sastoudt/ef5d5f6e99cd9d6be57cd5df36f17873 to your computer and use it in GitHub Desktop.
Save sastoudt/ef5d5f6e99cd9d6be57cd5df36f17873 to your computer and use it in GitHub Desktop.
Different sampling techniques to generate poems based on Walt Whitman's 'Song of Myself'
library(dplyr)
library(ggplot2)
## https://www.poetryfoundation.org/poems/45477/song-of-myself-1892-version
## https://docs.google.com/spreadsheets/d/1f7JiBosfa_ralv_Pzx-Q_2IjwIj-HtVupb1V0Cwq1co/edit?usp=sharing
# Load the poem. The rest of the script reads two columns from this data
# frame: `stanza` (stanza id) and `line` (text of the line).
poem <- read.csv("song_of_myself.csv")

#### EDA ####
# One row per stanza; `count` is the number of lines in that stanza.
byStanza <- poem %>%
  group_by(stanza) %>%
  summarise(count = n())

# Histogram of stanza lengths (default binning).
ggplot(data = byStanza, mapping = aes(x = count)) +
  geom_histogram() +
  labs(
    x = "number of lines",
    title = "Walt Whitman's 'Song of Myself'",
    subtitle = "Stanza Lengths"
  ) +
  theme_minimal()

# Five-number summary (plus mean) of the stanza lengths.
summary(byStanza$count)

# Share of the poem's lines that falls in each stanza, as a plain unnamed
# numeric vector in the same row order as `byStanza`.
prob <- byStanza$count / nrow(poem)
#### SRS ####
# Simple random sample: every line of the poem is equally likely to be
# drawn, ignoring stanza boundaries.

# Number of stanzas, taken as the largest stanza id. This equals the stanza
# count only if ids run 1..max without gaps -- TODO confirm against the CSV.
numStanzas <- max(poem$stanza)

# Draw `n_lines` distinct elements of `lines` uniformly at random and return
# them in the order drawn. Uses the global RNG state, so call set.seed()
# first for a reproducible result.
srs_lines <- function(lines, n_lines) {
  lines[sample(seq_along(lines), n_lines, replace = FALSE)]
}

## a poem with as many lines as the original poem has stanzas
set.seed(112514)
srs_lines(poem$line, numStanzas)

## same design, different seed
set.seed(112516)
srs_lines(poem$line, numStanzas)
#### stratified ####
# Stratified sample: each stanza is a stratum and exactly one line is drawn
# from every stratum, so the new poem has one line per original stanza.

# Return a character vector of length `num_stanzas` whose i-th element is a
# randomly chosen line from stanza i of `poem` (columns `stanza` and `line`).
# Stanzas are visited in order 1..num_stanzas, one draw per stanza, using the
# global RNG state. A stanza id with no rows raises an error from sample()
# instead of silently indexing rows 1 and 0.
sample_one_line_per_stanza <- function(poem, num_stanzas) {
  vapply(
    seq_len(num_stanzas),
    function(i) {
      stanza_lines <- poem$line[which(poem$stanza == i)]
      stanza_lines[sample(seq_along(stanza_lines), 1, replace = FALSE)]
    },
    character(1)
  )
}

## one line per stanza
set.seed(112517)
new_poem <- sample_one_line_per_stanza(poem, numStanzas)
new_poem

## one line per stanza, different seed
set.seed(112519)
new_poem <- sample_one_line_per_stanza(poem, numStanzas)
new_poem
## lines proportional to stanza length
# Stratified sample with proportional allocation: longer stanzas contribute
# more lines to the new poem.

# Lines to draw from each stanza: the stanza's share of the poem (`prob`,
# computed in the EDA section) scaled to a poem of roughly 100 lines and
# rounded up, so every stanza contributes at least one line.
prop_weights <- ceiling(prob * 100)

# Total number of lines the allocation asks for.
longer_poem <- sum(prop_weights)

# Draw `weights[i]` distinct lines from stanza i, for i in 1..num_stanzas,
# and return them concatenated in stanza order. A request larger than the
# stanza is capped at the stanza's size (the whole stanza is taken) rather
# than failing inside sample(). Uses the global RNG state.
sample_proportional <- function(poem, num_stanzas, weights) {
  pieces <- lapply(seq_len(num_stanzas), function(i) {
    stanza_lines <- poem$line[which(poem$stanza == i)]
    n_draw <- min(weights[i], length(stanza_lines))
    stanza_lines[sample(seq_along(stanza_lines), n_draw, replace = FALSE)]
  })
  unlist(pieces)
}

set.seed(112521)
new_poem <- sample_proportional(poem, numStanzas, prop_weights)
new_poem

# same allocation, different seed
set.seed(112525)
new_poem <- sample_proportional(poem, numStanzas, prop_weights)
new_poem
#### clustering ####
# Cluster sample: stanzas are the clusters. A few stanzas are picked at
# random and every line of each picked stanza is kept.

# Pick `num_clusters` distinct stanza ids from 1..num_stanzas and return the
# lines of those stanzas as a one-element list (`$line`). Rows come back in
# the poem's original order, not in the order the stanzas were drawn,
# because filter() does not reorder rows. Uses the global RNG state.
sample_stanza_clusters <- function(poem, num_stanzas, num_clusters) {
  chosen <- sample(seq_len(num_stanzas), num_clusters, replace = FALSE)
  poem %>%
    filter(stanza %in% chosen) %>%
    select(line) %>%
    c()
}

set.seed(112527)
numClusters <- 5
sample_stanza_clusters(poem, numStanzas, numClusters)

# same seed as above, but with more clusters
set.seed(112527)
numClusters <- 25
sample_stanza_clusters(poem, numStanzas, numClusters)

set.seed(112528)
numClusters <- 25
sample_stanza_clusters(poem, numStanzas, numClusters)
#### systematic ####
# Systematic sample: take every k-th line of the poem, beginning at line
# `start`.

# Sampling interval.
k <- 50

# Last line index considered: the largest multiple of k that fits in the
# poem. Derived from k (not a literal 50) so that changing the interval
# keeps the end point consistent. With 1 <= start <= k, every start yields
# the same number of sampled lines.
end_line <- floor(nrow(poem) / k) * k

# lines 50, 100, 150, ...
start <- 50
poem$line[seq(start, end_line, by = k)]

# lines 1, 51, 101, ...
start <- 1
poem$line[seq(start, end_line, by = k)]
#### multistage ####
# cluster, then SRS
# Two-stage sample: first pick whole stanzas at random (the clusters), then
# draw a simple random sample of lines inside each picked stanza.

# Lines drawn from each sampled stanza. Using the length of the shortest
# stanza guarantees the second-stage draw never asks for more lines than a
# stanza contains.
numLines <- min(byStanza$count)

# Second stage: for each id in `stanza_ids` (in the given order) draw
# `num_lines` distinct lines from that stanza of `poem`, and return all
# draws concatenated. A stanza id with no rows raises an error from sample()
# instead of silently indexing rows 1 and 0. Uses the global RNG state.
sample_within_stanzas <- function(poem, stanza_ids, num_lines) {
  pieces <- lapply(stanza_ids, function(i) {
    stanza_lines <- poem$line[which(poem$stanza == i)]
    stanza_lines[sample(seq_along(stanza_lines), num_lines, replace = FALSE)]
  })
  unlist(pieces)
}

set.seed(112535)
numClusters <- 5
sampled_stanza <- sample(seq_len(numStanzas), numClusters, replace = FALSE)
new_poem <- sample_within_stanzas(poem, sampled_stanza, numLines)
new_poem

# same seed as above, but with more clusters
set.seed(112535)
numClusters <- 50
sampled_stanza <- sample(seq_len(numStanzas), numClusters, replace = FALSE)
new_poem <- sample_within_stanzas(poem, sampled_stanza, numLines)
new_poem

set.seed(112539)
numClusters <- 50
sampled_stanza <- sample(seq_len(numStanzas), numClusters, replace = FALSE)
new_poem <- sample_within_stanzas(poem, sampled_stanza, numLines)
new_poem
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment