## sastoudt/song_of_myself_sampling.R Secret

## song_of_myself_sampling.R
library(dplyr)
library(ggplot2)

## https://www.poetryfoundation.org/poems/45477/song-of-myself-1892-version
## https://docs.google.com/spreadsheets/d/1f7JiBosfa_ralv_Pzx-Q_2IjwIj-HtVupb1V0Cwq1co/edit?usp=sharing

# Load the poem; the rest of the script reads its `stanza` and `line` columns.
poem <- read.csv(file = "song_of_myself.csv")

#### EDA ####

# Number of lines in each stanza (one row per stanza).
byStanza <- summarise(group_by(poem, stanza), count = n())

# Histogram of stanza lengths, using geom_histogram()'s default binning.
ggplot(data = byStanza, mapping = aes(x = count)) +
  geom_histogram() +
  labs(
    x = "number of lines",
    title = "Walt Whitman's 'Song of Myself'",
    subtitle = "Stanza Lengths"
  ) +
  theme_minimal()

# Numeric summary of the stanza lengths.
summary(object = byStanza$count)

# Share of the poem's lines that falls in each stanza, as a plain unnamed
# numeric vector with one entry per row of `byStanza`.
# pull() extracts the column directly, so no data-frame-to-vector
# conversion chain is needed.
prob <- byStanza %>%
  mutate(prop = count / nrow(poem)) %>%
  pull(prop)


#### SRS ####

# Number of stanzas, taken as the largest stanza id.
# NOTE(review): assumes stanza ids are numbered consecutively from 1 --
# confirm against the CSV.
numStanzas <- max(poem$stanza)

## Two simple random samples of lines (without replacement), one per seed.
## Each is a poem that has the same number of lines as there were number of
## stanzas in original poem.
for (seed in c(112514, 112516)) {
  set.seed(seed)

  # sample.int(n, size) draws from 1..n without replacement by default and
  # yields the same draws as sample(1:n, size), without the 1:n pitfall of
  # sampling from c(1, 0) when n is 0.
  print(poem$line[sample.int(nrow(poem), numStanzas)])
}


#### stratified ####

## one line per stanza, drawn once for each seed
for (seed in c(112517, 112519)) {
  set.seed(seed)

  new_poem <- rep(NA, numStanzas)
  for (i in seq_len(numStanzas)) {
    this_stanza <- poem %>% filter(stanza == i)

    # Pick one row index uniformly at random. sample.int() errors clearly
    # on an empty stanza instead of sampling from c(1, 0) as 1:0 would.
    new_poem[i] <- this_stanza$line[sample.int(nrow(this_stanza), 1)]
  }

  # Explicit print(): values are not auto-printed inside a for loop.
  print(new_poem)
}


## lines proportional to stanza length

# Lines to draw from each stanza: its share of the poem scaled by 100 and
# rounded up, so every stanza contributes at least one line.
prop_weights <- ceiling(prob * 100)

# Total length of the resulting poem (not referenced elsewhere in the
# visible script).
longer_poem <- sum(prop_weights)

# Same allocation, drawn once for each seed.
for (seed in c(112521, 112525)) {
  set.seed(seed)

  # NOTE(review): prop_weights is indexed by position while stanzas are
  # matched by id; this assumes stanza ids are exactly 1..numStanzas --
  # confirm.
  picks <- lapply(seq_len(numStanzas), function(i) {
    this_stanza <- poem %>% filter(stanza == i)
    this_stanza$line[sample.int(nrow(this_stanza), prop_weights[i])]
  })

  # Flatten the per-stanza draws in stanza order instead of growing a
  # vector with c() on every iteration.
  new_poem <- unlist(picks)

  print(new_poem)
}


#### clustering ####

# Cluster sampling: pick whole stanzas at random and keep every line in them.
# Each row is one run: the RNG seed and the number of stanzas to draw.
cluster_runs <- data.frame(
  seed = c(112527, 112527, 112528),
  size = c(5, 25, 25)
)

for (r in seq_len(nrow(cluster_runs))) {
  set.seed(cluster_runs$seed[r])
  numClusters <- cluster_runs$size[r]

  # Draw the stanza ids once, outside filter(), so the random draw does not
  # depend on how filter() evaluates its condition.
  chosen <- sample.int(numStanzas, numClusters)

  # c() turns the one-column data frame into a list holding the `line`
  # vector; print() is explicit because loops do not auto-print.
  print(
    poem %>%
      filter(stanza %in% chosen) %>%
      select(line) %>%
      c()
  )
}


#### systematic ####

# Systematic sampling: every k-th line of the poem.
k <- 50

# Last index considered: the largest multiple of k that fits in the poem.
# Derived from k rather than a literal 50 so the bound follows the step size.
end_line <- floor(nrow(poem) / k) * k

# Same step, two different starting lines.
for (start in c(50, 1)) {
  print(poem$line[seq(start, end_line, by = k)])
}


#### multistage ####

# cluster, then SRS

# Lines drawn from every sampled stanza: the length of the shortest stanza,
# so the draw without replacement is possible in any stanza.
numLines <- min(byStanza$count)

# Each row is one run: the RNG seed and the number of stanzas to draw.
multistage_runs <- data.frame(
  seed = c(112535, 112535, 112539),
  size = c(5, 50, 50)
)

for (r in seq_len(nrow(multistage_runs))) {
  set.seed(multistage_runs$seed[r])
  numClusters <- multistage_runs$size[r]

  # Stage 1: choose which stanzas (clusters) to use.
  sampled_stanza <- sample.int(numStanzas, numClusters)

  # Stage 2: simple random sample of numLines lines within each chosen
  # stanza, visited in the order the stanzas were drawn.
  picks <- lapply(sampled_stanza, function(i) {
    this_stanza <- poem %>% filter(stanza == i)
    this_stanza$line[sample.int(nrow(this_stanza), numLines)]
  })

  # Flatten once instead of growing a vector with c() inside the loop.
  new_poem <- unlist(picks)

  print(new_poem)
}
	library(dplyr)
	library(ggplot2)

	## https://www.poetryfoundation.org/poems/45477/song-of-myself-1892-version
	## https://docs.google.com/spreadsheets/d/1f7JiBosfa_ralv_Pzx-Q_2IjwIj-HtVupb1V0Cwq1co/edit?usp=sharing

	# NOTE(review): from the library() calls just above to the end of the
	# file, the script repeats the earlier, unindented script verbatim
	# (tab-indented) -- presumably a paste artifact; confirm before removing.
	# Load the poem; the rest of the script reads its `stanza` and `line` columns.
	poem <- read.csv(file = "song_of_myself.csv")

	#### EDA ####

	# Number of lines in each stanza (one row per stanza).
	byStanza <- summarise(group_by(poem, stanza), count = n())

	# Histogram of stanza lengths, using geom_histogram()'s default binning.
	ggplot(data = byStanza, mapping = aes(x = count)) +
	  geom_histogram() +
	  labs(
	    x = "number of lines",
	    title = "Walt Whitman's 'Song of Myself'",
	    subtitle = "Stanza Lengths"
	  ) +
	  theme_minimal()

	# Numeric summary of the stanza lengths.
	summary(object = byStanza$count)

	# Share of the poem's lines that falls in each stanza, as a plain unnamed
	# numeric vector with one entry per row of `byStanza`.
	# pull() extracts the column directly, so no data-frame-to-vector
	# conversion chain is needed.
	prob <- byStanza %>%
	  mutate(prop = count / nrow(poem)) %>%
	  pull(prop)


	#### SRS ####

	# Number of stanzas, taken as the largest stanza id.
	# NOTE(review): assumes stanza ids are numbered consecutively from 1 --
	# confirm against the CSV.
	numStanzas <- max(poem$stanza)

	## Two simple random samples of lines (without replacement), one per seed.
	## Each is a poem that has the same number of lines as there were number of
	## stanzas in original poem.
	for (seed in c(112514, 112516)) {
	  set.seed(seed)

	  # sample.int(n, size) draws from 1..n without replacement by default and
	  # yields the same draws as sample(1:n, size), without the 1:n pitfall of
	  # sampling from c(1, 0) when n is 0.
	  print(poem$line[sample.int(nrow(poem), numStanzas)])
	}



	#### stratified ####

	## one line per stanza, drawn once for each seed
	for (seed in c(112517, 112519)) {
	  set.seed(seed)

	  new_poem <- rep(NA, numStanzas)
	  for (i in seq_len(numStanzas)) {
	    this_stanza <- poem %>% filter(stanza == i)

	    # Pick one row index uniformly at random. sample.int() errors clearly
	    # on an empty stanza instead of sampling from c(1, 0) as 1:0 would.
	    new_poem[i] <- this_stanza$line[sample.int(nrow(this_stanza), 1)]
	  }

	  # Explicit print(): values are not auto-printed inside a for loop.
	  print(new_poem)
	}


	## lines proportional to stanza length

	# Lines to draw from each stanza: its share of the poem scaled by 100 and
	# rounded up, so every stanza contributes at least one line.
	prop_weights <- ceiling(prob * 100)

	# Total length of the resulting poem (not referenced elsewhere in the
	# visible script).
	longer_poem <- sum(prop_weights)

	# Same allocation, drawn once for each seed.
	for (seed in c(112521, 112525)) {
	  set.seed(seed)

	  # NOTE(review): prop_weights is indexed by position while stanzas are
	  # matched by id; this assumes stanza ids are exactly 1..numStanzas --
	  # confirm.
	  picks <- lapply(seq_len(numStanzas), function(i) {
	    this_stanza <- poem %>% filter(stanza == i)
	    this_stanza$line[sample.int(nrow(this_stanza), prop_weights[i])]
	  })

	  # Flatten the per-stanza draws in stanza order instead of growing a
	  # vector with c() on every iteration.
	  new_poem <- unlist(picks)

	  print(new_poem)
	}


	#### clustering ####

	# Cluster sampling: pick whole stanzas at random and keep every line in them.
	# Each row is one run: the RNG seed and the number of stanzas to draw.
	cluster_runs <- data.frame(
	  seed = c(112527, 112527, 112528),
	  size = c(5, 25, 25)
	)

	for (r in seq_len(nrow(cluster_runs))) {
	  set.seed(cluster_runs$seed[r])
	  numClusters <- cluster_runs$size[r]

	  # Draw the stanza ids once, outside filter(), so the random draw does not
	  # depend on how filter() evaluates its condition.
	  chosen <- sample.int(numStanzas, numClusters)

	  # c() turns the one-column data frame into a list holding the `line`
	  # vector; print() is explicit because loops do not auto-print.
	  print(
	    poem %>%
	      filter(stanza %in% chosen) %>%
	      select(line) %>%
	      c()
	  )
	}



	#### systematic ####

	# Systematic sampling: every k-th line of the poem.
	k <- 50

	# Last index considered: the largest multiple of k that fits in the poem.
	# Derived from k rather than a literal 50 so the bound follows the step size.
	end_line <- floor(nrow(poem) / k) * k

	# Same step, two different starting lines.
	for (start in c(50, 1)) {
	  print(poem$line[seq(start, end_line, by = k)])
	}



	#### multistage ####

	# cluster, then SRS

	# Lines drawn from every sampled stanza: the length of the shortest stanza,
	# so the draw without replacement is possible in any stanza.
	numLines <- min(byStanza$count)

	# Each row is one run: the RNG seed and the number of stanzas to draw.
	multistage_runs <- data.frame(
	  seed = c(112535, 112535, 112539),
	  size = c(5, 50, 50)
	)

	for (r in seq_len(nrow(multistage_runs))) {
	  set.seed(multistage_runs$seed[r])
	  numClusters <- multistage_runs$size[r]

	  # Stage 1: choose which stanzas (clusters) to use.
	  sampled_stanza <- sample.int(numStanzas, numClusters)

	  # Stage 2: simple random sample of numLines lines within each chosen
	  # stanza, visited in the order the stanzas were drawn.
	  picks <- lapply(sampled_stanza, function(i) {
	    this_stanza <- poem %>% filter(stanza == i)
	    this_stanza$line[sample.int(nrow(this_stanza), numLines)]
	  })

	  # Flatten once instead of growing a vector with c() inside the loop.
	  new_poem <- unlist(picks)

	  print(new_poem)
	}