Skip to content

Instantly share code, notes, and snippets.

@csiu
Created April 27, 2017 08:04
Show Gist options
  • Save csiu/3b2f792de6f41b80e6889921f3d3715e to your computer and use it in GitHub Desktop.
Save csiu/3b2f792de6f41b80e6889921f3d3715e to your computer and use it in GitHub Desktop.
day61: Word position
#' ---
#' output:
#'   html_document:
#'     keep_md: yes
#' ---
#+ message=FALSE
library(dplyr)
library(ggplot2)
library(readr)
library(tidyr)
# Load the documents and discard every row whose processed text is missing.
docs <- filter(read_csv("day61.txt"), !is.na(doc_processed))

# Number of documents remaining once the missing ones are dropped.
num_rows <- nrow(docs)
# Reshape `docs` to one row per word.
#
# `doc_processed` is split on single spaces into a list column, unnested so
# each word gets its own row, and renamed to `word`. `word_position` is the
# 1-based index of the word within its `id` group.
docs <-
  docs %>%
  mutate(doc_processed = strsplit(doc_processed, split = " ")) %>%
  unnest(doc_processed) %>%
  group_by(id) %>%
  mutate(word_position = row_number()) %>%
  # Drop the grouping so later steps never inherit it by accident; the
  # steps that need per-`id` grouping request it explicitly.
  ungroup() %>%
  rename(word = doc_processed)
# Median relative position of every word, plus how often the word occurs.
#
# A word's relative position is its index within its `id` group divided by
# the number of words in that group, so it lies in (0, 1] and 1 marks the
# last word.
word_averages <-
  docs %>%
  group_by(id) %>%
  mutate(relative_position = row_number() / n()) %>%
  group_by(word) %>%
  summarise(
    median_position = median(relative_position),
    number = n()
  )
# Plot each sufficiently common word at its median relative position,
# coloured by whether that median falls before or after the midpoint.

# Minimum number of occurrences a word needs to be plotted. The same value
# feeds the filter and the subtitle so the two cannot disagree.
min_uses <- 25

word_averages %>%
  # Remove rare words
  filter(number >= min_uses) %>%
  arrange(median_position) %>%
  mutate(
    # Order the y axis so the words with the smallest median sit at the top
    word = reorder(word, -median_position),
    direction = ifelse(median_position < .5, "Beginning", "End")
  ) %>%
  ggplot(aes(median_position, word, color = direction)) +
  geom_point(size = 5) +
  # Zero-height bar: a plain line from the midpoint to the word's median
  geom_errorbarh(aes(xmin = .5, xmax = median_position), height = 0) +
  geom_vline(xintercept = .5, lty = 2) +
  scale_x_continuous(labels = scales::percent_format()) +
  expand_limits(x = c(0, 1)) +
  labs(
    x = "Median position of word within project title",
    y = "",
    title = "Words most shifted towards the beginning or end of a project title",
    # Built from `min_uses` and `num_rows` (the document count taken right
    # after loading) instead of hard-coded numbers, so the caption stays
    # correct when the input data changes.
    subtitle = paste0(
      "Of words with at least ", min_uses, " uses across ", num_rows,
      " 'Documentary' type projects"
    ),
    color = ""
  )
#' Plotting code from
#' https://github.com/dgrtwo/dgrtwo.github.com/blob/master/_R/2017-04-26-tidytext-plots.Rmd
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment