# roblanf/words.R

## words.R
# Load necessary libraries
library(dplyr)
library(lexicon)
library(visNetwork)

# Step 1: List most common three-letter English words

# Load the sw_fry_1000 word list (expected to come from the lexicon package)
data(sw_fry_1000)

# Keep only the three-character entries, normalised to lowercase
three_letter_words <- tolower(sw_fry_1000[nchar(sw_fry_1000) == 3])

# Hand-picked extras -- add other words as you like
three_letter_words <- c(
  three_letter_words,
  c(
    "rat", "not", "bag", "hag", "bet", "bum", "web", "its", "day", "two", "sex",
    "set", "map", "way", "off", "did", "car", "dvd", "per", "usa", "non", "why",
    "gay", "air", "hot", "say", "tax", "got", "act", "red", "key", "far", "win",
    "bed", "ass", "sea", "cut", "kit", "boy", "son", "bug", "mid", "lcd", "hiv",
    "kim", "lie", "eve", "rod", "sad", "ban", "rid", "rip", "pub", "dna", "eat",
    "gun", "sam", "tie", "ron", "rob", "gap", "sms", "cow", "lip", "sap", "dis",
    "fwd", "mud", "cop", "sir", "wet", "ram", "fox", "hop", "kid", "nfl", "sum",
    "vat", "hat", "ray", "ice", "yet", "sow", "bow", "low"
  )
)

# Further common words (list suggested by ChatGPT)
additional_words <- c(
  "and", "the", "for", "are", "but", "you", "all", "can", "was", "her", "she",
  "him", "had", "his", "out", "get", "man", "old", "now", "use", "how", "our",
  "may", "see", "big", "end", "any", "let", "put", "own", "yes", "too", "bad",
  "dog", "cat", "pig", "ant", "bee", "fly", "owl", "bat", "sun", "sky", "dry",
  "top", "pay", "buy", "ask", "dig", "fix", "mix", "row", "hit", "fit", "sit",
  "has", "nor", "mom", "sis", "fun", "joy", "box", "cup", "pen", "rug", "bus",
  "van", "cab", "gas", "oil", "add", "sum", "law", "art", "god", "pie", "egg",
  "elf", "fan", "fat", "fog", "gel", "gig", "gym", "hip", "hug", "ink", "jam",
  "jar", "jet", "jog", "jug", "lap", "log", "mad", "mat", "nap", "nod", "nut",
  "pad", "pan", "pat", "peg", "pet", "pin", "pit", "pop", "pot", "rag", "ran",
  "rap", "rig", "rim", "rot", "row", "rub", "sat", "set", "sew", "shy", "sip",
  "six", "ski", "sob", "tan", "tap", "tea", "ten", "tin", "tip", "toe", "ton",
  "top", "toy", "try", "tub", "tug", "vet", "via", "vow", "war", "wax", "wig",
  "wow", "yak", "yam", "zip", "zoo"
)

# The combined list may contain repeats (some words appear in several sources)
three_letter_words <- c(three_letter_words, additional_words)

# De-duplicated vocabulary used for membership lookups
word_set <- unique(three_letter_words)

# Step 2: For each ordered pair of words (Word1, Word2), take characters 3-5 of
# the concatenation Word1 + Word2 -- for three-letter words that is the last
# letter of Word1 followed by the first two letters of Word2 -- and keep the
# pair when that overlap is itself a member of word_set.
#
# The pairs are evaluated in one vectorised pass rather than by growing a data
# frame with rbind() inside a double loop. Candidates are de-duplicated first
# so that words listed more than once do not yield repeated rows, and the
# overlap column is consistently named ShiftedWords.
candidate_words <- unique(three_letter_words)

# expand.grid() varies its first argument fastest, so listing Word2 first keeps
# the rows ordered by Word1 and then by Word2.
pair_grid <- expand.grid(
  Word2 = candidate_words,
  Word1 = candidate_words,
  KEEP.OUT.ATTRS = FALSE,
  stringsAsFactors = FALSE
)
pair_grid$ShiftedWords <- substr(
  paste0(pair_grid$Word1, pair_grid$Word2), 3, 5
)

# Keep only the pairs whose overlap is a known word, in Word1/Word2/overlap
# column order, with fresh sequential row names.
results <- pair_grid[
  pair_grid$ShiftedWords %in% word_set,
  c("Word1", "Word2", "ShiftedWords")
]
rownames(results) <- NULL

# Step 3: Print the table of valid pairs
print(results)

# Summarise, per leading word, which words can follow it:
#   follow_on - the distinct follow-on words joined with ", "
#   fN        - how many distinct follow-on words there are
follow_on_df <- results %>%
  distinct(Word1, Word2) %>%
  group_by(Word1) %>%
  summarise(
    follow_on = paste(Word2, collapse = ", "),
    fN = n()
  )


# Interactive network graph of which words can follow which


# Assuming follow_on_df is already created
# Example:
# follow_on_df <- data.frame(
#   Word1 = c("cat", "dog"),
#   follow_on = c("and, bat, mat", "cat, bat"),
#   fN = c(3, 2),
#   stringsAsFactors = FALSE
# )

# Collect every word that appears in follow_on_df, either as a leading word
# (Word1) or inside a comma-separated follow_on string
source_words <- follow_on_df[["Word1"]]
target_words <- unlist(strsplit(follow_on_df[["follow_on"]], ", "))
unique_words <- unique(c(source_words, target_words))

library(visNetwork)
library(dplyr)

# Sample 'follow_on_df' data frame
# Replace this with your actual data
# Example:
# follow_on_df <- data.frame(
#   Word1 = c("tea", "and", "cat"),
#   follow_on = c("and, the, bag", "cat, dog", "hat, run"),
#   stringsAsFactors = FALSE
# )

# Graph nodes: one per distinct word found in follow_on_df (as a leading word
# or as a follow-on word). The word serves as both the node id and its label.
node_words <- unique(c(
  follow_on_df$Word1,
  unlist(strsplit(follow_on_df$follow_on, ", "))
))
nodes <- data.frame(
  id = node_words,
  label = node_words,
  stringsAsFactors = FALSE
)

# Build the edge table: one row per (word, follow-on word) pair, pointing from
# each Word1 to every word listed in its comma-separated follow_on string.
#
# This is done in a single vectorised step instead of rbind()-ing inside a
# `1:nrow()` loop, which would iterate over c(1, 0) and emit a spurious NA edge
# when follow_on_df has no rows.
follow_on_lists <- strsplit(follow_on_df$follow_on, ", ", fixed = TRUE)

# as.character() keeps the column present (as character(0)) when there are no
# follow-on words at all; unlist() alone would return NULL in that case.
edge_targets <- as.character(unlist(follow_on_lists))

edges <- data.frame(
  # Repeat each leading word once per follow-on word it has
  from = rep(follow_on_df$Word1, times = lengths(follow_on_lists)),
  to = edge_targets,
  # Default colour assigned to every edge
  color = rep("grey", length(edge_targets)),
  stringsAsFactors = FALSE
)

# Optionally, assign unique IDs to edges if you have multiple identical edges
# edges$id <- 1:nrow(edges)

# Build the visNetwork graph

# JavaScript run when a node is selected: edges leaving the selected node are
# coloured green, every other edge is set to grey, and the view is then fitted
# to the selected node with a 500 ms animation.
select_node_js <- "function(properties) {
      var nodeId = properties.nodes[0];

      // Get all edges connected to the selected node
      var allEdges = this.body.data.edges.get();

      // Iterate through all edges
      allEdges.forEach(function(edge) {
        if(edge.from === nodeId) {
          // Highlight outgoing edges in green
          edge.color = 'green';
        } else {
          // Reset other edges to default color
          edge.color = 'grey';
        }
        // Update the edge in the network
        this.body.data.edges.update(edge);
      }.bind(this));

      // Optionally, you can focus on the selected node
      this.fit({
        nodes: [nodeId],
        animation: {duration: 500, easingFunction: 'easeInOutQuad'}
      });
    }"

# JavaScript run when a node is deselected: every edge goes back to grey.
deselect_node_js <- "function(properties) {
      // Reset all edges to default color when no node is selected
      var allEdges = this.body.data.edges.get();

      allEdges.forEach(function(edge) {
        edge.color = 'grey';
        this.body.data.edges.update(edge);
      }.bind(this));
    }"

# Label font and fill/border/highlight colours shared by all nodes
node_font <- list(
  color = "black",
  size = 20,
  face = "arial",
  align = "center",
  vadjust = 0
)
node_palette <- list(
  background = "lightblue",
  border = "darkblue",
  highlight = "orange"
)

# Assemble the widget. The pipeline is left unassigned so that its value is
# the result of this top-level expression, exactly as before.
visNetwork(nodes, edges) %>%
  # Lay the nodes out with igraph's "layout_with_fr" algorithm
  visIgraphLayout(layout = "layout_with_fr") %>%
  # Directed arrows; inherit = FALSE lets each edge use its own colour
  visEdges(arrows = "to", color = list(inherit = FALSE)) %>%
  # Enable physics for node movement
  visPhysics(enabled = TRUE) %>%
  visNodes(shape = "circle", font = node_font, color = node_palette) %>%
  visOptions(highlightNearest = TRUE, nodesIdSelection = TRUE) %>%
  visEvents(selectNode = select_node_js, deselectNode = deselect_node_js)
	# Load necessary libraries
	library(dplyr)
	library(lexicon)
	library(visNetwork)

	# Step 1: List most common three-letter English words

	# Load the sw_fry_1000 word list (expected to come from the lexicon package)
	data(sw_fry_1000)

	# Keep only the three-character entries, normalised to lowercase
	three_letter_words <- tolower(sw_fry_1000[nchar(sw_fry_1000) == 3])

	# Hand-picked extras -- add other words as you like
	three_letter_words <- c(
	  three_letter_words,
	  c(
	    "rat", "not", "bag", "hag", "bet", "bum", "web", "its", "day", "two", "sex",
	    "set", "map", "way", "off", "did", "car", "dvd", "per", "usa", "non", "why",
	    "gay", "air", "hot", "say", "tax", "got", "act", "red", "key", "far", "win",
	    "bed", "ass", "sea", "cut", "kit", "boy", "son", "bug", "mid", "lcd", "hiv",
	    "kim", "lie", "eve", "rod", "sad", "ban", "rid", "rip", "pub", "dna", "eat",
	    "gun", "sam", "tie", "ron", "rob", "gap", "sms", "cow", "lip", "sap", "dis",
	    "fwd", "mud", "cop", "sir", "wet", "ram", "fox", "hop", "kid", "nfl", "sum",
	    "vat", "hat", "ray", "ice", "yet", "sow", "bow", "low"
	  )
	)

	# Further common words (list suggested by ChatGPT)
	additional_words <- c(
	  "and", "the", "for", "are", "but", "you", "all", "can", "was", "her", "she",
	  "him", "had", "his", "out", "get", "man", "old", "now", "use", "how", "our",
	  "may", "see", "big", "end", "any", "let", "put", "own", "yes", "too", "bad",
	  "dog", "cat", "pig", "ant", "bee", "fly", "owl", "bat", "sun", "sky", "dry",
	  "top", "pay", "buy", "ask", "dig", "fix", "mix", "row", "hit", "fit", "sit",
	  "has", "nor", "mom", "sis", "fun", "joy", "box", "cup", "pen", "rug", "bus",
	  "van", "cab", "gas", "oil", "add", "sum", "law", "art", "god", "pie", "egg",
	  "elf", "fan", "fat", "fog", "gel", "gig", "gym", "hip", "hug", "ink", "jam",
	  "jar", "jet", "jog", "jug", "lap", "log", "mad", "mat", "nap", "nod", "nut",
	  "pad", "pan", "pat", "peg", "pet", "pin", "pit", "pop", "pot", "rag", "ran",
	  "rap", "rig", "rim", "rot", "row", "rub", "sat", "set", "sew", "shy", "sip",
	  "six", "ski", "sob", "tan", "tap", "tea", "ten", "tin", "tip", "toe", "ton",
	  "top", "toy", "try", "tub", "tug", "vet", "via", "vow", "war", "wax", "wig",
	  "wow", "yak", "yam", "zip", "zoo"
	)

	# The combined list may contain repeats (some words appear in several sources)
	three_letter_words <- c(three_letter_words, additional_words)

	# De-duplicated vocabulary used for membership lookups
	word_set <- unique(three_letter_words)

	# Step 2: For each ordered pair of words (Word1, Word2), take characters 3-5 of
	# the concatenation Word1 + Word2 -- for three-letter words that is the last
	# letter of Word1 followed by the first two letters of Word2 -- and keep the
	# pair when that overlap is itself a member of word_set.
	#
	# The pairs are evaluated in one vectorised pass rather than by growing a data
	# frame with rbind() inside a double loop. Candidates are de-duplicated first
	# so that words listed more than once do not yield repeated rows, and the
	# overlap column is consistently named ShiftedWords.
	candidate_words <- unique(three_letter_words)

	# expand.grid() varies its first argument fastest, so listing Word2 first keeps
	# the rows ordered by Word1 and then by Word2.
	pair_grid <- expand.grid(
	  Word2 = candidate_words,
	  Word1 = candidate_words,
	  KEEP.OUT.ATTRS = FALSE,
	  stringsAsFactors = FALSE
	)
	pair_grid$ShiftedWords <- substr(
	  paste0(pair_grid$Word1, pair_grid$Word2), 3, 5
	)

	# Keep only the pairs whose overlap is a known word, in Word1/Word2/overlap
	# column order, with fresh sequential row names.
	results <- pair_grid[
	  pair_grid$ShiftedWords %in% word_set,
	  c("Word1", "Word2", "ShiftedWords")
	]
	rownames(results) <- NULL

	# Step 3: Print the table of valid pairs
	print(results)

	# Summarise, per leading word, which words can follow it:
	#   follow_on - the distinct follow-on words joined with ", "
	#   fN        - how many distinct follow-on words there are
	follow_on_df <- results %>%
	  distinct(Word1, Word2) %>%
	  group_by(Word1) %>%
	  summarise(
	    follow_on = paste(Word2, collapse = ", "),
	    fN = n()
	  )



	# Interactive network graph of which words can follow which


	# Assuming follow_on_df is already created
	# Example:
	# follow_on_df <- data.frame(
	# Word1 = c("cat", "dog"),
	# follow_on = c("and, bat, mat", "cat, bat"),
	# fN = c(3, 2),
	# stringsAsFactors = FALSE
	# )

	# Collect every word that appears in follow_on_df, either as a leading word
	# (Word1) or inside a comma-separated follow_on string
	source_words <- follow_on_df[["Word1"]]
	target_words <- unlist(strsplit(follow_on_df[["follow_on"]], ", "))
	unique_words <- unique(c(source_words, target_words))

	library(visNetwork)
	library(dplyr)

	# Sample 'follow_on_df' data frame
	# Replace this with your actual data
	# Example:
	# follow_on_df <- data.frame(
	# Word1 = c("tea", "and", "cat"),
	# follow_on = c("and, the, bag", "cat, dog", "hat, run"),
	# stringsAsFactors = FALSE
	# )

	# Graph nodes: one per distinct word found in follow_on_df (as a leading word
	# or as a follow-on word). The word serves as both the node id and its label.
	node_words <- unique(c(
	  follow_on_df$Word1,
	  unlist(strsplit(follow_on_df$follow_on, ", "))
	))
	nodes <- data.frame(
	  id = node_words,
	  label = node_words,
	  stringsAsFactors = FALSE
	)

	# Build the edge table: one row per (word, follow-on word) pair, pointing from
	# each Word1 to every word listed in its comma-separated follow_on string.
	#
	# This is done in a single vectorised step instead of rbind()-ing inside a
	# `1:nrow()` loop, which would iterate over c(1, 0) and emit a spurious NA edge
	# when follow_on_df has no rows.
	follow_on_lists <- strsplit(follow_on_df$follow_on, ", ", fixed = TRUE)

	# as.character() keeps the column present (as character(0)) when there are no
	# follow-on words at all; unlist() alone would return NULL in that case.
	edge_targets <- as.character(unlist(follow_on_lists))

	edges <- data.frame(
	  # Repeat each leading word once per follow-on word it has
	  from = rep(follow_on_df$Word1, times = lengths(follow_on_lists)),
	  to = edge_targets,
	  # Default colour assigned to every edge
	  color = rep("grey", length(edge_targets)),
	  stringsAsFactors = FALSE
	)

	# Optionally, assign unique IDs to edges if you have multiple identical edges
	# edges$id <- 1:nrow(edges)

	# Build the visNetwork graph

	# JavaScript run when a node is selected: edges leaving the selected node are
	# coloured green, every other edge is set to grey, and the view is then fitted
	# to the selected node with a 500 ms animation.
	select_node_js <- "function(properties) {
	var nodeId = properties.nodes[0];

	// Get all edges connected to the selected node
	var allEdges = this.body.data.edges.get();

	// Iterate through all edges
	allEdges.forEach(function(edge) {
	if(edge.from === nodeId) {
	// Highlight outgoing edges in green
	edge.color = 'green';
	} else {
	// Reset other edges to default color
	edge.color = 'grey';
	}
	// Update the edge in the network
	this.body.data.edges.update(edge);
	}.bind(this));

	// Optionally, you can focus on the selected node
	this.fit({
	nodes: [nodeId],
	animation: {duration: 500, easingFunction: 'easeInOutQuad'}
	});
	}"

	# JavaScript run when a node is deselected: every edge goes back to grey.
	deselect_node_js <- "function(properties) {
	// Reset all edges to default color when no node is selected
	var allEdges = this.body.data.edges.get();

	allEdges.forEach(function(edge) {
	edge.color = 'grey';
	this.body.data.edges.update(edge);
	}.bind(this));
	}"

	# Label font and fill/border/highlight colours shared by all nodes
	node_font <- list(
	  color = "black",
	  size = 20,
	  face = "arial",
	  align = "center",
	  vadjust = 0
	)
	node_palette <- list(
	  background = "lightblue",
	  border = "darkblue",
	  highlight = "orange"
	)

	# Assemble the widget. The pipeline is left unassigned so that its value is
	# the result of this top-level expression, exactly as before.
	visNetwork(nodes, edges) %>%
	  # Lay the nodes out with igraph's "layout_with_fr" algorithm
	  visIgraphLayout(layout = "layout_with_fr") %>%
	  # Directed arrows; inherit = FALSE lets each edge use its own colour
	  visEdges(arrows = "to", color = list(inherit = FALSE)) %>%
	  # Enable physics for node movement
	  visPhysics(enabled = TRUE) %>%
	  visNodes(shape = "circle", font = node_font, color = node_palette) %>%
	  visOptions(highlightNearest = TRUE, nodesIdSelection = TRUE) %>%
	  visEvents(selectNode = select_node_js, deselectNode = deselect_node_js)