thoughtfulbloke/anagram.R

## anagram.R
# inspired by
# http://blog.plover.com/2017/02/21/#anagram-scoring
download.file("http://pic.blog.plover.com/lang/anagram-scoring/Web2.txt.gz", destfile = "compressWords.txt.gz")
w <- tolower(readLines("compressWords.txt.gz"))
ord <- as.character(lapply(lapply(strsplit(w,NULL),sort),paste,collapse=""))

# what I haven't tested is if it is faster to find the pairs and test all or test and find the best pair
# the first is much simpler code and computers are fast enough. I am a data guy not a CS guy.

# avoid self matches by picking only one starter from each group there is an anagram for
library(dplyr)
grouped <- data.frame(w, ord, stringsAsFactors = FALSE) %>%
  group_by(ord) %>% mutate(gnum = n()) %>%
  ungroup() %>% filter(gnum > 1) %>% # removes words with no anagrams
  group_by(ord) %>% slice(1)
wpairs <- merge(grouped, data.frame(other=w, ord, stringsAsFactors = FALSE), by="ord") #form pairs
wpairs <- wpairs[wpairs$w != wpairs$other,]

# this is where opinions start, with what is interesting
# I am going for least word one pairs of characters in word 2: i_score
# on a per number of letters per_length

comparetwo <- function(w1,w2){
  c1 <- unlist(strsplit(w1,NULL))
  c2 <- unlist(strsplit(w2,NULL))
  p1 <- sapply(1:(length(c1)-1), function(x){paste(c1[x], c1[x+1], sep="")})
  p2 <- sapply(1:(length(c1)-1), function(x){paste(c2[x], c2[x+1], sep="")})
  sum(!(p1 %in% p2))
}

wpairs$i_score <- apply(wpairs,1, function(x){comparetwo(x[2],x[4])})

wpairs$per_length <- wpairs$i_score / nchar(wpairs$w)
	# inspired by
	# http://blog.plover.com/2017/02/21/#anagram-scoring
	download.file("http://pic.blog.plover.com/lang/anagram-scoring/Web2.txt.gz", destfile = "compressWords.txt.gz")
	w <- tolower(readLines("compressWords.txt.gz"))
	ord <- as.character(lapply(lapply(strsplit(w,NULL),sort),paste,collapse=""))

	# what I haven't tested is if it is faster to find the pairs and test all or test and find the best pair
	# the first is much simpler code and computers are fast enough. I am a data guy not a CS guy.

	# avoid self matches by picking only one starter from each group there is an anagram for
	library(dplyr)
	grouped <- data.frame(w, ord, stringsAsFactors = FALSE) %>%
	group_by(ord) %>% mutate(gnum = n()) %>%
	ungroup() %>% filter(gnum > 1) %>% # removes words with no anagrams
	group_by(ord) %>% slice(1)
	wpairs <- merge(grouped, data.frame(other=w, ord, stringsAsFactors = FALSE), by="ord") #form pairs
	wpairs <- wpairs[wpairs$w != wpairs$other,]

	# this is where opinions start, with what is interesting
	# I am going for least word one pairs of characters in word 2: i_score
	# on a per number of letters per_length

	comparetwo <- function(w1,w2){
	c1 <- unlist(strsplit(w1,NULL))
	c2 <- unlist(strsplit(w2,NULL))
	p1 <- sapply(1:(length(c1)-1), function(x){paste(c1[x], c1[x+1], sep="")})
	p2 <- sapply(1:(length(c1)-1), function(x){paste(c2[x], c2[x+1], sep="")})
	sum(!(p1 %in% p2))
	}

	wpairs$i_score <- apply(wpairs,1, function(x){comparetwo(x[2],x[4])})

	wpairs$per_length <- wpairs$i_score / nchar(wpairs$w)