jrnold/gist:c4867ddf82b2913803d2

## gistfile1.r
# Treebank Tokenizer in R
#
# Code is a port of the NLTK python and sed scripts:
# - http://www.nltk.org/_modules/nltk/tokenize/treebank.html#TreebankWordTokenizer
# - http://www.cis.upenn.edu/~treebank/tokenizer.sed
library("stringr")
library("tokenizers")
library("microbenchmark")

# Contraction patterns adapted from the NLTK Treebank tokenizer. Every capture
# group is emitted as a separate token by tokenize_ptb(), which applies these
# patterns case-insensitively.

# Two-part contractions, e.g. "cannot" -> "can not".
.CONTRACTIONS2 <- c(
  "\\b(can)(not)\\b",
  "\\b(d)('ye)\\b",
  "\\b(gim)(me)\\b",
  "\\b(gon)(na)\\b",
  "\\b(got)(ta)\\b",
  "\\b(lem)(me)\\b",
  "\\b(mor)('n)\\b",
  # Ends with a literal space instead of a word boundary.
  "\\b(wan)(na) "
)

# Contractions with a leading apostrophe, e.g. " 'tis" -> " 't is". The
# leading space in each pattern is part of the match.
.CONTRACTIONS3 <- c(
  " ('t)(is)\\b",
  " ('t)(was)\\b"
)

# Three-part contractions; each of the three groups becomes a token.
.CONTRACTIONS4 <- c(
  "\\b(whad)(dd)(ya)\\b",
  "\\b(wha)(t)(cha)\\b"
)


# Tokenize text using Penn Treebank style rules.
#
# A fixed sequence of regular-expression substitutions inserts spaces around
# quotes, punctuation, brackets and contractions; the padded text is then
# split on runs of whitespace. The order of the substitutions matters: later
# rules rely on the spacing introduced by earlier ones.
#
# Args:
#   string: Character vector of texts. The stringr calls used here are
#     vectorised, so each element is processed on its own.
#
# Returns:
#   The value of str_split(): a list with one character vector of tokens per
#   element of `string`.
tokenize_ptb <- function(string) {
  # Starting quotes: a double quote at the very start becomes ``.
  string <- str_replace_all(string, '^\\"', '``')
  # Pad every `` with spaces so the opening quote is a token of its own.
  # (The replacement has to add the spaces; "\\1" alone changes nothing.)
  string <- str_replace_all(string, '(``)', ' \\1 ')
  # A double quote right after a space or opening bracket is an opening quote.
  string <- str_replace_all(string, '([ (\\[{<])"', '\\1 `` ')

  # Punctuation
  # Split `:` and `,` unless the next character is a digit.
  string <- str_replace_all(string, '([:,])([^\\d])', ' \\1 \\2')
  string <- str_replace_all(string, '\\.{3}', ' ... ')
  # NOTE(review): this class contains `,`, so a comma followed by a digit
  # (e.g. "1,000") is split here even though the rule above skipped it.
  # Confirm whether that is intended.
  string <- str_replace_all(string, '([,;@#$%&])', ' \\1 ')
  # Final period only: a single `.` (optionally followed by closing brackets
  # or quotes) at the end of the string is separated from the preceding word.
  string <- str_replace_all(string,
                            '([^\\.])(\\.)([\\]\\)}>"\\\']*)?\\s*$',
                            '\\1 \\2\\3 ')
  string <- str_replace_all(string, '([?!])', ' \\1 ')

  # A single quote followed by a space (and not preceded by another quote)
  # is split off from the preceding text.
  string <- str_replace_all(string, "([^'])' ", "\\1 ' ")

  # Parens, brackets, etc.
  string <- str_replace_all(string, '([\\]\\[\\(\\)\\{\\}\\<\\>])', ' \\1 ')
  string <- str_replace_all(string, '--', ' -- ')

  # Pad both ends so the rules below, which expect a trailing (or leading)
  # space, also match at the start and end of the text.
  string <- str_c(" ", string, " ")

  # Ending quotes: every remaining double quote is a closing quote.
  string <- str_replace_all(string, '"', " '' ")
  string <- str_replace_all(string, "(\\S)('')", "\\1 \\2 ")
  # Clitics: 's, 'm, 'd and a bare trailing apostrophe ...
  string <- str_replace_all(string, "([^' ])('[sS]|'[mM]|'[dD]|') ",
                            "\\1 \\2 ")
  # ... and 'll, 're, 've, n't.
  string <- str_replace_all(string,
                            "([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ",
                            "\\1 \\2 ")

  # Split the listed contractions into their parts, ignoring case.
  for (pattern in .CONTRACTIONS2) {
    string <- str_replace_all(string, regex(pattern, ignore_case = TRUE),
                              " \\1 \\2 ")
  }
  for (pattern in .CONTRACTIONS3) {
    string <- str_replace_all(string, regex(pattern, ignore_case = TRUE),
                              " \\1 \\2 ")
  }
  for (pattern in .CONTRACTIONS4) {
    string <- str_replace_all(string, regex(pattern, ignore_case = TRUE),
                              " \\1 \\2 \\3 ")
  }

  # Drop the outer padding and split on whitespace runs.
  str_split(str_trim(string), '\\s+')
}

# Benchmark input.
# NOTE(review): presumably a long text shipped with the quanteda package;
# confirm that this object exists in the installed quanteda version.
moby <- quanteda::data_char_mobydick
# Time tokenize_words() (presumably from the tokenizers package loaded at the
# top of the file) and tokenize_ptb() on the same input.
microbenchmark(tokenize_words(moby))
microbenchmark(tokenize_ptb(moby))

# Example calls: a multi-line text with a dollar amount, and a sentence
# containing the "'ll" clitic.
tokenize_ptb("Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks.")
tokenize_ptb("They'll save and invest more.")
	# Treebank Tokenizer in R
	#
	# Code is a port of the NLTK python and sed scripts:
	# - http://www.nltk.org/_modules/nltk/tokenize/treebank.html#TreebankWordTokenizer
	# - http://www.cis.upenn.edu/~treebank/tokenizer.sed
	library("stringr")
	library("tokenizers")
	library("microbenchmark")

	# Contraction patterns adapted from the NLTK Treebank tokenizer. Every
	# capture group is emitted as a separate token by tokenize_ptb(), which
	# applies these patterns case-insensitively.

	# Two-part contractions, e.g. "cannot" -> "can not".
	.CONTRACTIONS2 <- c(
	  "\\b(can)(not)\\b",
	  "\\b(d)('ye)\\b",
	  "\\b(gim)(me)\\b",
	  "\\b(gon)(na)\\b",
	  "\\b(got)(ta)\\b",
	  "\\b(lem)(me)\\b",
	  "\\b(mor)('n)\\b",
	  # Ends with a literal space instead of a word boundary.
	  "\\b(wan)(na) "
	)

	# Contractions with a leading apostrophe, e.g. " 'tis" -> " 't is". The
	# leading space in each pattern is part of the match.
	.CONTRACTIONS3 <- c(
	  " ('t)(is)\\b",
	  " ('t)(was)\\b"
	)

	# Three-part contractions; each of the three groups becomes a token.
	.CONTRACTIONS4 <- c(
	  "\\b(whad)(dd)(ya)\\b",
	  "\\b(wha)(t)(cha)\\b"
	)


	# Tokenize text using Penn Treebank style rules.
	#
	# A fixed sequence of regular-expression substitutions inserts spaces
	# around quotes, punctuation, brackets and contractions; the padded text
	# is then split on runs of whitespace. The order of the substitutions
	# matters: later rules rely on the spacing introduced by earlier ones.
	#
	# Args:
	#   string: Character vector of texts. The stringr calls used here are
	#     vectorised, so each element is processed on its own.
	#
	# Returns:
	#   The value of str_split(): a list with one character vector of tokens
	#   per element of `string`.
	tokenize_ptb <- function(string) {
	  # Starting quotes: a double quote at the very start becomes ``.
	  string <- str_replace_all(string, '^\\"', '``')
	  # Pad every `` with spaces so the opening quote is a token of its own.
	  # (The replacement has to add the spaces; "\\1" alone changes nothing.)
	  string <- str_replace_all(string, '(``)', ' \\1 ')
	  # A double quote right after a space or opening bracket is an opening
	  # quote.
	  string <- str_replace_all(string, '([ (\\[{<])"', '\\1 `` ')

	  # Punctuation
	  # Split `:` and `,` unless the next character is a digit.
	  string <- str_replace_all(string, '([:,])([^\\d])', ' \\1 \\2')
	  string <- str_replace_all(string, '\\.{3}', ' ... ')
	  # NOTE(review): this class contains `,`, so a comma followed by a digit
	  # (e.g. "1,000") is split here even though the rule above skipped it.
	  # Confirm whether that is intended.
	  string <- str_replace_all(string, '([,;@#$%&])', ' \\1 ')
	  # Final period only: a single `.` (optionally followed by any number of
	  # closing brackets or quotes, then optional whitespace) at the end of
	  # the string is separated from the preceding word. The `*` quantifiers
	  # are required for that.
	  string <- str_replace_all(string,
	                            '([^\\.])(\\.)([\\]\\)}>"\\\']*)?\\s*$',
	                            '\\1 \\2\\3 ')
	  string <- str_replace_all(string, '([?!])', ' \\1 ')

	  # A single quote followed by a space (and not preceded by another quote)
	  # is split off from the preceding text.
	  string <- str_replace_all(string, "([^'])' ", "\\1 ' ")

	  # Parens, brackets, etc.
	  string <- str_replace_all(string, '([\\]\\[\\(\\)\\{\\}\\<\\>])', ' \\1 ')
	  string <- str_replace_all(string, '--', ' -- ')

	  # Pad both ends so the rules below, which expect a trailing (or leading)
	  # space, also match at the start and end of the text.
	  string <- str_c(" ", string, " ")

	  # Ending quotes: every remaining double quote is a closing quote.
	  string <- str_replace_all(string, '"', " '' ")
	  string <- str_replace_all(string, "(\\S)('')", "\\1 \\2 ")
	  # Clitics: 's, 'm, 'd and a bare trailing apostrophe. The alternation
	  # uses a plain `|`; a backslash before it is not a valid escape in an R
	  # string literal.
	  string <- str_replace_all(string, "([^' ])('[sS]|'[mM]|'[dD]|') ",
	                            "\\1 \\2 ")
	  # ... and 'll, 're, 've, n't.
	  string <- str_replace_all(string,
	                            "([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ",
	                            "\\1 \\2 ")

	  # Split the listed contractions into their parts, ignoring case.
	  for (pattern in .CONTRACTIONS2) {
	    string <- str_replace_all(string, regex(pattern, ignore_case = TRUE),
	                              " \\1 \\2 ")
	  }
	  for (pattern in .CONTRACTIONS3) {
	    string <- str_replace_all(string, regex(pattern, ignore_case = TRUE),
	                              " \\1 \\2 ")
	  }
	  for (pattern in .CONTRACTIONS4) {
	    string <- str_replace_all(string, regex(pattern, ignore_case = TRUE),
	                              " \\1 \\2 \\3 ")
	  }

	  # Drop the outer padding and split on whitespace runs.
	  str_split(str_trim(string), '\\s+')
	}

	# Benchmark input.
	# NOTE(review): presumably a long text shipped with the quanteda package;
	# confirm that this object exists in the installed quanteda version.
	moby <- quanteda::data_char_mobydick
	# Time tokenize_words() (presumably from the tokenizers package loaded
	# above) and tokenize_ptb() on the same input.
	microbenchmark(tokenize_words(moby))
	microbenchmark(tokenize_ptb(moby))

	# Example calls: a multi-line text with a dollar amount, and a sentence
	# containing the "'ll" clitic.
	tokenize_ptb("Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.")
	tokenize_ptb("They'll save and invest more.")