Skip to content

Instantly share code, notes, and snippets.

@jrnold
Last active February 19, 2017 21:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jrnold/c4867ddf82b2913803d2 to your computer and use it in GitHub Desktop.
Save jrnold/c4867ddf82b2913803d2 to your computer and use it in GitHub Desktop.
Treebank word tokenizer in R
# Treebank Tokenizer in R
#
# This code is a port of the NLTK (Python) implementation and the original sed script:
# - http://www.nltk.org/_modules/nltk/tokenize/treebank.html#TreebankWordTokenizer
# - http://www.cis.upenn.edu/~treebank/tokenizer.sed
library("stringr")
library("tokenizers")
library("microbenchmark")
# Two-part contractions. Each pattern captures the two pieces that the
# tokenizer writes out as separate tokens (e.g. "cannot" -> "can" "not").
# The tokenizer applies these patterns case-insensitively.
.CONTRACTIONS2 <- c(
  "\\b(can)(not)\\b",
  "\\b(d)('ye)\\b",
  # "gimme" is part of the reference NLTK / sed contraction list.
  "\\b(gim)(me)\\b",
  "\\b(gon)(na)\\b",
  "\\b(got)(ta)\\b",
  "\\b(lem)(me)\\b",
  "\\b(mor)('n)\\b",
  # Unlike the others, "wanna" only matches when a space follows it.
  "\\b(wan)(na) "
)
# Contractions with a leading "'t" ("'tis", "'twas"). Both patterns need a
# space in front and capture the "'t" prefix and the verb separately.
.CONTRACTIONS3 <- sprintf(" ('t)(%s)\\b", c("is", "was"))
# Three-part contractions, split into three tokens:
# "whaddya" -> "wha" "dd" "ya", "whatcha" -> "wha" "t" "cha".
.CONTRACTIONS4 <- c(
  # The first group must be "wha": with "whad" the pattern would need three
  # consecutive d's and could never match "whaddya".
  "\\b(wha)(dd)(ya)\\b",
  "\\b(wha)(t)(cha)\\b"
)
#' Tokenize text following the Penn Treebank conventions
#'
#' Applies an ordered series of regular-expression substitutions that put
#' spaces around quotes, punctuation, brackets and English contractions,
#' then splits the result on whitespace. The order of the substitutions
#' matters: later rules rely on the spacing produced by earlier ones.
#'
#' @param string A character vector; each element is tokenized on its own.
#' @return A list with one character vector of tokens per element of
#'   `string`. An empty or whitespace-only element yields `character(0)`.
#' @examples
#' tokenize_ptb("They'll save and invest more.")
tokenize_ptb <- function(string) {
  # Starting quotes
  string <- str_replace_all(string, '^\\"', '``')
  # Pad the opening quote with spaces so it becomes a token of its own.
  string <- str_replace_all(string, '(``)', ' \\1 ')
  string <- str_replace_all(string, '([ (\\[{<])"', '\\1 `` ')

  # Punctuation
  # A colon or comma is split off unless a digit follows it, so numbers
  # such as "3,000" or "10:30" stay in one piece ...
  string <- str_replace_all(string, '([:,])([^\\d])', ' \\1 \\2')
  # ... and also when it is the very last character of the string.
  string <- str_replace_all(string, '([:,])$', ' \\1 ')
  string <- str_replace_all(string, '\\.{3}', ' ... ')
  # The comma is deliberately absent here; it is handled by the two
  # digit-aware rules above.
  string <- str_replace_all(string, '([;@#$%&])', ' \\1 ')
  # Only a final period (optionally followed by closing brackets/quotes)
  # is separated; periods inside the text stay attached to their word.
  string <- str_replace_all(string, '([^\\.])(\\.)([\\]\\)}>"\\\']*)?\\s*$', '\\1 \\2\\3 ')
  string <- str_replace_all(string, '([?!])', ' \\1 ')
  string <- str_replace_all(string, "([^'])' ", "\\1 ' ")

  # Parens, brackets, etc.
  string <- str_replace_all(string, '([\\]\\[\\(\\)\\{\\}\\<\\>])', ' \\1 ')
  string <- str_replace_all(string, '--', ' -- ')

  # Pad both ends so the space-anchored rules below also apply at the
  # start and end of the string.
  string <- str_c(" ", string, " ")

  # Ending quotes
  string <- str_replace_all(string, '"', " '' ")
  string <- str_replace_all(string, "(\\S)('')", "\\1 \\2 ")

  # Clitics: possessives and common verb/negation suffixes.
  string <- str_replace_all(string, "([^' ])('[sS]|'[mM]|'[dD]|') ",
                            "\\1 \\2 ")
  string <- str_replace_all(string,
                            "([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ",
                            "\\1 \\2 ")

  # Two-part contractions ("cannot", "'tis", ...).
  for (pattern in c(.CONTRACTIONS2, .CONTRACTIONS3)) {
    string <- str_replace_all(string, regex(pattern, ignore_case = TRUE),
                              " \\1 \\2 ")
  }
  # Three-part contractions.
  for (pattern in .CONTRACTIONS4) {
    string <- str_replace_all(string, regex(pattern, ignore_case = TRUE),
                              " \\1 \\2 \\3 ")
  }

  tokens <- str_split(str_trim(string), "\\s+")
  # Splitting an empty string produces a single "" token; drop it so empty
  # input gives no tokens. NA elements are passed through unchanged.
  lapply(tokens, function(tok) tok[is.na(tok) | tok != ""])
}
# Benchmark input taken from the quanteda package
# (presumably the text of Moby Dick -- confirm against the quanteda docs).
moby <- quanteda::data_char_mobydick
# Time tokenize_words() (presumably from the tokenizers package loaded above)
# and tokenize_ptb() on the same input; each call prints its own timing table.
microbenchmark(tokenize_words(moby))
microbenchmark(tokenize_ptb(moby))
# Example calls: a multi-line string with a currency amount and a sentence
# containing the "'ll" clitic.
tokenize_ptb("Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.")
tokenize_ptb("They'll save and invest more.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment