Skip to content

Instantly share code, notes, and snippets.

@kowey
Created August 1, 2014 16:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kowey/97a57951e46372b78f33 to your computer and use it in GitHub Desktop.
Save kowey/97a57951e46372b78f33 to your computer and use it in GitHub Desktop.
# straightforward replacements: PTB escape token -> the text it stands for
_PTB_MAPPING = {
    "``": "\"",
    "''": "\"",
    "-LRB-": "(",
    "-RRB-": ")",
    "-LSB-": "[",
    "-RSB-": "]",
    "-LCB-": "{",
    "-RCB-": "}",
}

# words we can skip when the tag is -NONE-; the pattern is anchored at
# both ends, so the whole word has to match (eg. "*-1", "*T*-2", "*U*",
# "*?*", "*NOT*")
_PTB_SKIP_RE = re.compile(r'^(' +
                          r'(\*((T|ICH|EXP|RNR|PPA)\*)?-\d*)' +
                          r'|(\*(U|\?|NOT)\*)' +
                          r')$')

# we can skip this if the tag is -NONE- and the whole
# word matches this value
_PTB_SKIP_TAG_NONE = ["0", "*"]

# these specific (fileid, token number) combinations are skipped or
# rewritten: None means the token is dropped, a string is returned as is
# (it does not go through _PTB_MAPPING or the backslash unescaping)
_PTB_BLACKLIST = {
    ('06/wsj_0606.mrg', 357): None,
    ('06/wsj_0675.mrg', 546): "---",  # --
    ('11/wsj_1139.mrg', 582): ">a",  # evil - insertion
    ('11/wsj_1161.mrg', 845): "<Tourism",  # evil - insertion
    ('13/wsj_1331.mrg', 930): "`S",
    ('13/wsj_1377.mrg', 4): None,
    ('13/wsj_1377.mrg', 790): None,  # full stop after abbrev
    ('23/wsj_2303.mrg', 301): None,
}


def norm_token(fileid):
    """
    Return a function that normalises a token, sometimes including horribly
    specific one-off changes for one-off errors

    :param fileid: file identifier, looked up (together with the token
        number) in ``_PTB_BLACKLIST``, eg. ``'06/wsj_0606.mrg'``
    :return: function taking a single ``(toknum, (word, tag))`` tuple and
        returning the normalised text of the token, or ``""`` if the token
        should be skipped
    """
    def _norm(numbered_token):
        # Unpack in the body rather than in the signature: tuple parameters
        # (``def _norm((toknum, (word, tag)))``) were removed in Python 3
        # (PEP 3113); this form works on both Python 2 and 3 and callers
        # still pass one positional tuple.
        toknum, (word, tag) = numbered_token

        key = (fileid, toknum)
        if key in _PTB_BLACKLIST:
            # a None entry means "drop this token"
            return _PTB_BLACKLIST[key] or ""

        if tag == "-NONE-":
            if _PTB_SKIP_RE.match(word) or word in _PTB_SKIP_TAG_NONE:
                return ""

        # everything else (including -NONE- words that did not match a
        # skip rule) goes through the replacement table, then has its
        # backslash-escaped slashes and stars unescaped, in that order
        mword = _PTB_MAPPING.get(word, word)
        return mword.replace('\\/', '/').replace('\\*', '*')

    return _norm
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment