Skip to content

Instantly share code, notes, and snippets.

@widiger-anna
Created May 4, 2018 17:31
Show Gist options
  • Save widiger-anna/deefac010da426911381c118a97fc23f to your computer and use it in GitHub Desktop.
Save widiger-anna/deefac010da426911381c118a97fc23f to your computer and use it in GitHub Desktop.
Expanding contractions with spaCy
from __future__ import unicode_literals, print_function
import spacy
from spacy.attrs import ORTH, LEMMA, NORM, TAG
from spacy.lang.en.stop_words import STOP_WORDS
from spacy import displacy
from pathlib import Path
import re
# Load the 'en_core_web_sm' model once; the resulting `nlp` object is the
# pipeline the rest of this script runs text through.
nlp = spacy.load("en_core_web_sm")
# Example of expanding contractions with a regex (slow for a big corpus).
# NOTE: this naive pattern only rewrites "<word>n't" as "<word> not", so
# irregular stems are left untouched ("can't" -> "ca not", "won't" -> "wo not").
text_with_contractions = "Oh no he didn't."
# The result must be bound to a name first: print() rejects unknown keyword
# arguments, so `print(name = ...)` raises TypeError instead of printing.
text_without_contractions = re.sub(r"(\w+)n't", r"\g<1> not", text_with_contractions)
print(text_without_contractions)
# Dealing with contractions by extending spaCy's tokenizer exceptions.
#
# Each key is a contracted surface form; its value lists the sub-tokens the
# tokenizer should emit for that form.  The ORTH values of one entry must
# concatenate back to the key exactly, and the key must be the form that
# really occurs in text (e.g. "I've", with no trailing apostrophe).
#
#   ORTH  - the form as it appears in the text/corpus
#   LEMMA - the dictionary form
#   NORM  - the normalized (expanded) form
#   TAG   - the fine-grained part-of-speech tag (Penn Treebank style, e.g.
#           "RB" for the negation, "MD" for modals, "VB"/"VBP" for verbs)
TOKENIZER_EXCEPTIONS = {
    # do
    "don't": [
        {ORTH: "do", LEMMA: "do"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
    ],
    "doesn't": [
        {ORTH: "does", LEMMA: "do"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
    ],
    "didn't": [
        {ORTH: "did", LEMMA: "do"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
    ],
    # can
    "can't": [
        {ORTH: "ca", LEMMA: "can"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
    ],
    "couldn't": [
        {ORTH: "could", LEMMA: "can"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
    ],
    # have
    "I've": [
        {ORTH: "I", LEMMA: "I"},
        {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"},
    ],
    "haven't": [
        {ORTH: "have", LEMMA: "have"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
    ],
    "hasn't": [
        {ORTH: "has", LEMMA: "have"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
    ],
    "hadn't": [
        {ORTH: "had", LEMMA: "have"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
    ],
    # will/shall: both are normalized to "will"
    "I'll": [
        {ORTH: "I", LEMMA: "I"},
        {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
    ],
    "he'll": [
        {ORTH: "he", LEMMA: "he"},
        {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
    ],
    "she'll": [
        {ORTH: "she", LEMMA: "she"},
        {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
    ],
    "it'll": [
        {ORTH: "it", LEMMA: "it"},
        {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
    ],
    "won't": [
        {ORTH: "wo", LEMMA: "will"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
    ],
    "wouldn't": [
        {ORTH: "would", LEMMA: "will"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
    ],
    # be
    "I'm": [
        {ORTH: "I", LEMMA: "I"},
        {ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP"},
    ],
}
# Register the custom exceptions with the tokenizer; without this step the
# TOKENIZER_EXCEPTIONS table defined earlier in this script is never consulted.
# NOTE(review): passing LEMMA/TAG attributes in special cases assumes the
# spaCy 2.x API this script appears to target -- confirm before upgrading.
for contraction, substrings in TOKENIZER_EXCEPTIONS.items():
    nlp.tokenizer.add_special_case(contraction, substrings)

# Test the contractions using spaCy's updated tokenizer: print the surface
# text, lemma and coarse part-of-speech of every token.
doc1 = nlp(u"Oh no he didn't. I can't and I won't. I'll know what I'm gonna do.")
for token in doc1:
    print(token.text, token.lemma_, token.pos_)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment