Skip to content

Instantly share code, notes, and snippets.

@piyushrj
Created June 12, 2018 12:44
Show Gist options
  • Star 6 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save piyushrj/29f35793949a44e17ca0f9a08b98226c to your computer and use it in GitHub Desktop.
Save piyushrj/29f35793949a44e17ca0f9a08b98226c to your computer and use it in GitHub Desktop.
Adding custom Matcher rules to identify date patterns, in addition to the dates identified by spaCy's NER
import spacy, re, dateparser
from spacy.matcher import Matcher
from spacy.tokenizer import Tokenizer
from spacy import displacy
def custom_tokenizer(nlp, infix_reg):
    """
    Build a spaCy tokenizer that locates infixes with a custom regex.

    PARAMETERS
    ----------
    nlp : Language
        A spaCy language object with a loaded model; only its ``vocab`` is
        used here.
    infix_reg : regular expression object
        Compiled regular expression whose ``finditer`` method is handed to
        the tokenizer as its infix finder.

    RETURNS
    -------
    Tokenizer : Tokenizer object
        A tokenizer built from ``nlp.vocab`` and the infix finder.
    """
    vocabulary = nlp.vocab
    find_infixes = infix_reg.finditer
    return Tokenizer(vocabulary, infix_finditer=find_infixes)
def is_valid_date(matcher, doc, i, matches):
    """
    on_match callback that reports a matched span when it parses as a date.

    PARAMETERS
    ----------
    matcher : Matcher
        The Matcher instance (unused).
    doc : Doc
        The document the matcher was used on.
    i : int
        Index of the current match in ``matches``.
    matches : list
        A list of (match_id, start, end) tuples describing the matches. Each
        tuple describes the span doc[start:end].

    RETURNS
    -------
    None. Prints "<span text> valid" when ``dateparser.parse`` returns a
    truthy result for the span text; prints nothing otherwise.
    """
    _, start, end = matches[i]
    span_text = doc[start:end].text
    if dateparser.parse(span_text):
        # A single pre-formatted argument prints the same line on Python 2
        # and Python 3 (the old ``print a, b`` statement is a SyntaxError on 3).
        print('%s valid' % span_text)
def add_date_ent(matcher, doc, i, matches):
    """
    on_match callback that labels a matched span as a DATE entity when it
    parses as a date.

    For reference see https://spacy.io/usage/linguistic-features#on_match

    PARAMETERS
    ----------
    matcher : Matcher
        The Matcher instance (unused).
    doc : Doc
        The document the matcher was used on.
    i : int
        Index of the current match in ``matches``.
    matches : list
        A list of (match_id, start, end) tuples describing the matches. Each
        tuple describes the span doc[start:end].

    RETURNS
    -------
    None. Always prints "<span text> Suspect". If ``dateparser.parse``
    accepts the span text, a DATE entity covering the span is appended to
    ``doc.ents`` and "<span text> VALID" is printed; otherwise
    "<span text> INVALID" is printed.
    """
    _, start, end = matches[i]
    match_str = doc[start:end].text
    # Single pre-formatted arguments keep the output identical on Python 2
    # and Python 3 (the old ``print a, b`` statement is a SyntaxError on 3).
    print('%s Suspect' % match_str)
    if dateparser.parse(match_str):
        # Look the label ID up in the document's own vocabulary rather than
        # relying on a module-level DATE global, which only exists when this
        # file is executed as a script.
        date_label = doc.vocab.strings['DATE']
        doc.ents += ((date_label, start, end),)
        print('%s VALID' % match_str)
    else:
        print('%s INVALID' % match_str)
def add_regex_flag(vocab, pattern_str):
    """
    Register a regex-based boolean flag on a vocabulary for token matching.

    PARAMETERS
    ----------
    vocab : Vocab
        The nlp model's vocabulary; its ``add_flag`` method is called with
        the flag getter.
    pattern_str : String
        Regular expression source. The compiled pattern's ``match`` method
        (anchored at the start of the string only) is used as the getter.

    RETURNS
    -------
    flag_id : int
        Whatever ``vocab.add_flag`` returns: the ID by which the flag value
        can be checked.
    """
    compiled = re.compile(pattern_str)
    return vocab.add_flag(compiled.match)
if __name__ == '__main__':
    # Treat '-', '/' and ',' as infixes so the separators inside a date
    # become tokens of their own, which the patterns below match via ORTH.
    infix_re = re.compile(r'''[-/,]''')
    nlp = spacy.load('en')
    nlp.tokenizer = custom_tokenizer(nlp, infix_re)

    # ID of the 'DATE' string in the model's StringStore.
    DATE = nlp.vocab.strings['DATE']

    # Flag for the token pattern 1st, 22nd, 15th etc. Raw string so that
    # '\d' is passed to the regex engine without an invalid-escape warning.
    IS_REGEX_MATCH = add_regex_flag(nlp.vocab, r'\d{1,2}(?:[stndrh]){2}?')

    # MM/DD/YYYY and YYYY/MM/DD
    pattern_1 = [{'IS_DIGIT': True}, {'ORTH': '/'}, {'IS_DIGIT': True}, {'ORTH': '/'}, {'IS_DIGIT': True}]
    # MM-DD-YYYY and YYYY-MM-DD
    pattern_2 = [{'IS_DIGIT': True}, {'ORTH': '-'}, {'IS_DIGIT': True}, {'ORTH': '-'}, {'IS_DIGIT': True}]
    # dates of the form 10-Aug-2018
    pattern_3 = [{'IS_DIGIT': True}, {'ORTH': '-'}, {'IS_ALPHA': True}, {'ORTH': '-'}, {'IS_DIGIT': True}]
    # dates of the form Aug-10-2018
    pattern_4 = [{'IS_ALPHA': True}, {'ORTH': '-'}, {'IS_DIGIT': True}, {'ORTH': '-'}, {'IS_DIGIT': True}]
    # dates of the form 10th August, 2018
    pattern_5 = [{IS_REGEX_MATCH: True}, {'IS_ALPHA': True}, {'ORTH': ',', 'OP': '?'}, {'IS_DIGIT': True}]
    # dates of the form August 10th, 2018
    pattern_6 = [{'IS_ALPHA': True}, {IS_REGEX_MATCH: True}, {'ORTH': ',', 'OP': '?'}, {'IS_DIGIT': True}]

    # Register every pattern with the same callback, in the original order.
    matcher = Matcher(nlp.vocab)
    named_patterns = (
        ('Type1', pattern_1),
        ('Type2', pattern_2),
        ('Type3', pattern_3),
        ('Type4', pattern_4),
        ('Type5', pattern_5),
        ('Type6', pattern_6),
    )
    for rule_name, rule_pattern in named_patterns:
        matcher.add(rule_name, add_date_ent, rule_pattern)

    doc = nlp(u'Today is 06/11/2018 yesterday was 10-Jun-2018 and tomorrow is 06-12-2018 and I will go home on 7-Jul-2018 but clearly not on 39/02/2011 and some dates are of the form 12th February,2017')
    # Running the matcher triggers add_date_ent for every match.
    matches = matcher(doc)
    # displacy.serve(doc, style='ent')
@bgshri
Copy link

bgshri commented Sep 24, 2019

Can you please include dot-formatted dates, such as 10.09.2019?

@stelmath
Copy link

Great, thank you!

@bharath8847
Copy link

I am getting an error doc=nlp(".....")

An integer is required

@simonmoya
Copy link

It just doesn't work!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment