Skip to content

Instantly share code, notes, and snippets.

@jacquesfize
Last active February 21, 2019 08:29
Show Gist options
  • Save jacquesfize/0af846c77bcbf9cc0f61655621ad4fd4 to your computer and use it in GitHub Desktop.
Save jacquesfize/0af846c77bcbf9cc0f61655621ad4fd4 to your computer and use it in GitHub Desktop.
A Python class to match elements (from a terminology) in a text using the spaCy module.
# coding= utf-8
import warnings
import re
import importlib
import glob
import copy
import pandas as pd
import numpy as np
from tqdm import tqdm
from textblob import Blobber
from textblob_fr import PatternTagger, PatternAnalyzer
def match_sequence(seq, dataset):
    """
    Return start,end positions of every occurrence of a sequence in a dataset.

    Parameters
    ----------
    seq : list
        sequence to look for
    dataset : list
        token sequence to search in

    Returns
    -------
    list
        one ``[sequence, start, end]`` entry per occurrence

    Raises
    ------
    ValueError
        if ``seq`` is empty
    """
    length = len(seq)
    if length < 1:
        raise ValueError("Sequence is empty !")
    # Normalise both operands to numpy arrays with a common dtype.
    data = np.asarray(dataset) if isinstance(dataset, list) else dataset
    pattern = np.asarray(seq, dtype=data.dtype) if isinstance(seq, list) else seq
    pattern_as_list = pattern.tolist()
    matches = []
    # Candidate positions are wherever the first token of the pattern appears.
    for begin in np.where(data == pattern[0])[0]:
        stop = begin + length
        if data[begin:stop].tolist() == pattern_as_list:
            matches.append([pattern, begin, stop])
    return matches
def match_sequences(seqs, dataset):
    """
    Return start,end positions of every occurrence of a list of sequences in a dataset.

    Parameters
    ----------
    seqs : list
        list of sequences (each a list of tokens)
    dataset : list
        token sequence to search in

    Returns
    -------
    list
        one ``[sequence_index, start, end]`` entry per occurrence, where
        ``sequence_index`` is the position of the matched sequence in ``seqs``
    """
    if len(seqs) < 1:
        warnings.warn("Sequence Empty")
        return []
    if isinstance(dataset, list):
        dataset = np.asarray(dataset)
    # Map each first token to ALL sequences starting with it.  The previous
    # implementation kept a single sequence per distinct prefix, silently
    # dropping every other sequence sharing the same first token; it also
    # converted `seqs` to a numpy array, which made the per-position list
    # comparison below ambiguous for equal-length sequences.
    prefixes_dict = {}
    for i, seq in enumerate(seqs):
        if len(seq) < 1:
            continue  # an empty sequence can never match; skip instead of crashing
        prefixes_dict.setdefault(seq[0], []).append((list(seq), i, len(seq)))
    prefix_ind = np.where(np.isin(dataset, list(prefixes_dict.keys())))[0]
    results = []
    for idx in prefix_ind:
        # Try every sequence whose first token occurs at this position.
        for seq_as_list, i, n in prefixes_dict[dataset[idx]]:
            start, end = idx, idx + n
            if dataset[start:end].tolist() == seq_as_list:
                results.append([i, start, end])
    return results
def get_lemmatizer(lang):
    """Load and return the spaCy lookup lemmatization table for *lang*."""
    module_name = "spacy.lang.{0}.lemmatizer".format(lang)
    lemmatizer_module = importlib.import_module(module_name)
    return lemmatizer_module.LOOKUP
class Matcher():
    """
    A class responsible of the matching of terms in a text.

    Terms are registered through :meth:`add` under several surface forms
    (basic, lower, lemma, singular, plural); :meth:`match` then searches a
    text for every registered form.
    """

    def __init__(self, lang, use_lower=True, use_lemma=True, use_singular=True, use_plural=True):
        """
        Constructor

        Parameters
        ----------
        lang : str
            language
        use_lower : bool, optional
            match on lower version (the default is True)
        use_lemma : bool, optional
            match on lemma (the default is True)
        use_singular : bool, optional
            match on singular (the default is True)
        use_plural : bool, optional
            match on plural (the default is True)
        """
        # French gets the dedicated pattern tagger/analyzer; any other
        # language falls back to TextBlob's default (English) pipeline.
        if lang == "fr":
            self.blobber = Blobber(pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
        else:
            self.blobber = Blobber()
        self.index_to_id = {}          # internal index -> caller-supplied identifier
        self.basic_form = []           # one entry (list of tokens) per registered term
        self.term_lower_form = []
        self.term_singular_form = []
        self.term_plural_form = []
        self.term_lemma_form = []
        self.use_lemma, self.use_lower, self.use_singular, self.use_plural = use_lemma, use_lower, use_singular, use_plural
        self.lemmatizer = get_lemmatizer(lang)
        self.N = 0                     # number of registered terms

    def add(self, id_, func, basic_form, lower_form=None, lemma_form=None, singular_form=None, plural_form=None):
        """
        Add a term or one of its variations.

        Parameters
        ----------
        id_ : str or int
            identifier
        func : func
            NotImplemented
        basic_form : list of str
            initial form
        lower_form : list, optional
            lower form (the default is None, stored as an empty list)
        lemma_form : list, optional
            lemma form (the default is None, stored as an empty list)
        singular_form : list, optional
            singular form (the default is None, stored as an empty list)
        plural_form : list, optional
            plural form (the default is None, stored as an empty list)
        """
        # None sentinels replace the original mutable [] defaults so that the
        # same shared list object is never stored for several terms.
        self.index_to_id[self.N] = id_
        self.basic_form.append(basic_form)
        self.term_lower_form.append(lower_form if lower_form is not None else [])
        self.term_singular_form.append(singular_form if singular_form is not None else [])
        self.term_plural_form.append(plural_form if plural_form is not None else [])
        self.term_lemma_form.append(lemma_form if lemma_form is not None else [])
        self.N += 1

    def match(self, text):
        """
        Execute the matching.

        Parameters
        ----------
        text : str
            input

        Returns
        -------
        list
            list of matches (id, start, end)
        """
        results = []
        doc = self.blobber(text)
        tokens = list(doc.tokenize())
        tokens_lower = list(doc.lower().tokenize())
        # NOTE(review): this iterates the TextBlob object directly, while the
        # other token lists come from .tokenize() — presumably both yield the
        # same word tokens; confirm against the textblob API.
        tokens_lemma = [self.lemmatizer.get(token.lower(), token.lower()) for token in doc]
        results.extend(match_sequences(self.basic_form, tokens))
        if self.use_lemma:
            results.extend(match_sequences(self.term_lemma_form, tokens_lemma))
        if self.use_lower:
            results.extend(match_sequences(self.term_lower_form, tokens_lower))
        if self.use_singular:
            results.extend(match_sequences(self.term_singular_form, tokens_lower))
        if self.use_plural:
            results.extend(match_sequences(self.term_plural_form, tokens_lower))
        return self.parse_results(results)

    def parse_results(self, results):
        """
        Parse raw match results.

        Parameters
        ----------
        results : list
            raw match results as [internal_index, start, end]

        Returns
        -------
        list
            reformatted matches as [id, start, end]
        """
        new_ = []
        for res in results:
            # Translate the internal term index back to the caller's id.
            new_.append([self.index_to_id[res[0]], res[1], res[2]])
        return new_

    def __call__(self, text):
        return self.match(text)
class TerminologyMatcher:
    """
    A Matcher used to detect words from a terminology in a text. It uses the **spacy** Matcher class.

    Terminology can be given as simple 1D-array(`list`,`numpy.ndarray`), a Python `dict` or `pandas.Dataframe`.

    Usage:

    >>> terminology = ["Agroforesterie","équipe"]
    >>> matcher = TerminologyMatcher(terminology)
    >>> matcher(nlp("Cette homme travaille dans le domaine de l'agroforesterie. Plusieurs équipes du CIRAD travaille dans ce domaine."))
    [(0, 8, 9), (1, 11, 12)]
    """

    def __init__(self, terminology_data, lang="fr", column_id="id", column_label="label", column_alt_label="alt_labels"):
        """
        TerminologyMatcher Constructor

        Parameters
        ----------
        terminology_data : terminology container
            Iterable object
        lang : str, optional
            language of the terminology (the default is "fr", which is French)
        column_id : str, optional
            the column name that contains the id of term (Only for pandas.Dataframe input) (the default is "id")
        column_label : str, optional
            The name of the column that contains the preferred label (the default is "label")
        column_alt_label : str, optional
            The name of the column that contains the alternate labels (must be iterable) (the default is "alt_labels")

        Raises
        ------
        ValueError
            If the terminology variable is not iterable
        ImportError
            If the `inflector` module is not installed
        """
        self.terminology_data = terminology_data
        if not hasattr(terminology_data, '__iter__'):  # Checking Vocabulary
            raise ValueError(
                "The 'terminology_data' args must be an iterable!")
        self.is_dict = isinstance(terminology_data, dict)
        self.is_panda = isinstance(terminology_data, pd.DataFrame)
        self.column_id_in = column_id in terminology_data
        self.column_id = column_id
        self.column_label = column_label
        self.lang = lang
        self.lemmatizer = get_lemmatizer(self.lang)  # loaded once to accelerate the process
        # Bug fix: build the inner matcher for the requested language
        # (previously hard-coded to "fr", which mis-lemmatized any other language).
        self.matcher = Matcher(lang)
        self.inflector = None
        try:
            from inflector import French, English
            if self.lang == "fr":
                self.inflector = French()
            elif self.lang == "en":
                self.inflector = English()
        except ImportError:
            raise ImportError("You must install the module `inflector` from https://github.com/Jacobe2169/Python-Inflector")
        if self.is_panda:
            for index, row in tqdm(self.terminology_data.iterrows(), desc="Composing the thematic matcher..."):
                term_id = row[column_id] if self.column_id_in else index
                self.matcher.add(
                    term_id,
                    None,
                    *self.generate_input_matcher(row[column_label])
                )
                for label in row[column_alt_label]:
                    # Bug fix: register the alternate label itself; the
                    # original code re-registered the preferred label here.
                    self.matcher.add(
                        term_id,
                        None,
                        *self.generate_input_matcher(label)
                    )
        elif self.is_dict:
            for key, value in tqdm(self.terminology_data.items(), desc="Composing the thematic matcher..."):
                try:
                    # Bug fix: the identifier is `key` (`row` did not exist in
                    # this branch, so every entry raised and was silently dropped).
                    self.matcher.add(
                        key,
                        None,
                        *self.generate_input_matcher(value)
                    )
                except Exception:
                    # Best-effort registration, but no longer fully silent.
                    warnings.warn("Could not register terminology entry {0!r}".format(key))
        else:
            for i, word in tqdm(enumerate(self.terminology_data), desc="Composing the thematic matcher..."):
                try:
                    self.matcher.add(
                        i,
                        None,
                        *self.generate_input_matcher(word)
                    )
                except Exception:
                    warnings.warn("Could not register terminology entry {0!r}".format(word))

    def __call__(self, doc):
        """
        Overriding the __call__ method so it can be used as the spacy.matcher.Matcher object.

        Parameters
        ----------
        doc : str
            text

        Returns
        -------
        list
            list of matches position found in the text
        """
        return self.matcher.match(doc)

    def generate_input_matcher(self, doc):
        """
        Generate the patterns that enable the identification of term (and its variation) for the matcher.

        Parameters
        ----------
        doc : str
            text

        Returns
        -------
        tuple
            (basic_form, lower_form, lemma_form, singular_form, plural_form),
            each a list of tokens, in the order expected by ``Matcher.add``
        """
        doc = doc.split()
        basic_form = doc
        lemma_ = [self.lemmatizer.get(token.lower(), token.lower()) for token in doc]
        lower_ = [token.lower() for token in doc]
        singular_ = [self.inflector.singularize(token.lower()) for token in doc]
        plural_ = [self.inflector.pluralize(token.lower()) for token in doc]
        return (basic_form, lower_, lemma_, singular_, plural_)

    def get_word(self, key):
        """
        Return the string representation for an id from the spacy.matcher.Matcher() results.

        Parameters
        ----------
        key : int or str
            id

        Returns
        -------
        str
            string representation for the id
        """
        if self.is_panda:
            if self.column_id_in:
                # Bug fix: select rows whose id column equals `key`; the
                # original compared the column *name* to the key, producing a
                # boolean scalar that is not a valid indexer.
                matching = self.terminology_data[self.terminology_data[self.column_id] == key]
                return matching.iloc[0][self.column_label]
            return self.terminology_data.iloc[key][self.column_label]
        return self.terminology_data[key]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment