Skip to content

Instantly share code, notes, and snippets.

@jacquesfize
Last active February 21, 2019 08:29
Show Gist options
  • Save jacquesfize/0af846c77bcbf9cc0f61655621ad4fd4 to your computer and use it in GitHub Desktop.
Save jacquesfize/0af846c77bcbf9cc0f61655621ad4fd4 to your computer and use it in GitHub Desktop.
A Python class to match elements (from a terminology) in a text using the spaCy module.
# coding= utf-8
import warnings
import re
import importlib
import glob
import copy
import pandas as pd
import numpy as np
from tqdm import tqdm
from textblob import Blobber
from textblob_fr import PatternTagger, PatternAnalyzer
def match_sequence(seq, dataset):
    """
    Return start,end positions of every occurrence of a sequence in a dataset.

    Parameters
    ----------
    seq : list
        sequence to look for
    dataset : list
        token sequence to search in

    Returns
    -------
    list
        one ``[sequence, start, end]`` entry per occurrence

    Raises
    ------
    ValueError
        if ``seq`` is empty
    """
    length = len(seq)
    if length < 1:
        raise ValueError("Sequence is empty !")
    # Normalise both operands to numpy arrays with a common dtype.
    data = np.asarray(dataset) if isinstance(dataset, list) else dataset
    pattern = np.asarray(seq, dtype=data.dtype) if isinstance(seq, list) else seq
    pattern_as_list = pattern.tolist()
    matches = []
    # Candidate positions are wherever the first token of the pattern appears.
    for begin in np.where(data == pattern[0])[0]:
        stop = begin + length
        if data[begin:stop].tolist() == pattern_as_list:
            matches.append([pattern, begin, stop])
    return matches
def match_sequences(seqs, dataset):
    """
    Return start,end positions of every occurrence of a list of sequences in a dataset.

    Parameters
    ----------
    seqs : list
        list of sequences (each a list of tokens)
    dataset : list
        token sequence to search in

    Returns
    -------
    list
        one ``[sequence_index, start, end]`` entry per occurrence, where
        ``sequence_index`` is the position of the matched sequence in ``seqs``
    """
    if len(seqs) < 1:
        warnings.warn("Sequence Empty")
        return []
    if isinstance(dataset, list):
        dataset = np.asarray(dataset)
    # Map each first token to ALL sequences starting with it.  The previous
    # implementation kept a single sequence per distinct prefix, silently
    # dropping every other sequence sharing the same first token; it also
    # converted `seqs` to a numpy array, which made the per-position list
    # comparison below ambiguous for equal-length sequences.
    prefixes_dict = {}
    for i, seq in enumerate(seqs):
        if len(seq) < 1:
            continue  # an empty sequence can never match; skip instead of crashing
        prefixes_dict.setdefault(seq[0], []).append((list(seq), i, len(seq)))
    prefix_ind = np.where(np.isin(dataset, list(prefixes_dict.keys())))[0]
    results = []
    for idx in prefix_ind:
        # Try every sequence whose first token occurs at this position.
        for seq_as_list, i, n in prefixes_dict[dataset[idx]]:
            start, end = idx, idx + n
            if dataset[start:end].tolist() == seq_as_list:
                results.append([i, start, end])
    return results
def get_lemmatizer(lang):
    """Load and return the spaCy lookup lemmatization table for *lang*."""
    module_name = "spacy.lang.{0}.lemmatizer".format(lang)
    lemmatizer_module = importlib.import_module(module_name)
    return lemmatizer_module.LOOKUP
class Matcher():
    """
    A class responsible of the matching of terms in a text.

    Terms are registered through :meth:`add` under several surface forms
    (basic, lower, lemma, singular, plural); :meth:`match` then searches a
    text for every registered form.
    """

    def __init__(self, lang, use_lower=True, use_lemma=True, use_singular=True, use_plural=True):
        """
        Constructor

        Parameters
        ----------
        lang : str
            language
        use_lower : bool, optional
            match on lower version (the default is True)
        use_lemma : bool, optional
            match on lemma (the default is True)
        use_singular : bool, optional
            match on singular (the default is True)
        use_plural : bool, optional
            match on plural (the default is True)
        """
        # French gets the dedicated pattern tagger/analyzer; any other
        # language falls back to TextBlob's default (English) pipeline.
        if lang == "fr":
            self.blobber = Blobber(pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
        else:
            self.blobber = Blobber()
        self.index_to_id = {}          # internal index -> caller-supplied identifier
        self.basic_form = []           # one entry (list of tokens) per registered term
        self.term_lower_form = []
        self.term_singular_form = []
        self.term_plural_form = []
        self.term_lemma_form = []
        self.use_lemma, self.use_lower, self.use_singular, self.use_plural = use_lemma, use_lower, use_singular, use_plural
        self.lemmatizer = get_lemmatizer(lang)
        self.N = 0                     # number of registered terms

    def add(self, id_, func, basic_form, lower_form=None, lemma_form=None, singular_form=None, plural_form=None):
        """
        Add a term or one of its variations.

        Parameters
        ----------
        id_ : str or int
            identifier
        func : func
            NotImplemented
        basic_form : list of str
            initial form
        lower_form : list, optional
            lower form (the default is None, stored as an empty list)
        lemma_form : list, optional
            lemma form (the default is None, stored as an empty list)
        singular_form : list, optional
            singular form (the default is None, stored as an empty list)
        plural_form : list, optional
            plural form (the default is None, stored as an empty list)
        """
        # None sentinels replace the original mutable [] defaults so that the
        # same shared list object is never stored for several terms.
        self.index_to_id[self.N] = id_
        self.basic_form.append(basic_form)
        self.term_lower_form.append(lower_form if lower_form is not None else [])
        self.term_singular_form.append(singular_form if singular_form is not None else [])
        self.term_plural_form.append(plural_form if plural_form is not None else [])
        self.term_lemma_form.append(lemma_form if lemma_form is not None else [])
        self.N += 1

    def match(self, text):
        """
        Execute the matching.

        Parameters
        ----------
        text : str
            input

        Returns
        -------
        list
            list of matches (id, start, end)
        """
        results = []
        doc = self.blobber(text)
        tokens = list(doc.tokenize())
        tokens_lower = list(doc.lower().tokenize())
        # NOTE(review): this iterates the TextBlob object directly, while the
        # other token lists come from .tokenize() — presumably both yield the
        # same word tokens; confirm against the textblob API.
        tokens_lemma = [self.lemmatizer.get(token.lower(), token.lower()) for token in doc]
        results.extend(match_sequences(self.basic_form, tokens))
        if self.use_lemma:
            results.extend(match_sequences(self.term_lemma_form, tokens_lemma))
        if self.use_lower:
            results.extend(match_sequences(self.term_lower_form, tokens_lower))
        if self.use_singular:
            results.extend(match_sequences(self.term_singular_form, tokens_lower))
        if self.use_plural:
            results.extend(match_sequences(self.term_plural_form, tokens_lower))
        return self.parse_results(results)

    def parse_results(self, results):
        """
        Parse raw match results.

        Parameters
        ----------
        results : list
            raw match results as [internal_index, start, end]

        Returns
        -------
        list
            reformatted matches as [id, start, end]
        """
        new_ = []
        for res in results:
            # Translate the internal term index back to the caller's id.
            new_.append([self.index_to_id[res[0]], res[1], res[2]])
        return new_

    def __call__(self, text):
        return self.match(text)
class TerminologyMatcher:
    """
    A Matcher used to detect words from a terminology in a text. It uses the **spacy** Matcher class.

    Terminology can be given as simple 1D-array(`list`,`numpy.ndarray`), a Python `dict` or `pandas.Dataframe`.

    Usage:

    >>> terminology = ["Agroforesterie","équipe"]
    >>> matcher = TerminologyMatcher(terminology)
    >>> matcher(nlp("Cette homme travaille dans le domaine de l'agroforesterie. Plusieurs équipes du CIRAD travaille dans ce domaine."))
    [(0, 8, 9), (1, 11, 12)]
    """

    def __init__(self, terminology_data, lang="fr", column_id="id", column_label="label", column_alt_label="alt_labels"):
        """
        TerminologyMatcher Constructor

        Parameters
        ----------
        terminology_data : terminology container
            Iterable object
        lang : str, optional
            language of the terminology (the default is "fr", which is French)
        column_id : str, optional
            the column name that contains the id of term (Only for pandas.Dataframe input) (the default is "id")
        column_label : str, optional
            The name of the column that contains the preferred label (the default is "label")
        column_alt_label : str, optional
            The name of the column that contains the alternate labels (must be iterable) (the default is "alt_labels")

        Raises
        ------
        ValueError
            If the terminology variable is not iterable
        ImportError
            If the `inflector` module is not installed
        """
        self.terminology_data = terminology_data
        if not hasattr(terminology_data, '__iter__'):  # Checking Vocabulary
            raise ValueError(
                "The 'terminology_data' args must be an iterable!")
        self.is_dict = isinstance(terminology_data, dict)
        self.is_panda = isinstance(terminology_data, pd.DataFrame)
        self.column_id_in = column_id in terminology_data
        self.column_id = column_id
        self.column_label = column_label
        self.lang = lang
        self.lemmatizer = get_lemmatizer(self.lang)  # loaded once to accelerate the process
        # Bug fix: build the inner matcher for the requested language
        # (previously hard-coded to "fr", which mis-lemmatized any other language).
        self.matcher = Matcher(lang)
        self.inflector = None
        try:
            from inflector import French, English
            if self.lang == "fr":
                self.inflector = French()
            elif self.lang == "en":
                self.inflector = English()
        except ImportError:
            raise ImportError("You must install the module `inflector` from https://github.com/Jacobe2169/Python-Inflector")
        if self.is_panda:
            for index, row in tqdm(self.terminology_data.iterrows(), desc="Composing the thematic matcher..."):
                term_id = row[column_id] if self.column_id_in else index
                self.matcher.add(
                    term_id,
                    None,
                    *self.generate_input_matcher(row[column_label])
                )
                for label in row[column_alt_label]:
                    # Bug fix: register the alternate label itself; the
                    # original code re-registered the preferred label here.
                    self.matcher.add(
                        term_id,
                        None,
                        *self.generate_input_matcher(label)
                    )
        elif self.is_dict:
            for key, value in tqdm(self.terminology_data.items(), desc="Composing the thematic matcher..."):
                try:
                    # Bug fix: the identifier is `key` (`row` did not exist in
                    # this branch, so every entry raised and was silently dropped).
                    self.matcher.add(
                        key,
                        None,
                        *self.generate_input_matcher(value)
                    )
                except Exception:
                    # Best-effort registration, but no longer fully silent.
                    warnings.warn("Could not register terminology entry {0!r}".format(key))
        else:
            for i, word in tqdm(enumerate(self.terminology_data), desc="Composing the thematic matcher..."):
                try:
                    self.matcher.add(
                        i,
                        None,
                        *self.generate_input_matcher(word)
                    )
                except Exception:
                    warnings.warn("Could not register terminology entry {0!r}".format(word))

    def __call__(self, doc):
        """
        Overriding the __call__ method so it can be used as the spacy.matcher.Matcher object.

        Parameters
        ----------
        doc : str
            text

        Returns
        -------
        list
            list of matches position found in the text
        """
        return self.matcher.match(doc)

    def generate_input_matcher(self, doc):
        """
        Generate the patterns that enable the identification of term (and its variation) for the matcher.

        Parameters
        ----------
        doc : str
            text

        Returns
        -------
        tuple
            (basic_form, lower_form, lemma_form, singular_form, plural_form),
            each a list of tokens, in the order expected by ``Matcher.add``
        """
        doc = doc.split()
        basic_form = doc
        lemma_ = [self.lemmatizer.get(token.lower(), token.lower()) for token in doc]
        lower_ = [token.lower() for token in doc]
        singular_ = [self.inflector.singularize(token.lower()) for token in doc]
        plural_ = [self.inflector.pluralize(token.lower()) for token in doc]
        return (basic_form, lower_, lemma_, singular_, plural_)

    def get_word(self, key):
        """
        Return the string representation for an id from the spacy.matcher.Matcher() results.

        Parameters
        ----------
        key : int or str
            id

        Returns
        -------
        str
            string representation for the id
        """
        if self.is_panda:
            if self.column_id_in:
                # Bug fix: select rows whose id column equals `key`; the
                # original compared the column *name* to the key, producing a
                # boolean scalar that is not a valid indexer.
                matching = self.terminology_data[self.terminology_data[self.column_id] == key]
                return matching.iloc[0][self.column_label]
            return self.terminology_data.iloc[key][self.column_label]
        return self.terminology_data[key]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment