"""Faster Implementation of Unsupervised Query Segmentation. | |
Uses vectorized operations | |
- author: @napsternxg | |
Unsupervised Query Segmentation Using only Query Logs [Mishra et. al. 2011] | |
https://www.microsoft.com/en-us/research/wp-content/uploads/2011/01/pp0295-mishra.pdf | |
Based on the python implementation of https://github.com/kchro/query-segmenter | |
Usage: | |
from query_segmenter.unsupervised import Segmenter | |
segmenter = FastQuerySegmenter() | |
segmenter.compute_scores(queries) | |
segments = segmenter.segment('new iphone 6') | |
assert segments == ['new', 'iphone 6'] | |
Related sources: | |
- [Neural Methods for Intent understanding/reformulation](https://docs.google.com/presentation/d/1YBtqRMCEw9dLrr5zLC1nZIYtygAvJ7T2vQmV7HahBO8/edit#slide=id.gac24ab650a_1_1) | |
- [Unsupervised Query Segmentation Using Clickthrough for Information Retrieval](https://www.cse.iitb.ac.in/~soumen/doc/www2013/QirWoo/LiHZW2011ClickQuerySegment.pdf) | |
- [Learning Noun Phrase Query Segmentation](https://aclanthology.org/D07-1086.pdf) | |
- [E-commerce Concept Mining](https://docs.google.com/presentation/d/1hBZnFF413H8eFz4czI-fibq1FdvcmowkLvfBh1tvxj0/edit#slide=id.gafd9c6c2a6_0_67) | |
""" | |
import logging
import math
import re
from collections import Counter, defaultdict

import numpy as np
from scipy.sparse import csr_matrix, diags
from scipy.special import factorial
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.auto import tqdm

logging.basicConfig(
    format="%(asctime)s %(levelname)-8s %(message)s",
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
)


class Segmenter(object):
    """Implementation of Unsupervised Query Segmentation."""

    def __init__(self, save_stats=False):
        """Construct Segmenter."""
        self.scores = defaultdict(float)
        self.stats = defaultdict(lambda: self._initialize_ngram())
        self.save_stats = save_stats

    def compute_scores(self, queries, alpha=1, beta=0.0):
        """Compute segmentation scores.

        Parameters
        ----------
        queries : list (n_queries)
            Training data for determining segments.
        alpha : int, optional
            Minimum query frequency. All words in an n-gram must occur in
            at least alpha queries to be considered a "significant n-gram."
        beta : float, optional
            beta * k is the segmentation score threshold, where k is the
            number of queries that contain all the words of a given n-gram.

        Returns
        -------
        self.scores : dict
            A mapping of n-gram to segmentation score.

        Algorithm
        ---------
        Consider an n-gram M = [word_1, word_2, ..., word_n].
        Let {query_1, query_2, ..., query_k} be the subset of queries
        that contain all the words of M in some order.
        To test if M is a multi-word entity, we check if M occurs
        together more often than not.
        """
        sig_ngrams = self._get_significant_ngrams(queries, alpha=alpha)
        self._precompute_stats(sig_ngrams, queries)
        for ngram in tqdm(sig_ngrams):
            N = self.stats[ngram]["frequency"]
            k = max(self.stats[ngram]["co_occur"], 1)
            E_X = self.stats[ngram]["expectation"]
            score = 2 * (N - E_X) ** 2 / k
            if score < beta * k:
                # the score does not exceed the threshold
                continue
            self.scores[ngram] = score
        return self.scores

    def segment(self, query):
        """Segment a query (single or batch).

        Parameters
        ----------
        query : str or array-like

        Returns
        -------
        segments : 1D or 2D list of strings
        """
        if isinstance(query, str):
            segments = self._segment(query)
        elif hasattr(query, "__iter__"):
            segments = list(map(self._segment, query))
        else:
            raise TypeError("query must be a string or an iterable of strings")
        return segments

    def _get_significant_ngrams(self, queries, ngram_range=(2, 10), alpha=1):
        """Get significant n-grams in the list of queries.

        An n-gram must occur in at least "alpha" queries to be significant.
        Uses sklearn's CountVectorizer with min_df=alpha.
        """
        cv = CountVectorizer(
            ngram_range=ngram_range, token_pattern=r"\w+", min_df=alpha
        )
        cv.fit(queries)
        sig_ngrams = list(cv.vocabulary_.keys())
        logging.info(f"Found {len(sig_ngrams)} sig_ngrams")
        return sig_ngrams
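
    # Toy illustration (hypothetical corpus): with alpha=2 and
    # queries = ["new iphone case", "new iphone 6"], the bigram "new iphone"
    # appears in both queries and survives min_df=2, while "iphone case" and
    # "iphone 6" each appear in only one query and are dropped.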

    def _precompute_stats(self, sig_ngrams, queries):
        """Precompute counts for each n-gram M.

        The segmentation score is a function of three values:
        - the number of queries where M's words co-occur in any order ( k )
        - the number of queries where M occurs in that order ( N )
        - the total probability that M occurs contiguously by chance ( E(X) )

        A high score indicates that M is a multi-word entity.
        A low score indicates that M is not a multi-word entity.
        """
        for ngram in tqdm(sig_ngrams):
            ngram_words = ngram.split()
            ngram_len = len(ngram_words)
            for query in queries:
                sub_queries = query.split()
                if all(word in sub_queries for word in ngram_words):
                    # all words of the n-gram co-occur in the query
                    self.stats[ngram]["co_occur"] += 1
                    if self._match_ngram(ngram, query):
                        # the n-gram occurs contiguously in the query
                        self.stats[ngram]["frequency"] += 1
                    query_len = len(sub_queries)
                    numer = math.factorial(query_len - ngram_len + 1)
                    denom = math.factorial(query_len)
                    prob = numer / denom
                    self.stats[ngram]["expectation"] += prob
        return self.stats
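
    # Sanity check of the expectation term above: for a bigram (ngram_len=2)
    # inside a 4-word query, treating the bigram as one block gives
    # (4 - 2 + 1)! = 3! word orders where it stays contiguous and in order,
    # out of 4! total orders, i.e. prob = 6 / 24 = 0.25.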

    def _initialize_ngram(self):
        """Initialize the information to collect about each n-gram."""
        return {"frequency": 0, "co_occur": 0, "expectation": 0}

    def _match_ngram(self, ngram, query):
        """Return True if the n-gram occurs contiguously, in order, in the query."""
        match = re.search(rf"\b{ngram}\b", query)
        return match is not None

    def _segment(self, query):
        """Segment a query with dynamic programming."""
        query = query.split()
        query_len = len(query)
        val = [0] * (query_len + 1)
        idx = [0] * (query_len + 1)
        for i in range(1, query_len + 1):
            max_val = -1
            for j in range(i):
                subquery = " ".join(query[j:i])
                if subquery in self.scores:
                    score = self.scores[subquery] + val[j]
                else:
                    score = val[j]
                if score >= max_val:
                    max_val = score
                    idx[i] = j
                    val[i] = max_val
        segments = []
        while query_len > 0:
            query_len = idx[query_len]
            segments.insert(0, " ".join(query[query_len:]))
            query = query[:query_len]
        return segments
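

# Minimal usage sketch for the reference Segmenter above (hypothetical toy
# corpus; the resulting segments depend on the learned scores):
#
#     seg = Segmenter()
#     seg.compute_scores(["new iphone 6", "iphone 6 case", "best iphone 6"])
#     seg.segment("best iphone 6")  # -> ['best', 'iphone 6']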


class FastQuerySegmenter(object):
    """Vectorized implementation of unsupervised query segmentation."""

    def __init__(self, save_stats=False):
        """Construct Segmenter."""
        self.scores = dict()
        self.stats = dict()
        self.save_stats = save_stats

    def compute_scores(
        self, queries, alpha=1, beta=0.6, ngram_range=(2, 10), query_weights=None
    ):
        """Compute segmentation scores.

        Parameters
        ----------
        queries : list (n_queries)
            Training data for determining segments.
        alpha : int, optional
            Minimum query frequency. All words in an n-gram must occur in
            at least alpha queries to be considered a "significant n-gram."
        beta : float, optional
            beta * k is the segmentation score threshold, where k is the
            number of queries that contain all the words of a given n-gram.
        ngram_range : tuple, optional
            Range of n-gram lengths to consider as candidate segments.
        query_weights : array-like (n_queries), optional
            Per-query weights used when aggregating counts; defaults to
            uniform weights of 1.

        Returns
        -------
        self.scores : dict
            A mapping of n-gram to segmentation score.

        Algorithm
        ---------
        Consider an n-gram M = [word_1, word_2, ..., word_n].
        Let {query_1, query_2, ..., query_k} be the subset of queries
        that contain all the words of M in some order.
        To test if M is a multi-word entity, we check if M occurs
        together more often than not.
        """
        self.scores = self._get_significant_ngrams(
            queries,
            ngram_range=ngram_range,
            alpha=alpha,
            beta=beta,
            query_weights=query_weights,
        )
        return self.scores

    def segment(self, query):
        """Segment a query (single or batch).

        Parameters
        ----------
        query : str or array-like

        Returns
        -------
        segments : 1D or 2D list of strings
        """
        if isinstance(query, str):
            segments = self._segment(query, return_scores=False)
        elif hasattr(query, "__iter__"):
            segments = [self._segment(q, return_scores=False) for q in query]
        else:
            raise TypeError("query must be a string or an iterable of strings")
        return segments

    def _get_significant_ngrams(
        self, queries, ngram_range=(2, 10), alpha=1, beta=0.6, query_weights=None
    ):
        """Get significant n-grams in the list of queries and score them.

        An n-gram must occur in at least "alpha" queries to be significant.
        Uses sklearn's CountVectorizer with min_df=alpha.
        """
        if query_weights is not None:
            query_weights = diags(query_weights)
        else:
            query_weights = diags(np.ones(len(queries)))
        logging.info(f"{query_weights.shape=}")
        self.cv_unigram = CountVectorizer(
            ngram_range=(1, 1), token_pattern=r"\w+", min_df=1
        )
        transform_unigram_queries = self.cv_unigram.fit_transform(queries)
        logging.info(f"{transform_unigram_queries.shape=}")
        self.analyzer = self.cv_unigram.build_analyzer()
        query_len = transform_unigram_queries.sum(axis=1).A1
        logging.info(f"{query_len.shape=}")
        self.cv_ngram = CountVectorizer(
            ngram_range=ngram_range, token_pattern=r"\w+", min_df=alpha
        )
        transform_ngram_queries = self.cv_ngram.fit_transform(queries)
        logging.info(f"{transform_ngram_queries.shape=}")
        self.sig_ngrams = self.cv_ngram.get_feature_names_out()
        self.sig_ngrams_vocab = self.cv_ngram.vocabulary_
        transform_unigram_ngrams = self.cv_unigram.transform(self.sig_ngrams)
        logging.info(f"{transform_unigram_ngrams.shape=}")
        # This step is the slowest. Find ways to optimize it so that it runs
        # on large datasets.
        # ngram_co_occur = ((transform_unigram_ngrams > 0).astype(int) @ (transform_unigram_queries.T > 0).astype(int))
        # ngrams x unigrams dot unigrams x queries = ngrams x queries
        ngram_co_occur = (
            (transform_unigram_ngrams > 0)
            .astype(int)
            .dot(transform_unigram_queries.T > 0)
            .astype(int)
        )
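        # Each entry [m, q] of ngram_co_occur counts how many of n-gram m's
        # distinct words appear in query q. Hypothetical shapes for intuition:
        # with 1,000 significant n-grams, a 5,000-word vocabulary, and 10,000
        # queries, this is a (1000 x 5000) @ (5000 x 10000) sparse product
        # yielding a (1000 x 10000) matrix.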
logging.info(f"{ngram_co_occur.shape=}") | |
ngram_co_occur_nz = ngram_co_occur.nonzero() | |
# Can be weighted using query_weights | |
ngram_frequency = ( | |
(transform_ngram_queries.T > 0) | |
.astype(int) | |
.dot(query_weights) | |
.sum(axis=1) | |
.A1 | |
) | |
logging.info(f"{ngram_frequency.shape=}") | |
ngram_unigram_len = (transform_unigram_ngrams > 0).sum(axis=1).A1 | |
logging.info(f"{ngram_unigram_len.shape=}") | |
# ngram_co_occur_equal_len = (ngram_co_occur[ngram_co_occur_nz] == ngram_unigram_len[ngram_co_occur_nz[0]]).A1 | |
logging.info(f"{ngram_co_occur.data.shape=}") | |
logging.info(f"{ngram_unigram_len[ngram_co_occur_nz[0]].shape=}") | |
ngram_co_occur_equal_len = ( | |
ngram_co_occur.data == ngram_unigram_len[ngram_co_occur_nz[0]] | |
) | |
logging.info(f"{ngram_co_occur_equal_len.shape=}") | |
# Can include weight for queries using query_weights | |
ngram_co_occur_true = ( | |
csr_matrix( | |
(ngram_co_occur_equal_len, ngram_co_occur_nz), | |
shape=ngram_co_occur.shape, | |
) | |
.dot(query_weights) | |
.sum(axis=1) | |
.A1 | |
) | |
logging.info(f"{ngram_co_occur_true.shape=}") | |
all_factorials = factorial(np.arange(query_len.max() + 1)) | |
logging.info(f"{all_factorials.shape=}") | |
numerator = all_factorials[ | |
( | |
query_len[ngram_co_occur_nz[1]] | |
- ngram_unigram_len[ngram_co_occur_nz[0]] | |
+ 1 | |
) | |
] | |
logging.info(f"{numerator.shape=}") | |
denominator = all_factorials[query_len[ngram_co_occur_nz[1]]] | |
logging.info(f"{denominator.shape=}") | |
# Can include weight for queries using query_weights | |
ngram_expected_prob = ( | |
csr_matrix( | |
( | |
ngram_co_occur_equal_len * (numerator / denominator), | |
ngram_co_occur_nz, | |
), | |
shape=ngram_co_occur.shape, | |
) | |
.dot(query_weights) | |
.sum(axis=1) | |
.A1 | |
) | |
logging.info(f"{ngram_expected_prob.shape=}") | |
        if self.save_stats:
            self.stats = dict(
                co_occur=ngram_co_occur_true,
                frequency=ngram_frequency,
                expected_prob=ngram_expected_prob,
            )
        N = ngram_frequency
        k = np.clip(ngram_co_occur_true, 1, None)
        E_X = ngram_expected_prob
        scores = (2 * (N - E_X) ** 2) / k
        self.max_score = scores.max()
        logging.info(f"{scores.shape=}")
        if beta > 0:
            # Zero out scores that do not clear the documented beta * k
            # threshold, then keep only the surviving n-grams.
            scores = np.where(scores > beta * k, scores, 0)
            self.scores = {
                key: val for key, val in zip(self.sig_ngrams, scores) if val > 0
            }
        else:
            self.scores = dict(zip(self.sig_ngrams, scores))
        return self.scores

    def add_ground_truth_ngrams(self, ngrams):
        """Force known multi-word n-grams to the maximum observed score."""
        # self.max_score = max(self.scores.values())
        # Retain old scores for later reference
        self._old_scores = dict(self.scores)
        for ngram in tqdm(ngrams):
            tokens = self.analyzer(ngram)
            if len(tokens) > 1:
                ngram = " ".join(tokens)
            self.scores[ngram] = self.max_score

    def reset_truth_ngrams(self):
        """Restore the scores saved before add_ground_truth_ngrams."""
        if hasattr(self, "_old_scores"):
            self.scores = dict(self._old_scores)
            del self._old_scores

    def _score_query(self, query):
        """Run the segmentation DP; return tokens, DP values, and backpointers."""
        query = self.analyzer(query)
        query_len = len(query)
        val = [0] * (query_len + 1)
        idx = [0] * (query_len + 1)
        for i in range(1, query_len + 1):
            max_val = -1
            for j in range(i):
                subquery = " ".join(query[j:i])
                if subquery in self.scores:
                    score = self.scores[subquery] + val[j]
                else:
                    score = val[j]
                if score >= max_val:
                    max_val = score
                    idx[i] = j
                    val[i] = max_val
        return query, val, idx
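
    # DP trace with hypothetical scores {"new world": 5.0} on the query
    # "a new world": val = [0, 0, 0, 5.0] and idx = [0, 0, 1, 1], so
    # backtracking keeps token 0 ("a") alone and takes tokens 1:3
    # ("new world") as one segment.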

    def _segment(self, query, return_scores=False):
        """Segment a query with dynamic programming."""
        query, val, idx = self._score_query(query)
        query_len = len(query)
        segments = []
        while query_len > 0:
            query_len = idx[query_len]
            segments.insert(0, " ".join(query[query_len:]))
            query = query[:query_len]
        if return_scores:
            return segments, val
        return segments

    def segment_to_bio(self, query, tag_label="PHRASE"):
        """Segment a query and emit BIO labels for multi-word segments."""
        query, val, idx = self._score_query(query)
        query_len = len(query)
        segments = []
        bio_labels = []
        B_LABEL = f"B-{tag_label}"
        I_LABEL = f"I-{tag_label}"
        O_LABEL = "O"
        while query_len > 0:
            query_len = idx[query_len]
            seg = query[query_len:]
            # segments.insert(0, seg)
            segments.append(seg)
            n = len(seg)
            bio = [I_LABEL] * n
            if n == 1:
                bio = [O_LABEL]
            else:
                bio[0] = B_LABEL
            query = query[:query_len]
            # bio_labels.insert(0, bio)
            bio_labels.append(bio)
        segments = sum(reversed(segments), [])
        bio_labels = sum(reversed(bio_labels), [])
        return segments, bio_labels
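
    # Hypothetical output (assumes only "new world" has a positive score):
    #     segment_to_bio("where is a new world")
    #     -> (["where", "is", "a", "new", "world"],
    #         ["O", "O", "O", "B-PHRASE", "I-PHRASE"])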


if __name__ == "__main__":
    fast_segmenter = FastQuerySegmenter()
    queries = ["hello world", "how to find a new place", "where is a new world"]
    # Optional per-query weights, passed through to compute_scores.
    query_weights = [5, 6, 1]
    fast_segmenter.compute_scores(queries, alpha=1, query_weights=query_weights)
    fast_segmenter.add_ground_truth_ngrams(["new world", "new place"])
    ngram_counts = Counter()
    for query in tqdm(queries):
        segments = fast_segmenter.segment(query)
        ngram_counts.update([s for s in segments if " " in s])
        print(f"{query=} -> {segments=}")

    import matplotlib.pyplot as plt

    scores = np.array(list(fast_segmenter.scores.values()))
    plt.hist(np.log10(scores + 1), bins=100, cumulative=True, density=True)
    plt.xlabel("$X = log_{10}(score + 1)$")
    plt.ylabel("$P(x < X)$")
    top_ngrams = list(
        zip(range(100), sorted(fast_segmenter.scores.items(), key=lambda x: -x[1]))
    )
    print(top_ngrams)