"""Faster Implementation of Unsupervised Query Segmentation.
Uses vectorized operations
- author: @napsternxg
Unsupervised Query Segmentation Using only Query Logs [Mishra et. al. 2011]
https://www.microsoft.com/en-us/research/wp-content/uploads/2011/01/pp0295-mishra.pdf
Based on the python implementation of https://github.com/kchro/query-segmenter
Usage:
from query_segmenter.unsupervised import Segmenter
segmenter = FastQuerySegmenter()
segmenter.compute_scores(queries)
segments = segmenter.segment('new iphone 6')
assert segments == ['new', 'iphone 6']
Related sources:
- [Neural Methods for Intent understanding/reformulation](https://docs.google.com/presentation/d/1YBtqRMCEw9dLrr5zLC1nZIYtygAvJ7T2vQmV7HahBO8/edit#slide=id.gac24ab650a_1_1)
- [Unsupervised Query Segmentation Using Clickthrough for Information Retrieval](https://www.cse.iitb.ac.in/~soumen/doc/www2013/QirWoo/LiHZW2011ClickQuerySegment.pdf)
- [Learning Noun Phrase Query Segmentation](https://aclanthology.org/D07-1086.pdf)
- [E-commerce Concept Mining](https://docs.google.com/presentation/d/1hBZnFF413H8eFz4czI-fibq1FdvcmowkLvfBh1tvxj0/edit#slide=id.gafd9c6c2a6_0_67)
"""
import logging
import math
import re
from collections import Counter, defaultdict

import numpy as np
from scipy.sparse import csr_matrix, diags
from scipy.special import factorial
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.auto import tqdm

logging.basicConfig(
    format="%(asctime)s %(levelname)-8s %(message)s",
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
)


class Segmenter(object):
    """Implementation of Unsupervised Query Segmentation."""

    def __init__(self, save_stats=False):
        """Construct Segmenter."""
        self.scores = defaultdict(float)
        self.stats = defaultdict(lambda: self._initialize_ngram())
        self.save_stats = save_stats

    def compute_scores(self, queries, alpha=1, beta=0.0):
        """Compute segmentation scores.

        Parameters
        ----------
        queries : list (n_queries)
            Training data for determining segments.
        alpha : int, optional
            Minimum query frequency. All words in an n-gram must occur in
            at least alpha queries to be considered a "significant n-gram."
        beta : float, optional
            beta * k is the segmentation score threshold, where k is the number
            of queries that contain all the words of a given n-gram.

        Returns
        -------
        self.scores : dict
            A mapping of ngram to segmentation score.

        Algorithm
        ---------
        Consider an n-gram M = [word_1, word_2, ..., word_n].
        Let {query_1, query_2, ..., query_k} be the subset of queries
        that contain all the words of M in some order.
        To test if M is a multi-word entity, we check if M occurs
        together more often than not.
        """
        sig_ngrams = self._get_significant_ngrams(queries, alpha=alpha)
        self._precompute_stats(sig_ngrams, queries)
        for ngram in tqdm(sig_ngrams):
            N = self.stats[ngram]["frequency"]
            k = max(self.stats[ngram]["co_occur"], 1)
            E_X = self.stats[ngram]["expectation"]
            score = 2 * (N - E_X) ** 2 / k
            if score < beta * k:
                # the score does not exceed the threshold
                continue
            self.scores[ngram] = score
        return self.scores

    def segment(self, query):
        """Segment a query (single or batch).

        Parameters
        ----------
        query : str or array-like

        Returns
        -------
        segments : 1D or 2D list of strings
        """
        if isinstance(query, str):
            segments = self._segment(query)
        elif hasattr(query, "__iter__"):
            segments = list(map(self._segment, query))
        else:
            raise TypeError("query must be a string or an iterable of strings")
        return segments

    def _get_significant_ngrams(self, queries, ngram_range=(2, 10), alpha=1):
        """Get significant n-grams in the list of queries.

        An n-gram must occur in at least "alpha" queries to be significant
        (implemented via CountVectorizer's min_df).
        """
        cv = CountVectorizer(
            ngram_range=ngram_range, token_pattern=r"\w+", min_df=alpha
        )
        cv.fit(queries)
        sig_ngrams = list(cv.vocabulary_.keys())
        print(f"Found {len(sig_ngrams)} sig_ngrams")
        return sig_ngrams

    def _precompute_stats(self, sig_ngrams, queries):
        """Precompute counts for each ngram M.

        The segmentation score is a function of three values:
        - the number of queries in which the words of M co-occur in any order ( k )
        - the number of queries in which M occurs in that exact order ( N )
        - the expected number of queries in which M would occur in that order
          by chance ( E(X) )

        A high score indicates that M is a Multi-Word Entity.
        A low score indicates that M is not a Multi-Word Entity.
        """
        for ngram in tqdm(sig_ngrams):
            ngram_words = ngram.split()
            ngram_len = len(ngram_words)
            for query in queries:
                sub_queries = query.split()
                if all(word in sub_queries for word in ngram_words):
                    # all words in the n-gram co-occur in the query
                    self.stats[ngram]["co_occur"] += 1
                    if self._match_ngram(ngram, query):
                        # the n-gram occurs contiguously in the query
                        self.stats[ngram]["frequency"] += 1
                    query_len = len(sub_queries)
                    numer = math.factorial(query_len - ngram_len + 1)
                    denom = math.factorial(query_len)
                    prob = numer / denom
                    self.stats[ngram]["expectation"] += prob
        return self.stats

    def _initialize_ngram(self):
        """Initialize the information to collect about each ngram."""
        return {"frequency": 0, "co_occur": 0, "expectation": 0}

    def _match_ngram(self, ngram, query):
        match = re.search(rf"\b{ngram}\b", query)
        return match is not None

    def _segment(self, query):
        """Segment a query with dynamic programming."""
        query = query.split()
        query_len = len(query)
        val = [0] * (query_len + 1)
        idx = [0] * (query_len + 1)
        for i in range(1, query_len + 1):
            max_val = -1
            for j in range(i):
                subquery = " ".join(query[j:i])
                if subquery in self.scores:
                    score = self.scores[subquery] + val[j]
                else:
                    score = val[j]
                if score >= max_val:
                    max_val = score
                    idx[i] = j
            val[i] = max_val
        segments = []
        while query_len > 0:
            query_len = idx[query_len]
            segments.insert(0, " ".join(query[query_len:]))
            query = query[:query_len]
        return segments


class FastQuerySegmenter(object):
    """Vectorized implementation of Unsupervised Query Segmentation."""

    def __init__(self, save_stats=False):
        """Construct Segmenter."""
        self.scores = dict()
        self.stats = dict()
        self.save_stats = save_stats

    def compute_scores(
        self, queries, alpha=1, beta=0.6, ngram_range=(2, 10), query_weights=None
    ):
        """Compute segmentation scores.

        Parameters
        ----------
        queries : list (n_queries)
            Training data for determining segments.
        alpha : int, optional
            Minimum query frequency. All words in an n-gram must occur in
            at least alpha queries to be considered a "significant n-gram."
        beta : float, optional
            beta * k is the segmentation score threshold, where k is the number
            of queries that contain all the words of a given n-gram.
        ngram_range : tuple of int, optional
            Range of n-gram lengths considered as candidate segments.
        query_weights : array-like of shape (n_queries,), optional
            Per-query weights applied when accumulating n-gram statistics;
            defaults to a weight of 1 for every query.

        Returns
        -------
        self.scores : dict
            A mapping of ngram to segmentation score.

        Algorithm
        ---------
        Consider an n-gram M = [word_1, word_2, ..., word_n].
        Let {query_1, query_2, ..., query_k} be the subset of queries
        that contain all the words of M in some order.
        To test if M is a multi-word entity, we check if M occurs
        together more often than not.
        """
        self.scores = self._get_significant_ngrams(
            queries,
            ngram_range=ngram_range,
            alpha=alpha,
            beta=beta,
            query_weights=query_weights,
        )
        return self.scores

    def segment(self, query):
        """Segment a query (single or batch).

        Parameters
        ----------
        query : str or array-like

        Returns
        -------
        segments : 1D or 2D list of strings
        """
        if isinstance(query, str):
            segments = self._segment(query, return_scores=False)
        elif hasattr(query, "__iter__"):
            segments = [self._segment(q, return_scores=False) for q in query]
        else:
            raise TypeError("query must be a string or an iterable of strings")
        return segments

    def _get_significant_ngrams(
        self, queries, ngram_range=(2, 10), alpha=1, beta=0.6, query_weights=None
    ):
        """Get significant n-grams in the list of queries and score them.

        An n-gram must occur in at least "alpha" queries to be significant
        (implemented via CountVectorizer's min_df).
        """
        if query_weights is not None:
            query_weights = diags(query_weights)
        else:
            query_weights = diags(np.ones(len(queries)))
        logging.info(f"{query_weights.shape=}")
        self.cv_unigram = CountVectorizer(
            ngram_range=(1, 1), token_pattern=r"\w+", min_df=1
        )
        transform_unigram_queries = self.cv_unigram.fit_transform(queries)
        logging.info(f"{transform_unigram_queries.shape=}")
        self.analyzer = self.cv_unigram.build_analyzer()
        query_len = transform_unigram_queries.sum(axis=1).A1
        logging.info(f"{query_len.shape=}")
        self.cv_ngram = CountVectorizer(
            ngram_range=ngram_range, token_pattern=r"\w+", min_df=alpha
        )
        transform_ngram_queries = self.cv_ngram.fit_transform(queries)
        logging.info(f"{transform_ngram_queries.shape=}")
        self.sig_ngrams = self.cv_ngram.get_feature_names_out()
        self.sig_ngrams_vocab = self.cv_ngram.vocabulary_
        transform_unigram_ngrams = self.cv_unigram.transform(self.sig_ngrams)
        logging.info(f"{transform_unigram_ngrams.shape=}")
        # This step is the slowest. Find ways to optimize it so that it runs on large datasets.
        # ngram_co_occur = ((transform_unigram_ngrams > 0).astype(int) @ (transform_unigram_queries.T > 0).astype(int))
        # (ngrams x unigrams) dot (unigrams x queries) = (ngrams x queries)
        ngram_co_occur = (
            (transform_unigram_ngrams > 0)
            .astype(int)
            .dot(transform_unigram_queries.T > 0)
            .astype(int)
        )
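        # ngram_co_occur[m, q] counts how many distinct words of n-gram m appear
        # in query q; it equals the n-gram's word count exactly when all of its
        # words co-occur in that query, which is what ngram_co_occur_equal_len
        # checks below.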
logging.info(f"{ngram_co_occur.shape=}")
ngram_co_occur_nz = ngram_co_occur.nonzero()
# Can be weighted using query_weights
ngram_frequency = (
(transform_ngram_queries.T > 0)
.astype(int)
.dot(query_weights)
.sum(axis=1)
.A1
)
logging.info(f"{ngram_frequency.shape=}")
ngram_unigram_len = (transform_unigram_ngrams > 0).sum(axis=1).A1
logging.info(f"{ngram_unigram_len.shape=}")
# ngram_co_occur_equal_len = (ngram_co_occur[ngram_co_occur_nz] == ngram_unigram_len[ngram_co_occur_nz[0]]).A1
logging.info(f"{ngram_co_occur.data.shape=}")
logging.info(f"{ngram_unigram_len[ngram_co_occur_nz[0]].shape=}")
ngram_co_occur_equal_len = (
ngram_co_occur.data == ngram_unigram_len[ngram_co_occur_nz[0]]
)
logging.info(f"{ngram_co_occur_equal_len.shape=}")
# Can include weight for queries using query_weights
ngram_co_occur_true = (
csr_matrix(
(ngram_co_occur_equal_len, ngram_co_occur_nz),
shape=ngram_co_occur.shape,
)
.dot(query_weights)
.sum(axis=1)
.A1
)
logging.info(f"{ngram_co_occur_true.shape=}")
all_factorials = factorial(np.arange(query_len.max() + 1))
logging.info(f"{all_factorials.shape=}")
numerator = all_factorials[
(
query_len[ngram_co_occur_nz[1]]
- ngram_unigram_len[ngram_co_occur_nz[0]]
+ 1
)
]
logging.info(f"{numerator.shape=}")
denominator = all_factorials[query_len[ngram_co_occur_nz[1]]]
logging.info(f"{denominator.shape=}")
# Can include weight for queries using query_weights
ngram_expected_prob = (
csr_matrix(
(
ngram_co_occur_equal_len * (numerator / denominator),
ngram_co_occur_nz,
),
shape=ngram_co_occur.shape,
)
.dot(query_weights)
.sum(axis=1)
.A1
)
logging.info(f"{ngram_expected_prob.shape=}")
if self.save_stats:
self.stats = dict(
co_occur=ngram_co_occur_true,
frequency=ngram_frequency,
expected_prob=ngram_expected_prob,
)
N = ngram_frequency
k = np.clip(ngram_co_occur_true, 1, None)
E_X = ngram_expected_prob
scores = (2 * (N - E_X) ** 2) / k
self.max_score = scores.max()
logging.info(f"{scores.shape=}")
if beta > 0:
# scores = np.where(scores > beta*k, scores, 0)
self.scores = dict(
((key, val) for key, val in zip(self.sig_ngrams, scores) if val > 0)
)
else:
self.scores = dict(zip(self.sig_ngrams, scores))
return self.scores

    def add_ground_truth_ngrams(self, ngrams):
        # self.max_score = max(self.scores.values())
        # Retain old scores for later reference
        self._old_scores = dict(self.scores)
        for ngram in tqdm(ngrams):
            tokens = self.analyzer(ngram)
            if len(tokens) > 1:
                ngram = " ".join(tokens)
                self.scores[ngram] = self.max_score

    def reset_truth_ngrams(self):
        if hasattr(self, "_old_scores"):
            self.scores = dict(self._old_scores)
            del self._old_scores

    def _score_query(self, query):
        query = self.analyzer(query)
        query_len = len(query)
        val = [0] * (query_len + 1)
        idx = [0] * (query_len + 1)
        for i in range(1, query_len + 1):
            max_val = -1
            for j in range(i):
                subquery = " ".join(query[j:i])
                if subquery in self.scores:
                    score = self.scores[subquery] + val[j]
                else:
                    score = val[j]
                if score >= max_val:
                    max_val = score
                    idx[i] = j
            val[i] = max_val
        return query, val, idx

    def _segment(self, query, return_scores=False):
        """Segment a query with dynamic programming."""
        query, val, idx = self._score_query(query)
        query_len = len(query)
        segments = []
        while query_len > 0:
            query_len = idx[query_len]
            segments.insert(0, " ".join(query[query_len:]))
            query = query[:query_len]
        if return_scores:
            return segments, val
        return segments

    def segment_to_bio(self, query, tag_label="PHRASE"):
        """Segment a query and return its tokens with BIO labels."""
        query, val, idx = self._score_query(query)
        query_len = len(query)
        segments = []
        bio_labels = []
        B_LABEL = f"B-{tag_label}"
        I_LABEL = f"I-{tag_label}"
        O_LABEL = "O"
        while query_len > 0:
            query_len = idx[query_len]
            seg = query[query_len:]
            # segments.insert(0, seg)
            segments.append(seg)
            n = len(seg)
            bio = [I_LABEL] * n
            if n == 1:
                bio = [O_LABEL]
            else:
                bio[0] = B_LABEL
            query = query[:query_len]
            # bio_labels.insert(0, bio)
            bio_labels.append(bio)
        segments = sum(reversed(segments), [])
        bio_labels = sum(reversed(bio_labels), [])
        return segments, bio_labels
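    # Illustrative output (assuming "new world" has been learned or added as a
    # ground-truth n-gram): segment_to_bio("where is a new world") would return
    #     (["where", "is", "a", "new", "world"],
    #      ["O", "O", "O", "B-PHRASE", "I-PHRASE"]).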


if __name__ == "__main__":
    fast_segmenter = FastQuerySegmenter()
    queries = ["hello world", "how to find a new place", "where is a new world"]
    query_weights = [5, 6, 1]
    fast_segmenter.compute_scores(queries, alpha=1, query_weights=query_weights)
    fast_segmenter.add_ground_truth_ngrams(["new world", "new place"])
    ngram_counts = Counter()
    for query in tqdm(queries):
        segments = fast_segmenter.segment(query)
        ngram_counts.update([s for s in segments if " " in s])
        print(f"{query=} -> {segments=}")

    import matplotlib.pyplot as plt

    scores = np.array(list(fast_segmenter.scores.values()))
    plt.hist(np.log10(scores + 1), bins=100, cumulative=True, density=True)
    plt.xlabel("$X = log_{10}(score + 1)$")
    plt.ylabel("$P(x < X)$")
    plt.show()

    top_ngrams = list(
        zip(range(100), sorted(fast_segmenter.scores.items(), key=lambda x: -x[1]))
    )
    print(top_ngrams)