"""Faster Implementation of Unsupervised Query Segmentation. | |
Uses vectorized operations | |
- author: @napsternxg | |
Unsupervised Query Segmentation Using only Query Logs [Mishra et. al. 2011] | |
https://www.microsoft.com/en-us/research/wp-content/uploads/2011/01/pp0295-mishra.pdf | |
Based on the python implementation of https://github.com/kchro/query-segmenter | |
Usage: | |
from query_segmenter.unsupervised import Segmenter | |
segmenter = FastQuerySegmenter() | |
segmenter.compute_scores(queries) | |
segments = segmenter.segment('new iphone 6') | |
assert segments == ['new', 'iphone 6'] | |
Related sources: | |
- [Neural Methods for Intent understanding/reformulation](https://docs.google.com/presentation/d/1YBtqRMCEw9dLrr5zLC1nZIYtygAvJ7T2vQmV7HahBO8/edit#slide=id.gac24ab650a_1_1) | |
- [Unsupervised Query Segmentation Using Clickthrough for Information Retrieval](https://www.cse.iitb.ac.in/~soumen/doc/www2013/QirWoo/LiHZW2011ClickQuerySegment.pdf) | |
- [Learning Noun Phrase Query Segmentation](https://aclanthology.org/D07-1086.pdf) | |
- [E-commerce Concept Mining](https://docs.google.com/presentation/d/1hBZnFF413H8eFz4czI-fibq1FdvcmowkLvfBh1tvxj0/edit#slide=id.gafd9c6c2a6_0_67) | |
""" | |
import logging
import math
import re
from collections import Counter, defaultdict

import numpy as np
from scipy.sparse import csr_matrix, diags
from scipy.special import factorial
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.auto import tqdm

logging.basicConfig(
    format="%(asctime)s %(levelname)-8s %(message)s",
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
)


class Segmenter(object):
    """Implementation of Unsupervised Query Segmentation."""

    def __init__(self, save_stats=False):
        """Construct Segmenter."""
        self.scores = defaultdict(float)
        self.stats = defaultdict(lambda: self._initialize_ngram())
        self.save_stats = save_stats

    def compute_scores(self, queries, alpha=1, beta=0.0):
        """Compute segmentation scores.

        Parameters
        ----------
        queries : list (n_queries)
            Training data for determining segments.
        alpha : int, optional
            Minimum query frequency. All words in an n-gram must occur in
            at least alpha queries to be considered a "significant n-gram."
        beta : float, optional
            beta * k is the segmentation score threshold, where k is the
            number of queries that contain all the words of a given n-gram.

        Returns
        -------
        self.scores : dict
            A mapping of n-gram to segmentation score.

        Algorithm
        ---------
        Consider an n-gram M = [word_1, word_2, ..., word_n].
        Let {query_1, query_2, ..., query_k} be the subset of queries
        that contain all the words of M in some order.
        To test if M is a multi-word entity, we check if M occurs
        together more often than not.
        """
        sig_ngrams = self._get_significant_ngrams(queries, alpha=alpha)
        self._precompute_stats(sig_ngrams, queries)
        for ngram in tqdm(sig_ngrams):
            N = self.stats[ngram]["frequency"]
            k = max(self.stats[ngram]["co_occur"], 1)
            E_X = self.stats[ngram]["expectation"]
            score = 2 * (N - E_X) ** 2 / k
            if score < beta * k:
                # the score does not exceed the threshold
                continue
            self.scores[ngram] = score
        return self.scores

    def segment(self, query):
        """Segment a query (single or batch).

        Parameters
        ----------
        query : str or array-like

        Returns
        -------
        segments : 1D or 2D list of strings
        """
        if isinstance(query, str):
            segments = self._segment(query)
        elif hasattr(query, "__iter__"):
            segments = list(map(self._segment, query))
        else:
            raise TypeError("query must be a string or an iterable of strings")
        return segments

    def _get_significant_ngrams(self, queries, ngram_range=(2, 10), alpha=1):
        """Get significant n-grams in the list of queries.

        An n-gram must occur in at least "alpha" queries to be significant.
        Uses sklearn's CountVectorizer with min_df=alpha.
        """
        cv = CountVectorizer(
            ngram_range=ngram_range, token_pattern=r"\w+", min_df=alpha
        )
        cv.fit(queries)
        sig_ngrams = list(cv.vocabulary_.keys())
        logging.info(f"Found {len(sig_ngrams)} sig_ngrams")
        return sig_ngrams
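
    # Toy illustration (hypothetical corpus): with alpha=2 and
    # queries = ["new iphone case", "new iphone 6"], the bigram "new iphone"
    # appears in both queries and survives min_df=2, while "iphone case" and
    # "iphone 6" each appear in only one query and are dropped.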

    def _precompute_stats(self, sig_ngrams, queries):
        """Precompute counts for each n-gram M.

        The segmentation score is a function of three values:
        - the number of queries where M's words co-occur in any order ( k )
        - the number of queries where M occurs in that order ( N )
        - the total probability that M occurs contiguously by chance ( E(X) )

        A high score indicates that M is a multi-word entity.
        A low score indicates that M is not a multi-word entity.
        """
        for ngram in tqdm(sig_ngrams):
            ngram_words = ngram.split()
            ngram_len = len(ngram_words)
            for query in queries:
                sub_queries = query.split()
                if all(word in sub_queries for word in ngram_words):
                    # all words of the n-gram co-occur in the query
                    self.stats[ngram]["co_occur"] += 1
                    if self._match_ngram(ngram, query):
                        # the n-gram occurs contiguously in the query
                        self.stats[ngram]["frequency"] += 1
                    query_len = len(sub_queries)
                    numer = math.factorial(query_len - ngram_len + 1)
                    denom = math.factorial(query_len)
                    prob = numer / denom
                    self.stats[ngram]["expectation"] += prob
        return self.stats
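
    # Sanity check of the expectation term above: for a bigram (ngram_len=2)
    # inside a 4-word query, treating the bigram as one block gives
    # (4 - 2 + 1)! = 3! word orders where it stays contiguous and in order,
    # out of 4! total orders, i.e. prob = 6 / 24 = 0.25.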

    def _initialize_ngram(self):
        """Initialize the information to collect about each n-gram."""
        return {"frequency": 0, "co_occur": 0, "expectation": 0}

    def _match_ngram(self, ngram, query):
        """Return True if the n-gram occurs contiguously, in order, in the query."""
        match = re.search(rf"\b{ngram}\b", query)
        return match is not None

    def _segment(self, query):
        """Segment a query with dynamic programming."""
        query = query.split()
        query_len = len(query)
        val = [0] * (query_len + 1)
        idx = [0] * (query_len + 1)
        for i in range(1, query_len + 1):
            max_val = -1
            for j in range(i):
                subquery = " ".join(query[j:i])
                if subquery in self.scores:
                    score = self.scores[subquery] + val[j]
                else:
                    score = val[j]
                if score >= max_val:
                    max_val = score
                    idx[i] = j
                    val[i] = max_val
        segments = []
        while query_len > 0:
            query_len = idx[query_len]
            segments.insert(0, " ".join(query[query_len:]))
            query = query[:query_len]
        return segments
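

# Minimal usage sketch for the reference Segmenter above (hypothetical toy
# corpus; the resulting segments depend on the learned scores):
#
#     seg = Segmenter()
#     seg.compute_scores(["new iphone 6", "iphone 6 case", "best iphone 6"])
#     seg.segment("best iphone 6")  # -> ['best', 'iphone 6']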


class FastQuerySegmenter(object):
    """Vectorized implementation of unsupervised query segmentation."""

    def __init__(self, save_stats=False):
        """Construct Segmenter."""
        self.scores = dict()
        self.stats = dict()
        self.save_stats = save_stats

    def compute_scores(
        self, queries, alpha=1, beta=0.6, ngram_range=(2, 10), query_weights=None
    ):
        """Compute segmentation scores.

        Parameters
        ----------
        queries : list (n_queries)
            Training data for determining segments.
        alpha : int, optional
            Minimum query frequency. All words in an n-gram must occur in
            at least alpha queries to be considered a "significant n-gram."
        beta : float, optional
            beta * k is the segmentation score threshold, where k is the
            number of queries that contain all the words of a given n-gram.
        ngram_range : tuple, optional
            Range of n-gram lengths to consider as candidate segments.
        query_weights : array-like (n_queries), optional
            Per-query weights used when aggregating counts; defaults to
            uniform weights of 1.

        Returns
        -------
        self.scores : dict
            A mapping of n-gram to segmentation score.

        Algorithm
        ---------
        Consider an n-gram M = [word_1, word_2, ..., word_n].
        Let {query_1, query_2, ..., query_k} be the subset of queries
        that contain all the words of M in some order.
        To test if M is a multi-word entity, we check if M occurs
        together more often than not.
        """
        self.scores = self._get_significant_ngrams(
            queries,
            ngram_range=ngram_range,
            alpha=alpha,
            beta=beta,
            query_weights=query_weights,
        )
        return self.scores

    def segment(self, query):
        """Segment a query (single or batch).

        Parameters
        ----------
        query : str or array-like

        Returns
        -------
        segments : 1D or 2D list of strings
        """
        if isinstance(query, str):
            segments = self._segment(query, return_scores=False)
        elif hasattr(query, "__iter__"):
            segments = [self._segment(q, return_scores=False) for q in query]
        else:
            raise TypeError("query must be a string or an iterable of strings")
        return segments

    def _get_significant_ngrams(
        self, queries, ngram_range=(2, 10), alpha=1, beta=0.6, query_weights=None
    ):
        """Get significant n-grams in the list of queries and score them.

        An n-gram must occur in at least "alpha" queries to be significant.
        Uses sklearn's CountVectorizer with min_df=alpha.
        """
        if query_weights is not None:
            query_weights = diags(query_weights)
        else:
            query_weights = diags(np.ones(len(queries)))
        logging.info(f"{query_weights.shape=}")
        self.cv_unigram = CountVectorizer(
            ngram_range=(1, 1), token_pattern=r"\w+", min_df=1
        )
        transform_unigram_queries = self.cv_unigram.fit_transform(queries)
        logging.info(f"{transform_unigram_queries.shape=}")
        self.analyzer = self.cv_unigram.build_analyzer()
        query_len = transform_unigram_queries.sum(axis=1).A1
        logging.info(f"{query_len.shape=}")
        self.cv_ngram = CountVectorizer(
            ngram_range=ngram_range, token_pattern=r"\w+", min_df=alpha
        )
        transform_ngram_queries = self.cv_ngram.fit_transform(queries)
        logging.info(f"{transform_ngram_queries.shape=}")
        self.sig_ngrams = self.cv_ngram.get_feature_names_out()
        self.sig_ngrams_vocab = self.cv_ngram.vocabulary_
        transform_unigram_ngrams = self.cv_unigram.transform(self.sig_ngrams)
        logging.info(f"{transform_unigram_ngrams.shape=}")
        # This step is the slowest. Find ways to optimize it so that it runs
        # on large datasets.
        # ngram_co_occur = ((transform_unigram_ngrams > 0).astype(int) @ (transform_unigram_queries.T > 0).astype(int))
        # ngrams x unigrams dot unigrams x queries = ngrams x queries
        ngram_co_occur = (
            (transform_unigram_ngrams > 0)
            .astype(int)
            .dot(transform_unigram_queries.T > 0)
            .astype(int)
        )
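        # Each entry [m, q] of ngram_co_occur counts how many of n-gram m's
        # distinct words appear in query q. Hypothetical shapes for intuition:
        # with 1,000 significant n-grams, a 5,000-word vocabulary, and 10,000
        # queries, this is a (1000 x 5000) @ (5000 x 10000) sparse product
        # yielding a (1000 x 10000) matrix.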
logging.info(f"{ngram_co_occur.shape=}") | |
ngram_co_occur_nz = ngram_co_occur.nonzero() | |
# Can be weighted using query_weights | |
ngram_frequency = ( | |
(transform_ngram_queries.T > 0) | |
.astype(int) | |
.dot(query_weights) | |
.sum(axis=1) | |
.A1 | |
) | |
logging.info(f"{ngram_frequency.shape=}") | |
ngram_unigram_len = (transform_unigram_ngrams > 0).sum(axis=1).A1 | |
logging.info(f"{ngram_unigram_len.shape=}") | |
# ngram_co_occur_equal_len = (ngram_co_occur[ngram_co_occur_nz] == ngram_unigram_len[ngram_co_occur_nz[0]]).A1 | |
logging.info(f"{ngram_co_occur.data.shape=}") | |
logging.info(f"{ngram_unigram_len[ngram_co_occur_nz[0]].shape=}") | |
ngram_co_occur_equal_len = ( | |
ngram_co_occur.data == ngram_unigram_len[ngram_co_occur_nz[0]] | |
) | |
logging.info(f"{ngram_co_occur_equal_len.shape=}") | |
# Can include weight for queries using query_weights | |
ngram_co_occur_true = ( | |
csr_matrix( | |
(ngram_co_occur_equal_len, ngram_co_occur_nz), | |
shape=ngram_co_occur.shape, | |
) | |
.dot(query_weights) | |
.sum(axis=1) | |
.A1 | |
) | |
logging.info(f"{ngram_co_occur_true.shape=}") | |
all_factorials = factorial(np.arange(query_len.max() + 1)) | |
logging.info(f"{all_factorials.shape=}") | |
numerator = all_factorials[ | |
( | |
query_len[ngram_co_occur_nz[1]] | |
- ngram_unigram_len[ngram_co_occur_nz[0]] | |
+ 1 | |
) | |
] | |
logging.info(f"{numerator.shape=}") | |
denominator = all_factorials[query_len[ngram_co_occur_nz[1]]] | |
logging.info(f"{denominator.shape=}") | |
# Can include weight for queries using query_weights | |
ngram_expected_prob = ( | |
csr_matrix( | |
( | |
ngram_co_occur_equal_len * (numerator / denominator), | |
ngram_co_occur_nz, | |
), | |
shape=ngram_co_occur.shape, | |
) | |
.dot(query_weights) | |
.sum(axis=1) | |
.A1 | |
) | |
logging.info(f"{ngram_expected_prob.shape=}") | |
        if self.save_stats:
            self.stats = dict(
                co_occur=ngram_co_occur_true,
                frequency=ngram_frequency,
                expected_prob=ngram_expected_prob,
            )
        N = ngram_frequency
        k = np.clip(ngram_co_occur_true, 1, None)
        E_X = ngram_expected_prob
        scores = (2 * (N - E_X) ** 2) / k
        self.max_score = scores.max()
        logging.info(f"{scores.shape=}")
        if beta > 0:
            # Zero out scores that do not clear the documented beta * k
            # threshold, then keep only the surviving n-grams.
            scores = np.where(scores > beta * k, scores, 0)
            self.scores = {
                key: val for key, val in zip(self.sig_ngrams, scores) if val > 0
            }
        else:
            self.scores = dict(zip(self.sig_ngrams, scores))
        return self.scores

    def add_ground_truth_ngrams(self, ngrams):
        """Force known multi-word n-grams to the maximum observed score."""
        # self.max_score = max(self.scores.values())
        # Retain old scores for later reference
        self._old_scores = dict(self.scores)
        for ngram in tqdm(ngrams):
            tokens = self.analyzer(ngram)
            if len(tokens) > 1:
                ngram = " ".join(tokens)
            self.scores[ngram] = self.max_score

    def reset_truth_ngrams(self):
        """Restore the scores saved before add_ground_truth_ngrams."""
        if hasattr(self, "_old_scores"):
            self.scores = dict(self._old_scores)
            del self._old_scores

    def _score_query(self, query):
        """Run the segmentation DP; return tokens, DP values, and backpointers."""
        query = self.analyzer(query)
        query_len = len(query)
        val = [0] * (query_len + 1)
        idx = [0] * (query_len + 1)
        for i in range(1, query_len + 1):
            max_val = -1
            for j in range(i):
                subquery = " ".join(query[j:i])
                if subquery in self.scores:
                    score = self.scores[subquery] + val[j]
                else:
                    score = val[j]
                if score >= max_val:
                    max_val = score
                    idx[i] = j
                    val[i] = max_val
        return query, val, idx
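
    # DP trace with hypothetical scores {"new world": 5.0} on the query
    # "a new world": val = [0, 0, 0, 5.0] and idx = [0, 0, 1, 1], so
    # backtracking keeps token 0 ("a") alone and takes tokens 1:3
    # ("new world") as one segment.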

    def _segment(self, query, return_scores=False):
        """Segment a query with dynamic programming."""
        query, val, idx = self._score_query(query)
        query_len = len(query)
        segments = []
        while query_len > 0:
            query_len = idx[query_len]
            segments.insert(0, " ".join(query[query_len:]))
            query = query[:query_len]
        if return_scores:
            return segments, val
        return segments

    def segment_to_bio(self, query, tag_label="PHRASE"):
        """Segment a query and emit BIO labels for multi-word segments."""
        query, val, idx = self._score_query(query)
        query_len = len(query)
        segments = []
        bio_labels = []
        B_LABEL = f"B-{tag_label}"
        I_LABEL = f"I-{tag_label}"
        O_LABEL = "O"
        while query_len > 0:
            query_len = idx[query_len]
            seg = query[query_len:]
            # segments.insert(0, seg)
            segments.append(seg)
            n = len(seg)
            bio = [I_LABEL] * n
            if n == 1:
                bio = [O_LABEL]
            else:
                bio[0] = B_LABEL
            query = query[:query_len]
            # bio_labels.insert(0, bio)
            bio_labels.append(bio)
        segments = sum(reversed(segments), [])
        bio_labels = sum(reversed(bio_labels), [])
        return segments, bio_labels
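
    # Hypothetical output (assumes only "new world" has a positive score):
    #     segment_to_bio("where is a new world")
    #     -> (["where", "is", "a", "new", "world"],
    #         ["O", "O", "O", "B-PHRASE", "I-PHRASE"])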


if __name__ == "__main__":
    fast_segmenter = FastQuerySegmenter()
    queries = ["hello world", "how to find a new place", "where is a new world"]
    # Optional per-query weights, passed through to compute_scores.
    query_weights = [5, 6, 1]
    fast_segmenter.compute_scores(queries, alpha=1, query_weights=query_weights)
    fast_segmenter.add_ground_truth_ngrams(["new world", "new place"])
    ngram_counts = Counter()
    for query in tqdm(queries):
        segments = fast_segmenter.segment(query)
        ngram_counts.update([s for s in segments if " " in s])
        print(f"{query=} -> {segments=}")

    import matplotlib.pyplot as plt

    scores = np.array(list(fast_segmenter.scores.values()))
    plt.hist(np.log10(scores + 1), bins=100, cumulative=True, density=True)
    plt.xlabel("$X = log_{10}(score + 1)$")
    plt.ylabel("$P(x < X)$")
    top_ngrams = list(
        zip(range(100), sorted(fast_segmenter.scores.items(), key=lambda x: -x[1]))
    )
    print(top_ngrams)