Last active
August 3, 2024 10:04
-
-
Save eliotl/411a045b45daf94b5184a9d34600c6ee to your computer and use it in GitHub Desktop.
hollowPM - Hollow Phonetic Matrix module
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This is my Hollow Phonetics Matrix module: It is for handling different poetic | |
word search operations for my Flask website (https://puns.plus) It is a handler | |
for three large 45,000 x 45,000 nearest neighbor matrices that are too big for | |
me to want to load into memory. (One for overall sound similarity, one for | |
word2vec meaning similarity, and one for presence of individual phonetic features.) | |
Given different search requests, this translates from search word to vector | |
number, unpickles individual vectors from the disk as needed, performs the | |
necessary math, and then renders the results as words. | |
>>> pm = HollowPM(basepath) | |
Words that sound the most like 'clam'. (Phonetic similarity.) | |
>>> v1 = pm.ph('clam') | |
>>> v1 | |
<1x45829 sparse matrix of type '<class 'numpy.float64'>' | |
with 5000 stored elements in Compressed Sparse Row format> | |
>>> pm.render(v1, n=8) | |
['Klamm', 'clam', 'clammy', 'clams', 'clamp', 'clamor', 'clamps', 'clamped'] | |
Words that are most similar in meaning to 'clam'. (word2vec semantic similarity.) | |
>>> v2 = pm.w2('clam') | |
>>> pm.render(v2, n=8) | |
['clam', 'clams', 'oyster', 'lobster', 'crab', 'crabs', 'scallop', 'oysters'] | |
Words that start with the sounds "kl". | |
>>> v3 = pm.gr('#kl') | |
>>> pm.render(v3, n=8) | |
['Clo', 'clay', 'Clay', 'claw', 'clue', 'Cleo', 'Chloe', 'claws'] | |
Words that sound like 'clam' AND have a similar meaning to 'clam' | |
>>> vCombo = v1.multiply(v2) | |
>>> pm.render(vCombo, n=8) | |
['clam', 'clams', 'clambake', 'lamb', 'ham', 'yam', 'calamari', 'scampi'] | |
Words that sound like 'clam' AND have a similar meaning to 'clam' AND start with | |
"kl" | |
>>> vComboAll = vCombo.multiply(v3) | |
>>> pm.render(vComboAll, n=10) | |
['clam', 'clams', 'clambake', 'clove', 'claws', 'cloves', 'clump', 'clover', | |
'clownfish', 'clementine'] | |
Words that are most similar in sound to both 'clam' AND 'lobster' | |
>>> vv = pm.ph('clam*lobster') | |
>>> pm.render(vv, n=8) | |
['Kloster', 'clatter', 'clobber', 'clamber', 'cloister', 'cluster', 'clamor', | |
'clobbered'] | |
Words that have a "kl" sound but don't start with it | |
>>> pm.render(pm.gr('kl') - pm.gr('#kl'), n=8) | |
['cackling', 'quickly', 'Buckley', 'thickly', 'sickly', 'acclaim', 'psychically', | |
'sackcloth'] | |
""" | |
# phVecs is another module in this project which handles generating | |
# phonetic feature vectors and associated math. | |
import pickle
from collections import defaultdict

import numpy as np
import sklearn as skl
from scipy import sparse as sps

from phVecs import FeatureBank, double_prune_N
class HollowPM:
    """Hollow Phonetics Matrix wrapper.

    A handler for three large nearest-neighbor matrices over one vocabulary:
    phonetic similarity (``ph``), semantic similarity (``w2``) and phonetic
    features (``gr``). Each row is stored on disk as its own pickle and is
    only loaded when a query asks for it.

    Parameters
    ----------
    basepath: str
        Prefix prepended verbatim to every pickle path (no separator is
        inserted, so include a trailing one if this is a directory).
    vocabName: str
        Name of the vocabulary; it is embedded in every file name.
    """

    def __init__(self, basepath, vocabName):
        # Set the paths for opening the needed vectors.
        self._basepath = basepath
        self._vocabName = vocabName
        self._phNaybsPath = "phv_naybs/" + self._vocabName + "_"
        self._w2NaybsPath = "w2v_naybs/" + self._vocabName + "_"
        self._grNaybsPath = "grm_naybs/" + self._vocabName + "_"
        self._posPath = "posDict" + self._vocabName
        self._meterPath = "meterDict" + self._vocabName
        # keyDict maps words to indices AND indices to words.
        self.keyDict = self._open_pickle("keyDict" + self._vocabName)
        # The FeatureBank maps individual phonetic features to their index.
        self.fb = FeatureBank()
        # keyDict holds two entries per word, hence the halving.
        xShape = len(self.keyDict) // 2  # number of words in the vocabulary
        yShape = self.fb.maxLen_  # number of phonetic features in the model
        self.shape = (xShape, yShape)

    def __getitem__(self, item):
        """Return the corresponding index or word for a given word or index."""
        return self.keyDict.get(item)

    # ------------------------------------------------------------------
    # Index resolution
    # ------------------------------------------------------------------
    def _ix(self, item):
        """Return the vocabulary index for a word, or echo back a valid index.

        Returns None when the word or index is not in the vocabulary (the
        original left the result unbound in that case and crashed).
        """
        if isinstance(item, str):
            return self.keyDict.get(item)
        try:
            if item in self.keyDict:
                return item
            asInt = int(item)
        except (TypeError, ValueError):
            # Unhashable or non-numeric input can never be a valid index.
            return None
        return asInt if asInt in self.keyDict else None

    def _ix_gr(self, item):
        """Return the index for a phonetic feature, or echo back a valid index.

        Strings and lists are looked up in the FeatureBank; integers are
        accepted when they fall inside ``range(self.fb.maxLen_)``. Anything
        else yields None.
        """
        if isinstance(item, (str, list)):
            return self.fb[item]
        if isinstance(item, (int, np.integer)) and 0 <= item < self.fb.maxLen_:
            return item
        return None

    # ------------------------------------------------------------------
    # Disk access
    # ------------------------------------------------------------------
    def _open_pickle(self, path):
        """Open a pickled object from the path where this HollowPM's data is
        stored.

        NOTE(review): pickle.load executes arbitrary code from the file; only
        point ``basepath`` at data you generated yourself.
        """
        with open(self._basepath + path + ".pickle", "rb") as f:
            return pickle.load(f)

    def _open_vector(self, prefix, i):
        """Open a vector from the disk, given a prefix and index.

        Best-effort: an unknown index or an unreadable file yields an empty
        nearest-neighbors vector instead of raising.
        """
        if i is None:
            return self.empty_nayb()
        try:
            return self._open_pickle(prefix + str(i))
        except Exception:
            # Deliberately broad (a corrupt pickle can raise many types), but
            # no longer a bare ``except`` that would swallow KeyboardInterrupt.
            return self.empty_nayb()

    def _open_split_vector(self, prefix, i):
        """Open a vector that is stored on disk as two separate pickles, one
        for its data and one for its indices. Returns ``(data, inds)``.
        """
        data = self._open_pickle(prefix + str(i) + "_data")
        inds = self._open_pickle(prefix + str(i) + "_inds")
        return data, inds

    # ------------------------------------------------------------------
    # Combining vectors
    # ------------------------------------------------------------------
    def _open_nonempty(self, indices, open_function):
        """Open every requested vector and drop the ones with no nonzeros."""
        return [v for v in map(open_function, indices) if v.count_nonzero()]

    def _open_sum(self, indices, open_function):
        """Open a list of nearest-neighbor vectors, add and normalize them.

        Returns an empty vector when none of the requested vectors has any
        nonzero entry.
        """
        vecs = self._open_nonempty(indices, open_function)
        if not vecs:
            return self.empty_nayb()
        return self.normalize(sum(vecs))

    def _open_prod(self, indices, open_function):
        """Open a list of nearest-neighbor vectors, multiply and normalize
        them. Empty vectors are skipped rather than zeroing the product.
        """
        vecs = self._open_nonempty(indices, open_function)
        return self.multiply_list(vecs, NORM=True)

    def _open_query(self, query, opener, resolve, prefix):
        """Shared implementation of ph(), w2() and gr().

        String queries containing '*' are split and multiplied; otherwise
        those containing '+' are split and summed ('*' is checked first, so
        '+' groups resolve before '*'). Anything else is resolved to a single
        index with ``resolve`` and loaded from ``prefix``.
        """
        # Only strings can carry operators; ints go straight to the lookup
        # (``"*" in 5`` used to raise TypeError).
        if isinstance(query, str):
            if "*" in query:
                return self._open_prod(query.split("*"), opener)
            if "+" in query:
                return self._open_sum(query.split("+"), opener)
        return self._open_vector(prefix, resolve(query))

    def _n_range_handle(self, array, n):
        """Slice ``array`` according to the ``n`` protocol.

        Used in HollowPM.render() and HollowSS.print_nearest_crosses().
        A tuple selects ``array[n[0]:n[1]]``, a negative int the last ``-n``
        items, a non-negative int the first ``n`` items, and None everything.
        """
        if n is None:
            return array
        if isinstance(n, tuple):
            return array[n[0] : n[1]]
        if n < 0:
            return array[n:]
        return array[:n]

    # ------------------------------------------------------------------
    # Public vector lookups
    # ------------------------------------------------------------------
    def ph(self, i):
        """Open a word's phonetic nearest-neighbor vector, or combine
        multiple words' nearest-neighbor vectors.

        Uses cosine similarity of the phonetic feature vectors from phVecs
        to determine similarity.

        Parameters
        ----------
        i: int or str
            if int: returns a single word's nearest-neighbor vector by index.
            if str: returns a single word's nearest-neighbor vector, or
            combines multiple words separated by '*' or '+':
            self.ph('clam+lobster') returns the sum of the two vectors
            ("clam OR lobster"); self.ph('clam*lobster') returns their
            product ("clam AND lobster").
            If you mix '+' and '*', the order of operations is backwards:
            '+' resolves before '*'.

        Returns
        -------
        naybVec: CSR matrix
            A nearest neighbors vector for the given word or combination.
        """
        return self._open_query(i, self.ph, self._ix, self._phNaybsPath)

    def w2(self, i):
        """Open a word's semantic nearest-neighbor vector, or combine
        multiple words' nearest-neighbor vectors.

        Uses word2vec cosine distance to determine semantic (meaning)
        similarity.

        Parameters
        ----------
        i: int or str
            if int: returns a single word's nearest-neighbor vector by index.
            if str: returns a single word's nearest-neighbor vector, or
            combines multiple words separated by '*' or '+':
            self.w2('China+France') returns the sum of the two vectors
            ('China' OR 'France'); self.w2('China*France') returns their
            product ('China' AND 'France').
            If you mix '+' and '*', the order of operations is backwards:
            '+' resolves before '*'.

        Returns
        -------
        naybVec: CSR matrix
            A nearest neighbors vector for the given word or combination.
        """
        return self._open_query(i, self.w2, self._ix, self._w2NaybsPath)

    def gr(self, i):
        """Open a phonetic feature's nearest-neighbor vector, or combine
        multiple phonetic features' nearest-neighbor vectors.

        Uses normalized phonetic feature vectors from phVecs to determine
        similarity.

        Parameters
        ----------
        i: int or str
            if int: returns a single feature's nearest-neighbor vector by
            index.
            if str: returns a single feature's vector, or combines multiple
            features separated by '*' or '+':
            self.gr('b+v') returns the sum (words with 'b' OR 'v');
            self.gr('b*v') returns the product (words with 'b' AND 'v').
            If you mix '+' and '*', the order of operations is backwards:
            '+' resolves before '*'.

        Returns
        -------
        naybVec: CSR matrix
            A nearest neighbors vector for the given feature or combination.
        """
        return self._open_query(i, self.gr, self._ix_gr, self._grNaybsPath)

    # ------------------------------------------------------------------
    # Rendering
    # ------------------------------------------------------------------
    def sort_render(self, indices, sortDict, PRINT=True):
        """Group a list of indices by a trait mapped in sortDict.

        Prints each trait followed by its comma-separated words, or, when
        PRINT is False, returns the groups as a defaultdict of word lists.
        """
        sortedInds = defaultdict(list)
        for i in indices:
            sortedInds[sortDict[i]].append(self[i])
        if not PRINT:
            return sortedInds
        for trait, words in sortedInds.items():
            print(trait)
            print(", ".join(words))

    def render(
        self,
        naybVec,
        n=20,
        iList=None,
        VALUES=False,
        rounding=3,
        POSITIVE=True,
        BYMETER=False,
        BYPOS=False,
    ):
        """Convert a nearest-neighbors vector into a sorted list of the words
        it represents.

        Parameters
        ----------
        naybVec: CSR Matrix
            A nearest-neighbors vector corresponding to samples in the
            HollowPM's vocabulary.
        n: int, tuple or None
            Determines which slice of neighbors to return:
            If n is a positive int: Returns the top n values
            If n is a negative int: Returns the bottom -n values
            If n is a tuple: Returns the top n[0] to n[1] values
            If n is None: Returns every value
        iList: array
            A mapping from positions in naybVec to vocabulary indices, for
            when naybVec is indexed over a subset of the vocabulary.
        VALUES: bool
            If True, render() will return a list of (word, value) tuples,
            instead of just words.
        rounding: int
            Determines what place to round values to.
        POSITIVE: bool
            Whether to only return neighbors with positive scores.
        BYMETER: bool
            Whether to return the results in a dict, grouped by meter.
        BYPOS: bool
            Whether to return the results in a dict, grouped by
            part-of-speech.
        """
        # Positions into naybVec.data, highest score first.
        sorts = np.argsort(-naybVec.data)
        if POSITIVE:
            sorts = sorts[np.where(naybVec.data[sorts] > 0)]
        sorts = self._n_range_handle(sorts, n)

        indices = naybVec.indices[sorts]
        if iList is not None:
            indices = iList[indices]

        if BYMETER:
            # Group the words by their meter.
            sortDict = self._open_pickle(self._meterPath)
            return self.sort_render(indices, sortDict, PRINT=False)
        if BYPOS:
            # Group the words by their part of speech.
            sortDict = self._open_pickle(self._posPath)
            return self.sort_render(indices, sortDict, PRINT=False)
        if VALUES:
            # Return the words paired with their values from the vector.
            return [
                (self[i], np.round(float(v), rounding))
                for i, v in zip(indices, naybVec.data[sorts])
            ]
        return [self[i] for i in indices]

    # ------------------------------------------------------------------
    # Vector math
    # ------------------------------------------------------------------
    def multiply_list(self, vecs, NORM=True):
        """Multiply a list of vectors element-wise. Normalizes the result by
        default. An empty list yields an empty nearest-neighbors vector.
        """
        if not vecs:
            return self.empty_nayb()
        runningVec = vecs[0]
        for vec in vecs[1:]:
            runningVec = runningVec.multiply(vec)
        if NORM:
            runningVec = self.normalize(runningVec)
        return runningVec

    def empty_nayb(self):
        """Return an empty (all-zero) 1 x vocabulary nearest-neighbors vector."""
        return sps.csr_matrix((1, self.shape[0]), dtype=np.float64)

    def count_to_nayb(self, count):
        """Convert a Counter of phonetic features to a nearest-neighbor
        vector. Can be used for performing phonetic searches for words
        outside of the vocabulary.

        The weighted sum of feature vectors is divided by the L2 norm of the
        counts; an empty Counter returns an empty vector.
        """
        nayb = self.empty_nayb()
        sumSquares = 0
        for k, v in count.items():
            nayb = nayb + self.gr(k) * v
            sumSquares += v ** 2
        if not sumSquares:
            # Nothing was counted; avoid dividing by zero.
            return nayb
        return nayb / sumSquares ** 0.5

    def normalize(self, vec):
        """Normalize a vector using the L2 norm."""
        # Import the submodule explicitly: a bare ``import sklearn`` is not
        # guaranteed to expose ``sklearn.preprocessing`` as an attribute.
        from sklearn.preprocessing import normalize as l2_normalize

        return l2_normalize(vec)

    # ------------------------------------------------------------------
    # Searches
    # ------------------------------------------------------------------
    def slml(self, sl, ml, m=1, vecN=None, n=None):
        """Run a "Sounds like, means like" search: return words which sound
        like one word and have similar meanings to another.

        Parameters
        ----------
        sl: str
            A word or combination of words to search for phonetic similarity.
            (Words separated by '+' have their vectors added)
            (Words separated by '*' have their vectors multiplied)
        ml: str
            A word or combination of words to search for semantic similarity.
            (Words separated by '+' have their vectors added)
            (Words separated by '*' have their vectors multiplied)
        m: float
            An exponent the values of the "sounds-like" component are raised
            to before the two vectors are multiplied.
        vecN: int or None
            The number of neighbors to retain for each search word. Defaults
            to all.
        n: int or None
            The number of neighbors to render. By default, slml returns the
            full neighbor vector as a vector.

        Returns
        -------
        By default, the nearest-neighbor vector of the search.
        If n is given, a list of strings of the top words from the search.
        """
        a = self.ph(sl)
        b = self.w2(ml)
        if vecN:
            a = self.sort_vec(a, vecN, AS_VECTOR=True)
            b = self.sort_vec(b, vecN, AS_VECTOR=True)
        if m != 1:
            a.data = a.data ** m
        c = a.multiply(b)
        if n is not None:
            return self.render(c, n=n)
        return c

    def pairs(self, aWord, bWord, m=0.5, n=25, vecN=None):
        """Run a "perfect pairs" search on two words or categories.

        Parameters
        ----------
        aWord: str
            The first word or group of words to pull "synonyms" for.
            (Words separated by '+' have their vectors added)
            (Words separated by '*' have their vectors multiplied)
        bWord: str
            The second word or group of words to pull "synonyms" for.
            (Words separated by '+' have their vectors added)
            (Words separated by '*' have their vectors multiplied)
        m: float
            Exponent applied to both semantic query vectors before they
            scale the phonetic neighbor matrix.
        n: int
            The number of neighbors to retain in the nearest-neighbors matrix.
        vecN: int
            The number of "synonyms" to retain for each search word. Defaults
            to all.

        Returns
        -------
        A HollowSS object, which can be queried for overall top matches, or
        top matches for a given word.
        """
        aVec = self.w2(aWord)
        bVec = self.w2(bWord)
        if vecN is not None:
            aVec = self.sort_vec(aVec, n=vecN, AS_VECTOR=True)
            bVec = self.sort_vec(bVec, n=vecN, AS_VECTOR=True)
        # ``n`` used to be accepted but silently ignored.
        return self.hollow_pairs(aVec, bVec, self._phNaybsPath, m=m, n=n)

    def flip_pairs(self, aWord, bWord, m=0.5, n=25, vecN=None):
        """Run a "meaning mates" search on two words or groups of words.
        (The converse of self.pairs())

        Parameters
        ----------
        aWord: str
            The first word or group of words to pull phonetic neighbors for.
            (Words separated by '+' have their vectors added)
            (Words separated by '*' have their vectors multiplied)
        bWord: str
            The second word or group of words to pull phonetic neighbors for.
            (Words separated by '+' have their vectors added)
            (Words separated by '*' have their vectors multiplied)
        m: float
            Exponent applied to both phonetic query vectors before they
            scale the semantic neighbor matrix.
        n: int
            The number of neighbors to retain in the nearest-neighbors matrix.
        vecN: int
            The number of phonetic neighbors to retain for each search word.
            Defaults to all.

        Returns
        -------
        A HollowSS object, which can be queried for overall top matches, or
        top matches for a given word.
        """
        aVec = self.ph(aWord)
        bVec = self.ph(bWord)
        if vecN is not None:
            aVec = self.sort_vec(aVec, n=vecN, AS_VECTOR=True)
            bVec = self.sort_vec(bVec, n=vecN, AS_VECTOR=True)
        # ``n`` used to be accepted but silently ignored.
        return self.hollow_pairs(aVec, bVec, self._w2NaybsPath, m=m, n=n)

    def hollow_pairs(self, aVec, bVec, prefix, m=0.5, n=25):
        """Do the math for generating a HollowSS search object.

        Builds N, a nearest-neighbors matrix with one row per nonzero entry
        of aVec (each row is that word's neighbor vector loaded from
        ``prefix``), scales its rows by aVec ** m and its columns by
        bVec ** m, and then thresholds it with double_prune_N to retain the
        top n neighbors. To save compute, only the vectors which correspond
        to a nonzero value in aVec are loaded into memory.

        Pass m=None to skip the scaling and n=None to skip the pruning.
        """
        # Make the vectors orthogonal so a word can only be in one category
        # or the other.
        aVec, bVec = self.deredund_vectors(aVec, bVec)
        iList = aVec.indices

        if len(iList) == 0:
            # np.concatenate rejects an empty list; with no Category A words
            # there is nothing to scale or prune either.
            N = sps.csr_matrix((0, self.shape[0]), dtype=np.float64)
            return HollowSS(self, N, iList, aVec, bVec)

        # Build N as a CSR matrix by stacking each row's data and indices.
        rows = [self._open_vector(prefix, i) for i in iList]
        data = np.concatenate([row.data for row in rows])
        indices = np.concatenate([row.indices for row in rows])
        indptr = np.cumsum([0] + [len(row.indices) for row in rows])
        N = sps.csr_matrix(
            (data, indices, indptr), shape=(len(iList), self.shape[0])
        )

        # Scale N by the weights of the two query vectors.
        if m is not None:
            rowWeights = sps.csr_matrix(aVec.data ** m)
            # tocsr() on a CSR matrix returns the same object, so copy before
            # exponentiating; otherwise the bVec handed to HollowSS is
            # silently modified.
            colWeights = bVec.tocsr().copy()
            if m != 1:
                colWeights.data = colWeights.data ** m
            N = rowWeights.T.multiply(N).multiply(colWeights)

        # Threshold N to retain only the top n neighbors.
        if n is not None:
            N = double_prune_N(N, n=n)

        return HollowSS(self, N, iList, aVec, bVec)

    def deredund_vectors(self, aVec, bVec):
        """Make two vectors orthogonal by eliminating any points which are
        more highly expressed in the other vector."""
        uniqueA = self.cull_distal_words(aVec, bVec)
        uniqueB = self.cull_distal_words(bVec, aVec)
        return uniqueA, uniqueB

    def cull_distal_words(self, aVec, bVec):
        """Return a copy of aVec with only the values that are higher in
        aVec than bVec."""
        return aVec.multiply(aVec > bVec)

    def sort_vec(self, vec, n=100, AS_VECTOR=False):
        """Return the indices of the top n values of a vector (highest
        first), or, with AS_VECTOR=True, a culled copy of the vector
        containing only its top n values.

        If the vector has fewer than n stored values, all of them are kept.
        """
        try:
            # Positions of the top n values in the vector's data.
            topInds = np.argpartition(vec.data, -n)[-n:]
        except (ValueError, TypeError, IndexError):
            # n exceeds the number of stored values: keep everything. This
            # must be an ndarray (not a range) so it can be fancy-indexed.
            topInds = np.arange(len(vec.data))

        if AS_VECTOR:
            return sps.csr_matrix(
                (vec.data[topInds], vec.indices[topInds], np.array([0, len(topInds)])),
                shape=vec.shape,
            )
        sorts = np.argsort(-vec.data[topInds])
        return vec.indices[topInds[sorts]]
""" | |
Container for a SynonymSearch matrix, a nearest-neighbors matrix for two given | |
clusters of synonyms. Used for the "pairs" and "flip_pairs" searches. | |
e.g. "Find first names and countries that sound alike" | |
(Categories are approximated here by returning w2v neighbors for a small sample | |
of terms.) | |
(For w2v reasons, cuing first names in this way seems to bias it towards | |
women's names.) | |
(The scores are the product of the words' semantic distances from the search | |
words and their phonetic distance from each other.) | |
>>> ss = pm.pairs('Jason*Mary', 'France*China', vecN=500) | |
>>> ss.print_nearest_crosses(10) | |
('Francie', 'France', 0.06555291557083659) | |
('Fran', 'France', 0.06401958667751825) | |
('Frances', 'France', 0.06169720053114153) | |
('Brittany', 'Britain', 0.05366786355699321) | |
('Mollie', 'Mali', 0.045043048421428575) | |
('Molly', 'Mali', 0.04318341143473779) | |
('Candace', 'Canada', 0.036466033543298616) | |
('Joanne', 'Japan', 0.035450886042190224) | |
('Bridget', 'Britain', 0.03468668989727991) | |
('Cindy', 'India', 0.03356244630824604) | |
Return the top matches for the "first name" category | |
>>> ss.a_render(8) | |
['Julie', 'Christine', 'Jody', 'Angela', 'Rebecca', 'Jennifer', 'Melissa', | |
'Valerie'] | |
Return the countries that sound the most like "Diane" | |
>>> ss.neighborest('Diane', n=4) | |
['Sudan', 'Japan', 'japan', 'Bretagne'] | |
Return the countries that sound the most like "Jody" | |
>>> ss.neighborest('Jody', n=4) | |
['Cambodia', 'Japan', 'Germany', 'Saudi'] | |
Return the countries that sound the most like "Melissa" | |
>>> ss.neighborest('Melissa', n=4) | |
['Malaysia', 'Bolivia', 'Macao', 'Switzerland'] | |
""" | |
class HollowSS:
    """
    A search object for querying the pairwise nearest neighbors between two
    lists of words. The lists of words are represented by aVec and bVec.

    Parameters
    ----------
    pm: HollowPM
        The Phonetics Matrix handler object for the vocabulary this search
        was performed in.
    N: CSR Matrix
        The nearest neighbors matrix for the given pairs search.
    iList: array
        The indices in aVec; it maps the rows of N back onto HollowPM's
        whole vocabulary.
    aVec: 1-D CSR Matrix
        The values of Category A. The nearest-neighbor vector of the first
        input word or category.
    bVec: 1-D CSR Matrix
        The values of Category B. The nearest-neighbor vector of the second
        input word or category.
    """

    def __init__(self, pm, N, iList, aVec, bVec):
        self.pm = pm
        self.N = N
        self.iList = iList
        self.aVec = aVec
        self.bVec = bVec
        # Precompute the overall top pairs (stored on self.crosses).
        self.cross_self()

    def __getitem__(self, i):
        """Shorthand for ``self.neighborest(i)``."""
        return self.neighborest(i)

    def _row_of(self, pmI):
        """Return the row of N that holds vocabulary index ``pmI``, or None
        when that index is not in Category A."""
        rows = np.flatnonzero(np.asarray(self.iList) == pmI)
        return int(rows[0]) if rows.size else None

    def _get_i(self, word):
        """Map a word (or vocabulary index) to its position in N.

        If the word's index is in iList it is in Category A and is mapped to
        its row of N; otherwise its vocabulary index is returned unchanged.
        """
        pmI = self.pm._ix(word)
        row = self._row_of(pmI)
        # Compare against None explicitly: row 0 is a valid (falsy) match,
        # which the original truthiness test on the array got wrong.
        return pmI if row is None else row

    def _a_neighborest(self, word, n=30):
        """Return the best matches for a given word or index in Category A."""
        row = self._row_of(self.pm._ix(word))
        v = self.pm.empty_nayb() if row is None else self.N[row]
        return self.pm.render(v, n=n)

    def _b_neighborest(self, word, n=30):
        """Return the best matches for a given word or index in Category B."""
        i = self.pm._ix(word)
        if i is None:
            # Unknown word: nothing to slice out of N.
            return self.pm.render(self.pm.empty_nayb(), n=n)
        # Column i of N, as a row vector whose positions are rows of N;
        # iList maps those positions back to vocabulary indices.
        v = self.N[:, i].T.tocsr()
        return self.pm.render(v, iList=self.iList, n=n)

    def cross_self(self, n=10000):
        """Convert the nearest-neighbors matrix into a list of the top
        overall pairs, stored on ``self.crosses`` as
        ``((index_A, index_B), score)`` tuples. A non-positive n keeps all.
        """
        # Sort the values of N as a COOrdinate matrix, highest first.
        cooN = self.N.tocoo()
        sorts = np.argsort(-cooN.data)
        if n > 0:
            sorts = sorts[:n]
        self.crosses = [
            ((self.iList[int(a)], int(b)), s)
            for a, b, s in zip(cooN.row[sorts], cooN.col[sorts], cooN.data[sorts])
        ]

    def print_nearest_crosses(self, n=40):
        """Print the overall top pairs from the search."""
        for (a, b), score in self.pm._n_range_handle(self.crosses, n):
            print((self.pm[a], self.pm[b], score))

    def neighborest(self, word, n=20):
        """Return the best matches for a given word in the search.

        Parameters
        ----------
        word: str
            A word resulting from the search. If it belongs to Category A
            its matches are drawn from Category B, and vice versa.
        n: int
            The number of matches to return.
        """
        if self.pm._ix(word) in self.iList:
            return self._a_neighborest(word, n)
        return self._b_neighborest(word, n)

    def a_render(self, n=30):
        """Return a list of the top matches for Category A."""
        return self.pm.render(self.aVec, n)

    def b_render(self, n=30):
        """Return a list of the top matches for Category B."""
        return self.pm.render(self.bVec, n)

    def top_aNaybs(self, n=8, nn=10):
        """Print the top nn matches for the top n words in Category A."""
        for i in self.pm.sort_vec(self.aVec, n):
            print(self.pm[i])
            print(self._a_neighborest(i, nn))

    def top_bNaybs(self, n=8, nn=10):
        """Print the top nn matches for the top n words in Category B."""
        for i in self.pm.sort_vec(self.bVec, n):
            print(self.pm[i])
            print(self._b_neighborest(i, nn))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment