"""hollowPM - Hollow Phonetic Matrix module.

This is my Hollow Phonetic Matrix module. It handles different poetic
word-search operations for my Flask website (https://puns.plus). It is a
handler for three large 45,000 x 45,000 nearest-neighbor matrices that are too
big for me to want to load into memory: one for overall sound similarity, one
for word2vec meaning similarity, and one for presence of individual phonetic
features. Given different search requests, it translates from search word to
vector number, unpickles individual vectors from the disk as needed, performs
the necessary math, and then renders the results as words.

>>> pm = HollowPM(basepath, vocabName)

Words that sound the most like 'clam'. (Phonetic similarity.)

>>> v1 = pm.ph('clam')
>>> v1
<1x45829 sparse matrix of type '<class 'numpy.float64'>'
        with 5000 stored elements in Compressed Sparse Row format>
>>> pm.render(v1, n=8)
['Klamm', 'clam', 'clammy', 'clams', 'clamp', 'clamor', 'clamps', 'clamped']

Words that are most similar in meaning to 'clam'. (word2vec semantic similarity.)

>>> v2 = pm.w2('clam')
>>> pm.render(v2, n=8)
['clam', 'clams', 'oyster', 'lobster', 'crab', 'crabs', 'scallop', 'oysters']

Words that start with the sounds "kl".

>>> v3 = pm.gr('#kl')
>>> pm.render(v3, n=8)
['Clo', 'clay', 'Clay', 'claw', 'clue', 'Cleo', 'Chloe', 'claws']

Words that sound like 'clam' AND have a similar meaning to 'clam'.

>>> vCombo = v1.multiply(v2)
>>> pm.render(vCombo, n=8)
['clam', 'clams', 'clambake', 'lamb', 'ham', 'yam', 'calamari', 'scampi']

Words that sound like 'clam' AND have a similar meaning to 'clam' AND start
with "kl".

>>> vComboAll = vCombo.multiply(v3)
>>> pm.render(vComboAll, n=10)
['clam', 'clams', 'clambake', 'clove', 'claws', 'cloves', 'clump', 'clover',
 'clownfish', 'clementine']

Words that are most similar in sound to both 'clam' AND 'lobster'.

>>> vv = pm.ph('clam*lobster')
>>> pm.render(vv, n=8)
['Kloster', 'clatter', 'clobber', 'clamber', 'cloister', 'cluster', 'clamor',
 'clobbered']

Words that have a "kl" sound but don't start with it.

>>> pm.render(pm.gr('kl') - pm.gr('#kl'), n=8)
['cackling', 'quickly', 'Buckley', 'thickly', 'sickly', 'acclaim', 'psychically',
 'sackcloth']
"""

# phVecs is another module in this project which handles generating
# phonetic feature vectors and associated math.
from phVecs import FeatureBank, double_prune_N
import numpy as np
from scipy import sparse as sps
import sklearn as skl
import sklearn.preprocessing  # make sure skl.preprocessing.normalize is importable
import pickle
from collections import defaultdict  # used by the sort_render helper in render()
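
# The constructor below expects its pickled data to already exist under
# `basepath`. A sketch of the assumed on-disk layout (file names inferred from
# the path-building in HollowPM.__init__ and _open_vector; the vocabulary name
# "demo" is just a placeholder):
#
#   <basepath>/keyDictdemo.pickle          # word <-> index mapping
#   <basepath>/posDictdemo.pickle          # index -> part of speech
#   <basepath>/meterDictdemo.pickle        # index -> meter
#   <basepath>/phv_naybs/demo_<i>.pickle   # phonetic nearest-neighbor vector for word i
#   <basepath>/w2v_naybs/demo_<i>.pickle   # word2vec nearest-neighbor vector for word i
#   <basepath>/grm_naybs/demo_<j>.pickle   # nearest-neighbor vector for phonetic feature j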


class HollowPM:
    """Hollow Phonetics Matrix wrapper. A handler for three large nearest-
    neighbor matrices for a given vocabulary: phonetic similarity (ph),
    semantic similarity (w2), and phonetic features (gr).
    """

    def __init__(self, basepath, vocabName):
        # Set the paths for opening the needed vectors.
        self._basepath = basepath
        self._vocabName = vocabName
        self._phNaybsPath = "phv_naybs/" + self._vocabName + "_"
        self._w2NaybsPath = "w2v_naybs/" + self._vocabName + "_"
        self._grNaybsPath = "grm_naybs/" + self._vocabName + "_"
        self._posPath = "posDict" + self._vocabName
        self._meterPath = "meterDict" + self._vocabName
        # keyDict is a dictionary that maps words to indices and indices to words.
        self.keyDict = self._open_pickle("keyDict" + self._vocabName)
        # FeatureBank maps each individual phonetic feature vector to its index.
        self.fb = FeatureBank()
        xShape = len(self.keyDict) // 2  # The number of words in the vocabulary
        yShape = self.fb.maxLen_  # The number of phonetic features in the model
        self.shape = (xShape, yShape)

    def __getitem__(self, item):
        """Return the corresponding index or word for a given word or index."""
        return self.keyDict.get(item)

    def _ix(self, item):
        """Return the index for a given word. If given an index, return the
        same index. Returns None for anything not in the vocabulary.
        """
        i = None
        if isinstance(item, str):
            i = self.keyDict.get(item)
        elif item in self.keyDict:
            i = item
        elif int(item) in self.keyDict:
            i = int(item)
        return i

    def _ix_gr(self, item):
        """Return the index for a given phonetic feature. If given an index,
        return the same index.
        """
        i = None
        if isinstance(item, (str, list)):
            i = self.fb[item]
        elif isinstance(item, int):
            # Only accept indices that actually exist in the feature bank.
            if item in range(self.fb.maxLen_):
                i = item
        return i

    def _open_sum(self, indices, open_function):
        """Open a list of nearest-neighbor vectors by index, then add and
        normalize them.
        """
        vecs = [open_function(i) for i in indices]
        vecs = [v for v in vecs if v.count_nonzero()]
        if not vecs:
            return self.empty_nayb()
        summed = self.normalize(sum(vecs))
        return summed

    def _open_prod(self, indices, open_function):
        """Open a list of nearest-neighbor vectors by index, then multiply and
        normalize them.
        """
        vecs = [open_function(i) for i in indices]
        vecs = [v for v in vecs if v.count_nonzero()]
        if not vecs:
            return self.empty_nayb()
        prodded = self.multiply_list(vecs, NORM=True)
        return prodded

    def _open_vector(self, prefix, i):
        """Open a vector from the disk, given a prefix and index."""
        try:
            return self._open_pickle(prefix + str(i))
        except Exception:
            # Missing or unreadable vectors render as empty.
            return self.empty_nayb()

    def _open_split_vector(self, prefix, i):
        """Open a vector from the disk, if the vector is split into separate
        numpy arrays of its data and indices.
        """
        data = self._open_pickle(prefix + str(i) + "_data")
        inds = self._open_pickle(prefix + str(i) + "_inds")
        return data, inds

    def _n_range_handle(self, array, n):
        """A protocol for handling requests for different slices as a function
        input. Used in HollowPM.render() and HollowSS.print_nearest_crosses().
        """
        if isinstance(n, tuple):
            array = array[n[0]:n[1]]
        elif n < 0:
            array = array[n:]
        else:
            array = array[:n]
        return array
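
    # A quick illustration of the slicing convention above (comment only):
    # n=8 -> array[:8] (top 8), n=-8 -> array[-8:] (bottom 8), and
    # n=(10, 20) -> array[10:20] (ranks 10 through 19).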

    def _open_pickle(self, path):
        """Open a pickled object off of the disk in the path where the
        HollowPM's data is stored.
        """
        with open(self._basepath + path + ".pickle", "rb") as f:
            item = pickle.load(f)
        return item

    def ph(self, i):
        """Open a word's phonetic nearest-neighbor vector, or combine multiple
        words' nearest-neighbor vectors. Uses cosine similarity of the phonetic
        feature vectors from phVecs to determine similarity.

        Parameters
        ----------
        i: int or str
            if int: returns a single word's nearest-neighbor vector by index.
            if str: returns a single word's nearest-neighbor vector, or combines
                multiple words' nearest-neighbor vectors.
                To combine multiple words, input a string with the words
                separated by '*' or '+':
                self.ph('clam+lobster') returns the sum of the vectors for
                    'clam' and 'lobster'
                self.ph('clam*lobster') returns the product of the vectors for
                    'clam' and 'lobster'
                Use '+' if you want to return neighbors for "clam OR lobster"
                Use '*' if you want to return neighbors for "clam AND lobster"
                If you mix '+' and '*', the order of operations is backwards:
                '+' resolves before '*'

        Returns
        -------
        naybVec: CSR matrix
            A nearest-neighbor vector for the given word or combination of words.
        """
if "*" in i: | |
# Return the product of all of the words in the input string | |
return self._open_prod(i.split("*"), self.ph) | |
if "+" in i: | |
# Return the sum of all of the words in the input string | |
return self._open_sum(i.split("+"), self.ph) | |
i = self._ix(i) | |
naybVec = self._open_vector(self._phNaybsPath, i) | |
return naybVec | |

    def w2(self, i):
        """Open a word's semantic nearest-neighbor vector, or combine multiple
        words' nearest-neighbor vectors. Uses word2vec cosine distance to
        determine semantic (meaning) similarity.

        Parameters
        ----------
        i: int or str
            if int: returns a single word's nearest-neighbor vector by index.
            if str: returns a single word's nearest-neighbor vector, or combines
                multiple words' nearest-neighbor vectors.
                To combine multiple words, input a string with the words
                separated by '*' or '+':
                self.w2('China+France') returns the sum of the vectors for
                    'China' and 'France'
                self.w2('China*France') returns the product of the vectors for
                    'China' and 'France'
                Use '+' if you want to return neighbors of 'China' OR 'France'
                Use '*' if you want to return neighbors of 'China' AND 'France'
                If you mix '+' and '*', the order of operations is backwards:
                '+' resolves before '*'

        Returns
        -------
        naybVec: CSR matrix
            A nearest-neighbor vector for the given word or combination of words.
        """
        if isinstance(i, str) and "*" in i:
            # Return the product of all of the words in the input string.
            return self._open_prod(i.split("*"), self.w2)
        if isinstance(i, str) and "+" in i:
            # Return the sum of all of the words in the input string.
            return self._open_sum(i.split("+"), self.w2)
        i = self._ix(i)
        naybVec = self._open_vector(self._w2NaybsPath, i)
        return naybVec

    def gr(self, i):
        """Open a phonetic feature's nearest-neighbor vector, or combine
        multiple phonetic features' nearest-neighbor vectors. Uses normalized
        phonetic feature vectors from phVecs to determine similarity.

        Parameters
        ----------
        i: int or str
            if int: returns a single feature's nearest-neighbor vector by index.
            if str: returns a single feature's nearest-neighbor vector, or
                combines multiple features' nearest-neighbor vectors.
                To combine multiple features, input a string with the features
                separated by '*' or '+':
                self.gr('b+v') returns the sum of the vectors for 'b' and 'v'
                self.gr('b*v') returns the product of the vectors for 'b' and 'v'
                Use '+' if you want to return words with 'b' OR 'v'
                Use '*' if you want to return words with 'b' AND 'v'
                If you mix '+' and '*', the order of operations is backwards:
                '+' resolves before '*'

        Returns
        -------
        naybVec: CSR matrix
            A nearest-neighbor vector for the given feature or combination of
            features.
        """
        if isinstance(i, str) and "*" in i:
            # Return the product of all of the features in the input string.
            return self._open_prod(i.split("*"), self.gr)
        if isinstance(i, str) and "+" in i:
            # Return the sum of all of the features in the input string.
            return self._open_sum(i.split("+"), self.gr)
        i = self._ix_gr(i)
        naybVec = self._open_vector(self._grNaybsPath, i)
        return naybVec

    def render(
        self,
        naybVec,
        n=20,
        iList=None,
        VALUES=False,
        rounding=3,
        POSITIVE=True,
        BYMETER=False,
        BYPOS=False,
    ):
        """Convert a nearest-neighbors vector into a sorted list of the words
        it represents.

        Parameters
        ----------
        naybVec: CSR Matrix
            A nearest-neighbors vector corresponding to samples in the
            HollowPM's vocabulary.
        n: int or tuple
            Determines which slice of neighbors to return:
            If n is a positive int: returns the top n values.
            If n is a negative int: returns the bottom -n values.
            If n is a tuple: returns the values ranked n[0] to n[1].
        iList: array
            A mapping back to vocabulary indices, for when naybVec represents
            only a subset of the vocabulary (e.g. the rows of a HollowSS
            nearest-neighbors matrix).
        VALUES: bool
            If True, render() will return a list of (word, value) tuples
            instead of just words.
        rounding: int
            Determines what place to round values to.
        POSITIVE: bool
            Whether to only return neighbors with positive scores.
        BYMETER: bool
            Whether to return the results in a dict, sorted by meter.
        BYPOS: bool
            Whether to return the results in a dict, sorted by part of speech.
        """

        def sort_render(indices, sortDict, PRINT=True):
            """Sort a list of indices by a trait mapped in sortDict.
            Prints the results or returns them in a defaultdict.
            """
            sortedInds = defaultdict(list)
            for i in indices:
                sortedInds[sortDict[i]].append(self[i])
            if PRINT:
                # Print the words to output.
                for k, v in sortedInds.items():
                    print(k)
                    print(", ".join(v))
            else:
                # Return as a defaultdict.
                return sortedInds

        sorts = np.argsort(-naybVec.data)
        if POSITIVE:
            sorts = sorts[np.where(naybVec.data[sorts] > 0)]
        sorts = self._n_range_handle(sorts, n)
        if iList is not None:
            indices = iList[naybVec.indices[sorts]]
        else:
            indices = naybVec.indices[sorts]
        if BYMETER:
            # Sort the words by their meter.
            sortDict = self._open_pickle(self._meterPath)
            return sort_render(indices, sortDict, PRINT=False)
        if BYPOS:
            # Sort the words by their part of speech.
            sortDict = self._open_pickle(self._posPath)
            return sort_render(indices, sortDict, PRINT=False)
        if VALUES:
            # Return the list of words with their values from the vector.
            outList = [
                (self[i], np.round(float(v), rounding))
                for i, v in zip(indices, naybVec.data[sorts])
            ]
        else:
            # Return as a list of words.
            outList = [self[i] for i in indices]
        return outList
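
    # Call-form sketch for render() (results omitted; v1 follows the module
    # docstring examples):
    #   pm.render(v1, n=8)                      # top 8 words
    #   pm.render(v1, n=(10, 20), VALUES=True)  # ranks 10-19 as (word, score) pairs
    #   pm.render(v1, BYPOS=True)               # defaultdict of words keyed by part of speech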

    def multiply_list(self, vecs, NORM=True):
        """Perform elementwise vector multiplication across a list of vectors.
        Normalizes the result by default.
        """
        runningVec = vecs[0]
        for vec in vecs[1:]:
            runningVec = runningVec.multiply(vec)
        if NORM:
            runningVec = self.normalize(runningVec)
        return runningVec

    def empty_nayb(self):
        """Return an empty nearest-neighbors vector."""
        return sps.csr_matrix(
            (np.array([]), np.array([]), np.array([0, 0])), shape=(1, self.shape[0])
        )

    def count_to_nayb(self, count):
        """Convert a Counter object of phonetic features to a nearest-neighbor
        vector. Can be used for performing phonetic searches for words outside
        of the vocabulary.
        """
        nayb = self.empty_nayb()
        l2Norm = 0
        for k, v in count.items():
            nayb = nayb + self.gr(k) * v
            l2Norm += v ** 2
        l2Norm **= 0.5
        nayb = nayb / l2Norm
        return nayb
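
    # Usage sketch for count_to_nayb() (the feature keys below are hypothetical
    # placeholders; real keys come from the phVecs FeatureBank, like the '#kl'
    # and 'kl' features used in the module docstring):
    #   from collections import Counter
    #   oovCount = Counter({'#kl': 1, 'kl': 1})  # features of an out-of-vocabulary word
    #   oovVec = pm.count_to_nayb(oovCount)
    #   pm.render(oovVec, n=8)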

    def normalize(self, vec):
        """Normalize a vector using the L2 norm."""
        return skl.preprocessing.normalize(vec)

    def slml(self, sl, ml, m=1, vecN=None, n=None):
        """Run a "sounds like, means like" search: return words which sound
        like one word and have a similar meaning to another.

        Parameters
        ----------
        sl: str
            A word or combination of words to search for phonetic similarity.
            (Words separated by '+' have their vectors added.)
            (Words separated by '*' have their vectors multiplied.)
        ml: str
            A word or combination of words to search for semantic similarity.
            (Words separated by '+' have their vectors added.)
            (Words separated by '*' have their vectors multiplied.)
        m: float
            The amount of weight to give to semantic similarity rather than
            phonetic similarity: an exponent that the values of the
            "sounds-like" component are raised to.
        vecN: int or None
            The number of neighbors to retain for each search word. Defaults
            to all.
        n: int or None
            The number of neighbors to render. By default, slml returns the
            full neighbor vector as a vector. If an int is given for n, slml
            returns a list of strings.

        Returns
        -------
        By default, the nearest-neighbor vector of the search.
        If n is an int, a list of strings of the top words from the search.
        """
        a = self.ph(sl)
        b = self.w2(ml)
        if vecN:
            a = self.sort_vec(a, vecN, AS_VECTOR=True)
            b = self.sort_vec(b, vecN, AS_VECTOR=True)
        if m != 1:
            a.data = a.data ** m
        c = a.multiply(b)
        if n is not None:
            return self.render(c, n=n)
        else:
            return c
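
    # Usage sketch for slml() (hypothetical query words, results omitted):
    #   pm.slml('clam', 'lobster', n=8)       # sounds like 'clam', means like 'lobster'
    #   pm.slml('clam', 'lobster', vecN=500)  # prune each side to its top 500 neighbors first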

    def pairs(self, aWord, bWord, m=0.5, n=25, vecN=None):
        """Run a "perfect pairs" search on two words or categories.

        Parameters
        ----------
        aWord: str
            The first word or group of words to pull "synonyms" for.
            (Words separated by '+' have their vectors added.)
            (Words separated by '*' have their vectors multiplied.)
        bWord: str
            The second word or group of words to pull "synonyms" for.
            (Words separated by '+' have their vectors added.)
            (Words separated by '*' have their vectors multiplied.)
        m: float
            The amount of weight to give to semantic similarity rather than
            phonetic similarity. Defaults to equal weights.
        n: int
            The number of neighbors to retain in the nearest-neighbors matrix.
        vecN: int
            The number of "synonyms" to retain for each search word. Defaults
            to all.

        Returns
        -------
        A HollowSS object, which can be queried for overall top matches, or top
        matches for a given word.
        """
        aVec = self.w2(aWord)
        bVec = self.w2(bWord)
        if vecN is not None:
            aVec = self.sort_vec(aVec, n=vecN, AS_VECTOR=True)
            bVec = self.sort_vec(bVec, n=vecN, AS_VECTOR=True)
        return self.hollow_pairs(aVec, bVec, self._phNaybsPath, m=m, n=n)

    def flip_pairs(self, aWord, bWord, m=0.5, n=25, vecN=None):
        """Run a "meaning mates" search on two words or groups of words.
        (The converse of self.pairs().)

        Parameters
        ----------
        aWord: str
            The first word or group of words to pull phonetic neighbors for.
            (Words separated by '+' have their vectors added.)
            (Words separated by '*' have their vectors multiplied.)
        bWord: str
            The second word or group of words to pull phonetic neighbors for.
            (Words separated by '+' have their vectors added.)
            (Words separated by '*' have their vectors multiplied.)
        m: float
            The amount of weight to give to phonetic similarity rather than
            semantic similarity. Defaults to equal weights.
        n: int
            The number of neighbors to retain in the nearest-neighbors matrix.
        vecN: int
            The number of phonetic neighbors to retain for each search word.
            Defaults to all.

        Returns
        -------
        A HollowSS object, which can be queried for overall top matches, or top
        matches for a given word.
        """
        aVec = self.ph(aWord)
        bVec = self.ph(bWord)
        if vecN is not None:
            aVec = self.sort_vec(aVec, n=vecN, AS_VECTOR=True)
            bVec = self.sort_vec(bVec, n=vecN, AS_VECTOR=True)
        return self.hollow_pairs(aVec, bVec, self._w2NaybsPath, m=m, n=n)
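
    # Usage sketch for flip_pairs() (hypothetical inputs; mirrors the pairs()
    # example in the HollowSS docstring below, but the categories are built
    # from phonetic neighbors and the pairing is by meaning):
    #   ss = pm.flip_pairs('clam', 'lobster', vecN=500)
    #   ss.print_nearest_crosses(10)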

    def hollow_pairs(self, aVec, bVec, prefix, m=0.5, n=25):
        """Do the math for generating a HollowSS search object: find all
        pairwise cosine distances between the elements of two nearest-neighbor
        vectors.

        N is a nearest-neighbors matrix, the result of the operation
        x.T • A • y, where A is an adjacency matrix for the entire vocabulary,
        and x and y are the two input vectors. To save compute, only the rows
        of A which correspond to a nonzero value in aVec are loaded into
        memory. N is then thresholded to retain only the top n neighbors in
        each row and column. (This not only saves memory, but improves the
        salience of the results.)
        """
        # Make the vectors orthogonal so a word can only be in one category or
        # the other.
        aVec, bVec = self.deredund_vectors(aVec, bVec)
        # Build the nearest-neighbors matrix as a scipy CSR matrix by compiling
        # each row's data and indices.
        dataList = []
        indsList = []
        indptrList = [0]
        iList = aVec.indices
        # For each index in aVec, open its nearest-neighbor vector and compile
        # it into a matrix.
        for i in iList:
            vec = self._open_vector(prefix, i)
            dataList.append(vec.data)
            indsList.append(vec.indices)
            indptrList.append(len(vec.indices))
        data = np.concatenate(dataList)
        indices = np.concatenate(indsList)
        indptr = np.cumsum(indptrList)
        N = sps.csr_matrix((data, indices, indptr), shape=(len(iList), self.shape[0]))
        # Multiply N elementwise by the two query vectors, to scale it by
        # their weights.
        if m is not None:
            teenyAvec = sps.csr_matrix(aVec.data ** m)
            bbVec = bVec.tocsr()
            if m != 1:
                bbVec.data = bbVec.data ** m
            N = teenyAvec.T.multiply(N).multiply(bbVec)
        # Threshold N to retain only the top n neighbors for each row and
        # column.
        if n is not None:
            N = double_prune_N(N, n=n)
        # Wrap the nearest-neighbors matrix and query vectors in a HollowSS
        # object.
        ss = HollowSS(self, N, iList, aVec, bVec)
        return ss

    def deredund_vectors(self, aVec, bVec):
        """Make two vectors orthogonal by eliminating any points which are
        more highly expressed in the other vector.
        """
        uniqueA = self.cull_distal_words(aVec, bVec)
        uniqueB = self.cull_distal_words(bVec, aVec)
        return uniqueA, uniqueB

    def cull_distal_words(self, aVec, bVec):
        """Return a copy of aVec with only the values that are higher in aVec
        than in bVec.
        """
        uniqueA = aVec.multiply(aVec > bVec)
        return uniqueA

    def sort_vec(self, vec, n=100, AS_VECTOR=False):
        """Return the top n indices of a vector, or return a culled version
        of the vector containing only the top n values.
        """
        try:
            # Pull the indices of the top n values from the vector's data.
            topInds = np.argpartition(vec.data, -n)[-n:]
        except ValueError:
            # Fewer than n stored values: keep them all.
            topInds = np.arange(len(vec.data))
        if AS_VECTOR:
            vec = sps.csr_matrix(
                (vec.data[topInds], vec.indices[topInds], np.array([0, len(topInds)])),
                shape=vec.shape,
            )
            outVec = vec
        else:
            sorts = np.argsort(-vec.data[topInds])
            indices = vec.indices[topInds[sorts]]
            outVec = indices
        return outVec
""" | |
Container for a SynonymSearch matrix, a nearest-neighbors matrix for two given | |
clusters of synonyms. Used for the "pairs" and "flip_pairs" searches. | |
e.g. "Find first names and countries that sound alike" | |
(Categories are approximated here by returning w2v neighbors for a small sample | |
of terms.) | |
(For w2v reasons, cuing first names in this way seems to bias it towards | |
women's names.) | |
(The scores are the product of the words' semantic distances from the search | |
words and their phonetic distance from each other.) | |
>>> ss = pm.pairs('Jason*Mary', 'France*China', vecN=500) | |
>>> ss.print_nearest_crosses(10) | |
('Francie', 'France', 0.06555291557083659) | |
('Fran', 'France', 0.06401958667751825) | |
('Frances', 'France', 0.06169720053114153) | |
('Brittany', 'Britain', 0.05366786355699321) | |
('Mollie', 'Mali', 0.045043048421428575) | |
('Molly', 'Mali', 0.04318341143473779) | |
('Candace', 'Canada', 0.036466033543298616) | |
('Joanne', 'Japan', 0.035450886042190224) | |
('Bridget', 'Britain', 0.03468668989727991) | |
('Cindy', 'India', 0.03356244630824604) | |
Return the top matches for the "first name" category | |
>>> ss.a_render(8) | |
['Julie', 'Christine', 'Jody', 'Angela', 'Rebecca', 'Jennifer', 'Melissa', | |
'Valerie'] | |
Return the countries that sound the most like "Diane" | |
>>> ss.neighborest('Diane', n=4) | |
['Sudan', 'Japan', 'japan', 'Bretagne'] | |
Return the countries that sound the most like "Jody" | |
>>> ss.neighborest('Jody', n=4) | |
['Cambodia', 'Japan', 'Germany', 'Saudi'] | |
Return the countries that sound the most like "Melissa" | |
>>> ss.neighborest('Melissa', n=4) | |
['Malaysia', 'Bolivia', 'Macao', 'Switzerland'] | |
""" | |


class HollowSS:
    """A search object for querying the pairwise nearest neighbors between two
    lists of words. The lists of words are represented by aVec and bVec.

    Parameters
    ----------
    pm: HollowPM
        The Phonetics Matrix handler object for the vocabulary this search
        was performed in.
    N: CSR Matrix
        The nearest-neighbors matrix for the given pairs search.
    iList: array
        The indices in aVec; it maps the rows of N back onto HollowPM's
        whole vocabulary.
    aVec: 1-D CSR Matrix
        The values of Category A: the nearest-neighbor vector of the first
        input word or category.
    bVec: 1-D CSR Matrix
        The values of Category B: the nearest-neighbor vector of the second
        input word or category.
    """

    def __init__(self, pm, N, iList, aVec, bVec):
        self.pm = pm
        self.N = N
        self.iList = iList
        self.aVec = aVec
        self.bVec = bVec
        self.cross_self()

    def __getitem__(self, i):
        return self.neighborest(i)

    def _get_i(self, word):
        # If a word's index is in iList then it is in Category A and needs to
        # be mapped onto N's rows.
        pmI = self.pm._ix(word)
        i = np.where(self.iList == pmI)[0]
        if i.size:
            i = int(i[0])
        else:
            i = pmI
        return i

    def _a_neighborest(self, word, n=30):
        """Return the best matches for a given word or index in Category A."""
        pmI = self.pm._ix(word)
        if pmI not in self.iList:
            v = self.pm.empty_nayb()
        else:
            i = self._get_i(pmI)
            v = self.N[i]
        return self.pm.render(v, n=n)

    def _b_neighborest(self, word, n=30):
        """Return the best matches for a given word or index in Category B."""
        i = self.pm._ix(word)
        v = self.N[:, i].T.tocsr()
        return self.pm.render(v, iList=self.iList, n=n)

    def cross_self(self, n=10000):
        """Convert the nearest-neighbors matrix into a list of the top overall
        pairs.
        """
        # Pull the top overall pairs in the search by their value in N.
        cooN = self.N.tocoo()
        # Sort the values of N as a COOrdinate matrix.
        sorts = np.argsort(-cooN.data)
        if n > 0:
            sorts = sorts[:n]
        # Organize them into ((index_A, index_B), score) tuples.
        crosses = [
            ((self.iList[int(a)], int(b)), s)
            for a, b, s in zip(cooN.row[sorts], cooN.col[sorts], cooN.data[sorts])
        ]
        self.crosses = crosses

    def print_nearest_crosses(self, n=40):
        """Print the overall top pairs from the search."""
        crosses = self.pm._n_range_handle(self.crosses, n)
        for (a, b), c in crosses:
            print((self.pm[a], self.pm[b], c))

    def neighborest(self, word, n=20):
        """Return the best matches for a given word in the search.

        Parameters
        ----------
        word: str
            A word resulting from the search: one of the words in Category A
            or Category B.
        n: int
            The number of matches to return.
        """
        i = self.pm._ix(word)
        if i in self.iList:
            return self._a_neighborest(word, n)
        else:
            return self._b_neighborest(word, n)

    def a_render(self, n=30):
        """Return a list of the top matches for Category A."""
        return self.pm.render(self.aVec, n)

    def b_render(self, n=30):
        """Return a list of the top matches for Category B."""
        return self.pm.render(self.bVec, n)

    def top_aNaybs(self, n=8, nn=10):
        """Print out the top nn matches for the top n words in Category A."""
        inds = self.pm.sort_vec(self.aVec, n)
        for i in inds:
            print(self.pm[i])
            print(self._a_neighborest(i, nn))

    def top_bNaybs(self, n=8, nn=10):
        """Print out the top nn matches for the top n words in Category B."""
        inds = self.pm.sort_vec(self.bVec, n)
        for i in inds:
            print(self.pm[i])
            print(self._b_neighborest(i, nn))
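

if __name__ == "__main__":
    # Minimal manual smoke test, not part of the original module's interface.
    # It assumes the pickled data layout sketched near the top of this file;
    # pass the base path and vocabulary name on the command line.
    import sys

    basepath, vocabName = sys.argv[1], sys.argv[2]
    pm = HollowPM(basepath, vocabName)
    print(pm.render(pm.ph("clam"), n=8))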