Skip to content

Instantly share code, notes, and snippets.

@ashokc
Created December 23, 2019 02:03
Show Gist options
  • Save ashokc/5bbc70ee1f455bf05b4f2a602971a719 to your computer and use it in GitHub Desktop.
Save ashokc/5bbc70ee1f455bf05b4f2a602971a719 to your computer and use it in GitHub Desktop.
def scoreBySigTerms(test_docs, sig_words, n_sig_terns):
    """Score documents against each label's significant terms and predict a label.

    For each of the two labels (0 and 1), the first ``n_sig_terns`` entries of
    ``sig_words[label]`` are used as a fixed vocabulary. A document's score for
    a label is the total count of its tokens that fall in that vocabulary.
    Scores are then linearly normalised per document so the two values sum to
    (approximately) 1, and the label with the larger value is predicted.

    Args:
        test_docs: Sized collection of documents. The vectorizer's analyzer is
            the identity function, so each document is presumably already an
            iterable of tokens -- confirm against the caller.
        sig_words: Indexable by label (0 and 1); each entry is a sliceable
            sequence of terms. Presumably ordered most-significant first,
            since only the leading ``n_sig_terns`` are used.
        n_sig_terns: Number of leading significant terms to use per label.
            (The misspelled name is kept for caller compatibility.)

    Returns:
        A tuple ``(predicted_labels, probabilities)``: ``predicted_labels`` is
        a list with one predicted label (0 or 1) per document, and
        ``probabilities`` is a float array of shape ``(len(test_docs), 2)``
        holding the normalised scores.
    """
    labels = [0, 1]
    scores = np.zeros((len(test_docs), len(labels)))
    for column, label in enumerate(labels):
        useSigWords = sig_words[label][0:n_sig_terns]
        # Fixed vocabulary + identity analyzer: transform() only counts
        # occurrences of this label's significant terms in each document.
        vectorizer = CountVectorizer(analyzer=lambda x: x, min_df=1, vocabulary=useSigWords)
        test_doc_vectors = vectorizer.transform(test_docs)
        # Row sums of the sparse count matrix, flattened to a 1-D array.
        scores[:, column] = np.asarray(test_doc_vectors.sum(axis=1)).ravel()

    # Linear normalization to 1 (softmax would be an alternative). The tiny
    # epsilon avoids 0/0 for documents that contain no significant terms.
    probabilities = scores / (scores.sum(axis=1, keepdims=True) + 1.0e-16)
    # Column index equals the label; ties (including all-zero rows) resolve
    # to label 0 because argmax returns the first maximum.
    predicted_labels = np.argmax(probabilities, axis=1).tolist()
    return predicted_labels, probabilities
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment