@Garfounkel
Last active July 8, 2020 23:50
Nvidia NLP blog sparsity

import cudf
from cudf import Series


def document_search(text_df, query, vectorizer, tfidf_matrix, top_n=3):
    # Vectorize the query with the already-fitted vectorizer, score it against
    # every document, and return the top_n most similar rows.
    query_vec = vectorizer.transform(Series([query]))
    similarities = efficient_csr_cosine_similarity(query_vec, tfidf_matrix, matrix_normalized=True)
    similarities = similarities.todense().reshape(-1)
    best_idx = similarities.argsort()[-top_n:][::-1]

    pp = cudf.DataFrame({
        'text': text_df['text'].iloc[best_idx],
        'similarity': similarities[best_idx]
    })
    return pp
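
A toy illustration (not from the gist) of the argsort idiom used above: keeping the last top_n positions of the ascending argsort and reversing them yields the indices of the largest scores, best first.

import numpy as np

scores = np.array([0.1, 0.9, 0.3, 0.7])
top3 = scores.argsort()[-3:][::-1]  # indices of the three largest scores
print(top3)          # [1 3 2]
print(scores[top3])  # [0.9 0.7 0.3]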

from cuml.common.sparsefuncs import csr_row_normalize_l2


def efficient_csr_cosine_similarity(query, tfidf_matrix, matrix_normalized=False):
    # With L2-normalized rows, cosine similarity reduces to a sparse dot
    # product, so no dense pairwise matrix is ever materialized.
    query = csr_row_normalize_l2(query, inplace=False)
    if not matrix_normalized:
        tfidf_matrix = csr_row_normalize_l2(tfidf_matrix, inplace=False)
    return tfidf_matrix.dot(query.T)
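
A quick CPU-side sanity check (not part of the gist; SciPy and scikit-learn stand in for the GPU CSR matrices) showing why the shortcut works: after L2-normalizing the rows, the sparse dot product matches a full cosine-similarity computation.

import numpy as np
from scipy.sparse import random as sparse_random
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

# Toy CSR matrices standing in for the TF-IDF matrix and a single query vector.
mat = sparse_random(5, 8, density=0.4, format='csr', random_state=0)
qry = sparse_random(1, 8, density=0.5, format='csr', random_state=1)

dot = normalize(mat).dot(normalize(qry).T).toarray().ravel()  # normalized dot product
ref = cosine_similarity(mat, qry).ravel()                     # reference cosine similarity
assert np.allclose(dot, ref)
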
from cuml.feature_extraction.text import CountVectorizer
from cuml.feature_extraction.text import TfidfTransformer
cv = CountVectorizer()
count = cv.fit_transform(tweets) # 22.1s / 2m41s for sklearn
tf = TfidfTransformer()
tfidf_matrix = tf.fit_transform(count) # 3.8s / 13.6s for sklearn
tfidf_matrix.shape # (4827372, 5435706)
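
A rough back-of-the-envelope (my arithmetic, not from the gist) on why this matrix has to stay sparse: a dense float32 matrix of shape (4827372, 5435706) would need roughly 105 TB, while CSR storage only grows with the number of non-zero entries (assuming the returned matrix exposes .shape and .nnz like a SciPy/CuPy CSR matrix).

rows, cols = tfidf_matrix.shape
dense_bytes = rows * cols * 4                             # float32 values alone
print(f"dense equivalent: ~{dense_bytes / 1e12:.0f} TB")  # ~105 TB
print(f"CSR stores {tfidf_matrix.nnz} non-zero values")   # actual footprint scales with nnz
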
import os

import cudf


def join_df(path):
    # Read every CSV under `path`, keep only English tweets, and concatenate
    # everything into a single cudf DataFrame.
    data = cudf.DataFrame()
    for file in os.listdir(path):
        print(f"In path : {path}{file}")
        temp = cudf.read_csv(path + file)
        temp = temp[temp.lang == 'en']
        data = cudf.concat([data, temp])
    return data


df = join_df('tweets/')
df.shape  # (4827372, 22)

tweets = Series(df['text'])

document_search(df, 'computer science and NLP', cv, tfidf_matrix)
document_search(df, 'nvidia gpu', cv, tfidf_matrix)
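
One way to reuse the pipeline for a batch of queries (a hypothetical snippet, not in the gist); the TF-IDF matrix is presumably already row-normalized by TfidfTransformer's default norm='l2', which is why document_search passes matrix_normalized=True and only the query vector gets normalized per call.

# Hypothetical batch usage; variable names follow the snippets above.
queries = ['computer science and NLP', 'nvidia gpu', 'rapids cudf']
for q in queries:
    hits = document_search(df, q, cv, tfidf_matrix)
    print(q)
    print(hits)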