-
-
Save amankharwal/a96ee73d62c90b82e2e19575b83e8cbb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def sort_coo(coo_matrix):
    """Return the (column, value) pairs of a COO matrix, best value first.

    Args:
        coo_matrix: Object exposing parallel ``col`` and ``data`` sequences
            (for example the result of ``.tocoo()`` on a sparse matrix).

    Returns:
        A list of ``(col, value)`` tuples sorted by value in descending
        order; equal values are ordered by the larger column index first.
    """
    pairs = zip(coo_matrix.col, coo_matrix.data)
    # Sort on (value, column) so that ties between equal scores are still
    # resolved deterministically.
    return sorted(pairs, key=lambda pair: (pair[1], pair[0]), reverse=True)
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Get the feature names and tf-idf scores of the top n items.

    Args:
        feature_names: Sequence indexed by the feature indices that appear
            in ``sorted_items``.
        sorted_items: Sliceable sequence of ``(index, score)`` pairs, already
            ordered best-first by the caller.
        topn: Maximum number of leading items to keep.

    Returns:
        A dict mapping each selected feature name to its score rounded to
        three decimals, in the same order as ``sorted_items``.
    """
    # Use only the top n items from the vector; the dict keeps their order.
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }
# get feature names
# NOTE(review): assumes `cv` is a scikit-learn vectorizer (it is defined
# outside this snippet) -- confirm. `get_feature_names()` was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer `get_feature_names_out()`
# and fall back to the old method only on releases that lack it.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
def get_keywords(idx, docs, topn=10):
    """Return the top tf-idf keywords for a single document.

    Relies on the module-level ``cv``, ``tfidf_transformer`` and
    ``feature_names`` objects defined elsewhere in this file.

    Args:
        idx: Index of the document inside ``docs``.
        docs: Indexable collection of documents; ``docs[idx]`` is passed to
            ``cv.transform``.
        topn: Maximum number of keywords to return (defaults to 10, the
            value that was previously hard-coded).

    Returns:
        The mapping produced by ``extract_topn_from_vector`` for the
        document's highest-scoring features.
    """
    # generate tf-idf for the given document
    tf_idf_vector = tfidf_transformer.transform(cv.transform([docs[idx]]))

    # sort the tf-idf vectors by descending order of scores
    sorted_items = sort_coo(tf_idf_vector.tocoo())

    # extract only the top n
    return extract_topn_from_vector(feature_names, sorted_items, topn)
def print_results(idx, keywords, df):
    """Print the title, abstract and keywords of one record to stdout.

    Args:
        idx: Key used to look the record up in ``df['title']`` and
            ``df['abstract']``.
        keywords: Mapping of keyword to score; each pair is printed on its
            own line as ``keyword score``.
        df: Object supporting ``df['title'][idx]`` and
            ``df['abstract'][idx]`` lookups.
    """
    print("\n=====Title=====")
    print(df['title'][idx])
    print("\n=====Abstract=====")
    print(df['abstract'][idx])
    print("\n===Keywords===")
    for keyword, score in keywords.items():
        print(keyword, score)
# Show the extracted keywords for one document; `docs` and `df` are defined
# elsewhere in this file.
idx = 941
keywords = get_keywords(idx, docs)
print_results(idx, keywords, df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment