Skip to content

Instantly share code, notes, and snippets.

View abhishek-shrm's full-sized avatar

ABHISHEK SHARMA abhishek-shrm

  • ZS Associates
  • New Delhi, India
View GitHub Profile
# Run the keyphrase extractor over every abstract of the test set and keep
# the resulting lists in a new 'pred_keys' column
df_test['pred_keys'] = df_test['abstract'].apply(extract_keyphrase)
# Preview the first rows (notebook display)
df_test.head()
# Function for extracting keyphrases
def extract_keyphrase(text, topn=0.33):
    """Return the TextRank keyphrases of ``text`` as a list of strings.

    Args:
        text: Raw text (e.g. one abstract) to extract keyphrases from.
        topn: Forwarded unchanged to ``textacy.ke.textrank``. Per the textacy
            documentation an int is an absolute number of phrases, while a
            float in (0.0, 1.0] is a fraction of the candidate phrases.
            The default of 0.33 keeps the value this function always used.

    Returns:
        The selected keyphrases, in the order ``textrank`` yields them,
        with their scores dropped.
    """
    # Creating Spacy's Doc object (relies on the module-level `en` pipeline)
    doc = textacy.make_spacy_doc(text, lang=en)
    # Getting the top-ranked keyphrases; with the default topn=0.33 this is
    # a share of the candidates, not a fixed count of 5
    keyphrases = textacy.ke.textrank(doc, normalize='lower', topn=topn)
    # textrank yields (phrase, score) pairs; only the phrase text is kept
    return [phrase for phrase, _score in keyphrases]
# Extracting keyphrases for abstracts in validation set
# Rank the candidate phrases of `doc` with TextRank, lower-casing them;
# topn=0.33 is passed as a float, i.e. a share of the candidates
keyphrases = textacy.ke.textrank(doc, topn=0.33, normalize='lower')
# Show the (phrase, score) result (notebook display)
keyphrases
# Turn the abstract found at df_val['abstract'][3] into a spaCy Doc,
# processed with the `en` pipeline
doc = textacy.make_spacy_doc(df_val['abstract'][3], lang=en)
# Show the Doc (notebook display)
doc
# Importing textaCy
import textacy
# Importing for keyword extraction(mandatory)
import textacy.ke
# Loading the "en_core_web_sm" spaCy model through textacy; the resulting
# `en` object is what the make_spacy_doc(..., lang=en) calls in this file use
en = textacy.load_spacy_lang("en_core_web_sm")
# Installing textacy
# NOTE: the leading `!` is IPython/Jupyter shell-escape syntax, so this line
# only works inside a notebook cell, not in a plain Python script
!pip install textacy
# Getting list of all the files containing abstracts
abs_list = glob.glob('/content/test/*.abstr')
# Getting index value: the integer file stem of each path
# (e.g. '/content/test/12.abstr' -> 12), in ascending order
index = sorted(int(path.split('.')[0].split('/')[-1]) for path in abs_list)
# Getting abstracts: start with one empty (None) slot per index value
abstracts = dict.fromkeys(index, None)
for text_file in abs_list:
    # Context manager so each file handle is closed as soon as it is read,
    # instead of being left open until garbage collection
    with open(text_file, encoding='utf8') as handle:
        file = handle.read()
    # NOTE(review): nothing visible here stores `file` into `abstracts`; the
    # rest of the loop body appears to be cut off - confirm against the
    # full notebook
# Assemble the validation-set frame from the `abstracts` and `keys` dicts:
# after transposing there is one row per document index and two columns
df_val = pd.DataFrame.from_dict([abstracts, keys]).transpose()
df_val.columns = ['abstract', 'keys']
# Move the document index out of the row labels into a regular column
df_val = df_val.reset_index()
print('Shape=>', df_val.shape)
# Preview the first rows (notebook display)
df_val.head()
# Function for pre-processing keywords
def preprocess_keys(text):
    """Normalize a keyword string for comparison.

    The text is lower-cased, every run of whitespace and/or hyphens is
    collapsed into a single space, and leading/trailing spaces are removed.

    Args:
        text: The keyword (or keyphrase) string to normalize.

    Returns:
        The normalized string.
    """
    # Case Normalization
    text = text.lower()
    # Removing hyphen and whitespaces: each run becomes one space.
    # Raw string so that `\s` reaches the regex engine as written rather
    # than being an invalid string escape (a SyntaxWarning on Python 3.12+)
    text = re.sub(r'[\s-]+', ' ', text)
    # Removing any leading and trailing spaces
    return text.strip()
# Build the keyword table: one slot per document index, initially None
keys = {doc_id: None for doc_id in index}
# Split each raw keyword string on ';' into a list of keywords
for i in keys_text:
    keys[i] = keys_text[i].split(';')
# Show the first four entries as a sample (notebook display)
dict(list(keys.items())[:4])