ABHISHEK SHARMA abhishek-shrm

## keyword-extraction-textrank-17.py
# Run the keyphrase extractor over every test-set abstract and keep the results in 'pred_keys'
df_test['pred_keys'] = df_test['abstract'].apply(extract_keyphrase)
# Show the first rows of the updated frame
df_test.head()

## keyword-extraction-textrank-16.py
# Function for extracting keyphrases
def extract_keyphrase(text,topn=0.33):
  """Return the TextRank keyphrases that textacy finds in ``text``.

  Args:
    text: Raw text to analyse (for example one abstract).
    topn: Forwarded unchanged to ``textacy.ke.textrank``. The default of
      0.33 is a float, which textacy documents as a share of the candidate
      terms rather than an absolute count, so the default means
      "top third", not "top 5".

  Returns:
    A list with the keyphrase strings only (scores are dropped), in the
    order textacy returned them.
  """
  # Build a spaCy Doc with the module-level pipeline `en`
  doc = textacy.make_spacy_doc(text, lang=en)
  # Rank candidate phrases with TextRank; normalize='lower' asks textacy
  # to lower-case the terms it reports
  ranked = textacy.ke.textrank(doc, normalize='lower', topn=topn)

  return [phrase for phrase, _score in ranked]

# Extracting keyphrases for abstracts in validation set

## keyword-extraction-textrank-15.py
# Rank the keyphrases of `doc` with TextRank (topn=0.33, terms normalised with 'lower')
keyphrases = textacy.ke.textrank(doc, normalize='lower', topn=0.33)
# Display the (phrase, score) pairs
keyphrases

## keyword-extraction-textrank-14.py
# Turn the validation abstract stored under label 3 into a spaCy Doc using the `en` pipeline
doc = textacy.make_spacy_doc(df_val['abstract'][3], lang=en)
# Display the resulting Doc
doc

## keyword-extraction-textrank-13.py
# Importing textaCy
import textacy
# Importing for keyword extraction(mandatory)
import textacy.ke

# Load the spaCy pipeline named "en_core_web_sm" through textacy and keep it in `en`;
# other cells pass it as `lang=en` to textacy.make_spacy_doc.
en = textacy.load_spacy_lang("en_core_web_sm")

## keyword-extraction-textrank-12.py
# Installing textacy
!pip install textacy

## keyword-extraction-textrank-11.py
# Collect the path of every abstract file in the test folder
abs_list = glob.glob('/content/test/*.abstr')

# Numeric id of each file: the last path component up to the first dot
# (e.g. '/content/test/12.abstr' -> 12)
index = sorted(int(path.split('.')[0].split('/')[-1]) for path in abs_list)

# One slot per id; every value starts as None
abstracts = dict.fromkeys(index)
for text_file in abs_list:
  # Context manager so each handle is closed as soon as it has been read
  with open(text_file, encoding='utf8') as handle:
    file = handle.read()
  # NOTE(review): nothing visible here stores `file` into `abstracts`; the rest
  # of the loop body appears to be missing from this excerpt - confirm against
  # the full notebook.

## keyword-extraction-textrank-10.py
# Build the validation dataframe from the two dicts; after the transpose each
# dict key becomes a row, with the abstract and the keyword list as columns
df_val = pd.DataFrame.from_dict([abstracts, keys]).T
df_val.columns = ['abstract', 'keys']
# Move the ids out of the index into a regular column
df_val = df_val.reset_index()
print('Shape=>', df_val.shape)
# Preview the first rows
df_val.head()

## keyword-extraction-textrank-9.py
# Function for pre-processing keywords
def preprocess_keys(text: str) -> str:
  """Normalise a keyword string so that variants compare equal.

  Lower-cases ``text``, collapses every run of whitespace and/or hyphens
  into a single space, and strips leading and trailing whitespace.

  Args:
    text: The keyword or keyphrase to normalise.

  Returns:
    The normalised string, e.g. ``"State-of-the-Art "`` -> ``"state of the art"``.
  """
  # Case normalization
  text = text.lower()
  # Raw string: `\s` must reach the regex engine untouched; in a normal
  # string literal it is an invalid escape that newer Python versions warn about
  text = re.sub(r'(\s|-)+', ' ', text)
  # Drop whatever space is left at either end
  return text.strip()

## keyword-extraction-textrank-8.py
# Start with one empty (None) slot per document id
keys = dict.fromkeys(index)

# Split each raw ';'-separated keyword string into a list
for i in keys_text:
  raw_keywords = keys_text[i]
  keys[i] = raw_keywords.split(';')

# Show the first four entries as a sample
dict(list(keys.items())[:4])
	# Run the keyphrase extractor over every test-set abstract and keep the results in 'pred_keys'
	df_test['pred_keys'] = df_test['abstract'].apply(extract_keyphrase)
	# Show the first rows of the updated frame
	df_test.head()
	# Function for extracting keyphrases
	def extract_keyphrase(text,topn=0.33):
		"""Return the TextRank keyphrases that textacy finds in ``text``.

		Args:
			text: Raw text to analyse (for example one abstract).
			topn: Forwarded unchanged to ``textacy.ke.textrank``. The default
				of 0.33 is a float, which textacy documents as a share of the
				candidate terms rather than an absolute count, so the default
				means "top third", not "top 5".

		Returns:
			A list with the keyphrase strings only (scores are dropped), in
			the order textacy returned them.
		"""
		# Build a spaCy Doc with the module-level pipeline `en`
		doc = textacy.make_spacy_doc(text, lang=en)
		# Rank candidate phrases with TextRank; normalize='lower' asks textacy
		# to lower-case the terms it reports
		ranked = textacy.ke.textrank(doc, normalize='lower', topn=topn)

		return [phrase for phrase, _score in ranked]

	# Extracting keyphrases for abstracts in validation set
	# Rank the keyphrases of `doc` with TextRank (topn=0.33, terms normalised with 'lower')
	keyphrases = textacy.ke.textrank(doc, normalize='lower', topn=0.33)
	# Display the (phrase, score) pairs
	keyphrases
	# Turn the validation abstract stored under label 3 into a spaCy Doc using the `en` pipeline
	doc = textacy.make_spacy_doc(df_val['abstract'][3], lang=en)
	# Display the resulting Doc
	doc
	# Importing textaCy
	import textacy
	# Importing for keyword extraction(mandatory)
	import textacy.ke

	# Load the spaCy pipeline named "en_core_web_sm" through textacy and keep it in `en`;
	# other cells pass it as `lang=en` to textacy.make_spacy_doc.
	en = textacy.load_spacy_lang("en_core_web_sm")
	# Collect the path of every abstract file in the test folder
	abs_list = glob.glob('/content/test/*.abstr')

	# Numeric id of each file: the last path component up to the first dot
	# (e.g. '/content/test/12.abstr' -> 12)
	index = sorted(int(path.split('.')[0].split('/')[-1]) for path in abs_list)

	# One slot per id; every value starts as None
	abstracts = dict.fromkeys(index)
	for text_file in abs_list:
		# The loop body must be indented under the `for`; the context manager
		# closes each handle as soon as it has been read
		with open(text_file, encoding='utf8') as handle:
			file = handle.read()
		# NOTE(review): nothing visible here stores `file` into `abstracts`; the
		# rest of the loop body appears to be missing from this excerpt - confirm
		# against the full notebook.
	# Build the validation dataframe from the two dicts; after the transpose each
	# dict key becomes a row, with the abstract and the keyword list as columns
	df_val = pd.DataFrame.from_dict([abstracts, keys]).T
	df_val.columns = ['abstract', 'keys']
	# Move the ids out of the index into a regular column
	df_val = df_val.reset_index()
	print('Shape=>', df_val.shape)
	# Preview the first rows
	df_val.head()
	# Function for pre-processing keywords
	def preprocess_keys(text: str) -> str:
		"""Normalise a keyword string so that variants compare equal.

		Lower-cases ``text``, collapses every run of whitespace and/or hyphens
		into a single space, and strips leading and trailing whitespace.

		Args:
			text: The keyword or keyphrase to normalise.

		Returns:
			The normalised string, e.g. ``"State-of-the-Art "`` -> ``"state of the art"``.
		"""
		# Case normalization
		text = text.lower()
		# The alternation bar must be unescaped: with an escaped bar the pattern
		# only matches whitespace followed by a literal "|-", so hyphens and
		# whitespace were never normalised. Raw string keeps `\s` intact.
		text = re.sub(r'(\s|-)+', ' ', text)
		# Drop whatever space is left at either end
		return text.strip()
	# Start with one empty (None) slot per document id
	keys = dict.fromkeys(index)

	# Split each raw ';'-separated keyword string into a list
	# (the assignment has to be indented to form the loop body)
	for i in keys_text:
		keys[i] = keys_text[i].split(';')

	# Show the first four entries as a sample
	dict(list(keys.items())[:4])