This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Creating a dataframe of abstracts and keywords in the validation set
# Both inputs are dicts keyed by document id. With a dict of dicts, pandas uses
# the outer keys as column names and the inner keys as the row index, so no
# transpose or column renaming is needed.
df_val = pd.DataFrame({'abstract': abstracts, 'keys': keys})
# Move the document id out of the row index into an ordinary 'index' column.
df_val = df_val.reset_index()
print('Shape=>', df_val.shape)
df_val.head()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Function for pre-processing keywords
def preprocess_keys(text: str) -> str:
    """Return *text* normalized for keyword comparison.

    The text is lower-cased, every run of whitespace and/or hyphens is
    collapsed into a single space, and leading/trailing spaces are removed.

    >>> preprocess_keys('  Machine-Learning ')
    'machine learning'
    """
    # Case Normalization
    text = text.lower()
    # Removing hyphen and whitespaces (raw string so that \s reaches the regex
    # engine instead of being treated as a string escape sequence)
    text = re.sub(r'[\s-]+', ' ', text)
    # Removing any leading and trailing spaces
    return text.strip()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Getting list of keys
# Every document id starts out mapped to None.
keys = dict.fromkeys(index)
for doc_id, raw_keys in keys_text.items():
    # An id whose text is still None has nothing to split; leave it as None
    # rather than failing on None.split.
    if raw_keys is not None:
        # The keys text is a single ';'-separated string.
        keys[doc_id] = raw_keys.split(';')
# Printing sample keywords
dict(list(keys.items())[:4])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extracting text from all the keys files
# Every document id starts out mapped to None, so an id without a keys file
# keeps the value None.
keys_text = dict.fromkeys(index)
for key_path in keys_list:
    # The document id is the file name (not the whole path) up to its first dot.
    doc_id = int(os.path.basename(key_path).split('.')[0])
    # The context manager closes the handle as soon as the file has been read.
    with open(key_path, encoding='utf-8') as key_file:
        keys_text[doc_id] = key_file.read()
# Printing content of five files
dict(list(keys_text.items())[:5])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Getting list of files containing uncontrolled keys
keys_list = glob.glob('/content/validation/*.uncontr')
# Showing the first five paths
keys_list[:5]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Getting abstracts
# Every document id starts out mapped to None and is filled in as its file is read.
abstracts = dict.fromkeys(index)
for abstract_path in abs_list:
    # The document id is the file name (not the whole path) up to its first dot.
    doc_id = int(os.path.basename(abstract_path).split('.')[0])
    # The context manager closes the handle as soon as the file has been read.
    with open(abstract_path, encoding='utf8') as abstract_file:
        abstracts[doc_id] = abstract_file.read()
# Printing five abstracts
dict(list(abstracts.items())[:5])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Getting index value
# Each id is the integer formed by the file name up to its first dot; taking the
# base name first keeps a dot in a directory name from truncating the path.
index = sorted(int(os.path.basename(path).split('.')[0]) for path in abs_list)
index[:5]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Number of abstract files found
len(abs_list)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Getting list of all the files containing abstracts
abs_list = glob.glob('/content/validation/*.abstr')
# Showing the first five paths
abs_list[:5]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# For getting pathnames
import glob
# For splitting file names out of paths
import os
# For working with Regular Expressions
import re

# For numerical computing
import numpy as np
# For data handling
import pandas as pd