Skip to content

Instantly share code, notes, and snippets.

View abhishek-shrm's full-sized avatar

ABHISHEK SHARMA abhishek-shrm

  • ZS Associates
  • New Delhi, India
View GitHub Profile
# Assemble the validation dataframe: one row per document ID, holding its
# abstract text and its keyword list
df_val = (
    pd.DataFrame.from_dict([abstracts, keys])
    .T  # the two dicts arrive as rows; transpose so each document is a row
    .set_axis(['abstract', 'keys'], axis=1)
    .reset_index()  # expose the document ID as an ordinary 'index' column
)
print('Shape=>', df_val.shape)
# Preview the first rows
df_val.head()
# Function for pre-processing keywords
def preprocess_keys(text: str) -> str:
    """Return a normalized form of a keyword string.

    The text is lower-cased, every run of whitespace and/or hyphens is
    collapsed into a single space, and leading/trailing spaces are removed.

    Args:
        text: Raw keyword text.

    Returns:
        The normalized keyword text.
    """
    # Case normalization
    text = text.lower()
    # Collapse runs of whitespace/hyphens into one space; the raw string keeps
    # `\s` from being read as an (invalid) string escape sequence
    text = re.sub(r'[\s-]+', ' ', text)
    # Drop any leading and trailing spaces left over from the substitution
    return text.strip()
# Split each file's raw keyword string into a list of individual keywords,
# stored under the same document ID
keys = dict.fromkeys(index, None)
for doc_id, raw_keywords in keys_text.items():
    # Keywords inside a file are separated by semicolons
    keys[doc_id] = raw_keywords.split(';')
# Printing sample keywords
dict(list(keys.items())[:4])
# Read the raw text of every keyword file into a dict keyed by document ID
keys_text = dict.fromkeys(index, None)
for key_file in keys_list:
    # The context manager closes the file handle as soon as the text is read
    with open(key_file, encoding='utf-8') as handle:
        content = handle.read()
    # The ID is the part of the path before its first dot and after the last
    # slash, converted to an integer
    keys_text[int(key_file.split('.')[0].split('/')[-1])] = content
# Printing content of five files
dict(list(keys_text.items())[:5])
# Collect the paths of all uncontrolled-keyword files (*.uncontr) in the
# validation folder
keys_list = glob.glob('/content/validation/*.uncontr')
# Preview the first five paths
keys_list[:5]
# Read the text of every abstract file into a dict keyed by document ID
abstracts = dict.fromkeys(index, None)
for text_file in abs_list:
    # The context manager closes the file handle as soon as the text is read
    with open(text_file, encoding='utf8') as handle:
        content = handle.read()
    # The ID is the part of the path before its first dot and after the last
    # slash, converted to an integer
    abstracts[int(text_file.split('.')[0].split('/')[-1])] = content
# Printing five abstracts
dict(list(abstracts.items())[:5])
# Build the sorted list of integer document IDs from the abstract file paths;
# each ID is the text before the path's first dot and after its last slash
index = sorted(
    int(path.partition('.')[0].rpartition('/')[2]) for path in abs_list
)
# Preview the first five IDs
index[:5]
# Collect the paths of all abstract files (*.abstr) in the validation folder
abs_list = glob.glob('/content/validation/*.abstr')
# Preview the first five paths
abs_list[:5]
# For working with Regular Expressions
import re
# For getting pathnames
import glob
# For numerical computing
import numpy as np
# For data handling
import pandas as pd