Created
May 3, 2020 07:32
-
-
Save chauhanakash23/8ce4f9bb5dff0e15a7603bef2bd02183 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import pickle
import re
import warnings
from difflib import SequenceMatcher

import pandas as pd
from tqdm import tqdm
def matcher(string, pattern):
    '''
    Locate the first exact occurrence of a pattern in the text.

    Leading and trailing whitespace is stripped from ``pattern`` before the
    search.

    Returns a tuple ``(match_list, string)``:
      * ``match_list`` is ``[(start, end)]`` for the first occurrence
        (``end`` is exclusive), or ``[]`` when the pattern does not occur.
      * ``string`` is the input with that occurrence overwritten by ``X``
        characters of the same length (so character offsets stay valid),
        or the unchanged input when nothing matched.
    '''
    match_list = []
    pattern = pattern.strip()
    # Only a complete, exact occurrence counts as a match, so a plain
    # substring search is sufficient; it reports the earliest occurrence.
    start = string.find(pattern)
    if start != -1:
        end = start + len(pattern)
        match_list.append((start, end))
        # Mask exactly the occurrence that was reported.
        string = string[:start] + "X" * len(pattern) + string[end:]
    return match_list, string
def mark_sentence(s, match_list):
    '''
    Marks all the entities in the sentence as per the BIO scheme.

    ``s`` is split on whitespace into tokens. ``match_list`` holds
    ``(start, end, e_type)`` tuples of character offsets into ``s``
    (``end`` exclusive). Every token overlapping a span is labelled: the
    first one ``'B-' + e_type``, the following ones ``'I-' + e_type``.
    All other tokens are labelled ``'O'``. Later spans override earlier ones.

    Returns a dict mapping token -> label. Because the result is keyed by
    token text, a token occurring several times in the sentence shares a
    single label.
    '''
    # Keep the character offsets of every token so spans are resolved by
    # position; this avoids inventing keys that are not sentence tokens when
    # a span covers only part of a token or is empty.
    tokens = [(m.start(), m.end(), m.group()) for m in re.finditer(r'\S+', s)]
    word_dict = {token: 'O' for _, _, token in tokens}
    for start, end, e_type in match_list:
        if start >= end:
            # A zero-width span names no token at all.
            continue
        prefix = 'B-'
        for tok_start, tok_end, token in tokens:
            if tok_start < end and tok_end > start:
                word_dict[token] = prefix + e_type
                prefix = 'I-'
    return word_dict
def clean(text):
    '''
    Just a helper function to add a space before the punctuations for better tokenization.

    Exactly one space is inserted before every occurrence of each character
    listed in ``filters``; all other characters are left untouched.
    '''
    filters = ["!", "#", "$", "%", "&", "(", ")", "/", "*", ".", ":", ";", "<", "=", ">", "?", "@", "[",
               "\\", "]", "_", "`", "{", "}", "~", "'"]
    # Translate in a single pass: calling str.replace once per occurrence
    # would prepend one extra space for every repeat of the same character.
    table = str.maketrans({ch: " " + ch for ch in filters})
    return text.translate(table)
def create_data(df, filepath):
    '''
    The function responsible for the creation of data in the said format.

    ``df`` must expose ``text`` and ``annotation`` columns; each annotation
    is an iterable of items whose first element is the entity text and whose
    second element is the entity type. For every row the cleaned sentence is
    written to ``filepath`` one ``token label`` pair per line, in sentence
    order, followed by a blank line.

    The parent directory of ``filepath`` is created when missing, and the
    file is written as UTF-8. Annotations that cannot be found in the
    cleaned sentence are skipped with a warning.
    '''
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as f:
        for text, annotation in zip(df.text, df.annotation):
            text = clean(text)
            # Working copy in which already-matched entities are masked, so a
            # repeated entity resolves to its next occurrence.
            masked = text
            match_list = []
            for item in annotation:
                # The sentence was cleaned, so the entity text must be cleaned
                # the same way or punctuation inside it could never match.
                pattern = clean(item[0])
                spans, masked_after = matcher(masked, pattern)
                if not spans or spans[0][0] == spans[0][1]:
                    warnings.warn(
                        'Annotation %r not found in text %r; skipping.' % (item[0], text))
                    continue
                masked = masked_after
                match_list.append((spans[0][0], spans[0][1], item[1]))
            labels = mark_sentence(text, match_list)
            # Walk the sentence itself rather than the label dict so repeated
            # tokens are kept and the original token order is preserved.
            for token in text.split():
                f.write(token + ' ' + labels.get(token, 'O') + '\n')
            f.write('\n')
def main():
    '''
    Build a small example dataframe and pass it to create_data.
    '''
    # Each row pairs a sentence with its (entity text, entity type) annotations.
    rows = [
        ["Horses are too tall and they pretend to care about your feelings", [("Horses", "ANIMAL")]],
        ["Who is Shaka Khan?", [("Shaka Khan", "PERSON")]],
        ["I like London and Berlin.", [("London", "LOCATION"), ("Berlin", "LOCATION")]],
        ["There is a banyan tree in the courtyard", [("banyan tree", "TREE")]],
    ]
    example_df = pd.DataFrame(rows, columns=['text', 'annotation'])

    # Destination of the generated txt file.
    output_path = 'train/train.txt'

    create_data(example_df, output_path)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment