Skip to content

Instantly share code, notes, and snippets.

@chauhanakash23
Created May 3, 2020 07:32
Show Gist options
  • Save chauhanakash23/8ce4f9bb5dff0e15a7603bef2bd02183 to your computer and use it in GitHub Desktop.
Save chauhanakash23/8ce4f9bb5dff0e15a7603bef2bd02183 to your computer and use it in GitHub Desktop.
import pandas as pd
from tqdm import tqdm
from difflib import SequenceMatcher
import re
import pickle
def matcher(string, pattern):
    '''
    Locate the first exact occurrence of a pattern in the text.

    The pattern is stripped of surrounding whitespace before searching.

    Returns a ``(match_list, string)`` pair:
      * ``match_list`` holds one ``(start, end)`` tuple (end exclusive) when
        the stripped pattern occurs in the text, and is empty otherwise.
      * ``string`` is the text with that occurrence overwritten by "X"
        characters, so searching the returned text again for the same
        pattern skips this occurrence. It is returned unchanged when
        nothing matched.
    '''
    pattern = pattern.strip()
    # Only exact, whole-pattern hits count, so a plain substring search is
    # all that is needed to find the leftmost occurrence.
    start = string.find(pattern)
    if start == -1:
        return [], string
    end = start + len(pattern)
    # Mask exactly the span that was reported, keeping the length unchanged
    # so offsets into the masked text still line up with the original.
    masked = string[:start] + "X" * len(pattern) + string[end:]
    return [(start, end)], masked
def mark_sentence(s, match_list):
    '''
    Marks all the entities in the sentence as per the BIO scheme.

    s          : the sentence; its tokens are the whitespace-separated words.
    match_list : iterable of (start, end, e_type) character spans into ``s``
                 (end exclusive).

    Returns a dict mapping each token to 'O', 'B-<e_type>' or 'I-<e_type>',
    with keys in order of first appearance. For every span, the first token
    it overlaps is tagged 'B-' and each further overlapping token 'I-'.
    Spans are applied in the order given, so a later span overwrites the tag
    set by an earlier one. Empty spans are ignored.

    Note: the result is keyed by token text, so a token that occurs several
    times in the sentence has a single entry and therefore a single tag.
    '''
    # Record where every token sits in the sentence so spans can be aligned
    # to whole tokens by character offset.
    tokens = []
    cursor = 0
    for word in s.split():
        # Only whitespace lies between `cursor` and the next token, so the
        # first hit from `cursor` onwards is that token's true position.
        begin = s.index(word, cursor)
        cursor = begin + len(word)
        tokens.append((word, begin, cursor))

    word_dict = {word: 'O' for word, _, _ in tokens}
    for start, end, e_type in match_list:
        if start >= end:
            continue
        prefix = 'B-'
        for word, begin, stop in tokens:
            # Tag the sentence's own token whenever it overlaps the span,
            # even if the span starts or ends inside it (e.g. "London,").
            if begin < end and start < stop:
                word_dict[word] = prefix + e_type
                prefix = 'I-'
    return word_dict
def clean(text):
    '''
    Helper function that adds a space before punctuation characters for
    better tokenization.

    Exactly one space is inserted in front of every occurrence of a
    character listed in ``filters``; all other characters are copied
    through unchanged. Returns the new string.
    '''
    filters = {"!", "#", "$", "%", "&", "(", ")", "/", "*", ".", ":", ";", "<", "=", ">", "?", "@", "[",
               "\\", "]", "_", "`", "{", "}", "~", "'"}
    # Build the result in one pass over the characters. Replacing on the
    # whole string once per occurrence would pad a repeated character with
    # one extra space for each time it appears in the text.
    return "".join(" " + ch if ch in filters else ch for ch in text)
def create_data(df, filepath):
    '''
    Write the annotated sentences of ``df`` to ``filepath`` in BIO format.

    df       : object exposing ``text`` and ``annotation`` attributes that
               are iterated in parallel; each annotation is a sequence of
               items whose first element is the entity text and whose
               second element is the entity type.
    filepath : path of the text file to (over)write.

    For every sentence the file gets one "<token> <tag>" line per distinct
    token, followed by a blank line. An annotation whose text cannot be
    found in the sentence is skipped with a warning.
    '''
    import warnings

    # Explicit encoding so the output does not depend on the platform's
    # default locale encoding.
    with open(filepath, 'w', encoding='utf-8') as f:
        for text, annotation in zip(df.text, df.annotation):
            text = clean(text)
            # Copy of the sentence in which already-located entities are
            # masked out, so an entity listed twice resolves to two
            # different occurrences instead of the same span.
            remaining = text
            match_list = []
            for item in annotation:
                entity, e_type = item[0], item[1]
                # Clean the entity the same way as the sentence, otherwise
                # entities containing punctuation could never be found.
                spans, remaining = matcher(remaining, clean(entity))
                if not spans:
                    warnings.warn(
                        "annotation {!r} not found in text {!r}; skipped".format(entity, text)
                    )
                    continue
                start, end = spans[0]
                match_list.append((start, end, e_type))
            tags = mark_sentence(text, match_list)
            f.writelines(word + ' ' + tag + '\n' for word, tag in tags.items())
            f.write('\n')
def main():
    '''
    Build a small example dataframe and write it out with ``create_data``.
    '''
    import os

    ## An example dataframe: one row per sentence with its (entity, type) pairs.
    data = pd.DataFrame(
        [
            ["Horses are too tall and they pretend to care about your feelings", [("Horses", "ANIMAL")]],
            ["Who is Shaka Khan?", [("Shaka Khan", "PERSON")]],
            ["I like London and Berlin.", [("London", "LOCATION"), ("Berlin", "LOCATION")]],
            ["There is a banyan tree in the courtyard", [("banyan tree", "TREE")]],
        ],
        columns=['text', 'annotation'],
    )
    ## path to save the txt file.
    filepath = 'train/train.txt'
    # open() does not create missing directories, so make sure the target
    # folder exists before the file is written.
    out_dir = os.path.dirname(filepath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    ## creating the file.
    create_data(data, filepath)
# Run the example only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment