chauhanakash23/create_data.py

## create_data.py
import pandas as pd
from tqdm import tqdm
from difflib import SequenceMatcher
import re
import pickle

def matcher(string, pattern):
    '''
    Return the start and end index of any pattern present in the text.
    '''
    match_list = []
    pattern = pattern.strip()
    seqMatch = SequenceMatcher(None, string, pattern, autojunk=False)
    match = seqMatch.find_longest_match(0, len(string), 0, len(pattern))
    if (match.size == len(pattern)):
        start = match.a
        end = match.a + match.size
        match_tup = (start, end)
        string = string.replace(pattern, "X" * len(pattern), 1)
        match_list.append(match_tup)

    return match_list, string

def mark_sentence(s, match_list):
    '''
    Marks all the entities in the sentence as per the BIO scheme.
    '''
    word_dict = {}
    for word in s.split():
        word_dict[word] = 'O'

    for start, end, e_type in match_list:
        temp_str = s[start:end]
        tmp_list = temp_str.split()
        if len(tmp_list) > 1:
            word_dict[tmp_list[0]] = 'B-' + e_type
            for w in tmp_list[1:]:
                word_dict[w] = 'I-' + e_type
        else:
            word_dict[temp_str] = 'B-' + e_type
    return word_dict

def clean(text):
    '''
    Just a helper fuction to add a space before the punctuations for better tokenization
    '''
    filters = ["!", "#", "$", "%", "&", "(", ")", "/", "*", ".", ":", ";", "<", "=", ">", "?", "@", "[",
               "\\", "]", "_", "`", "{", "}", "~", "'"]
    for i in text:
        if i in filters:
            text = text.replace(i, " " + i)

    return text

def create_data(df, filepath):
    '''
    The function responsible for the creation of data in the said format.
    '''
    with open(filepath , 'w') as f:
        for text, annotation in zip(df.text, df.annotation):
            text = clean(text)
            text_ = text
            match_list = []
            for i in annotation:
                a, text_ = matcher(text, i[0])
                match_list.append((a[0][0], a[0][1], i[1]))

            d = mark_sentence(text, match_list)

            for i in d.keys():
                f.writelines(i + ' ' + d[i] +'\n')
            f.writelines('\n')

def main():
    ## An example dataframe.
    data = pd.DataFrame([["Horses are too tall and they pretend to care about your feelings", [("Horses", "ANIMAL")]],
                  ["Who is Shaka Khan?", [("Shaka Khan", "PERSON")]],
                  ["I like London and Berlin.", [("London", "LOCATION"), ("Berlin", "LOCATION")]],
                  ["There is a banyan tree in the courtyard", [("banyan tree", "TREE")]]], columns=['text', 'annotation'])

    ## path to save the txt file.
    filepath = 'train/train.txt'
    ## creating the file.
    create_data(data, filepath)

if __name__ == '__main__':
    main()
	import pandas as pd
	from tqdm import tqdm
	from difflib import SequenceMatcher
	import re
	import pickle

	def matcher(string, pattern):
	'''
	Return the start and end index of any pattern present in the text.
	'''
	match_list = []
	pattern = pattern.strip()
	seqMatch = SequenceMatcher(None, string, pattern, autojunk=False)
	match = seqMatch.find_longest_match(0, len(string), 0, len(pattern))
	if (match.size == len(pattern)):
	start = match.a
	end = match.a + match.size
	match_tup = (start, end)
	string = string.replace(pattern, "X" * len(pattern), 1)
	match_list.append(match_tup)

	return match_list, string

	def mark_sentence(s, match_list):
	'''
	Marks all the entities in the sentence as per the BIO scheme.
	'''
	word_dict = {}
	for word in s.split():
	word_dict[word] = 'O'

	for start, end, e_type in match_list:
	temp_str = s[start:end]
	tmp_list = temp_str.split()
	if len(tmp_list) > 1:
	word_dict[tmp_list[0]] = 'B-' + e_type
	for w in tmp_list[1:]:
	word_dict[w] = 'I-' + e_type
	else:
	word_dict[temp_str] = 'B-' + e_type
	return word_dict

	def clean(text):
	'''
	Just a helper fuction to add a space before the punctuations for better tokenization
	'''
	filters = ["!", "#", "$", "%", "&", "(", ")", "/", "*", ".", ":", ";", "<", "=", ">", "?", "@", "[",
	"\\", "]", "_", "`", "{", "}", "~", "'"]
	for i in text:
	if i in filters:
	text = text.replace(i, " " + i)

	return text

	def create_data(df, filepath):
	'''
	The function responsible for the creation of data in the said format.
	'''
	with open(filepath , 'w') as f:
	for text, annotation in zip(df.text, df.annotation):
	text = clean(text)
	text_ = text
	match_list = []
	for i in annotation:
	a, text_ = matcher(text, i[0])
	match_list.append((a[0][0], a[0][1], i[1]))

	d = mark_sentence(text, match_list)

	for i in d.keys():
	f.writelines(i + ' ' + d[i] +'\n')
	f.writelines('\n')

	def main():
	## An example dataframe.
	data = pd.DataFrame([["Horses are too tall and they pretend to care about your feelings", [("Horses", "ANIMAL")]],
	["Who is Shaka Khan?", [("Shaka Khan", "PERSON")]],
	["I like London and Berlin.", [("London", "LOCATION"), ("Berlin", "LOCATION")]],
	["There is a banyan tree in the courtyard", [("banyan tree", "TREE")]]], columns=['text', 'annotation'])

	## path to save the txt file.
	filepath = 'train/train.txt'
	## creating the file.
	create_data(data, filepath)

	if __name__ == '__main__':
	main()