-
-
Save ttpro1995/cd8c60cfc72416a02713bb93dff9ae6f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
END_TAG = 0      # token is the literal closing tag "<END>"
START_TAG = 1    # token opens an entity, e.g. "<START:LOCATION>"
NOT_TAG = -1     # plain text token


def detect_tag(in_token):
    """
    Classify a single whitespace-delimited token.

    :param in_token: one token from the sentence
    :return: START_TAG if the token contains "<START:", END_TAG if it is
        exactly "<END>", NOT_TAG otherwise
    """
    if in_token == "<END>":
        return END_TAG
    return START_TAG if "<START:" in in_token else NOT_TAG
def remove_nest_tag(in_str):
    """
    Split a sentence with nested NER tags into one sentence per nesting level.

    Example input:
        với <START:ORGANIZATION> Sở Cảnh sát Phòng cháy , chữa cháy ( PCCC )
        và cứu nạn , cứu hộ <START:LOCATION> Hà Nội <END> <END>

    yields the untagged sentence (level 0) plus one sentence per nesting
    depth, each keeping only the tags opened at that depth.

    NOTE(review): assumes tags are balanced; an "<END>" without a matching
    "<START:...>" drives the depth counter negative — confirm input is
    well-formed upstream.

    :param in_str: whitespace-tokenized sentence, possibly with nested
        <START:...> / <END> tags
    :return: list of generated sentences, one per level (max depth + 1 total)
    """
    sentence_token = in_str.split()
    tag_dict = dict()   # token index -> (index, depth, token), tags only
    state = 0           # current nesting depth
    max_nest = 0        # deepest nesting level seen

    # First pass: record the nesting depth of every tag token.
    # A START increases depth before recording; an END records at the depth
    # of the entity it closes, then decreases depth.
    for index, token in enumerate(sentence_token):
        tag = detect_tag(token)
        if tag == START_TAG:
            state += 1
            if max_nest < state:
                max_nest = state
            tag_dict[index] = (index, state, token)
        elif tag == END_TAG:
            tag_dict[index] = (index, state, token)
            state -= 1

    # Second pass: emit one sentence per depth level.  Plain tokens appear
    # in every sentence; tag tokens appear only in the sentence for the
    # level at which they were recorded (level 0 is the untagged sentence).
    generate_sentences = []
    for level in range(max_nest + 1):
        generate_sentence_token = []
        for index, token in enumerate(sentence_token):
            if detect_tag(token) == NOT_TAG:
                generate_sentence_token.append(token)
            elif tag_dict[index][1] == level:
                generate_sentence_token.append(token)
        generate_sentences.append(' '.join(generate_sentence_token))
    return generate_sentences
def test():
    """Ad-hoc smoke test: un-nest a sample nested-tag sentence and print each level."""
    tstr = "với <START:ORGANIZATION> Sở Cảnh sát Phòng cháy , chữa cháy ( PCCC ) và cứu nạn , cứu hộ <START:LOCATION> Hà Nội <END> <END>"
    result = remove_nest_tag(tstr)
    print("-----")
    for line in result:
        print(line)
if __name__ == "__main__":
    # Un-nest a dataset for OpenNLP name finding: every input line containing
    # nested <START:...>/<END> tags expands into one output line per nesting
    # level, written to "<input>.out".
    if len(sys.argv) > 1:
        inpath = sys.argv[1]
        # Context managers close both files even on error; the original
        # leaked the input handle and closed the output unconditionally.
        with open(inpath, 'r') as infile, open(inpath + ".out", 'w') as outfile:
            for line in infile:
                sentences = remove_nest_tag(line)
                for sentence in sentences:
                    outfile.write(sentence + "\n")
    else:
        print("usage: python unnest_data.py input.txt")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment