Omrigan/Sense

## Sense
import re
def get_sense(text):
    """Collect mid-sentence capitalised words and short quoted phrases.

    The text is split on whitespace and scanned left to right:

    * A token whose first character is upper-case is kept, unless it is the
      first token of the text or directly follows a token ending in one of
      ``. ! ? - —`` (a capital there only marks the start of a sentence).
    * A token starting with ``"``, ``'`` or ``«`` opens a quotation that
      runs up to the nearest token containing the matching closing
      character (``"``, ``'`` or ``»``).  A quotation of at most five tokens
      is kept as one space-joined phrase; a longer one is skipped as a
      whole.  An opening quote that is never closed is treated as an
      ordinary token, so it cannot swallow the rest of the text.

    Args:
        text: The raw text to analyse.

    Returns:
        A set of strings with the characters ``, ! . ? ( ) [ ]`` removed.
        Quote characters are left in place.
    """
    closing_for = {'"': '"', '\'': '\'', '«': '»'}
    sentence_enders = {'.', '!', '?', '-', '—'}

    tokens = text.split()
    good_tokens = set()
    prev_end = True  # the very first token always starts a sentence
    i = 0
    while i < len(tokens):
        t = tokens[i]
        if t[0].isupper() and not prev_end:
            good_tokens.add(t)

        closer = closing_for.get(t[0])
        if closer is not None:
            end = None
            # Skip the opening character itself: for " and ' the opener and
            # the closer are the same character, so looking at the whole
            # token would end every quotation on its first word.
            if closer in t[1:]:
                end = i
            else:
                for j in range(i + 1, len(tokens)):
                    if closer in tokens[j]:
                        end = j
                        break
            if end is not None:
                if end - i < 5:
                    good_tokens.add(" ".join(tokens[i:end + 1]))
                # Resume after the quotation, whether it was kept or not.
                i = end

        # Judge the sentence boundary on the last token actually consumed,
        # which is the closing token when a quotation was just skipped.
        prev_end = tokens[i][-1] in sentence_enders
        i += 1

    # Strip punctuation once, after the scan, instead of on every iteration.
    return {re.sub('[,!\\.\\?\\)\\(\\[\\]]', '', tok) for tok in good_tokens}


# Demo driver: analyse the sample text and print the extracted tokens.
# The `with` block closes the file handle, which the bare open() call leaked.
with open('testtext.txt') as source:
    print(get_sense(source.read()))
	import re
	def get_sense(text):
	    """Collect mid-sentence capitalised words and short quoted phrases.

	    The text is split on whitespace and scanned left to right:

	    * A token whose first character is upper-case is kept, unless it is
	      the first token of the text or directly follows a token ending in
	      one of ``. ! ? - —`` (a capital there only marks a sentence start).
	    * A token starting with ``"``, ``'`` or ``«`` opens a quotation that
	      runs up to the nearest token containing the matching closing
	      character (``"``, ``'`` or ``»``).  A quotation of at most five
	      tokens is kept as one space-joined phrase; a longer one is skipped
	      as a whole.  An opening quote that is never closed is treated as an
	      ordinary token, so it cannot swallow the rest of the text.

	    Args:
	        text: The raw text to analyse.

	    Returns:
	        A set of strings with the characters ``, ! . ? ( ) [ ]`` removed.
	        Quote characters are left in place.
	    """
	    closing_for = {'"': '"', '\'': '\'', '«': '»'}
	    sentence_enders = {'.', '!', '?', '-', '—'}

	    tokens = text.split()
	    good_tokens = set()
	    prev_end = True  # the very first token always starts a sentence
	    i = 0
	    while i < len(tokens):
	        t = tokens[i]
	        if t[0].isupper() and not prev_end:
	            good_tokens.add(t)

	        closer = closing_for.get(t[0])
	        if closer is not None:
	            end = None
	            # Skip the opening character itself: for " and ' the opener
	            # and the closer are the same character, so looking at the
	            # whole token would end every quotation on its first word.
	            if closer in t[1:]:
	                end = i
	            else:
	                for j in range(i + 1, len(tokens)):
	                    if closer in tokens[j]:
	                        end = j
	                        break
	            if end is not None:
	                if end - i < 5:
	                    good_tokens.add(" ".join(tokens[i:end + 1]))
	                # Resume after the quotation, whether it was kept or not.
	                i = end

	        # Judge the sentence boundary on the last token actually consumed,
	        # which is the closing token when a quotation was just skipped.
	        prev_end = tokens[i][-1] in sentence_enders
	        i += 1

	    # Strip punctuation once, after the scan, not on every iteration.
	    return {re.sub('[,!\\.\\?\\)\\(\\[\\]]', '', tok) for tok in good_tokens}


	# Demo driver: analyse the sample text and print the extracted tokens.
	# The `with` block closes the file handle, which the bare open() call leaked.
	with open('testtext.txt') as source:
	    print(get_sense(source.read()))