Skip to content

Instantly share code, notes, and snippets.

@Omrigan
Created June 22, 2016 07:50
Show Gist options
  • Save Omrigan/5793d96ea0d322d884c28033136fd639 to your computer and use it in GitHub Desktop.
Save Omrigan/5793d96ea0d322d884c28033136fd639 to your computer and use it in GitHub Desktop.
import re
def _find_closing_token(tokens, start, end_char):
    """Return the index of the token that closes a quote opened at *start*.

    The first character of ``tokens[start]`` is the opening mark itself, so
    it is skipped; otherwise a symmetric quote (``"`` or ``'``) would always
    "close" on its own opening character. Returns ``None`` when no later
    token contains *end_char*.
    """
    if end_char in tokens[start][1:]:
        return start
    for j in range(start + 1, len(tokens)):
        if end_char in tokens[j]:
            return j
    return None


def get_sense(text):
    """Collect tokens of *text* that look like names or short quoted phrases.

    The text is split on whitespace and scanned left to right. Two kinds of
    items are collected:

    * a token whose first character is uppercase, unless it directly follows
      a sentence boundary (or is the very first token);
    * a quoted phrase opened by ``"``, ``'`` or ``«`` (closed by the same
      character, or by ``»`` for ``«``) whose closing token lies fewer than
      five tokens after the opening one.

    Tokens inside a terminated quote are skipped, whether or not the phrase
    was short enough to be collected. A quote that is never closed is
    ignored and scanning continues with the token after the opening one.

    Before returning, the characters ``, ! . ? ( ) [ ]`` are stripped from
    every collected item.

    Args:
        text: The input string to analyse.

    Returns:
        A set of the collected strings after punctuation stripping.
    """
    closing_for = {'"': '"', '\'': '\'', '«': '»'}
    sentence_enders = {'.', '!', '?', '-', '—'}
    max_phrase_span = 5  # closing token must be fewer than this many tokens away

    tokens = text.split()
    good_tokens = set()
    prev_end = True  # start of text counts as a sentence boundary
    i = 0
    while i < len(tokens):
        t = tokens[i]  # str.split() never yields empty tokens, so t[0] is safe
        if t[0].isupper() and not prev_end:
            good_tokens.add(t)

        end_char = closing_for.get(t[0])
        if end_char is not None:
            close_at = _find_closing_token(tokens, i, end_char)
            if close_at is not None:
                if close_at - i < max_phrase_span:
                    good_tokens.add(" ".join(tokens[i:close_at + 1]))
                # Jump past the quoted span so its inner tokens are not
                # examined individually.
                i = close_at

        # Judge the sentence boundary on the last token actually consumed,
        # which differs from ``t`` when a quoted phrase was skipped over.
        prev_end = tokens[i][-1] in sentence_enders
        i += 1

    return {re.sub(r'[,!\.\?\)\(\[\]]', '', item) for item in good_tokens}
# Script entry: run the extractor over the sample text file and print the
# resulting set. The context manager closes the file handle deterministically
# instead of leaving it to the garbage collector.
with open('testtext.txt') as source_file:
    print(get_sense(source_file.read()))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment