Skip to content

Instantly share code, notes, and snippets.

@amake
Last active March 26, 2016 13:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amake/061d8cca54ad2f65ebf7 to your computer and use it in GitHub Desktop.
Save amake/061d8cca54ad2f65ebf7 to your computer and use it in GitHub Desktop.
A simple Python phrase extractor
'''
A simple phrase extractor
Usage: cat file.txt | python phraseextractor.py [max_ngram] [min_count]
Options:
max_ngram: Maximum phrase length in words (default: 5)
min_count: Minimum number of phrase occurrences (default: 3)
'''
import sys
import re
from collections import Counter
# Characters deleted from the text before it is split into words: basic
# punctuation, ASCII digits, en dash (U+2013), em dash (U+2014) and the
# right single quotation mark (U+2019).
# A plain u'' literal is used instead of ur'' because the ur prefix is a
# syntax error on Python 3; the pattern contains no other backslashes, so
# the compiled expression is the same.
remove_chars = re.compile(u'[.(),;:0-9\u2013\u2014\u2019]')

# Words that do not count as meaningful content when deciding whether a
# phrase is trivial (see is_trivial_phrase).
stop_words = ['the', 'to', 'is', 'of', 'as', 'it', 'in', 'an', 'a', 'and',
              'be', 'if', 'or', 'by', 'for', 'do', 'that', 'from', 'on',
              'not', 'are', 'at', 'have', 'en', 'eu', 'has', 'been']
def get_words(text):
    """Return the lowercased, whitespace-separated words of *text*.

    Every character matched by ``remove_chars`` is deleted (not replaced
    by a space) before the text is lowercased and split.
    """
    stripped = remove_chars.sub('', text)
    lowered = stripped.lower()
    return lowered.split()
def repeated_items(items, min_count):
    """Return the distinct items that occur at least *min_count* times.

    items: iterable of hashable values.
    min_count: minimum number of occurrences for an item to be returned.

    Returns a list with one entry per qualifying item.
    """
    counted = Counter(items)
    # ">=" so that min_count really is the minimum number of occurrences,
    # as the usage text documents; items() works on Python 2 and 3.
    return [item for item, n in counted.items() if n >= min_count]
def ngrams(words, n):
    """Return every run of *n* consecutive words as a tuple, in order.

    words: a sequence supporting len() and slicing.
    n: the n-gram length.

    Returns an empty list when *words* holds fewer than *n* items.
    """
    # range() replaces the Python-2-only xrange(). A non-positive bound
    # yields no indices, so no separate "too short" guard is needed.
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]
def repeated_phrases(text, max_ngram, min_count):
    """Return the repeated word n-grams of *text* as tuples of words.

    text: the input text; it is tokenized with get_words().
    max_ngram: largest n-gram length to consider (lengths start at 2).
    min_count: occurrence threshold handed to repeated_items().

    N-grams are collected for n = 2 up to and including max_ngram, in
    increasing order of n.
    """
    words = get_words(text)
    # range() replaces the Python-2-only xrange(). The original trailing
    # "if repeated" filter is dropped: n-gram tuples with n >= 2 are
    # never empty, so it could not exclude anything.
    return [repeated
            for n in range(2, max_ngram + 1)
            for repeated in repeated_items(ngrams(words, n), min_count)]
def remove_subphrases(phrases):
    """Drop every phrase that is contained, word for word, in another one.

    phrases: list of strings whose words are separated by single spaces.

    Returns the phrases that are not a sub-phrase of any different
    phrase, in their original order. Identical duplicates are all kept.
    """
    # Pad with spaces so containment is tested on whole words: a raw
    # substring test would treat "new york" as part of "new yorker times".
    padded = [' ' + phrase + ' ' for phrase in phrases]
    return [phrase
            for phrase, needle in zip(phrases, padded)
            if all(phrase == other or needle not in haystack
                   for other, haystack in zip(phrases, padded))]
def is_trivial_phrase(phrase):
    """Tell whether at most half of the words in *phrase* are meaningful.

    phrase: a sized iterable of words. A word is meaningful when it is
    not listed in ``stop_words``.
    """
    meaningful = sum(1 for word in phrase if word not in stop_words)
    # Integer form of "meaningful <= len(phrase) / 2"; it gives the same
    # answer under both floor and true division.
    return 2 * meaningful <= len(phrase)
def get_phrases(text, max_ngram=5, min_count=3):
    """Extract the repeated, non-trivial phrases of *text* as strings.

    text: the input text.
    max_ngram: maximum phrase length in words (default 5).
    min_count: occurrence threshold passed on to repeated_phrases()
        (default 3).

    Each surviving n-gram is joined with single spaces, phrases contained
    in other phrases are removed, and the remaining list is returned in
    reverse order of discovery.
    """
    candidates = repeated_phrases(text, max_ngram, min_count)
    joined = [' '.join(words)
              for words in candidates
              if not is_trivial_phrase(words)]
    survivors = remove_subphrases(joined)
    return survivors[::-1]
if __name__ == '__main__':
    # Read raw bytes and decode them explicitly as UTF-8. On Python 3 the
    # bytes come from sys.stdin.buffer; on Python 2 sys.stdin already
    # yields bytes and has no "buffer" attribute.
    stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    text = stdin.read().decode('utf-8')
    # Optional positional arguments map onto max_ngram and min_count.
    options = [int(arg) for arg in sys.argv[1:]]
    phrases = get_phrases(text, *options)
    # Write UTF-8 bytes so non-ASCII phrases do not fail when stdout is a
    # pipe; one phrase per line, with a trailing newline as print gives.
    stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    stdout.write(('\n'.join(phrases) + '\n').encode('utf-8'))
    stdout.flush()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment