Last active
March 26, 2016 13:11
-
-
Save amake/061d8cca54ad2f65ebf7 to your computer and use it in GitHub Desktop.
A simple Python phrase extractor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
A simple phrase extractor
Usage: cat file.txt | python phraseextractor.py [max_ngram] [min_count]
Options:
    max_ngram: Maximum phrase length in words (default: 5)
    min_count: Minimum number of phrase occurrences (default: 3)
'''
import sys | |
import re | |
from collections import Counter | |
# Characters stripped from the text before it is split into words: the ASCII
# punctuation . ( ) , ; : the digits 0-9, the en dash (U+2013), the em dash
# (U+2014) and the right single quotation mark (U+2019).
# A plain u'' literal is used because the ur'' prefix is a syntax error on
# Python 3. The pattern has no backslashes other than the \u escapes, so the
# compiled expression is the same as before.
remove_chars = re.compile(u'[.(),;:0-9\u2013\u2014\u2019]')

# Words that are not counted as meaningful content when is_trivial_phrase
# decides whether a phrase is worth reporting.
stop_words = ['the', 'to', 'is', 'of', 'as', 'it', 'in', 'an', 'a', 'and',
              'be', 'if', 'or', 'by', 'for', 'do', 'that', 'from', 'on',
              'not', 'are', 'at', 'have', 'en', 'eu', 'has', 'been']
def get_words(text):
    '''
    Split text into lower-case words.

    Every character matched by remove_chars is deleted first; the result is
    lower-cased and split on runs of whitespace.
    '''
    cleaned = remove_chars.sub('', text)
    lowered = cleaned.lower()
    return lowered.split()
def repeated_items(items, min_count):
    '''
    Return the distinct items that occur at least min_count times.

    items: iterable of hashable values to count
    min_count: minimum number of occurrences an item needs to be returned

    Returns a list with one entry per qualifying item, in the iteration
    order of the underlying Counter.
    '''
    counted = Counter(items)
    # ">=" rather than ">": an item seen exactly min_count times satisfies a
    # *minimum* of min_count occurrences. items() (instead of iteritems())
    # works on both Python 2 and Python 3.
    return [item for item, n in counted.items() if n >= min_count]
def ngrams(words, n):
    '''
    Return every run of n consecutive words as a tuple, in order.

    words: sequence of words (must support len() and slicing)
    n: length of each run

    Returns an empty list when n is not positive or when there are fewer
    than n words.
    '''
    # A non-positive n has no meaningful n-grams; without this guard n == 0
    # would produce a list of empty tuples.
    if n < 1 or len(words) < n:
        return []
    # range() (instead of xrange()) works on both Python 2 and Python 3.
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]
def repeated_phrases(text, max_ngram, min_count):
    '''
    Collect the repeated word n-grams of text for n = 2 .. max_ngram.

    text: the text to analyse; it is tokenised with get_words
    max_ngram: longest n-gram length (in words) to consider
    min_count: occurrence threshold, passed through to repeated_items

    Returns a list of word tuples, grouped by increasing n-gram length.
    The list is empty when max_ngram is below 2.
    '''
    words = get_words(text)
    found = []
    # range() (instead of xrange()) works on both Python 2 and Python 3.
    # n-grams with n >= 2 are never empty, so no extra emptiness filter is
    # needed on the results.
    for n in range(2, max_ngram + 1):
        found.extend(repeated_items(ngrams(words, n), min_count))
    return found
def remove_subphrases(phrases):
    '''
    Drop every phrase that appears as a run of whole words inside another.

    phrases: list of phrases, each a string of words separated by single
             spaces

    Returns a new list containing, in the original order, the phrases that
    are not contained in any different phrase of the list. Identical
    duplicates do not eliminate each other.
    '''
    # Pad both sides with a space so containment is checked on word
    # boundaries: a bare substring test would treat "art of" as part of
    # "part of war" and wrongly discard it.
    padded = [' ' + phrase + ' ' for phrase in phrases]
    return [phrase
            for phrase, needle in zip(phrases, padded)
            if not any(needle in haystack
                       for other, haystack in zip(phrases, padded)
                       if other != phrase)]
def is_trivial_phrase(phrase):
    '''
    Tell whether a phrase consists mostly of stop words.

    phrase: sequence of words

    Returns True when the words that are not in stop_words make up at most
    half of the phrase (rounded down), False otherwise.
    '''
    meaningful = sum(1 for word in phrase if word not in stop_words)
    # For integer counts, "meaningful <= len // 2" is the same test as
    # "2 * meaningful <= len".
    return 2 * meaningful <= len(phrase)
def get_phrases(text, max_ngram=5, min_count=3):
    '''
    Extract the repeated, non-trivial phrases of text.

    text: the text to analyse
    max_ngram: maximum phrase length in words (default: 5)
    min_count: occurrence threshold handed to repeated_phrases (default: 3)

    Returns a list of phrases as space-joined strings. Trivial phrases (see
    is_trivial_phrase) and phrases rejected by remove_subphrases are left
    out, and the remaining list is returned in reverse order.
    '''
    candidates = repeated_phrases(text, max_ngram, min_count)
    joined = []
    for words in candidates:
        if is_trivial_phrase(words):
            continue
        joined.append(' '.join(words))
    survivors = remove_subphrases(joined)
    return survivors[::-1]
if __name__ == '__main__':
    # Command-line entry point: read UTF-8 text from stdin, take the optional
    # integer arguments max_ngram and min_count from argv, and write one
    # phrase per line to stdout.

    # Work on raw bytes so the UTF-8 decoding is explicit. Python 3 exposes
    # the byte stream as sys.stdin.buffer; on Python 2 sys.stdin itself
    # already yields bytes.
    stdin_bytes = getattr(sys.stdin, 'buffer', sys.stdin)
    text = stdin_bytes.read().decode('utf-8')
    options = [int(arg) for arg in sys.argv[1:]]
    phrases = get_phrases(text, *options)
    # Encode explicitly: printing unicode to a pipe on Python 2 falls back to
    # ASCII and fails on non-ASCII phrases. The trailing newline matches what
    # the print statement used to emit.
    output = u'\n'.join(phrases) + u'\n'
    stdout_bytes = getattr(sys.stdout, 'buffer', sys.stdout)
    stdout_bytes.write(output.encode('utf-8'))
    sys.stdout.flush()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment