# amake/phraseextractor.py

## phraseextractor.py
'''
A simple phrase extractor

Usage: cat file.txt | python phraseextractor.py [max_ngram] [min_count]

Options:
  max_ngram: Maximum phrase length in words (default: 5)
  min_count: Minimum number of phrase occurrences (default: 3)
'''

import sys
import re
from collections import Counter

# Characters deleted from the text before it is split into words: basic
# punctuation, ASCII digits, the en and em dashes (U+2013, U+2014) and the
# right single quotation mark (U+2019).
# Written as a plain u'' literal instead of ur'': the pattern contains no
# backslashes other than the \u escapes, so the compiled regex is the same on
# Python 2, while ur'' is a syntax error on Python 3.
remove_chars = re.compile(u'[.(),;:0-9\u2013\u2014\u2019]')

# Words that do not count towards a phrase being meaningful
# (see is_trivial_phrase).
stop_words = ['the', 'to', 'is', 'of', 'as', 'it', 'in', 'an', 'a', 'and',
              'be', 'if', 'or', 'by', 'for', 'do', 'that', 'from', 'on',
              'not', 'are', 'at', 'have', 'en', 'eu', 'has', 'been']

def get_words(text):
    '''
    Split text into a list of lowercase words.

    Every character matched by remove_chars is deleted first; the remaining
    text is lowercased and split on whitespace.
    '''
    stripped = remove_chars.sub('', text)
    lowered = stripped.lower()
    return lowered.split()

def repeated_items(items, min_count):
    '''
    Return the distinct items that occur at least min_count times.

    items: iterable of hashable items
    min_count: minimum number of occurrences an item needs to be returned

    Returns a list of the qualifying items, each listed once, in the
    iteration order of the underlying Counter.
    '''
    counts = Counter(items)
    # >= rather than >: min_count is documented as the *minimum* number of
    # occurrences, so an item seen exactly min_count times qualifies.
    # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
    return [item for item, n in counts.items() if n >= min_count]

def ngrams(words, n):
    '''
    Return every run of n consecutive words as a tuple, in order.

    words: sequence of words (must support len() and slicing)
    n: number of words in each run

    Returns an empty list when n is not positive or when there are fewer
    than n words.
    '''
    if n < 1:
        # A non-positive length would produce empty or misaligned slices
        # rather than real n-grams.
        return []
    # range() exists on both Python 2 and 3 (xrange is Python 2 only) and is
    # already empty when len(words) < n, so no separate length check is needed.
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

def repeated_phrases(text, max_ngram, min_count):
    '''
    Collect the repeated word sequences of 2 to max_ngram words in text.

    text: the text to analyse; it is split into words with get_words
    max_ngram: longest sequence length, in words, to consider
    min_count: occurrence threshold, passed through to repeated_items

    Returns a list of word tuples, shorter lengths first.
    '''
    words = get_words(text)
    found = []
    # range() exists on both Python 2 and 3; xrange is Python 2 only.
    for n in range(2, max_ngram + 1):
        for phrase in repeated_items(ngrams(words, n), min_count):
            # Skip empty tuples.
            if phrase:
                found.append(phrase)
    return found

def remove_subphrases(phrases):
    '''
    Drop every phrase that appears, as a run of whole words, inside a
    different phrase of the list.

    phrases: list of phrase strings whose words are separated by single
             spaces (the form get_phrases builds with ' '.join)

    Returns the surviving phrases in their original order. Identical
    duplicates do not eliminate each other.
    '''
    # Pad both sides with a space so containment is tested on word
    # boundaries. A bare substring test would wrongly treat 'art gallery' as
    # part of 'smart gallery owner'.
    padded = [(phrase, ' ' + phrase + ' ') for phrase in phrases]
    return [phrase for phrase, needle in padded
            if all(phrase == other or needle not in haystack
                   for other, haystack in padded)]

def is_trivial_phrase(phrase):
    '''
    Tell whether a phrase consists mostly of stop words.

    phrase: sequence of words

    Returns True when no more than half of the words are absent from
    stop_words.
    '''
    meaningful = sum(1 for word in phrase if word not in stop_words)
    # Doubling the count instead of halving the length keeps the comparison
    # in integers.
    return 2 * meaningful <= len(phrase)

def get_phrases(text, max_ngram=5, min_count=3):
    '''
    Extract the repeated, non-trivial phrases from text.

    text: the text to analyse
    max_ngram: maximum phrase length in words (default: 5)
    min_count: occurrence threshold handed to repeated_phrases (default: 3)

    Repeated phrases are found with repeated_phrases, those rejected by
    is_trivial_phrase are discarded, the rest are joined into
    space-separated strings and passed through remove_subphrases.

    Returns the resulting list of strings in reverse order of discovery, so
    phrases built from longer word sequences come first.
    '''
    candidates = repeated_phrases(text, max_ngram, min_count)
    joined = [' '.join(words) for words in candidates
              if not is_trivial_phrase(words)]
    kept = remove_subphrases(joined)
    kept.reverse()
    return kept

if __name__ == '__main__':
    # Command-line entry point: read UTF-8 text from stdin, extract phrases
    # using the optional integer arguments [max_ngram] [min_count], and write
    # one phrase per line to stdout.
    #
    # Use the byte streams where they exist (sys.stdin.buffer on Python 3;
    # on Python 2 the plain streams already carry bytes) so input and output
    # are always UTF-8. A bare print of unicode text raises
    # UnicodeEncodeError on Python 2 when stdout is a pipe and a phrase
    # contains non-ASCII characters.
    stdin_bytes = getattr(sys.stdin, 'buffer', sys.stdin)
    stdout_bytes = getattr(sys.stdout, 'buffer', sys.stdout)
    # Named 'text' rather than 'input' to avoid shadowing the builtin.
    text = stdin_bytes.read().decode('utf-8')
    args = [text] + [int(arg) for arg in sys.argv[1:]]
    phrases = get_phrases(*args)
    # Same output as print: the joined phrases followed by one newline.
    stdout_bytes.write((u'\n'.join(phrases) + u'\n').encode('utf-8'))
    stdout_bytes.flush()
	'''
	A simple phrase extractor

	Usage: cat file.txt | python phraseextractor.py [max_ngram] [min_count]

	Options:
	max_ngram: Maximum phrase length in words (default: 5)
	min_count: Minimum number of phrase occurrences (default: 3)
	'''

	import sys
	import re
	from collections import Counter

	# Characters deleted from the text before it is split into words: basic
	# punctuation, ASCII digits, the en and em dashes (U+2013, U+2014) and the
	# right single quotation mark (U+2019).
	# Written as a plain u'' literal instead of ur'': the pattern contains no
	# backslashes other than the \u escapes, so the compiled regex is the same
	# on Python 2, while ur'' is a syntax error on Python 3.
	remove_chars = re.compile(u'[.(),;:0-9\u2013\u2014\u2019]')
	# Words that do not count towards a phrase being meaningful
	# (see is_trivial_phrase).
	stop_words = ['the', 'to', 'is', 'of', 'as', 'it', 'in', 'an', 'a', 'and',
	              'be', 'if', 'or', 'by', 'for', 'do', 'that', 'from', 'on',
	              'not', 'are', 'at', 'have', 'en', 'eu', 'has', 'been']

	def get_words(text):
	    '''
	    Split text into a list of lowercase words.

	    Every character matched by remove_chars is deleted first; the
	    remaining text is lowercased and split on whitespace.
	    '''
	    # The body is indented under the def; without that the function
	    # cannot be parsed.
	    return remove_chars.sub('', text).lower().split()

	def repeated_items(items, min_count):
	    '''
	    Return the distinct items that occur at least min_count times.

	    items: iterable of hashable items
	    min_count: minimum number of occurrences an item needs to be returned

	    Returns a list of the qualifying items, each listed once, in the
	    iteration order of the underlying Counter.
	    '''
	    counts = Counter(items)
	    # >= rather than >: min_count is documented as the *minimum* number
	    # of occurrences, so an item seen exactly min_count times qualifies.
	    # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
	    return [item for item, n in counts.items() if n >= min_count]

	def ngrams(words, n):
	    '''
	    Return every run of n consecutive words as a tuple, in order.

	    words: sequence of words (must support len() and slicing)
	    n: number of words in each run

	    Returns an empty list when n is not positive or when there are fewer
	    than n words.
	    '''
	    if n < 1:
	        # A non-positive length would produce empty or misaligned slices
	        # rather than real n-grams.
	        return []
	    # range() exists on both Python 2 and 3 (xrange is Python 2 only) and
	    # is already empty when len(words) < n.
	    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

	def repeated_phrases(text, max_ngram, min_count):
	    '''
	    Collect the repeated word sequences of 2 to max_ngram words in text.

	    text: the text to analyse; it is split into words with get_words
	    max_ngram: longest sequence length, in words, to consider
	    min_count: occurrence threshold, passed through to repeated_items

	    Returns a list of word tuples, shorter lengths first.
	    '''
	    words = get_words(text)
	    # range() exists on both Python 2 and 3; xrange is Python 2 only.
	    # The trailing condition skips empty tuples.
	    return [repeated for n in range(2, max_ngram + 1)
	            for repeated in repeated_items(ngrams(words, n), min_count)
	            if repeated]

	def remove_subphrases(phrases):
	    '''
	    Drop every phrase that appears, as a run of whole words, inside a
	    different phrase of the list.

	    phrases: list of phrase strings whose words are separated by single
	             spaces (the form get_phrases builds with ' '.join)

	    Returns the surviving phrases in their original order. Identical
	    duplicates do not eliminate each other.
	    '''
	    # Pad both sides with a space so containment is tested on word
	    # boundaries. A bare substring test would wrongly treat 'art gallery'
	    # as part of 'smart gallery owner'.
	    padded = [(phrase, ' ' + phrase + ' ') for phrase in phrases]
	    return [phrase for phrase, needle in padded
	            if all(phrase == other or needle not in haystack
	                   for other, haystack in padded)]

	def is_trivial_phrase(phrase):
	    '''
	    Tell whether a phrase consists mostly of stop words.

	    phrase: sequence of words

	    Returns True when no more than half of the words are absent from
	    stop_words.
	    '''
	    good_words = [word for word in phrase if word not in stop_words]
	    # Compare doubled count against the length so the test stays in
	    # integers and gives the same answer on Python 2 and 3.
	    return 2 * len(good_words) <= len(phrase)

	def get_phrases(text, max_ngram=5, min_count=3):
	    '''
	    Extract the repeated, non-trivial phrases from text.

	    text: the text to analyse
	    max_ngram: maximum phrase length in words (default: 5)
	    min_count: occurrence threshold handed to repeated_phrases
	               (default: 3)

	    Repeated phrases are found with repeated_phrases, those rejected by
	    is_trivial_phrase are discarded, the rest are joined into
	    space-separated strings and passed through remove_subphrases.

	    Returns the resulting list of strings in reverse order of discovery,
	    so phrases built from longer word sequences come first.
	    '''
	    repeated = [phrase
	                for phrase in repeated_phrases(text, max_ngram, min_count)
	                if not is_trivial_phrase(phrase)]
	    filtered = remove_subphrases([' '.join(phrase)
	                                  for phrase in repeated])
	    return list(reversed(filtered))

	if __name__ == '__main__':
	    # Command-line entry point: read UTF-8 text from stdin, extract
	    # phrases using the optional integer arguments [max_ngram]
	    # [min_count], and write one phrase per line to stdout.
	    #
	    # Use the byte streams where they exist (sys.stdin.buffer on
	    # Python 3; on Python 2 the plain streams already carry bytes) so
	    # input and output are always UTF-8. A bare print of unicode text
	    # raises UnicodeEncodeError on Python 2 when stdout is a pipe and a
	    # phrase contains non-ASCII characters.
	    stdin_bytes = getattr(sys.stdin, 'buffer', sys.stdin)
	    stdout_bytes = getattr(sys.stdout, 'buffer', sys.stdout)
	    # Named 'text' rather than 'input' to avoid shadowing the builtin.
	    text = stdin_bytes.read().decode('utf-8')
	    args = [text] + [int(arg) for arg in sys.argv[1:]]
	    phrases = get_phrases(*args)
	    # Same output as print: the joined phrases followed by one newline.
	    stdout_bytes.write((u'\n'.join(phrases) + u'\n').encode('utf-8'))
	    stdout_bytes.flush()