andreiolariu/phrase_extraction.py

## phrase_extraction.py
# for more info check out http://webmining.olariu.org/is-winter-really-coming
import re
from math import log, sqrt
import matplotlib.pyplot as pyplot

DEPTH = 3 # minimum depth for tree construction = minimum phrase length
OCCURRENCES = 10 # minimum number of phrase occurrences

text = open('game.txt').read() # reading input data
text = text.lower()
text = text.replace('\xe2\x80\x99', '') # treating apostrophes
words = re.findall(r'\w+', text) # spliting words

# constructing the tree of phrases
tree = {}

# first, phrases of length 1 (aka words) are constructed
for i in range(len(words)):
  word = words[i]
  if word not in tree:
    tree[word] = {'positions': []}
  tree[word]['positions'].append(i)

# then, as long as they are frequent enough,
# we go deeper in constructing the tree
def expand(node):
  if len(node['positions']) >= OCCURRENCES:
    node['children'] = {}
    for p in node['positions']:
      if p + 1 < len(words):
        word = words[p + 1]
        if word not in node['children']:
          node['children'][word] = {'positions': []}
        node['children'][word]['positions'].append(p + 1)
    for child in node['children'].itervalues():
      expand(child)

for node in tree.itervalues():
  expand(node)

# now we extract the frequent phrases of length > DEPTH
# we also compute a score based on the phrase length (in characters)
# and on the number of occurrences

phrases = []

def search(node, before, depth):
  if len(node['positions']) >= OCCURRENCES:
    if depth >= DEPTH:
      phrase = {
          'text': before,
          'score': len(before) * log(len(node['positions'])),
      }
      phrases.append(phrase)
    for word, child in node['children'].iteritems():
      search(child, before + ' ' + word, depth + 1)

for word, node in tree.iteritems():
  search(node, word, 1)

# sort by score
phrases.sort(key=lambda x: -x['score'])

for p in phrases[:100]:
  print p['text']

# plot results
pyplot.plot([p['score'] for p in phrases], \
    [log(i + 1) for i in range(len(phrases))], 'ro')
pyplot.xlabel('phrase score')
pyplot.ylabel('log(phrase rank)')
pyplot.show()
	# for more info check out http://webmining.olariu.org/is-winter-really-coming
	import re
	from math import log, sqrt
	import matplotlib.pyplot as pyplot

	DEPTH = 3 # minimum depth for tree construction = minimum phrase length
	OCCURRENCES = 10 # minimum number of phrase occurrences

	text = open('game.txt').read() # reading input data
	text = text.lower()
	text = text.replace('\xe2\x80\x99', '') # treating apostrophes
	words = re.findall(r'\w+', text) # spliting words

	# constructing the tree of phrases
	tree = {}

	# first, phrases of length 1 (aka words) are constructed
	for i in range(len(words)):
	word = words[i]
	if word not in tree:
	tree[word] = {'positions': []}
	tree[word]['positions'].append(i)

	# then, as long as they are frequent enough,
	# we go deeper in constructing the tree
	def expand(node):
	if len(node['positions']) >= OCCURRENCES:
	node['children'] = {}
	for p in node['positions']:
	if p + 1 < len(words):
	word = words[p + 1]
	if word not in node['children']:
	node['children'][word] = {'positions': []}
	node['children'][word]['positions'].append(p + 1)
	for child in node['children'].itervalues():
	expand(child)

	for node in tree.itervalues():
	expand(node)

	# now we extract the frequent phrases of length > DEPTH
	# we also compute a score based on the phrase length (in characters)
	# and on the number of occurrences

	phrases = []

	def search(node, before, depth):
	if len(node['positions']) >= OCCURRENCES:
	if depth >= DEPTH:
	phrase = {
	'text': before,
	'score': len(before) * log(len(node['positions'])),
	}
	phrases.append(phrase)
	for word, child in node['children'].iteritems():
	search(child, before + ' ' + word, depth + 1)

	for word, node in tree.iteritems():
	search(node, word, 1)

	# sort by score
	phrases.sort(key=lambda x: -x['score'])

	for p in phrases[:100]:
	print p['text']

	# plot results
	pyplot.plot([p['score'] for p in phrases], \
	[log(i + 1) for i in range(len(phrases))], 'ro')
	pyplot.xlabel('phrase score')
	pyplot.ylabel('log(phrase rank)')
	pyplot.show()