Skip to content

Instantly share code, notes, and snippets.

@docete
Created November 13, 2012 10:49
Show Gist options
  • Save docete/4065176 to your computer and use it in GitHub Desktop.
Save docete/4065176 to your computer and use it in GitHub Desktop.
运用词频、词的内聚度和词的自由运用程度 (new-word detection using word frequency, internal cohesion, and freedom of use at word boundaries)
#!/usr/bin/env python
# -*- coding: utf8 -*-
import collections
import math
import sys
import getopt
import re
def suffix(s, i):
    """Return the suffix of *s* that begins at index *i* (i.e. ``s[i:]``)."""
    return s[i:len(s)]
def prefix(s, i):
    """Return the prefix of *s* that ends just before index *i* (i.e. ``s[:i]``)."""
    return s[0:i]
def info_entropy(d):
    """Return the Shannon entropy (natural log) of a frequency table.

    d maps a neighbor token to its occurrence count. An empty table (or
    all-zero counts) yields 0.0 instead of dividing by zero.
    """
    total = sum(d.values())
    if total == 0:
        return 0.0
    result = 0.0
    for cnt in d.values():
        # Explicit float division keeps this correct even under
        # integer-division semantics.
        p = cnt / float(total)
        result -= p * math.log(p)
    return result
def word_detect(sentences, cohesion, dof):
    """Detect likely words in *sentences* by frequency, cohesion and freedom.

    sentences: sequence of unicode strings, one sentence each.
    cohesion:  minimum solidification score (how much more frequent the
               candidate is than its best two-part split) to keep it.
    dof:       minimum degree of freedom (min of left/right neighbor
               entropy) to keep it.

    Prints each accepted candidate to stdout as "word\\tfreq\\tsolid\\tfreedom"
    (same format as before) and additionally returns the accepted
    (word, freq, solid, freedom) tuples, sorted by word so the output is
    deterministic. Progress goes to stderr.
    """
    MAX_WORD_LEN = 5
    word_freqs = collections.Counter()
    # candidate -> Counter of characters seen immediately left/right of it.
    word_lefts = collections.defaultdict(collections.Counter)
    word_rights = collections.defaultdict(collections.Counter)

    print("Start Processing [%d] sentences" % len(sentences), file=sys.stderr)
    for s in sentences:
        n = len(s)
        # Enumerate every substring of length 1..MAX_WORD_LEN once,
        # recording its frequency and the characters adjacent to each
        # occurrence. This yields exactly the same counts as the original
        # sorted suffix/prefix-array scans, but in O(n * MAX_WORD_LEN)
        # per sentence instead of O(n^2).
        for i in range(1, min(n, MAX_WORD_LEN) + 1):
            for j in range(0, n - i + 1):
                candidate = s[j:j + i]
                word_freqs[candidate] += 1
                if j > 0:
                    word_lefts[candidate][s[j - 1]] += 1
                if j + i < n:
                    word_rights[candidate][s[j + i]] += 1

    total = sum(word_freqs.values())
    if total == 0:
        # No input text at all: nothing to detect.
        return []

    results = []
    # Sorted iteration makes the report order deterministic (the original
    # iterated a set, so order varied run to run).
    for word in sorted(word_freqs):
        if len(word) <= 1:
            # Single characters have no internal cohesion to measure.
            continue
        rate = word_freqs[word] / float(total)
        # Solidification: compare the word's rate against the most
        # plausible split into two adjacent parts. Every proper sub-span
        # is itself a counted candidate (length < MAX_WORD_LEN), so the
        # product below is always positive.
        solidification = max(
            (word_freqs[word[:i]] / float(total)) *
            (word_freqs[word[i:]] / float(total))
            for i in range(1, len(word))
        )
        solid = rate / solidification
        if solid < cohesion:
            continue
        # Freedom: boundary entropy on the weaker side. A word that only
        # appears at sentence edges has an empty neighbor table, entropy
        # 0, and is filtered out here.
        freedom = min(info_entropy(word_lefts[word]),
                      info_entropy(word_rights[word]))
        if freedom < dof:
            continue
        print("%s\t%s\t%s\t%s" % (word, word_freqs[word], solid, freedom))
        results.append((word, word_freqs[word], solid, freedom))
    return results
def main():
    """Parse command-line options, read the input files, run word detection.

    Options:
      -h / --help          print the module docstring and exit
      -f / --file FILES    whitespace-separated list of input files (required)
      -c / --cohesion N    minimum solidification threshold (default 150)
      -d / --dof N         minimum degree-of-freedom threshold (default 3)
    """
    try:
        # getopt long-option names must NOT include the leading "--";
        # the original passed "--file=" etc., so the long forms were
        # never recognized.
        opts, args = getopt.getopt(sys.argv[1:], "hf:c:d:",
                                   ["help", "file=", "cohesion=", "dof="])
    except getopt.error as msg:
        print(msg)
        print("for help use --help")
        sys.exit(1)

    input_files = None
    cohesion = 150
    dof = 3
    for o, a in opts:
        if o in ("-h", "--help"):
            print(__doc__)
            sys.exit(0)
        elif o in ("-f", "--file"):
            input_files = a.split()
        elif o in ("-c", "--cohesion"):
            cohesion = int(a)
        elif o in ("-d", "--dof"):
            dof = int(a)
        else:
            assert False, "unhandled option"

    if not input_files:
        # The original crashed with a TypeError (iterating None) when -f
        # was omitted; fail with a clear message instead.
        print("no input files given; use -f/--file", file=sys.stderr)
        sys.exit(1)

    content = []
    for fname in input_files:
        # 'with' guarantees the handle is closed; decoding happens at the
        # I/O boundary instead of via unicode() per line.
        with open(fname, encoding='utf8') as fd:
            for line in fd:
                line = line.strip()
                if not line:
                    continue
                print("processing: [%s][%d] and got [%d] sentences before."
                      % (line, len(line), len(content)), file=sys.stderr)
                # Split on runs of non-word characters and of ASCII
                # alphanumerics, keeping only the non-empty segments
                # (i.e. the CJK text between them).
                content.extend(
                    seg for seg in
                    re.split(r'\W+|[a-zA-Z0-9]+', line, flags=re.UNICODE)
                    if seg.strip()
                )
    word_detect(content, cohesion, dof)
# Run the detector only when executed as a script, so the module can be
# imported without side effects.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment