Created
November 13, 2012 10:49
-
-
Save docete/4065176 to your computer and use it in GitHub Desktop.
运用词频、词的内聚度和词的自由运用程度 — uses word frequency, word cohesion (internal solidity), and degree of freedom of use to discover new words.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf8 -*- | |
import collections
import getopt
import io
import math
import re
import sys
def suffix(s, i):
    """Return the tail of s beginning at index i (the suffix of length len(s)-i)."""
    tail = s[i:]
    return tail
def prefix(s, i):
    """Return the first i characters of s (the prefix of length i)."""
    head = s[:i]
    return head
def info_entropy(d):
    """Return the Shannon entropy (natural log) of a frequency table.

    d maps items (here: neighbor characters) to positive occurrence
    counts.  An empty table yields 0.0.

    Fixes: the original used the Python-2-only ``iteritems()`` (and
    materialized a throwaway list inside ``sum``); ``values()`` works on
    both Python 2 and 3.
    """
    total = sum(d.values())
    result = 0.0
    for cnt in d.values():
        p = float(cnt) / total
        result -= p * math.log(p)
    return result
def word_detect(sentences, cohesion, dof):
    """Detect candidate words by frequency, cohesion and degree of freedom.

    Every substring of 1..MAX_WORD_LEN characters of every sentence is a
    candidate.  For each candidate we accumulate its corpus frequency and
    the distribution of the characters immediately to its left and right.
    A multi-character candidate is accepted when:

      * cohesion  = P(word) / max over binary splits of P(left) * P(right)
        >= ``cohesion`` — the word occurs far more often than its parts
        would by chance (internal solidity), and
      * freedom   = min(entropy(left neighbors), entropy(right neighbors))
        >= ``dof`` — the word appears in many different contexts on both
        sides (freedom of use).

    Accepted words are written to stdout as
    "word<TAB>freq<TAB>cohesion<TAB>freedom" and also returned as a list of
    (word, freq, cohesion, freedom) tuples sorted by word (the original
    returned None and printed in arbitrary set order; returning the list is
    backward compatible and makes the function testable).

    Fixes vs. the original: neighbor counts are recorded in the same pass
    that enumerates candidates — the sorted-suffix/prefix scans were
    O(len(s)^2) string comparisons per sentence for identical counts; the
    deprecated ``has_key`` and Python-2-only ``print >>`` / ``xrange`` /
    ``iteritems`` are gone (runs on Python 2 and 3).
    """
    MAX_WORD_LEN = 5
    py2 = sys.version_info[0] == 2

    def _write(stream, text):
        # Python 2 byte streams want UTF-8 bytes; Python 3 streams want str.
        stream.write(text.encode('utf8') if py2 else text)

    def _entropy(counts):
        # Shannon entropy (natural log) of a {char: count} table; 0.0 when empty.
        total = sum(counts.values())
        result = 0.0
        for cnt in counts.values():
            p = float(cnt) / total
            result -= p * math.log(p)
        return result

    word_freqs = collections.Counter()
    word_lefts = {}    # word -> {left-neighbor char: count}
    word_rights = {}   # word -> {right-neighbor char: count}

    _write(sys.stderr, u"Start Processing [%d] sentences\n" % len(sentences))
    for s in sentences:
        _write(sys.stderr, u"Counting substrings of [%s]\n" % s)
        n = len(s)
        for length in range(1, min(n, MAX_WORD_LEN) + 1):
            for start in range(0, n - length + 1):
                candidate = s[start:start + length]
                word_freqs[candidate] += 1
                # Record this occurrence's neighbors immediately.  The
                # original rebuilt and sorted all suffixes/prefixes of s and
                # scanned them per word with startswith/endswith — same
                # counts, but O(len(s)^2) comparisons per sentence.
                if start > 0:
                    lefts = word_lefts.setdefault(candidate, {})
                    ch = s[start - 1]
                    lefts[ch] = lefts.get(ch, 0) + 1
                end = start + length
                if end < n:
                    rights = word_rights.setdefault(candidate, {})
                    ch = s[end]
                    rights[ch] = rights.get(ch, 0) + 1

    total = sum(word_freqs.values())
    word_rates = dict((w, float(c) / total) for w, c in word_freqs.items())

    results = []
    for word in sorted(word_freqs):
        # Single characters have no binary split to measure cohesion against.
        if len(word) <= 1:
            continue
        # Cohesion: frequency of the word relative to its most probable
        # binary split under an independence assumption.
        best_split = 0.0
        for i in range(1, len(word)):
            best_split = max(best_split,
                             word_rates[word[:i]] * word_rates[word[i:]])
        solid = word_rates[word] / best_split
        if solid < cohesion:
            continue
        # Freedom: a genuine word is usable in varied contexts on both
        # sides; take the weaker side.  Missing table => entropy 0.0,
        # matching the original's empty-dict default.
        freedom = min(_entropy(word_lefts.get(word, {})),
                      _entropy(word_rights.get(word, {})))
        if freedom < dof:
            continue
        freq = word_freqs[word]
        results.append((word, freq, solid, freedom))
        _write(sys.stdout, u"%s\t%s\t%s\t%s\n" % (word, freq, solid, freedom))
    return results
def main():
    """Command-line entry point.

    Options:
      -h, --help          print module help and exit
      -f, --file FILES    whitespace-separated list of UTF-8 input files
      -c, --cohesion N    cohesion threshold (default 150)
      -d, --dof N         degree-of-freedom threshold (default 3)

    Each input file is split into sentence fragments on non-word characters
    and ASCII alphanumeric runs, then fed to word_detect().
    """
    try:
        # Long option names are given to getopt WITHOUT the leading "--";
        # the original passed "--file=" etc., which registered unusable
        # options such as "----file".
        opts, args = getopt.getopt(sys.argv[1:], "hf:c:d:",
                                   ["help", "file=", "cohesion=", "dof="])
    except getopt.error as msg:
        print(msg)
        print("for help use --help")
        sys.exit(1)

    input_files = None
    cohesion = 150
    dof = 3
    for o, a in opts:
        if o in ("-h", "--help"):
            print(__doc__)
            sys.exit(0)
        elif o in ("-f", "--file"):
            input_files = a.split()
        elif o in ("-c", "--cohesion"):
            cohesion = int(a)
        elif o in ("-d", "--dof"):
            dof = int(a)
        else:
            assert False, "unhandled option"

    # The original crashed with a TypeError ("None is not iterable") when
    # -f was omitted; fail with a clear message instead.
    if not input_files:
        print("no input file given; for help use --help")
        sys.exit(1)

    content = []
    for fname in input_files:
        # io.open decodes UTF-8 on both Python 2 and 3 (replacing the
        # Python-2-only unicode() call), and the context manager closes
        # the file even if parsing raises.
        with io.open(fname, 'r', encoding='utf8') as fd:
            for line in fd:
                line = line.strip()
                if not line:
                    continue
                msg = (u"processing: [%s][%d] and got [%d] sentences before.\n"
                       % (line, len(line), len(content)))
                sys.stderr.write(
                    msg.encode('utf8') if sys.version_info[0] == 2 else msg)
                content.extend(
                    s for s in re.split(r'\W+|[a-zA-Z0-9]+', line, 0, re.UNICODE)
                    if s.strip())
    word_detect(content, cohesion, dof)
# Script entry point: run only when executed directly, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment