Skip to content

Instantly share code, notes, and snippets.

@rockyzhengwu
Last active July 9, 2018 03:45
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rockyzhengwu/732e25f0fada7574bb839d42d20fddc7 to your computer and use it in GitHub Desktop.
Save rockyzhengwu/732e25f0fada7574bb839d42d20fddc7 to your computer and use it in GitHub Desktop.
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize
from collections import defaultdict
import matplotlib.pyplot as plt
import math
import numpy as np
class BookStat():
    """Token-frequency and per-sentence entropy statistics for one text file.

    On construction the file is split into sentences and tokens with NLTK's
    ``sent_tokenize`` / ``word_tokenize``.  The analysis methods print their
    results to stdout and draw matplotlib figures.

    The misspelled public method name ``caculate_sent_entroy`` is kept on
    purpose so existing callers continue to work.
    """

    def __init__(self, book_path, encoding="utf-8"):
        """Load the book and derive per-token relative frequencies.

        Args:
            book_path: Path of the text file to analyse.
            encoding: Text encoding used to read the file.  Defaults to
                UTF-8 instead of the platform locale so the same file is
                decoded identically on every machine.
        """
        self.book_path = book_path
        self.encoding = encoding
        self.sents = []                # sentence strings, in file order
        self.words = defaultdict(int)  # token -> occurrence count
        self._load_book()
        self.total_word = sum(self.words.values())
        total = self.total_word
        # Relative frequency of each token.  For an empty book the
        # comprehension has no items, so the division never runs.
        self.word_prob = {
            token: count / total for token, count in self.words.items()
        }

    def _load_book(self):
        """Read the file, split it into sentences and count every token."""
        # The context manager closes the handle even if tokenizing fails.
        with open(self.book_path, encoding=self.encoding) as f:
            text = f.read()
        self.sents = sent_tokenize(text)
        for sent in self.sents:
            for w in word_tokenize(sent):
                self.words[w] += 1

    @staticmethod
    def _sentence_entropy(words, word_prob):
        """Return the sum of ``-p(w) * ln p(w)`` over the tokens in *words*.

        Repeated tokens contribute once per occurrence.  An empty token
        list yields ``0``.
        """
        return sum(-word_prob[w] * math.log(word_prob[w]) for w in words)

    @staticmethod
    def _count_to_exceed(values, threshold):
        """Return how many leading *values* make the running sum exceed *threshold*.

        The comparison is strict (``>``).  Returns ``0`` when the running
        sum never exceeds the threshold (including for empty input).
        """
        running = 0.0
        for count, value in enumerate(values, start=1):
            running += value
            if running > threshold:
                return count
        return 0

    def show_zipf(self):
        """Plot log(frequency) against log(rank) and print frequency summaries.

        Prints the 100 most frequent ``(token, count)`` pairs, the number of
        distinct tokens, the total token count, the sorted list of distinct
        frequencies and, aligned with it, how many tokens have each frequency.
        Finally shows the matplotlib figure.
        """
        sort_words = sorted(
            self.words.items(), key=lambda item: item[1], reverse=True
        )
        log_rank = []
        log_freq = []
        count_dict = defaultdict(int)  # frequency -> number of tokens with it
        for rank, (_, freq) in enumerate(sort_words, start=1):
            log_rank.append(math.log(rank))
            log_freq.append(math.log(freq))
            count_dict[freq] += 1
        # Keys are unique, so sorting the pairs orders them by frequency.
        s_count = sorted(count_dict.items())
        count_num = [freq for freq, _ in s_count]
        count_value = [n_tokens for _, n_tokens in s_count]
        plt.plot(np.array(log_rank), np.array(log_freq))
        for info in sort_words[:100]:
            print(info)
        print(len(self.words))
        print(sum(self.words.values()))
        print(count_num)
        print(count_value)
        plt.show()

    def caculate_sent_entroy(self):
        """Score every sentence and report the ones holding 80% of the total.

        A sentence's score is the sum of ``-p(w) * ln p(w)`` over its tokens,
        where ``p`` is the token's relative frequency in the whole book.
        The method creates a histogram figure of the scores (it does not
        call ``plt.show()``), then walks the sentences from highest to
        lowest score until the running sum exceeds 80% of the total, and
        prints those sentences.

        ``counter`` is the number of sentences needed to exceed the
        threshold, so the sentence that crosses it is included in the
        count and in the printed list.
        """
        if not self.sents:
            # Nothing to rank; the histogram (bins=0) and the ratio printed
            # at the end would both raise on an empty book.
            print("total_sent :", 0)
            return
        # Tokenize once and reuse the result for the word counts below.
        tokenized = [word_tokenize(sent) for sent in self.sents]
        entroy_list = [
            self._sentence_entropy(words, self.word_prob)
            for words in tokenized
        ]
        arg_index = np.argsort(entroy_list)  # ascending by score
        plt.figure()
        plt.hist(entroy_list, bins=len(self.sents))
        total_entroy = sum(entroy_list)
        ranked = [entroy_list[i] for i in arg_index[::-1]]  # descending
        counter = self._count_to_exceed(ranked, total_entroy * 0.8)
        # Echo the scores that were accumulated; all of them when the
        # threshold was never crossed (only possible if every score is 0).
        for value in ranked[:counter or len(ranked)]:
            print(value)
        if counter:
            print(ranked[counter - 1])
            print("80% count:", counter)
        # The last `counter` entries of the ascending order are the top
        # `counter` sentences; with counter == 0 the slice is empty.
        for i in arg_index[len(self.sents) - counter:]:
            print("#"*30+":%f"%(entroy_list[i]), " word count", len(tokenized[i]))
            print(self.sents[i])
        print("total_sent :", len(self.sents))
        print("counter: ", counter)
        print(counter/len(self.sents))
        # The label below reads "total entropy".
        print("总的熵:", total_entroy)
if __name__ == "__main__":
    # Script entry point: load the book once, then run both analyses in turn.
    stats = BookStat("17500.txt.utf-8")
    stats.show_zipf()
    stats.caculate_sent_entroy()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment