Created
October 19, 2019 01:06
-
-
Save bluecookies/c78fe32954c1db44989e34abe2efa405 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import math


def load_unigram_freqs(path, skip_rows=6):
    """Read a tab-separated character-frequency table into {char: count}.

    The source files are the Chinese character-frequency lists (originally
    GB-encoded, pre-converted to UTF-8). The first `skip_rows` lines are
    preamble/header and are skipped. csv wants files opened with newline=''.
    """
    with open(path, newline='', encoding="utf-8") as unigrams:
        reader = csv.DictReader(unigrams, delimiter='\t',
                                fieldnames=["index", "char", "freq"])
        for _ in range(skip_rows):
            next(reader, None)
        return {row["char"]: int(row["freq"]) for row in reader}


def zeroth_order_entropy(prob_dict):
    """Shannon entropy (bits/char) of the unigram distribution: -sum p*log2 p."""
    return -sum(p * math.log2(p) for p in prob_dict.values())


def first_order_entropy(path, freq_dict, prob_dict, total_freq, skip_rows=2):
    """First-order (conditional) entropy H(Y|X) in bits/char from a bigram table.

    We want  -sum_x p(x) sum_y p(y|x) log2 p(y|x).
    The table supplies the mutual information I(x,y) = log2(p(y|x)/p(y)), so
        p(y|x)      = 2^I(x,y) * p(y)
        log p(y|x)  = I(x,y) + log2 p(y)
        => p(y|x) log p(y|x) = (I(x,y) + log2 p(y)) * 2^I(x,y) * p(y)
    Unigram counts stand in for p(x) and p(y); we accumulate counts (p(x)*N)
    and divide by N once at the end.

    Raises KeyError if a bigram contains a character absent from freq_dict.
    """
    with open(path, newline='', encoding="utf-8") as bigrams:
        reader = csv.DictReader(bigrams, delimiter='\t',
                                fieldnames=["index", "bigram", "freq", "mi", "cum"])
        for _ in range(skip_rows):
            next(reader, None)
        total = 0.0
        for row in reader:
            # python is cool - [cx, cy] = "贸易" gives cx = 贸, cy = 易
            [cx, cy] = row["bigram"]
            mi = float(row["mi"])   # I(x, y), mutual information in bits
            nx = freq_dict[cx]      # p(x) * N; divided out at the end
            py = prob_dict[cy]      # p(y)
            # the inner term, p(y|x) * log p(y|x)
            inner = (mi + math.log2(py)) * (2 ** mi) * py
            total += nx * inner
        return -total / total_freq


def main(unigram_path="CharFreq_inform.txt", bigram_path="Bigram_news.txt"):
    """Compute and print the 0th- and 1st-order character entropies."""
    freq_dict = load_unigram_freqs(unigram_path)
    # assert len(freq_dict) == 8954
    total_freq = sum(freq_dict.values())
    prob_dict = {k: v / total_freq for k, v in freq_dict.items()}
    print(zeroth_order_entropy(prob_dict))
    print(first_order_entropy(bigram_path, freq_dict, prob_dict, total_freq))


if __name__ == "__main__":
    main()

# results:
# 0th order 现代信息 9.562894552919264
# first order 新闻 | 现代信息 4.04086609238876
# 0th order 现代全体 9.665541472638475
# first order 新闻 | 现代全体 4.264654145085788
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment