import csv
import math
# the files are encoded in GB but I converted them to UTF-8 first
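# (a one-off conversion sketch, assuming the originals were GB2312/GBK;
#  the source filename here is hypothetical, and the gb18030 codec is a
#  superset that decodes both)
# with open("CharFreq_inform.gb.txt", encoding="gb18030") as src, \
#         open("CharFreq_inform.txt", "w", encoding="utf-8") as dst:
#     dst.write(src.read())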
# the csv module wants the file opened with newline=''
with open("CharFreq_inform.txt", newline = '', encoding = "utf-8") as unigrams:
fieldnames = ["index", "char", "freq"]
reader = csv.DictReader(unigrams, delimiter = '\t', fieldnames = fieldnames)
# skip the first six rows, including the header
for i in range(6):
next(reader, None)
# create a dictionary of frequencies
freq_dict = { row["char"]: int(row["freq"]) for row in reader }
# assert(len(freq_dict) == 8954)
total_freq = sum(freq_dict.values())
prob_dict = { k: v / total_freq for k, v in freq_dict.items() }
zeroth_order_entropy = sum(map(lambda p: - math.log(p, 2) * p, prob_dict.values()))
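
# quick sanity check of the entropy formula above (illustrative only, says
# nothing about the data): a uniform distribution over four symbols should
# come out to exactly 2 bits
assert abs(sum(-p * math.log(p, 2) for p in [0.25] * 4) - 2.0) < 1e-12
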
with open("Bigram_news.txt", newline = '', encoding = "utf-8") as bigrams:
fieldnames = ["index", "bigram", "freq", "mi", "cum"]
reader = csv.DictReader(bigrams, delimiter = '\t', fieldnames = fieldnames)
# skip first two rows, including header
next(reader, None)
next(reader, None)
# want to calculate -\Sum_x p(x) \Sum_y p(y|x)log(p(y|x))
# log(p(y|x)) = I(x, y) + log(p(y)) where I(x, y) is the mutual information (mi)
# => \Sum_y p(y|x)log(p(y|x)) = (I(x, y) + log p(y))*2^I(x, y)*p(y)
# mutual information is the amount of extra information you gain by knowing x or y for the other
# will use the frequencies from the unigrams for p(x) and p(y)
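    # tiny numeric check of the identity above, with made-up numbers
    # (p(y) = 0.01 and I(x, y) = 3 bits give p(y|x) = 2**3 * 0.01 = 0.08):
    assert abs(0.08 * math.log(0.08, 2)
               - (3 + math.log(0.01, 2)) * (2 ** 3) * 0.01) < 1e-12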
    total = 0
    for row in reader:
        # python is cool: [cx, cy] = "贸易" gives cx = 贸, cy = 易
        [cx, cy] = row["bigram"]
        mi = float(row["mi"])   # I(x, y), pointwise mutual information
        # (assumes every bigram character also appears in the unigram table,
        #  otherwise freq_dict[cx] raises a KeyError)
        nx = freq_dict[cx]      # p(x) * N; divided by N (total_freq) below
        py = prob_dict[cy]      # p(y)
        # the inner term, p(y|x) * log2 p(y|x)
        inner = (mi + math.log(py, 2)) * (2 ** mi) * py
        total += nx * inner
first_order_entropy = -total / total_freq
print(zeroth_order_entropy)
print(first_order_entropy)
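
# rough sanity check: for a true joint distribution, conditioning cannot
# increase entropy (H(Y|X) <= H(Y)); the unigram and bigram counts come from
# different files here, so this is not mathematically guaranteed, but it
# holds comfortably for the results below
assert first_order_entropy <= zeroth_order_entropy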
# results:
# 0th order 现代信息 (modern, informative texts): 9.562894552919264
# first order 新闻 (news) | 现代信息: 4.04086609238876
# 0th order 现代全体 (modern, all texts): 9.665541472638475
# first order 新闻 (news) | 现代全体: 4.264654145085788