Created
October 19, 2019 01:06
-
-
Save bluecookies/c78fe32954c1db44989e34abe2efa405 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import math


def load_unigram_freqs(path, skip_rows=6):
    """Read a tab-separated character-frequency table into {char: count}.

    The source files are the Chinese character-frequency lists (originally
    GB-encoded, pre-converted to UTF-8). The first `skip_rows` lines are
    preamble/header and are skipped. csv wants files opened with newline=''.
    """
    with open(path, newline='', encoding="utf-8") as unigrams:
        reader = csv.DictReader(unigrams, delimiter='\t',
                                fieldnames=["index", "char", "freq"])
        for _ in range(skip_rows):
            next(reader, None)
        return {row["char"]: int(row["freq"]) for row in reader}


def zeroth_order_entropy(prob_dict):
    """Shannon entropy (bits/char) of the unigram distribution: -sum p*log2 p."""
    return -sum(p * math.log2(p) for p in prob_dict.values())


def first_order_entropy(path, freq_dict, prob_dict, total_freq, skip_rows=2):
    """First-order (conditional) entropy H(Y|X) in bits/char from a bigram table.

    We want  -sum_x p(x) sum_y p(y|x) log2 p(y|x).
    The table supplies the mutual information I(x,y) = log2(p(y|x)/p(y)), so
        p(y|x)      = 2^I(x,y) * p(y)
        log p(y|x)  = I(x,y) + log2 p(y)
        => p(y|x) log p(y|x) = (I(x,y) + log2 p(y)) * 2^I(x,y) * p(y)
    Unigram counts stand in for p(x) and p(y); we accumulate counts (p(x)*N)
    and divide by N once at the end.

    Raises KeyError if a bigram contains a character absent from freq_dict.
    """
    with open(path, newline='', encoding="utf-8") as bigrams:
        reader = csv.DictReader(bigrams, delimiter='\t',
                                fieldnames=["index", "bigram", "freq", "mi", "cum"])
        for _ in range(skip_rows):
            next(reader, None)
        total = 0.0
        for row in reader:
            # python is cool - [cx, cy] = "贸易" gives cx = 贸, cy = 易
            [cx, cy] = row["bigram"]
            mi = float(row["mi"])   # I(x, y), mutual information in bits
            nx = freq_dict[cx]      # p(x) * N; divided out at the end
            py = prob_dict[cy]      # p(y)
            # the inner term, p(y|x) * log p(y|x)
            inner = (mi + math.log2(py)) * (2 ** mi) * py
            total += nx * inner
        return -total / total_freq


def main(unigram_path="CharFreq_inform.txt", bigram_path="Bigram_news.txt"):
    """Compute and print the 0th- and 1st-order character entropies."""
    freq_dict = load_unigram_freqs(unigram_path)
    # assert len(freq_dict) == 8954
    total_freq = sum(freq_dict.values())
    prob_dict = {k: v / total_freq for k, v in freq_dict.items()}
    print(zeroth_order_entropy(prob_dict))
    print(first_order_entropy(bigram_path, freq_dict, prob_dict, total_freq))


if __name__ == "__main__":
    main()

# results:
# 0th order 现代信息 9.562894552919264
# first order 新闻 | 现代信息 4.04086609238876
# 0th order 现代全体 9.665541472638475
# first order 新闻 | 现代全体 4.264654145085788
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment