Skip to content

Instantly share code, notes, and snippets.

@coltenwebb
Created January 3, 2023 18:16
Show Gist options
  • Save coltenwebb/44d0a6bca76b43c8703fbd11fdc5704f to your computer and use it in GitHub Desktop.
Save coltenwebb/44d0a6bca76b43c8703fbd11fdc5704f to your computer and use it in GitHub Desktop.
empirical entropy of text file
// g++-12 ordered_entropy.cpp -std=c++17 && ./a.out enwik8
#include <bits/stdc++.h>
using namespace std;
bool filt(unsigned char c) {
if (c >= 'a' && c <= 'z') return true;
if (c >= 'A' && c <= 'Z') return true;
if (c >= '0' && c <= '9') return true;
//if (c == '.' || c==',' || c==' ') return true;
if (c==' ') return true;
return false;
}
void calc_entropy(istream& in) {
int cnt = 0;
vector<int> cnt0(1<<8);
vector<int> cnt1(1<<16);
vector<int> cnt2(1<<24);
int cur = 0;
while (in) {
unsigned char c = in.get();
if (!filt(c)) continue;
cur <<= 8;
cur += c;
cnt++;
cnt0[cur&(cnt0.size()-1)]++;
cnt1[cur&(cnt1.size()-1)]++;
cnt2[cur&(cnt2.size()-1)]++;
}
double h0=0,h1=0,h2 = 0,h3=0;
for (int i = 0; i < (1<<8); i++) {
if (!cnt0[i]) continue;
double p = (double)cnt0[i]/cnt;
h0 += -log2(p) * p;
}
for (int i = 0; i < (1<<16); i++) {
int a = i >> 8;
if (!cnt0[a] || !cnt1[i]) continue;
double pa = (double)cnt0[a]/cnt;
double pb = (double)cnt1[i]/cnt0[a];
h1 += -log2(pb) * pb * pa;
}
for (int i = 0; i < (1<<24); i++) {
int a = i >> 16;
int b = i >> 8;
if (!cnt0[a] || !cnt1[b] || !cnt2[i]) continue;
double pa = (double)cnt0[a]/cnt;
double pb = (double)cnt1[b]/cnt0[a];
double pc = (double)cnt2[i]/cnt1[b];
h2 += -log2(pc) * pc * pb * pa;
}
cout << h0 << "\n";
cout << h1 << "\n";
cout << h2 << "\n";
}
int main(int argc, char* args[]) {
if (argc != 2) {
cerr << "invalid args\n";
exit(1);
}
ifstream fin {args[1]};
if (!fin) {
cerr << "couldnt open file\n";
exit(1);
}
calc_entropy(fin);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment