Created
October 6, 2012 09:37
-
-
Save MaskRay/3844497 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
words = [] | |
while l = gets | |
len = $1 if l =~ /^---len: (\d+)/ | |
next if len == 1 | |
if l =~ /^(\d+): (\S+) pmi: ([\d.]+) entropy: ([\w.]+)/ | |
cnt = $1.to_i | |
pmi = $3.to_f | |
entropy = $4.to_i | |
if entropy > 80 and pmi > 0.3 | |
words << [cnt, $2, pmi, entropy] | |
end | |
end | |
end | |
words.sort_by! {|x| x[3] * x[2] }.reverse!.take(50).each do |a| | |
puts a[1] | |
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm> | |
#include <cmath> | |
#include <functional> | |
#include <numeric> | |
#include <unordered_map> | |
#include <vector> | |
#include <stdint.h> | |
#include <stdio.h> | |
using namespace std; | |
bool isCJKUnifiedIdeograph(int32_t c) | |
{ | |
return 0x4E00 <= c && c < 0xA000; | |
} | |
bool isCJKSymbol(int32_t c) | |
{ | |
return 0x3000 <= c && c < 0x3040; | |
} | |
class UTF8Reader | |
{ | |
public: | |
UTF8Reader(FILE* in) : in(in) {} | |
int32_t get() { | |
int s, c, t; | |
if ((s = fgetc(in)) < 0x80) return s; | |
c = __builtin_clz(~ s & 0xFF) - (sizeof(unsigned) * 8 - 8); | |
if (! (2 <= c && c <= 6)) return -2; | |
s &= (1 << 7 - c) - 1; | |
while (--c) { | |
t = fgetc(in); | |
s = (s << 6) + (t & 0x3F); | |
} | |
return s; | |
} | |
protected: | |
FILE* in; | |
}; | |
class UTF8Writer | |
{ | |
public: | |
UTF8Writer(FILE* out) : out(out) {} | |
int put(int32_t c) { | |
if (c < 0x80) | |
fputc(c, out); | |
else { | |
int d; | |
if (c < 0x800) { | |
fputc(0xC0 | c >> 6 & 31, out); | |
d = 1; | |
} else if (c < 0x10000) { | |
fputc(0xE0 | c >> 12 & 15, out); | |
d = 2; | |
} else if (c < 0x200000) { | |
fputc(0xF0 | c >> 18 & 7, out); | |
d = 3; | |
} else if (c < 0x4000000) { | |
fputc(0xF8 | c >> 24 & 3, out); | |
d = 4; | |
} else if (c < 0x80000000) { | |
fputc(0xFC | c >> 30, out); | |
d = 5; | |
} else | |
return -2; | |
while (d--) | |
fputc(0x80 | c >> d*6 & 63, out); | |
} | |
return 0; | |
} | |
protected: | |
FILE* out; | |
}; | |
namespace std | |
{ | |
template<> | |
class hash<vector<uint16_t>> { | |
public: | |
size_t operator()(const vector<uint16_t> &s) const | |
{ | |
size_t h = 0; | |
for (auto c : s) | |
h = h * 17 + c; | |
return h; | |
} | |
}; | |
} | |
namespace SuffixArray | |
{ | |
struct Type | |
{ | |
size_t id, key[2]; | |
bool operator<(const Type &rhs) const { | |
return key[0] < rhs.key[0] || key[0] == rhs.key[0] && key[1] < rhs.key[1]; | |
} | |
}; | |
template<int k> | |
void countingSort(const Type a[], size_t n, Type b[], size_t h[]) | |
{ | |
fill_n(h, n, 0); | |
for (int i = 0; i < n; ++i) | |
h[a[i].key[k]]++; | |
for (int i = 1; i < n; ++i) | |
h[i] += h[i-1]; | |
for (int i = n; --i >= 0; ) | |
b[--h[a[i].key[k]]] = a[i]; | |
} | |
template<typename T> | |
void run(T a[], size_t n, size_t output[], size_t h[]) | |
{ | |
Type *sa = new Type[n], *nsa = new Type[n]; | |
size_t *r = new size_t[n]; | |
for (size_t i = 0; i < n; ++i) | |
sa[i].id = i, sa[i].key[0] = size_t(a[i]), sa[i].key[1] = 0; | |
sort(sa, sa + n); | |
for (size_t k = 1; ; k <<= 1) { | |
r[sa[0].id] = 0; | |
for (size_t i = 1; i < n; ++i) | |
r[sa[i].id] = r[sa[i-1].id] + (sa[i-1] < sa[i]); | |
if (k >= n || r[sa[n-1].id] == n-1) break; | |
for (size_t i = 0; i < n; ++i) { | |
sa[i].id = i; | |
sa[i].key[0] = r[i]; | |
sa[i].key[1] = i + k < n ? r[i + k] + 1 : 0; | |
} | |
countingSort<1>(sa, n, nsa, h); | |
countingSort<0>(nsa, n, sa, h); | |
} | |
for (size_t i = 0; i < n; i++) | |
output[i] = sa[i].id; | |
for (size_t k = 0, i = 0; i < n; ++i) | |
if (r[i]) { | |
for (size_t j = sa[r[i]-1].id; a[j+k] == a[i+k]; ++k); | |
h[r[i]] = k; | |
if (k) --k; | |
} | |
delete[] sa; | |
delete[] nsa; | |
delete[] r; | |
} | |
}; | |
class WordExtractor | |
{ | |
public: | |
WordExtractor(const function<bool(const uint16_t*, size_t, size_t)>& word_filter) : word_filter(word_filter), u8w(stdout) {} | |
void extract(uint16_t* content, size_t n, size_t max_len) | |
{ | |
freq.clear(); | |
pmi.clear(); | |
entropy.clear(); | |
this->content = content; | |
this->n = n; | |
sa = new size_t[n]; | |
h = new size_t[n]; | |
SuffixArray::run(&content[0], n, sa, h); | |
for (size_t len = 1; len <= max_len; len++) | |
processLen(len, true); | |
reverse(content, content + n); | |
SuffixArray::run(&content[0], n, sa, h); | |
for (size_t len = 1; len <= max_len; len++) { | |
printf("---len: %zd---\n", len); | |
processLen(len, false); | |
} | |
delete[] sa; | |
delete[] h; | |
} | |
void processRange(size_t len, size_t lo, size_t hi, bool forward) | |
{ | |
if (forward) { | |
const uint16_t* first = &content[sa[lo]]; | |
if (! word_filter(first, len, hi - lo)) return; | |
vector<uint16_t> word(first, first + len); | |
freq[word] += hi - lo; | |
if (len > 1) { | |
vector<uint16_t> left(first, first + 1), right(first + 1, first + len); | |
pmi[word] = log(double(freq[word]) / freq[left] / freq[right] * n) / - log(double(freq[word]) / n); | |
if (len > 2) { | |
left.assign(first, first + len - 1); | |
right.assign(first + len - 1, first + len); | |
pmi[word] = min(pmi[word], log(double(freq[word]) / freq[left] / freq[right] * n) / - log(double(freq[word]) / n)); | |
} | |
} | |
entropy[word] = 0; | |
for (size_t j, i = lo + (sa[lo] + len >= n); i < hi; i = j) { | |
for (j = i + 1; j < hi && content[sa[i] + len] == content[sa[j] + len]; j++); | |
if (isCJKUnifiedIdeograph(content[sa[i] + len]) || isCJKSymbol(content[sa[i] + len])) | |
entropy[word] += - log(double(j - i) / (hi - lo)); | |
} | |
} else { | |
const uint16_t* first = &content[sa[lo]]; | |
vector<uint16_t> word(first, first + len); | |
reverse(word.begin(), word.end()); | |
if (! freq.count(word)) return; | |
double en = 0; | |
for (size_t j, i = lo + (sa[lo] + len >= n); i < hi; i = j) { | |
for (j = i + 1; j < hi && content[sa[i] + len] == content[sa[j] + len]; j++); | |
if (isCJKUnifiedIdeograph(content[sa[i] + len]) || isCJKSymbol(content[sa[i] + len])) | |
en += - log(double(j - i) / (hi - lo)); | |
} | |
entropy[word] = min(entropy[word], en); | |
printf("%zd: ", hi - lo); | |
for (size_t i = 0; i < len; i++) | |
u8w.put(first[len-1-i]); | |
if (len > 1) | |
printf(" pmi: %lf", pmi[word]); | |
printf(" entropy: %lf", entropy[word]); | |
puts(""); | |
} | |
} | |
void processLen(size_t len, bool forward) { | |
size_t i; | |
for (i = 1; i < n; i++) | |
if (h[i] < len) { | |
if (len <= n - sa[i-1]) | |
processRange(len, i - 1, i, forward); | |
} else { | |
size_t j = i - 1; | |
while (++i < n && h[i] >= len); | |
processRange(len, j, i, forward); | |
} | |
if (i == n && len <= n - sa[n-1]) | |
processRange(len, n - 1, n, forward); | |
} | |
protected: | |
size_t *sa, *h, n; | |
const function<bool(const uint16_t*, size_t, size_t)>& word_filter; | |
uint16_t* content; | |
unordered_map<vector<uint16_t>, size_t> freq; | |
unordered_map<vector<uint16_t>, double> pmi, entropy; | |
UTF8Writer u8w; | |
}; | |
int main() | |
{ | |
UTF8Reader utf8(stdin); | |
vector<uint16_t> content; | |
int32_t c, d = 0; | |
while ((c = utf8.get()) >= 0) | |
if (isCJKUnifiedIdeograph(c) || isCJKSymbol(c)) | |
content.push_back(c); | |
WordExtractor we([](const uint16_t* first, size_t len, size_t cnt) { | |
if (cnt < 50) return false; | |
for (size_t i = 0; i < len; i++) | |
if (! isCJKUnifiedIdeograph(first[i])) | |
return false; | |
return true; | |
}); | |
we.extract(&content[0], content.size(), 6); | |
if (c == -2) | |
return 2; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
make WordExtractor CXXFLAGS='-std=c++11'
./WordExtractor < in-question.txt | ruby analysis.rb