| words = [] | |
| while l = gets | |
| len = $1 if l =~ /^---len: (\d+)/ | |
| next if len == 1 | |
| if l =~ /^(\d+): (\S+) pmi: ([\d.]+) entropy: ([\w.]+)/ | |
| cnt = $1.to_i | |
| pmi = $3.to_f | |
| entropy = $4.to_i | |
| if entropy > 80 and pmi > 0.3 | |
| words << [cnt, $2, pmi, entropy] | |
| end | |
| end | |
| end | |
| words.sort_by! {|x| x[3] * x[2] }.reverse!.take(50).each do |a| | |
| puts a[1] | |
| end |
| #include <algorithm> | |
| #include <cmath> | |
| #include <functional> | |
| #include <numeric> | |
| #include <unordered_map> | |
| #include <vector> | |
| #include <stdint.h> | |
| #include <stdio.h> | |
| using namespace std; | |
| bool isCJKUnifiedIdeograph(int32_t c) | |
| { | |
| return 0x4E00 <= c && c < 0xA000; | |
| } | |
| bool isCJKSymbol(int32_t c) | |
| { | |
| return 0x3000 <= c && c < 0x3040; | |
| } | |
| class UTF8Reader | |
| { | |
| public: | |
| UTF8Reader(FILE* in) : in(in) {} | |
| int32_t get() { | |
| int s, c, t; | |
| if ((s = fgetc(in)) < 0x80) return s; | |
| c = __builtin_clz(~ s & 0xFF) - (sizeof(unsigned) * 8 - 8); | |
| if (! (2 <= c && c <= 6)) return -2; | |
| s &= (1 << 7 - c) - 1; | |
| while (--c) { | |
| t = fgetc(in); | |
| s = (s << 6) + (t & 0x3F); | |
| } | |
| return s; | |
| } | |
| protected: | |
| FILE* in; | |
| }; | |
| class UTF8Writer | |
| { | |
| public: | |
| UTF8Writer(FILE* out) : out(out) {} | |
| int put(int32_t c) { | |
| if (c < 0x80) | |
| fputc(c, out); | |
| else { | |
| int d; | |
| if (c < 0x800) { | |
| fputc(0xC0 | c >> 6 & 31, out); | |
| d = 1; | |
| } else if (c < 0x10000) { | |
| fputc(0xE0 | c >> 12 & 15, out); | |
| d = 2; | |
| } else if (c < 0x200000) { | |
| fputc(0xF0 | c >> 18 & 7, out); | |
| d = 3; | |
| } else if (c < 0x4000000) { | |
| fputc(0xF8 | c >> 24 & 3, out); | |
| d = 4; | |
| } else if (c < 0x80000000) { | |
| fputc(0xFC | c >> 30, out); | |
| d = 5; | |
| } else | |
| return -2; | |
| while (d--) | |
| fputc(0x80 | c >> d*6 & 63, out); | |
| } | |
| return 0; | |
| } | |
| protected: | |
| FILE* out; | |
| }; | |
| namespace std | |
| { | |
| template<> | |
| class hash<vector<uint16_t>> { | |
| public: | |
| size_t operator()(const vector<uint16_t> &s) const | |
| { | |
| size_t h = 0; | |
| for (auto c : s) | |
| h = h * 17 + c; | |
| return h; | |
| } | |
| }; | |
| } | |
| namespace SuffixArray | |
| { | |
| struct Type | |
| { | |
| size_t id, key[2]; | |
| bool operator<(const Type &rhs) const { | |
| return key[0] < rhs.key[0] || key[0] == rhs.key[0] && key[1] < rhs.key[1]; | |
| } | |
| }; | |
| template<int k> | |
| void countingSort(const Type a[], size_t n, Type b[], size_t h[]) | |
| { | |
| fill_n(h, n, 0); | |
| for (int i = 0; i < n; ++i) | |
| h[a[i].key[k]]++; | |
| for (int i = 1; i < n; ++i) | |
| h[i] += h[i-1]; | |
| for (int i = n; --i >= 0; ) | |
| b[--h[a[i].key[k]]] = a[i]; | |
| } | |
| template<typename T> | |
| void run(T a[], size_t n, size_t output[], size_t h[]) | |
| { | |
| Type *sa = new Type[n], *nsa = new Type[n]; | |
| size_t *r = new size_t[n]; | |
| for (size_t i = 0; i < n; ++i) | |
| sa[i].id = i, sa[i].key[0] = size_t(a[i]), sa[i].key[1] = 0; | |
| sort(sa, sa + n); | |
| for (size_t k = 1; ; k <<= 1) { | |
| r[sa[0].id] = 0; | |
| for (size_t i = 1; i < n; ++i) | |
| r[sa[i].id] = r[sa[i-1].id] + (sa[i-1] < sa[i]); | |
| if (k >= n || r[sa[n-1].id] == n-1) break; | |
| for (size_t i = 0; i < n; ++i) { | |
| sa[i].id = i; | |
| sa[i].key[0] = r[i]; | |
| sa[i].key[1] = i + k < n ? r[i + k] + 1 : 0; | |
| } | |
| countingSort<1>(sa, n, nsa, h); | |
| countingSort<0>(nsa, n, sa, h); | |
| } | |
| for (size_t i = 0; i < n; i++) | |
| output[i] = sa[i].id; | |
| for (size_t k = 0, i = 0; i < n; ++i) | |
| if (r[i]) { | |
| for (size_t j = sa[r[i]-1].id; a[j+k] == a[i+k]; ++k); | |
| h[r[i]] = k; | |
| if (k) --k; | |
| } | |
| delete[] sa; | |
| delete[] nsa; | |
| delete[] r; | |
| } | |
| }; | |
| class WordExtractor | |
| { | |
| public: | |
| WordExtractor(const function<bool(const uint16_t*, size_t, size_t)>& word_filter) : word_filter(word_filter), u8w(stdout) {} | |
| void extract(uint16_t* content, size_t n, size_t max_len) | |
| { | |
| freq.clear(); | |
| pmi.clear(); | |
| entropy.clear(); | |
| this->content = content; | |
| this->n = n; | |
| sa = new size_t[n]; | |
| h = new size_t[n]; | |
| SuffixArray::run(&content[0], n, sa, h); | |
| for (size_t len = 1; len <= max_len; len++) | |
| processLen(len, true); | |
| reverse(content, content + n); | |
| SuffixArray::run(&content[0], n, sa, h); | |
| for (size_t len = 1; len <= max_len; len++) { | |
| printf("---len: %zd---\n", len); | |
| processLen(len, false); | |
| } | |
| delete[] sa; | |
| delete[] h; | |
| } | |
| void processRange(size_t len, size_t lo, size_t hi, bool forward) | |
| { | |
| if (forward) { | |
| const uint16_t* first = &content[sa[lo]]; | |
| if (! word_filter(first, len, hi - lo)) return; | |
| vector<uint16_t> word(first, first + len); | |
| freq[word] += hi - lo; | |
| if (len > 1) { | |
| vector<uint16_t> left(first, first + 1), right(first + 1, first + len); | |
| pmi[word] = log(double(freq[word]) / freq[left] / freq[right] * n) / - log(double(freq[word]) / n); | |
| if (len > 2) { | |
| left.assign(first, first + len - 1); | |
| right.assign(first + len - 1, first + len); | |
| pmi[word] = min(pmi[word], log(double(freq[word]) / freq[left] / freq[right] * n) / - log(double(freq[word]) / n)); | |
| } | |
| } | |
| entropy[word] = 0; | |
| for (size_t j, i = lo + (sa[lo] + len >= n); i < hi; i = j) { | |
| for (j = i + 1; j < hi && content[sa[i] + len] == content[sa[j] + len]; j++); | |
| if (isCJKUnifiedIdeograph(content[sa[i] + len]) || isCJKSymbol(content[sa[i] + len])) | |
| entropy[word] += - log(double(j - i) / (hi - lo)); | |
| } | |
| } else { | |
| const uint16_t* first = &content[sa[lo]]; | |
| vector<uint16_t> word(first, first + len); | |
| reverse(word.begin(), word.end()); | |
| if (! freq.count(word)) return; | |
| double en = 0; | |
| for (size_t j, i = lo + (sa[lo] + len >= n); i < hi; i = j) { | |
| for (j = i + 1; j < hi && content[sa[i] + len] == content[sa[j] + len]; j++); | |
| if (isCJKUnifiedIdeograph(content[sa[i] + len]) || isCJKSymbol(content[sa[i] + len])) | |
| en += - log(double(j - i) / (hi - lo)); | |
| } | |
| entropy[word] = min(entropy[word], en); | |
| printf("%zd: ", hi - lo); | |
| for (size_t i = 0; i < len; i++) | |
| u8w.put(first[len-1-i]); | |
| if (len > 1) | |
| printf(" pmi: %lf", pmi[word]); | |
| printf(" entropy: %lf", entropy[word]); | |
| puts(""); | |
| } | |
| } | |
| void processLen(size_t len, bool forward) { | |
| size_t i; | |
| for (i = 1; i < n; i++) | |
| if (h[i] < len) { | |
| if (len <= n - sa[i-1]) | |
| processRange(len, i - 1, i, forward); | |
| } else { | |
| size_t j = i - 1; | |
| while (++i < n && h[i] >= len); | |
| processRange(len, j, i, forward); | |
| } | |
| if (i == n && len <= n - sa[n-1]) | |
| processRange(len, n - 1, n, forward); | |
| } | |
| protected: | |
| size_t *sa, *h, n; | |
| const function<bool(const uint16_t*, size_t, size_t)>& word_filter; | |
| uint16_t* content; | |
| unordered_map<vector<uint16_t>, size_t> freq; | |
| unordered_map<vector<uint16_t>, double> pmi, entropy; | |
| UTF8Writer u8w; | |
| }; | |
| int main() | |
| { | |
| UTF8Reader utf8(stdin); | |
| vector<uint16_t> content; | |
| int32_t c, d = 0; | |
| while ((c = utf8.get()) >= 0) | |
| if (isCJKUnifiedIdeograph(c) || isCJKSymbol(c)) | |
| content.push_back(c); | |
| WordExtractor we([](const uint16_t* first, size_t len, size_t cnt) { | |
| if (cnt < 50) return false; | |
| for (size_t i = 0; i < len; i++) | |
| if (! isCJKUnifiedIdeograph(first[i])) | |
| return false; | |
| return true; | |
| }); | |
| we.extract(&content[0], content.size(), 6); | |
| if (c == -2) | |
| return 2; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
MaskRay commentedOct 6, 2012
make WordExtractor CXXFLAGS='-std=c++11'
./WordExtractor < in-question.txt | ruby analysis.rb