Skip to content

Instantly share code, notes, and snippets.

@MaskRay
Created October 6, 2012 09:37
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save MaskRay/3844497 to your computer and use it in GitHub Desktop.
Save MaskRay/3844497 to your computer and use it in GitHub Desktop.
自然語言處理之词语抽取 http://maskray.me/posts/2012-10-06-word-extractor.html
words = []
while l = gets
len = $1 if l =~ /^---len: (\d+)/
next if len == 1
if l =~ /^(\d+): (\S+) pmi: ([\d.]+) entropy: ([\w.]+)/
cnt = $1.to_i
pmi = $3.to_f
entropy = $4.to_i
if entropy > 80 and pmi > 0.3
words << [cnt, $2, pmi, entropy]
end
end
end
words.sort_by! {|x| x[3] * x[2] }.reverse!.take(50).each do |a|
puts a[1]
end
#include <algorithm>
#include <cmath>
#include <functional>
#include <numeric>
#include <unordered_map>
#include <vector>
#include <stdint.h>
#include <stdio.h>
using namespace std;
bool isCJKUnifiedIdeograph(int32_t c)
{
return 0x4E00 <= c && c < 0xA000;
}
bool isCJKSymbol(int32_t c)
{
return 0x3000 <= c && c < 0x3040;
}
class UTF8Reader
{
public:
UTF8Reader(FILE* in) : in(in) {}
int32_t get() {
int s, c, t;
if ((s = fgetc(in)) < 0x80) return s;
c = __builtin_clz(~ s & 0xFF) - (sizeof(unsigned) * 8 - 8);
if (! (2 <= c && c <= 6)) return -2;
s &= (1 << 7 - c) - 1;
while (--c) {
t = fgetc(in);
s = (s << 6) + (t & 0x3F);
}
return s;
}
protected:
FILE* in;
};
class UTF8Writer
{
public:
UTF8Writer(FILE* out) : out(out) {}
int put(int32_t c) {
if (c < 0x80)
fputc(c, out);
else {
int d;
if (c < 0x800) {
fputc(0xC0 | c >> 6 & 31, out);
d = 1;
} else if (c < 0x10000) {
fputc(0xE0 | c >> 12 & 15, out);
d = 2;
} else if (c < 0x200000) {
fputc(0xF0 | c >> 18 & 7, out);
d = 3;
} else if (c < 0x4000000) {
fputc(0xF8 | c >> 24 & 3, out);
d = 4;
} else if (c < 0x80000000) {
fputc(0xFC | c >> 30, out);
d = 5;
} else
return -2;
while (d--)
fputc(0x80 | c >> d*6 & 63, out);
}
return 0;
}
protected:
FILE* out;
};
namespace std
{
template<>
class hash<vector<uint16_t>> {
public:
size_t operator()(const vector<uint16_t> &s) const
{
size_t h = 0;
for (auto c : s)
h = h * 17 + c;
return h;
}
};
}
namespace SuffixArray
{
struct Type
{
size_t id, key[2];
bool operator<(const Type &rhs) const {
return key[0] < rhs.key[0] || key[0] == rhs.key[0] && key[1] < rhs.key[1];
}
};
template<int k>
void countingSort(const Type a[], size_t n, Type b[], size_t h[])
{
fill_n(h, n, 0);
for (int i = 0; i < n; ++i)
h[a[i].key[k]]++;
for (int i = 1; i < n; ++i)
h[i] += h[i-1];
for (int i = n; --i >= 0; )
b[--h[a[i].key[k]]] = a[i];
}
template<typename T>
void run(T a[], size_t n, size_t output[], size_t h[])
{
Type *sa = new Type[n], *nsa = new Type[n];
size_t *r = new size_t[n];
for (size_t i = 0; i < n; ++i)
sa[i].id = i, sa[i].key[0] = size_t(a[i]), sa[i].key[1] = 0;
sort(sa, sa + n);
for (size_t k = 1; ; k <<= 1) {
r[sa[0].id] = 0;
for (size_t i = 1; i < n; ++i)
r[sa[i].id] = r[sa[i-1].id] + (sa[i-1] < sa[i]);
if (k >= n || r[sa[n-1].id] == n-1) break;
for (size_t i = 0; i < n; ++i) {
sa[i].id = i;
sa[i].key[0] = r[i];
sa[i].key[1] = i + k < n ? r[i + k] + 1 : 0;
}
countingSort<1>(sa, n, nsa, h);
countingSort<0>(nsa, n, sa, h);
}
for (size_t i = 0; i < n; i++)
output[i] = sa[i].id;
for (size_t k = 0, i = 0; i < n; ++i)
if (r[i]) {
for (size_t j = sa[r[i]-1].id; a[j+k] == a[i+k]; ++k);
h[r[i]] = k;
if (k) --k;
}
delete[] sa;
delete[] nsa;
delete[] r;
}
};
class WordExtractor
{
public:
WordExtractor(const function<bool(const uint16_t*, size_t, size_t)>& word_filter) : word_filter(word_filter), u8w(stdout) {}
void extract(uint16_t* content, size_t n, size_t max_len)
{
freq.clear();
pmi.clear();
entropy.clear();
this->content = content;
this->n = n;
sa = new size_t[n];
h = new size_t[n];
SuffixArray::run(&content[0], n, sa, h);
for (size_t len = 1; len <= max_len; len++)
processLen(len, true);
reverse(content, content + n);
SuffixArray::run(&content[0], n, sa, h);
for (size_t len = 1; len <= max_len; len++) {
printf("---len: %zd---\n", len);
processLen(len, false);
}
delete[] sa;
delete[] h;
}
void processRange(size_t len, size_t lo, size_t hi, bool forward)
{
if (forward) {
const uint16_t* first = &content[sa[lo]];
if (! word_filter(first, len, hi - lo)) return;
vector<uint16_t> word(first, first + len);
freq[word] += hi - lo;
if (len > 1) {
vector<uint16_t> left(first, first + 1), right(first + 1, first + len);
pmi[word] = log(double(freq[word]) / freq[left] / freq[right] * n) / - log(double(freq[word]) / n);
if (len > 2) {
left.assign(first, first + len - 1);
right.assign(first + len - 1, first + len);
pmi[word] = min(pmi[word], log(double(freq[word]) / freq[left] / freq[right] * n) / - log(double(freq[word]) / n));
}
}
entropy[word] = 0;
for (size_t j, i = lo + (sa[lo] + len >= n); i < hi; i = j) {
for (j = i + 1; j < hi && content[sa[i] + len] == content[sa[j] + len]; j++);
if (isCJKUnifiedIdeograph(content[sa[i] + len]) || isCJKSymbol(content[sa[i] + len]))
entropy[word] += - log(double(j - i) / (hi - lo));
}
} else {
const uint16_t* first = &content[sa[lo]];
vector<uint16_t> word(first, first + len);
reverse(word.begin(), word.end());
if (! freq.count(word)) return;
double en = 0;
for (size_t j, i = lo + (sa[lo] + len >= n); i < hi; i = j) {
for (j = i + 1; j < hi && content[sa[i] + len] == content[sa[j] + len]; j++);
if (isCJKUnifiedIdeograph(content[sa[i] + len]) || isCJKSymbol(content[sa[i] + len]))
en += - log(double(j - i) / (hi - lo));
}
entropy[word] = min(entropy[word], en);
printf("%zd: ", hi - lo);
for (size_t i = 0; i < len; i++)
u8w.put(first[len-1-i]);
if (len > 1)
printf(" pmi: %lf", pmi[word]);
printf(" entropy: %lf", entropy[word]);
puts("");
}
}
void processLen(size_t len, bool forward) {
size_t i;
for (i = 1; i < n; i++)
if (h[i] < len) {
if (len <= n - sa[i-1])
processRange(len, i - 1, i, forward);
} else {
size_t j = i - 1;
while (++i < n && h[i] >= len);
processRange(len, j, i, forward);
}
if (i == n && len <= n - sa[n-1])
processRange(len, n - 1, n, forward);
}
protected:
size_t *sa, *h, n;
const function<bool(const uint16_t*, size_t, size_t)>& word_filter;
uint16_t* content;
unordered_map<vector<uint16_t>, size_t> freq;
unordered_map<vector<uint16_t>, double> pmi, entropy;
UTF8Writer u8w;
};
int main()
{
UTF8Reader utf8(stdin);
vector<uint16_t> content;
int32_t c, d = 0;
while ((c = utf8.get()) >= 0)
if (isCJKUnifiedIdeograph(c) || isCJKSymbol(c))
content.push_back(c);
WordExtractor we([](const uint16_t* first, size_t len, size_t cnt) {
if (cnt < 50) return false;
for (size_t i = 0; i < len; i++)
if (! isCJKUnifiedIdeograph(first[i]))
return false;
return true;
});
we.extract(&content[0], content.size(), 6);
if (c == -2)
return 2;
}
@MaskRay
Copy link
Author

MaskRay commented Oct 6, 2012

make WordExtractor CXXFLAGS='-std=c++11'
./WordExtractor < in-question.txt | ruby analysis.rb

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment