MaskRay/WordExtractor.cc

## analysis.rb
words = []
while l = gets
  len = $1 if l =~ /^---len: (\d+)/
  next if len == 1
  if l =~ /^(\d+): (\S+) pmi: ([\d.]+) entropy: ([\w.]+)/
    cnt = $1.to_i
    pmi = $3.to_f
    entropy = $4.to_i
    if entropy > 80 and pmi > 0.3
      words << [cnt, $2, pmi, entropy]
    end
  end
end

words.sort_by! {|x| x[3] * x[2] }.reverse!.take(50).each do |a|
  puts a[1]
end

## WordExtractor.cc
#include <algorithm>
#include <cmath>
#include <functional>
#include <numeric>
#include <unordered_map>
#include <vector>
#include <stdint.h>
#include <stdio.h>
using namespace std;

bool isCJKUnifiedIdeograph(int32_t c)
{
  return 0x4E00 <= c && c < 0xA000;
}

bool isCJKSymbol(int32_t c)
{
  return 0x3000 <= c && c < 0x3040;
}

class UTF8Reader
{
public:
  UTF8Reader(FILE* in) : in(in) {}
  int32_t get() {
    int s, c, t;
    if ((s = fgetc(in)) < 0x80) return s;
    c = __builtin_clz(~ s & 0xFF) - (sizeof(unsigned) * 8 - 8);
    if (! (2 <= c && c <= 6)) return -2;
    s &= (1 << 7 - c) - 1;
    while (--c) {
      t = fgetc(in);
      s = (s << 6) + (t & 0x3F);
    }
    return s;
  }
protected:
  FILE* in;
};

class UTF8Writer
{
public:
  UTF8Writer(FILE* out) : out(out) {}
  int put(int32_t c) {
    if (c < 0x80)
      fputc(c, out);
    else {
      int d;
      if (c < 0x800) {
        fputc(0xC0 | c >> 6 & 31, out);
        d = 1;
      } else if (c < 0x10000) {
        fputc(0xE0 | c >> 12 & 15, out);
        d = 2;
      } else if (c < 0x200000) {
        fputc(0xF0 | c >> 18 & 7, out);
        d = 3;
      } else if (c < 0x4000000) {
        fputc(0xF8 | c >> 24 & 3, out);
        d = 4;
      } else if (c < 0x80000000) {
        fputc(0xFC | c >> 30, out);
        d = 5;
      } else
        return -2;
      while (d--)
        fputc(0x80 | c >> d*6 & 63, out);
    }
    return 0;
  }
protected:
  FILE* out;
};

namespace std
{
  template<>
    class hash<vector<uint16_t>> {
    public:
      size_t operator()(const vector<uint16_t> &s) const
      {
        size_t h = 0;
        for (auto c : s)
          h = h * 17 + c;
        return h;
      }
    };
}

namespace SuffixArray
{
  struct Type
  {
    size_t id, key[2];
    bool operator<(const Type &rhs) const {
      return key[0] < rhs.key[0] || key[0] == rhs.key[0] && key[1] < rhs.key[1];
    }
  };

  template<int k>
    void countingSort(const Type a[], size_t n, Type b[], size_t h[])
    {
      fill_n(h, n, 0);
      for (int i = 0; i < n; ++i)
        h[a[i].key[k]]++;
      for (int i = 1; i < n; ++i)
        h[i] += h[i-1];
      for (int i = n; --i >= 0; )
        b[--h[a[i].key[k]]] = a[i];
    }

  template<typename T>
    void run(T a[], size_t n, size_t output[], size_t h[])
    {
      Type *sa = new Type[n], *nsa = new Type[n];
      size_t *r = new size_t[n];
      for (size_t i = 0; i < n; ++i)
        sa[i].id = i, sa[i].key[0] = size_t(a[i]), sa[i].key[1] = 0;
      sort(sa, sa + n);
      for (size_t k = 1; ; k <<= 1) {
        r[sa[0].id] = 0;
        for (size_t i = 1; i < n; ++i)
          r[sa[i].id] = r[sa[i-1].id] + (sa[i-1] < sa[i]);
        if (k >= n || r[sa[n-1].id] == n-1) break;
        for (size_t i = 0; i < n; ++i) {
          sa[i].id = i;
          sa[i].key[0] = r[i];
          sa[i].key[1] = i + k < n ? r[i + k] + 1 : 0;
        }
        countingSort<1>(sa, n, nsa, h);
        countingSort<0>(nsa, n, sa, h);
      }
      for (size_t i = 0; i < n; i++)
        output[i] = sa[i].id;
      for (size_t k = 0, i = 0; i < n; ++i)
        if (r[i]) {
          for (size_t j = sa[r[i]-1].id; a[j+k] == a[i+k]; ++k);
          h[r[i]] = k;
          if (k) --k;
        }
      delete[] sa;
      delete[] nsa;
      delete[] r;
    }
};

class WordExtractor
{
public:
  WordExtractor(const function<bool(const uint16_t*, size_t, size_t)>& word_filter) : word_filter(word_filter), u8w(stdout) {}

  void extract(uint16_t* content, size_t n, size_t max_len)
  {
    freq.clear();
    pmi.clear();
    entropy.clear();

    this->content = content;
    this->n = n;
    sa = new size_t[n];
    h = new size_t[n];
    SuffixArray::run(&content[0], n, sa, h);

    for (size_t len = 1; len <= max_len; len++)
      processLen(len, true);

    reverse(content, content + n);
    SuffixArray::run(&content[0], n, sa, h);
    for (size_t len = 1; len <= max_len; len++) {
      printf("---len: %zd---\n", len);
      processLen(len, false);
    }

    delete[] sa;
    delete[] h;
  }

  void processRange(size_t len, size_t lo, size_t hi, bool forward)
  {
    if (forward) {
      const uint16_t* first = &content[sa[lo]];
      if (! word_filter(first, len, hi - lo)) return;
      vector<uint16_t> word(first, first + len);
      freq[word] += hi - lo;

      if (len > 1) {
        vector<uint16_t> left(first, first + 1), right(first + 1, first + len);
        pmi[word] = log(double(freq[word]) / freq[left] / freq[right] * n) / - log(double(freq[word]) / n);
        if (len > 2) {
          left.assign(first, first + len - 1);
          right.assign(first + len - 1, first + len);
          pmi[word] = min(pmi[word], log(double(freq[word]) / freq[left] / freq[right] * n) / - log(double(freq[word]) / n));
        }
      }

      entropy[word] = 0;
      for (size_t j, i = lo + (sa[lo] + len >= n); i < hi; i = j) {
        for (j = i + 1; j < hi && content[sa[i] + len] == content[sa[j] + len]; j++);
        if (isCJKUnifiedIdeograph(content[sa[i] + len]) || isCJKSymbol(content[sa[i] + len]))
          entropy[word] += - log(double(j - i) / (hi - lo));
      }
    } else {
      const uint16_t* first = &content[sa[lo]];
      vector<uint16_t> word(first, first + len);
      reverse(word.begin(), word.end());
      if (! freq.count(word)) return;

      double en = 0;
      for (size_t j, i = lo + (sa[lo] + len >= n); i < hi; i = j) {
        for (j = i + 1; j < hi && content[sa[i] + len] == content[sa[j] + len]; j++);
        if (isCJKUnifiedIdeograph(content[sa[i] + len]) || isCJKSymbol(content[sa[i] + len]))
          en += - log(double(j - i) / (hi - lo));
      }
      entropy[word] = min(entropy[word], en);

      printf("%zd: ", hi - lo);
      for (size_t i = 0; i < len; i++)
        u8w.put(first[len-1-i]);
      if (len > 1)
        printf(" pmi: %lf", pmi[word]);
      printf(" entropy: %lf", entropy[word]);
      puts("");
    }
  }

  void processLen(size_t len, bool forward) {
    size_t i;
    for (i = 1; i < n; i++)
      if (h[i] < len) {
        if (len <= n - sa[i-1])
          processRange(len, i - 1, i, forward);
      } else {
        size_t j = i - 1;
        while (++i < n && h[i] >= len);
        processRange(len, j, i, forward);
      }
    if (i == n && len <= n - sa[n-1])
      processRange(len, n - 1, n, forward);
  }
protected:
  size_t *sa, *h, n;
  const function<bool(const uint16_t*, size_t, size_t)>& word_filter;
  uint16_t* content;
  unordered_map<vector<uint16_t>, size_t> freq;
  unordered_map<vector<uint16_t>, double> pmi, entropy;
  UTF8Writer u8w;
};

int main()
{
  UTF8Reader utf8(stdin);
  vector<uint16_t> content;
  int32_t c, d = 0;
  while ((c = utf8.get()) >= 0)
    if (isCJKUnifiedIdeograph(c) || isCJKSymbol(c))
      content.push_back(c);

  WordExtractor we([](const uint16_t* first, size_t len, size_t cnt) {
      if (cnt < 50) return false;
      for (size_t i = 0; i < len; i++)
        if (! isCJKUnifiedIdeograph(first[i]))
          return false;
      return true;
      });

  we.extract(&content[0], content.size(), 6);

  if (c == -2)
    return 2;
}
	words = []
	while l = gets
	len = $1 if l =~ /^---len: (\d+)/
	next if len == 1
	if l =~ /^(\d+): (\S+) pmi: ([\d.]+) entropy: ([\w.]+)/
	cnt = $1.to_i
	pmi = $3.to_f
	entropy = $4.to_i
	if entropy > 80 and pmi > 0.3
	words << [cnt, $2, pmi, entropy]
	end
	end
	end

	words.sort_by! {\|x\| x[3] * x[2] }.reverse!.take(50).each do \|a\|
	puts a[1]
	end
	#include <algorithm>
	#include <cmath>
	#include <functional>
	#include <numeric>
	#include <unordered_map>
	#include <vector>
	#include <stdint.h>
	#include <stdio.h>
	using namespace std;

	bool isCJKUnifiedIdeograph(int32_t c)
	{
	return 0x4E00 <= c && c < 0xA000;
	}

	bool isCJKSymbol(int32_t c)
	{
	return 0x3000 <= c && c < 0x3040;
	}

	class UTF8Reader
	{
	public:
	UTF8Reader(FILE* in) : in(in) {}
	int32_t get() {
	int s, c, t;
	if ((s = fgetc(in)) < 0x80) return s;
	c = __builtin_clz(~ s & 0xFF) - (sizeof(unsigned) * 8 - 8);
	if (! (2 <= c && c <= 6)) return -2;
	s &= (1 << 7 - c) - 1;
	while (--c) {
	t = fgetc(in);
	s = (s << 6) + (t & 0x3F);
	}
	return s;
	}
	protected:
	FILE* in;
	};

	class UTF8Writer
	{
	public:
	UTF8Writer(FILE* out) : out(out) {}
	int put(int32_t c) {
	if (c < 0x80)
	fputc(c, out);
	else {
	int d;
	if (c < 0x800) {
	fputc(0xC0 \| c >> 6 & 31, out);
	d = 1;
	} else if (c < 0x10000) {
	fputc(0xE0 \| c >> 12 & 15, out);
	d = 2;
	} else if (c < 0x200000) {
	fputc(0xF0 \| c >> 18 & 7, out);
	d = 3;
	} else if (c < 0x4000000) {
	fputc(0xF8 \| c >> 24 & 3, out);
	d = 4;
	} else if (c < 0x80000000) {
	fputc(0xFC \| c >> 30, out);
	d = 5;
	} else
	return -2;
	while (d--)
	fputc(0x80 \| c >> d*6 & 63, out);
	}
	return 0;
	}
	protected:
	FILE* out;
	};

	namespace std
	{
	template<>
	class hash<vector<uint16_t>> {
	public:
	size_t operator()(const vector<uint16_t> &s) const
	{
	size_t h = 0;
	for (auto c : s)
	h = h * 17 + c;
	return h;
	}
	};
	}

	namespace SuffixArray
	{
	struct Type
	{
	size_t id, key[2];
	bool operator<(const Type &rhs) const {
	return key[0] < rhs.key[0] \|\| key[0] == rhs.key[0] && key[1] < rhs.key[1];
	}
	};

	template<int k>
	void countingSort(const Type a[], size_t n, Type b[], size_t h[])
	{
	fill_n(h, n, 0);
	for (int i = 0; i < n; ++i)
	h[a[i].key[k]]++;
	for (int i = 1; i < n; ++i)
	h[i] += h[i-1];
	for (int i = n; --i >= 0; )
	b[--h[a[i].key[k]]] = a[i];
	}

	template<typename T>
	void run(T a[], size_t n, size_t output[], size_t h[])
	{
	Type sa = new Type[n], nsa = new Type[n];
	size_t *r = new size_t[n];
	for (size_t i = 0; i < n; ++i)
	sa[i].id = i, sa[i].key[0] = size_t(a[i]), sa[i].key[1] = 0;
	sort(sa, sa + n);
	for (size_t k = 1; ; k <<= 1) {
	r[sa[0].id] = 0;
	for (size_t i = 1; i < n; ++i)
	r[sa[i].id] = r[sa[i-1].id] + (sa[i-1] < sa[i]);
	if (k >= n \|\| r[sa[n-1].id] == n-1) break;
	for (size_t i = 0; i < n; ++i) {
	sa[i].id = i;
	sa[i].key[0] = r[i];
	sa[i].key[1] = i + k < n ? r[i + k] + 1 : 0;
	}
	countingSort<1>(sa, n, nsa, h);
	countingSort<0>(nsa, n, sa, h);
	}
	for (size_t i = 0; i < n; i++)
	output[i] = sa[i].id;
	for (size_t k = 0, i = 0; i < n; ++i)
	if (r[i]) {
	for (size_t j = sa[r[i]-1].id; a[j+k] == a[i+k]; ++k);
	h[r[i]] = k;
	if (k) --k;
	}
	delete[] sa;
	delete[] nsa;
	delete[] r;
	}
	};

	class WordExtractor
	{
	public:
	WordExtractor(const function<bool(const uint16_t*, size_t, size_t)>& word_filter) : word_filter(word_filter), u8w(stdout) {}

	void extract(uint16_t* content, size_t n, size_t max_len)
	{
	freq.clear();
	pmi.clear();
	entropy.clear();

	this->content = content;
	this->n = n;
	sa = new size_t[n];
	h = new size_t[n];
	SuffixArray::run(&content[0], n, sa, h);

	for (size_t len = 1; len <= max_len; len++)
	processLen(len, true);

	reverse(content, content + n);
	SuffixArray::run(&content[0], n, sa, h);
	for (size_t len = 1; len <= max_len; len++) {
	printf("---len: %zd---\n", len);
	processLen(len, false);
	}

	delete[] sa;
	delete[] h;
	}

	void processRange(size_t len, size_t lo, size_t hi, bool forward)
	{
	if (forward) {
	const uint16_t* first = &content[sa[lo]];
	if (! word_filter(first, len, hi - lo)) return;
	vector<uint16_t> word(first, first + len);
	freq[word] += hi - lo;

	if (len > 1) {
	vector<uint16_t> left(first, first + 1), right(first + 1, first + len);
	pmi[word] = log(double(freq[word]) / freq[left] / freq[right] * n) / - log(double(freq[word]) / n);
	if (len > 2) {
	left.assign(first, first + len - 1);
	right.assign(first + len - 1, first + len);
	pmi[word] = min(pmi[word], log(double(freq[word]) / freq[left] / freq[right] * n) / - log(double(freq[word]) / n));
	}
	}

	entropy[word] = 0;
	for (size_t j, i = lo + (sa[lo] + len >= n); i < hi; i = j) {
	for (j = i + 1; j < hi && content[sa[i] + len] == content[sa[j] + len]; j++);
	if (isCJKUnifiedIdeograph(content[sa[i] + len]) \|\| isCJKSymbol(content[sa[i] + len]))
	entropy[word] += - log(double(j - i) / (hi - lo));
	}
	} else {
	const uint16_t* first = &content[sa[lo]];
	vector<uint16_t> word(first, first + len);
	reverse(word.begin(), word.end());
	if (! freq.count(word)) return;

	double en = 0;
	for (size_t j, i = lo + (sa[lo] + len >= n); i < hi; i = j) {
	for (j = i + 1; j < hi && content[sa[i] + len] == content[sa[j] + len]; j++);
	if (isCJKUnifiedIdeograph(content[sa[i] + len]) \|\| isCJKSymbol(content[sa[i] + len]))
	en += - log(double(j - i) / (hi - lo));
	}
	entropy[word] = min(entropy[word], en);

	printf("%zd: ", hi - lo);
	for (size_t i = 0; i < len; i++)
	u8w.put(first[len-1-i]);
	if (len > 1)
	printf(" pmi: %lf", pmi[word]);
	printf(" entropy: %lf", entropy[word]);
	puts("");
	}
	}

	void processLen(size_t len, bool forward) {
	size_t i;
	for (i = 1; i < n; i++)
	if (h[i] < len) {
	if (len <= n - sa[i-1])
	processRange(len, i - 1, i, forward);
	} else {
	size_t j = i - 1;
	while (++i < n && h[i] >= len);
	processRange(len, j, i, forward);
	}
	if (i == n && len <= n - sa[n-1])
	processRange(len, n - 1, n, forward);
	}
	protected:
	size_t sa, h, n;
	const function<bool(const uint16_t*, size_t, size_t)>& word_filter;
	uint16_t* content;
	unordered_map<vector<uint16_t>, size_t> freq;
	unordered_map<vector<uint16_t>, double> pmi, entropy;
	UTF8Writer u8w;
	};

	int main()
	{
	UTF8Reader utf8(stdin);
	vector<uint16_t> content;
	int32_t c, d = 0;
	while ((c = utf8.get()) >= 0)
	if (isCJKUnifiedIdeograph(c) \|\| isCJKSymbol(c))
	content.push_back(c);

	WordExtractor we([](const uint16_t* first, size_t len, size_t cnt) {
	if (cnt < 50) return false;
	for (size_t i = 0; i < len; i++)
	if (! isCJKUnifiedIdeograph(first[i]))
	return false;
	return true;
	});

	we.extract(&content[0], content.size(), 6);

	if (c == -2)
	return 2;
	}