Skip to content

Instantly share code, notes, and snippets.

@ksurent
Created December 8, 2011 12:38
Show Gist options
  • Save ksurent/1446877 to your computer and use it in GitHub Desktop.
(Buggy) tokenizer in Qore
#!/usr/bin/env qore
%new-style
%require-types
%enable-all-warnings
namespace OpenCorpora;
class OpenCorpora::Tokenizer {
    private {
        # (character position, probability) pairs collected in bounds mode
        list bounds;
        # finished tokens collected in token mode
        list tokens;
        # reserved for memoizing vectorize() results (currently unused; see vectorize())
        hash vectors_cache;
        OpenCorpora::Vectors vectors;
        OpenCorpora::List prefixes;
        OpenCorpora::List exceptions;
        OpenCorpora::List hyphens;
    }
    public {
        const VERSION = '0.01';
    }

    # args: hash of data file paths under keys 'vectors', 'exceptions',
    # 'prefixes', 'hyphens' (all gzipped text files)
    constructor(hash args = hash()) {
        vectors    = new OpenCorpora::Vectors(args.vectors);
        exceptions = new OpenCorpora::List(args.exceptions);
        prefixes   = new OpenCorpora::List(args.prefixes);
        hyphens    = new OpenCorpora::List(args.hyphens);
    }

    # returns a list of (position, probability) pairs for candidate token boundaries
    list tokens_bounds(string text) {
        do_tokenize(text);
        return bounds;
    }

    # tokenizes text and returns the list of tokens;
    # options.threshold may override the boundary probability cutoff
    list tokens(string text, hash options = hash()) {
        options.want_tokens = True;
        # BUGFIX: apply the default only when the caller did not supply a
        # threshold (the original unconditionally overwrote it)
        if (!exists options.threshold)
            options.threshold = 0.878; # yes, i know
        do_tokenize(text, options);
        return tokens;
    }

    # core loop: walks the text one character at a time, computes a boundary
    # probability for each position, and fills either 'tokens' or 'bounds'
    private do_tokenize(string text, hash opts = hash()) {
        # BUGFIX: reset the accumulators so repeated calls on the same object
        # do not concatenate results from previously tokenized texts
        tokens = ();
        bounds = ();
        string token = '';
        for (int i = 0; i < length(text); i++) {
            # per-character context window around position i
            hash ctx = (
                'pos'  : i,
                'prev' : text[i-1],
                'next' : text[i+1],
                'nnext': text[i+2],
                'char' : text[i],
                'text' : text,
            );
            # out-of-range string lookups yield NOTHING; normalize to ''
            if (ctx.prev  == NOTHING) ctx.prev  = '';
            if (ctx.next  == NOTHING) ctx.next  = '';
            if (ctx.nnext == NOTHING) ctx.nnext = '';
            get_sequences(\ctx);
            vectorize(\ctx);
            token += ctx.char;
            *float p = vectors.probability(ctx.vector);
            # vectors unseen in the training data get a neutral probability
            if (p == NOTHING)
                p = 0.5;
            if (opts.want_tokens) {
                # flush the current token on a likely boundary or at end of text
                if (p >= opts.threshold || ctx.pos == length(ctx.text) - 1) {
                    push tokens, trim(token);
                    token = '';
                }
            }
            else {
                # bounds mode: record every position with a non-zero probability
                if (p)
                    push bounds, (ctx.pos, p);
            }
        }
    }

    # detects the "spacer" (punctuation) at the current or next position and
    # collects the surrounding character sequences into ctx.seq_left / seq /
    # seq_right (used as features by do_vectorize())
    private get_sequences(reference ctx) {
        string seq = '';
        string seq_left = '';
        string seq_right = '';
        string spacer = '';
        # the spacer is the first punctuation char found, next position first
        foreach string candidate in (list(ctx.next, ctx.char)) {
            *list found = regex_extract(candidate, '([-./?=:&"!+()])');
            if (elements found) {
                spacer = found[0];
                break;
            }
        }
        if (length(spacer)) {
            # walk left from the current position collecting the sequence;
            # hyphen spacers bind cyrillic letters, hyphens and apostrophes,
            # any other spacer binds everything up to the nearest space
            for (int i = ctx.pos; i >= 0; i--) {
                string ch = ctx.text[i];
                bool case1 = is_hyphen(spacer) && (is_cyr(ch) || is_hyphen(ch) || is_single_quote(ch));
                bool case2 = !is_hyphen(spacer) && !is_space(ch);
                if (case1 || case2)
                    seq_left = ch + seq_left;
                else
                    break;
                # drop a trailing spacer char from the collected sequence
                if (substr(seq_left, -1) === spacer)
                    seq_left = substr(seq_left, 0, -1);
            }
            # walk right from the position after the current one, same rules
            for (int i = ctx.pos + 1; i < length(ctx.text); i++) {
                string ch = ctx.text[i];
                bool case1 = is_hyphen(spacer) && (is_cyr(ch) || is_hyphen(ch) || is_single_quote(ch));
                bool case2 = !is_hyphen(spacer) && !is_space(ch);
                if (case1 || case2)
                    seq_right += ch;
                else
                    break;
                # BUGFIX: strip the trailing spacer with a negative length, as
                # on the left side above; the original substr(seq_right, 0, 1)
                # truncated the whole sequence to its first character
                if (substr(seq_right, -1) === spacer)
                    seq_right = substr(seq_right, 0, -1);
            }
            seq = join('', seq_left, seq, seq_right);
        }
        ctx.spacer = spacer;
        ctx.seq = seq;
        ctx.seq_right = seq_right;
        ctx.seq_left = seq_left;
    }

    # computes the feature vector for the current context;
    # NOTE(review): a vectors_cache memoization keyed by the context fields was
    # sketched here but is disabled — do_vectorize() is called directly
    private vectorize(reference ctx) {
        ctx.vector = do_vectorize(ctx);
    }

    # builds the binary feature string for the context and parses it as base 2;
    # elements are either 4-bit character-class strings or single 0/1 flags
    private int do_vectorize(reference ctx) {
        bool spacer = boolean(length(ctx.spacer));
        bool spacer_is_hyphen = boolean(is_hyphen(ctx.spacer));
        list bits = (
            char_class(ctx.char),
            char_class(ctx.next),
            is_digit(ctx.prev),
            is_digit(ctx.nnext),
            spacer_is_hyphen ? is_dict_seq(ctx.seq) : 0,
            spacer_is_hyphen ? is_suffix(ctx.seq_right) : 0,
            is_same_pm(ctx.char, ctx.next),
            (spacer && !spacer_is_hyphen) ? looks_like_url(ctx.seq, ctx.seq_right) : 0,
            (spacer && !spacer_is_hyphen) ? is_exception_seq(ctx.seq) : 0,
            spacer_is_hyphen ? is_prefix(ctx.seq_left) : 0,
            (is_colon(ctx.spacer) && length(ctx.seq_right)) ? looks_like_time(ctx.seq_left, ctx.seq_right) : 0,
        );
        return strtoint(join('', bits), 2);
    }

    # single-character / sequence classifier predicates, each returning 0 or 1
    private int is_pmark(string ch) { return ch =~ /^[,?!";«»]$/ ? 1 : 0; }
    private int is_latin(string ch) { return ch =~ /^[a-zA-Z]$/ ? 1 : 0; }
    private int is_cyr(string ch) { return ch =~ /^[а-яёА-ЯЁ]$/ ? 1 : 0; }
    private int is_digit(string ch) { return ch =~ /^[0-9]$/ ? 1 : 0; }
    private int is_bracket1(string ch) { return ch =~ /^[\[({<]$/ ? 1 : 0; }
    private int is_bracket2(string ch) { return ch =~ /^[\])}>]$/ ? 1 : 0; }
    private int is_suffix(string seq) { return seq =~ /^(?:то|таки|с|ка|де)$/ ? 1 : 0; }
    private int is_space(string ch) { return ch === ' ' ? 1 : 0; }
    private int is_hyphen(string ch) { return ch === '-' ? 1 : 0; }
    private int is_dot(string ch) { return ch === '.' ? 1 : 0; }
    private int is_single_quote(string ch) { return ch === "'" ? 1 : 0; }
    private int is_slash(string ch) { return ch === '/' ? 1 : 0; }
    private int is_colon(string ch) { return ch === ':' ? 1 : 0; }
    private int is_same_pm(string ch1, string ch2) { return int(ch1 === ch2); }
    private int is_prefix(string seq) { return prefixes.in_list(tolower(seq)) ? 1 : 0; }

    # 1 if seq is a known hyphenated dictionary word (e.g. "кто-то")
    private int is_dict_seq(string seq) {
        if (!length(seq) || seq[0] === '-')
            return 0;
        return hyphens.in_list(seq) ? 1 : 0;
    }

    # 1 if seq (possibly after stripping leading punctuation) is in the
    # exceptions list; NOTE(review): only *leading* non-alphanumerics are ever
    # stripped even though the guard also matches a trailing \W — confirm
    # against the original Perl implementation
    private int is_exception_seq(string seq) {
        if (exceptions.in_list(seq))
            return 1;
        if (seq !~ /^\W|\W$/)
            return 0;
        string pattern = '^[^A-Za-zА-ЯЁа-яё0-9]+';
        seq = regex_subst(seq, pattern, '');
        if (exceptions.in_list(seq))
            return 1;
        while (regex(seq, pattern)) {
            seq = regex_subst(seq, pattern, '');
            if (exceptions.in_list(seq))
                return 1;
        }
        return 0;
    }

    # heuristic URL detector for sequences containing a non-hyphen spacer
    private int looks_like_url(string seq, string seq_right) {
        if (!length(seq_right))
            return 0;
        if (length(seq) < 5)
            return 0;
        if (seq[0] === '.')
            return 0;
        if (
            seq =~ /^\W*https?:\/\//
            || seq =~ /^\W*www\./
            || seq =~ /.\.(?:[a-z]{2,3}|р[уф])\W*$/i
        )
            return 1;
        return 0;
    }

    # 1 if left:right looks like an HH:MM time-of-day
    private int looks_like_time(string seq_left, string seq_right) {
        if (seq_left !~ /^[0-9]{1,2}$/ || seq_right !~ /^[0-9]{2}$/)
            return 0;
        return (int(seq_left) < 24 && int(seq_right) < 60) ? 1 : 0;
    }

    # maps a character to a 4-bit class code used in the feature vector
    private string char_class(string ch) {
        return is_cyr(ch) ? '0001' :
            is_space(ch) ? '0010' :
            is_dot(ch) ? '0011' :
            is_pmark(ch) ? '0100' :
            is_hyphen(ch) ? '0101' :
            is_digit(ch) ? '0110' :
            is_latin(ch) ? '0111' :
            is_bracket1(ch) ? '1000' :
            is_bracket2(ch) ? '1001' :
            is_single_quote(ch) ? '1010' :
            is_slash(ch) ? '1011' :
            is_colon(ch) ? '1100' : '0000';
    }
}
# a simple word list loaded from a gzipped file, one entry per line
class OpenCorpora::List {
    private {
        list list;
    }

    constructor(string fn) {
        load(fn);
    }

    # reads and gunzips the whole file, splitting it into lines
    private load(string fn) {
        File fh = new File();
        try {
            fh.open2(fn);
        }
        catch(e) {
            print(e.desc);
            exit(1);
        }
        # stat()[7] is the file size in bytes
        string raw = gunzip_to_string(fh.readBinary(fh.stat()[7]));
        list = split("\n", raw);
        fh.close();
    }

    # returns True if str is an exact (===) member of the list
    bool in_list(string str) {
        # use the builtin hard-comparison membership test instead of building
        # a filtered copy with select on every call
        return inlist_hard(str, list);
    }
}
# the trained probability table: feature vector -> boundary probability,
# loaded from a gzipped file of "vector probability" lines
class OpenCorpora::Vectors {
    private {
        hash vectors;
    }

    constructor(string fn) {
        load(fn);
    }

    # reads and gunzips the whole file, parsing one "key value" pair per line
    private load(string fn) {
        File fh = new File();
        try {
            fh.open2(fn);
        }
        catch(e) {
            print(e.desc);
            exit(1);
        }
        # stat()[7] is the file size in bytes
        string raw = gunzip_to_string(fh.readBinary(fh.stat()[7]));
        foreach string row in (split("\n", raw)) {
            # BUGFIX: a newline-terminated file yields an empty final row;
            # skip blank rows instead of hashing a NOTHING key
            if (!length(row))
                continue;
            list vp = split(' ', row);
            vectors{vp[0]} = float(vp[1]);
        }
        fh.close();
    }

    # returns the probability for the vector, or NOTHING if it was never seen
    *float probability(int vector) {
        return vectors{vector};
    }
}
# evaluation driver: tokenizes every corpus sentence and compares against the
# gold tokenization stored in the database
string path = '/home/ksurent/Lingua--RU--OpenCorpora--Tokenizer/blib/lib/auto/share/dist/Lingua-RU-OpenCorpora-Tokenizer';
hash files = (
    'vectors'   : path + '/vectors.gz',
    'prefixes'  : path + '/prefixes.gz',
    'hyphens'   : path + '/hyphens.gz',
    'exceptions': path + '/exceptions.gz',
);
Tokenizer tok = new Tokenizer(files);
#printf("%N\n", tok.tokens("Он хотел было уйти, но не тут-то было: дверь за его спиной уже закрылась."));

# gold tokens are concatenated per sentence with this separator in SQL below
string separator = 'º';
Datasource dbh = new Datasource(SQL::DSMySQL, 'corpora', 'corpora', 'corpora', 'utf8', '127.0.0.1', 3306);
list sentences = dbh.selectRows(sprintf("
select
source,
group_concat(tf_text order by text_forms.pos separator '%s') as separated
from
sentences
join
text_forms
using
(sent_id)
group by
source
", separator));
int correct = 0;
int total = 0;
foreach hash sentence in (sentences) {
    list tokens = tok.tokens(sentence.source);
    total++;
    # a sentence counts as correct only if the tokenization matches exactly
    if (join(separator, tokens) === sentence.separated)
        correct++;
}
# BUGFIX: correct / total is integer division in Qore, which truncates the
# percentage to 0; promote to float, and guard against an empty result set
float pct = total ? float(correct) / total * 100 : 0.0;
printf("%d/%d %.2f%%\n", correct, total, pct);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment