Skip to content

Instantly share code, notes, and snippets.

@ksurent
Created December 8, 2011 12:38
Show Gist options
  • Save ksurent/1446877 to your computer and use it in GitHub Desktop.
(Buggy) tokenizer in Qore
#!/usr/bin/env qore
%new-style
%require-types
%enable-all-warnings
namespace OpenCorpora;
class OpenCorpora::Tokenizer {
    private {
        # (character position, probability) pairs collected in bounds mode
        list bounds;
        # finished tokens collected in token mode
        list tokens;
        # reserved for memoizing vectorize() results (currently unused; see vectorize())
        hash vectors_cache;
        OpenCorpora::Vectors vectors;
        OpenCorpora::List prefixes;
        OpenCorpora::List exceptions;
        OpenCorpora::List hyphens;
    }
    public {
        const VERSION = '0.01';
    }

    # args: hash of data file paths under keys 'vectors', 'exceptions',
    # 'prefixes', 'hyphens' (all gzipped text files)
    constructor(hash args = hash()) {
        vectors    = new OpenCorpora::Vectors(args.vectors);
        exceptions = new OpenCorpora::List(args.exceptions);
        prefixes   = new OpenCorpora::List(args.prefixes);
        hyphens    = new OpenCorpora::List(args.hyphens);
    }

    # returns a list of (position, probability) pairs for candidate token boundaries
    list tokens_bounds(string text) {
        do_tokenize(text);
        return bounds;
    }

    # tokenizes text and returns the list of tokens;
    # options.threshold may override the boundary probability cutoff
    list tokens(string text, hash options = hash()) {
        options.want_tokens = True;
        # BUGFIX: apply the default only when the caller did not supply a
        # threshold (the original unconditionally overwrote it)
        if (!exists options.threshold)
            options.threshold = 0.878; # yes, i know
        do_tokenize(text, options);
        return tokens;
    }

    # core loop: walks the text one character at a time, computes a boundary
    # probability for each position, and fills either 'tokens' or 'bounds'
    private do_tokenize(string text, hash opts = hash()) {
        # BUGFIX: reset the accumulators so repeated calls on the same object
        # do not concatenate results from previously tokenized texts
        tokens = ();
        bounds = ();
        string token = '';
        for (int i = 0; i < length(text); i++) {
            # per-character context window around position i
            hash ctx = (
                'pos'  : i,
                'prev' : text[i-1],
                'next' : text[i+1],
                'nnext': text[i+2],
                'char' : text[i],
                'text' : text,
            );
            # out-of-range string lookups yield NOTHING; normalize to ''
            if (ctx.prev  == NOTHING) ctx.prev  = '';
            if (ctx.next  == NOTHING) ctx.next  = '';
            if (ctx.nnext == NOTHING) ctx.nnext = '';
            get_sequences(\ctx);
            vectorize(\ctx);
            token += ctx.char;
            *float p = vectors.probability(ctx.vector);
            # vectors unseen in the training data get a neutral probability
            if (p == NOTHING)
                p = 0.5;
            if (opts.want_tokens) {
                # flush the current token on a likely boundary or at end of text
                if (p >= opts.threshold || ctx.pos == length(ctx.text) - 1) {
                    push tokens, trim(token);
                    token = '';
                }
            }
            else {
                # bounds mode: record every position with a non-zero probability
                if (p)
                    push bounds, (ctx.pos, p);
            }
        }
    }

    # detects the "spacer" (punctuation) at the current or next position and
    # collects the surrounding character sequences into ctx.seq_left / seq /
    # seq_right (used as features by do_vectorize())
    private get_sequences(reference ctx) {
        string seq = '';
        string seq_left = '';
        string seq_right = '';
        string spacer = '';
        # the spacer is the first punctuation char found, next position first
        foreach string candidate in (list(ctx.next, ctx.char)) {
            *list found = regex_extract(candidate, '([-./?=:&"!+()])');
            if (elements found) {
                spacer = found[0];
                break;
            }
        }
        if (length(spacer)) {
            # walk left from the current position collecting the sequence;
            # hyphen spacers bind cyrillic letters, hyphens and apostrophes,
            # any other spacer binds everything up to the nearest space
            for (int i = ctx.pos; i >= 0; i--) {
                string ch = ctx.text[i];
                bool case1 = is_hyphen(spacer) && (is_cyr(ch) || is_hyphen(ch) || is_single_quote(ch));
                bool case2 = !is_hyphen(spacer) && !is_space(ch);
                if (case1 || case2)
                    seq_left = ch + seq_left;
                else
                    break;
                # drop a trailing spacer char from the collected sequence
                if (substr(seq_left, -1) === spacer)
                    seq_left = substr(seq_left, 0, -1);
            }
            # walk right from the position after the current one, same rules
            for (int i = ctx.pos + 1; i < length(ctx.text); i++) {
                string ch = ctx.text[i];
                bool case1 = is_hyphen(spacer) && (is_cyr(ch) || is_hyphen(ch) || is_single_quote(ch));
                bool case2 = !is_hyphen(spacer) && !is_space(ch);
                if (case1 || case2)
                    seq_right += ch;
                else
                    break;
                # BUGFIX: strip the trailing spacer with a negative length, as
                # on the left side above; the original substr(seq_right, 0, 1)
                # truncated the whole sequence to its first character
                if (substr(seq_right, -1) === spacer)
                    seq_right = substr(seq_right, 0, -1);
            }
            seq = join('', seq_left, seq, seq_right);
        }
        ctx.spacer = spacer;
        ctx.seq = seq;
        ctx.seq_right = seq_right;
        ctx.seq_left = seq_left;
    }

    # computes the feature vector for the current context;
    # NOTE(review): a vectors_cache memoization keyed by the context fields was
    # sketched here but is disabled — do_vectorize() is called directly
    private vectorize(reference ctx) {
        ctx.vector = do_vectorize(ctx);
    }

    # builds the binary feature string for the context and parses it as base 2;
    # elements are either 4-bit character-class strings or single 0/1 flags
    private int do_vectorize(reference ctx) {
        bool spacer = boolean(length(ctx.spacer));
        bool spacer_is_hyphen = boolean(is_hyphen(ctx.spacer));
        list bits = (
            char_class(ctx.char),
            char_class(ctx.next),
            is_digit(ctx.prev),
            is_digit(ctx.nnext),
            spacer_is_hyphen ? is_dict_seq(ctx.seq) : 0,
            spacer_is_hyphen ? is_suffix(ctx.seq_right) : 0,
            is_same_pm(ctx.char, ctx.next),
            (spacer && !spacer_is_hyphen) ? looks_like_url(ctx.seq, ctx.seq_right) : 0,
            (spacer && !spacer_is_hyphen) ? is_exception_seq(ctx.seq) : 0,
            spacer_is_hyphen ? is_prefix(ctx.seq_left) : 0,
            (is_colon(ctx.spacer) && length(ctx.seq_right)) ? looks_like_time(ctx.seq_left, ctx.seq_right) : 0,
        );
        return strtoint(join('', bits), 2);
    }

    # single-character / sequence classifier predicates, each returning 0 or 1
    private int is_pmark(string ch) { return ch =~ /^[,?!";«»]$/ ? 1 : 0; }
    private int is_latin(string ch) { return ch =~ /^[a-zA-Z]$/ ? 1 : 0; }
    private int is_cyr(string ch) { return ch =~ /^[а-яёА-ЯЁ]$/ ? 1 : 0; }
    private int is_digit(string ch) { return ch =~ /^[0-9]$/ ? 1 : 0; }
    private int is_bracket1(string ch) { return ch =~ /^[\[({<]$/ ? 1 : 0; }
    private int is_bracket2(string ch) { return ch =~ /^[\])}>]$/ ? 1 : 0; }
    private int is_suffix(string seq) { return seq =~ /^(?:то|таки|с|ка|де)$/ ? 1 : 0; }
    private int is_space(string ch) { return ch === ' ' ? 1 : 0; }
    private int is_hyphen(string ch) { return ch === '-' ? 1 : 0; }
    private int is_dot(string ch) { return ch === '.' ? 1 : 0; }
    private int is_single_quote(string ch) { return ch === "'" ? 1 : 0; }
    private int is_slash(string ch) { return ch === '/' ? 1 : 0; }
    private int is_colon(string ch) { return ch === ':' ? 1 : 0; }
    private int is_same_pm(string ch1, string ch2) { return int(ch1 === ch2); }
    private int is_prefix(string seq) { return prefixes.in_list(tolower(seq)) ? 1 : 0; }

    # 1 if seq is a known hyphenated dictionary word (e.g. "кто-то")
    private int is_dict_seq(string seq) {
        if (!length(seq) || seq[0] === '-')
            return 0;
        return hyphens.in_list(seq) ? 1 : 0;
    }

    # 1 if seq (possibly after stripping leading punctuation) is in the
    # exceptions list; NOTE(review): only *leading* non-alphanumerics are ever
    # stripped even though the guard also matches a trailing \W — confirm
    # against the original Perl implementation
    private int is_exception_seq(string seq) {
        if (exceptions.in_list(seq))
            return 1;
        if (seq !~ /^\W|\W$/)
            return 0;
        string pattern = '^[^A-Za-zА-ЯЁа-яё0-9]+';
        seq = regex_subst(seq, pattern, '');
        if (exceptions.in_list(seq))
            return 1;
        while (regex(seq, pattern)) {
            seq = regex_subst(seq, pattern, '');
            if (exceptions.in_list(seq))
                return 1;
        }
        return 0;
    }

    # heuristic URL detector for sequences containing a non-hyphen spacer
    private int looks_like_url(string seq, string seq_right) {
        if (!length(seq_right))
            return 0;
        if (length(seq) < 5)
            return 0;
        if (seq[0] === '.')
            return 0;
        if (
            seq =~ /^\W*https?:\/\//
            || seq =~ /^\W*www\./
            || seq =~ /.\.(?:[a-z]{2,3}|р[уф])\W*$/i
        )
            return 1;
        return 0;
    }

    # 1 if left:right looks like an HH:MM time-of-day
    private int looks_like_time(string seq_left, string seq_right) {
        if (seq_left !~ /^[0-9]{1,2}$/ || seq_right !~ /^[0-9]{2}$/)
            return 0;
        return (int(seq_left) < 24 && int(seq_right) < 60) ? 1 : 0;
    }

    # maps a character to a 4-bit class code used in the feature vector
    private string char_class(string ch) {
        return is_cyr(ch) ? '0001' :
            is_space(ch) ? '0010' :
            is_dot(ch) ? '0011' :
            is_pmark(ch) ? '0100' :
            is_hyphen(ch) ? '0101' :
            is_digit(ch) ? '0110' :
            is_latin(ch) ? '0111' :
            is_bracket1(ch) ? '1000' :
            is_bracket2(ch) ? '1001' :
            is_single_quote(ch) ? '1010' :
            is_slash(ch) ? '1011' :
            is_colon(ch) ? '1100' : '0000';
    }
}
# a simple word list loaded from a gzipped file, one entry per line
class OpenCorpora::List {
    private {
        list list;
    }

    constructor(string fn) {
        load(fn);
    }

    # reads and gunzips the whole file, splitting it into lines
    private load(string fn) {
        File fh = new File();
        try {
            fh.open2(fn);
        }
        catch(e) {
            print(e.desc);
            exit(1);
        }
        # stat()[7] is the file size in bytes
        string raw = gunzip_to_string(fh.readBinary(fh.stat()[7]));
        list = split("\n", raw);
        fh.close();
    }

    # returns True if str is an exact (===) member of the list
    bool in_list(string str) {
        # use the builtin hard-comparison membership test instead of building
        # a filtered copy with select on every call
        return inlist_hard(str, list);
    }
}
# the trained probability table: feature vector -> boundary probability,
# loaded from a gzipped file of "vector probability" lines
class OpenCorpora::Vectors {
    private {
        hash vectors;
    }

    constructor(string fn) {
        load(fn);
    }

    # reads and gunzips the whole file, parsing one "key value" pair per line
    private load(string fn) {
        File fh = new File();
        try {
            fh.open2(fn);
        }
        catch(e) {
            print(e.desc);
            exit(1);
        }
        # stat()[7] is the file size in bytes
        string raw = gunzip_to_string(fh.readBinary(fh.stat()[7]));
        foreach string row in (split("\n", raw)) {
            # BUGFIX: a newline-terminated file yields an empty final row;
            # skip blank rows instead of hashing a NOTHING key
            if (!length(row))
                continue;
            list vp = split(' ', row);
            vectors{vp[0]} = float(vp[1]);
        }
        fh.close();
    }

    # returns the probability for the vector, or NOTHING if it was never seen
    *float probability(int vector) {
        return vectors{vector};
    }
}
# evaluation driver: tokenizes every corpus sentence and compares against the
# gold tokenization stored in the database
string path = '/home/ksurent/Lingua--RU--OpenCorpora--Tokenizer/blib/lib/auto/share/dist/Lingua-RU-OpenCorpora-Tokenizer';
hash files = (
    'vectors'   : path + '/vectors.gz',
    'prefixes'  : path + '/prefixes.gz',
    'hyphens'   : path + '/hyphens.gz',
    'exceptions': path + '/exceptions.gz',
);
Tokenizer tok = new Tokenizer(files);
#printf("%N\n", tok.tokens("Он хотел было уйти, но не тут-то было: дверь за его спиной уже закрылась."));

# gold tokens are concatenated per sentence with this separator in SQL below
string separator = 'º';
Datasource dbh = new Datasource(SQL::DSMySQL, 'corpora', 'corpora', 'corpora', 'utf8', '127.0.0.1', 3306);
list sentences = dbh.selectRows(sprintf("
select
source,
group_concat(tf_text order by text_forms.pos separator '%s') as separated
from
sentences
join
text_forms
using
(sent_id)
group by
source
", separator));
int correct = 0;
int total = 0;
foreach hash sentence in (sentences) {
    list tokens = tok.tokens(sentence.source);
    total++;
    # a sentence counts as correct only if the tokenization matches exactly
    if (join(separator, tokens) === sentence.separated)
        correct++;
}
# BUGFIX: correct / total is integer division in Qore, which truncates the
# percentage to 0; promote to float, and guard against an empty result set
float pct = total ? float(correct) / total * 100 : 0.0;
printf("%d/%d %.2f%%\n", correct, total, pct);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment