sandello/mediawiki_extract_pages_to_files.py

## mediawiki_extract_pages_to_files.py
#!/usr/bin/python
# For Yandex Data Analysis School

"""Takes MediaWiki XML dump and extracts pages to separate files."""

SUBDIRECTORY_SPREAD = 512

import sys
import os
import os.path

from optparse import OptionParser
from xml.sax import parse
from xml.sax.handler import ContentHandler

def smart_open(filename, mode = "rb", encoding = "utf-8"):
    """Open FILENAME in binary mode, optionally wrapped in a text codec.

    "-" selects sys.stdout when the mode contains "w" and sys.stdin
    otherwise; names ending in ".gz" or ".bz2" are opened through gzip or
    bz2; everything else goes through the built-in open().  A "b" is
    appended to MODE when it is missing.

    When ENCODING is truthy the handle is wrapped in a codecs stream writer
    (write modes) or stream reader (other modes) with the "ignore" error
    policy; otherwise the raw handle is returned.
    """
    binary_mode = mode if "b" in mode else mode + "b"
    for_writing = "w" in binary_mode

    if filename == "-":
        raw = sys.stdout if for_writing else sys.stdin
    elif filename.endswith(".gz"):
        import gzip
        raw = gzip.open(filename, binary_mode)
    elif filename.endswith(".bz2"):
        import bz2
        raw = bz2.BZ2File(filename, binary_mode)
    else:
        raw = open(filename, binary_mode)

    if not encoding:
        return raw

    import codecs
    make_wrapper = codecs.getwriter if for_writing else codecs.getreader
    return make_wrapper(encoding)(raw, "ignore")

class MyHandler(ContentHandler):
    """SAX handler that reports every non-redirect <page> of a dump.

    When a <page> element closes and it contained no <redirect> child,
    ``callback(counter, title, text)`` is invoked.  ``counter`` is the
    1-based ordinal of the page (redirect pages consume a number too);
    ``title`` and ``text`` are the whitespace-stripped character data of the
    page's <title> and <text> elements.
    """

    def __init__(self, callback):
        # ContentHandler is an old-style class under Python 2, so the base
        # initializer is called explicitly rather than through super().
        ContentHandler.__init__(self)

        self.callback = callback
        self.counter = 0

        self.clear()

    def clear(self):
        """Reset the per-page state and advance the page counter."""
        self.in_page = False
        self.in_title = False
        self.in_text = False
        self.is_redirect = False

        self.current_title = u""
        self.current_text = u""

        # characters() may be called many times per element; the pieces are
        # collected here and joined once when the element closes, which
        # avoids quadratic string concatenation on large pages.
        self._title_chunks = []
        self._text_chunks = []

        self.counter += 1

    def startElement(self, name, attrs):
        """Track entry into <page>, <redirect>, <title> and <text>."""
        if name == "page":
            self.in_page = True
        elif self.in_page:
            if name == "redirect":
                self.is_redirect = True
            elif name == "title":
                assert(not self.in_title and not self.in_text)
                self.in_title = True
            elif name == "text":
                assert(not self.in_title and not self.in_text)
                self.in_text = True

    def endElement(self, name):
        """Finish <title>/<text> capture; emit the page when it closes."""
        if name == "page":
            assert(not self.in_title)
            assert(not self.in_text)

            if not self.is_redirect:
                self.callback(self.counter, self.current_title.strip(), self.current_text.strip())

            self.clear()
        elif self.in_page:
            if name == "title":
                assert(self.in_title and not self.in_text)
                self.in_title = False
                # Appending keeps the contents of repeated elements
                # concatenated, exactly as before.
                self.current_title += u"".join(self._title_chunks)
                del self._title_chunks[:]
            elif name == "text":
                assert(not self.in_title and self.in_text)
                self.in_text = False
                self.current_text += u"".join(self._text_chunks)
                del self._text_chunks[:]

    def characters(self, data):
        """Collect character data while inside <title> or <text>."""
        if self.in_title:
            self._title_chunks.append(data)

        if self.in_text:
            self._text_chunks.append(data)

def extract_pages(stream, output_directory):
    """Parse the dump in STREAM and write each reported page to its own file.

    Pages are spread over subdirectories of OUTPUT_DIRECTORY named after the
    page number modulo SUBDIRECTORY_SPREAD; each file is named after the
    zero-padded page number and holds the title, a blank line and the text.
    The path of every written file is printed to standard output.

    Returns whatever xml.sax.parse() returns.
    """
    def extract_page(aydee, title, text):
        subdirectory = "{0:03d}".format(aydee % SUBDIRECTORY_SPREAD)
        filename = "{0:08d}.txt".format(aydee)

        output_path = os.path.join(output_directory, subdirectory)
        if not os.path.isdir(output_path):
            os.mkdir(output_path)
        output_path = os.path.join(output_path, filename)

        # Single-argument parenthesized form prints the same text under
        # both the Python 2 print statement and the Python 3 function.
        print(output_path)

        f = smart_open(output_path, "w")
        try:
            f.write(title)
            f.write(u"\n\n")
            f.write(text)
        finally:
            # Release the handle even when a write fails.
            f.close()

    return parse(stream, MyHandler(extract_page))

if __name__ == "__main__":
    # Command-line entry point: extract the pages of the dump given by -i
    # into the existing directory given by -o.
    parser = OptionParser(description = __doc__)
    parser.add_option("-i", "--input", metavar = "FILE", help = "read dump from FILE")
    parser.add_option("-o", "--output", metavar = "DIRECTORY", help = "write files to DIRECTORY")

    (options, args) = parser.parse_args()

    if not options.input or not options.output:
        # parser.error() prints the usage and the message, then exits with
        # status 2; nothing placed after it would ever run.
        parser.error("-i, -o options are required.")

    output_directory = os.path.abspath(options.output)

    if not os.path.isdir(output_directory):
        sys.stderr.write("-o option should point to directory.\n")
        sys.exit(1)

    # The SAX parser wants raw bytes, hence no codec wrapper.
    input_file = smart_open(options.input, encoding = None)
    try:
        extract_pages(input_file, output_directory)
    finally:
        input_file.close()

## mediawiki_take_first_n_pages.py
#!/usr/bin/python
# For Yandex Data Analysis School

"""Takes MediaWiki XML dump as input and trims it to first N pages."""

import sys

from optparse import OptionParser

def smart_open(filename, mode = "rb", encoding = "utf-8"):
    """Open FILENAME in binary mode, optionally wrapped in a text codec.

    "-" selects sys.stdout when the mode contains "w" and sys.stdin
    otherwise; names ending in ".gz" or ".bz2" are opened through gzip or
    bz2; everything else goes through the built-in open().  A "b" is
    appended to MODE when it is missing.

    When ENCODING is truthy the handle is wrapped in a codecs stream writer
    (write modes) or stream reader (other modes) with the "ignore" error
    policy; otherwise the raw handle is returned.
    """
    binary_mode = mode if "b" in mode else mode + "b"
    for_writing = "w" in binary_mode

    if filename == "-":
        raw = sys.stdout if for_writing else sys.stdin
    elif filename.endswith(".gz"):
        import gzip
        raw = gzip.open(filename, binary_mode)
    elif filename.endswith(".bz2"):
        import bz2
        raw = bz2.BZ2File(filename, binary_mode)
    else:
        raw = open(filename, binary_mode)

    if not encoding:
        return raw

    import codecs
    make_wrapper = codecs.getwriter if for_writing else codecs.getreader
    return make_wrapper(encoding)(raw, "ignore")

def trim_dump(stream, required_number_of_pages):
    """Yield the lines of STREAM up to the N-th line containing "</page>".

    Every line is passed through unchanged.  Iteration stops right after
    the line that brings the count of "</page>" lines to
    REQUIRED_NUMBER_OF_PAGES.  If a "<mediawiki" opening was seen and no
    "</mediawiki>" followed it, a closing "</mediawiki>" line is appended.
    """
    pages_seen = 0
    root_is_open = False

    for line in stream:
        if u"<mediawiki" in line:
            root_is_open = True
        if u"</mediawiki>" in line:
            root_is_open = False

        if u"</page>" in line:
            pages_seen += 1

        yield line

        if pages_seen == required_number_of_pages:
            break

    if root_is_open:
        yield u"</mediawiki>\n"

if __name__ == "__main__":
    # Command-line entry point: copy the first N pages of the dump given by
    # -i into the file given by -o.
    parser = OptionParser(description = __doc__)
    parser.add_option("-i", "--input", metavar = "FILE", help = "read dump from FILE")
    parser.add_option("-o", "--output", metavar = "FILE", help = "write trimmed dump to FILE")
    parser.add_option("-n", "--number", metavar = "N", help = "trim dump to first N pages", type = "int")

    (options, args) = parser.parse_args()

    if not options.input or not options.output or not options.number:
        # parser.error() prints the usage and the message, then exits with
        # status 2; nothing placed after it would ever run.
        parser.error("-i, -o, -n options are required.")

    input_file = smart_open(options.input)
    try:
        output_file = smart_open(options.output, "w")
        try:
            output_file.writelines(trim_dump(input_file, options.number))
        finally:
            output_file.close()
    finally:
        # Both handles are released even when reading or writing fails.
        input_file.close()

## print_words.cpp
// For Yandex Data Analysis School
// XXX: Включите этот define в зависимости от вашей операционной системы
#define WINDOWS

#include <string>
#include <fstream>
#include <iostream>

// ICU: http://icu-project.org
#include <unicode/utypes.h>
#include <unicode/unistr.h>
#include <unicode/uniset.h>

#ifdef WINDOWS
#include <windows.h>
#endif

// ==== Выделение слов из потока
//
// Функции PrintWords* читают входящий поток is и построчно выписывают в os слова, переводя их в нижний регистр.
// Входная и выходная кодировки потоков -- UTF-8.
// Ниже приведены три возможные реализации функции:
//  - выделение слов как последовательности алфавитных символов с помощью UnicodeSet,
//  - выделение слов как последовательности алфавитных символов с помощью IsAlphabetic,
//  - разделение исходной строки по пробельным символам.
//
// ==== Вспомогательные функции
//
// Для справки, список потенциально полезных функций определения типа символа:
//  - u_isUAlphabetic
//  - u_isULowercase
//  - u_isUUppercase
//  - u_isUWhiteSpace
//
// А также функции в стиле ctype.h:
//  - u_islower (u_tolower)
//  - u_isupper (u_toupper)
//  - u_istitle (u_totitle)
//  - u_isdigit
//  - u_isalpha
//  - u_isalnum
//  - u_isxdigit
//  - u_ispunct
//  - u_isgraph
//  - u_isblank
//  - u_isspace
//  - u_iscntrl
//  - u_isprint
//
// ==== Продвинутые техники
//
// Более продвинутый метод итерации по словам -- это использование BreakIterator.
// Для простоты этот метод не рассматривается.
//
// Также можно для каких-либо своих целей использовать регулярные выражения.
// См. классы RegexPattern и RegexMatcher.
//
// ==== API References
//
// http://icu-project.org/apiref/icu4c/index.html
//
// В частности:
// http://icu-project.org/apiref/icu4c/classUnicodeString.html
// http://icu-project.org/apiref/icu4c/classUnicodeSet.html
//
// http://icu-project.org/apiref/icu4c/uchar_8h.html -- классификация символов.
//
// http://icu-project.org/apiref/icu4c/classBreakIterator.html
// http://icu-project.org/apiref/icu4c/classRegexMatcher.html
// http://icu-project.org/apiref/icu4c/classRegexPattern.html

#ifdef WINDOWS
// Хак для печати UTF-8 в Windows-консоли в обход стандартных операторов <<.
std::ostream& operator<<(std::ostream& os, const std::string& s)
{
    static HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
    static HANDLE stderr_handle = GetStdHandle(STD_ERROR_HANDLE);

    if (os == std::cout)
    {
        DWORD characters_written;
        WriteConsoleA(stdout_handle, s.data(), s.length(), &characters_written, NULL);
    }
    else if (os == std::cerr)
    {
        DWORD characters_written;
        WriteConsoleA(stderr_handle, s.data(), s.length(), &characters_written, NULL);
    }
    else
    {
        os.write(s.data(), s.length());
    }

    return os;
}
#endif

void PrintWordsWithUnicodeSet(std::istream& is, std::ostream& os)
{
    UnicodeSet allowed_characters;
    UnicodeString current_line;

    {
        UErrorCode error_code = U_ZERO_ERROR;
        allowed_characters.applyPattern("[\\p{Letter}]", error_code);

        if (U_FAILURE(error_code))
        {
            std::cerr << "Failed to create set of allowed characters." << std::endl;
            std::exit(1);
        }
    }

    std::string buffer;

    while (std::getline(is, buffer))
    {
        current_line = UnicodeString::fromUTF8(StringPiece(buffer.c_str(), buffer.length()));
        current_line.toLower();

        int32_t i, j;

        for (i = 0; i < current_line.length(); ++i)
        {
            for (j = i; j < current_line.length() && allowed_characters.contains(current_line[j]); ++j)
            {
            }

            if (j > i)
            {
                UnicodeString word(current_line, i, j - i);

                buffer.clear();
                word.toUTF8String(buffer);

                os << buffer << std::endl;
            }

            i = j;
        }
    }
}

void PrintWordsWithIsAlphabetic(std::istream& is, std::ostream& os)
{
    UnicodeString current_line;
    std::string buffer;

    while (std::getline(is, buffer))
    {
        current_line = UnicodeString::fromUTF8(StringPiece(buffer.c_str(), buffer.length()));
        current_line.toLower();

        int32_t i, j;

        for (i = 0; i < current_line.length(); ++i)
        {
            for (j = i; j < current_line.length() && u_isUAlphabetic(current_line[j]); ++j)
            {
            }

            if (j > i)
            {
                UnicodeString word(current_line, i, j - i);

                buffer.clear();
                word.toUTF8String(buffer);

                os << buffer << std::endl;
            }

            i = j;
        }
    }
}

void PrintWordsBySpaces(std::istream& is, std::ostream& os)
{
    UnicodeString current_line;
    std::string buffer;

    while (std::getline(is, buffer))
    {
        current_line = UnicodeString::fromUTF8(StringPiece(buffer.c_str(), buffer.length()));
        current_line.toLower();

        int32_t i, j;

        for (i = 0; i < current_line.length(); ++i)
        {
            for (j = i; j < current_line.length() && !u_isUWhiteSpace(current_line[j]); ++j)
            {
            }

            if (j > i)
            {
                UnicodeString word(current_line, i, j - i);

                buffer.clear();
                word.toUTF8String(buffer);

                os << buffer << std::endl;
            }

            i = j;
        }
    }
}

// Entry point: prints the words of the file named by the first argument.
// Returns 0 on success and 1 when the argument is missing or the file
// cannot be opened.
int main(int argc, char** argv)
{
#ifdef WINDOWS
    // Switch the console to UTF-8 for the duration of the run.  The previous
    // code pages are restored on every path out of main(), including the
    // error paths.
    const UINT previous_cp = GetConsoleCP();
    const UINT previous_output_cp = GetConsoleOutputCP();
    ::SetConsoleCP(CP_UTF8);
    ::SetConsoleOutputCP(CP_UTF8);
#endif

    int exit_code = 0;

    if (argc < 2)
    {
        std::cerr << "Please, specify input file as an argument." << std::endl;
        exit_code = 1;
    }
    else
    {
        std::ifstream input_file(argv[1], std::ios::binary | std::ios::in);

        if (!input_file)
        {
            std::cerr << "Cannot open file." << std::endl;
            exit_code = 1;
        }
        else
        {
            // Alternative tokenizers, kept for reference:
            //PrintWordsWithUnicodeSet(input_file, std::cout);
            //PrintWordsWithIsAlphabetic(input_file, std::cout);
            PrintWordsBySpaces(input_file, std::cout);
        }
    }

#ifdef WINDOWS
    ::SetConsoleCP(previous_cp);
    ::SetConsoleOutputCP(previous_output_cp);
#endif

    return exit_code;
}

## statistics_from_stripped_dump.py
#!/usr/bin/python
# For Yandex Data Analysis School

"""Takes stripped MediaWiki XML dump and calculates different statistics."""

import sys
import unicodedata

from optparse import OptionParser

# Single-character predicates based on the Unicode general category.
# The first letter of the category is the major class (P, Z, L, N).

def IS_PUNCTUATION(u):
    """True when U is in a punctuation category (P*)."""
    return unicodedata.category(u)[0] == "P"

def IS_WHITESPACE(u):
    """True when U is in a separator category (Z*)."""
    return unicodedata.category(u)[0] == "Z"

def IS_LETTER(u):
    """True when U is in a letter category (L*)."""
    return unicodedata.category(u)[0] == "L"

def IS_NUMBER(u):
    """True when U is in a number category (N*)."""
    return unicodedata.category(u)[0] == "N"

def IS_LOWERCASE(u):
    """True when U is a lowercase letter (Ll)."""
    return unicodedata.category(u) == "Ll"

def IS_UPPERCASE(u):
    """True when U is an uppercase letter (Lu)."""
    return unicodedata.category(u) == "Lu"

def IS_CYRILLIC(u):
    """True when the code point of U lies in U+0400..U+04FF."""
    return (ord(u) >> 8) == 0x04

def smart_open(filename, mode = "rb", encoding = "utf-8"):
    """Open FILENAME in binary mode, optionally wrapped in a text codec.

    "-" selects sys.stdout when the mode contains "w" and sys.stdin
    otherwise; names ending in ".gz" or ".bz2" are opened through gzip or
    bz2; everything else goes through the built-in open().  A "b" is
    appended to MODE when it is missing.

    When ENCODING is truthy the handle is wrapped in a codecs stream writer
    (write modes) or stream reader (other modes) with the "ignore" error
    policy; otherwise the raw handle is returned.
    """
    binary_mode = mode if "b" in mode else mode + "b"
    for_writing = "w" in binary_mode

    if filename == "-":
        raw = sys.stdout if for_writing else sys.stdin
    elif filename.endswith(".gz"):
        import gzip
        raw = gzip.open(filename, binary_mode)
    elif filename.endswith(".bz2"):
        import bz2
        raw = bz2.BZ2File(filename, binary_mode)
    else:
        raw = open(filename, binary_mode)

    if not encoding:
        return raw

    import codecs
    make_wrapper = codecs.getwriter if for_writing else codecs.getreader
    return make_wrapper(encoding)(raw, "ignore")

def tokenize(stream):
    """Yield the tokens of STREAM, one whitespace-separated word at a time.

    Words starting with "<" are passed through untouched.  Every other word
    is reduced to its letters (per IS_LETTER) and lowercased; words that
    end up empty are dropped.
    """
    for line in stream:
        for token in line.split():
            if token.startswith(u"<"):
                yield token
                continue

            letters = u"".join(ch for ch in token if IS_LETTER(ch)).lower()
            if letters:
                yield letters

def calculate_statistics(word_stream):
    """Consume WORD_STREAM and print corpus statistics to standard error.

    The stream is a sequence of tokens in which "<page>", "<title>" and
    "<main_text>" (with their closing counterparts) delimit documents and
    their parts; a fixed set of other markup tokens is ignored and every
    remaining token counts as a word.  Four figures are reported:

      # Documents       number of "<page>" tokens
      # Words           total number of words
      # Pointers        sum over documents of their distinct words
      # Distinct Words  size of the vocabulary of the whole stream

    Nesting is checked with assert statements.
    """
    number_of_documents = 0
    number_of_words = 0
    number_of_pointers = 0

    global_vocabulary = set()
    local_vocabulary = set()

    in_page = False
    in_title = False
    in_text = False

    # "<ref" has no closing bracket on purpose in the original token list.
    BLOCKED_WORDS = frozenset(u"<b> <h> <i> <id> <ref </b> </h> </i> </id> </ref>".split())

    for word in word_stream:
        if word in BLOCKED_WORDS:
            continue

        elif word == u"<main_text>":
            assert(in_page and not in_title and not in_text)
            in_text = True

        elif word == u"</main_text>":
            assert(in_page and not in_title and in_text)
            in_text = False

        elif word == u"<page>":
            assert(not in_page and not in_title and not in_text)
            in_page = True

            number_of_documents += 1
            # Account for the vocabulary of the page that has just ended.
            number_of_pointers += len(local_vocabulary)

            local_vocabulary.clear()

        elif word == u"</page>":
            assert(in_page and not in_title and not in_text)
            in_page = False

        elif word == u"<title>":
            assert(in_page and not in_title and not in_text)
            in_title = True

        elif word == u"</title>":
            assert(in_page and in_title and not in_text)
            in_title = False

        else:
            assert(in_page and (in_title or in_text))

            number_of_words += 1
            global_vocabulary.add(word)
            local_vocabulary.add(word)

    # The vocabulary of the final page is only flushed by a following
    # "<page>", which never comes; add it here so it is not lost.
    number_of_pointers += len(local_vocabulary)

    sys.stderr.write(u"# Documents: {0}\n".format(number_of_documents))
    sys.stderr.write(u"# Words: {0}\n".format(number_of_words))
    sys.stderr.write(u"# Pointers: {0}\n".format(number_of_pointers))
    sys.stderr.write(u"# Distinct Words: {0}\n".format(len(global_vocabulary)))

if __name__ == "__main__":
    # Command-line entry point: print statistics for the stripped dump
    # given by -i.
    parser = OptionParser(description = __doc__)
    parser.add_option("-i", "--input", metavar = "FILE", help = "read dump from FILE")

    (options, args) = parser.parse_args()

    if not options.input:
        # parser.error() prints the usage and the message, then exits with
        # status 2; nothing placed after it would ever run.
        parser.error("-i option is required.")

    input_file = smart_open(options.input)
    try:
        calculate_statistics(tokenize(input_file))
    finally:
        input_file.close()
	#!/usr/bin/python
	# For Yandex Data Analysis School

	"""Takes MediaWiki XML dump and extracts pages to separate files."""

	SUBDIRECTORY_SPREAD = 512

	import sys
	import os
	import os.path

	from optparse import OptionParser
	from xml.sax import parse
	from xml.sax.handler import ContentHandler

	def smart_open(filename, mode = "rb", encoding = "utf-8"):
	    """Open FILENAME in binary mode, optionally wrapped in a text codec.

	    "-" selects sys.stdout when the mode contains "w" and sys.stdin
	    otherwise; names ending in ".gz" or ".bz2" are opened through gzip or
	    bz2; everything else goes through the built-in open().  A "b" is
	    appended to MODE when it is missing.

	    When ENCODING is truthy the handle is wrapped in a codecs stream writer
	    (write modes) or stream reader (other modes) with the "ignore" error
	    policy; otherwise the raw handle is returned.
	    """
	    binary_mode = mode if "b" in mode else mode + "b"
	    for_writing = "w" in binary_mode

	    if filename == "-":
	        raw = sys.stdout if for_writing else sys.stdin
	    elif filename.endswith(".gz"):
	        import gzip
	        raw = gzip.open(filename, binary_mode)
	    elif filename.endswith(".bz2"):
	        import bz2
	        raw = bz2.BZ2File(filename, binary_mode)
	    else:
	        raw = open(filename, binary_mode)

	    if not encoding:
	        return raw

	    import codecs
	    make_wrapper = codecs.getwriter if for_writing else codecs.getreader
	    return make_wrapper(encoding)(raw, "ignore")

	class MyHandler(ContentHandler):
	    """SAX handler that reports every non-redirect <page> of a dump.

	    When a <page> element closes and it contained no <redirect> child,
	    ``callback(counter, title, text)`` is invoked.  ``counter`` is the
	    1-based ordinal of the page (redirect pages consume a number too);
	    ``title`` and ``text`` are the whitespace-stripped character data of the
	    page's <title> and <text> elements.
	    """

	    def __init__(self, callback):
	        # ContentHandler is an old-style class under Python 2, so the base
	        # initializer is called explicitly rather than through super().
	        ContentHandler.__init__(self)

	        self.callback = callback
	        self.counter = 0

	        self.clear()

	    def clear(self):
	        """Reset the per-page state and advance the page counter."""
	        self.in_page = False
	        self.in_title = False
	        self.in_text = False
	        self.is_redirect = False

	        self.current_title = u""
	        self.current_text = u""

	        # characters() may be called many times per element; the pieces are
	        # collected here and joined once when the element closes, which
	        # avoids quadratic string concatenation on large pages.
	        self._title_chunks = []
	        self._text_chunks = []

	        self.counter += 1

	    def startElement(self, name, attrs):
	        """Track entry into <page>, <redirect>, <title> and <text>."""
	        if name == "page":
	            self.in_page = True
	        elif self.in_page:
	            if name == "redirect":
	                self.is_redirect = True
	            elif name == "title":
	                assert(not self.in_title and not self.in_text)
	                self.in_title = True
	            elif name == "text":
	                assert(not self.in_title and not self.in_text)
	                self.in_text = True

	    def endElement(self, name):
	        """Finish <title>/<text> capture; emit the page when it closes."""
	        if name == "page":
	            assert(not self.in_title)
	            assert(not self.in_text)

	            if not self.is_redirect:
	                self.callback(self.counter, self.current_title.strip(), self.current_text.strip())

	            self.clear()
	        elif self.in_page:
	            if name == "title":
	                assert(self.in_title and not self.in_text)
	                self.in_title = False
	                # Appending keeps the contents of repeated elements
	                # concatenated.
	                self.current_title += u"".join(self._title_chunks)
	                del self._title_chunks[:]
	            elif name == "text":
	                assert(not self.in_title and self.in_text)
	                self.in_text = False
	                self.current_text += u"".join(self._text_chunks)
	                del self._text_chunks[:]

	    def characters(self, data):
	        """Collect character data while inside <title> or <text>."""
	        if self.in_title:
	            self._title_chunks.append(data)

	        if self.in_text:
	            self._text_chunks.append(data)

	def extract_pages(stream, output_directory):
	    """Parse the dump in STREAM and write each reported page to its own file.

	    Pages are spread over subdirectories of OUTPUT_DIRECTORY named after the
	    page number modulo SUBDIRECTORY_SPREAD; each file is named after the
	    zero-padded page number and holds the title, a blank line and the text.
	    The path of every written file is printed to standard output.

	    Returns whatever xml.sax.parse() returns.
	    """
	    def extract_page(aydee, title, text):
	        subdirectory = "{0:03d}".format(aydee % SUBDIRECTORY_SPREAD)
	        filename = "{0:08d}.txt".format(aydee)

	        output_path = os.path.join(output_directory, subdirectory)
	        if not os.path.isdir(output_path):
	            os.mkdir(output_path)
	        output_path = os.path.join(output_path, filename)

	        # Single-argument parenthesized form prints the same text under
	        # both the Python 2 print statement and the Python 3 function.
	        print(output_path)

	        f = smart_open(output_path, "w")
	        try:
	            f.write(title)
	            f.write(u"\n\n")
	            f.write(text)
	        finally:
	            # Release the handle even when a write fails.
	            f.close()

	    return parse(stream, MyHandler(extract_page))

	if __name__ == "__main__":
	    # Command-line entry point: extract the pages of the dump given by -i
	    # into the existing directory given by -o.
	    parser = OptionParser(description = __doc__)
	    parser.add_option("-i", "--input", metavar = "FILE", help = "read dump from FILE")
	    parser.add_option("-o", "--output", metavar = "DIRECTORY", help = "write files to DIRECTORY")

	    (options, args) = parser.parse_args()

	    if not options.input or not options.output:
	        # parser.error() prints the usage and the message, then exits with
	        # status 2; nothing placed after it would ever run.
	        parser.error("-i, -o options are required.")

	    output_directory = os.path.abspath(options.output)

	    if not os.path.isdir(output_directory):
	        sys.stderr.write("-o option should point to directory.\n")
	        sys.exit(1)

	    # The SAX parser wants raw bytes, hence no codec wrapper.
	    input_file = smart_open(options.input, encoding = None)
	    try:
	        extract_pages(input_file, output_directory)
	    finally:
	        input_file.close()
	#!/usr/bin/python
	# For Yandex Data Analysis School

	"""Takes MediaWiki XML dump as input and trims it to first N pages."""

	import sys

	from optparse import OptionParser

	def smart_open(filename, mode = "rb", encoding = "utf-8"):
	    """Open FILENAME in binary mode, optionally wrapped in a text codec.

	    "-" selects sys.stdout when the mode contains "w" and sys.stdin
	    otherwise; names ending in ".gz" or ".bz2" are opened through gzip or
	    bz2; everything else goes through the built-in open().  A "b" is
	    appended to MODE when it is missing.

	    When ENCODING is truthy the handle is wrapped in a codecs stream writer
	    (write modes) or stream reader (other modes) with the "ignore" error
	    policy; otherwise the raw handle is returned.
	    """
	    binary_mode = mode if "b" in mode else mode + "b"
	    for_writing = "w" in binary_mode

	    if filename == "-":
	        raw = sys.stdout if for_writing else sys.stdin
	    elif filename.endswith(".gz"):
	        import gzip
	        raw = gzip.open(filename, binary_mode)
	    elif filename.endswith(".bz2"):
	        import bz2
	        raw = bz2.BZ2File(filename, binary_mode)
	    else:
	        raw = open(filename, binary_mode)

	    if not encoding:
	        return raw

	    import codecs
	    make_wrapper = codecs.getwriter if for_writing else codecs.getreader
	    return make_wrapper(encoding)(raw, "ignore")

	def trim_dump(stream, required_number_of_pages):
	    """Yield the lines of STREAM up to the N-th line containing "</page>".

	    Every line is passed through unchanged.  Iteration stops right after
	    the line that brings the count of "</page>" lines to
	    REQUIRED_NUMBER_OF_PAGES.  If a "<mediawiki" opening was seen and no
	    "</mediawiki>" followed it, a closing "</mediawiki>" line is appended.
	    """
	    pages_seen = 0
	    root_is_open = False

	    for line in stream:
	        if u"<mediawiki" in line:
	            root_is_open = True
	        if u"</mediawiki>" in line:
	            root_is_open = False

	        if u"</page>" in line:
	            pages_seen += 1

	        yield line

	        if pages_seen == required_number_of_pages:
	            break

	    if root_is_open:
	        yield u"</mediawiki>\n"

	if __name__ == "__main__":
	    # Command-line entry point: copy the first N pages of the dump given by
	    # -i into the file given by -o.
	    parser = OptionParser(description = __doc__)
	    parser.add_option("-i", "--input", metavar = "FILE", help = "read dump from FILE")
	    parser.add_option("-o", "--output", metavar = "FILE", help = "write trimmed dump to FILE")
	    parser.add_option("-n", "--number", metavar = "N", help = "trim dump to first N pages", type = "int")

	    (options, args) = parser.parse_args()

	    if not options.input or not options.output or not options.number:
	        # parser.error() prints the usage and the message, then exits with
	        # status 2; nothing placed after it would ever run.
	        parser.error("-i, -o, -n options are required.")

	    input_file = smart_open(options.input)
	    try:
	        output_file = smart_open(options.output, "w")
	        try:
	            output_file.writelines(trim_dump(input_file, options.number))
	        finally:
	            output_file.close()
	    finally:
	        # Both handles are released even when reading or writing fails.
	        input_file.close()
	// For Yandex Data Analysis School
	// XXX: Включите этот define в зависимости от вашей операционной системы
	#define WINDOWS

	#include <string>
	#include <fstream>
	#include <iostream>

	// ICU: http://icu-project.org
	#include <unicode/utypes.h>
	#include <unicode/unistr.h>
	#include <unicode/uniset.h>

	#ifdef WINDOWS
	#include <windows.h>
	#endif

	// ==== Выделение слов из потока
	//
	// Функции PrintWords* читают входящий поток is и построчно выписывают в os слова, переводя их в нижний регистр.
	// Входная и выходная кодировки потоков -- UTF-8.
	// Ниже приведены три возможные реализации функции:
	// - выделение слов как последовательности алфавитных символов с помощью UnicodeSet,
	// - выделение слов как последовательности алфавитных символов с помощью IsAlphabetic,
	// - разделение исходной строки по пробельным символам.
	//
	// ==== Вспомогательные функции
	//
	// Для справки, список потенциально полезных функций определения типа символа:
	// - u_isUAlphabetic
	// - u_isULowercase
	// - u_isUUppercase
	// - u_isUWhiteSpace
	//
	// А также функции в стиле ctype.h:
	// - u_islower (u_tolower)
	// - u_isupper (u_toupper)
	// - u_istitle (u_totitle)
	// - u_isdigit
	// - u_isalpha
	// - u_isalnum
	// - u_isxdigit
	// - u_ispunct
	// - u_isgraph
	// - u_isblank
	// - u_isspace
	// - u_iscntrl
	// - u_isprint
	//
	// ==== Продвинутые техники
	//
	// Более продвинутый метод итерации по словам -- это использование BreakIterator.
	// Для простоты этот метод не рассматривается.
	//
	// Также можно для каких-либо своих целей использовать регулярные выражения.
	// См. классы RegexPattern и RegexMatcher.
	//
	// ==== API References
	//
	// http://icu-project.org/apiref/icu4c/index.html
	//
	// В частности:
	// http://icu-project.org/apiref/icu4c/classUnicodeString.html
	// http://icu-project.org/apiref/icu4c/classUnicodeSet.html
	//
	// http://icu-project.org/apiref/icu4c/uchar_8h.html -- классификация символов.
	//
	// http://icu-project.org/apiref/icu4c/classBreakIterator.html
	// http://icu-project.org/apiref/icu4c/classRegexMatcher.html
	// http://icu-project.org/apiref/icu4c/classRegexPattern.html

	#ifdef WINDOWS
	// Хак для печати UTF-8 в Windows-консоли в обход стандартных операторов <<.
	std::ostream& operator<<(std::ostream& os, const std::string& s)
	{
	static HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
	static HANDLE stderr_handle = GetStdHandle(STD_ERROR_HANDLE);

	if (os == std::cout)
	{
	DWORD characters_written;
	WriteConsoleA(stdout_handle, s.data(), s.length(), &characters_written, NULL);
	}
	else if (os == std::cerr)
	{
	DWORD characters_written;
	WriteConsoleA(stderr_handle, s.data(), s.length(), &characters_written, NULL);
	}
	else
	{
	os.write(s.data(), s.length());
	}

	return os;
	}
	#endif

	void PrintWordsWithUnicodeSet(std::istream& is, std::ostream& os)
	{
	UnicodeSet allowed_characters;
	UnicodeString current_line;

	{
	UErrorCode error_code = U_ZERO_ERROR;
	allowed_characters.applyPattern("[\\p{Letter}]", error_code);

	if (U_FAILURE(error_code))
	{
	std::cerr << "Failed to create set of allowed characters." << std::endl;
	std::exit(1);
	}
	}

	std::string buffer;

	while (std::getline(is, buffer))
	{
	current_line = UnicodeString::fromUTF8(StringPiece(buffer.c_str(), buffer.length()));
	current_line.toLower();

	int32_t i, j;

	for (i = 0; i < current_line.length(); ++i)
	{
	for (j = i; j < current_line.length() && allowed_characters.contains(current_line[j]); ++j)
	{
	}

	if (j > i)
	{
	UnicodeString word(current_line, i, j - i);

	buffer.clear();
	word.toUTF8String(buffer);

	os << buffer << std::endl;
	}

	i = j;
	}
	}
	}

	void PrintWordsWithIsAlphabetic(std::istream& is, std::ostream& os)
	{
	UnicodeString current_line;
	std::string buffer;

	while (std::getline(is, buffer))
	{
	current_line = UnicodeString::fromUTF8(StringPiece(buffer.c_str(), buffer.length()));
	current_line.toLower();

	int32_t i, j;

	for (i = 0; i < current_line.length(); ++i)
	{
	for (j = i; j < current_line.length() && u_isUAlphabetic(current_line[j]); ++j)
	{
	}

	if (j > i)
	{
	UnicodeString word(current_line, i, j - i);

	buffer.clear();
	word.toUTF8String(buffer);

	os << buffer << std::endl;
	}

	i = j;
	}
	}
	}

	void PrintWordsBySpaces(std::istream& is, std::ostream& os)
	{
	UnicodeString current_line;
	std::string buffer;

	while (std::getline(is, buffer))
	{
	current_line = UnicodeString::fromUTF8(StringPiece(buffer.c_str(), buffer.length()));
	current_line.toLower();

	int32_t i, j;

	for (i = 0; i < current_line.length(); ++i)
	{
	for (j = i; j < current_line.length() && !u_isUWhiteSpace(current_line[j]); ++j)
	{
	}

	if (j > i)
	{
	UnicodeString word(current_line, i, j - i);

	buffer.clear();
	word.toUTF8String(buffer);

	os << buffer << std::endl;
	}

	i = j;
	}
	}
	}

	// Entry point: prints the words of the file named by the first argument.
	// Returns 0 on success and 1 when the argument is missing or the file
	// cannot be opened.
	int main(int argc, char** argv)
	{
	#ifdef WINDOWS
	    // Switch the console to UTF-8 for the duration of the run.  The previous
	    // code pages are restored on every path out of main(), including the
	    // error paths.
	    const UINT previous_cp = GetConsoleCP();
	    const UINT previous_output_cp = GetConsoleOutputCP();
	    ::SetConsoleCP(CP_UTF8);
	    ::SetConsoleOutputCP(CP_UTF8);
	#endif

	    int exit_code = 0;

	    if (argc < 2)
	    {
	        std::cerr << "Please, specify input file as an argument." << std::endl;
	        exit_code = 1;
	    }
	    else
	    {
	        // Plain bitwise OR of the open-mode flags.
	        std::ifstream input_file(argv[1], std::ios::binary | std::ios::in);

	        if (!input_file)
	        {
	            std::cerr << "Cannot open file." << std::endl;
	            exit_code = 1;
	        }
	        else
	        {
	            // Alternative tokenizers, kept for reference:
	            //PrintWordsWithUnicodeSet(input_file, std::cout);
	            //PrintWordsWithIsAlphabetic(input_file, std::cout);
	            PrintWordsBySpaces(input_file, std::cout);
	        }
	    }

	#ifdef WINDOWS
	    ::SetConsoleCP(previous_cp);
	    ::SetConsoleOutputCP(previous_output_cp);
	#endif

	    return exit_code;
	}
	#!/usr/bin/python
	# For Yandex Data Analysis School

	"""Takes stripped MediaWiki XML dump and calculates different statistics."""

	import sys
	import unicodedata

	from optparse import OptionParser

	# Single-character predicates based on the Unicode general category.
	# The first letter of the category is the major class (P, Z, L, N).

	def IS_PUNCTUATION(u):
	    """True when U is in a punctuation category (P*)."""
	    return unicodedata.category(u)[0] == "P"

	def IS_WHITESPACE(u):
	    """True when U is in a separator category (Z*)."""
	    return unicodedata.category(u)[0] == "Z"

	def IS_LETTER(u):
	    """True when U is in a letter category (L*)."""
	    return unicodedata.category(u)[0] == "L"

	def IS_NUMBER(u):
	    """True when U is in a number category (N*)."""
	    return unicodedata.category(u)[0] == "N"

	def IS_LOWERCASE(u):
	    """True when U is a lowercase letter (Ll)."""
	    return unicodedata.category(u) == "Ll"

	def IS_UPPERCASE(u):
	    """True when U is an uppercase letter (Lu)."""
	    return unicodedata.category(u) == "Lu"

	def IS_CYRILLIC(u):
	    """True when the code point of U lies in U+0400..U+04FF."""
	    return (ord(u) >> 8) == 0x04

	def smart_open(filename, mode = "rb", encoding = "utf-8"):
	    """Open FILENAME in binary mode, optionally wrapped in a text codec.

	    "-" selects sys.stdout when the mode contains "w" and sys.stdin
	    otherwise; names ending in ".gz" or ".bz2" are opened through gzip or
	    bz2; everything else goes through the built-in open().  A "b" is
	    appended to MODE when it is missing.

	    When ENCODING is truthy the handle is wrapped in a codecs stream writer
	    (write modes) or stream reader (other modes) with the "ignore" error
	    policy; otherwise the raw handle is returned.
	    """
	    binary_mode = mode if "b" in mode else mode + "b"
	    for_writing = "w" in binary_mode

	    if filename == "-":
	        raw = sys.stdout if for_writing else sys.stdin
	    elif filename.endswith(".gz"):
	        import gzip
	        raw = gzip.open(filename, binary_mode)
	    elif filename.endswith(".bz2"):
	        import bz2
	        raw = bz2.BZ2File(filename, binary_mode)
	    else:
	        raw = open(filename, binary_mode)

	    if not encoding:
	        return raw

	    import codecs
	    make_wrapper = codecs.getwriter if for_writing else codecs.getreader
	    return make_wrapper(encoding)(raw, "ignore")

	def tokenize(stream):
	    """Yield the tokens of STREAM, one whitespace-separated word at a time.

	    Words starting with "<" are passed through untouched.  Every other word
	    is reduced to its letters (per IS_LETTER) and lowercased; words that
	    end up empty are dropped.
	    """
	    for line in stream:
	        for token in line.split():
	            if token.startswith(u"<"):
	                yield token
	                continue

	            letters = u"".join(ch for ch in token if IS_LETTER(ch)).lower()
	            if letters:
	                yield letters

	def calculate_statistics(word_stream):
	    """Consume WORD_STREAM and print corpus statistics to standard error.

	    The stream is a sequence of tokens in which "<page>", "<title>" and
	    "<main_text>" (with their closing counterparts) delimit documents and
	    their parts; a fixed set of other markup tokens is ignored and every
	    remaining token counts as a word.  Four figures are reported:

	      # Documents       number of "<page>" tokens
	      # Words           total number of words
	      # Pointers        sum over documents of their distinct words
	      # Distinct Words  size of the vocabulary of the whole stream

	    Nesting is checked with assert statements.
	    """
	    number_of_documents = 0
	    number_of_words = 0
	    number_of_pointers = 0

	    global_vocabulary = set()
	    local_vocabulary = set()

	    in_page = False
	    in_title = False
	    in_text = False

	    # "<ref" has no closing bracket on purpose in the original token list.
	    BLOCKED_WORDS = frozenset(u"<b> <h> <i> <id> <ref </b> </h> </i> </id> </ref>".split())

	    for word in word_stream:
	        if word in BLOCKED_WORDS:
	            continue

	        elif word == u"<main_text>":
	            assert(in_page and not in_title and not in_text)
	            in_text = True

	        elif word == u"</main_text>":
	            assert(in_page and not in_title and in_text)
	            in_text = False

	        elif word == u"<page>":
	            assert(not in_page and not in_title and not in_text)
	            in_page = True

	            number_of_documents += 1
	            # Account for the vocabulary of the page that has just ended.
	            number_of_pointers += len(local_vocabulary)

	            local_vocabulary.clear()

	        elif word == u"</page>":
	            assert(in_page and not in_title and not in_text)
	            in_page = False

	        elif word == u"<title>":
	            assert(in_page and not in_title and not in_text)
	            in_title = True

	        elif word == u"</title>":
	            assert(in_page and in_title and not in_text)
	            in_title = False

	        else:
	            assert(in_page and (in_title or in_text))

	            number_of_words += 1
	            global_vocabulary.add(word)
	            local_vocabulary.add(word)

	    # The vocabulary of the final page is only flushed by a following
	    # "<page>", which never comes; add it here so it is not lost.
	    number_of_pointers += len(local_vocabulary)

	    sys.stderr.write(u"# Documents: {0}\n".format(number_of_documents))
	    sys.stderr.write(u"# Words: {0}\n".format(number_of_words))
	    sys.stderr.write(u"# Pointers: {0}\n".format(number_of_pointers))
	    sys.stderr.write(u"# Distinct Words: {0}\n".format(len(global_vocabulary)))

	if __name__ == "__main__":
	    # Command-line entry point: print statistics for the stripped dump
	    # given by -i.
	    parser = OptionParser(description = __doc__)
	    parser.add_option("-i", "--input", metavar = "FILE", help = "read dump from FILE")

	    (options, args) = parser.parse_args()

	    if not options.input:
	        # parser.error() prints the usage and the message, then exits with
	        # status 2; nothing placed after it would ever run.
	        parser.error("-i option is required.")

	    input_file = smart_open(options.input)
	    try:
	        calculate_statistics(tokenize(input_file))
	    finally:
	        input_file.close()