Skip to content

Instantly share code, notes, and snippets.

@sandello
Created September 10, 2010 23:15
Show Gist options
  • Save sandello/574542 to your computer and use it in GitHub Desktop.
Save sandello/574542 to your computer and use it in GitHub Desktop.
Snippets for IR-course in Yandex Data Analysis School
#!/usr/bin/python
# For Yandex Data Analysis School
"""Takes MediaWiki XML dump and extracts pages to separate files."""
# Number of subdirectories the extracted pages are spread over: extract_pages()
# files every page under its sequence number modulo this value.
SUBDIRECTORY_SPREAD = 512
import sys
import os
import os.path
from optparse import OptionParser
from xml.sax import parse
from xml.sax.handler import ContentHandler
def smart_open(filename, mode = "rb", encoding = "utf-8"):
    """Open a plain, gzip or bzip2 file (or a standard stream) in binary mode.

    Arguments:
        filename -- path to open.  "-" selects sys.stdin or sys.stdout; names
            ending in ".gz" or ".bz2" are opened through gzip / bz2.
        mode -- open()-style mode; "b" is appended when it is missing.
        encoding -- when truthy, the handle is wrapped in a codecs stream
            reader or writer for that encoding using the "ignore" error
            policy; pass None to get the raw byte-oriented handle.

    Returns the (possibly wrapped) file-like object.  The caller closes it.
    """
    if "b" not in mode:
        mode = mode + "b"

    # Append mode produces output as well: it needs stdout and an encoding
    # writer, not stdin and a reader.
    writing = "w" in mode or "a" in mode

    if filename == "-":
        stream = sys.stdout if writing else sys.stdin
        # Text-mode standard streams expose their byte stream as ``buffer``;
        # streams without that attribute are used as they are.
        handle = getattr(stream, "buffer", stream)
    elif filename.endswith(".gz"):
        import gzip
        handle = gzip.open(filename, mode)
    elif filename.endswith(".bz2"):
        import bz2
        handle = bz2.BZ2File(filename, mode)
    else:
        handle = open(filename, mode)

    if encoding:
        import codecs
        if writing:
            handle = codecs.getwriter(encoding)(handle, "ignore")
        else:
            handle = codecs.getreader(encoding)(handle, "ignore")
    return handle
class MyHandler(ContentHandler):
    """SAX handler that reports every non-redirect <page> element.

    When a <page> element closes and no <redirect> element was seen inside
    it, ``callback(counter, title, text)`` is called with the stripped
    character data of its <title> and <text> elements.  ``counter`` is 1 for
    the first page and advances after every page, redirects included.
    """

    def __init__(self, callback):
        """Remember the per-page callback and start with a clean state."""
        # Explicit base-class call (rather than super()) so this also works
        # where ContentHandler is an old-style class.
        ContentHandler.__init__(self)
        self.callback = callback
        self.counter = 0
        self.clear()

    def clear(self):
        """Reset the per-page state and advance the page counter."""
        self.in_page = False
        self.in_title = False
        self.in_text = False
        self.is_redirect = False
        self.current_title = u""
        self.current_text = u""
        # Character data arrives in many small pieces; the pieces are
        # collected here and joined once per element instead of growing the
        # strings above piece by piece.
        self._title_chunks = []
        self._text_chunks = []
        self.counter += 1

    def startElement(self, name, attrs):
        """Track entry into <page>, <redirect>, <title> and <text>."""
        if name == "page":
            self.in_page = True
        elif self.in_page:
            if name == "redirect":
                self.is_redirect = True
            elif name == "title":
                assert(not self.in_title and not self.in_text)
                self.in_title = True
            elif name == "text":
                assert(not self.in_title and not self.in_text)
                self.in_text = True

    def endElement(self, name):
        """Finish <title>/<text> collection and emit the page on </page>."""
        if name == "page":
            assert(not self.in_title)
            assert(not self.in_text)
            if not self.is_redirect:
                self.callback(self.counter, self.current_title.strip(), self.current_text.strip())
            self.clear()
        elif self.in_page:
            if name == "title":
                assert(self.in_title and not self.in_text)
                self.in_title = False
                self.current_title += u"".join(self._title_chunks)
                self._title_chunks = []
            elif name == "text":
                assert(not self.in_title and self.in_text)
                self.in_text = False
                self.current_text += u"".join(self._text_chunks)
                self._text_chunks = []

    def characters(self, data):
        """Collect character data while inside <title> or <text>."""
        if self.in_title:
            self._title_chunks.append(data)
        if self.in_text:
            self._text_chunks.append(data)
def extract_pages(stream, output_directory):
    """Parse an XML dump from stream and write each reported page to a file.

    Every page reported by MyHandler is written to
    ``<output_directory>/<NNN>/<NNNNNNNN>.txt`` where the file name is the
    page's sequence number and the subdirectory is that number modulo
    SUBDIRECTORY_SPREAD.  The file holds the title, an empty line and the
    text.  Each output path is printed to standard output.

    Returns whatever xml.sax.parse() returns.
    """
    def extract_page(aydee, title, text):
        subdirectory = "{0:03d}".format(aydee % SUBDIRECTORY_SPREAD)
        filename = "{0:08d}.txt".format(aydee)
        output_path = os.path.join(output_directory, subdirectory)
        if not os.path.isdir(output_path):
            os.mkdir(output_path)
        output_path = os.path.join(output_path, filename)
        # Single-argument call form: valid both as a print statement and as
        # a print() function call.
        print(output_path)
        f = smart_open(output_path, "w")
        try:
            f.write(title)
            f.write(u"\n\n")
            f.write(text)
        finally:
            # Release the handle even when a write fails part-way.
            f.close()
    return parse(stream, MyHandler(extract_page))
if __name__ == "__main__":
    # Command-line entry point: -i names the dump to read, -o an existing
    # directory that receives the extracted pages.
    parser = OptionParser(description = __doc__)
    parser.add_option("-i", "--input", metavar = "FILE", help = "read dump from FILE")
    parser.add_option("-o", "--output", metavar = "DIRECTORY", help = "write files to DIRECTORY")
    (options, args) = parser.parse_args()
    if not options.input or not options.output:
        # parser.error() prints the usage message and terminates the
        # program itself, so no statement after it can ever run.
        parser.error("-i, -o options are required.")
    output_directory = os.path.abspath(options.output)
    if not os.path.isdir(output_directory):
        sys.stderr.write("-o option should point to directory.\n")
        sys.exit(1)
    # encoding = None: the XML parser receives the raw bytes of the dump.
    input_file = smart_open(options.input, encoding = None)
    try:
        extract_pages(input_file, output_directory)
    finally:
        input_file.close()
#!/usr/bin/python
# For Yandex Data Analysis School
"""Takes MediaWiki XML dump as input and trims it to first N pages."""
import sys
from optparse import OptionParser
def smart_open(filename, mode = "rb", encoding = "utf-8"):
    """Open a plain, gzip or bzip2 file (or a standard stream) in binary mode.

    Arguments:
        filename -- path to open.  "-" selects sys.stdin or sys.stdout; names
            ending in ".gz" or ".bz2" are opened through gzip / bz2.
        mode -- open()-style mode; "b" is appended when it is missing.
        encoding -- when truthy, the handle is wrapped in a codecs stream
            reader or writer for that encoding using the "ignore" error
            policy; pass None to get the raw byte-oriented handle.

    Returns the (possibly wrapped) file-like object.  The caller closes it.
    """
    if "b" not in mode:
        mode = mode + "b"

    # Append mode produces output as well: it needs stdout and an encoding
    # writer, not stdin and a reader.
    writing = "w" in mode or "a" in mode

    if filename == "-":
        stream = sys.stdout if writing else sys.stdin
        # Text-mode standard streams expose their byte stream as ``buffer``;
        # streams without that attribute are used as they are.
        handle = getattr(stream, "buffer", stream)
    elif filename.endswith(".gz"):
        import gzip
        handle = gzip.open(filename, mode)
    elif filename.endswith(".bz2"):
        import bz2
        handle = bz2.BZ2File(filename, mode)
    else:
        handle = open(filename, mode)

    if encoding:
        import codecs
        if writing:
            handle = codecs.getwriter(encoding)(handle, "ignore")
        else:
            handle = codecs.getreader(encoding)(handle, "ignore")
    return handle
def trim_dump(stream, required_number_of_pages):
    """Yield the lines of stream up to the required number of closed pages.

    A line containing "</page>" counts as one finished page.  Lines are
    passed through unchanged and iteration stops right after the line on
    which the page count equals required_number_of_pages.  If a
    "<mediawiki" opening was passed through without its "</mediawiki>"
    counterpart, a closing "</mediawiki>" line is yielded at the end.
    """
    pages_seen = 0
    root_left_open = False
    for line in stream:
        if u"<mediawiki" in line:
            root_left_open = True
        if u"</mediawiki>" in line:
            root_left_open = False
        if u"</page>" in line:
            pages_seen += 1
        yield line
        if pages_seen == required_number_of_pages:
            break
    if root_left_open:
        yield u"</mediawiki>\n"
if __name__ == "__main__":
    # Command-line entry point: copy the dump named by -i to the file named
    # by -o, keeping only the first N pages (-n).
    parser = OptionParser(description = __doc__)
    parser.add_option("-i", "--input", metavar = "FILE", help = "read dump from FILE")
    parser.add_option("-o", "--output", metavar = "FILE", help = "write trimmed dump to FILE")
    parser.add_option("-n", "--number", metavar = "N", help = "trim dump to first N pages", type = "int")
    (options, args) = parser.parse_args()
    if not options.input or not options.output or not options.number:
        # parser.error() prints the usage message and terminates the
        # program itself, so no statement after it can ever run.
        parser.error("-i, -o, -n options are required.")
    input_file = smart_open(options.input)
    try:
        output_file = smart_open(options.output, "w")
        try:
            output_file.writelines(trim_dump(input_file, options.number))
        finally:
            output_file.close()
    finally:
        # Both handles are released even if opening the output or copying fails.
        input_file.close()
// For Yandex Data Analysis School
// XXX: Включите этот define в зависимости от вашей операционной системы
#define WINDOWS
#include <string>
#include <fstream>
#include <iostream>
// ICU: http://icu-project.org
#include <unicode/utypes.h>
#include <unicode/unistr.h>
#include <unicode/uniset.h>
#ifdef WINDOWS
#include <windows.h>
#endif
// ==== Выделение слов из потока
//
// Функции PrintWords* читают входной поток is и построчно выписывают в os слова, переводя их в нижний регистр.
// Входная и выходная кодировки потоков -- UTF-8.
// Ниже приведены три возможные реализации функции:
// - выделение слов как последовательности алфавитных символов с помощью UnicodeSet,
// - выделение слов как последовательности алфавитных символов с помощью IsAlphabetic,
// - разделение исходной строки по пробельным символам.
//
// ==== Вспомогательные функции
//
// Для справки, список потенциально полезных функций определения типа символа:
// - u_isUAlphabetic
// - u_isULowercase
// - u_isUUppercase
// - u_isUWhiteSpace
//
// А также функции в стиле <ctype.h>:
// - u_islower (u_tolower)
// - u_isupper (u_toupper)
// - u_istitle (u_totitle)
// - u_isdigit
// - u_isalpha
// - u_isalnum
// - u_isxdigit
// - u_ispunct
// - u_isgraph
// - u_isblank
// - u_isspace
// - u_iscntrl
// - u_isprint
//
// ==== Продвинутые техники
//
// Более продвинутый метод итерации по словам -- это использование BreakIterator.
// Для простоты этот метод не рассматривается.
//
// Также можно для каких-либо своих целей использовать регулярные выражения.
// См. классы RegexPattern и RegexMatcher.
//
// ==== API References
//
// http://icu-project.org/apiref/icu4c/index.html
//
// В частности:
// http://icu-project.org/apiref/icu4c/classUnicodeString.html
// http://icu-project.org/apiref/icu4c/classUnicodeSet.html
//
// http://icu-project.org/apiref/icu4c/uchar_8h.html -- классификация символов.
//
// http://icu-project.org/apiref/icu4c/classBreakIterator.html
// http://icu-project.org/apiref/icu4c/classRegexMatcher.html
// http://icu-project.org/apiref/icu4c/classRegexPattern.html
#ifdef WINDOWS
// Хак для печати UTF-8 в Windows-консоли в обход стандартных операторов <<.
std::ostream& operator<<(std::ostream& os, const std::string& s)
{
static HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
static HANDLE stderr_handle = GetStdHandle(STD_ERROR_HANDLE);
if (os == std::cout)
{
DWORD characters_written;
WriteConsoleA(stdout_handle, s.data(), s.length(), &characters_written, NULL);
}
else if (os == std::cerr)
{
DWORD characters_written;
WriteConsoleA(stderr_handle, s.data(), s.length(), &characters_written, NULL);
}
else
{
os.write(s.data(), s.length());
}
return os;
}
#endif
// Reads UTF-8 lines from `is`, lowercases them and writes every maximal run
// of characters matching the set [\p{Letter}] to `os`, one word per line,
// encoded as UTF-8.  Terminates the process with status 1 if the set pattern
// cannot be applied.
void PrintWordsWithUnicodeSet(std::istream& is, std::ostream& os)
{
    UnicodeSet allowed_characters;
    {
        UErrorCode error_code = U_ZERO_ERROR;
        allowed_characters.applyPattern("[\\p{Letter}]", error_code);
        if (U_FAILURE(error_code))
        {
            std::cerr << "Failed to create set of allowed characters." << std::endl;
            std::exit(1);
        }
    }

    std::string buffer;
    while (std::getline(is, buffer))
    {
        UnicodeString current_line = UnicodeString::fromUTF8(StringPiece(buffer.c_str(), buffer.length()));
        current_line.toLower();

        const int32_t length = current_line.length();
        int32_t i = 0;
        while (i < length)
        {
            // Classify whole code points, not UTF-16 code units, so that
            // letters stored as surrogate pairs are not cut out of words.
            int32_t j = i;
            while (j < length && allowed_characters.contains(current_line.char32At(j)))
            {
                j = current_line.moveIndex32(j, 1);
            }

            if (j > i)
            {
                UnicodeString word(current_line, i, j - i);
                // The input line has already been copied into current_line,
                // so the byte buffer can be reused for the output word.
                buffer.clear();
                word.toUTF8String(buffer);
                os << buffer << std::endl;
                i = j;
            }
            else
            {
                // Not a letter: step over this one code point.
                i = current_line.moveIndex32(i, 1);
            }
        }
    }
}
// Reads UTF-8 lines from `is`, lowercases them and writes every maximal run
// of characters for which u_isUAlphabetic() holds to `os`, one word per
// line, encoded as UTF-8.
void PrintWordsWithIsAlphabetic(std::istream& is, std::ostream& os)
{
    std::string buffer;
    while (std::getline(is, buffer))
    {
        UnicodeString current_line = UnicodeString::fromUTF8(StringPiece(buffer.c_str(), buffer.length()));
        current_line.toLower();

        const int32_t length = current_line.length();
        int32_t i = 0;
        while (i < length)
        {
            // Classify whole code points, not UTF-16 code units, so that
            // letters stored as surrogate pairs are not cut out of words.
            int32_t j = i;
            while (j < length && u_isUAlphabetic(current_line.char32At(j)))
            {
                j = current_line.moveIndex32(j, 1);
            }

            if (j > i)
            {
                UnicodeString word(current_line, i, j - i);
                // The input line has already been copied into current_line,
                // so the byte buffer can be reused for the output word.
                buffer.clear();
                word.toUTF8String(buffer);
                os << buffer << std::endl;
                i = j;
            }
            else
            {
                // Not alphabetic: step over this one code point.
                i = current_line.moveIndex32(i, 1);
            }
        }
    }
}
void PrintWordsBySpaces(std::istream& is, std::ostream& os)
{
UnicodeString current_line;
std::string buffer;
while (std::getline(is, buffer))
{
current_line = UnicodeString::fromUTF8(StringPiece(buffer.c_str(), buffer.length()));
current_line.toLower();
int32_t i, j;
for (i = 0; i < current_line.length(); ++i)
{
for (j = i; j < current_line.length() && !u_isUWhiteSpace(current_line[j]); ++j)
{
}
if (j > i)
{
UnicodeString word(current_line, i, j - i);
buffer.clear();
word.toUTF8String(buffer);
os << buffer << std::endl;
}
i = j;
}
}
}
// Entry point: prints the words of the UTF-8 file named by the first
// command-line argument to standard output.  Returns 1 when the argument is
// missing or the file cannot be opened, 0 otherwise.
int main(int argc, char** argv)
{
#ifdef WINDOWS
    // Switches the console to UTF-8 and puts the previous input/output code
    // pages back when main() is left -- including the early error returns.
    struct ConsoleCodePageGuard
    {
        UINT previous_cp;
        UINT previous_output_cp;

        ConsoleCodePageGuard()
            : previous_cp(GetConsoleCP())
            , previous_output_cp(GetConsoleOutputCP())
        {
            ::SetConsoleCP(CP_UTF8);
            ::SetConsoleOutputCP(CP_UTF8);
        }

        ~ConsoleCodePageGuard()
        {
            ::SetConsoleCP(previous_cp);
            ::SetConsoleOutputCP(previous_output_cp);
        }
    } console_code_page_guard;
#endif

    if (argc < 2)
    {
        std::cerr << "Please, specify input file as an argument." << std::endl;
        return 1;
    }

    std::ifstream input_file(argv[1], std::ios::binary | std::ios::in);
    if (!input_file)
    {
        std::cerr << "Cannot open file." << std::endl;
        return 1;
    }

    // Exactly one of the three splitting strategies is enabled at a time.
    //PrintWordsWithUnicodeSet(input_file, std::cout);
    //PrintWordsWithIsAlphabetic(input_file, std::cout);
    PrintWordsBySpaces(input_file, std::cout);

    return 0;
}
#!/usr/bin/python
# For Yandex Data Analysis School
"""Takes stripped MediaWiki XML dump and calculates different statistics."""
import sys
import unicodedata
from optparse import OptionParser
# Single-character predicates built on the Unicode general category.
def IS_PUNCTUATION(u):
    """Tell whether u belongs to a punctuation category (P*)."""
    return unicodedata.category(u).startswith("P")


def IS_WHITESPACE(u):
    """Tell whether u belongs to a separator category (Z*)."""
    return unicodedata.category(u).startswith("Z")


def IS_LETTER(u):
    """Tell whether u belongs to a letter category (L*)."""
    return unicodedata.category(u).startswith("L")


def IS_NUMBER(u):
    """Tell whether u belongs to a number category (N*)."""
    return unicodedata.category(u).startswith("N")


def IS_LOWERCASE(u):
    """Tell whether u is a lowercase letter (category Ll)."""
    return "Ll" == unicodedata.category(u)


def IS_UPPERCASE(u):
    """Tell whether u is an uppercase letter (category Lu)."""
    return "Lu" == unicodedata.category(u)


def IS_CYRILLIC(u):
    """Tell whether the code point of u lies in U+0400..U+04FF."""
    return 0x0400 <= ord(u) <= 0x04FF
def smart_open(filename, mode = "rb", encoding = "utf-8"):
    """Open a plain, gzip or bzip2 file (or a standard stream) in binary mode.

    Arguments:
        filename -- path to open.  "-" selects sys.stdin or sys.stdout; names
            ending in ".gz" or ".bz2" are opened through gzip / bz2.
        mode -- open()-style mode; "b" is appended when it is missing.
        encoding -- when truthy, the handle is wrapped in a codecs stream
            reader or writer for that encoding using the "ignore" error
            policy; pass None to get the raw byte-oriented handle.

    Returns the (possibly wrapped) file-like object.  The caller closes it.
    """
    if "b" not in mode:
        mode = mode + "b"

    # Append mode produces output as well: it needs stdout and an encoding
    # writer, not stdin and a reader.
    writing = "w" in mode or "a" in mode

    if filename == "-":
        stream = sys.stdout if writing else sys.stdin
        # Text-mode standard streams expose their byte stream as ``buffer``;
        # streams without that attribute are used as they are.
        handle = getattr(stream, "buffer", stream)
    elif filename.endswith(".gz"):
        import gzip
        handle = gzip.open(filename, mode)
    elif filename.endswith(".bz2"):
        import bz2
        handle = bz2.BZ2File(filename, mode)
    else:
        handle = open(filename, mode)

    if encoding:
        import codecs
        if writing:
            handle = codecs.getwriter(encoding)(handle, "ignore")
        else:
            handle = codecs.getreader(encoding)(handle, "ignore")
    return handle
def tokenize(stream):
    """Yield the lowercased tokens of every line of stream.

    Lines are split on white space.  A token that starts with "<" is kept
    whole; any other token is reduced to the characters accepted by
    IS_LETTER.  Tokens that end up empty are skipped.
    """
    for line in stream:
        for token in line.split():
            if token[0] == u"<":
                # Markup token: keep every character.
                normalized = token.lower()
            else:
                letters = [ch for ch in token if IS_LETTER(ch)]
                normalized = u"".join(letters).lower()
            if normalized:
                yield normalized
def calculate_statistics(word_stream):
    """Count documents, words, pointers and distinct words in a token stream.

    word_stream yields markup tokens ("<page>", "<title>", "<main_text>" and
    their closing forms) interleaved with words.  The tokens listed in
    BLOCKED_WORDS are ignored.  Four totals are written to standard error:

        # Documents       -- number of "<page>" tokens
        # Words           -- number of word tokens
        # Pointers        -- sum over documents of their distinct words
        # Distinct Words  -- size of the vocabulary across all documents

    Raises AssertionError when the markup tokens are not properly nested.
    """
    number_of_documents = 0
    number_of_words = 0
    number_of_pointers = 0
    global_vocabulary = set()
    local_vocabulary = set()
    in_page = False
    in_title = False
    in_text = False
    BLOCKED_WORDS = frozenset(u"<b> <h> <i> <id> <ref </b> </h> </i> </id> </ref>".split())
    for word in word_stream:
        if word in BLOCKED_WORDS:
            continue
        elif word == u"<main_text>":
            assert(in_page and not in_title and not in_text)
            in_text = True
        elif word == u"</main_text>":
            assert(in_page and not in_title and in_text)
            in_text = False
        elif word == u"<page>":
            assert(not in_page and not in_title and not in_text)
            in_page = True
            number_of_documents += 1
            # A new document starts: account for the previous one.
            number_of_pointers += len(local_vocabulary)
            local_vocabulary.clear()
        elif word == u"</page>":
            assert(in_page and not in_title and not in_text)
            in_page = False
        elif word == u"<title>":
            assert(in_page and not in_title and not in_text)
            in_title = True
        elif word == u"</title>":
            assert(in_page and in_title and not in_text)
            in_title = False
        else:
            assert(in_page and (in_title or in_text))
            number_of_words += 1
            global_vocabulary.add(word)
            local_vocabulary.add(word)
    # No "<page>" follows the last document, so its vocabulary has to be
    # accounted for here; otherwise the final document is left out.
    number_of_pointers += len(local_vocabulary)

    report = (
        u"# Documents: {0}".format(number_of_documents),
        u"# Words: {0}".format(number_of_words),
        u"# Pointers: {0}".format(number_of_pointers),
        u"# Distinct Words: {0}".format(len(global_vocabulary)),
    )
    for report_line in report:
        sys.stderr.write(report_line + u"\n")
if __name__ == "__main__":
    # Command-line entry point: compute the statistics of the dump named
    # by -i.
    parser = OptionParser(description = __doc__)
    parser.add_option("-i", "--input", metavar = "FILE", help = "read dump from FILE")
    (options, args) = parser.parse_args()
    if not options.input:
        # parser.error() prints the usage message and terminates the
        # program itself, so no statement after it can ever run.
        parser.error("-i option is required.")
    input_file = smart_open(options.input)
    try:
        calculate_statistics(tokenize(input_file))
    finally:
        input_file.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment