Skip to content

Instantly share code, notes, and snippets.

@borman
Created April 7, 2013 20:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save borman/5332396 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# encoding: utf-8
#
# Map/Reduce word counter
# (c) Mikhail Borisov
# Running prereqs:
# - files 'lemmas.utf8.xml', 'paradigms.utf8.xml'
# in current working directory
# - data files in corpus_utf8/ subdirectory
import glob
import re
import codecs
import lxml.etree
from collections import Counter
# A word is a letter/digit/underscore followed by any run of word
# characters or hyphens (Unicode-aware, so Cyrillic text matches too).
WORD_RE = re.compile(r'\w[\w-]*', re.UNICODE)
def words(line):
    '''Yield each word-like token found in *line*, left to right.'''
    for match in WORD_RE.finditer(line):
        yield match.group(0)
def count_words(lines, word_mapper):
    '''Tally words over an iterable of text lines.

    Every token produced by words() is passed through *word_mapper*
    before counting; the result is a collections.Counter keyed by the
    mapped words.
    '''
    mapped_tokens = (word_mapper(token)
                     for text in lines
                     for token in words(text))
    return Counter(mapped_tokens)
def count_words_in_file(filename, word_mapper):
    '''Count mapped words in the UTF-8 text file *filename*.

    Opens the file via codecs so lines are decoded to unicode before
    tokenizing; returns the Counter built by count_words().
    '''
    with codecs.open(filename, 'r', 'utf-8') as stream:
        return count_words(stream, word_mapper)
class Lemmatizer(object):
    '''Maps inflected word forms to their base (dictionary) forms.

    Built from two morphology files expected in the current working
    directory: 'paradigms.utf8.xml' (inflection paradigms) and
    'lemmas.utf8.xml' (stems with paradigm references). Lookup keys are
    normalized to upper case with Ё folded to Е.
    '''

    def __init__(self):
        paradigms = self.load_paradigms()
        lemmas = self.load_lemmas()
        mapping = dict()
        for stem, paradigm_id in lemmas:
            forms = paradigms[paradigm_id]
            # The paradigm's first (prefix, suffix) pair defines the
            # base (dictionary) form; note it keeps its original Ё.
            base_form = forms[0][0] + stem + forms[0][1]
            for prefix, suffix in forms:
                inflected = prefix + stem + suffix
                # Normalize Ё -> Е so lookups tolerate either spelling.
                inflected = inflected.replace(u'Ё', u'Е')
                mapping[inflected] = base_form
        self.mapping = mapping

    @staticmethod
    def load_paradigms():
        '''Parse 'paradigms.utf8.xml' into {paradigm_id: [(prefix, suffix), ...]}.'''
        paradigms = dict()
        with open('paradigms.utf8.xml') as f:
            tree = lxml.etree.parse(f)
            for paradigm_node in tree.getroot().getchildren():
                paradigm_id = int(paradigm_node.attrib['id'])
                pairs = []
                for form_node in paradigm_node.getchildren():
                    # Only <f> children carry form data; 'p'/'s' default
                    # to empty prefix/suffix when absent.
                    if form_node.tag == 'f':
                        pairs.append((form_node.attrib.get('p', ''),
                                      form_node.attrib.get('s', '')))
                paradigms[paradigm_id] = pairs
        return paradigms

    @staticmethod
    def load_lemmas():
        '''Parse 'lemmas.utf8.xml' into a list of (lemma, paradigm_id) pairs.'''
        with open('lemmas.utf8.xml') as f:
            tree = lxml.etree.parse(f)
            pairs = []
            for lemma_node in tree.getroot().getchildren():
                # Only <l> children are lemma entries.
                if lemma_node.tag == 'l':
                    pairs.append((lemma_node.attrib['id'],
                                  int(lemma_node.attrib['p'])))
            return pairs

    def __call__(self, word):
        '''Lemmatizer: unicode -> lemma(unicode) | '*' + unicode

        Unknown words come back upper-cased with a '*' prefix so they
        stay distinguishable in the counts.
        '''
        key = word.upper().replace(u'Ё', u'Е')
        try:
            return self.mapping[key]
        except KeyError:
            return '*' + key
###########################################################################
#                             Octo.Py below                               #
###########################################################################
# Octo.Py job configuration: one map task per corpus file; the dict
# values are unused placeholders. The lemmatizer is built lazily inside
# mapfn (once per worker) rather than at import time.
source = {filename: '' for filename in glob.iglob('corpus_utf8/*.txt')}
lemmatizer = None
def mapfn(filename, value):
    '''Octo.Py map step: emit (word, count) pairs for one corpus file.

    *value* is ignored (the source dict stores empty placeholders). The
    shared Lemmatizer is constructed on first use and cached in the
    module-global 'lemmatizer' so each worker builds it only once.
    '''
    global lemmatizer
    if lemmatizer is None:
        lemmatizer = Lemmatizer()
    return count_words_in_file(filename, lemmatizer).iteritems()
def reducefn(key, value):
    '''Octo.Py reduce step: total the per-file counts for one word.'''
    total = 0
    for count in value:
        total += count
    return key, total
def final(key, value):
    # Octo.Py final step: report one reduced (word, total) pair on
    # stdout. Python 2 print statement — kept as-is for the py2 runtime.
    print 'FINAL', key, value
def main():
# Running sample
l = Lemmatizer()
d = count_words_in_file('text.txt', l)
for word, count in sorted(d.iteritems(), key=lambda(a, b): b, reverse=True):
print word.encode('utf-8'), '=>', count
# When executed directly, run the standalone demo; when imported (e.g.
# by the Octo.Py runner) only the definitions above are exposed.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment