Created
February 22, 2013 06:21
-
-
Save msabramo/5011153 to your computer and use it in GitHub Desktop.
Tests for PyStemmer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unittest | |
class PyStemmerBaseTestCase(unittest.TestCase):
    """Shared helpers for the PyStemmer test cases in this file."""

    def import_pystemmer(self):
        """Import the top-level ``Stemmer`` module by name and return it."""
        return __import__('Stemmer')

    def get_stemmer(self, lang):
        """Return ``Stemmer.Stemmer(lang)`` built from a freshly looked-up module.

        ``lang`` is handed straight to the ``Stemmer`` constructor.
        """
        module = self.import_pystemmer()
        stemmer_factory = module.Stemmer
        return stemmer_factory(lang)
class PyStemmerGenericTests(PyStemmerBaseTestCase):
    """Language-independent checks on the imported ``Stemmer`` module."""

    def _assert_module_has(self, attribute):
        # Import the module anew for each check, then verify the attribute
        # is present on it.
        module = self.import_pystemmer()
        self.assertTrue(hasattr(module, attribute))

    def test_import(self):
        """The module imports and exposes ``__file__``."""
        self._assert_module_has('__file__')

    def test_has_version(self):
        """The module exposes a ``version`` attribute."""
        self._assert_module_has('version')

    def test_has_algorithms(self):
        """The module exposes an ``algorithms`` attribute."""
        self._assert_module_has('algorithms')
class PyStemmerEnglishTests(PyStemmerBaseTestCase):
    """Tests for the 'english' stemmer."""

    def setUp(self):
        self.stemmer = self.get_stemmer('english')

    def test_stemWord(self):
        """A single word is reduced to its expected stem."""
        self.assertEqual(self.stemmer.stemWord('cycling'), 'cycl')

    def test_stemWords(self):
        """A list of words is stemmed element by element."""
        self.assertEqual(
            self.stemmer.stemWords(['cycling', 'cyclist']),
            ['cycl', 'cyclist'])

    def test_stemWords_unicode_simple(self):
        """A list mixing plain and ``u''`` string literals is accepted."""
        self.assertEqual(
            self.stemmer.stemWords(['cycling', u'cyclist']),
            ['cycl', u'cyclist'])

    def get_s_words_file(self):
        """Open ``s_words.txt`` from this module's directory and return it.

        The caller owns the returned file object and is responsible for
        closing it (for example by using it in a ``with`` statement).
        """
        import os
        here = os.path.dirname(__file__)
        s_words_file = open(os.path.join(here, 's_words.txt'))
        return s_words_file

    def test_stemWord_many_times(self):
        """Stem every line of ``s_words.txt``; passes if nothing raises."""
        # This test runs stemWord on a large number of words (15916)
        # so that we force cache purging to be tested
        #
        # The ``with`` block closes the word list even if stemWord raises;
        # previously the handle was never closed.
        with self.get_s_words_file() as s_words_file:
            for word in s_words_file:
                self.stemmer.stemWord(word.strip())
class PyStemmerFrenchTests(PyStemmerBaseTestCase):
    """Tests for the 'french' stemmer."""

    def setUp(self):
        self.stemmer = self.get_stemmer('french')

    def test_stemWord(self):
        """A single French word is reduced to its expected stem."""
        actual = self.stemmer.stemWord('cyclisme')
        expected = 'cyclism'
        self.assertEqual(actual, expected)
class PyStemmerGermanTests(PyStemmerBaseTestCase):
    """Tests for the 'german' stemmer."""

    def setUp(self):
        self.stemmer = self.get_stemmer('german')

    def test_stemWord(self):
        """Check one single-word input and one input containing a space."""
        # (input, expected stem) pairs, checked in order; the first
        # mismatch fails the test.
        expectations = (
            ('Fahrradfahren', 'Fahrradfahr'),
            ('Rad fahren', 'Rad fahr'),
        )
        for text, expected in expectations:
            self.assertEqual(self.stemmer.stemWord(text), expected)
class PyStemmerRussianTests(PyStemmerBaseTestCase):
    """Tests for the 'russian' stemmer."""

    def setUp(self):
        self.stemmer = self.get_stemmer('russian')

    def test_stemWord(self):
        """A unicode input with spaces comes back with only its ending shortened."""
        # Both literals are written with \u escapes so the source file
        # itself stays ASCII-only.
        phrase = u'\u0441\u043e\u0432\u0435\u0440\u0448\u0430\u0442\u044c \u0446\u0438\u043a\u043b \u0440\u0430\u0437\u0432\u0438\u0442\u0438\u044f'
        expected = u'\u0441\u043e\u0432\u0435\u0440\u0448\u0430\u0442\u044c \u0446\u0438\u043a\u043b \u0440\u0430\u0437\u0432\u0438\u0442'
        actual = self.stemmer.stemWord(phrase)
        self.assertEqual(actual, expected)
class PyStemmerHungarianTests(PyStemmerBaseTestCase):
    """Tests for the 'hungarian' stemmer."""

    def setUp(self):
        self.stemmer = self.get_stemmer('hungarian')

    def test_stemWord(self):
        """A unicode input with accented characters yields the expected stem."""
        # Non-ASCII characters are written with \x escapes inside unicode
        # literals.
        phrase = u'Fut\xe1s k\xf6zben'
        expected = u'Fut\xe1s k\xf6z'
        actual = self.stemmer.stemWord(phrase)
        self.assertEqual(actual, expected)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This needs the s_words.txt file at https://gist.github.com/msabramo/5011183