Skip to content

Instantly share code, notes, and snippets.

@Gunni
Last active February 7, 2021 23:33
Show Gist options
  • Save Gunni/7d2e14b49d3f0483666843e31b2b358f to your computer and use it in GitHub Desktop.
Save Gunni/7d2e14b49d3f0483666843e31b2b358f to your computer and use it in GitHub Desktop.
I hate number words
import re
import nltk as nltk
from word2number import w2n
def replaceTextnumberWithNumber(text):
    """Replace spelled-out English numbers in *text* with digits.

    The lower-cased text is tokenized and every token that is one of the
    known number words ("zero" .. "twenty", the tens, "hundred",
    "thousand") is tagged ``CD``.  Each maximal run of consecutive ``CD``
    tokens is treated as one number phrase, converted with
    ``w2n.word_to_num`` and substituted case-insensitively in the
    original text.

    Args:
        text: The string to process.

    Returns:
        ``text`` with every recognised number phrase replaced by its
        decimal representation; everything else is left untouched.

    Raises:
        Whatever ``w2n.word_to_num`` raises for a phrase it cannot
        interpret; such errors are not handled here.

    NOTE(review): ``nltk.word_tokenize`` presumably needs the NLTK
    tokenizer data to be installed -- confirm in the deployment setup.
    """
    tagged_number_words = 'ten/CD thousand/CD nine/CD hundred/CD ninety/CD eight/CD seven/CD six/CD five/CD four/CD three/CD two/CD one/CD eighty/CD seventy/CD sixty/CD fifty/CD forty/CD thirty/CD twenty/CD nineteen/CD eighteen/CD seventeen/CD sixteen/CD fifteen/CD fourteen/CD thirteen/CD twelve/CD eleven/CD zero/CD'
    tagged_number_words_tuples = [
        nltk.tag.str2tuple(t) for t in tagged_number_words.split()
    ]
    # Unigram lookup for the number vocabulary; any other token falls
    # through to the default 'IGNORE' tag.
    my_tagger = nltk.UnigramTagger(
        [tagged_number_words_tuples],
        backoff=nltk.DefaultTagger('IGNORE'),
    )
    # One or more consecutive CD tokens form a single NumberWord chunk.
    parser = nltk.RegexpParser('NumberWord: {<CD>+}')
    parsed = parser.parse(my_tagger.tag(nltk.word_tokenize(text.lower())))

    # Collect each distinct phrase once, as a tuple of its words.
    phrases = {
        tuple(nltk.untag(tree.leaves()))
        for tree in parsed.subtrees()
        if tree.label() == 'NumberWord'
    }

    # Substitute the longest phrases first so that a short phrase such as
    # "twenty" cannot rewrite part of a longer one such as "twenty one"
    # before the longer one has been handled.  The secondary sort key
    # only makes the order deterministic.
    for words in sorted(phrases, key=lambda ws: (-len(ws), ws)):
        num = w2n.word_to_num(' '.join(words))
        # \b keeps number words embedded in other words ("someone",
        # "often") intact; \s+ tolerates any whitespace run between the
        # words of a phrase in the original text.
        pattern = re.compile(
            r'\b' + r'\s+'.join(re.escape(w) for w in words) + r'\b',
            re.IGNORECASE,
        )
        text = pattern.sub(str(num), text)
    return text
import unittest
from .helpers import replaceTextnumberWithNumber
class TestReplaceTextnumberWithNumber(unittest.TestCase):
    """Unit tests for ``replaceTextnumberWithNumber``."""

    def test_number(self):
        # A lone number word is turned into its digit.
        converted = replaceTextnumberWithNumber('four')
        self.assertEqual(converted, '4')

    def test_number_in_a_sentence(self):
        # A two-word number inside a sentence becomes a single value.
        sentence = 'There were forty two of them'
        converted = replaceTextnumberWithNumber(sentence)
        self.assertEqual(converted, 'There were 42 of them')

    def test_multiple_numbers_in_a_sentence(self):
        # Capitalised number words at separate positions are each
        # replaced while the surrounding text keeps its casing.
        title = 'Example Chapter Title: Chapter Twenty (End of Book One)'
        converted = replaceTextnumberWithNumber(title)
        self.assertEqual(
            converted,
            'Example Chapter Title: Chapter 20 (End of Book 1)',
        )
# Run the test cases when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment