Cleaning lines of text
# -*- coding: utf-8 -*- | |
# Ieva Zarina, 2016, licensed under the Apache 2.0 license
import re | |
def is_digit(word):
    """Return True if *word* can be converted to an integer with ``int()``.

    Because the check delegates to ``int()``, signed values and values
    padded with whitespace (for example ``"-7"`` or ``" 42 "``) count as
    digits, while decimals such as ``"3.3"`` and the empty string do not.

    Args:
        word: The token to test, normally a string.

    Returns:
        True when ``int(word)`` succeeds, False otherwise. Values that
        ``int()`` rejects with ``TypeError`` (such as ``None``) yield False
        instead of propagating the exception.
    """
    try:
        int(word)
    except (TypeError, ValueError):
        # ValueError: not an integer literal; TypeError: unsupported type.
        return False
    return True
# Pairs of [accented character, plain Latin replacement].
cedilla2latin = [[u'Á', u'A'], [u'á', u'a'], [u'Č', u'C'], [u'č', u'c'], [u'Š', u'S'], [u'š', u's']]
# Lookup table built from the pairs above: accented character -> replacement.
tr = dict(cedilla2latin)


def transliterate(line):
    """Replace every character listed in ``tr`` with its Latin counterpart.

    Characters that have no entry in ``tr`` are copied through unchanged.

    Args:
        line: The text to transliterate.

    Returns:
        A new string of the same length as *line* with the mapped
        characters substituted.
    """
    # A single join avoids rebuilding the string once per character.
    return "".join(tr.get(letter, letter) for letter in line)
text = ['This is dirty TEXT: A phone number +001234561234, moNey 3.333, some date like 09.08.2016 and weird Čárákterš.']

# Matches a run of digits together with the single non-word character (or
# string boundary) on each side of it. Compiled once, outside the loop.
_NUMBER_RE = re.compile(r"(^|\W)\d+($|\W)")


def clean_line(line):
    """Return *line* stripped of punctuation and numbers, in lowercase Latin.

    Steps, in order: decode byte strings as UTF-8, turn ``+ . , :`` into
    spaces, drop numbers, transliterate accented characters with
    ``transliterate`` and lowercase the result.

    Args:
        line: One line of text, either a byte string or a text string.

    Returns:
        The cleaned line with words separated by single spaces.
    """
    # Only byte strings need decoding; text strings are used as they are.
    if isinstance(line, bytes):
        line = line.decode('utf8')

    for mark in '+.,:':
        line = line.replace(mark, ' ')

    # First pass: remove numbers with the regex.
    line = _NUMBER_RE.sub(" ", line)

    # Second pass: drop any remaining all-digit words. The regex consumes the
    # separator next to each match, so of two numbers split by one space only
    # the first is removed; this pass catches the other.
    words = [word for word in line.split() if not is_digit(word)]
    line = " ".join(words)

    # Transliterate before lowercasing: the table has upper-case entries too.
    line = transliterate(line)
    return line.lower()


for line in text:
    line = clean_line(line)
    print(line)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This comment has been minimized.
Hi!
Wouldn't
line.replace('+|.|,|:', ' ')
do the same as
line.replace('+', ' ').replace('.', ' ').replace(',', ' ').replace(':', ' ')?
To remove all punctuation, I found the string library to be powerful:
import string
punc= str.maketrans('', '', string.punctuation)
nopuncline = [w.translate(punc) for w in line]
;) Awesome job. Thanks for sharing.