Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Cleaning lines of text
# -*- coding: utf-8 -*-
# Ieva Zarina, 2016, licensed under the Apache 2.0 licence
import re
def is_digit(word):
    """Return True if `word` can be converted to an integer by int().

    Anything int() accepts counts, so a signed token such as "-5" is
    treated as a number, while "3.3" or "abc" is not.

    Values int() cannot handle at all (for example None) yield False
    instead of raising.
    """
    try:
        int(word)
    except (ValueError, TypeError):
        # ValueError: a string that is not an integer literal.
        # TypeError: an object int() does not know how to convert.
        return False
    return True
# Pairs of [accented letter, plain Latin replacement].
# NOTE: despite the name, these letters carry acute accents and carons,
# not cedillas; the name is kept so existing references keep working.
cedilla2latin = [[u'Á', u'A'], [u'á', u'a'], [u'Č', u'C'], [u'č', u'c'], [u'Š', u'S'], [u'š', u's']]
# Same pairs as a dict for constant-time lookup by accented letter.
tr = {accented: plain for accented, plain in cedilla2latin}


def transliterate(line):
    """Return `line` with every character found in `tr` replaced.

    Characters that have no entry in `tr` are copied through unchanged.
    """
    # A single join avoids rebuilding the string once per character.
    return "".join(tr.get(letter, letter) for letter in line)
# Sample input that exercises every cleaning step below.
text = ['This is dirty TEXT: A phone number +001234561234, moNey 3.333, some date like 09.08.2016 and weird Čárákterš.']
for line in text:
    # Decode the line to work with UTF-8 symbols as characters.
    # On Python 2 the literal above is a byte string; on Python 3 it is
    # already text (and has no decode()), so decode only when needed.
    if isinstance(line, bytes):
        line = line.decode('utf8')
    # Turn punctuation into spaces so that tokens separate cleanly.
    line = line.replace('+', ' ').replace('.', ' ').replace(',', ' ').replace(':', ' ')
    # Remove digit runs with a regex. The pattern consumes the separator
    # on each side, so of two numbers sharing one separator only the
    # first is removed in this pass.
    line = re.sub(r"(^|\W)\d+($|\W)", " ", line)
    # Second pass: drop every remaining whitespace-separated token that
    # int() accepts; this catches the numbers the regex skipped.
    line = " ".join(word for word in line.split() if not is_digit(word))
    # Transliterate to Latin characters.
    line = transliterate(line)
    line = line.lower()
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print(line)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment