IevaZarina/text_cleaning.py

## text_cleaning.py
# -*- coding: utf-8 -*-
# Ieva Zarina, 2016, licensed under the Apache 2.0 license

import re

def is_digit(word):
    """Tell whether ``word`` is accepted by the built-in ``int()``.

    Returns True when ``int(word)`` succeeds and False when it raises
    ``ValueError``. Any other exception propagates to the caller.
    """
    try:
        int(word)
    except ValueError:
        return False
    else:
        return True

# Accented letters paired with the plain Latin letter that replaces them.
cedilla2latin = [
    [u'Á', u'A'], [u'á', u'a'],
    [u'Č', u'C'], [u'č', u'c'],
    [u'Š', u'S'], [u'š', u's'],
]
# Lookup table built from the pairs above: accented letter -> replacement.
# dict() consumes the two-item lists directly, so no comprehension (and no
# leftover loop variable at module level) is needed.
tr = dict(cedilla2latin)

def transliterate(line):
    """Return ``line`` with every character found in ``tr`` replaced.

    Each character of ``line`` is looked up in the module-level ``tr``
    table. Characters that have an entry are swapped for the mapped
    value; all other characters are kept as they are.
    """
    # Build the result with a single join instead of growing a string
    # with ``+=`` once per character.
    return "".join(tr.get(letter, letter) for letter in line)

# Sample input: one dirty line holding a phone number, an amount, a date
# and accented characters.
text = ['This is dirty TEXT: A phone number +001234561234, moNey 3.333, some date like 09.08.2016 and weird Čárákterš.']

for line in text:
    # Decode to work with utf8 symbols, but only when the line is still a
    # byte string: already-decoded text has no .decode() on Python 3.
    if isinstance(line, bytes):
        line = line.decode('utf8')
    # Turn the separator characters into spaces so numbers stand alone.
    line = line.replace('+', ' ').replace('.', ' ').replace(',', ' ').replace(':', ' ')
    # Remove digits with regex. The pattern consumes the delimiter on both
    # sides of a number, so the second of two numbers separated by a single
    # space is skipped here; the int() pass below removes those leftovers.
    line = re.sub(r"(^|\W)\d+($|\W)", " ", line)
    # Remove the remaining numeric words by trying to cast each one to int.
    new_line = [word for word in line.split() if not is_digit(word)]
    line = " ".join(new_line)
    # Transliterate to Latin characters.
    line = transliterate(line)
    line = line.lower()
    # Parenthesised form prints the same on Python 2 and Python 3.
    print(line)
	# -*- coding: utf-8 -*-
	# Ieva Zarina, 2016, licensed under the Apache 2.0 license

	import re

	def is_digit(word):
	    """Tell whether ``word`` is accepted by the built-in ``int()``.

	    Returns True when ``int(word)`` succeeds and False when it raises
	    ``ValueError``. Any other exception propagates to the caller.
	    """
	    # The body's indentation had been flattened; the try/except nesting
	    # is restored here.
	    try:
	        int(word)
	        return True
	    except ValueError:
	        return False

	# Accented letters paired with the plain Latin letter that replaces them.
	cedilla2latin = [[u'Á', u'A'], [u'á', u'a'], [u'Č', u'C'], [u'č', u'c'], [u'Š', u'S'], [u'š', u's']]
	# Lookup table: accented letter -> replacement. dict() consumes the
	# two-item lists directly, so no comprehension is needed.
	tr = dict(cedilla2latin)

	def transliterate(line):
	    """Return ``line`` with every character found in ``tr`` replaced.

	    Each character of ``line`` is looked up in the ``tr`` table.
	    Characters that have an entry are swapped for the mapped value;
	    all other characters are kept as they are.
	    """
	    # Build the result with a single join instead of growing a string
	    # with ``+=`` once per character.
	    return "".join(tr.get(letter, letter) for letter in line)

	# Sample input: one dirty line holding a phone number, an amount, a date
	# and accented characters.
	text = ['This is dirty TEXT: A phone number +001234561234, moNey 3.333, some date like 09.08.2016 and weird Čárákterš.']

	for line in text:
	    # Decode to work with utf8 symbols, but only when the line is still a
	    # byte string: already-decoded text has no .decode() on Python 3.
	    if isinstance(line, bytes):
	        line = line.decode('utf8')
	    # Turn the separator characters into spaces so numbers stand alone.
	    line = line.replace('+', ' ').replace('.', ' ').replace(',', ' ').replace(':', ' ')
	    # Remove digits with regex. The alternation bars must be unescaped:
	    # written as "\|" they match a literal pipe and no number is removed.
	    # The pattern consumes the delimiter on both sides of a number, so the
	    # second of two numbers separated by a single space is skipped here;
	    # the int() pass below removes those leftovers.
	    line = re.sub(r"(^|\W)\d+($|\W)", " ", line)
	    # Remove the remaining numeric words by trying to cast each one to int.
	    new_line = [word for word in line.split() if not is_digit(word)]
	    line = " ".join(new_line)
	    # Transliterate to Latin characters.
	    line = transliterate(line)
	    line = line.lower()
	    # Parenthesised form prints the same on Python 2 and Python 3.
	    print(line)