Last active
August 16, 2023 18:29
-
-
Save IevaZarina/251215fd762d95ac973cd90bff32ccb3 to your computer and use it in GitHub Desktop.
Cleaning lines of text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# Ieva Zarina, 2016, licensed under the Apache 2.0 license
import re | |
def is_digit(word):
    """Return True if ``word`` can be converted with ``int()``, else False.

    The check is exactly "does ``int(word)`` succeed", so anything ``int()``
    accepts counts as a number here (for example a signed value such as
    ``"-5"``), while decimals such as ``"3.3"`` and the empty string do not.

    Args:
        word: The token to test, normally a string.

    Returns:
        bool: True when ``int(word)`` succeeds, False otherwise.
    """
    try:
        int(word)
    except (TypeError, ValueError):
        # ValueError: text that is not an integer literal.
        # TypeError: objects int() cannot convert at all (e.g. None); a
        # predicate should answer False for those instead of raising.
        return False
    return True
# Pairs of [accented letter, plain Latin replacement].
# NOTE: despite the name, the letters listed carry acute and caron accents,
# not cedillas; the name is kept so existing references keep working.
cedilla2latin = [[u'Á', u'A'], [u'á', u'a'], [u'Č', u'C'], [u'č', u'c'], [u'Š', u'S'], [u'š', u's']]

# Lookup table built from the pairs above: accented letter -> replacement.
tr = dict(cedilla2latin)


def transliterate(line):
    """Replace every character found in ``tr`` with its Latin counterpart.

    Characters that have no entry in ``tr`` are copied through unchanged.

    Args:
        line: The text to transliterate (a unicode string).

    Returns:
        The text with each mapped character substituted.
    """
    # A single join avoids rebuilding the result string once per character.
    return "".join(tr.get(letter, letter) for letter in line)
# Sample input: one line mixing punctuation, numbers and accented letters.
text = ['This is dirty TEXT: A phone number +001234561234, moNey 3.333, some date like 09.08.2016 and weird Čárákterš.']

# Clean each line: strip selected punctuation, drop numeric tokens,
# transliterate accented letters, lowercase, and print the result.
for line in text:
    # Decode the line to work with UTF-8 symbols. Only byte strings need
    # decoding (Python 2 literals); Python 3 str objects are already text and
    # have no .decode() method.
    if isinstance(line, bytes):
        line = line.decode('utf8')
    # Turn the punctuation that glues numbers together into spaces.
    line = line.replace('+', ' ').replace('.', ' ').replace(',', ' ').replace(':', ' ')
    # Remove digits with a regex. Each match also consumes the non-word
    # character on either side, so of two numbers separated by a single
    # space only the first is matched; the pass below removes the rest.
    line = re.sub(r"(^|\W)\d+($|\W)", " ", line)
    # OR remove digits by casting to int: keep only the words that
    # is_digit() rejects.
    new_line = [word for word in line.split() if not is_digit(word)]
    line = " ".join(new_line)
    # Transliterate to Latin characters.
    line = transliterate(line)
    line = line.lower()
    # Parenthesised form works as a statement on Python 2 and a call on 3.
    print(line)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi!
Wouldn't
line.replace('+|.|,|:', ' ')
do the same as
line.replace('+', ' ').replace('.', ' ').replace(',', ' ').replace(':', ' ')?
To remove all punctuations I found string library to be powerful:
import string
punc= str.maketrans('', '', string.punctuation)
nopuncline = [w.translate(punc) for w in line]
;) Awesome job. Thanks for sharing.