import random
import re
from random import shuffle

import albumentations
from albumentations.core.transforms_interface import BasicTransform, DualTransform
from nltk import sent_tokenize
# ISO 639-1 language code -> language name understood by NLTK's Punkt
# sentence tokenizer (the second argument of ``sent_tokenize``).
# Unknown codes fall back to 'english' at the call sites.
LANGS = {
    'en': 'english',
    'it': 'italian',
    'fr': 'french',
    'es': 'spanish',
    'tr': 'turkish',
    'ru': 'russian',
    'pt': 'portuguese'
}
def get_sentences(text, lang='en'):
    """Split *text* into a list of sentences.

    Example: "I am A. I love B." -> ["I am A.", "I love B."]
    Unknown language codes fall back to English tokenization.
    """
    language = LANGS.get(lang, 'english')
    return sent_tokenize(text, language)
def exclude_duplicate_sentences(text, lang='en'):
    """Drop repeated sentences, keeping the first occurrence of each in order."""
    # dict preserves insertion order, so this is an order-preserving dedup
    # of the stripped sentences.
    unique = dict.fromkeys(s.strip() for s in get_sentences(text, lang))
    return ' '.join(unique)
def clean_text(text, lang='en'):
    """Normalize *text* for downstream NLP use.

    Removes digits and double quotes, '#hashtags', '@mentions' and URLs,
    collapses whitespace runs to single spaces, then drops duplicate
    sentences and strips surrounding whitespace.
    """
    cleaned = str(text)
    # Apply the removal patterns in the same order as the individual transforms.
    for pattern in (r'[0-9"]', r'#[\S]+\b', r'@[\S]+\b', r'https?\S+'):
        cleaned = re.sub(pattern, '', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)  # collapse any whitespace run
    return exclude_duplicate_sentences(cleaned, lang).strip()
class NLPTransform(BasicTransform):
    """Base albumentations transform for NLP tasks.

    Operates on a single ``"data"`` target, which is expected to be a
    ``(text, lang)`` tuple.
    """

    @property
    def targets(self):
        # Route the "data" target through apply().
        return {"data": self.apply}

    def update_params(self, params, **kwargs):
        # Forward optional interpolation / fill_value settings when the
        # concrete transform defines them.
        for attr in ("interpolation", "fill_value"):
            if hasattr(self, attr):
                params[attr] = getattr(self, attr)
        return params

    def get_sentences(self, text, lang='en'):
        """Tokenize *text* into sentences for the given language code."""
        return sent_tokenize(text, LANGS.get(lang, 'english'))
class ShuffleSentencesTransform(NLPTransform):
    """Randomly reorder the sentences of the input text."""

    def __init__(self, always_apply=False, p=0.5):
        super().__init__(always_apply, p)

    def apply(self, data, **params):
        text, lang = data
        parts = self.get_sentences(text, lang)
        random.shuffle(parts)  # in-place, uses the global RNG
        return ' '.join(parts), lang
class ExcludeDuplicateSentencesTransform(NLPTransform):
    """Remove repeated sentences, keeping first occurrences in order."""

    def __init__(self, always_apply=False, p=0.5):
        super().__init__(always_apply, p)

    def apply(self, data, **params):
        text, lang = data
        # dict preserves insertion order -> order-preserving dedup of
        # the stripped sentences.
        unique = dict.fromkeys(
            s.strip() for s in self.get_sentences(text, lang)
        )
        return ' '.join(unique), lang
class ExcludeNumbersTransform(NLPTransform):
    """Strip every decimal digit from the text."""

    def __init__(self, always_apply=False, p=0.5):
        super().__init__(always_apply, p)

    def apply(self, data, **params):
        text, lang = data
        without_digits = re.sub(r'[0-9]', '', text)
        # Removing characters can leave double spaces; collapse them.
        normalized = re.sub(r'\s+', ' ', without_digits)
        return normalized, lang
class ExcludeHashtagsTransform(NLPTransform):
    """Remove '#hashtag' tokens from the text."""

    def __init__(self, always_apply=False, p=0.5):
        super().__init__(always_apply, p)

    def apply(self, data, **params):
        text, lang = data
        stripped = re.sub(r'#[\S]+\b', '', text)
        # Removing tokens can leave double spaces; collapse them.
        return re.sub(r'\s+', ' ', stripped), lang
class ExcludeUsersMentionedTransform(NLPTransform):
    """Remove '@user' mention tokens from the text."""

    def __init__(self, always_apply=False, p=0.5):
        super().__init__(always_apply, p)

    def apply(self, data, **params):
        text, lang = data
        stripped = re.sub(r'@[\S]+\b', '', text)
        # Removing tokens can leave double spaces; collapse them.
        return re.sub(r'\s+', ' ', stripped), lang
class ExcludeUrlsTransform(NLPTransform):
    """Remove http/https URLs from the text."""

    def __init__(self, always_apply=False, p=0.5):
        super().__init__(always_apply, p)

    def apply(self, data, **params):
        text, lang = data
        stripped = re.sub(r'https?\S+', '', text)
        # Removing URLs can leave double spaces; collapse them.
        return re.sub(r'\s+', ' ', stripped), lang