Skip to content

Instantly share code, notes, and snippets.

@seanbenhur
Created December 4, 2021 07:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save seanbenhur/f7f4c44d93ab4e229c9623a8b2f390c8 to your computer and use it in GitHub Desktop.
Save seanbenhur/f7f4c44d93ab4e229c9623a8b2f390c8 to your computer and use it in GitHub Desktop.
import re
import wandb
from datasets import load_dataset, concatenate_datasets
from functools import partial
import logging
logger = logging.getLogger(__name__)
def load_hf_format_dataset(file_path, split):
    """Load a newline-delimited text file as a HuggingFace dataset.

    Args:
        file_path: Path to the text file. Compressed files (e.g. ``.xz``)
            are handled transparently by the "text" loading script.
        split: Name of the split to return. Falls back to ``"train"`` when
            ``None`` — the "text" builder places all rows under "train".
            (Previously this parameter was accepted but ignored.)

    Returns:
        A ``datasets.Dataset`` for the requested split.
    """
    dataset = load_dataset("text", data_files=file_path)
    return dataset[split if split is not None else "train"]
# Module-level registry of already-seen values, shared across filter calls
# so duplicates are detected over the whole concatenated dataset.
memory = set()


def is_unique(elem, column, memory):
    """Return True the first time ``elem[column]`` is seen, else False.

    Each new value is recorded in ``memory`` as a side effect, so repeated
    calls with the same value report it as a duplicate.
    """
    value = elem[column]
    if value in memory:
        return False
    memory.add(value)
    return True
# Compile the patterns once at import time instead of on every call —
# clean_text runs once per row over the full concatenated corpus.
_URL_PATTERN = re.compile(r'http\S+')
# Matches a string that is ENTIRELY a dd/dd/dd(dd)-style date or a bare
# am/pm marker; anchored with ^...$, so dates embedded inside longer text
# are NOT removed (kept as-is to preserve the original behaviour).
_DATE_PATTERN = re.compile(r'^(?:(?:[0-9]{2}[:/,]){2}[0-9]{2,4}|am|pm)$')
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)


def clean_text(example):
    """Strip URLs, date-only strings, emojis, newlines and tabs from
    ``example['text']`` in place and return the (mutated) example."""
    text = example['text']
    text = _URL_PATTERN.sub('', text)    # remove URLs (original comment said "html")
    text = _DATE_PATTERN.sub('', text)   # remove whole-string dates / am / pm
    text = _EMOJI_PATTERN.sub('', text)  # remove emojis in the listed ranges
    text = text.replace('\n', '').replace('\t', '')
    example['text'] = text
    return example
if __name__ == "__main__":
    # Track the processing run and version the output dataset with W&B.
    run = wandb.init(project="nvidia-tamil", group="format_data")
    artifact = wandb.Artifact("processed_dataset", type="dataset")

    tamil_common_crawl = load_hf_format_dataset(file_path="tamil_data/ta.txt.xz", split=None)
    # Use the module logger instead of the root logger: the module defines
    # `logger = logging.getLogger(__name__)` but it was never used.
    logger.info("Loaded Tamil Common Crawl")

    oscar_tamil = load_dataset("oscar", "unshuffled_original_ta", ignore_verifications=True)
    oscar_tamil = oscar_tamil['train']
    # Drop the 'id' column so the schema matches the plain-text datasets
    # before concatenation.
    oscar_tamil = oscar_tamil.remove_columns(['id'])
    logger.info("Loaded Oscar Tamil")

    ai4bharat_tamil = load_hf_format_dataset(file_path="tamil_data/data/ta/ta.txt", split=None)
    logger.info("Loaded AI4BHARAT TAMIL DATASET")

    # Merge the three corpora into a single dataset.
    tamil_final_dataset = concatenate_datasets([tamil_common_crawl, oscar_tamil, ai4bharat_tamil])
    logger.info("Concatenated the datasets")

    # Drop duplicate rows (exact match on the 'text' column); `memory` is the
    # module-level set shared with is_unique.
    tamil_final_dataset = tamil_final_dataset.filter(partial(is_unique, column="text", memory=memory))
    logger.info("Duplicates are dropped")

    # Clean every row: strip URLs, dates, emojis, newlines and tabs.
    logger.info("Started cleaning the dataset")
    tamil_final_dataset = tamil_final_dataset.map(clean_text)
    logger.info("Dataset is cleaned")

    tamil_final_dataset.save_to_disk("tamil_final_processed_dataset")
    artifact.add_dir("tamil_final_processed_dataset")
    run.log_artifact(artifact)
    logger.info("Dataset is saved to disk")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment