@rdemorais
Created February 21, 2023 17:19
Code for creating the BlogSet-BR dataset
import gzip
import shutil
from unicodedata import normalize
from bs4 import BeautifulSoup
import ftfy
import re
from tqdm import tqdm
import csv
import sys
import warnings
from huggingface_hub import HfApi
# Blog posts can be very long; lift the default CSV field size limit.
csv.field_size_limit(sys.maxsize)
# Hugging Face Hub client; the upload step itself is not shown in this gist.
api = HfApi()
# Input dump, decompressed CSV, and output training file.
BLOGSET_GZ = 'blogset-br.csv.gz'
BLOGSET_CSV = 'blogset-br.csv'
BLOGSET_TRAIN = 'blogset_br_train.txt'
# Decompress the gzipped dump to a plain CSV file.
with gzip.open(BLOGSET_GZ, 'rb') as f_in:
    with open(BLOGSET_CSV, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
def clean_text(txt):
    # Strip accents/diacritics by decomposing (NFKD) and dropping non-ASCII bytes.
    cleantext = normalize('NFKD', txt).encode('ascii', 'ignore').decode("utf-8")
    cleantext = bytes(cleantext, 'utf-8').decode('utf-8', 'ignore')
    # Remove newlines so each post stays on a single line.
    cleantext = re.sub(r'[\n]+', '', cleantext)
    if len(cleantext.strip()) == 0:
        return ''
    # Repair mojibake, then strip any residual HTML markup.
    cleantext = ftfy.fix_encoding(cleantext)
    cleantext = BeautifulSoup(cleantext, "lxml").text
    return cleantext
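# Illustrative example (not part of the original gist): accents are stripped
# and HTML tags removed, e.g. clean_text('<b>Coração</b>') returns 'Coracao'.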
# Count the rows once so tqdm can show an accurate progress bar below.
with open(BLOGSET_CSV, 'r') as file_bs:
    csv_content = csv.reader(file_bs)
    row_count = sum(1 for row in csv_content)
# row_count = 7477855
# BeautifulSoup warns when a cleaned post looks like a URL or filename; silence it.
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
# (Re-)raise the CSV field size limit; harmless if already set above.
csv.field_size_limit(sys.maxsize)
with open(BLOGSET_CSV, 'r') as csv_file:
    csv_content = csv.reader(csv_file)
    max_length = 512
    expanded_text = ''
    with open(BLOGSET_TRAIN, 'w') as blogset_file:
        # Single-column output file with a 'text' header.
        blogset_file.write('text\n')
        for idx, line in enumerate(tqdm(csv_content, desc='BlogSet', total=row_count)):
            if len(line) == 0:
                continue
            # The fifth column (index 4) holds the post body.
            cleaned_text = clean_text(line[4])
            if len(cleaned_text) == 0:
                continue
            # Accumulate cleaned posts until there are enough words
            # to cut fixed-size chunks of max_length words each.
            expanded_text = expanded_text + ' ' + cleaned_text
            splits_exp = expanded_text.split()
            if len(splits_exp) >= max_length:
                # Despite the name, these chunks do not overlap:
                # the slicing step equals max_length.
                chunks_overlaped = [splits_exp[i:i + max_length]
                                    for i in range(0, len(splits_exp), max_length)
                                    ]
                for c in chunks_overlaped:
                    # Discard tail chunks of 30 words or fewer.
                    if len(c) > 30:
                        sentence = ' '.join(c)
                        blogset_file.write(f'{sentence}\n')
                expanded_text = ''
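
The HfApi client created at the top is never used in this snippet; presumably the finished blogset_br_train.txt is pushed to the Hugging Face Hub afterwards. A minimal sketch of that step, assuming a dataset repository (the repo_id below is a hypothetical placeholder):

# Hypothetical upload step, not part of the original gist.
# 'rdemorais/blogset-br' is a placeholder repo_id.
api.upload_file(
    path_or_fileobj=BLOGSET_TRAIN,
    path_in_repo=BLOGSET_TRAIN,
    repo_id='rdemorais/blogset-br',
    repo_type='dataset',
)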