@rdemorais
Created February 21, 2023 17:19
Code for creating the BlogSet-BR dataset
import gzip
import shutil
from unicodedata import normalize
from bs4 import BeautifulSoup
import ftfy
import re
from tqdm import tqdm
import csv
import sys
import warnings
from huggingface_hub import HfApi
# Blog posts can be very long; lift the default CSV field size limit.
csv.field_size_limit(sys.maxsize)
# Hugging Face Hub client; the upload step itself is not shown in this gist.
api = HfApi()
# Input dump, decompressed CSV, and output training file.
BLOGSET_GZ = 'blogset-br.csv.gz'
BLOGSET_CSV = 'blogset-br.csv'
BLOGSET_TRAIN = 'blogset_br_train.txt'
# Decompress the gzipped dump to a plain CSV file.
with gzip.open(BLOGSET_GZ, 'rb') as f_in:
    with open(BLOGSET_CSV, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
def clean_text(txt):
    # Strip accents/diacritics by decomposing (NFKD) and dropping non-ASCII bytes.
    cleantext = normalize('NFKD', txt).encode('ascii', 'ignore').decode("utf-8")
    cleantext = bytes(cleantext, 'utf-8').decode('utf-8', 'ignore')
    # Remove newlines so each post stays on a single line.
    cleantext = re.sub(r'[\n]+', '', cleantext)
    if len(cleantext.strip()) == 0:
        return ''
    # Repair mojibake, then strip any residual HTML markup.
    cleantext = ftfy.fix_encoding(cleantext)
    cleantext = BeautifulSoup(cleantext, "lxml").text
    return cleantext
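# Illustrative example (not part of the original gist): accents are stripped
# and HTML tags removed, e.g. clean_text('<b>Coração</b>') returns 'Coracao'.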
# Count the rows once so tqdm can show an accurate progress bar below.
with open(BLOGSET_CSV, 'r') as file_bs:
    csv_content = csv.reader(file_bs)
    row_count = sum(1 for row in csv_content)
# row_count = 7477855
# BeautifulSoup warns when a cleaned post looks like a URL or filename; silence it.
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
# (Re-)raise the CSV field size limit; harmless if already set above.
csv.field_size_limit(sys.maxsize)
with open(BLOGSET_CSV, 'r') as csv_file:
    csv_content = csv.reader(csv_file)
    max_length = 512
    expanded_text = ''
    with open(BLOGSET_TRAIN, 'w') as blogset_file:
        # Single-column output file with a 'text' header.
        blogset_file.write('text\n')
        for idx, line in enumerate(tqdm(csv_content, desc='BlogSet', total=row_count)):
            if len(line) == 0:
                continue
            # The fifth column (index 4) holds the post body.
            cleaned_text = clean_text(line[4])
            if len(cleaned_text) == 0:
                continue
            # Accumulate cleaned posts until there are enough words
            # to cut fixed-size chunks of max_length words each.
            expanded_text = expanded_text + ' ' + cleaned_text
            splits_exp = expanded_text.split()
            if len(splits_exp) >= max_length:
                # Despite the name, these chunks do not overlap:
                # the slicing step equals max_length.
                chunks_overlaped = [splits_exp[i:i + max_length]
                                    for i in range(0, len(splits_exp), max_length)
                                    ]
                for c in chunks_overlaped:
                    # Discard tail chunks of 30 words or fewer.
                    if len(c) > 30:
                        sentence = ' '.join(c)
                        blogset_file.write(f'{sentence}\n')
                expanded_text = ''
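
The HfApi client created at the top is never used in this snippet; presumably the finished blogset_br_train.txt is pushed to the Hugging Face Hub afterwards. A minimal sketch of that step, assuming a dataset repository (the repo_id below is a hypothetical placeholder):

# Hypothetical upload step, not part of the original gist.
# 'rdemorais/blogset-br' is a placeholder repo_id.
api.upload_file(
    path_or_fileobj=BLOGSET_TRAIN,
    path_in_repo=BLOGSET_TRAIN,
    repo_id='rdemorais/blogset-br',
    repo_type='dataset',
)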