afiaka87/filter_cah.py

## filter_cah.py

# Patterns that are stripped out of captions before tokenization.
number_regex, date_regex, url_regex = map(re.compile, (
    r'[0-9]{5,}',                   # runs of five or more ASCII digits
    r'[0-9]{4}-[0-9]{2}-[0-9]{2}',  # NNNN-NN-NN digit groups, e.g. 2021-03-15
    r'https?://[^\s]+',             # http:// or https:// up to the next whitespace
))

@lru_cache(maxsize=32)
def tokenize(s):
    """Clean a raw caption and turn it into a token sequence.

    The caption is decoded as UTF-8 and lowercased, then every match of
    ``number_regex``, ``date_regex`` and ``url_regex`` is deleted (in that
    order) before the text is handed to ``tokenizer.tokenize``.

    Results are memoised by ``lru_cache`` for up to 32 distinct inputs.

    Args:
        s: UTF-8 encoded caption bytes.

    Returns:
        The output of ``tokenizer.tokenize`` for the cleaned text, with
        dimension 0 squeezed away (presumably a tensor of token ids of
        length ``TEXT_SEQ_LEN`` -- confirm against the tokenizer in use).
    """
    text = s.decode('utf-8').lower()
    for pattern in (number_regex, date_regex, url_regex):
        text = pattern.sub('', text)
    tokens = tokenizer.tokenize(text, TEXT_SEQ_LEN, truncate_text=args.truncate_captions)
    return tokens.squeeze(0)

# WebDataset only: filtering options

# Captions whose decoded length falls outside this inclusive range are dropped.
MINIMUM_CAP_LEN, MAXIMUM_CAP_LEN = 1, 5000

# Images whose original width / height falls outside this inclusive range are dropped.
MIN_ASPECT_RATIO, MAX_ASPECT_RATIO = 0.5, 2.0

# Language filtering is disabled by default; when enabled, only captions
# detected as FILTER_LANGUAGE_CODE are kept.
FILTER_LANGUAGE_CODE = 'en'
FILTER_LANGUAGE = False

if FILTER_LANGUAGE:
    print(f'Filtering language: {FILTER_LANGUAGE_CODE} using cld3')

if ENABLE_WEBDATASET:
    # A nominal length has to be declared for the Dataset, otherwise the
    # DataLoader emits warnings.
    DATASET_SIZE = 1_000_000_000

    # Sample keys under which the image and its caption are stored.
    myimg, mycap = WEBDATASET_IMAGE_TEXT_COLUMNS

    # First mapping pass: transform the image and tokenize the caption.
    image_text_mapping = {myimg: imagetransform, mycap: tokenize}
    # Second mapping pass: image-only preprocessing.
    image_mapping = {myimg: imagepreproc}

    def filter_dataset(item): # For e.g. C@H which (rarely) has no caption available.
        """Decide whether a raw WebDataset sample is kept.

        A sample is rejected when any of the following holds:

        * the caption, image or ``'json'`` entry is missing;
        * the metadata or caption cannot be decoded, or the metadata lacks
          numeric ``original_width`` / ``original_height`` fields;
        * either original dimension is smaller than ``IMAGE_SIZE``;
        * width / height lies outside
          ``[MIN_ASPECT_RATIO, MAX_ASPECT_RATIO]``;
        * the caption length lies outside
          ``[MINIMUM_CAP_LEN, MAXIMUM_CAP_LEN]``;
        * ``FILTER_LANGUAGE`` is on and cld3 does not detect
          ``FILTER_LANGUAGE_CODE``.

        Args:
            item: Mapping from sample keys to raw (still encoded) bytes.

        Returns:
            True to keep the sample, False to drop it.
        """
        if mycap not in item or myimg not in item or 'json' not in item:
            return False

        try:
            metadata = json.loads(item['json'].decode('utf-8'))
            original_width = float(metadata['original_width'])
            original_height = float(metadata['original_height'])
            caption = item[mycap].decode('utf-8')
        except (KeyError, TypeError, ValueError):
            # Invalid UTF-8, invalid JSON, or missing / non-numeric size
            # fields: drop this one sample rather than letting the exception
            # escape from the filter and stop iteration over the dataset.
            return False

        # image size
        if original_width < IMAGE_SIZE or original_height < IMAGE_SIZE:
            return False

        # aspect ratio
        if original_height <= 0:
            # Only reachable when IMAGE_SIZE is not positive; avoids dividing by zero.
            return False
        aspect_ratio = original_width / original_height
        if not MIN_ASPECT_RATIO <= aspect_ratio <= MAX_ASPECT_RATIO:
            return False

        # caption length
        if not MINIMUM_CAP_LEN <= len(caption) <= MAXIMUM_CAP_LEN:
            return False

        # language detection -- only run when requested, so cld3 is neither
        # required nor invoked per sample while the filter is switched off.
        if FILTER_LANGUAGE:
            detected = cld3.get_language(caption) # (you may ignore the linter warning about this)
            # NOTE(review): get_language can return None instead of a
            # prediction (e.g. for empty text); treat that as "no match".
            if detected is None or detected.language != FILTER_LANGUAGE_CODE:
                return False

        return True

    # Open the shards named by DATASET, passing wds.warn_and_continue as the
    # error handler, then keep only the samples accepted by filter_dataset.
    w_dataset = wds.WebDataset(DATASET, handler=wds.warn_and_continue)
    filtered_dataset = w_dataset.select(filter_dataset)

    # Apply both mapping passes, emit (caption, image) tuples and group them
    # into batches of BATCH_SIZE (partial batches are allowed).
    ds = (
        filtered_dataset
        .map_dict(**image_text_mapping)
        .map_dict(**image_mapping)
        .to_tuple(mycap, myimg)
        .batched(BATCH_SIZE, partial=True)
    )

	number_regex = re.compile(r'[0-9]{5,}')
	date_regex = re.compile(r'[0-9]{4}-[0-9]{2}-[0-9]{2}')
	url_regex = re.compile(r'https?://[^\s]+')

	@lru_cache(maxsize=32)
	def tokenize(s):
	s = s.decode('utf-8')
	s = s.lower()
	s = number_regex.sub('', s)
	s = date_regex.sub('', s)
	s = url_regex.sub('', s)
	return tokenizer.tokenize(s, TEXT_SEQ_LEN, truncate_text=args.truncate_captions).squeeze(0)

	# WebDataset only: filtering options
	MAXIMUM_CAP_LEN = 5000
	MINIMUM_CAP_LEN = 1

	MIN_ASPECT_RATIO = 0.5
	MAX_ASPECT_RATIO = 2.0

	FILTER_LANGUAGE = False
	FILTER_LANGUAGE_CODE = 'en'

	if FILTER_LANGUAGE:
	print(f'Filtering language: {FILTER_LANGUAGE_CODE} using cld3')

	if ENABLE_WEBDATASET:
	DATASET_SIZE = int(1e9) # You need to set a nominal length for the Dataset in order to avoid warnings from DataLoader

	myimg, mycap = WEBDATASET_IMAGE_TEXT_COLUMNS
	image_text_mapping = {
	myimg: imagetransform,
	mycap: tokenize
	}
	image_mapping = {
	myimg: imagepreproc
	}

	def filter_dataset(item): # For e.g. C@H which (rarely) has no caption available.
	if mycap not in item: return False
	if myimg not in item: return False
	if 'json' not in item: return False

	metadata = json.loads(item['json'].decode('utf-8'))
	original_width = float(metadata['original_width'])
	original_height = float(metadata['original_height'])
	caption = item[mycap].decode('utf-8')

	# image size
	if original_width < IMAGE_SIZE or original_height < IMAGE_SIZE:
	return False

	# aspect ratio
	if original_width / original_height < MIN_ASPECT_RATIO or original_width / original_height > MAX_ASPECT_RATIO:
	return False

	# caption length
	if len(caption) > MAXIMUM_CAP_LEN or len(caption) < MINIMUM_CAP_LEN:
	return False

	# language detection
	detected = cld3.get_language(caption) # (you may ignore the linter warning about this)
	if FILTER_LANGUAGE and detected.language != FILTER_LANGUAGE_CODE:
	return False

	return True

	w_dataset = wds.WebDataset(DATASET, handler=wds.warn_and_continue)
	filtered_dataset = w_dataset.select(filter_dataset)
	ds = filtered_dataset.map_dict(image_text_mapping).map_dict(image_mapping).to_tuple(mycap, myimg).batched(BATCH_SIZE, partial=True)