Skip to content

Instantly share code, notes, and snippets.

@afiaka87
Last active September 25, 2021 04:27
Show Gist options
  • Save afiaka87/e9b73930d9b1c46baf17f08185cb5f45 to your computer and use it in GitHub Desktop.
Save afiaka87/e9b73930d9b1c46baf17f08185cb5f45 to your computer and use it in GitHub Desktop.
Clean and filter the Crawling@Home dataset by caption length, aspect ratio, image size, and detected language
# Patterns that are blanked out of a caption before it is tokenized.
# Each one is substituted with the empty string by `tokenize`.

# An http:// or https:// link, running up to the next whitespace character.
url_regex = re.compile(r'https?://[^\s]+')

# A date written as four digits, two digits, two digits, joined by hyphens.
date_regex = re.compile(r'[0-9]{4}-[0-9]{2}-[0-9]{2}')

# Any run of five or more consecutive ASCII digits.
number_regex = re.compile(r'[0-9]{5,}')
@lru_cache(maxsize=32)
def tokenize(s):
    """Clean a raw caption and turn it into a token sequence.

    The caption arrives as UTF-8 encoded bytes. It is decoded, lower-cased,
    and then has long digit runs, dates and URLs removed (in that order)
    before being handed to the project tokenizer.

    Results are memoized for the 32 most recently used captions.

    Args:
        s: UTF-8 encoded caption bytes.

    Returns:
        The output of ``tokenizer.tokenize`` for the cleaned caption, with
        ``squeeze(0)`` applied.
    """
    text = s.decode('utf-8').lower()
    # The substitution order matches the original sequence of calls.
    for pattern in (number_regex, date_regex, url_regex):
        text = pattern.sub('', text)
    tokens = tokenizer.tokenize(
        text,
        TEXT_SEQ_LEN,
        truncate_text=args.truncate_captions,
    )
    return tokens.squeeze(0)
# WebDataset only: filtering options

# Caption length bounds, measured in characters of the decoded caption.
MINIMUM_CAP_LEN = 1
MAXIMUM_CAP_LEN = 5000

# Accepted range for original_width / original_height.
MAX_ASPECT_RATIO = 2.0
MIN_ASPECT_RATIO = 0.5

# Optional language filter: when enabled, a sample is kept only if the
# language detected by cld3 equals FILTER_LANGUAGE_CODE.
FILTER_LANGUAGE_CODE = 'en'
FILTER_LANGUAGE = False

if FILTER_LANGUAGE:
    print(f'Filtering language: {FILTER_LANGUAGE_CODE} using cld3')
if ENABLE_WEBDATASET:
    # You need to set a nominal length for the Dataset in order to avoid
    # warnings from DataLoader.
    DATASET_SIZE = int(1e9)

    # Sample keys for the image and the caption, in that order.
    myimg, mycap = WEBDATASET_IMAGE_TEXT_COLUMNS
    image_text_mapping = {
        myimg: imagetransform,
        mycap: tokenize,
    }
    image_mapping = {
        myimg: imagepreproc,
    }

    def filter_dataset(item):
        """Decide whether a raw WebDataset sample should be kept.

        A sample is rejected when any of the following holds:

        * the image, caption or ``json`` entry is missing (e.g. C@H, which
          rarely has no caption available);
        * the metadata or caption cannot be decoded, or the metadata lacks
          usable ``original_width`` / ``original_height`` values;
        * either original dimension is smaller than ``IMAGE_SIZE``;
        * the width/height ratio is outside
          ``[MIN_ASPECT_RATIO, MAX_ASPECT_RATIO]``;
        * the caption length in characters is outside
          ``[MINIMUM_CAP_LEN, MAXIMUM_CAP_LEN]``;
        * ``FILTER_LANGUAGE`` is enabled and cld3 does not detect
          ``FILTER_LANGUAGE_CODE``.

        Args:
            item: Mapping from sample key to raw bytes.

        Returns:
            True to keep the sample, False to drop it.
        """
        if mycap not in item or myimg not in item or 'json' not in item:
            return False

        # A malformed sample is dropped rather than allowed to raise: an
        # exception escaping this predicate would abort the whole pipeline.
        # UnicodeDecodeError and json.JSONDecodeError are both ValueErrors.
        try:
            metadata = json.loads(item['json'].decode('utf-8'))
            original_width = float(metadata['original_width'])
            original_height = float(metadata['original_height'])
            caption = item[mycap].decode('utf-8')
        except (KeyError, TypeError, ValueError):
            return False

        # image size
        if original_width < IMAGE_SIZE or original_height < IMAGE_SIZE:
            return False

        # aspect ratio (guard the division when IMAGE_SIZE allows height 0)
        if original_height <= 0:
            return False
        aspect_ratio = original_width / original_height
        if aspect_ratio < MIN_ASPECT_RATIO or aspect_ratio > MAX_ASPECT_RATIO:
            return False

        # caption length
        if not MINIMUM_CAP_LEN <= len(caption) <= MAXIMUM_CAP_LEN:
            return False

        # language detection: only run the detector when the filter is on, so
        # cld3 is neither required nor invoked otherwise.
        if FILTER_LANGUAGE:
            # (you may ignore the linter warning about this)
            detected = cld3.get_language(caption)
            # NOTE(review): cld3 is expected to return None when it has no
            # prediction (e.g. empty text); treat that as a mismatch.
            if detected is None or detected.language != FILTER_LANGUAGE_CODE:
                return False

        return True

    w_dataset = wds.WebDataset(DATASET, handler=wds.warn_and_continue)
    filtered_dataset = w_dataset.select(filter_dataset)
    ds = (
        filtered_dataset
        .map_dict(**image_text_mapping)
        .map_dict(**image_mapping)
        .to_tuple(mycap, myimg)
        .batched(BATCH_SIZE, partial=True)
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment