Runs dhash on a CSV of image paths using Dask, deletes duplicates (preferring to keep the larger image), renames the kept images to their dhash, and creates AWS presigned URLs.
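# Input format: data.csv needs only an image_path column. A minimal, hypothetical
# way to build it (the glob pattern and directory are assumptions, not part of the
# original gist):
#
#   import glob
#   import pandas as pd
#   paths = glob.glob('relative/path/to/data/**/*.jpg', recursive=True)
#   pd.DataFrame({'image_path': paths}).to_csv('data.csv', index=False)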
import logging
import os
import shutil
from pathlib import Path

import boto3
import imagehash
import pandas as pd
from botocore.exceptions import ClientError
from dask import dataframe as dd
from dask.diagnostics import ProgressBar
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler, visualize
from PIL import Image


def create_presigned_url(bucket_name, object_name, expiration=3600):
    """Generate a presigned URL to share an S3 object

    :param bucket_name: string
    :param object_name: string
    :param expiration: Time in seconds for the presigned URL to remain valid
    :return: Presigned URL as string. If error, returns None.
    """
    # Generate a presigned URL for the S3 object
    s3_client = boto3.client('s3')
    try:
        response = s3_client.generate_presigned_url('get_object',
                                                    Params={'Bucket': bucket_name,
                                                            'Key': object_name},
                                                    ExpiresIn=expiration)
    except ClientError as e:
        logging.error(e)
        return None

    # The response contains the presigned URL
    return response

def rename(image_path, name):
    '''Return image_path with its file name replaced by name, keeping the original suffix'''
    path = Path(image_path)
    new_path = path.with_name(name).with_suffix(path.suffix)
    return new_path


def image_size(image_path):
    '''Return image width, height, and number of pixels'''
    image = Image.open(image_path)
    return image.width, image.height, image.width * image.height


def calc_image_hash(image_path):
    '''Calc dhash of image and return str of hex representation of binary array'''
    image = Image.open(image_path)
    image_dhash = imagehash.dhash(image)
    return str(image_dhash)

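# Example usage of the helper above (the path is hypothetical): calc_image_hash
# returns a 16-character hex string for imagehash's default 8x8 dhash, e.g.
#   calc_image_hash('relative/path/to/data/example.jpg')
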
# Compute image dimensions and dhash for every image in parallel with Dask,
# profiling the run as we go.
with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof, ProgressBar():
    dask_df = dd.read_csv('data.csv')  # only needs an image_path column.
    dask_df = dask_df.repartition(npartitions=os.cpu_count())
    image_size_df = dask_df.image_path.apply(image_size, meta=('image_size', 'object'))
    dask_df['width'] = image_size_df.apply(lambda i: i[0], meta=('width', 'int64'))
    dask_df['height'] = image_size_df.apply(lambda i: i[1], meta=('height', 'int64'))
    dask_df['num_pixels'] = image_size_df.apply(lambda i: i[2], meta=('num_pixels', 'int64'))
    dask_df['image_dhash'] = dask_df.image_path.apply(calc_image_hash, meta=('image_dhash', 'object'))
    df = dask_df.compute()

visualize([prof, rprof, cprof], file_path='/tmp/profile_dhash.html')

# Sort so that, within each dhash group, the largest image comes first and is kept.
df = df.sort_values(by=['image_dhash', 'num_pixels'], ascending=False)
df['duplicated'] = df.duplicated(subset=['image_dhash'], keep='first')

# Delete duplicate files from disk.
remove_df = df.loc[df['duplicated']]
remove_df.image_path.apply(os.remove)
print('All images removed: {}'.format((~remove_df.image_path.apply(os.path.exists)).all()))

keep_df = df.loc[~df['duplicated']].copy()

# Rename the kept files to their dhash.
keep_df['image_dhash_path'] = keep_df.apply(lambda row: rename(row.image_path, row.image_dhash), axis=1).astype(str)
keep_df.apply(lambda row: shutil.move(row.image_path, row.image_dhash_path), axis=1)
print('All images moved: {}'.format(
    (~keep_df.image_path.apply(os.path.exists)
     & keep_df.image_dhash_path.apply(os.path.exists)).all()))
keep_df.image_path = keep_df.image_dhash_path.astype(str)  # astype str b/c it may be a pathlib.Path
del keep_df['image_dhash_path']
del keep_df['duplicated']

# Create signed urls. Note: with SigV4 signing (the boto3 default for S3),
# presigned URLs are capped at 7 days (604800 seconds), so a one-year
# expiration may be rejected by S3 at request time.
bucket_name = 'my-bucket-name-here'
seconds_per_year = 31540000
# This assumes that your image_paths are relative to your s3 bucket.
keep_df['image_signed_url'] = keep_df.image_path.apply(lambda p: create_presigned_url(bucket_name, p, seconds_per_year))

# Save our work
keep_df.to_csv('data_dhash.csv', index=False)

'''
You will need to copy the files to the paths that the signed URLs point at.
AWS CLI Example:
aws s3 sync relative/path/to/data s3://my-bucket-name-here/relative/path/to/data
'''
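
# Alternatively, the copy can be scripted with boto3 instead of the AWS CLI.
# A minimal sketch, not part of the original gist; it assumes each local
# image_path should also serve as the object's S3 key:
#
#   s3_client = boto3.client('s3')
#   for local_path in keep_df.image_path:
#       s3_client.upload_file(local_path, bucket_name, local_path)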