Skip to content

Instantly share code, notes, and snippets.

@boazsender
Last active January 15, 2018 02:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save boazsender/6628205d677685078b7ca8fdfe6e3040 to your computer and use it in GitHub Desktop.
Save boazsender/6628205d677685078b7ca8fdfe6e3040 to your computer and use it in GitHub Desktop.
The first image of each identity in the VGG face dataset.
#!/bin/bash
# Author: Boaz Sender
#
# Script for downloading all the good images in the VGG data set.
#
# VGG is available for download from www.robots.ox.ac.uk/~vgg/data/vgg_face/
#
# usage: from the root of a vgg_face_dataset download, run:
# mkdir scrape_full && ./vgg_full_scrape.sh
# Loop over all the files in the vgg data set that we downloaded
for file in `ls -1 files`
do
# Figure out the name of the person who we are downloading an image for
# from the file name
identity=$(echo $file | rev | cut -c5- | rev)
declare -i count=1
# make a file to log all the dead images in
touch dead_images.txt
function scrape_image()
{
# download the image
wget $1 --output-document=scrape_full/$2 --timeout=6 --tries 2;
# If the download creates a recognizeable image, then figure out the format,
# give it the correct extension, and move on
if file "scrape_full/$2" | grep "image"; then
format=$(file -0 -F" " "scrape_full/$2" | grep -aPo '\0\s*\K\S+' | tr '[:upper:]' '[:lower:]')
mv "scrape_full/$2" "scrape_full/$2.$format"
echo "******************************************************************"
echo "Images successful! for ${identity} at count number ${count}!"
echo "******************************************************************"
# Otherwise, if we got an unrecognizeable image delete that garbo
# and log that is was a bad image
else
echo "******************************************************************"
echo "Images download failed for ${identity} at count number ${count}! Recording and moving on"
echo "******************************************************************"
echo $identity $2 >> dead_images.txt
rm scrape_full/$2
fi
# Increment the counter, and call this function again with the next line
((count++))
arg=$count
arg+="p"
nextarr=(`sed -n $arg 'files/'$file`)
scrape_image ${nextarr[1]} ${identity}_${nextarr[0]}
}
# Kickoff this shindig with the first line of the current file
arr=(`sed -n 1p 'files/'$file`)
scrape_image ${arr[1]} ${identity}_${arr[0]}
done;
#!/bin/bash
# usage: from the root of a vgg_face_dataset download run
# ./vgg_process_files.sh > vgg_face_dataset_first_image_per_identity.txt
for i in `ls -1 files`
do
head 'files/'$i -n1 -v
done;
#!/bin/bash
# Author: Boaz Sender
#
# Script for processing the data in vgg_face_dataset and downloading a single
# image for each identity (person) in the data.
#
# The VGG data is available for download from
# www.robots.ox.ac.uk/~vgg/data/vgg_face/
#
# At the time of this writing, about 12% of the first images of each identity
# are no longer available, or broken in some way. This script tries
# downloading until it finds a good image.
#
# usage: from the root of a vgg_face_dataset download, run:
# mkdir scrape && ./vgg_process_files_and_scrape.sh
# Define our scraper function
function scrape_image()
{
# download the image, and if it downloads and creates a recognizeable image,
# then figure out the format, give it the correct extension, and move on
if wget $1 --output-document=scrape/$2 -q --timeout=6 --tries 2 && file "scrape/$2" | grep "image" && (( $(identify -format %n scrape/$2) < 2)); then
format=$(file -0 -F" " "scrape/$2" | grep -aPo '\0\s*\K\S+' | tr '[:upper:]' '[:lower:]')
mv "scrape/$2" "scrape/$2.$format"
echo "******************************************************************"
echo "Images successful! for ${identity} at count number ${count}!"
echo "******************************************************************"
# otherwise, if the wget fails, or downloads and empty file, or something
# that is not an image delete that garbo, and try again with the next line
# of the identity's file
else
((count++))
echo "******************************************************************"
echo "Image failed, moving on to image number ${count} for ${identity}"
echo "******************************************************************"
rm scrape/$2
arg=$count
arg+="p"
nextarr=(`sed -n $arg 'files/'$file`)
scrape_image ${nextarr[1]} ${identity}_${nextarr[0]}
fi
}
# Loop over all the files in the vgg data set that we downloaded
for file in `ls -1 files`
do
# Figure out the name of the person who we are downloading an image for
# from the file name
identity=$(echo $file | rev | cut -c5- | rev)
declare -i count=1
# Kickoff this shindig with the first line of the current file
arr=(`sed -n 1p 'files/'$file`)
scrape_image ${arr[1]} ${identity}_${arr[0]}
done;
#!/bin/bash
# usage: from the root of a vgg_face_dataset download run
# mkdir scrape && ./vgg_scrape.sh
while read p; do
arr=(`echo ${p}`)
wget ${arr[2]} --output-document=scrape/${arr[0]}.jpg --timeout=6 --tries 2
done < vgg_face_dataset_first_image_per_identity.txt
@boazsender
Copy link
Author

Great point, updated!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment