@boazsender
Last active January 15, 2018 02:04
The first image of each identity in the VGG face dataset.
#!/bin/bash
# Author: Boaz Sender
#
# Script for downloading all the good images in the VGG data set.
#
# VGG is available for download from www.robots.ox.ac.uk/~vgg/data/vgg_face/
#
# usage: from the root of a vgg_face_dataset download, run:
# mkdir scrape_full && ./vgg_full_scrape.sh

# Loop over all the files in the vgg data set that we downloaded
for file in `ls -1 files`
do
  # Figure out the name of the person we are downloading an image for
  # by stripping the last four characters (".txt") from the file name
  identity=$(echo $file | rev | cut -c5- | rev)
  declare -i count=1

  # Make a file to log all the dead images in
  touch dead_images.txt

  function scrape_image()
  {
    # Download the image
    wget "$1" --output-document="scrape_full/$2" --timeout=6 --tries=2

    # If the download creates a recognizable image, then figure out the format,
    # give it the correct extension, and move on
    if file "scrape_full/$2" | grep "image"; then
      format=$(file -0 -F" " "scrape_full/$2" | grep -aPo '\0\s*\K\S+' | tr '[:upper:]' '[:lower:]')
      mv "scrape_full/$2" "scrape_full/$2.$format"
      echo "******************************************************************"
      echo "Image successful for ${identity} at count number ${count}!"
      echo "******************************************************************"
    # Otherwise, if we got an unrecognizable image, delete that garbo
    # and log that it was a bad image
    else
      echo "******************************************************************"
      echo "Image download failed for ${identity} at count number ${count}! Recording and moving on"
      echo "******************************************************************"
      echo "$identity" "$2" >> dead_images.txt
      rm "scrape_full/$2"
    fi

    # Increment the counter, and call this function again with the next line;
    # stop recursing once we run out of lines in the identity's file
    ((count++))
    arg=$count
    arg+="p"
    nextarr=(`sed -n $arg 'files/'$file`)
    if [ ${#nextarr[@]} -eq 0 ]; then return; fi
    scrape_image "${nextarr[1]}" "${identity}_${nextarr[0]}"
  }

  # Kick off this shindig with the first line of the current file
  arr=(`sed -n 1p 'files/'$file`)
  scrape_image "${arr[1]}" "${identity}_${arr[0]}"
done;
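
For context, here is roughly what that format-detection pipeline does. This is a sketch: the exact description text depends on your version of file, and the filename scrape_full/Aamir_Khan_1 is only an illustration.

# file -0 prints a NUL byte after the filename, and -F" " swaps the usual
# ":" separator for a space, so the output looks like:
#   scrape_full/Aamir_Khan_1<NUL> JPEG image data, JFIF standard ...
file -0 -F" " scrape_full/Aamir_Khan_1
# grep -aPo '\0\s*\K\S+' keeps only the first word after the NUL ("JPEG"),
# and tr lowercases it so the file can be renamed with a ".jpeg" extension
file -0 -F" " scrape_full/Aamir_Khan_1 | grep -aPo '\0\s*\K\S+' | tr '[:upper:]' '[:lower:]'
# => jpeg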
#!/bin/bash
# usage: from the root of a vgg_face_dataset download, run:
# ./vgg_process_files.sh > vgg_face_dataset_first_image_per_identity.txt
for i in `ls -1 files`
do
  head 'files/'$i -n1 -v
done;
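
Because head -v prefixes each file's output with a header line, the generated list interleaves headers and data. It looks something like this (the identity name, URL, and bounding-box numbers here are illustrative, and the trailing columns of the data line are elided):

==> files/Aamir_Khan.txt <==
1 http://example.com/images/aamir_khan_001.jpg 120.0 80.0 340.0 300.0 ...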
#!/bin/bash
# Author: Boaz Sender
#
# Script for processing the data in vgg_face_dataset and downloading a single
# image for each identity (person) in the data.
#
# The VGG data is available for download from
# www.robots.ox.ac.uk/~vgg/data/vgg_face/
#
# At the time of this writing, about 12% of the first images of each identity
# are no longer available, or broken in some way. This script tries
# downloading until it finds a good image.
#
# usage: from the root of a vgg_face_dataset download, run:
# mkdir scrape && ./vgg_process_files_and_scrape.sh

# Define our scraper function
function scrape_image()
{
  # Download the image, and if it downloads and creates a recognizable image
  # (and a single-frame one, per the identify check), then figure out the
  # format, give it the correct extension, and move on
  if wget "$1" --output-document="scrape/$2" -q --timeout=6 --tries=2 && file "scrape/$2" | grep "image" && (( $(identify -format %n "scrape/$2") < 2 )); then
    format=$(file -0 -F" " "scrape/$2" | grep -aPo '\0\s*\K\S+' | tr '[:upper:]' '[:lower:]')
    mv "scrape/$2" "scrape/$2.$format"
    echo "******************************************************************"
    echo "Image successful for ${identity} at count number ${count}!"
    echo "******************************************************************"
  # Otherwise, if the wget fails, or downloads an empty file, or something
  # that is not an image, delete that garbo and try again with the next line
  # of the identity's file
  else
    ((count++))
    echo "******************************************************************"
    echo "Image failed, moving on to image number ${count} for ${identity}"
    echo "******************************************************************"
    rm "scrape/$2"
    arg=$count
    arg+="p"
    nextarr=(`sed -n $arg 'files/'$file`)
    # Stop recursing once we run out of lines in the identity's file
    if [ ${#nextarr[@]} -eq 0 ]; then return; fi
    scrape_image "${nextarr[1]}" "${identity}_${nextarr[0]}"
  fi
}

# Loop over all the files in the vgg data set that we downloaded
for file in `ls -1 files`
do
  # Figure out the name of the person we are downloading an image for
  # by stripping the last four characters (".txt") from the file name
  identity=$(echo $file | rev | cut -c5- | rev)
  declare -i count=1

  # Kick off this shindig with the first line of the current file
  arr=(`sed -n 1p 'files/'$file`)
  scrape_image "${arr[1]}" "${identity}_${arr[0]}"
done;
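
The identify guard is worth a note: ImageMagick's identify -format %n prints the scene count once per frame, so the arithmetic test only passes for single-frame images. A quick sketch (the filenames are illustrative):

identify -format %n scrape/some_photo      # single-frame JPEG => 1
identify -format %n scrape/some_animation  # three-frame GIF   => 333 (one %n per frame)
# so (( $(identify -format %n ...) < 2 )) accepts the first and rejects the second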
#!/bin/bash
# usage: from the root of a vgg_face_dataset download, run:
# mkdir scrape && ./vgg_scrape.sh
while read p; do
  arr=(`echo ${p}`)
  wget "${arr[2]}" --output-document="scrape/${arr[0]}.jpg" --timeout=6 --tries=2
done < vgg_face_dataset_first_image_per_identity.txt
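
One caveat with this last script: it takes the URL from the third whitespace-separated field (${arr[2]}), so it assumes a list format with the URL in that position. If your list came straight from vgg_process_files.sh above, it will also contain the ==> files/Name.txt <== header lines that head -v emits, which are worth skipping. A minimal sketch of that guard:

while read p; do
  # skip the "==> files/Name.txt <==" headers that head -v produces
  case $p in "==>"*) continue;; esac
  arr=(`echo ${p}`)
  wget "${arr[2]}" --output-document="scrape/${arr[0]}.jpg" --timeout=6 --tries=2
done < vgg_face_dataset_first_image_per_identity.txt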
@tkellen commented Jan 12, 2018

It would be slightly more performant if you didn't define your functions repeatedly in the loop. I'm sure the performance difference is totally irrelevant for your use case since so much time is spent fetching images, but I thought you might be interested in seeing this anyway.

Here are the commands to test:

time ./timing.sh declare-repeatedly
./timing.sh declare-repeatedly  5.70s user 0.06s system 99% cpu 5.800 total

time ./timing.sh declare-once
./timing.sh declare-once  3.30s user 0.04s system 99% cpu 3.362 total

...here is the code to put in timing.sh:

#!/bin/bash 

if [[ $1 == "declare-once" ]]; then
  scrape_image() {
    # I've turned this function into a noop, but bash still has to parse the whole thing
    continue
    # download the image
    wget $1 --output-document=scrape_full/$2 --timeout=6 --tries 2;

    # If the download creates a recognizable image, then figure out the format,
    #  give it the correct extension, and move on
    if file "scrape_full/$2" | grep "image"; then
      format=$(file -0 -F" " "scrape_full/$2" | grep -aPo '\0\s*\K\S+' | tr '[:upper:]' '[:lower:]')
      mv "scrape_full/$2" "scrape_full/$2.$format"

      echo "******************************************************************"
      echo "Images successful! for ${identity} at count number ${count}!"
      echo "******************************************************************"

      # Otherwise, if we got an unrecognizable image, delete that garbo
      # and log that it was a bad image
    else
      echo "******************************************************************"
      echo "Images download failed for ${identity} at count number ${count}! Recording and moving on"
      echo "******************************************************************"
      echo $identity $2 >> dead_images.txt
      rm scrape_full/$2
    fi

    # Increment the counter, and call this function again with the next line
    ((count++))
    arg=$count
    arg+="p"
    nextarr=(`sed -n $arg 'files/'$file`)
    scrape_image ${nextarr[1]} ${identity}_${nextarr[0]}
  }
  for i in {1..10000}; do
    scrape_image "$i"
  done
fi

if [[ $1 == "declare-repeatedly" ]]; then
  for i in {1..10000}; do
    scrape_image() {
      # I've turned this function into a noop, but bash still has to parse the whole thing
      continue
      # download the image
      wget $1 --output-document=scrape_full/$2 --timeout=6 --tries 2;

      # If the download creates a recognizable image, then figure out the format,
      #  give it the correct extension, and move on
      if file "scrape_full/$2" | grep "image"; then
        format=$(file -0 -F" " "scrape_full/$2" | grep -aPo '\0\s*\K\S+' | tr '[:upper:]' '[:lower:]')
        mv "scrape_full/$2" "scrape_full/$2.$format"

        echo "******************************************************************"
        echo "Images successful! for ${identity} at count number ${count}!"
        echo "******************************************************************"

        # Otherwise, if we got an unrecognizable image, delete that garbo
        # and log that it was a bad image
      else
        echo "******************************************************************"
        echo "Images download failed for ${identity} at count number ${count}! Recording and moving on"
        echo "******************************************************************"
        echo $identity $2 >> dead_images.txt
        rm scrape_full/$2
      fi

      # Increment the counter, and call this function again with the next line
      ((count++))
      arg=$count
      arg+="p"
      nextarr=(`sed -n $arg 'files/'$file`)
      scrape_image ${nextarr[1]} ${identity}_${nextarr[0]}
    }
    scrape_image "$i"
  done
fi

@boazsender (Author)

Great point, updated!
