cwvhogue/ImageMagick_identify_output_to_R_histogram

## ImageMagick_identify_output_to_R_histogram
# image_identify.txt is a file with the default output from ImageMagick 'identify'
# run over a set of JPG files locally,
# or the equivalent Manta MapReduce 'identify' output from the previous Gist.

$ identify *.jpg > image_identify.txt
$ head -5 image_identify.txt
00000201.jpg JPEG 3295x5947 3295x5947+0+0 8-bit sRGB 23.99MB 0.010u 0:00.000
00000301.jpg[1] JPEG 4470x3126 4470x3126+0+0 8-bit sRGB 22.15MB 0.000u 0:00.009
00000401.jpg[2] JPEG 3115x4485 3115x4485+0+0 8-bit sRGB 19.41MB 0.000u 0:00.000
00000501.jpg[3] JPEG 3093x4515 3093x4515+0+0 8-bit sRGB 19.39MB 0.000u 0:00.000
00000601.jpg[4] JPEG 3871x3613 3871x3613+0+0 8-bit sRGB 26.64MB 0.000u 0:00.000

# Reorganize to a sorted list of file size, units, filename and write to a csv file:

$ cat image_identify.txt | \
  awk '{print $7 " " $1}' | \
  sort -n  | \
  sed 's/MB/ MB/' | \
  sed 's/KB/ KB/' | \
  sed 's/\[/ \[/' | \
  awk '{print $1 ", " $2 ", " $3}' > image_sizes.csv

# In this example, there are 3 files that were bad
# (their first column holds a time value such as 0.000u instead of a size in MB)
# - these sort to the top of the list as you can see:

$ head -5 image_sizes.csv
0.000u, 03901101.jpg,
0.000u, 10821901.jpg,
0.010u, 00660201.jpg,
2.988, MB, 02047401.jpg
3.747, MB, 01505401.jpg
$ tail -5 image_sizes.csv
188.5, MB, 11033001.jpg
200.9, MB, 00063101.jpg
248.3, MB, 00104401.jpg
277.7, MB, 00056101.jpg
327.5, MB, 00099001.jpg


# Also look for any KB sized images - possibly broken data transfers
# that have complete JPEG headers, but may not be full-length files:

$ grep KB image_sizes.csv

# In this case - no output.
# Any images you find that were broken in transmission may need to be re-transferred.
# Remove any broken images from the data set.
# Repeat the 'identify' command on the cleaned up set of files and the
# above filter to generate a clean *.csv file with no broken images.
# Now you can plot a histogram in R


# Fire up R and plot a histogram of the image size distribution
# - assumes there are no KB sized images in the set!

$ R
> image_sizes<-read.csv(header=FALSE, "image_sizes.csv")
> hist(image_sizes[,1],breaks=500, main="Image Size", xlab="MB")
> rug(image_sizes[,1])
	# image_identify.txt is a file with the default output from ImageMagick 'identify'
	# run over a set of JPG files locally,
	# or the equivalent Manta MapReduce 'identify' output from the previous Gist.

	$ identify *.jpg > image_identify.txt
	$ head -5 image_identify.txt
	00000201.jpg JPEG 3295x5947 3295x5947+0+0 8-bit sRGB 23.99MB 0.010u 0:00.000
	00000301.jpg[1] JPEG 4470x3126 4470x3126+0+0 8-bit sRGB 22.15MB 0.000u 0:00.009
	00000401.jpg[2] JPEG 3115x4485 3115x4485+0+0 8-bit sRGB 19.41MB 0.000u 0:00.000
	00000501.jpg[3] JPEG 3093x4515 3093x4515+0+0 8-bit sRGB 19.39MB 0.000u 0:00.000
	00000601.jpg[4] JPEG 3871x3613 3871x3613+0+0 8-bit sRGB 26.64MB 0.000u 0:00.000

	# Reorganize to a sorted list of file size, units, filename and write to a csv file:

	$ cat image_identify.txt | \
	awk '{print $7 " " $1}' | \
	sort -n | \
	sed 's/MB/ MB/' | \
	sed 's/KB/ KB/' | \
	sed 's/\[/ \[/' | \
	awk '{print $1 ", " $2 ", " $3}' > image_sizes.csv

	# In this example, there are 3 files that were bad
	# (their first column holds a time value such as 0.000u instead of a size in MB)
	# - these sort to the top of the list as you can see:

	$ head -5 image_sizes.csv
	0.000u, 03901101.jpg,
	0.000u, 10821901.jpg,
	0.010u, 00660201.jpg,
	2.988, MB, 02047401.jpg
	3.747, MB, 01505401.jpg
	$ tail -5 image_sizes.csv
	188.5, MB, 11033001.jpg
	200.9, MB, 00063101.jpg
	248.3, MB, 00104401.jpg
	277.7, MB, 00056101.jpg
	327.5, MB, 00099001.jpg


	# Also look for any KB sized images - possibly broken data transfers
	# that have complete JPEG headers, but may not be full-length files:

	$ grep KB image_sizes.csv

	# In this case - no output.
	# Any images you find that were broken in transmission may need to be re-transferred.
	# Remove any broken images from the data set.
	# Repeat the 'identify' command on the cleaned up set of files and the
	# above filter to generate a clean *.csv file with no broken images.
	# Now you can plot a histogram in R


	# Fire up R and plot a histogram of the image size distribution
	# - assumes there are no KB sized images in the set!

	$ R
	> image_sizes<-read.csv(header=FALSE, "image_sizes.csv")
	> hist(image_sizes[,1],breaks=500, main="Image Size", xlab="MB")
	> rug(image_sizes[,1])