dmke/README.md

## README.md

      
    Raw
  

              README.md
            
          
    This is a response/extension to the Deep directory structure vs. flat directory structure to store millions of files on ext4
article on Medium. Here, I'm benchmarking not only a nesting level of two, but different levels.
This benchmark currently runs on an otherwise idle physical server with this specs:

HDD: spinning, encrypted (RAID 10)
CPU: Intel i7-2600 (4 cores + HT @ 3.4GHz)
RAM: 16GB

For reference, this is the result for a test run with for a depth of 5 and 100k files (output truncated and sorted):
$ ruby benchmark-deep-vs-flat-directories.rb 5 100000
Ruby 2.5.3 x86_64-linux, depth 5, iterations 100000
                      user     system      total        real
write-5           1.100000   3.390000   4.490000 (242.845647)
write-4           1.000000   3.070000   4.070000 (238.195257)
write-3           1.050000   2.910000   3.960000 (214.014290)
write-2           1.070000   3.160000   4.230000 (414.781391)
write-1           0.810000   1.640000   2.450000 (  5.496074)
write-0           0.720000   1.580000   2.300000 (  3.912904)
read-5            0.890000   0.680000   1.570000 (  1.895553)
read-4            0.670000   0.730000   1.400000 ( 20.789874)
read-3            0.790000   0.910000   1.700000 ( 21.961692)
read-2            0.560000   0.590000   1.150000 (  1.400197)
read-1            0.570000   0.580000   1.150000 (  6.023490)
read-0            0.550000   0.470000   1.020000 (  1.259735)

I'll update this gist with the results, once the benchmark is finished (with 10m files).

  
## benchmark-deep-vs-flat-directories.rb
#!/usr/bin/env ruby
# frozen_string_literal: true
#
# SYNOPSIS
#
#   ruby benchmark-deep-vs-flat-directories.rb [LEVEL [N]]
#
#   This will create a directory named "root" next to this source file.
#   It will then create a sub-directory tree LEVEL levels deep in this
#   "root" directory and place N files according to a simple sharding
#   algorithm. It measures the time to write and read all those files.
#   When finished, it will remove all files, and the deepest sub-tree.
#
#   It then restarts the same process with a sub-directory tree of
#   depth LEVEL-1, (writing, reading and deleting the files again). The
#   script will stop after LEVEL iterations.
#
# OPTIONS
#
#   LEVEL max depth of dir structure (default: 5)
#   N     number of files to store (default: 10000000)
#

require "digest"
require "benchmark"
require "pathname"

L   = (ARGV.shift || 5).to_i
N   = (ARGV.shift || 10_000_000).to_i
DIR = Pathname.new(__dir__).join("root").tap(&:mkpath)

entries = [] # SHA256(0) ... SHA256(N)
reads   = [] # reads[i] = real time measurement for reading files at depth i
writes  = [] # writes[i] same as above, for writes at depth i

puts "Ruby #{RUBY_VERSION} #{RUBY_PLATFORM}, depth #{L}, iterations #{N}"

def path_for(level, ent)
  pad = ent.rjust(level*2, "0")
  DIR.join(*pad.scan(/../).take(level), ent)
end

Benchmark.bm(15) do |x|
  x.report("prep-entries") {
    N.times do |i|
      entries << Digest::SHA256.hexdigest(i.to_s)
    end
  }

  x.report("prep-paths") {
    entries.each do |ent|
      path_for(L, ent).dirname.mkpath
    end
  }

  L.downto(0) do |l|
    curr = entries.map{|ent| path_for(l, ent) }

    tw = x.report("write-#{l}") {
      curr.each do |ent|
        ent.open("w") {|f| f.write(l.to_s) }
      end
    }
    writes << tw.real

    tr = x.report("read-#{l}") {
      curr.each do |ent|
        ent.read
      end
    }
    reads << tr.real

    x.report("delete-#{l}") {
      curr.each do |ent|
        ent.dirname.rmtree if ent.dirname.exist?
      end
    }
  end
end

# print a plot URL
r, w = [reads, writes].map {|el|
  el.reverse.map {|f| f.round(2) }.join(",")
}

puts "https://chart.googleapis.com/chart?" + [
  "cht=bvg",
  "chs=650x450",
  "chd=t:#{r}|#{w}",
  "chds=a",
  "chbh=a,1,#{L*10}",
  "chco=ff7f0e,1f77b4",
  "chtt=File%20access%20time%20for%20#{N}%20files",
  "chdl=read|write",
  "chxt=x,x,y,y",
  "chxl=1:|depth|3:|time%20[s]|"
].join("&")
	#!/usr/bin/env ruby
	# frozen_string_literal: true
	#
	# SYNOPSIS
	#
	# ruby benchmark-deep-vs-flat-directories.rb [LEVEL [N]]
	#
	# This will create a directory named "root" next to this source file.
	# It will then create a sub-directory tree LEVEL levels deep in this
	# "root" directory and place N files according to a simple sharding
	# algorithm. It measures the time to write and read all those files.
	# When finished, it will remove all files, and the deepest sub-tree.
	#
	# It then restarts the same process with a sub-directory tree of
	# depth LEVEL-1, (writing, reading and deleting the files again). The
	# script will stop after LEVEL iterations.
	#
	# OPTIONS
	#
	# LEVEL max depth of dir structure (default: 5)
	# N number of files to store (default: 10000000)
	#

	require "digest"
	require "benchmark"
	require "pathname"

	L = (ARGV.shift \|\| 5).to_i
	N = (ARGV.shift \|\| 10_000_000).to_i
	DIR = Pathname.new(__dir__).join("root").tap(&:mkpath)

	entries = [] # SHA256(0) ... SHA256(N)
	reads = [] # reads[i] = real time measurement for reading files at depth i
	writes = [] # writes[i] same as above, for writes at depth i

	puts "Ruby #{RUBY_VERSION} #{RUBY_PLATFORM}, depth #{L}, iterations #{N}"

	def path_for(level, ent)
	pad = ent.rjust(level*2, "0")
	DIR.join(*pad.scan(/../).take(level), ent)
	end

	Benchmark.bm(15) do \|x\|
	x.report("prep-entries") {
	N.times do \|i\|
	entries << Digest::SHA256.hexdigest(i.to_s)
	end
	}

	x.report("prep-paths") {
	entries.each do \|ent\|
	path_for(L, ent).dirname.mkpath
	end
	}

	L.downto(0) do \|l\|
	curr = entries.map{\|ent\| path_for(l, ent) }

	tw = x.report("write-#{l}") {
	curr.each do \|ent\|
	ent.open("w") {\|f\| f.write(l.to_s) }
	end
	}
	writes << tw.real

	tr = x.report("read-#{l}") {
	curr.each do \|ent\|
	ent.read
	end
	}
	reads << tr.real

	x.report("delete-#{l}") {
	curr.each do \|ent\|
	ent.dirname.rmtree if ent.dirname.exist?
	end
	}
	end
	end

	# print a plot URL
	r, w = [reads, writes].map {\|el\|
	el.reverse.map {\|f\| f.round(2) }.join(",")
	}

	puts "https://chart.googleapis.com/chart?" + [
	"cht=bvg",
	"chs=650x450",
	"chd=t:#{r}\|#{w}",
	"chds=a",
	"chbh=a,1,#{L*10}",
	"chco=ff7f0e,1f77b4",
	"chtt=File%20access%20time%20for%20#{N}%20files",
	"chdl=read\|write",
	"chxt=x,x,y,y",
	"chxl=1:\|depth\|3:\|time%20[s]\|"
	].join("&")