igrigorik (owner)

Revisions

gist: 227771 Download_button fork
public
Public Clone URL: git://gist.github.com/227771.git
Embed All Files: show embed
lucene-stats.rb #
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
require 'rubygems'
require 'ruport'
require 'pp'
 
# First command-line argument: the location of the index files to inspect.
path = ARGV.first

# Per-type accumulator, filled in as { type => { :size => bytes } }.
stats = {}
 
# Human-readable description for each known index file extension; used to
# fill the 'description' column of the report. Unknown extensions simply
# look up to nil.
types = {
  'gen'  => 'Segments File: stores information about segments',
  'lock' => 'Lock File: the Write lock prevents multiple IndexWriters from writing to the same file.',
  'cfs'  => 'Compound File: an optional "virtual" file consisting of all the other index files for systems that frequently run out of file handles.',
  'fnm'  => 'Fields: stores information about the fields',
  'fdx'  => 'Field Index: contains pointers to field data',
  'fdt'  => 'Field Data: the stored fields for documents',
  'tis'  => 'Term Infos: part of the term dictionary, stores term info',
  'tii'  => 'Term Info Index: the index into the Term Infos file',
  'frq'  => 'Frequencies: contains the list of docs which contain each term along with frequency',
  'prx'  => 'Positions: stores position information about where a term occurs in the index',
  'nrm'  => 'Norms: encodes length and boost factors for docs and fields',
  'tvx'  => 'Term Vector Index: stores offset into the document data file',
  'tvd'  => 'Term Vector Documents: contains information about each document that has term vectors',
  'tvf'  => 'Term Vector Fields: the field level info about term vectors',
  'del'  => 'Deleted Documents: info about what files are deleted'
}
 
# Scan the target location and accumulate the total byte size of every file
# into `stats`, keyed by file type: { 'fdt' => { :size => 1234 }, ... }.

# Without an argument there is nothing to scan; exit with a usage hint
# instead of failing later on a nil path.
abort "usage: #{File.basename($PROGRAM_NAME)} <index-directory>" unless path

# A directory is scanned whether or not it was given with a trailing slash.
# Anything else is still treated as a plain path prefix, as before.
pattern = File.directory?(path) ? File.join(path, '*') : "#{path}*"

Dir.glob(pattern).each do |file|
  # Sub-directories are not index files; counting them would skew the totals.
  next unless File.file?(file)

  # File.extname looks at the file name only, so dots in parent directories
  # cannot leak into the type. Files without an extension are keyed by their
  # base name rather than by their whole path.
  type = File.extname(file).sub(/\A\./, '')
  type = File.basename(file) if type.empty?

  stats[type] ||= {:size => 0}
  stats[type][:size] += File.size(file)
end
 
# Convert the per-type byte counts into report rows of
#   [type, size in MB (string, 2 decimals), share of total in % (float), description]
# ordered from the largest type to the smallest. `stats` is rebound from the
# accumulator hash to this array of rows.
total = stats.values.inject(0) { |sum, stat| sum + stat[:size] }

# Order on the exact byte count rather than on the rounded percentage, so
# types whose share rounds to the same two-decimal value keep a meaningful
# order.
stats = stats.sort_by { |_type, stat| -stat[:size] }.map do |type, stat|
  size = stat[:size].to_f

  # When every file is empty the total is 0; report a 0.0 share explicitly
  # instead of relying on NaN being parsed back to zero.
  percent = total.zero? ? 0.0 : format('%.2f', (size / total) * 100).to_f

  [type, format('%.2f', size / (1024 * 1024)), percent, types[type]]
end
 
# Build a Ruport table from the prepared rows and print it to standard output.
columns = ['filetype', 'size (MB)', '%', 'description']
report = Ruport::Data::Table.new(:column_names => columns, :data => stats)

puts report.to_s