Skip to content

Instantly share code, notes, and snippets.

@dingsdax
Last active January 17, 2021 20:00
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dingsdax/1339123 to your computer and use it in GitHub Desktop.
Save dingsdax/1339123 to your computer and use it in GitHub Desktop.
n-grams/tf-idf indexer written in ruby
#adds helper methods to standard ruby classes
# Extensions to String: n-gram extraction and stemming support.
class String
  # mix in Porter-stemmer support (Stemmable is defined elsewhere in this gist)
  include Stemmable

  # Return all contiguous character n-grams of length +len+ as an array.
  # A +len+ larger than the string is clamped to the full string length.
  def ngrams(len = 1)
    len = size if len > size
    (0..size - len).map { |start| self[start...(start + len)] }
  end

  # Return the last character of the string.
  def last
    self[size - 1, 1]
  end
end
# Extensions to Array: frequency counting, presence maps, summation.
class Array
  # Return a hash mapping each element to its number of occurrences.
  def freqs
    counts = Hash.new(0)
    each { |item| counts[item] += 1 }
    counts
  end

  # Return a hash with each element as a key and every value set to 1
  # (a presence/existence map, used for document frequencies).
  def exists
    presence = Hash.new(0)
    each { |item| presence[item] = 1 }
    presence
  end

  # Sum all elements with +; returns nil for an empty array.
  # NOTE(review): on Ruby >= 2.4 this shadows the built-in Array#sum
  # (which returns 0 for an empty array) — confirm that is intended.
  def sum
    inject(nil) { |total, item| total ? total + item : item }
  end
end
# Extensions to Hash: sorting by value and value normalisation.
class Hash
  # Return an array of [key, value] pairs sorted by value, descending.
  def sort_num_value
    to_a.sort_by { |_key, value| -value }
  end

  # Destructively wrap every value in a one-element array.
  # Returns self (each_pair's return value).
  # Fixed: removed a dead `self` expression inside the block that had
  # no effect.
  def mk_ary_val
    each_pair do |key, value|
      self[key] = [value]
    end
  end
end
# Backport of Float#round with a digits argument (pre-1.9 Rubies only
# supported argument-less rounding).
class Float
  # keep a handle on the built-in rounding method
  alias_method :round_orig, :round

  # Round to +n+ decimal places by shifting the decimal point, rounding
  # to an integer, then shifting back.
  # NOTE(review): scaling through powers of 10.0 is subject to the usual
  # binary floating-point representation error.
  def round(n = 0)
    shifted = (self * (10.0 ** n)).round_orig
    shifted * (10.0 ** (-n))
  end
end
# Extension to Dir: recursive directory traversal.
class Dir
  # Recursively walk +path+, returning an array of all file and
  # directory paths found. Dot-entries and hidden files (leading '.')
  # are skipped. An optional block is called with each path as it is
  # discovered.
  # NOTE(review): the +ext+ parameter is accepted but currently unused —
  # no extension filtering is performed.
  def self.recurse(path = '.', ext = '*', &block)
    skip = ['.', '..']
    found = []
    Dir.foreach(path) do |entry|
      next if skip.include?(entry) || entry.match(/^\./)
      full = path == '.' ? entry : "#{path}/#{entry}"
      found << full
      block.call(full) if block
      found.concat(Dir.recurse(full, &block)) if FileTest.directory?(full)
    end
    found
  end
end
# encoding: UTF-8
require 'fileutils'
require 'iconv'
require 'stemmer.rb'
require 'helpers.rb'
class Indexer
  # Builds a character n-gram / tf-idf index over a directory of text
  # documents and writes the result as a sparse-format Weka ARFF file.
  # #get_class is a template method meant to be overridden (e.g. via
  # class_eval, see the :buildindex rake task) to supply each document's
  # class label.

  # stop words are loaded into this shared list by #get_stop_words
  STOP_WORDS = []

  def initialize
    # data structures
    @ngrams_doc = Hash.new # corpus-wide map: ngram => document frequency
    @docs = 0              # number of docs processed
    # options
    @ngrams = 3            # n in n-grams
    @stemming = true       # apply Porter stemming to words
    @stopwording = true    # remove stop words
    @upperbound = 1        # keep ngrams occurring in LESS than this fraction of docs
    @lowerbound = 0        # keep ngrams occurring in MORE than this fraction of docs
    @round = 6             # number of decimal places for tf-idf
    @name = 'indexer'      # name of the ARFF relation
    # directories & files
    @wdir = Dir.getwd                       # working directory (corpus root)
    @stw_file = 'stop_words_en.txt'         # english stop words file, default
    @tmp_dir = Dir.getwd + '/tmp'           # dir for tmp data
    @out_file = Dir.getwd + '/output.arff'  # ARFF output file
    @ngrams_file = @tmp_dir + '/ngrams.dtf' # data store: corpus ngrams + df
  end

  attr_accessor :ngrams, :stemming, :stopwording, :upperbound, :lowerbound, :name, :stw_file, :out_file, :ngrams_file, :wdir

  # Import stop words (one per line) from @stw_file into STOP_WORDS.
  # Fixed: the original leaked the File handle; the block form closes it.
  def get_stop_words
    File.open(@stw_file) do |f|
      f.each_line do |l|
        STOP_WORDS << l.strip
      end
    end
  end

  # Remove stop words from +str+ (downcased first) when stop-wording is
  # enabled, otherwise return +str+ unchanged.
  # NOTE(review): the alternation has no word boundaries, so stop words
  # are also removed when they occur as substrings of longer words.
  def rm_stop_words str
    @stopwording ? str.downcase.gsub(/(#{STOP_WORDS.join('|')})/, '') : str
  end

  # Porter-stem each whitespace-separated word of +str+ when stemming is
  # enabled (relies on String#stem from the Stemmable mixin).
  def stem str
    if @stemming
      out = String.new
      str.split(' ').each { |s| out << s.stem << ' ' }
      out[0..out.size-2] # drop the trailing space
    else
      str
    end
  end

  # Process a single file: read it, normalise whitespace and quotes,
  # count its n-grams, merge document frequencies into the corpus
  # totals, and marshal the per-document term frequencies into @tmp_dir
  # as <class>_<basename>.
  # NOTE(review): rm_stop_words/stem are never applied here even when
  # the corresponding options are set — confirm whether that is intended.
  def process_file filename
    puts "processing: #{filename}" # fixed: interpolation was garbled in source
    # get class assignment
    doc_class = get_class filename
    puts "class assignment: #{doc_class}"
    # open file and put whole file into a string
    file_str = IO.read(filename)
    # fix encoding errors (NOTE: iconv was removed from the stdlib in Ruby 1.9.3+)
    ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
    file_str = ic.iconv(file_str)
    # trim whitespace (multiple/trailing/leading); replace single quotes
    # with double quotes — escaping can be troublesome for some ngrams
    file_str = file_str.downcase.gsub(/\s/, " ").gsub("'",'"').squeeze(" ").strip
    # n-grams of the current document with their term frequencies
    ngrams_cur = file_str.ngrams(@ngrams).freqs
    # ngram presence map (document-frequency contribution, always 1)
    doc_cur = file_str.ngrams(@ngrams).exists
    # merge into the corpus-wide document frequencies
    @ngrams_doc = @ngrams_doc.merge(doc_cur) do |key,val_old,val_new|
      val_old + val_new
    end
    # store ngrams of the current doc
    fn = File.basename(filename)
    c_file = @tmp_dir + '/' + doc_class + '_' + fn
    File.open(c_file, 'w+') do |f|
      Marshal.dump(ngrams_cur, f)
    end
    # increase number of docs processed
    @docs += 1
  end

  # Return the class label for file +f+. Placeholder — meant to be
  # overridden by users of this class.
  def get_class f
    return 'braaack'
  end

  # Walk @wdir, process every file, prune ngrams whose document
  # frequency falls outside (@lowerbound, @upperbound), and marshal the
  # surviving corpus ngrams to @ngrams_file.
  def process_dir
    # create tmp directory
    FileUtils.mkdir_p @tmp_dir
    # change working dir & traverse through dir
    Dir.chdir(@wdir)
    Dir.recurse(@wdir) do |f|
      process_file(f) unless File.directory?(f)
    end
    # cut ngrams to lower and upper bounds (fraction of docs containing them)
    a = @ngrams_doc.select { |k,v|
      v.to_f/@docs > @lowerbound && v.to_f/@docs < @upperbound
    }
    # rebuild hash; works for both 1.8 (array of pairs) and 1.9 (hash) #select
    @ngrams_doc = Hash[*a.flatten]
    # store all ngrams of corpus
    File.open(@ngrams_file, 'w') do |f|
      f.write Marshal.dump(@ngrams_doc.to_a)
    end
    puts "file #{@ngrams_file} written"
  end

  # Delete the tmp directory and all marshalled per-document files.
  def cleanup
    if File.directory? @tmp_dir
      FileUtils.rm_r @tmp_dir
    end
  end

  # Build the sparse-format ARFF output file from the marshalled tmp data.
  def build_arff
    ngrams = Array.new
    attributes = [['filename','STRING'],['klass','STRING']]
    File.open(@ngrams_file) do |f|
      ngrams = Marshal.load(f)
    end
    # one NUMERIC attribute per surviving corpus ngram
    ngrams.each do |ngram|
      attributes << [ngram[0], 'NUMERIC']
    end
    File.open(@out_file, 'w') do |out|
      # header: relation name
      out.puts '@RELATION ' + @name
      out.puts ''
      # write attributes (ngram text is quoted)
      attributes.each do |a|
        out.puts '@ATTRIBUTE \'' + a[0].to_s + '\' ' + a[1].to_s
      end
      # start data section
      out.puts ''
      out.puts '@DATA'
      # enter tmp dir and get all tf files
      Dir.chdir @tmp_dir
      Dir.glob("*.*").each do |f|
        # ignore the corpus ngrams file itself
        next if f == @ngrams_file.split('/').last
        # attribute 0 is the filename, attribute 1 the class name
        instance = ['0 ' + f.split('_')[1],'1 ' + f.split('_')[0]]
        ngrams_cur = Hash.new
        File.open(f) do |c|
          ngrams_cur = Marshal.load(c)
        end
        # sum of all term frequencies in this document
        dtf = ngrams_cur.values.sum
        # tf-idf for each corpus ngram that appears in this document
        ngrams.each_with_index do |ngram, i|
          if ngrams_cur.has_key? ngram[0]
            # tf-idf = (tf / doc_tf_total) * log(docs / df)
            x = ((ngrams_cur[ngram[0]].to_f/dtf)*(Math.log(@docs/ngram[1].to_f))).round(@round)
            instance << (i+2).to_s + ' ' + x.to_s
          end
        end
        # sort by attribute index — sparse ARFF requires ascending indices.
        # Fixed: the original called non-destructive #sort and discarded
        # its result, leaving the instance unsorted.
        instance.sort! { |x,y| x.to_i <=> y.to_i }
        # write the sparse instance: {idx val, idx val, ...}
        out << '{'
        instance.each_with_index do |e,i|
          if (i == (instance.size-1))
            out.puts e.to_s + '}'
          else
            out << e.to_s + ', '
          end
        end
      end
    end
    puts "file: #{@out_file} written"
  end
end
require './lib/indexer.rb'
# rake task: build the n-gram/tf-idf index for a corpus directory
task :buildindex do
  # mixin: override get_class so the class label is taken implicitly
  # from the folder structure (first hierarchy level under the corpus root)
  Indexer.class_eval do
    def get_class f
      f.split('/')[-2]
    end
  end
  # create and configure the indexer
  indexer = Indexer.new
  indexer.wdir = Dir.getwd + '/corpora/my_text_corpus' # working directory, root directory of corpus
  indexer.ngrams = 4                                   # n in ngrams
  indexer.stemming = true                              # use stemming?
  indexer.stopwording = true                           # use stopwords?
  indexer.upperbound = 0.4  # upper percentage of docs in which a certain ngram has to appear
  indexer.lowerbound = 0.01 # lower percentage of docs in which a certain ngram has to appear
  indexer.name = 'output'                              # name for the relation in the output file
  indexer.stw_file = 'stop_words_en.txt'               # file containing stopwords
  indexer.out_file = Dir.getwd + '/output.arff'        # name of index file
  # run the pipeline
  indexer.process_dir # create temporary data files containing n-grams, frequencies
  indexer.build_arff  # build the output arff file
  indexer.cleanup     # delete temporary files
end
# ruby porter stemmer by ray pareda, additions dingsdax
module Stemmable
# Porter stemming algorithm (M.F. Porter, "An algorithm for suffix
# stripping", 1980). Mixed into String elsewhere in this gist so that
# any string responds to #stem.
# step 2 suffix mappings (applied only when the stem's measure m > 0)
STEP_2_LIST = {
'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
'izer'=>'ize', 'bli'=>'ble',
'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
'ization'=>'ize', 'ation'=>'ate',
'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
'ousness'=>'ous', 'aliti'=>'al',
'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
}
# step 3 suffix mappings (applied only when the stem's measure m > 0)
STEP_3_LIST = {
'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
'ical'=>'ic', 'ful'=>'', 'ness'=>''
}
# suffixes recognised by step 2 (the keys of STEP_2_LIST)
SUFFIX_1_REGEXP = /(
ational |
tional |
enci |
anci |
izer |
bli |
alli |
entli |
eli |
ousli |
ization |
ation |
ator |
alism |
iveness |
fulness |
ousness |
aliti |
iviti |
biliti |
logi)$/x
# residual suffixes stripped outright by step 4 (when m > 1)
SUFFIX_2_REGEXP = /(
al |
ance |
ence |
er |
ic |
able |
ible |
ant |
ement |
ment |
ent |
ou |
ism |
ate |
iti |
ous |
ive |
ize)$/x
C = "[^aeiou]" #consonant
V = "[aeiouy]" #vowel
CC = "#{C}(?>[^aeiouy]*)" #consonant sequence
VV = "#{V}(?>[aeiou]*)" #vowel sequence
MGR0 = /^(#{CC})?#{VV}#{CC}/o #[cc]vvcc... is m>0
MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o #[cc]vvcc[vv] is m=1
MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o #[cc]vvccvvcc... is m>1
VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
PCT_MARKS = ['\.', '!', '\?', ':', '\(', '\)', ' - '] # trailing characters for removal to allow stemming
# Return the Porter stem of the receiver (a String).
# Additions to the classic algorithm: a single trailing punctuation
# mark from PCT_MARKS is removed before stemming and re-appended
# afterwards, and words shorter than 3 characters are returned as-is.
def stem_porter
# check for trailing characters
mark = false
w = String.new(self)
char = w[-1,1]
if w[-1,1].match(/(#{PCT_MARKS.join('|')})/)
w = w.chop
mark = true
end
return w if w.length < 3
# now map initial y to Y so that the patterns never treat it as vowel
w[0] = 'Y' if w[0] == ?y
# Step 1a: plurals (sses -> ss, ies -> i, strip a trailing s)
if w =~ /(ss|i)es$/
w = $` + $1
elsif w =~ /([^s])s$/
w = $` + $1
end
# Step 1b: -eed / -ed / -ing
if w =~ /eed$/
w.chop! if $` =~ MGR0
elsif w =~ /(ed|ing)$/
stem = $`
if stem =~ VOWEL_IN_STEM
w = stem
# tidy the stem after removing -ed/-ing (restore e, undouble, cvc+e)
case w
when /(at|bl|iz)$/ then w << "e"
when /([^aeiouylsz])\1$/ then w.chop!
when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
end
end
end
# Step 1c: terminal y -> i when there is a vowel in the stem
if w =~ /y$/
stem = $`
w = stem + "i" if stem =~ VOWEL_IN_STEM
end
# Step 2
if w =~ SUFFIX_1_REGEXP
stem = $`
suffix = $1
# print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
if stem =~ MGR0
w = stem + STEP_2_LIST[suffix]
end
end
# Step 3
if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
stem = $`
suffix = $1
if stem =~ MGR0
w = stem + STEP_3_LIST[suffix]
end
end
# Step 4: drop residual suffixes when the stem's measure m > 1
if w =~ SUFFIX_2_REGEXP
stem = $`
if stem =~ MGR1
w = stem
end
elsif w =~ /(s|t)(ion)$/
stem = $` + $1
if stem =~ MGR1
w = stem
end
end
# Step 5: remove a trailing e (m > 1, or m = 1 without cvc ending)
if w =~ /e$/
stem = $`
if (stem =~ MGR1) ||
(stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
w = stem
end
end
# Step 5b: undouble a trailing ll when m > 1
if w =~ /ll$/ && w =~ MGR1
w.chop!
end
# and turn initial Y back to y
w[0] = 'y' if w[0] == ?Y
# put the trailing character back
if mark
w + char
else
w
end
end
alias stem stem_porter
end
aber
als
am
an
auch
auf
aus
bei
bin
bis
bist
da
dadurch
daher
darum
das
daß
dass
dein
deine
dem
den
der
des
dessen
deshalb
die
dies
dieser
dieses
doch
dort
du
durch
ein
eine
einem
einen
einer
eines
er
es
euer
eure
für
hatte
hatten
hattest
hattet
hier
hinter
ich
ihr
ihre
im
in
ist
ja
jede
jedem
jeden
jeder
jedes
jener
jenes
jetzt
kann
kannst
können
könnt
machen
mein
meine
mit
muß
mußt
musst
müssen
müßt
nach
nachdem
nein
nicht
nun
oder
seid
sein
seine
sich
sie
sind
soll
sollen
sollst
sollt
sonst
soweit
sowie
und
unser
unsere
unter
vom
von
vor
wann
warum
was
weiter
weitere
wenn
wer
werde
werden
werdet
weshalb
wie
wieder
wieso
wir
wird
wirst
wo
woher
wohin
zu
zum
zur
über
a
about
above
across
after
afterwards
again
against
all
almost
alone
along
already
also
although
always
am
among
amongst
amoungst
amount
an
and
another
any
anyhow
anyone
anything
anyway
anywhere
are
around
as
at
back
be
became
because
become
becomes
becoming
been
before
beforehand
behind
being
below
beside
besides
between
beyond
bill
both
bottom
but
by
call
can
cannot
cant
co
computer
con
could
couldnt
cry
de
describe
detail
do
done
down
due
during
each
eg
eight
either
eleven
else
elsewhere
empty
enough
etc
even
ever
every
everyone
everything
everywhere
except
few
fifteen
fifty
fill
find
fire
first
five
for
former
formerly
forty
found
four
from
front
full
further
get
give
go
had
has
hasnt
have
he
hence
her
here
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
how
however
hundred
i
ie
if
in
inc
indeed
interest
into
is
it
its
itself
keep
last
latter
latterly
least
less
ltd
made
many
may
me
meanwhile
might
mill
mine
more
moreover
most
mostly
move
much
must
my
myself
name
namely
neither
never
nevertheless
next
nine
no
nobody
none
noone
nor
not
nothing
now
nowhere
of
off
often
on
once
one
only
onto
or
other
others
otherwise
our
ours
ourselves
out
over
own
part
per
perhaps
please
put
rather
re
same
see
seem
seemed
seeming
seems
serious
several
she
should
show
side
since
sincere
six
sixty
so
some
somehow
someone
something
sometime
sometimes
somewhere
still
such
system
take
ten
than
that
the
their
them
themselves
then
thence
there
thereafter
thereby
therefore
therein
thereupon
these
they
thick
thin
third
this
those
though
three
through
throughout
thru
thus
to
together
too
top
toward
towards
twelve
twenty
two
un
under
until
up
upon
us
very
via
was
we
well
were
what
whatever
when
whence
whenever
where
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
whoever
whole
whom
whose
why
will
with
within
without
would
yet
you
your
yours
yourself
yourselves
これ
それ
あれ
この
その
あの
ここ
そこ
あそこ
こちら
どこ
だれ
なに
なん
貴方
貴方方
我々
私達
あの人
あのかた
彼女
です
あります
おります
います
から
まで
より
どの
それで
しかし
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment