Skip to content

Instantly share code, notes, and snippets.

@dingsdax
Last active January 17, 2021 20:00
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dingsdax/1339123 to your computer and use it in GitHub Desktop.
Save dingsdax/1339123 to your computer and use it in GitHub Desktop.
n-grams/tf-idf indexer written in ruby
#adds helper methods to standard ruby classes
# Extensions to String: n-gram extraction and stemming support.
class String
  # mix in Porter-stemmer support (Stemmable is defined elsewhere in this gist)
  include Stemmable

  # Return all contiguous character n-grams of length +len+ as an array.
  # A +len+ larger than the string is clamped to the full string length.
  def ngrams(len = 1)
    len = size if len > size
    (0..size - len).map { |start| self[start...(start + len)] }
  end

  # Return the last character of the string.
  def last
    self[size - 1, 1]
  end
end
# Extensions to Array: frequency counting, presence maps, summation.
class Array
  # Return a hash mapping each element to its number of occurrences.
  def freqs
    counts = Hash.new(0)
    each { |item| counts[item] += 1 }
    counts
  end

  # Return a hash with each element as a key and every value set to 1
  # (a presence/existence map, used for document frequencies).
  def exists
    presence = Hash.new(0)
    each { |item| presence[item] = 1 }
    presence
  end

  # Sum all elements with +; returns nil for an empty array.
  # NOTE(review): on Ruby >= 2.4 this shadows the built-in Array#sum
  # (which returns 0 for an empty array) — confirm that is intended.
  def sum
    inject(nil) { |total, item| total ? total + item : item }
  end
end
# Extensions to Hash: sorting by value and value normalisation.
class Hash
  # Return an array of [key, value] pairs sorted by value, descending.
  def sort_num_value
    to_a.sort_by { |_key, value| -value }
  end

  # Destructively wrap every value in a one-element array.
  # Returns self (each_pair's return value).
  # Fixed: removed a dead `self` expression inside the block that had
  # no effect.
  def mk_ary_val
    each_pair do |key, value|
      self[key] = [value]
    end
  end
end
# Backport of Float#round with a digits argument (pre-1.9 Rubies only
# supported argument-less rounding).
class Float
  # keep a handle on the built-in rounding method
  alias_method :round_orig, :round

  # Round to +n+ decimal places by shifting the decimal point, rounding
  # to an integer, then shifting back.
  # NOTE(review): scaling through powers of 10.0 is subject to the usual
  # binary floating-point representation error.
  def round(n = 0)
    shifted = (self * (10.0 ** n)).round_orig
    shifted * (10.0 ** (-n))
  end
end
# Extension to Dir: recursive directory traversal.
class Dir
  # Recursively walk +path+, returning an array of all file and
  # directory paths found. Dot-entries and hidden files (leading '.')
  # are skipped. An optional block is called with each path as it is
  # discovered.
  # NOTE(review): the +ext+ parameter is accepted but currently unused —
  # no extension filtering is performed.
  def self.recurse(path = '.', ext = '*', &block)
    skip = ['.', '..']
    found = []
    Dir.foreach(path) do |entry|
      next if skip.include?(entry) || entry.match(/^\./)
      full = path == '.' ? entry : "#{path}/#{entry}"
      found << full
      block.call(full) if block
      found.concat(Dir.recurse(full, &block)) if FileTest.directory?(full)
    end
    found
  end
end
# encoding: UTF-8
require 'fileutils'
require 'iconv'
require 'stemmer.rb'
require 'helpers.rb'
class Indexer
  # Builds a character n-gram / tf-idf index over a directory of text
  # documents and writes the result as a sparse-format Weka ARFF file.
  # #get_class is a template method meant to be overridden (e.g. via
  # class_eval, see the :buildindex rake task) to supply each document's
  # class label.

  # stop words are loaded into this shared list by #get_stop_words
  STOP_WORDS = []

  def initialize
    # data structures
    @ngrams_doc = Hash.new # corpus-wide map: ngram => document frequency
    @docs = 0              # number of docs processed
    # options
    @ngrams = 3            # n in n-grams
    @stemming = true       # apply Porter stemming to words
    @stopwording = true    # remove stop words
    @upperbound = 1        # keep ngrams occurring in LESS than this fraction of docs
    @lowerbound = 0        # keep ngrams occurring in MORE than this fraction of docs
    @round = 6             # number of decimal places for tf-idf
    @name = 'indexer'      # name of the ARFF relation
    # directories & files
    @wdir = Dir.getwd                       # working directory (corpus root)
    @stw_file = 'stop_words_en.txt'         # english stop words file, default
    @tmp_dir = Dir.getwd + '/tmp'           # dir for tmp data
    @out_file = Dir.getwd + '/output.arff'  # ARFF output file
    @ngrams_file = @tmp_dir + '/ngrams.dtf' # data store: corpus ngrams + df
  end

  attr_accessor :ngrams, :stemming, :stopwording, :upperbound, :lowerbound, :name, :stw_file, :out_file, :ngrams_file, :wdir

  # Import stop words (one per line) from @stw_file into STOP_WORDS.
  # Fixed: the original leaked the File handle; the block form closes it.
  def get_stop_words
    File.open(@stw_file) do |f|
      f.each_line do |l|
        STOP_WORDS << l.strip
      end
    end
  end

  # Remove stop words from +str+ (downcased first) when stop-wording is
  # enabled, otherwise return +str+ unchanged.
  # NOTE(review): the alternation has no word boundaries, so stop words
  # are also removed when they occur as substrings of longer words.
  def rm_stop_words str
    @stopwording ? str.downcase.gsub(/(#{STOP_WORDS.join('|')})/, '') : str
  end

  # Porter-stem each whitespace-separated word of +str+ when stemming is
  # enabled (relies on String#stem from the Stemmable mixin).
  def stem str
    if @stemming
      out = String.new
      str.split(' ').each { |s| out << s.stem << ' ' }
      out[0..out.size-2] # drop the trailing space
    else
      str
    end
  end

  # Process a single file: read it, normalise whitespace and quotes,
  # count its n-grams, merge document frequencies into the corpus
  # totals, and marshal the per-document term frequencies into @tmp_dir
  # as <class>_<basename>.
  # NOTE(review): rm_stop_words/stem are never applied here even when
  # the corresponding options are set — confirm whether that is intended.
  def process_file filename
    puts "processing: #{filename}" # fixed: interpolation was garbled in source
    # get class assignment
    doc_class = get_class filename
    puts "class assignment: #{doc_class}"
    # open file and put whole file into a string
    file_str = IO.read(filename)
    # fix encoding errors (NOTE: iconv was removed from the stdlib in Ruby 1.9.3+)
    ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
    file_str = ic.iconv(file_str)
    # trim whitespace (multiple/trailing/leading); replace single quotes
    # with double quotes — escaping can be troublesome for some ngrams
    file_str = file_str.downcase.gsub(/\s/, " ").gsub("'",'"').squeeze(" ").strip
    # n-grams of the current document with their term frequencies
    ngrams_cur = file_str.ngrams(@ngrams).freqs
    # ngram presence map (document-frequency contribution, always 1)
    doc_cur = file_str.ngrams(@ngrams).exists
    # merge into the corpus-wide document frequencies
    @ngrams_doc = @ngrams_doc.merge(doc_cur) do |key,val_old,val_new|
      val_old + val_new
    end
    # store ngrams of the current doc
    fn = File.basename(filename)
    c_file = @tmp_dir + '/' + doc_class + '_' + fn
    File.open(c_file, 'w+') do |f|
      Marshal.dump(ngrams_cur, f)
    end
    # increase number of docs processed
    @docs += 1
  end

  # Return the class label for file +f+. Placeholder — meant to be
  # overridden by users of this class.
  def get_class f
    return 'braaack'
  end

  # Walk @wdir, process every file, prune ngrams whose document
  # frequency falls outside (@lowerbound, @upperbound), and marshal the
  # surviving corpus ngrams to @ngrams_file.
  def process_dir
    # create tmp directory
    FileUtils.mkdir_p @tmp_dir
    # change working dir & traverse through dir
    Dir.chdir(@wdir)
    Dir.recurse(@wdir) do |f|
      process_file(f) unless File.directory?(f)
    end
    # cut ngrams to lower and upper bounds (fraction of docs containing them)
    a = @ngrams_doc.select { |k,v|
      v.to_f/@docs > @lowerbound && v.to_f/@docs < @upperbound
    }
    # rebuild hash; works for both 1.8 (array of pairs) and 1.9 (hash) #select
    @ngrams_doc = Hash[*a.flatten]
    # store all ngrams of corpus
    File.open(@ngrams_file, 'w') do |f|
      f.write Marshal.dump(@ngrams_doc.to_a)
    end
    puts "file #{@ngrams_file} written"
  end

  # Delete the tmp directory and all marshalled per-document files.
  def cleanup
    if File.directory? @tmp_dir
      FileUtils.rm_r @tmp_dir
    end
  end

  # Build the sparse-format ARFF output file from the marshalled tmp data.
  def build_arff
    ngrams = Array.new
    attributes = [['filename','STRING'],['klass','STRING']]
    File.open(@ngrams_file) do |f|
      ngrams = Marshal.load(f)
    end
    # one NUMERIC attribute per surviving corpus ngram
    ngrams.each do |ngram|
      attributes << [ngram[0], 'NUMERIC']
    end
    File.open(@out_file, 'w') do |out|
      # header: relation name
      out.puts '@RELATION ' + @name
      out.puts ''
      # write attributes (ngram text is quoted)
      attributes.each do |a|
        out.puts '@ATTRIBUTE \'' + a[0].to_s + '\' ' + a[1].to_s
      end
      # start data section
      out.puts ''
      out.puts '@DATA'
      # enter tmp dir and get all tf files
      Dir.chdir @tmp_dir
      Dir.glob("*.*").each do |f|
        # ignore the corpus ngrams file itself
        next if f == @ngrams_file.split('/').last
        # attribute 0 is the filename, attribute 1 the class name
        instance = ['0 ' + f.split('_')[1],'1 ' + f.split('_')[0]]
        ngrams_cur = Hash.new
        File.open(f) do |c|
          ngrams_cur = Marshal.load(c)
        end
        # sum of all term frequencies in this document
        dtf = ngrams_cur.values.sum
        # tf-idf for each corpus ngram that appears in this document
        ngrams.each_with_index do |ngram, i|
          if ngrams_cur.has_key? ngram[0]
            # tf-idf = (tf / doc_tf_total) * log(docs / df)
            x = ((ngrams_cur[ngram[0]].to_f/dtf)*(Math.log(@docs/ngram[1].to_f))).round(@round)
            instance << (i+2).to_s + ' ' + x.to_s
          end
        end
        # sort by attribute index — sparse ARFF requires ascending indices.
        # Fixed: the original called non-destructive #sort and discarded
        # its result, leaving the instance unsorted.
        instance.sort! { |x,y| x.to_i <=> y.to_i }
        # write the sparse instance: {idx val, idx val, ...}
        out << '{'
        instance.each_with_index do |e,i|
          if (i == (instance.size-1))
            out.puts e.to_s + '}'
          else
            out << e.to_s + ', '
          end
        end
      end
    end
    puts "file: #{@out_file} written"
  end
end
require './lib/indexer.rb'
# rake task: build the n-gram/tf-idf index for a corpus directory
task :buildindex do
  # mixin: override get_class so the class label is taken implicitly
  # from the folder structure (first hierarchy level under the corpus root)
  Indexer.class_eval do
    def get_class f
      f.split('/')[-2]
    end
  end
  # create and configure the indexer
  indexer = Indexer.new
  indexer.wdir = Dir.getwd + '/corpora/my_text_corpus' # working directory, root directory of corpus
  indexer.ngrams = 4                                   # n in ngrams
  indexer.stemming = true                              # use stemming?
  indexer.stopwording = true                           # use stopwords?
  indexer.upperbound = 0.4  # upper percentage of docs in which a certain ngram has to appear
  indexer.lowerbound = 0.01 # lower percentage of docs in which a certain ngram has to appear
  indexer.name = 'output'                              # name for the relation in the output file
  indexer.stw_file = 'stop_words_en.txt'               # file containing stopwords
  indexer.out_file = Dir.getwd + '/output.arff'        # name of index file
  # run the pipeline
  indexer.process_dir # create temporary data files containing n-grams, frequencies
  indexer.build_arff  # build the output arff file
  indexer.cleanup     # delete temporary files
end
# ruby porter stemmer by ray pareda, additions dingsdax
module Stemmable
# Porter stemming algorithm (M.F. Porter, "An algorithm for suffix
# stripping", 1980). Mixed into String elsewhere in this gist so that
# any string responds to #stem.
# step 2 suffix mappings (applied only when the stem's measure m > 0)
STEP_2_LIST = {
'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
'izer'=>'ize', 'bli'=>'ble',
'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
'ization'=>'ize', 'ation'=>'ate',
'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
'ousness'=>'ous', 'aliti'=>'al',
'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
}
# step 3 suffix mappings (applied only when the stem's measure m > 0)
STEP_3_LIST = {
'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
'ical'=>'ic', 'ful'=>'', 'ness'=>''
}
# suffixes recognised by step 2 (the keys of STEP_2_LIST)
SUFFIX_1_REGEXP = /(
ational |
tional |
enci |
anci |
izer |
bli |
alli |
entli |
eli |
ousli |
ization |
ation |
ator |
alism |
iveness |
fulness |
ousness |
aliti |
iviti |
biliti |
logi)$/x
# residual suffixes stripped outright by step 4 (when m > 1)
SUFFIX_2_REGEXP = /(
al |
ance |
ence |
er |
ic |
able |
ible |
ant |
ement |
ment |
ent |
ou |
ism |
ate |
iti |
ous |
ive |
ize)$/x
C = "[^aeiou]" #consonant
V = "[aeiouy]" #vowel
CC = "#{C}(?>[^aeiouy]*)" #consonant sequence
VV = "#{V}(?>[aeiou]*)" #vowel sequence
MGR0 = /^(#{CC})?#{VV}#{CC}/o #[cc]vvcc... is m>0
MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o #[cc]vvcc[vv] is m=1
MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o #[cc]vvccvvcc... is m>1
VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
PCT_MARKS = ['\.', '!', '\?', ':', '\(', '\)', ' - '] # trailing characters for removal to allow stemming
# Return the Porter stem of the receiver (a String).
# Additions to the classic algorithm: a single trailing punctuation
# mark from PCT_MARKS is removed before stemming and re-appended
# afterwards, and words shorter than 3 characters are returned as-is.
def stem_porter
# check for trailing characters
mark = false
w = String.new(self)
char = w[-1,1]
if w[-1,1].match(/(#{PCT_MARKS.join('|')})/)
w = w.chop
mark = true
end
return w if w.length < 3
# now map initial y to Y so that the patterns never treat it as vowel
w[0] = 'Y' if w[0] == ?y
# Step 1a: plurals (sses -> ss, ies -> i, strip a trailing s)
if w =~ /(ss|i)es$/
w = $` + $1
elsif w =~ /([^s])s$/
w = $` + $1
end
# Step 1b: -eed / -ed / -ing
if w =~ /eed$/
w.chop! if $` =~ MGR0
elsif w =~ /(ed|ing)$/
stem = $`
if stem =~ VOWEL_IN_STEM
w = stem
# tidy the stem after removing -ed/-ing (restore e, undouble, cvc+e)
case w
when /(at|bl|iz)$/ then w << "e"
when /([^aeiouylsz])\1$/ then w.chop!
when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
end
end
end
# Step 1c: terminal y -> i when there is a vowel in the stem
if w =~ /y$/
stem = $`
w = stem + "i" if stem =~ VOWEL_IN_STEM
end
# Step 2
if w =~ SUFFIX_1_REGEXP
stem = $`
suffix = $1
# print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
if stem =~ MGR0
w = stem + STEP_2_LIST[suffix]
end
end
# Step 3
if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
stem = $`
suffix = $1
if stem =~ MGR0
w = stem + STEP_3_LIST[suffix]
end
end
# Step 4: drop residual suffixes when the stem's measure m > 1
if w =~ SUFFIX_2_REGEXP
stem = $`
if stem =~ MGR1
w = stem
end
elsif w =~ /(s|t)(ion)$/
stem = $` + $1
if stem =~ MGR1
w = stem
end
end
# Step 5: remove a trailing e (m > 1, or m = 1 without cvc ending)
if w =~ /e$/
stem = $`
if (stem =~ MGR1) ||
(stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
w = stem
end
end
# Step 5b: undouble a trailing ll when m > 1
if w =~ /ll$/ && w =~ MGR1
w.chop!
end
# and turn initial Y back to y
w[0] = 'y' if w[0] == ?Y
# put the trailing character back
if mark
w + char
else
w
end
end
alias stem stem_porter
end
aber
als
am
an
auch
auf
aus
bei
bin
bis
bist
da
dadurch
daher
darum
das
daß
dass
dein
deine
dem
den
der
des
dessen
deshalb
die
dies
dieser
dieses
doch
dort
du
durch
ein
eine
einem
einen
einer
eines
er
es
euer
eure
für
hatte
hatten
hattest
hattet
hier
hinter
ich
ihr
ihre
im
in
ist
ja
jede
jedem
jeden
jeder
jedes
jener
jenes
jetzt
kann
kannst
können
könnt
machen
mein
meine
mit
muß
mußt
musst
müssen
müßt
nach
nachdem
nein
nicht
nun
oder
seid
sein
seine
sich
sie
sind
soll
sollen
sollst
sollt
sonst
soweit
sowie
und
unser
unsere
unter
vom
von
vor
wann
warum
was
weiter
weitere
wenn
wer
werde
werden
werdet
weshalb
wie
wieder
wieso
wir
wird
wirst
wo
woher
wohin
zu
zum
zur
über
a
about
above
across
after
afterwards
again
against
all
almost
alone
along
already
also
although
always
am
among
amongst
amoungst
amount
an
and
another
any
anyhow
anyone
anything
anyway
anywhere
are
around
as
at
back
be
became
because
become
becomes
becoming
been
before
beforehand
behind
being
below
beside
besides
between
beyond
bill
both
bottom
but
by
call
can
cannot
cant
co
computer
con
could
couldnt
cry
de
describe
detail
do
done
down
due
during
each
eg
eight
either
eleven
else
elsewhere
empty
enough
etc
even
ever
every
everyone
everything
everywhere
except
few
fifteen
fifty
fill
find
fire
first
five
for
former
formerly
forty
found
four
from
front
full
further
get
give
go
had
has
hasnt
have
he
hence
her
here
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
how
however
hundred
i
ie
if
in
inc
indeed
interest
into
is
it
its
itself
keep
last
latter
latterly
least
less
ltd
made
many
may
me
meanwhile
might
mill
mine
more
moreover
most
mostly
move
much
must
my
myself
name
namely
neither
never
nevertheless
next
nine
no
nobody
none
noone
nor
not
nothing
now
nowhere
of
off
often
on
once
one
only
onto
or
other
others
otherwise
our
ours
ourselves
out
over
own
part
per
perhaps
please
put
rather
re
same
see
seem
seemed
seeming
seems
serious
several
she
should
show
side
since
sincere
six
sixty
so
some
somehow
someone
something
sometime
sometimes
somewhere
still
such
system
take
ten
than
that
the
their
them
themselves
then
thence
there
thereafter
thereby
therefore
therein
thereupon
these
they
thick
thin
third
this
those
though
three
through
throughout
thru
thus
to
together
too
top
toward
towards
twelve
twenty
two
un
under
until
up
upon
us
very
via
was
we
well
were
what
whatever
when
whence
whenever
where
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
whoever
whole
whom
whose
why
will
with
within
without
would
yet
you
your
yours
yourself
yourselves
これ
それ
あれ
この
その
あの
ここ
そこ
あそこ
こちら
どこ
だれ
なに
なん
貴方
貴方方
我々
私達
あの人
あのかた
彼女
です
あります
おります
います
から
まで
より
どの
それで
しかし
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment