Skip to content

Instantly share code, notes, and snippets.

@muddana
Created December 28, 2009 22:53
Show Gist options
  • Save muddana/265015 to your computer and use it in GitHub Desktop.
Save muddana/265015 to your computer and use it in GitHub Desktop.
# Frequency table: maps each added word to the number of times it was added.
class Collection
  def initialize
    @map = {}
  end

  # Raises the tally stored for +word+ by one (an unseen word starts at 1)
  # and returns the updated tally.
  def inc_frequency(word)
    seen = @map[word]
    @map[word] = seen ? seen + 1 : 1
  end

  # Records one occurrence of +word+; delegates to #inc_frequency.
  def add(word)
    inc_frequency(word)
  end

  # Returns a String holding one "word , count" line per stored entry.
  def to_csv
    @map.each_with_object("") do |(word, tally), buffer|
      buffer << "#{word} , #{tally}\n"
    end
  end

  # Prints one "word => count" line per stored entry to standard output.
  def pretty_print
    @map.each_pair { |word, tally| puts "#{word} => #{tally}" }
  end
end
# Counts, across the regular files directly inside a directory, the words
# that begin with a given letter.
class Analyser
  # dir - path of the directory whose files are scanned.
  def initialize(dir)
    @dir = dir
    @coll = Collection.new
  end

  # Scans every regular file in the directory for words that start with
  # +letter+ (or its upper-case form) and continue with ASCII letters.
  # Each match is lower-cased before it is counted.
  #
  # letter - String with the initial character(s) to look for.
  #
  # Returns a fresh Collection holding the counts; results of earlier calls
  # are not carried over.
  def get_words_starting_with(letter)
    @coll = Collection.new
    # Escape the initials so characters such as "^" or "-" are taken
    # literally inside the character class.
    initials = Regexp.escape(letter) + Regexp.escape(letter.upcase)
    # The negative lookbehind accepts a word at the very start of the text
    # as well as one preceded by a non-word character, without consuming
    # the preceding character.
    pattern = /(?<!\w)[#{initials}][a-zA-Z]*/
    analyse(@dir) do |text|
      text.scan(pattern) { |word| construct_coll(word.downcase) }
    end
    @coll
  end

  private

  # Returns the whole content of +file_name+ inside +dir+ as a String
  # ("" when nothing could be read).
  def read_file(file_name, dir = @dir)
    File.read(File.join(dir, file_name)) || ""
  end

  # Records one occurrence of +word+ in the current collection.
  def construct_coll(word)
    @coll.add(word)
  end

  # Yields the content of each regular file directly inside +dir+.
  # Anything that is not a regular file (".", "..", subdirectories) is
  # skipped because it cannot be read as text.
  def analyse(dir)
    # Dir.foreach closes the directory handle once iteration finishes.
    Dir.foreach(dir) do |file_name|
      next unless File.file?(File.join(dir, file_name))

      text = read_file(file_name, dir)
      yield(text) if block_given?
    end
  end
end
# Script entry point. With a directory path as the first command-line
# argument: writes the counts of words starting with "a" to output.csv in
# the current directory, then prints the counts for every letter from "a"
# to "z". Without an argument: prints a usage hint.
if (dir_path = ARGV[0])
  analyser = Analyser.new(dir_path)
  # Block form of File.open closes (and flushes) output.csv when the block
  # exits, even if the analysis raises.
  File.open('output.csv', 'w') do |csv_file|
    csv_file.write(analyser.get_words_starting_with('a').to_csv)
  end
  ('a'..'z').each do |alph|
    analyser.get_words_starting_with(alph).pretty_print
  end
else
  puts "Please pass the directory path as the argument"
end
#test
# Crawls +url+ with Anemone and, for every page the crawler reports, writes
# page.doc to a file in the current working directory. The file is named
# after the SHA1 hex digest of the page's URL.
def collect_test_data(url)
  # Loaded here rather than at file level, so the gems are only needed when
  # this helper is actually called.
  require 'rubygems'
  require 'anemone'
  require 'digest/sha1'

  Anemone.crawl(url) do |crawler|
    crawler.on_every_page do |fetched|
      dump_name = Digest::SHA1.hexdigest(fetched.url.to_s)
      File.open(dump_name, 'w') { |out| out.write fetched.doc }
    end
  end
end
#collect_test_data('http://www.google.com')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment