Created
December 28, 2009 22:53
-
-
Save muddana/265015 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Frequency table mapping each word to the number of times it has been added.
class Collection
  def initialize
    # A default of 0 lets a word be incremented without a prior existence check.
    @map = Hash.new(0)
  end

  # Increments the stored count for +word+, starting from 1 on first sight.
  #
  # Returns the updated count.
  def inc_frequency(word)
    @map[word] += 1
  end

  # Records one occurrence of +word+.
  #
  # Returns the updated count for that word.
  def add(word)
    inc_frequency(word)
  end

  # Renders the table as text, one "word , count" line per entry, in the
  # order the words were first added. No quoting or escaping is applied.
  #
  # Returns a String ("" when nothing has been added).
  def to_csv
    @map.map { |word, count| "#{word} , #{count}\n" }.join
  end

  # Writes one "word => count" line per entry to standard output.
  def pretty_print
    @map.each do |word, count|
      puts "#{word} => #{count}"
    end
  end
end
# Scans the regular files directly inside a directory and counts the words
# that begin with a given letter.
class Analyser
  # dir - path of the directory whose files are analysed.
  def initialize(dir)
    @dir = dir
    @coll = Collection.new
  end

  # Builds a fresh Collection holding the lower-cased words, across every file
  # in the directory, that start with +letter+ or its upper-case form.
  #
  # letter - String with the initial letter to look for.
  #
  # Returns the Collection (also kept in @coll until the next call).
  def get_words_starting_with(letter)
    @coll = Collection.new
    pattern = word_pattern(letter)
    analyse(@dir) do |text|
      # scan with a capture group returns one single-element array per match.
      text.scan(pattern).flatten.each do |word|
        construct_coll(word.downcase)
      end
    end
    @coll
  end

  private

  # Regexp matching a run of ASCII letters that starts with +letter+ (or its
  # upper-case form) and sits either at the very start of the text or right
  # after a non-word character. The word itself is capture group 1.
  def word_pattern(letter)
    # Escape so a regexp metacharacter in +letter+ cannot alter the char class.
    initials = Regexp.escape(letter + letter.upcase)
    /(?:\A|\W)([#{initials}][a-zA-Z]*)/
  end

  # Returns the whole content of the file at +path+ ("" for an empty file).
  def read_file(path)
    File.read(path)
  end

  # Adds +word+ to the collection currently being built.
  def construct_coll(word)
    @coll.add(word)
  end

  # Yields the text of every regular file found directly inside +dir+.
  # Entries that are not regular files (".", "..", sub-directories, ...) are
  # skipped, since reading them would raise.
  def analyse(dir)
    Dir.foreach(dir) do |file_name|
      path = File.join(dir, file_name)
      next unless File.file?(path)

      yield(read_file(path)) if block_given?
    end
  end
end
# Command-line entry point: the first argument is the directory to analyse.
# Writes the counts of words starting with "a" to output.csv in the current
# directory, then prints the counts for every letter from "a" to "z".
if (dir_path = ARGV[0])
  analyser = Analyser.new(dir_path)
  # Block form closes (and so flushes) the file as soon as the CSV is written.
  File.open('output.csv', 'w') do |csv_file|
    csv_file.write(analyser.get_words_starting_with('a').to_csv)
  end
  ('a'..'z').each do |alph|
    analyser.get_words_starting_with(alph).pretty_print
  end
else
  puts "Please pass the directory path as the arguement"
end
# test
# Crawls +url+ with the anemone gem and, for every page reported by the
# crawler, writes page.doc to a file in the current working directory. Each
# file is named with the SHA1 hex digest of the page URL.
#
# The requires live inside the method so the anemone gem is only needed when
# test data is actually being collected.
#
# url - String URL handed to Anemone.crawl as the starting point.
def collect_test_data(url)
  require 'rubygems'
  require 'anemone'
  require 'digest/sha1'
  Anemone.crawl(url) do |anemone|
    anemone.on_every_page do |page|
      File.open(Digest::SHA1.hexdigest(page.url.to_s), 'w') do |file|
        # NOTE(review): page.doc is presumably the parsed document and may be
        # nil for non-HTML responses (which would write an empty file) - confirm.
        file.write page.doc
      end
    end
  end
end
# collect_test_data('http://www.google.com')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment