karmi (owner)

Forks

Revisions

gist: 222260 Download_button fork
public
Public Clone URL: git://gist.github.com/222260.git
Embed All Files: show embed
phpfashion_com_exercise.rb #
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
# Simple exercise based on call to action at
# http://phpfashion.com/cisty-programatorsky-experiment
 
require 'net/http'
require 'tmpdir'
require 'digest/md5'
require 'fileutils'
require 'pathname'
 
module Crawler
 
  # Generic file-based storage. Do your own: MySQLStorage, etc.
  #
  # In real life, you'd create an abstract class to define the "interface" etc :P
  # In real life, you'd need some Cache class to wrap the Storage.
  # You need to ask the cache for data, not the storage. But we don't care here.
  # And <b>first of all</b>, in real life you'd need some expiration logic :)
  #
  class FileStorage
    def initialize(path=nil)
      raise ArgumentError, "Storage path does not exist!" if path && !File.exist?(path)
      path ||= Dir::tmpdir
      @store = Pathname.new(path)
    end
    def set(key, value); File.open( @store.join(encode(key)), 'w' ) { |file| file << Marshal.dump(value) }; end
    def get(key); Marshal.load(File.read( @store.join(encode(key)))); end
    def exist?(key); File.exist? @store.join(encode(key)); end
    private
    def encode(key); Digest::MD5.hexdigest(key); end
  end
 
  # = Wrap a HTML page
  #
  # The +load+ method returns the representation of HTML page either from cache,
  # or from the network (and caches it)
  #
  # == Usage
  #
  # require 'web_page'
  # page = Crawler::WebPage.load('http://example.com')
  # puts page.url
  # puts page.body
  # puts page.headers.inspect
  #
  class WebPage
 
    @cache = Crawler::FileStorage.new
 
    def self.load(url)
      return cache.get( url ) if cache.exist?( url )
      url = URI.parse(url)
      url.path = '/' if url.path =~ /^$/
      client = Net::HTTP.start(url.host, url.port)
      response = client.request_get(url.path)
      webpage = WebPage.new( url.to_s, response.body, response.to_hash )
      cache.set(url.to_s, webpage)
      return webpage
    end
 
    attr_reader :url, :body, :headers
    def initialize(url, body, headers)
      @url, @body, @headers = url, body, headers
      self
    end
 
    def self.cache; @cache; end
    def thumbnail
      @thumbnail ||= create_thumbnail
    end
 
    private
 
    def create_thumbnail
      # Thumbnail.new(url) ... Some expensive logic ... etc
    end
 
  end
 
end
 
 
# Self-test: runs only when this file is executed directly.
# Network access is replaced by FakeWeb, which serves the canned HTTP response
# stored after __END__.
#
# NOTE(review): WebPage caches into the system temporary directory by default,
# so once a page has been cached these tests may be answered from that cache
# instead of FakeWeb.
if $0 == __FILE__

  require 'test/unit'
  require 'rubygems'
  require 'fakeweb'
  require 'shoulda'

  include Crawler

  FakeWeb.register_uri :get, 'http://example.com', :response => DATA.read
  FakeWeb.allow_net_connect = false

  class WebPageTest < Test::Unit::TestCase

    context "When loading a URL, it" do

      should "handle the request" do
        assert_nothing_raised { @webpage = WebPage.load('http://example.com/') }
        assert_not_nil @webpage
      end

      should "add trailing slash" do
        assert_nothing_raised { @webpage = WebPage.load('http://example.com') }
        assert_not_nil @webpage
      end

      should "return the URL with trailing slash back" do
        @webpage = WebPage.load('http://example.com')
        assert_equal 'http://example.com/', @webpage.url
      end

      should "parse the body" do
        @webpage = WebPage.load('http://example.com')
        assert_match(/Example Web Page/, @webpage.body)
      end

      should "parse the headers" do
        @webpage = WebPage.load('http://example.com')
        assert_not_nil @webpage.headers
        assert_equal 'text/html; charset=UTF-8', @webpage.headers['content-type'].to_s
      end

      should "have thumbnail" do
        @webpage = WebPage.load('http://example.com')
        assert_respond_to @webpage, :thumbnail
      end

      should "cache the response" do
        @webpage = WebPage.load('http://example.com/')
        # exist? answers true/false, so assert_not_nil could never fail here.
        assert WebPage.cache.exist?( 'http://example.com/' ), "Response was not cached"
        @cached = WebPage.cache.get( 'http://example.com/' )
        assert_instance_of WebPage, @cached
      end

      should "load valid web page from cache" do
        @webpage = WebPage.load('http://example.com/')
        @cached = WebPage.cache.get( 'http://example.com/' )
        assert_equal @webpage.url, @cached.url
        assert_equal @webpage.body, @cached.body
        assert_equal @webpage.headers, @cached.headers
      end
    end

    # ---------------------------------------------------------------------------

    context "FileStorage" do
      # Each test gets a scratch directory next to this file; it is removed
      # again in teardown.
      setup do
        @tmp_path = File.join(File.dirname(__FILE__), 'tmp')
        FileUtils.mkdir_p @tmp_path
      end

      teardown do
        FileUtils.rm_rf @tmp_path
      end

      should "be initialized with a valid path" do
        assert_nothing_raised { @storage = FileStorage.new @tmp_path }
        assert File.exist?(@tmp_path), "Path does not exist"
      end

      should "raise when initialized with invalid path" do
        assert_raise(ArgumentError) { FileStorage.new('/some/path/to/hell') }
      end

      should "not have data missing key" do
        @storage = FileStorage.new @tmp_path
        assert_equal false, @storage.exist?('who-do-you-think-you-are-talking-to')
      end

      should "have data for valid key" do
        @storage = FileStorage.new @tmp_path
        @storage.set('abc123', { :string => 'Hello', :array => [1, 2, 3] })
        assert @storage.exist?('abc123'), "Does not have data for the abc123 key?!"
      end

      should "store and retrieve data" do
        @storage = FileStorage.new @tmp_path
        @storage.set('abc123', { :string => 'Hello', :array => [1, 2, 3] })
        # Expected value first, actual second, so failure messages read correctly.
        assert_equal [1, 2, 3], @storage.get('abc123')[:array]
      end
    end

    # ---------------------------------------------------------------------------

  end


end
 
__END__
HTTP/1.1 200 OK
Server: Apache/2.2.3 (Red Hat)
Last-Modified: Tue, 15 Nov 2005 13:24:10 GMT
ETag: "b300b4-1b6-4059a80bfd280"
Accept-Ranges: bytes
Content-Type: text/html; charset=UTF-8
Connection: Keep-Alive
Date: Fri, 30 Oct 2009 09:20:03 GMT
Age: 2361   
Content-Length: 438
 
<HTML>
<HEAD>
  <TITLE>Example Web Page</TITLE>
</HEAD>
<body>  
<p>You have reached this web page by typing &quot;example.com&quot;,
&quot;example.net&quot;,
  or &quot;example.org&quot; into your web browser.</p>
<p>These domain names are reserved for use in documentation and are not available
  for registration. See <a href="http://www.rfc-editor.org/rfc/rfc2606.txt">RFC
  2606</a>, Section 3.</p>
</BODY>
</HTML>
 
zzz_lost_in_translation.rb #
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# Now, let's play design patterns freaks!!!11 :D
 
require 'phpfashion_com_exercise'
 
module LostInTranslation
 
  # First, we need to untangle the "mess" in WebPage
  # Everything needs to be MUCH MORE complicated, right? :)
  # In Ruby, we just perform a little surgery on our
  # WebPage class. Let's do this!
  ::Crawler::WebPage.class_eval do
    @cache = nil
    def self.load(url)
      url = URI.parse(url)
      url.path = '/' if url.path =~ /^$/
      client = Net::HTTP.start(url.host, url.port)
      response = client.request_get(url.path)
      webpage = ::Crawler::WebPage.new( url.to_s, response.body, response.to_hash )
      return webpage
    end
  end
 
 
  # Gee, now this makes MUCH MORE sense!
  # See, it's *storage* for chrissake, right in the name!!! :)
  class WebPageStorage
 
    @cache = Crawler::FileStorage.new
    def self.cache; @cache; end
 
    def self.load(url)
      return cache.get( url ) if cache.exist?( url ) # Already cached
      webpage = Crawler::WebPage.load( url )
      cache.set(url.to_s, webpage) # Cache the result
      return webpage
    end
 
  end
 
end
 
# We don't need no stinking tests, it's obvious, right?! :)
# We just `puts inspect` something, hell, it's Friday anyway!
 
# Load the same page twice and print what comes back each time, with a
# separator line between the two runs.
["1/", "2/"].each_with_index do |label, index|
  puts "---" unless index.zero?
  puts label
  puts LostInTranslation::WebPageStorage.load('http://example.com').inspect
end