Created
October 5, 2012 17:32
-
-
Save aycabta/3841182 to your computer and use it in GitHub Desktop.
usage: mkdir images; mkdir images_small; ruby negi_t.rb > output && ruby data_to_html.rb > html && sh resize.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Renders the scraped item list as an HTML table on standard output.
#
# Input: the file 'output' in the current directory, holding one record per
# three lines: item name, image file name, product path.
# Output: a five-column table. Items are laid out last-record-first; each
# group of five produces one row of linked thumbnails followed by two rows of
# linked item names. Short rows are padded with empty cells.
#
# NOTE(review): name, image and path are interpolated into the markup as-is;
# this assumes the scraped values are already HTML-safe -- confirm.

columns = 5

puts '<table border="0" width="700" align="center">'
puts ' <tr bgcolor="#000000">'
columns.times { puts ' <th></th>' }
puts ' </tr>'

# chomp (not chomp!) is used on purpose: chomp! returns nil when there is no
# line ending to strip, which turned the last field of a file without a
# trailing newline into nil. A trailing partial record (fewer than three
# lines) is dropped.
records = File.foreach('output').map(&:chomp).each_slice(3).select { |lines| lines.size == 3 }
items = records.map do |name, image, path|
  { 'name' => name, 'image' => image, 'path' => path }
end

items.reverse.each_slice(columns) do |row|
  padding = columns - row.size

  # Thumbnail row.
  puts ' <tr align="center">'
  row.each do |item|
    link_open = %( <td><a href="http://ttrinity.jp#{item['path']}" target="_blank">)
    thumbnail = %(<img src="http://negineesan.com/etc/tshirts/#{item['image']}" alt="#{item['name']}" width="144" height="144" border="0" style="background-color: #4c4c4c;" />)
    puts "#{link_open}#{thumbnail}</a></td>"
  end
  padding.times { puts ' <td></td>' }
  puts ' </tr>'

  # NOTE(review): the name row is emitted twice, exactly as the original
  # script did -- confirm whether the second copy is intended.
  2.times do
    puts ' <tr align="center">'
    row.each do |item|
      puts %( <td><a href="http://ttrinity.jp#{item['path']}">#{item['name']}</a></td>)
    end
    padding.times { puts ' <td></td>' }
    puts ' </tr>'
  end
end

puts '</table>'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'net/http' | |
# Downloads +path+ over the given HTTP session and stores the response body
# in the local "images" directory (which must already exist).
#
# @param http [#get] started HTTP session; +get(path)+ must return an object
#   responding to +body+
# @param path [String] server path of the image; its last segment becomes the
#   local file name
# @return [String] the file name the image was saved under
# @raise [ArgumentError] if +path+ has no final path segment
# @raise [StandardError] the last request error once every attempt has failed
def download_image(http, path)
  filename = path.to_s[%r{/([^/]+)$}, 1]
  raise ArgumentError, "no file name in path: #{path.inspect}" unless filename

  max_attempts = 5
  attempts = 0
  begin
    resp = http.get(path)
  rescue StandardError
    attempts += 1
    # Bounded retry: a permanent failure must not loop forever.
    raise if attempts >= max_attempts

    sleep 30
    retry
  end

  # Binary mode + write: puts would append "\n" to the image data, and text
  # mode may translate line endings on some platforms.
  File.open(File.join('images', filename), 'wb') { |f| f.write(resp.body) }
  filename
end
# Fetches one product page, extracts the item name and the main image link,
# and downloads that image via download_image.
#
# Diagnostics go to standard error: standard output carries the three-line
# item records and must not be polluted. A page without a recognisable image
# no longer aborts the crawl; the item is returned without an image file.
#
# Only the image anchor marked rel="photo" is used. Earlier matching of the
# id="img_f" / id="img_b" anchors was already commented out in this method.
#
# @param http [#get] started HTTP session; +get(path)+ must return an object
#   responding to +body+
# @param path [String] product page path
# @return [Hash] 'itemname' (String or nil), 'image_filename' (String or nil)
#   and 'path' (the argument, unchanged)
# @raise [StandardError] the last request error once every attempt has failed
def scrape_item(http, path)
  max_attempts = 5
  attempts = 0
  begin
    resp = http.get(path)
  rescue StandardError
    attempts += 1
    # Bounded retry: a permanent failure must not loop forever.
    raise if attempts >= max_attempts

    sleep 30
    retry
  end
  page = resp.body

  itemname = page[%r{<div id="itemNameArea">\s+<h2 class="wb">([^<]+)</h2>}, 1]
  warn "itemname: #{path}" unless itemname

  orig_image = page[/<a href="([^"]+)"><img src="[^"]+"[^>]+rel="photo"[^>]+class="png[^>]*>/, 1]
  warn "orig_image: #{path}" unless orig_image

  # Skip the download when no image link was found instead of passing nil on.
  image_filename = orig_image ? download_image(http, orig_image) : nil

  { 'itemname' => itemname, 'image_filename' => image_filename, 'path' => path }
end
# Walks the shop listing starting at +path+, scraping every linked product
# with scrape_item and following the "next" pagination link until none is
# left.
#
# @param http [#get] started HTTP session; +get(path)+ must return an object
#   responding to +body+
# @param path [String] path of the first listing page
# @return [Array<Hash>] the scrape_item results, in page order
# @raise [StandardError] the last request error once every attempt for a page
#   has failed
def scrape_list(http, path)
  max_attempts = 5
  items = []
  current = path

  while current
    attempts = 0
    begin
      resp = http.get(current)
    rescue StandardError
      attempts += 1
      # Bounded retry: a permanent failure must not loop forever.
      raise if attempts >= max_attempts

      sleep 30
      retry
    end
    page = resp.body

    page.scan(%r{<p class="item"><a href="(/product/\d+)#\d+">}) do |(item_path)|
      sleep 2 # pause before each product request
      items << scrape_item(http, item_path)
    end

    # The href is taken from HTML source, so "&amp;" has to be turned back
    # into "&" before it can be requested. [^"]+ keeps the match inside the
    # attribute value.
    next_href = page[/<li class="next active"><a href="([^"]+)">/, 1]
    current = next_href && "/shop/negineesan/#{next_href.gsub('&amp;', '&')}"
  end

  items
end
# Entry point: crawl the shop listing over one HTTP session to ttrinity.jp,
# then print every item as three lines on standard output -- item name,
# image file name, product path.
items = Net::HTTP.start('ttrinity.jp') { |http| scrape_list(http, '/shop/negineesan/') }

items.each do |item|
  puts format("%s\n%s\n%s\n", *item.values_at('itemname', 'image_filename', 'path'))
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Write a `convert -resize 130x130` copy of every file in images/ to
# images_small/ under the same name.
# Runs in a subshell so the caller's working directory is left alone, and
# stops early when images/ cannot be entered instead of converting whatever
# happens to be in the current directory.
(
  cd images || exit 1
  # -type f keeps "." and any directories away from convert.
  find . -type f -exec convert -resize 130x130 {} ../images_small/{} \;
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment