Skip to content

Instantly share code, notes, and snippets.

@ironbyte
Last active August 13, 2017 18:26
Show Gist options
  • Save ironbyte/8b2f1263208c3852d634b3a3316d29c5 to your computer and use it in GitHub Desktop.
Save ironbyte/8b2f1263208c3852d634b3a3316d29c5 to your computer and use it in GitHub Desktop.
# CBI Scraper (Offline)
# It scrapes comic book issues from the website - comicsbackissues.com - and prints them to a single text file
require 'oga'
require 'pry'
# Path of the saved HTML page that is handed to CBIParser.parse below.
FILENAME = "cbi_batman.html"
# Plain record holding the details of one comic book issue.
#
# NOTE(review): the defaults for year (0) and issue_no (1986) look swapped;
# confirm with the author before changing them, since callers may rely on them.
class ComicBook
  attr_accessor :year, :title, :issue_no, :writer, :penciller, :storyline

  # Every argument is optional and positional; each value is stored as given.
  def initialize(year = 0, title = "", issue_no = 1986, writer = "", penciller = "", storyline = "")
    @year, @title, @issue_no = year, title, issue_no
    @writer, @penciller, @storyline = writer, penciller, storyline
  end

  # Returns a one-line summary in the form
  #   "<title> #<issue_no> (<year>) (<writer> & <penciller>)"
  # The storyline is not included.
  def to_a_single_line_string
    credits = "#{writer} & #{penciller}"
    "#{title} ##{issue_no} (#{year}) (#{credits})"
  end
end
# Turns the table rows of a saved HTML page into ComicBook objects.
class CBIParser
  # Parses the HTML file at +filename+ and returns an Array of ComicBook,
  # one for every <tr> element whose zero-based position in the document is
  # strictly greater than +range_from+.
  #
  # filename   - path of the HTML file to read.
  # range_from - rows at this index and below are skipped. With the default
  #              of 0 only the very first row is skipped.
  #
  # Raises whatever File.open raises when the file cannot be opened.
  def self.parse(filename, range_from = 0)
    # Block form closes the handle as soon as parsing finishes, and also
    # when parsing raises.
    document = File.open(filename) { |handle| Oga.parse_html(handle) }

    comicbook_list = []
    document.css("tr").each_with_index do |row, row_index|
      next unless row_index > range_from

      comicbook_list << build_comicbook(row)
    end
    comicbook_list
  end

  # Builds one ComicBook from the child nodes of a table row.
  #
  # Child positions used: 1 => year, 2 => title, 3 => issue number,
  # 4 => credits, 5 => storyline. Children at any other position are ignored,
  # and fields without a matching child keep the ComicBook defaults.
  def self.build_comicbook(row)
    comicbook = ComicBook.new

    row.children.each_with_index do |field, field_index|
      case field_index
      when 1 then comicbook.year = field.text.to_i
      when 2 then comicbook.title = field.text
      when 3 then comicbook.issue_no = field.text.to_i
      when 4
        # The credits text is split on ", ": the first entry is taken as the
        # writer and the last as the penciller (both are the same entry when
        # there is only one, and both are nil when the text is empty).
        credits = field.text.split(', ')
        comicbook.writer = credits.first
        comicbook.penciller = credits.last
      when 5 then comicbook.storyline = field.text
      end
    end

    comicbook
  end
  private_class_method :build_comicbook
end
# Parse the saved page, skipping every row up to and including index 2071,
# then write one summary line per issue to output.txt.
list = CBIParser.parse(FILENAME, 2071)

# Block form guarantees output.txt is flushed and closed even if a write or
# a summary call raises part-way through.
File.open("output.txt", "w") do |outf|
  list.each { |issue| outf.puts issue.to_a_single_line_string }
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment