Last active
August 13, 2017 18:26
-
-
Save ironbyte/8b2f1263208c3852d634b3a3316d29c5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CBI Scraper (Offline)
# It scrapes comic book issues from the website - comicsbackissues.com - and prints them to a single text file
require 'oga'
require 'pry'
# Path of the saved HTML listing page that the script parses.
FILENAME = "cbi_batman.html".freeze
# Plain record describing a single comic book issue.
class ComicBook
  attr_accessor :year, :title, :issue_no, :writer, :penciller, :storyline

  # @param year [Integer] publication year
  # @param title [String] series title
  # @param issue_no [Integer] issue number
  # @param writer [String] writer credit
  # @param penciller [String] penciller credit
  # @param storyline [String] storyline name
  #
  # NOTE(review): the defaults `year = 0` and `issue_no = 1986` look swapped;
  # they are kept as-is so existing callers see the same values - confirm intent.
  def initialize(year = 0, title = "", issue_no = 1986, writer = "", penciller = "", storyline = "")
    @year = year
    @title = title
    @issue_no = issue_no
    @writer = writer
    @penciller = penciller
    @storyline = storyline
  end

  # Formats the issue as one line of text; the storyline is not included.
  #
  # @return [String] e.g. "Batman #404 (1987) (Frank Miller & David Mazzucchelli)"
  def to_a_single_line_string
    "#{title} ##{issue_no} (#{year}) (#{writer} & #{penciller})"
  end
end
# Turns the table rows of a saved HTML listing page into ComicBook objects.
class CBIParser
  # Parses every `tr` element of the HTML file into a ComicBook.
  #
  # @param filename [String] path of the HTML file to read
  # @param range_from [Integer] rows whose index is less than or equal to
  #   this value are skipped; only rows with a greater index are converted
  # @return [Array<ComicBook>] one entry per converted row, in document order
  def self.parse(filename, range_from = 0)
    # Block form closes the file handle once Oga has built the document,
    # instead of leaving it open for the rest of the process.
    document = File.open(filename) { |handle| Oga.parse_html(handle) }

    comicbook_list = []
    document.css("tr").each_with_index do |row, row_index|
      next unless row_index > range_from

      comicbook_list << build_comicbook(row)
    end
    comicbook_list
  end

  # Builds one ComicBook from the child nodes of a table row.
  # Child positions used: 1 year, 2 title, 3 issue number,
  # 4 "writer, penciller" credits, 5 storyline. Other positions are ignored.
  def self.build_comicbook(row)
    comicbook = ComicBook.new
    row.children.each_with_index do |field, field_index|
      case field_index
      when 1
        comicbook.year = field.text.to_i
      when 2
        comicbook.title = field.text
      when 3
        comicbook.issue_no = field.text.to_i
      when 4
        # Split once; with a single name both credits are that same name.
        credits = field.text.split(', ')
        comicbook.writer = credits.first
        comicbook.penciller = credits.last
      when 5
        comicbook.storyline = field.text
      end
    end
    comicbook
  end
  private_class_method :build_comicbook
end
# Convert the rows of the saved page whose index is greater than 2071.
list = CBIParser.parse(FILENAME, 2071)

# Block form ensures output.txt is flushed and closed even if a write raises.
File.open("output.txt", "w") do |outf|
  list.each do |issue|
    outf.puts issue.to_a_single_line_string
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment