Created
November 11, 2012 14:20
-
-
Save inbeom/4055019 to your computer and use it in GitHub Desktop.
Parser for HTML files generated by Microsoft Excel
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding = utf-8 | |
require 'nokogiri' | |
require 'iconv' | |
require 'table_parser' | |
# Reopens classes in the TableParser namespace (loaded via
# `require 'table_parser'`) to add the small accessors this script relies on.
# NOTE(review): @columns, @children and @text are presumably populated by the
# gem's own initializers -- confirm against the installed table_parser version.
module TableParser
  class Table
    # Yields every element of @columns in order.
    #
    # Returns an Enumerator when called without a block, so callers can chain
    # (e.g. `table.each.map { ... }`) instead of getting nil back.
    def each(&block)
      return to_enum(:each) unless block_given?

      @columns.each(&block)
    end
  end

  class TableColumn
    # Returns the column's @children collection itself (not a copy).
    def to_a
      @children
    end
  end

  class TableNode
    # Returns the node's text as a String; an unset @text becomes "" so that
    # string interpolation and Array#join print an empty cell rather than
    # falling back to the default object representation.
    def to_s
      @text.to_s
    end
  end
end
# Directory that contains this script; the directory walk at the bottom of the
# file looks for exported sheets in its subdirectories.
base = File.dirname(__FILE__)
# Reads one HTML file, extracts the table matched by '//table' and prints it
# to stdout as tab-separated text, one line per row.
#
# filename - path of the HTML file to read. Its bytes are converted from
#            EUC-KR to UTF-8 before being handed to Nokogiri.
#
# Returns the Hash that maps each row index to the Array of entries printed
# for that row.
def parse(filename)
  # File.read opens and closes the file in one call, so no handle stays open.
  body = Iconv.conv('utf-8', 'euc-kr', File.read(filename))
  doc = Nokogiri::HTML(body)
  table = TableParser::Table.new(doc, '//table', header: false)

  # The table is exposed column by column; collect each column's entries.
  cols = []
  table.each { |colgroup| cols << colgroup.to_a }

  # Regroup the column-major data into rows: the i-th entry of every column is
  # appended to row i. Columns of unequal length are not padded, so a short
  # column simply contributes nothing to the later rows.
  rows = {}
  cols.each do |col|
    col.each_with_index do |entry, i|
      (rows[i] ||= []) << entry
    end
  end

  rows.each_value { |row| puts row.join("\t") }
end
# Walk every immediate subdirectory of `base` and parse each file in it whose
# name matches /sheet.+\.htm/. Entries are sorted because Dir.entries returns
# them in filesystem order, which would make the output order unpredictable.
Dir.entries(base).sort.each do |entry|
  next if entry == '.' || entry == '..'

  dir = File.join(base, entry)
  next unless File.directory?(dir)

  Dir.entries(dir).sort.each do |inner_entry|
    next unless inner_entry =~ /sheet.+\.htm/

    parse(File.join(dir, inner_entry))
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment