Skip to content

Instantly share code, notes, and snippets.

@miyucy
Last active January 9, 2018 08:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save miyucy/186d94d15103a50ca61d513d5b3bc9cc to your computer and use it in GitHub Desktop.
Save miyucy/186d94d15103a50ca61d513d5b3bc9cc to your computer and use it in GitHub Desktop.
medimg
mkdir -p data image
ruby download.rb
ruby import.rb

pg_dump --dbname=orca_test --encoding=UTF8 --no-owner --clean --table "tbl_yakujyo" | gzip --best --to-stdout > tbl_yakujyo.sql.gz
pg_dump --dbname=orca_test --encoding=UTF8 --no-owner --clean --table "tbl_med_image" | gzip --best --to-stdout > tbl_med_image.sql.gz

gzcat tbl_yakujyo.sql.gz | psql --host=127.0.0.1 --port=15432 --username=orca --dbname=orca
gzcat tbl_med_image.sql.gz | psql --host=127.0.0.1 --port=15432 --username=orca --dbname=orca
require 'httpclient'
require 'nokogiri'
require 'thread'
require 'time'
require 'fileutils'
require 'json'
class Download
attr_reader :link_html, :link_filter, :download_dir, :status_file,
:http_client, :download_urls, :download_errors
def initialize(link_html, link_filter, download_dir, status_file, http_client)
@link_html = link_html
@link_filter = link_filter
@download_dir = download_dir
@status_file = status_file
@http_client = http_client
@download_urls = Queue.new
@download_errors = []
@mutex = Mutex.new
end
def perform
response = http_client.get link_html
unless response.ok?
puts response.status
puts response.body
exit 1
end
mtimes = if File.exist? status_file
JSON.parse File.read(status_file)
else
{}
end
puts "mtimes.size=#{mtimes.size}"
html = Nokogiri::HTML response.body
(html / 'a').each do |node|
href = node.attribute("href")&.value
next unless href
next unless link_filter.call(href)
url = link_html + href
download_urls.push [url, mtimes[href]]
end
download_threads = 4.times.map do
Thread.new do
while tuple = download_urls.pop
url, mtime = tuple
file = File.basename url
path = File.join download_dir, file
request_headers = {}
request_headers['If-Modified-Since'] = mtime if mtime
request_headers.clear unless File.size?(path)
response = http_client.get(url, header: request_headers)
case response.status
when 304
# skip
when 200
File.binwrite(path, response.body)
mtime = response.headers['Last-Modified'] || response.headers['Date']
@mutex.synchronize do
mtimes[file] = mtime
end if mtime
puts "file=#{file} size=#{File.size?(path)} mtime=#{mtime}"
else
download_errors << [url, response.status, response.body]
end
end
end
end
download_threads.each { download_urls.push nil }
download_threads.each(&:join)
unless download_errors.empty?
puts "Error: #{download_errors.size}"
download_errors.each do |url, status, body|
puts "url = #{url}"
puts "status = #{status}"
puts "body = #{body.inspect}"
puts ""
end
end
File.write status_file, mtimes.to_json
end
end
http_client = HTTPClient.new
dl = Download.new('http://ftp.orca.med.or.jp/pub/orca_data/master/siori/data/',
->(href) { href =~ /\.p7m\z/i },
File.join(__dir__, 'data'),
'data.json',
http_client)
dl.perform
dl = Download.new('http://ftp.orca.med.or.jp/pub/orca_data/master/siori/image/',
->(href) { href =~ /\.jpg\z/i },
File.join(__dir__, 'image'),
'image.json',
http_client)
dl.perform
require 'openssl'
require 'nkf'
require 'pg'
require 'base64'
class Import
CREYMD = Time.now.strftime('%Y%m%d')
PATTERN = {
':TERMID:' => '',
':HOSPNUM:' => '1',
':CREYMD:' => CREYMD,
}
PATTERN_REG = Regexp.union(*PATTERN.keys.map { |e| Regexp.escape e })
def self.import
new.perform
end
def initialize
@pg = PG::Connection.new('postgres:///orca_test')
end
def perform
@pg.exec "TRUNCATE tbl_yakujyo"
@pg.exec "TRUNCATE tbl_med_image"
@pg.exec "TRUNCATE tbl_yakujyo_temp"
Dir['data/*.p7m'].each do |path|
transaction do
#puts "Processing: #{File.basename(path)}"
srycd = File.basename(path).sub('-siori.dat.p7m', '')
@pg.exec "SAVEPOINT tmp#{srycd}"
data = replace_placeholder(NKF.nkf('-Ewx', extract_pkcs7_data(File.binread(path))))
begin
@pg.exec data
rescue PG::UniqueViolation => e
puts "Error: #{File.basename(path)}\n#{data}\n#{e.inspect}"
puts ""
@pg.exec "ROLLBACK TO SAVEPOINT tmp#{srycd}"
break
end
# ファイル名のsrycdとインサートされたデータのsrycdが違う
sql = "SELECT COUNT(*) FROM tbl_yakujyo_temp WHERE srycd = $1"
params = [srycd]
result = @pg.exec sql, params
unless result[0]["count"].to_i > 0
puts "Error: #{File.basename(path)}\n#{data}"
puts ""
@pg.exec "ROLLBACK TO SAVEPOINT tmp#{srycd}"
break
end
rows = @pg.exec("SELECT * FROM tbl_yakujyo_temp ORDER BY rennum ASC").each.map { |row| row }
@pg.exec "ROLLBACK TO SAVEPOINT tmp#{srycd}"
photo = ""
%w[photo1_filename photo2_filename photo3_filename].each_with_index do |attr_name, idx|
photo_type = idx + 1
rows.each do |row|
filename = row[attr_name]
next unless filename
next if filename.empty?
photo_filepath = File.join(__dir__, "image", filename)
#puts "Photo type=#{photo_type} file=#{File.basename(photo_filepath)}"
next unless File.exist? photo_filepath
image_data = Base64.strict_encode64(File.binread(photo_filepath))
params = [row["hospnum"], srycd, photo_type, image_data]
sql = <<~SQL
INSERT INTO tbl_med_image
(hospnum, srycd, imagekbn, imagedata)
VALUES
($1, $2, $3, $4)
SQL
@pg.exec sql, params
photo = filename if photo.empty?
#{ srycd: srycd, photo_type: photo_type, filename: filename, image_data: image_data }
break
end
end
rows.each do |row|
sql = <<~SQL
INSERT INTO tbl_yakujyo
(teknum, termid, srycd, shape, color, mark, effect, caution, creymd, hospnum, name, photo)
VALUES
(1, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
SQL
params = [
row["termid"], srycd, row["shape"], row["color"], row["mark"],
row["effect"], row["caution"], row["creymd"], row["hospnum"], row["name"], photo
]
@pg.exec sql, params
break
end
end
end
end
def transaction
@pg.exec "BEGIN"
yield
ensure
@pg.exec "COMMIT"
# @pg.exec "ABORT"
end
def extract_pkcs7_data(smime)
pkcs7 = OpenSSL::PKCS7.read_smime(smime)
signed_data = OpenSSL::ASN1.decode(pkcs7).value[1].value[0]
encap_content_info = signed_data.value[2] if signed_data
econtent = encap_content_info.value[1] if encap_content_info
data = econtent.value[0].value if econtent
data
end
def replace_placeholder(str)
str.gsub(PATTERN_REG, PATTERN)
end
end
Import.import
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment