ephesus/data_folder_scrape.rb

## data_folder_scrape.rb
#!/usr/bin/env ruby
# encoding: UTF-8

# == Pairs the examiner name found in the Japanese original, e.g. ==:
# 特許出願の番号　　　　　　特願２０１４－５５１４９９
# 起案日　　　　　　　　　　平成２７年　６月２２日
# 特許庁審査官　　　　　　　衣鳩　文彦　　　　　　　　９１９９　５Ｘ００
# 特許出願人代理人　　　　　佐伯　義文（外　１名）　様
# 適用条文　　　　　　　　　第２９条第２項、第３６条
#
# == with the same name as written in the English translation, e.g. ==:
# Application Number:	2014-551499
# Drafted:	2015/06/22	(year/month/day)
# Examiner:	Fumihiko IBATO	9199 5X00
# Attorney:	Yoshifumi SAEKI et al.
# Cited Articles: Article 29, Paragraph 2, Article 36


# Two folder arguments are required; otherwise print the expected
# arguments and stop.
unless ARGV.length >= 2
  puts "<translations folder> <oadownloads folder>"
  exit
end

require 'yomu'
require 'charlock_holmes'
require 'find'

# Prints one line pairing the examiner name from a translated document with
# the examiner name found in the corresponding downloaded HTML file.
#
# The application number is read from the document text, get_html maps it to
# an HTML filename under ARGV[1], and the HTML is converted to UTF-8 before
# the Japanese name is extracted. Output format:
#   "<English name>, <first Japanese name field> <second Japanese name field>"
#
# Nothing is printed (and nil is returned) when any step fails: no
# application number, no index entry, missing HTML file, undetectable
# encoding, or either name not found.
#
# @param f [String] path of the translated document, readable by Yomu
# @return [nil]
def scrape(f)
  text = Yomu.new(f).text

  app = text.match(/Application Number:\s(\d+\-\d+)/)
  return if app.nil?

  htmlfile = get_html(f, app[1])
  return if htmlfile.nil?

  # The index may name a file that was never downloaded; skip it instead of
  # handing nil to the encoding detector.
  html_path = "#{ARGV[1]}/#{htmlfile}"
  return unless File.exist?(html_path)

  html = File.read(html_path)
  detected = CharlockHolmes::EncodingDetector.detect(html)
  # No usable encoding guess (e.g. empty or binary content): nothing to convert.
  return if detected.nil? || detected[:encoding].nil?

  hdata = CharlockHolmes::Converter.convert html, detected[:encoding], 'UTF-8'

  eng = text.match(/Examiner:\s(\w+\s+\w+)\s*[0-9A-Z]+\s[0-9A-Z]+/m)
  ja = hdata.match(/特許庁審査官\p{Z}+(\p{L}+)\p{Z}(\p{L}+)\p{Z}+\p{N}+\p{Z}[\p{N}\p{L}]+/)
  return if eng.nil? || ja.nil?

  puts "#{eng[1]}, #{ja[1]} #{ja[2]}"
end

# Looks up the HTML filename recorded in an index file for an application
# number.
#
# The document path must contain "2015" followed by four digits; those four
# digits select the folder "<ARGV[1]>/2015/<digits>/" whose index.txt is
# searched. Only index lines that start with "<app_no>," and contain 拒絶 are
# considered; each is split on ", " and its third field is taken as the
# filename. When several lines match, the last one wins.
#
# @param docfilename [String] path of the translated document
# @param app_no [String, nil] application number such as "2014-551499"
# @return [String, nil] the filename field with surrounding whitespace
#   removed, or nil when app_no is nil, the path has no "2015dddd" part, the
#   index file does not exist, or no line matches
def get_html(docfilename, app_no)
  # Check the cheap precondition first so no file is read needlessly.
  return nil unless app_no

  m = docfilename.match(/2015(\d\d\d\d)/)
  return nil if m.nil?

  index = "#{ARGV[1]}/2015/#{m[1]}/index.txt"
  return nil unless File.exist?(index)

  # The pattern below is a UTF-8 regex, so read the index as UTF-8 regardless
  # of the locale, drop a leading BOM, and replace invalid bytes so that
  # scanning cannot raise on a damaged file.
  text = File.read(index, encoding: 'BOM|UTF-8').scrub

  result = nil
  text.scan(/^#{Regexp.escape(app_no)},.+拒絶.+$/) do |hit|
    filename = hit.split(/, /)[2]
    # strip removes the "\r" left behind when the index uses CRLF line ends.
    result = filename && filename.strip
  end
  result
end

#start

# Visit every path under the translations folder whose name ends in "doc"
# (case-insensitive) and scrape it, then terminate the script.
docdir = ARGV[0]

Dir.glob("#{docdir}/**/*doc", File::FNM_CASEFOLD).each do |filename|
  scrape(filename)
end

exit
	#!/usr/bin/env ruby
	# encoding: UTF-8

	# == matches the Examiner name from ==:
	# 特許出願の番号　　　　　　特願２０１４－５５１４９９
	# 起案日　　　　　　　　　　平成２７年　６月２２日
	# 特許庁審査官　　　　　　　衣鳩　文彦　　　　　　　　９１９９　５Ｘ００
	# 特許出願人代理人　　　　　佐伯　義文（外　１名）　様
	# 適用条文　　　　　　　　　第２９条第２項、第３６条
	#
	# == to the English version, this ==:
	# Application Number: 2014-551499
	# Drafted: 2015/06/22 (year/month/day)
	# Examiner: Fumihiko IBATO 9199 5X00
	# Attorney: Yoshifumi SAEKI et al.
	# Cited Articles: Article 29, Paragraph 2, Article 36


	# Two folder arguments are required; otherwise print the expected
	# arguments and stop.
	unless ARGV.length >= 2
	  puts "<translations folder> <oadownloads folder>"
	  exit
	end

	require 'yomu'
	require 'charlock_holmes'
	require 'find'

	# Prints one line pairing the examiner name from a translated document with
	# the examiner name found in the corresponding downloaded HTML file.
	#
	# The application number is read from the document text, get_html maps it
	# to an HTML filename under ARGV[1], and the HTML is converted to UTF-8
	# before the Japanese name is extracted. Nothing is printed (and nil is
	# returned) when any step fails: no application number, no index entry,
	# missing HTML file, undetectable encoding, or either name not found.
	#
	# @param f [String] path of the translated document, readable by Yomu
	# @return [nil]
	def scrape(f)
	  text = Yomu.new(f).text

	  app = text.match(/Application Number:\s(\d+\-\d+)/)
	  return if app.nil?

	  htmlfile = get_html(f, app[1])
	  return if htmlfile.nil?

	  # The index may name a file that was never downloaded; skip it instead
	  # of handing nil to the encoding detector.
	  html_path = "#{ARGV[1]}/#{htmlfile}"
	  return unless File.exist?(html_path)

	  html = File.read(html_path)
	  detected = CharlockHolmes::EncodingDetector.detect(html)
	  # No usable encoding guess: nothing to convert.
	  return if detected.nil? || detected[:encoding].nil?

	  hdata = CharlockHolmes::Converter.convert html, detected[:encoding], 'UTF-8'

	  eng = text.match(/Examiner:\s(\w+\s+\w+)\s*[0-9A-Z]+\s[0-9A-Z]+/m)
	  ja = hdata.match(/特許庁審査官\p{Z}+(\p{L}+)\p{Z}(\p{L}+)\p{Z}+\p{N}+\p{Z}[\p{N}\p{L}]+/)
	  return if eng.nil? || ja.nil?

	  puts "#{eng[1]}, #{ja[1]} #{ja[2]}"
	end

	# Looks up the HTML filename recorded in an index file for an application
	# number.
	#
	# The document path must contain "2015" followed by four digits; those
	# four digits select the folder "<ARGV[1]>/2015/<digits>/" whose index.txt
	# is searched. Only index lines that start with "<app_no>," and contain
	# 拒絶 are considered; each is split on ", " and its third field is taken
	# as the filename. When several lines match, the last one wins.
	#
	# @param docfilename [String] path of the translated document
	# @param app_no [String, nil] application number such as "2014-551499"
	# @return [String, nil] the filename field with surrounding whitespace
	#   removed, or nil when app_no is nil, the path has no "2015dddd" part,
	#   the index file does not exist, or no line matches
	def get_html(docfilename, app_no)
	  # Check the cheap precondition first so no file is read needlessly.
	  return nil unless app_no

	  m = docfilename.match(/2015(\d\d\d\d)/)
	  return nil if m.nil?

	  index = "#{ARGV[1]}/2015/#{m[1]}/index.txt"
	  return nil unless File.exist?(index)

	  # The pattern below is a UTF-8 regex, so read the index as UTF-8
	  # regardless of the locale, drop a leading BOM, and replace invalid
	  # bytes so that scanning cannot raise on a damaged file.
	  text = File.read(index, encoding: 'BOM|UTF-8').scrub

	  result = nil
	  text.scan(/^#{Regexp.escape(app_no)},.+拒絶.+$/) do |hit|
	    filename = hit.split(/, /)[2]
	    # strip removes the "\r" left behind when the index uses CRLF line ends.
	    result = filename && filename.strip
	  end
	  result
	end

	#start

	# Visit every path under the translations folder whose name ends in "doc"
	# (case-insensitive) and scrape it, then terminate the script.
	docdir = ARGV[0]

	# Recursive pattern: "**/" descends into sub-folders and "*doc" matches
	# any name ending in "doc".
	Dir.glob("#{docdir}/**/*doc", File::FNM_CASEFOLD) do |filename|
	  scrape(filename)
	end

	exit