kirushik/gist:573700

## gistfile1.rb
$KCODE = 'UTF8'

require 'rubygems'

require 'mechanize'
require 'docsplit'

require 'fastercsv'

# Script flow:
#   1. Load the index page and pick the first link whose href ends in ".pdf".
#   2. Download that PDF with wget unless a file of that name already exists
#      in the current directory.
#   3. Extract the PDF text with Docsplit (OCR disabled) into "<name>.txt".
#   4. Strip recurring header/column-number noise from the text and print it.
#   5. Scan the text for dotted-number entries and write them to "<name>.csv".

a = Mechanize.new
a.get "http://www.council.gov.ru/events/shed/index.html"

# Guard against links without an href and against a page with no PDF link at
# all; either case previously ended in a NoMethodError on nil.
pdf_link = a.page.links.find { |l| l.href && l.href.match(/.+\.pdf$/) }
abort 'No PDF link found on the index page' unless pdf_link

full_uri = a.page.uri + pdf_link.uri

# Local file names are derived from the last path segment of the PDF URL.
filename = full_uri.to_s.split('/').last
txt_name = filename.gsub(/\.pdf$/, '.txt')
csv_name = filename.gsub(/\.pdf$/, '.csv')

# Download only when the PDF is not present yet. File.exist? checks the literal
# name, whereas Dir.glob would treat characters such as [ ] * ? as a pattern.
unless File.exist?(filename)
  # The argument-list form of system bypasses the shell, so characters in the
  # scraped URL can never be interpreted as shell syntax.
  abort "wget failed for #{full_uri}" unless system('wget', full_uri.to_s)
end

Docsplit.extract_text(filename, :ocr => false, :output => './')

# File.read closes the handle itself; File.open without a block left it open.
raw_data = File.read(txt_name)

# Non-bang gsub is required here: gsub! returns nil when nothing was replaced,
# so the chained call crashed whenever the first pattern did not match.
# NOTE(review): the dot before "doc" is unescaped and matches any character;
# it presumably was meant to be "\.doc" -- confirm against real input.
# The second pattern is a number followed by the lines "1", "2", "3", "4"
# separated by blank lines (written with \n escapes instead of literal
# newlines inside the regex literal).
raw_data = raw_data.gsub(/.+.doc G/, '').gsub(/\d+\n1\n\n2\n\n3\n\n4\n/m, '')

puts raw_data

# Remove any stale CSV without going through the shell.
File.delete(csv_name) if File.exist?(csv_name)

# Each match is a dotted number (e.g. "1." or "1.2."), a blank line, then up to
# three paragraph-like groups that start with a non-digit; the optional groups
# yield nil when absent. Only the outer captures are written as CSV columns.
FasterCSV.open(csv_name, 'w') do |csv|
  raw_data.scan(/^((\d+\.)+)\n\n(\D(.+\n)+)\n(\D(.+\n)+)?\n(\D(.+\n)+)?/).each do |s|
    csv << [s[0], s[2], s[4], s[6]]
  end
end
	$KCODE = 'UTF8'

	require 'rubygems'

	require 'mechanize'
	require 'docsplit'

	require 'fastercsv'

	# Script flow:
	#   1. Load the index page and pick the first link whose href ends in ".pdf".
	#   2. Download that PDF with wget unless a file of that name already exists
	#      in the current directory.
	#   3. Extract the PDF text with Docsplit (OCR disabled) into "<name>.txt".
	#   4. Strip recurring header/column-number noise from the text and print it.
	#   5. Scan the text for dotted-number entries and write them to "<name>.csv".
	#
	# Block parameters are written as |x|; the backslash-escaped pipes this copy
	# previously contained are not valid Ruby.

	a = Mechanize.new
	a.get "http://www.council.gov.ru/events/shed/index.html"

	# Guard against links without an href and against a page with no PDF link at
	# all; either case would otherwise end in a NoMethodError on nil.
	pdf_link = a.page.links.find { |l| l.href && l.href.match(/.+\.pdf$/) }
	abort 'No PDF link found on the index page' unless pdf_link

	full_uri = a.page.uri + pdf_link.uri

	# Local file names are derived from the last path segment of the PDF URL.
	filename = full_uri.to_s.split('/').last
	txt_name = filename.gsub(/\.pdf$/, '.txt')
	csv_name = filename.gsub(/\.pdf$/, '.csv')

	# Download only when the PDF is not present yet. File.exist? checks the literal
	# name, whereas Dir.glob would treat characters such as [ ] * ? as a pattern.
	unless File.exist?(filename)
	  # The argument-list form of system bypasses the shell, so characters in the
	  # scraped URL can never be interpreted as shell syntax.
	  abort "wget failed for #{full_uri}" unless system('wget', full_uri.to_s)
	end

	Docsplit.extract_text(filename, :ocr => false, :output => './')

	# File.read closes the handle itself; File.open without a block left it open.
	raw_data = File.read(txt_name)

	# Non-bang gsub is required here: gsub! returns nil when nothing was replaced,
	# so the chained call crashed whenever the first pattern did not match.
	# NOTE(review): the dot before "doc" is unescaped and matches any character;
	# it presumably was meant to be "\.doc" -- confirm against real input.
	# The second pattern is a number followed by the lines "1", "2", "3", "4"
	# separated by blank lines. It is written with \n escapes so that the
	# indentation of this file cannot leak into the pattern, as it did when the
	# regex literal spanned several tab-indented lines.
	raw_data = raw_data.gsub(/.+.doc G/, '').gsub(/\d+\n1\n\n2\n\n3\n\n4\n/m, '')

	puts raw_data

	# Remove any stale CSV without going through the shell.
	File.delete(csv_name) if File.exist?(csv_name)

	# Each match is a dotted number (e.g. "1." or "1.2."), a blank line, then up to
	# three paragraph-like groups that start with a non-digit; the optional groups
	# yield nil when absent. Only the outer captures are written as CSV columns.
	FasterCSV.open(csv_name, 'w') do |csv|
	  raw_data.scan(/^((\d+\.)+)\n\n(\D(.+\n)+)\n(\D(.+\n)+)?\n(\D(.+\n)+)?/).each do |s|
	    csv << [s[0], s[2], s[4], s[6]]
	  end
	end