# harishbsrinivas/fetch_news.rb

## fetch_news.rb
#   Copyright 2013                              Harish B Srinivas
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

#   This program fetches the individual pages of the Indian newspaper
#   Deccan Herald and saves all of the (20+) pages to disk, which can
#   then be transferred to a tablet. I use this to read the newspaper on
#   my daily commute :)
#   In case of errors see mechanize.log for information.
#
#...With a few modifications this might work for other sites as well.

#!/usr/bin/env ruby

require 'date'
require 'rubygems'
require 'mechanize'
require 'logger'

# Downloads every page of today's edition as a PDF into
# news/deccan/YYYYMMDD/NN.pdf (relative to the current working directory).
# The news/deccan directory must already exist; Mechanize activity is
# logged to mechanize.log.

# Site root: used for the initial request and as the prefix of the PDF URLs.
URL_LOGIN = "http://deccanheraldepaper.com"
# Page whose body is scanned for the identifiers of the edition's pages.
MAIN_PAGE = "http://deccanheraldepaper.com/svww_left.php"

agent = Mechanize.new
agent.follow_meta_refresh = true
agent.redirect_ok = true
agent.user_agent = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6;en-US;" +
                   " rv:1.9.2) Gecko/20100115 Firefox/3.6"
agent.log = Logger.new("mechanize.log")

# Load the login page and then click on the anonymous login link.
login_page = agent.get(URL_LOGIN)
anonymous_link = login_page.link_with(:text => "Click here")
# link_with returns nil when no link matches; fail with a readable message
# instead of handing nil to agent.click.
raise "No \"Click here\" link found on #{URL_LOGIN}" if anonymous_link.nil?
agent.click(anonymous_link)

index = agent.get(MAIN_PAGE)

# Page identifiers are the single-quoted strings starting with '201 in the
# index body; the surrounding quotes are removed.
# NOTE(review): the pattern is greedy, so two quoted values on one line would
# be captured as a single match - confirm against the real page markup.
page_list = index.body.scan(/'201.*'/).map { |quoted| quoted.delete("'") }

# The edition date is "now" shifted to a fixed UTC offset.
# NOTE(review): the offset is +6 hours; Indian Standard Time is UTC+5:30 -
# confirm whether the extra half hour is intentional.
utc_offset = +6
cur_date = DateTime.now.new_offset(Rational(utc_offset, 24))

year      = cur_date.year.to_s
mod_month = cur_date.strftime("%m") # zero-padded month, e.g. "04"
mod_day   = cur_date.strftime("%d") # zero-padded day, e.g. "09"
edition_dir = year + mod_month + mod_day

Dir.chdir("news/deccan") do
  # Re-running on the same day must not fail because the directory exists.
  Dir.mkdir(edition_dir, 0777) unless File.directory?(edition_dir)

  Dir.chdir(edition_dir) do
    page_list.each_with_index do |filename, position|
      file_url = "#{URL_LOGIN}/pdf/#{year}/#{mod_month}/#{mod_day}/#{filename}.pdf"
      page = agent.get(file_url)

      # Files are numbered from 01 in the order the pages were listed.
      # The body is written untouched: stripping whitespace from binary
      # PDF data would alter the file.
      File.open(format("%02d.pdf", position + 1), "wb") do |file|
        file << page.body
      end
    end
  end
end
	# Copyright 2013 Harish B Srinivas
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	# This program fetches the individual pages of the Indian newspaper
	# Deccan Herald and saves all of the (20+) pages to disk, which can
	# then be transferred to a tablet. I use this to read the newspaper on
	# my daily commute :)
	# In case of errors see mechanize.log for information.
	#
	#...With a few modifications this might work for other sites as well.

	#!/usr/bin/env ruby

	require 'date'
	require 'rubygems'
	require 'mechanize'
	require 'logger'

	# Downloads every page of today's edition as a PDF into
	# news/deccan/YYYYMMDD/NN.pdf (relative to the current working directory).
	# The news/deccan directory must already exist; Mechanize activity is
	# logged to mechanize.log.

	# Site root: used for the initial request and as the prefix of the PDF URLs.
	URL_LOGIN = "http://deccanheraldepaper.com"
	# Page whose body is scanned for the identifiers of the edition's pages.
	MAIN_PAGE = "http://deccanheraldepaper.com/svww_left.php"

	agent = Mechanize.new
	agent.follow_meta_refresh = true
	agent.redirect_ok = true
	agent.user_agent = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6;en-US;" +
	                   " rv:1.9.2) Gecko/20100115 Firefox/3.6"
	agent.log = Logger.new("mechanize.log")

	# Load the login page and then click on the anonymous login link.
	login_page = agent.get(URL_LOGIN)
	guest_link = login_page.link_with(:text => "Click here")
	# link_with returns nil when no link matches; fail with a readable message
	# instead of handing nil to agent.click.
	raise "No \"Click here\" link found on #{URL_LOGIN}" if guest_link.nil?
	agent.click(guest_link)

	index = agent.get(MAIN_PAGE)

	# Page identifiers are the single-quoted strings starting with '201 in the
	# index body; the surrounding quotes are removed.
	# NOTE(review): the pattern is greedy, so two quoted values on one line
	# would be captured as a single match - confirm against the page markup.
	page_list = index.body.scan(/'201.*'/).map { |quoted| quoted.delete("'") }

	# The edition date is "now" shifted to a fixed UTC offset.
	# NOTE(review): the offset is +6 hours; Indian Standard Time is UTC+5:30 -
	# confirm whether the extra half hour is intentional.
	utc_offset = +6
	cur_date = DateTime.now.new_offset(Rational(utc_offset, 24))

	year      = cur_date.year.to_s
	mod_month = cur_date.strftime("%m") # zero-padded month, e.g. "04"
	mod_day   = cur_date.strftime("%d") # zero-padded day, e.g. "09"
	edition_dir = year + mod_month + mod_day

	Dir.chdir("news/deccan") do
	  # Re-running on the same day must not fail because the directory exists.
	  Dir.mkdir(edition_dir, 0777) unless File.directory?(edition_dir)

	  Dir.chdir(edition_dir) do
	    page_list.each_with_index do |filename, position|
	      file_url = "#{URL_LOGIN}/pdf/#{year}/#{mod_month}/#{mod_day}/#{filename}.pdf"
	      page = agent.get(file_url)

	      # Files are numbered from 01 in the order the pages were listed.
	      # The body is written untouched: stripping whitespace from binary
	      # PDF data would alter the file.
	      File.open(format("%02d.pdf", position + 1), "wb") do |file|
	        file << page.body
	      end
	    end
	  end
	end