# jrgns/import_guardian.rb

## import_guardian.rb
# encoding: UTF-8
require 'elasticsearch'
require 'rest-client'
require 'json'
require 'pp'
require 'hashie'

# Default options
def params
  {
    'api-key' => ENV['GUARDIAN_API_KEY'],
    'page-size' => 20
  }
end

# NOTE(review): requests (including the api-key parameter) go over plain HTTP;
# consider https if the endpoint supports it — confirm before changing.
guardian_url = 'http://content.guardianapis.com/search'

client = Elasticsearch::Client.new log: false, host: ENV['ELASTICSEARCH_HOST']

# Value of the search endpoint's `section` parameter.
sections = 'books|business|education|environment|news|science'

# Page through the first available 20 pages of search results and index every
# article found into Elasticsearch.
(1..20).each do |page|
  puts "Fetching page #{page}"
  begin
    raw_page = RestClient.get guardian_url, params: params.merge(
      # Custom options
      { page: page, section: sections }
    )
  rescue => e
    # Not every error exposes #response (e.g. connection failures). Calling it
    # blindly would raise NoMethodError here and hide the original error.
    pp e.response if e.respond_to?(:response)
    raise
  end

  results = Hashie::Mash.new(JSON.parse(raw_page)).response.results
  results.each do |article|
    puts "Recording #{article.id}"

    # The search result only carries summary data; fetch the article itself
    # with all fields and tags.
    raw_article = RestClient.get(
      article.apiUrl,
      params: params.merge({ 'show-fields' => 'all', 'show-tags' => 'all' })
    )
    item = Hashie::Mash.new(JSON.parse(raw_article)).response.content

    # Only tags of type 'keyword' are stored as categories.
    keywords = item.tags.select { |tag| tag.type == 'keyword' }.map(&:webTitle)

    document = {
      title: article.webTitle,
      date: article.webPublicationDate,
      categories: keywords,
      content: item.fields.body,
      section: article.sectionName
    }

    # The document id is the article id with every '/' replaced by '-'.
    client.index index: 'static-elastic', type: 'guardian', id: article.id.tr('/', '-'), body: document
  end
end
	# encoding: UTF-8
	require 'elasticsearch'
	require 'rest-client'
	require 'json'
	require 'pp'
	require 'hashie'

	# Default query options shared by every Guardian API request.
	#
	# Builds a fresh Hash on each call, so callers may merge into or mutate the
	# result freely.
	#
	# @return [Hash] 'api-key' read from the GUARDIAN_API_KEY environment variable
	#   (nil when it is unset) followed by a 'page-size' of 20
	def params
	  defaults = {}
	  defaults['api-key'] = ENV['GUARDIAN_API_KEY']
	  defaults['page-size'] = 20
	  defaults
	end

	# NOTE(review): requests (including the api-key parameter) go over plain HTTP;
	# consider https if the endpoint supports it — confirm before changing.
	guardian_url = 'http://content.guardianapis.com/search'

	client = Elasticsearch::Client.new log: false, host: ENV['ELASTICSEARCH_HOST']

	# Value of the search endpoint's `section` parameter. The pipes must be
	# literal: inside single quotes a backslash before '|' would be sent as-is.
	sections = 'books|business|education|environment|news|science'

	# Page through the first available 20 pages of search results and index every
	# article found into Elasticsearch.
	(1..20).each do |page|
	  puts "Fetching page #{page}"
	  begin
	    raw_page = RestClient.get guardian_url, params: params.merge(
	      # Custom options
	      { page: page, section: sections }
	    )
	  rescue => e
	    # Not every error exposes #response (e.g. connection failures). Calling it
	    # blindly would raise NoMethodError here and hide the original error.
	    pp e.response if e.respond_to?(:response)
	    raise
	  end

	  results = Hashie::Mash.new(JSON.parse(raw_page)).response.results
	  results.each do |article|
	    puts "Recording #{article.id}"

	    # The search result only carries summary data; fetch the article itself
	    # with all fields and tags.
	    raw_article = RestClient.get(
	      article.apiUrl,
	      params: params.merge({ 'show-fields' => 'all', 'show-tags' => 'all' })
	    )
	    item = Hashie::Mash.new(JSON.parse(raw_article)).response.content

	    # Only tags of type 'keyword' are stored as categories.
	    keywords = item.tags.select { |tag| tag.type == 'keyword' }.map(&:webTitle)

	    document = {
	      title: article.webTitle,
	      date: article.webPublicationDate,
	      categories: keywords,
	      content: item.fields.body,
	      section: article.sectionName
	    }

	    # The document id is the article id with every '/' replaced by '-'.
	    client.index index: 'static-elastic', type: 'guardian', id: article.id.tr('/', '-'), body: document
	  end
	end