CNN news story parser: scrapes headline links from the edition.cnn.com front page with Nokogiri and parses each linked story's title, byline, timestamp, and body text.
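# Setup (assumed environment): Ruby with the nokogiri gem installed; net/http
# and open-uri ship with the standard library.
#
#   gem install nokogiri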
require 'rubygems'
require 'net/http'
require 'nokogiri'
require 'open-uri'

module CnnScrapper
  class CnnParser
    URL = "http://edition.cnn.com"

    attr_accessor :cnn_doc, :news_stories, :news_urls

    def initialize
      @news_stories = []
      @news_urls = []
    end

    # Fetch the CNN front page and collect the story links from it.
    # (On Ruby 3.0+ use URI.open instead of Kernel#open.)
    def parse_it
      @cnn_doc = Nokogiri::HTML(open(URL))
      get_news_urls
    end

    # Walk the headline list in the "t3" zone and keep relative, non-video
    # links, storing each as an absolute URL together with its link text.
    def get_news_urls
      news_lists = @cnn_doc.css("#cnn_maintt2bul").search('div[data-vr-zone="t3"] ul li')
      news_lists.each do |news_list|
        anchor = news_list.search("a[@href]")[0]
        next if anchor.nil?
        news_url  = anchor.attributes["href"].value.to_s
        news_text = anchor.text
        begin
          # A link with no host is relative, i.e. a story on edition.cnn.com
          # itself; skip the /video section.
          if URI.parse(news_url).host.nil? && news_url !~ /^\/video/
            @news_urls << { :url => URL + news_url, :short_title => news_text }
          end
        rescue URI::InvalidURIError
          # Ignore malformed hrefs.
        end
      end
    end

    # Fetch and parse every collected story page into NewsStory objects.
    def get_news_stories
      @news_urls.each do |news_url|
        news_story = NewsStory.new(news_url[:url], news_url[:short_title])
        news_story.parse_it
        @news_stories << news_story
      end
    end
  end

  class NewsStory
    attr_accessor :cnn_news_url, :title, :author, :news_time, :story_line,
                  :paragraph, :short_title, :html_content, :content

    def initialize(cnn_news_url, short_title = nil)
      @cnn_news_url = cnn_news_url
      @short_title  = short_title
    end

    # Extract the title, byline, timestamp and body paragraphs of a story page.
    def parse_it
      news_url_doc = Nokogiri::HTML(open(@cnn_news_url))
      container    = news_url_doc.css("#cnnContentContainer")
      @title        = container.search("h1")[0].text rescue nil
      @author       = container.search(".cnn_stryathrtmp .cnnByline").text rescue nil
      @news_time    = container.search(".cnn_stryathrtmp .cnn_strytmstmp").text rescue nil
      @story_line   = container.search("p")[0].text rescue nil
      @paragraph    = container.search("p.cnn_storypgraph2").text
      @html_content = container.search("p").to_s
      @content      = container.search("p").text
    end
  end
end
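# Example usage (a minimal sketch; it assumes the CNN page markup these
# selectors target is still in place):
#
#   cnn = CnnScrapper::CnnParser.new
#   cnn.parse_it          # fetch the front page and collect story URLs
#   cnn.get_news_stories  # fetch and parse each story page
#   cnn.news_stories.each { |story| puts story.title }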