@gregelin
Created October 8, 2009 13:51
# Parse a file of URLs and fetch a page with Hpricot
require 'hpricot'
require 'open-uri'

fn = 'top_500_urls.txt'
urls = []
f = File.open(fn)
f.each_line { |line| urls.push line.chomp }
f.close
urls.each { |u| puts u }

site = "http://ucp.org"
doc = open(site + urls[8]) { |f| Hpricot(f) }

# Title
(doc/"title").inner_html

# Content: try the main container first, fall back to the one-column
# container when it comes back empty
pages = {}
content = (doc/"#contentwrap").inner_html
if content.size == 0
  content = (doc/"#contentwrap1col").inner_html
end
pages[urls[8]] = content
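# An alternative sketch (an addition, not part of the original gist): walk a
# list of candidate selectors and keep the first non-empty match, instead of
# the if/else fallback above. The selector names are the ones assumed from
# the page structure above; the method name is hypothetical.
def first_nonempty_content(doc, selectors = ["#contentwrap", "#contentwrap1col"])
  selectors.each do |sel|
    html = (doc/sel).inner_html
    return html unless html.empty?
  end
  ""
end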
# Get the page and parse out the title and main body of content
def get_page(url)
  doc = open(url) { |f| Hpricot(f) }
  title = doc.at("title").inner_html
  puts title
  content = (doc/"#contentwrap").inner_html
  if content.size == 0
    content = (doc/"#contentwrap1col").inner_html
  end
  return title, content, doc
end
counter = 0
pages = {}
urls.each { |url|
  counter += 1
  puts counter
  pages[url] = []
  begin
    t, c, h = get_page(site + url)
    pages[url].push t
    pages[url].push c
    pages[url].push h
  rescue
    # Record placeholders so every URL has the same shape even on failure
    pages[url].push nil
    pages[url].push nil
    pages[url].push nil
  end
}
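# A small follow-up sketch (an addition to the gist): dump the collected
# titles and content to CSV so the results survive the session. The output
# filename is an assumption; the Hpricot doc object is skipped.
require 'csv'
CSV.open('pages.csv', 'w') do |csv|
  pages.each { |url, (title, content, _doc)|
    csv << [url, title, content]
  }
end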
# This works for extracting the pages and inserting them into the database
require 'hpricot'
require 'open-uri'

fn = '../top_500_urls.txt'
urls = []
f = File.open(fn)   # File.open raises if the file cannot be opened
f.each_line { |line| urls.push line.chomp }
f.close
urls.each { |u| puts u }

site = "http://ucp.org"

def get_page(url)
  doc = open(url) { |f| Hpricot(f) }
  html = doc.inner_html   # full HTML of the fetched page
  title = doc.at("title").inner_html
  puts title
  content = (doc/"#contentwrap").inner_html
  if content.size == 0
    content = (doc/"#contentwrap1col").inner_html
  end
  return title, content, html
end
counter = 0
pages = {}
urls.each { |url|
  counter += 1
  puts counter
  pages[url] = []
  begin
    t, c, h = get_page(site + url)
    pages[url].push t
    pages[url].push c
    pages[url].push h
  rescue
    # Record placeholders so every URL has the same shape even on failure
    pages[url].push nil
    pages[url].push nil
    pages[url].push nil
  end
}
# Save each page and its extracts; Page and PageExtract are assumed to be
# ActiveRecord models (a migration sketch follows below)
pages.keys.each { |k|
  puts k
  url = site + k
  t = pages[k][0]
  c = pages[k][1]
  h = pages[k][2]
  p = Page.new(:address => url)
  p.save
  pt = PageExtract.new(:page_id => p.id, :name => "title", :extract => t)
  pt.save
  pc = PageExtract.new(:page_id => p.id, :name => "contentwrap", :extract => c)
  pc.save
  ph = PageExtract.new(:page_id => p.id, :name => "html", :extract => h)
  ph.save
}
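# The insert loop above assumes Page and PageExtract ActiveRecord models.
# This migration is a hypothetical sketch of the minimal schema that loop
# needs (Rails 2-era style, matching the gist's date); it is not code from
# the original gist.
class CreatePagesAndPageExtracts < ActiveRecord::Migration
  def self.up
    create_table :pages do |t|
      t.string :address
      t.timestamps
    end
    create_table :page_extracts do |t|
      t.integer :page_id
      t.string  :name      # "title", "contentwrap", or "html"
      t.text    :extract
      t.timestamps
    end
  end

  def self.down
    drop_table :page_extracts
    drop_table :pages
  end
end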