Created
May 8, 2012 11:38
-
-
Save timeout/2634395 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/ruby19 -w
# Imports saved HTML article pages into the raw_articles table of the SQLite
# database file "raw_article.db" (in the current directory).
#
# Usage: pass one or more HTML file paths as command-line arguments. For each
# file the script extracts the heading, lead, date and article paragraphs via
# fixed XPath expressions, echoes them to stdout, and inserts one row.
require 'sqlite3'
require 'nokogiri'

# Returns the concatenated text of every node in +doc+ matching +xpath+.
# Yields an empty string when nothing matches, so the result is always
# a plain String that can be bound as a SQL parameter.
def node_text(doc, xpath)
  doc.xpath(xpath).text
end

# create (or reopen) the database
db = SQLite3::Database.new "raw_article.db"

begin
  # IF NOT EXISTS lets the script be re-run against an existing database
  # instead of failing on the second invocation.
  db.execute <<-SQL
    CREATE TABLE IF NOT EXISTS raw_articles (
      filename VARCHAR(256),
      heading TEXT,
      lead TEXT,
      date VARCHAR(16),
      article TEXT
    );
  SQL

  # open each article given on the command line
  ARGV.each do |filename|
    # Block form closes the file even when parsing raises.
    # NOTE(review): `strict` turns off parser recovery, so malformed HTML may
    # raise here rather than being repaired -- confirm that is intended.
    doc = File.open(filename, "r") do |f|
      Nokogiri::HTML(f) do |config|
        config.strict.noent.noblanks
      end
    end

    # title
    print 'Looking for a title... '
    title = node_text(doc, '//div[@class = "field field-title"]')
    puts title
    puts

    puts 'Looking for an article lead... '
    lead = node_text(doc, '//p[@class = "lead"]')
    puts lead
    puts

    print 'Looking for a date... '
    date = node_text(doc, '//span[@class = "date-display-single"]')
    puts date

    print 'Getting the article... '
    # Keep the paragraph markup: the article column stores the HTML of every
    # <p> under the article container, concatenated in document order.
    # No manual quoting is applied -- the value is bound through a placeholder.
    paragraphs = doc.xpath('//div[@class = "field field-article-article"]//p')
    article = paragraphs.map(&:to_html).join
    puts article

    # Bind plain strings only; the sqlite3 driver cannot bind Nokogiri node sets.
    row = [filename, title, lead, date, article]
    db.execute(
      "INSERT INTO raw_articles (filename, heading, lead, date, article) " \
      "VALUES (?, ?, ?, ?, ?);",
      row
    )
  end
ensure
  # Release the database handle whether or not an import failed.
  db.close
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment