@kentbye
Last active January 3, 2016 09:19
This is a Nokogiri script that extracts the contents of a specific class from an HTML page -- in this case the '.node-blog-post' class, which is how Drupal labels a blog post. You'll need to scrape all of the HTML files into a folder, and then set up the directory structure as described in the notes.
#!/usr/bin/env ruby
require 'nokogiri'
# This script will grab the main content out of a Drupal blog post with class of '.node-blog-post',
# and then write the cleaned HTML files to a new directory. The header, sidebar, and footer
# will all be removed.
#
# To use, first download the set of blog post nodes from your site.
# For example, I created a view of the node ids (nid) for all of the blog posts.
# Then you can chain the downloads into one space-delimited shell command built from entries like
# 'wget http://puppetlabs.com/node/2924;'
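# For example (assuming a hypothetical nids.txt with one nid per line, and your
# own domain in place of puppetlabs.com), the downloads could be scripted as:
# $for nid in $(cat nids.txt); do wget http://puppetlabs.com/node/$nid; done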
# Create an EXTRACTHTML directory containing the downloadedhtml folder and a TEMP directory.
# Remove the non-html *.sh files from the downloadedhtml folder.
# Make a copy of downloadedhtml inside the TEMP directory.
#
# There will be a couple of input files that you'll need to create, but the final
# directory should look like this:
#
# EXTRACTHTML
# |--- downloadedhtml                      # Copied directory and files so that the ruby script can overwrite them
# |--- TEMP
#      |--- downloadedhtml                 # Directory with the original data
#      |--- extract-drupal-blog-content.rb # That is this file
#      |--- input-files.txt                # A pruned list of HTML files
#      |--- single-file.txt                # A single file to use to debug the primary content selector
#
# To create the list of files to scrape, run this command:
# $find downloadedhtml -type f > input-files.txt;
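# If any unwanted entries remain in the list (for example leftover *.sh files),
# prune them with something like:
# $grep -v '\.sh$' input-files.txt > pruned.txt && mv pruned.txt input-files.txt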
#
# Each entry's trailing newline is stripped with String#chomp. (Naive slicing with
# line[0..-2] would truncate the last entry when the file lacks a final newline,
# causing an error like:
# ./extract-drupal-blog-content.rb:24:in `initialize': No such file or directory - downloadedhtml/88 (Errno::ENOENT))
#
# You should be ready to either debug the primarycontent SELECTOR or to scrape the input files.
# cd into the TEMP directory and run ./extract-drupal-blog-content.rb to execute this script.
# Open up the list of files to iterate through
filename = 'input-files.txt' # Comment this line out if debugging the primarycontent SELECTOR
# If you're altering the primarycontent selector, then first create single-file.txt:
# cp input-files.txt single-file.txt;
# Open single-file.txt and delete all but the first line.
# Open up the single-file list to iterate through
# filename = 'single-file.txt' # Uncomment this line for debugging the primarycontent SELECTOR
File.open(filename, 'r').each_line do |line|
  # Strip the trailing newline to get the relative file path
  path = line.chomp
  # Provide feedback as to which file is actively being parsed
  puts path
  # Open up the file named on the current line of the input list
  f = File.open(path)
  doc = Nokogiri::HTML(f)
  # SELECTOR: Select the main content of each blog post
  primarycontent = doc.css('.node-blog-post')
  # puts primarycontent # Uncomment this line to debug the primarycontent selection
  # Close out the original file
  f.close
  # Open the copy of this file in the parent directory (EXTRACTHTML/downloadedhtml)
  # for writing, so the cleaned HTML overwrites it
  new = File.open("../" + path, "w")
  # Write only the first matched node; any second instance is erroneous
  new.write(primarycontent[0])
  # Close the output file so the contents are flushed to disk
  new.close
  # Indicate on the command line that this file is finished, to help debug whether a file crashes
  puts path + " FINISHED"
end
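For a quick sanity check of the SELECTOR, here is a minimal sketch (the HTML below is made up for illustration) showing what doc.css('.node-blog-post') keeps and what it drops:

    require 'nokogiri'

    html = <<-SAMPLE
    <html><body>
      <div id="header">Site header and nav</div>
      <div class="node-blog-post"><h2>Post title</h2><p>Body text</p></div>
      <div id="footer">Footer</div>
    </body></html>
    SAMPLE

    doc = Nokogiri::HTML(html)
    primarycontent = doc.css('.node-blog-post')
    # Prints only the div with class 'node-blog-post'; the header and footer are dropped
    puts primarycontent[0]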
kentbye commented Jan 16, 2014

If you're trying to get list item elements with Nokogiri, then use this on line 56:

primarycontent = doc.search('li')
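Note that doc.search accepts either CSS or XPath expressions and returns a NodeSet of every matching node, so this grabs all of the list items on the page, not just the first.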

kentbye commented Jan 16, 2014

If you want to write out each li item on a new line, and also delete the <li> and </li> tags, then use this code snippet on line 66:

    for i in (0..primarycontent.length-1)
      new.puts(primarycontent[i].inner_html)
    end
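An equivalent, more idiomatic form iterates over the NodeSet directly:

    primarycontent.each { |li| new.puts(li.inner_html) }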
    

kentbye commented Jan 16, 2014

To prepend the filename to all of the lines of scraped HTML, first cd into the final directory with the newly written files and run the following two commands.

First, append the file name to the end of each line and concatenate everything into all_files.txt:

    for f in * ; do cat $f | sed 's/$/ '$f'/'  >> all_files.txt; done
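For a file named 2924 (as in the wget example above), a hypothetical line '<h2>Post title</h2>' becomes '<h2>Post title</h2> 2924' in all_files.txt.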
    

Some sed magic via binford2k to move the filename of form "_####" from the end to the beginning and delimit it with a pipe:

    sed -E 's/^(.*) ([0-9]*)$/\2|\1/' all_files.txt > files.txt
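This rewrites the example line '<h2>Post title</h2> 2924' to '2924|<h2>Post title</h2>'.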
    
