Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save maricris-sn/641983 to your computer and use it in GitHub Desktop.
Save maricris-sn/641983 to your computer and use it in GitHub Desktop.
require 'rubygems'
require 'cgi'
require 'date'
require 'net/http'
require 'open-uri'
require 'hpricot'
require 'atom/entry'
require 'atom/collection'
#Declare your variables
urls_to_import = "urls.txt"
wp_blog_host = "livinglife.sweetperceptions.com"
wp_blog_uri = "http://#{wp_blog_host}"
wp_base = "http://#{wp_blog_host}/wp-app.php"
wp_blog_username = "myusername"
wp_blog_password = "mypassword"
your_blog_source = "http://sweetperceptions.i.ph"
which_pages = 1..19
authors = {
'Maricris Nonato' => {'user' => 'myusername', 'password' => 'mypassword'}
}
registered_categories = ["About me", "Artistry", "Cool Finds", "Dreams", "Events", "Health and Beauty", "Horoscope", "Living Life", "Meme", "Movies", "Music", "Notes", "Pet Love", "Quotes", "Random thoughts", "Stories to share", "Techie", "Travel"]
synonym_categories = {
"About me" => ["me"],
"Artistry" => ["poem"],
"Cool Finds" => ["cool"],
"Dreams" => ["dream","dreams"],
"Events" => ["event", "bday", "birthday", "Christmas", "New year", "new-year", "celebration"],
"Health and Beauty" => ["health", "sickness", "headache", "fever", "cancer"],
"Horoscope" => ["cookie", "fortune", "horoscope", "astrology", "psych"],
"Living Life" => ["life", "kalokohan"],
"Meme" => ["meme"],
"Movies" => ["hollywood", "movie", "movies", "movie-lines", "happy-feet"],
"Music" => ["song", "songs", "singer", "music", "ost"],
"Notes" => ["notes"],
"Pet Love" => ["pet", "cat", "dog", "animal", "animals", "pets"],
"Quotes" => ["quote", "quotes"],
"Random thoughts" => ["thought", "thoughts", "think", "logic"],
"Stories to share" => ["story", "stories", "adventure"],
"Techie" => ["tech", "techie", "work", "web2.0", "development", "software", "online", "skype", "pc"],
"Travel" => ["philippines", "travel", "province"],
}
# Rules of matching to categories:
# 1. exact match
# 2. synonyms/variations -> manual
# Build the list of post URLs to import.
# Option A (active): scrape the post links from every listing page of the source blog.
urls = []
which_pages.each do |page_number|
  listing = Hpricot(open("#{your_blog_source}/page/#{page_number}/"))
  post_links = (listing/"h3[@class='entrytitle']/a")
  urls.push(post_links.map { |link| link['href'] })
end
# Option B: uncomment the next line to read the URLs from a text file instead.
# urls = File.readlines(urls_to_import).map { |line| line.chomp }
# Collapse the per-page lists into one flat list and drop nil entries
# (links that had no href attribute).
urls = urls.flatten.compact
# Import every collected post into WordPress through its AtomPub endpoint.
#
# For each URL the page is fetched and parsed, its title, date, excerpt, tags
# and body are extracted, the tags are mapped onto the registered categories,
# and the result is POSTed as an Atom entry. One status line is printed per
# imported post.

# The HTTP client and the target collection do not depend on the post being
# imported, so they are built once and reused for every entry.
req = Atom::HTTP.new
req.user = wp_blog_username
req.pass = wp_blog_password
req.always_auth = :basic
collection = Atom::Collection.new(wp_base + "/posts", req)

urls.each do |target|
  doc = Hpricot(open(target))

  # Extract HTML within elements matching the XPath expressions
  title = CGI.unescapeHTML((doc/"div/h3[@class='entrytitle']/a").inner_html.strip).gsub(/\r\n/, '')
  # Kept under its own name so the Atom::Author object built further down
  # cannot overwrite it before it is assigned as the author's name.
  author_name = "Chris"

  # The time of day (h:mm:ss) is picked out of the meta block, the date from
  # its own span.
  timestr = (doc/"div[@class='meta-post']").inner_html[/\d+:\d\d:\d\d/]
  datestr = (doc/"div/span[@class='date']").inner_html.strip.gsub(/\r\n/, '')
  # timestr is nil when no time was found; fall back to the date alone
  # instead of raising a TypeError on string concatenation.
  datestr = [datestr, timestr].compact.join(' ')
  # NOTE: the UTC offset is hard-coded to -0500 rather than taken from the page.
  datestr = DateTime.parse(datestr).strftime('%a, %-d %b %Y %T -0500')

  hExcerpt = (doc/"div[@class='entry_summary']").inner_html.gsub(/\r\n/, '')

  # Map the post's tags onto WordPress categories.
  post_tags = (doc/"div[@class='tag-list']/a").map { |link| link.inner_html }
  # rule 1 -> exact match: the tag is itself a registered category
  exact_matches = post_tags.select { |tag| registered_categories.include?(tag) }
  # rule 2 -> synonyms: the category lists at least one of the post's tags
  synonym_matches = synonym_categories.keys.select do |category|
    !(synonym_categories[category] & post_tags).empty?
  end
  tags = (exact_matches + synonym_matches).uniq.join(',')

  # The body lives in the element whose id is "postentry-<n>", where <n> is
  # the last dash-separated part of the blog div's id.
  entry_id = "postentry-#{doc.at("div[@class='blog']")['id'].split('-').last}"
  # Remove the elements that were already extracted above or are not wanted
  # in the imported body.
  (doc/"##{entry_id}/h3").remove
  (doc/"##{entry_id}/span[@class='date']").remove
  (doc/"##{entry_id}/div[@class='tag-list']").remove
  (doc/"##{entry_id}/div[@class='meta-post']").remove
  # Strip the whitespace runs left behind; they are not inside any Hpricot
  # element, so they have to be removed as plain strings.
  content = (doc/"##{entry_id}").inner_html.gsub("\n \n \n \n \n \n", '').gsub("\n \n \n", '')

  # Atom Author element
  atom_author = Atom::Author.new
  atom_author.name = author_name
  atom_author.uri = wp_blog_uri

  # Atom Entry element
  entry = Atom::Entry.new
  entry.title = title
  entry.summary = hExcerpt
  entry.content = content
  entry.content.type = "html"
  entry.published = datestr
  entry.updated = datestr
  entry.tag_with(tags, ',')
  entry.authors << atom_author

  # Post the entry to the Atom collection and report the server's response
  res = collection.post! entry
  puts "Imported URL: #{target}, at #{datestr}, #{res.message}\n"
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment