Skip to content

Instantly share code, notes, and snippets.

@joeydi
Created November 19, 2014 21:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joeydi/1e312205ff6a5dc6196b to your computer and use it in GitHub Desktop.
Save joeydi/1e312205ff6a5dc6196b to your computer and use it in GitHub Desktop.
Simple WordPress Content Scraper/Importer
import os
import lxml.html
import urllib
import BeautifulSoup
from pprint import pprint
from wordpress_xmlrpc import Client, WordPressPage
from wordpress_xmlrpc.methods.posts import GetPosts, NewPost
from wordpress_xmlrpc.methods.users import GetUserInfo
def get_doc_title(url):
    """Return a page title for ``url``.

    The title is the stripped text of the second ``<h1>`` element of the
    fetched document. If fetching or parsing fails for any reason, the
    function falls back to the URL itself with ``'http://fscb.com'``
    removed, so the caller always gets a usable title.

    Args:
        url: Address of the page to scrape. Surrounding whitespace (such as
            the newline left over when the URL is read from a file) is
            ignored.

    Returns:
        The heading text, or the fallback described above.
    """
    # Lines read from a file carry a trailing newline; without this the
    # fallback title would end in '\n'.
    url = url.strip()
    try:
        doc = urllib.urlopen(url)
        try:
            soup = BeautifulSoup.BeautifulSoup(doc)
        finally:
            # Release the connection as soon as the document is parsed.
            doc.close()
        # Index 1 = second <h1> on the page.
        return soup.findAll('h1')[1].string.strip()
    except Exception:
        # Deliberate best-effort: any failure yields the path-like fallback.
        return url.replace('http://fscb.com', '')
def get_doc_content(url):
    """Return the HTML of the ``right-content`` element of ``url``.

    The document is fetched and parsed, and the first element whose ``id``
    is ``'right-content'`` is returned converted with ``str()``. If fetching
    or parsing fails for any reason (including the element being absent),
    an empty string is returned instead.

    Args:
        url: Address of the page to scrape. Surrounding whitespace (such as
            the newline left over when the URL is read from a file) is
            ignored.

    Returns:
        The element's markup as a string, or ``''`` on failure.
    """
    # Lines read from a file carry a trailing newline; fetch the clean URL.
    url = url.strip()
    try:
        doc = urllib.urlopen(url)
        try:
            soup = BeautifulSoup.BeautifulSoup(doc)
        finally:
            # Release the connection as soon as the document is parsed.
            doc.close()
        return str(soup.findAll(id='right-content')[0])
    except Exception:
        # Deliberate best-effort: any failure yields empty content.
        return ''
def create_page(client, title, content):
    """Publish a new WordPress page and return the result of the call.

    Args:
        client: Object whose ``call`` method submits the ``NewPost`` request.
        title: Value assigned to the page's ``title``.
        content: Value assigned to the page's ``content``.

    Returns:
        Whatever ``client.call`` returns for the ``NewPost`` request.
    """
    new_page = WordPressPage()
    # Populate the page fields in the same order as a plain assignment list;
    # the page is always created with status 'publish'.
    fields = (
        ('title', title),
        ('content', content),
        ('post_status', 'publish'),
    )
    for attribute, value in fields:
        setattr(new_page, attribute, value)
    request = NewPost(new_page)
    return client.call(request)
def process_file(in_file, wp):
    """Create one WordPress page for every URL listed in a text file.

    Each non-blank line of ``in_file`` is treated as a URL. Its title and
    content are obtained with ``get_doc_title`` and ``get_doc_content`` and
    passed to ``create_page``. ``Success`` is printed for each page created;
    a failure is reported with ``Error on line: <url>`` and the loop moves
    on, so one bad URL does not abort the whole run.

    Args:
        in_file: Path of a text file containing one URL per line.
        wp: Client object handed through to ``create_page``.

    Returns:
        None.
    """
    # ``with`` closes the file even if the loop is interrupted.
    with open(in_file, 'r') as url_file:
        # Iterate lazily instead of loading every line with readlines().
        for line in url_file:
            url = line.strip()
            if not url:
                # Blank lines are not URLs; skipping them avoids creating
                # empty pages.
                continue
            try:
                title = get_doc_title(url)
                content = get_doc_content(url)
                create_page(wp, title, content)
            except Exception:
                # ``Exception`` rather than a bare ``except`` so Ctrl-C and
                # SystemExit can still stop the import.
                # Single-argument parenthesised print behaves the same on
                # Python 2 and Python 3.
                print('Error on line: %s' % url)
            else:
                print('Success')
    return None
if __name__ == "__main__":
    # Script entry point: read URLs from 'urls.txt' and import each one into
    # the WordPress site reachable through the XML-RPC client below.
    # The endpoint and credentials here are the values passed to Client.
    in_file = 'urls.txt'
    wp = Client('http://example.com/xmlrpc.php', 'username', 'password')
    process_file(in_file, wp)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment