Skip to content

Instantly share code, notes, and snippets.

@nkuln
Created September 15, 2012 11:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nkuln/3727492 to your computer and use it in GitHub Desktop.
Save nkuln/3727492 to your computer and use it in GitHub Desktop.
Download all images referenced in WordPress posts, and then replace the existing image URLs with the new ones
import MySQLdb as mdb
import HTMLParser
import random
import urllib2
import urlparse
from bs4 import BeautifulSoup
from os import path
class ImageMigrator:
    """Download images referenced by post HTML and rewrite their URLs.

    Image links are collected from ``<img src>`` and ``<a href>`` attributes,
    fetched relative to ``oldbaseurl``, saved into ``imgdir`` and mapped to a
    URL underneath ``newbaseurl``.
    """

    def __init__(self,
                 imgdir='migrate_images',
                 imgexts=('jpg', 'jpeg', 'gif', 'bmp', 'png'),
                 oldbaseurl=u'http://www.oldblog.net',
                 newbaseurl=u'http://www.example.com/migrate_images'):
        """Store the migration settings.

        Args:
            imgdir: Local directory the downloaded images are written to.
                It is not created here; it must already exist.
            imgexts: Iterable of file extensions (with or without a leading
                dot) that identify a link as an image.
            oldbaseurl: Base URL used to resolve relative image links.
            newbaseurl: Base URL under which migrated images will be served.
        """
        self.imgdir = imgdir
        self.imgexts = imgexts
        self.oldbaseurl = oldbaseurl
        self.newbaseurl = newbaseurl

    def download_images_and_create_mappings(self, html):
        """Download every image linked from *html*.

        Failures are reported on stdout and the offending link is skipped, so
        one broken image does not abort the whole post.

        Args:
            html: HTML text of a single post.

        Returns:
            A dict mapping each absolute source URL that was downloaded
            successfully to its new URL.
        """
        urlmap = {}
        for link in self._extract_image_links(html):
            try:
                link = urlparse.urljoin(self.oldbaseurl, link)
                newurl = self._download(link)
            except urllib2.HTTPError:
                # HTTPError is a subclass of URLError, so it must come first.
                print("**** cannot download image from  %s  due to HTTP error! ****" % link)
            except urllib2.URLError:
                print("**** cannot download image from  %s  due to URL error! ****" % link)
            except Exception as err:
                # Deliberately best-effort, but never swallow KeyboardInterrupt
                # or SystemExit, and show what actually went wrong.
                print("**** cannot download image from  %s  due to unknown error! ****" % link)
                print("     error detail: %r" % (err,))
            else:
                print("image downloaded %s" % newurl)
                urlmap[link] = newurl
        return urlmap

    def _download(self, link):
        """Fetch *link*, save it under ``imgdir`` and return its new URL."""
        filename = self._filename_from_url(link)
        print("downloading from %s .." % link)
        imgfile = urllib2.urlopen(link)
        try:
            data = imgfile.read()
        finally:
            imgfile.close()
        # Avoid overwriting an existing file: prefix random digits until the
        # name is unused.
        savepath = path.join(self.imgdir, filename)
        while path.isfile(savepath):
            filename = str(random.randint(0, 99)) + filename
            savepath = path.join(self.imgdir, filename)
        with open(savepath, 'wb') as out:
            out.write(data)
        return self._new_url_for(filename)

    def _new_url_for(self, filename):
        """Return the URL of *filename* underneath ``newbaseurl``.

        ``urljoin`` replaces the last path segment of a base that lacks a
        trailing slash, so the slash is added first to keep the full base path.
        """
        base = self.newbaseurl
        if not base.endswith('/'):
            base += '/'
        return urlparse.urljoin(base, filename)

    def _filename_from_url(self, url):
        """Return the text after the last ``/``, ``?``, ``#`` or ``=`` in *url*."""
        for separator in ('?', '#', '='):
            url = url.replace(separator, '/')
        return url.split('/')[-1]

    def _extract_image_links(self, html):
        """Return the unique image links found in *html*, in document order.

        Both ``<img src>`` values and ``<a href>`` values are considered; a
        link is kept only when ``_is_image_link`` accepts it. Returns an empty
        list when *html* is empty or cannot be parsed.
        """
        if not html:
            return []
        try:
            soup = BeautifulSoup(html)
        except HTMLParser.HTMLParseError:
            print("**** cannot parse the HTML from post ****")
            return []
        candidates = [tag['src'] for tag in soup.find_all('img')
                      if tag.has_attr('src')]
        candidates += [tag['href'] for tag in soup.find_all('a')
                       if tag.has_attr('href')]
        seen = set()
        imglinks = []
        for link in candidates:
            if link not in seen and self._is_image_link(link):
                seen.add(link)
                imglinks.append(link)
        return imglinks

    def _is_image_link(self, link):
        """Return True when *link* ends with ``.<ext>`` for a configured extension.

        The comparison is case-insensitive. The dot is required so that a URL
        such as ``/tags/png`` is not mistaken for an image.
        """
        suffixes = tuple('.' + ext.lower().lstrip('.') for ext in self.imgexts)
        return link.lower().endswith(suffixes)

    def apply_mapping(self, urlmap, html):
        """Replace every key of *urlmap* found in *html* with its value.

        The rewritten HTML is also appended, UTF-8 encoded, to ``output.html``
        inside ``imgdir`` as a record of what was produced.

        Args:
            urlmap: Dict of old URL -> new URL.
            html: HTML text of a single post.

        Returns:
            The HTML with all replacements applied.
        """
        ret = html
        # Longest keys first, so a URL that is a prefix of another one cannot
        # corrupt the longer URL before it gets its own replacement.
        for old, new in sorted(urlmap.items(),
                               key=lambda item: len(item[0]),
                               reverse=True):
            ret = ret.replace(old, new)
        data = ret if isinstance(ret, bytes) else ret.encode('UTF-8')
        with open(path.join(self.imgdir, 'output.html'), 'ab') as out:
            out.write(data)
        return ret

    def relative_to_absolute_url(self, html):
        """Map relative image links in *html* to URLs under ``newbaseurl``.

        Only links starting with ``/`` or ``.`` are considered. Nothing is
        downloaded.

        Returns:
            A dict mapping each relative link to its new absolute URL.
        """
        urlmap = {}
        for link in self._extract_image_links(html):
            if link.startswith(('/', '.')):
                newurl = self._new_url_for(self._filename_from_url(link))
                print('map %s to %s' % (link, newurl))
                urlmap[link] = newurl
        return urlmap
if __name__ == '__main__':
    # Migrate the images of every row in wp_posts: download them, rewrite the
    # URLs in the post content and store the result back in the same row.
    db = mdb.connect(host='localhost', db='lifeonvm', user='root', charset='utf8')
    try:
        migrator = ImageMigrator(oldbaseurl='http://www.solidskill.net/',
                                 newbaseurl='http://static.nkuln.com/uploaded/')
        cursor = db.cursor()
        try:
            # Select the two needed columns by name instead of relying on
            # their positions within "SELECT *".
            cursor.execute('SELECT ID, post_content FROM wp_posts')
            rows = cursor.fetchall()
            for post_id, post_content in rows:
                print("==== PROCESSING POST ID = %s ====" % post_id)
                urlmap = migrator.download_images_and_create_mappings(post_content)
                # Quick patch to change relative to absolute URLs ..
                #urlmap = migrator.relative_to_absolute_url(post_content)
                data = migrator.apply_mapping(urlmap, post_content)
                ret = cursor.execute(
                    'UPDATE wp_posts SET post_content=%s WHERE ID=%s',
                    (data, post_id))
                print("row effected by update = %s id= %s" % (ret, post_id))
            # Commit once at the end, after all posts have been processed.
            print("commiting changes ..")
            db.commit()
            print("commiting done!")
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when a post fails midway.
        db.close()
@teerapap
Copy link

You have a WordPress blog?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment