# nkuln/wpphotomigrator.py

## wpphotomigrator.py

import MySQLdb as mdb
import HTMLParser
import random
import urllib2
import urlparse
from bs4 import BeautifulSoup
from os import path

class ImageMigrator:
    """Copy images referenced by post HTML to local disk and remap their URLs.

    Links are collected from ``<img src>`` and ``<a href>`` attributes, kept
    when they end in one of the configured image extensions, downloaded into
    ``imgdir`` and mapped to a URL directly below ``newbaseurl``.
    """

    # Extensions used when the caller does not pass ``imgexts``.
    DEFAULT_IMGEXTS = ('jpg', 'jpeg', 'gif', 'bmp', 'png')

    def __init__(self,
            imgdir='migrate_images',
            imgexts=None,
            oldbaseurl=u'http://www.oldblog.net',
            newbaseurl=u'http://www.example.com/migrate_images'):
        """Store the migration settings.

        imgdir     -- directory receiving the downloaded files and output.html
        imgexts    -- iterable of extensions, with or without a leading dot;
                      None selects DEFAULT_IMGEXTS
        oldbaseurl -- base used to resolve relative links found in the HTML
        newbaseurl -- base URL the migrated file names are appended to
        """
        self.imgdir = imgdir
        # Build a fresh list per instance: a list literal used as the default
        # argument would be one shared, mutable object for every instance.
        if imgexts is None:
            imgexts = self.DEFAULT_IMGEXTS
        self.imgexts = list(imgexts)
        self.oldbaseurl = oldbaseurl
        self.newbaseurl = newbaseurl

    def download_images_and_create_mappings(self, html):
        """Download every image linked from *html*; return {old URL: new URL}.

        Keys are the absolute URLs obtained by joining each link with
        ``oldbaseurl``. A link that cannot be fetched or stored is reported on
        stdout and left out of the mapping; it never aborts the run.
        """
        urlmap = {}

        for link in self._extract_image_links(html):
            try:
                link = urlparse.urljoin(self.oldbaseurl, link)
                filename = self._filename_from_url(link)

                print("downloading from %s .." % (link,))
                response = urllib2.urlopen(link)
                try:
                    # Read everything before touching the disk so a failed
                    # transfer does not leave an empty file behind.
                    payload = response.read()
                finally:
                    response.close()

                # Never overwrite an existing file: prefix random digits until
                # the name is free.
                savepath = path.join(self.imgdir, filename)
                while path.isfile(savepath):
                    filename = str(random.randint(0, 99)) + filename
                    savepath = path.join(self.imgdir, filename)

                with open(savepath, 'wb') as out:
                    out.write(payload)

                newurl = self._build_new_url(filename)
                print("image downloaded %s" % (newurl,))

                urlmap[link] = newurl
            except urllib2.HTTPError:
                print("**** cannot download image from  %s  due to HTTP error! ****" % (link,))
            except urllib2.URLError:
                print("**** cannot download image from  %s  due to URL error! ****" % (link,))
            except Exception as exc:
                # ``Exception`` rather than a bare ``except`` so that Ctrl-C
                # and SystemExit still stop the migration.
                print("**** cannot download image from  %s  due to unknown error! ****" % (link,))
                print("     reason: %r" % (exc,))

        return urlmap

    def _build_new_url(self, filename):
        """Return the URL of *filename* directly below ``newbaseurl``.

        ``urljoin`` replaces the last path segment of a base that does not end
        in '/', so the slash is added first; otherwise a base such as
        'http://host/migrate_images' would lose its 'migrate_images' part.
        """
        base = self.newbaseurl
        if not base.endswith('/'):
            base += '/'
        return urlparse.urljoin(base, filename)

    def _filename_from_url(self, url):
        """Return the text after the last '/', '?', '#' or '=' in *url*."""
        for separator in ('?', '#', '='):
            url = url.replace(separator, '/')
        return url.split('/')[-1]

    def _extract_image_links(self, html):
        """Return the distinct image links in *html*, in document order.

        ``src`` values of <img> tags come first, then ``href`` values of <a>
        tags. An empty list is returned when the parser raises
        HTMLParseError.
        """
        try:
            soup = BeautifulSoup(html)
            candidates = [tag['src'] for tag in soup.find_all('img')
                          if tag.has_attr('src')]
            candidates += [tag['href'] for tag in soup.find_all('a')
                           if tag.has_attr('href')]
        except HTMLParser.HTMLParseError:
            print("**** cannot parse the HTML from post ****")
            return []

        # De-duplicate while keeping first-seen order so that runs are
        # reproducible (a plain set() would shuffle the download order).
        seen = set()
        imglinks = []
        for candidate in candidates:
            if candidate not in seen and self._is_image_link(candidate):
                seen.add(candidate)
                imglinks.append(candidate)
        return imglinks

    def _is_image_link(self, link):
        """Return True when *link* ends in '.<ext>' for a configured extension.

        The comparison is case-insensitive. The dot is required so that a URL
        such as '/tags/png' is not mistaken for an image.
        """
        suffixes = tuple('.' + ext.lower().lstrip('.') for ext in self.imgexts)
        return link.lower().endswith(suffixes)

    def apply_mapping(self, urlmap, html):
        """Return *html* with every key of *urlmap* replaced by its value.

        The result is also appended, UTF-8 encoded, to ``output.html`` inside
        ``imgdir``.
        """
        import re

        ret = html
        # Empty keys are skipped: an empty alternative would match everywhere.
        keys = sorted((k for k in urlmap if k), key=len, reverse=True)
        if keys:
            # One pass over the text, longest key first: a replacement that
            # was just inserted can never be rewritten again by a shorter key
            # that happens to be a substring of it.
            pattern = re.compile('|'.join(re.escape(k) for k in keys))
            ret = pattern.sub(lambda match: urlmap[match.group(0)], html)

        # Binary append because the payload is already encoded bytes.
        with open(path.join(self.imgdir, 'output.html'), 'ab') as out:
            out.write(ret.encode('UTF-8'))
        return ret

    def relative_to_absolute_url(self, html):
        """Map each image link starting with '/' or '.' to a ``newbaseurl`` URL.

        Nothing is downloaded; only the {relative link: new URL} mapping is
        built and returned.
        """
        urlmap = {}

        for link in self._extract_image_links(html):
            if link.startswith(('/', '.')):
                newurl = self._build_new_url(self._filename_from_url(link))
                print('map %s to %s' % (link, newurl))
                urlmap[link] = newurl

        return urlmap

if __name__ == '__main__':
    # One-off migration: for every row of wp_posts, download the images the
    # post links to, rewrite those links and store the new content. All
    # updates are committed together at the end.
    db = mdb.connect(host='localhost', db='lifeonvm', user='root', charset='utf8')
    try:
        migrator = ImageMigrator(oldbaseurl='http://www.solidskill.net/',
                newbaseurl='http://static.nkuln.com/uploaded/')
        cursor = db.cursor()
        # Select the two needed columns by name; indexing into "SELECT *"
        # silently depends on the table's physical column order.
        cursor.execute('SELECT ID, post_content FROM wp_posts')
        rows = cursor.fetchall()

        for post_id, post_content in rows:
            print("==== PROCESSING POST ID = %s ====" % (post_id,))
            urlmap = migrator.download_images_and_create_mappings(post_content)

            # Quick patch to change relative to absolute URLs ..
            #urlmap = migrator.relative_to_absolute_url(post_content)

            data = migrator.apply_mapping(urlmap, post_content)
            ret = cursor.execute('UPDATE wp_posts SET post_content=%s WHERE ID=%s', (data, post_id))
            print("row effected by update = %s id= %s" % (ret, post_id))


        print("commiting changes ..")
        db.commit()
        print("commiting done!")
    finally:
        # Close the connection even when a post fails half-way; nothing is
        # committed in that case because commit() is only reached on success.
        db.close()

	import MySQLdb as mdb
	import HTMLParser
	import random
	import urllib2
	import urlparse
	from bs4 import BeautifulSoup
	from os import path

	class ImageMigrator:
	    """Copy images referenced by post HTML to local disk and remap their URLs.

	    Links are collected from ``<img src>`` and ``<a href>`` attributes, kept
	    when they end in one of the configured image extensions, downloaded into
	    ``imgdir`` and mapped to a URL directly below ``newbaseurl``.
	    """

	    # Extensions used when the caller does not pass ``imgexts``.
	    DEFAULT_IMGEXTS = ('jpg', 'jpeg', 'gif', 'bmp', 'png')

	    def __init__(self,
	            imgdir='migrate_images',
	            imgexts=None,
	            oldbaseurl=u'http://www.oldblog.net',
	            newbaseurl=u'http://www.example.com/migrate_images'):
	        """Store the migration settings.

	        imgdir     -- directory receiving the downloaded files and output.html
	        imgexts    -- iterable of extensions, with or without a leading dot;
	                      None selects DEFAULT_IMGEXTS
	        oldbaseurl -- base used to resolve relative links found in the HTML
	        newbaseurl -- base URL the migrated file names are appended to
	        """
	        self.imgdir = imgdir
	        # Build a fresh list per instance: a list literal used as the default
	        # argument would be one shared, mutable object for every instance.
	        if imgexts is None:
	            imgexts = self.DEFAULT_IMGEXTS
	        self.imgexts = list(imgexts)
	        self.oldbaseurl = oldbaseurl
	        self.newbaseurl = newbaseurl

	    def download_images_and_create_mappings(self, html):
	        """Download every image linked from *html*; return {old URL: new URL}.

	        Keys are the absolute URLs obtained by joining each link with
	        ``oldbaseurl``. A link that cannot be fetched or stored is reported on
	        stdout and left out of the mapping; it never aborts the run.
	        """
	        urlmap = {}

	        for link in self._extract_image_links(html):
	            try:
	                link = urlparse.urljoin(self.oldbaseurl, link)
	                filename = self._filename_from_url(link)

	                print("downloading from %s .." % (link,))
	                response = urllib2.urlopen(link)
	                try:
	                    # Read everything before touching the disk so a failed
	                    # transfer does not leave an empty file behind.
	                    payload = response.read()
	                finally:
	                    response.close()

	                # Never overwrite an existing file: prefix random digits until
	                # the name is free.
	                savepath = path.join(self.imgdir, filename)
	                while path.isfile(savepath):
	                    filename = str(random.randint(0, 99)) + filename
	                    savepath = path.join(self.imgdir, filename)

	                with open(savepath, 'wb') as out:
	                    out.write(payload)

	                newurl = self._build_new_url(filename)
	                print("image downloaded %s" % (newurl,))

	                urlmap[link] = newurl
	            except urllib2.HTTPError:
	                print("** cannot download image from  %s  due to HTTP error! **" % (link,))
	            except urllib2.URLError:
	                print("** cannot download image from  %s  due to URL error! **" % (link,))
	            except Exception as exc:
	                # ``Exception`` rather than a bare ``except`` so that Ctrl-C
	                # and SystemExit still stop the migration.
	                print("** cannot download image from  %s  due to unknown error! **" % (link,))
	                print("     reason: %r" % (exc,))

	        return urlmap

	    def _build_new_url(self, filename):
	        """Return the URL of *filename* directly below ``newbaseurl``.

	        ``urljoin`` replaces the last path segment of a base that does not end
	        in '/', so the slash is added first; otherwise a base such as
	        'http://host/migrate_images' would lose its 'migrate_images' part.
	        """
	        base = self.newbaseurl
	        if not base.endswith('/'):
	            base += '/'
	        return urlparse.urljoin(base, filename)

	    def _filename_from_url(self, url):
	        """Return the text after the last '/', '?', '#' or '=' in *url*."""
	        for separator in ('?', '#', '='):
	            url = url.replace(separator, '/')
	        return url.split('/')[-1]

	    def _extract_image_links(self, html):
	        """Return the distinct image links in *html*, in document order.

	        ``src`` values of <img> tags come first, then ``href`` values of <a>
	        tags. An empty list is returned when the parser raises
	        HTMLParseError.
	        """
	        try:
	            soup = BeautifulSoup(html)
	            candidates = [tag['src'] for tag in soup.find_all('img')
	                          if tag.has_attr('src')]
	            candidates += [tag['href'] for tag in soup.find_all('a')
	                           if tag.has_attr('href')]
	        except HTMLParser.HTMLParseError:
	            print("** cannot parse the HTML from post **")
	            return []

	        # De-duplicate while keeping first-seen order so that runs are
	        # reproducible (a plain set() would shuffle the download order).
	        seen = set()
	        imglinks = []
	        for candidate in candidates:
	            if candidate not in seen and self._is_image_link(candidate):
	                seen.add(candidate)
	                imglinks.append(candidate)
	        return imglinks

	    def _is_image_link(self, link):
	        """Return True when *link* ends in '.<ext>' for a configured extension.

	        The comparison is case-insensitive. The dot is required so that a URL
	        such as '/tags/png' is not mistaken for an image.
	        """
	        suffixes = tuple('.' + ext.lower().lstrip('.') for ext in self.imgexts)
	        return link.lower().endswith(suffixes)

	    def apply_mapping(self, urlmap, html):
	        """Return *html* with every key of *urlmap* replaced by its value.

	        The result is also appended, UTF-8 encoded, to ``output.html`` inside
	        ``imgdir``.
	        """
	        import re

	        ret = html
	        # Empty keys are skipped: an empty alternative would match everywhere.
	        keys = sorted((k for k in urlmap if k), key=len, reverse=True)
	        if keys:
	            # One pass over the text, longest key first: a replacement that
	            # was just inserted can never be rewritten again by a shorter key
	            # that happens to be a substring of it.
	            pattern = re.compile('|'.join(re.escape(k) for k in keys))
	            ret = pattern.sub(lambda match: urlmap[match.group(0)], html)

	        # Binary append because the payload is already encoded bytes.
	        with open(path.join(self.imgdir, 'output.html'), 'ab') as out:
	            out.write(ret.encode('UTF-8'))
	        return ret

	    def relative_to_absolute_url(self, html):
	        """Map each image link starting with '/' or '.' to a ``newbaseurl`` URL.

	        Nothing is downloaded; only the {relative link: new URL} mapping is
	        built and returned.
	        """
	        urlmap = {}

	        for link in self._extract_image_links(html):
	            if link.startswith(('/', '.')):
	                newurl = self._build_new_url(self._filename_from_url(link))
	                print('map %s to %s' % (link, newurl))
	                urlmap[link] = newurl

	        return urlmap

	if __name__ == '__main__':
	    # One-off migration: for every row of wp_posts, download the images the
	    # post links to, rewrite those links and store the new content. All
	    # updates are committed together at the end.
	    db = mdb.connect(host='localhost', db='lifeonvm', user='root', charset='utf8')
	    try:
	        migrator = ImageMigrator(oldbaseurl='http://www.solidskill.net/',
	                newbaseurl='http://static.nkuln.com/uploaded/')
	        cursor = db.cursor()
	        # Select the two needed columns by name; indexing into "SELECT *"
	        # silently depends on the table's physical column order.
	        cursor.execute('SELECT ID, post_content FROM wp_posts')
	        rows = cursor.fetchall()

	        for post_id, post_content in rows:
	            print("==== PROCESSING POST ID = %s ====" % (post_id,))
	            urlmap = migrator.download_images_and_create_mappings(post_content)

	            # Quick patch to change relative to absolute URLs ..
	            #urlmap = migrator.relative_to_absolute_url(post_content)

	            data = migrator.apply_mapping(urlmap, post_content)
	            ret = cursor.execute('UPDATE wp_posts SET post_content=%s WHERE ID=%s', (data, post_id))
	            print("row effected by update = %s id= %s" % (ret, post_id))


	        print("commiting changes ..")
	        db.commit()
	        print("commiting done!")
	    finally:
	        # Close the connection even when a post fails half-way; nothing is
	        # committed in that case because commit() is only reached on success.
	        db.close()