Apreche/vanillafix.py

## vanillafix.py
#!/usr/bin/env python
# Reformat comments for transition from Vanilla 1 to Vanilla 2

import re
import MySQLdb

from BeautifulSoup import BeautifulSoup

# CommentIDs whose body BeautifulSoup could not parse; printed at the end.
bad_comments = set()

# One connection, two cursors: one runs the SELECTs, the other the UPDATEs.
connection = MySQLdb.connect(
    user='root',
    db='newforum',
    use_unicode=True
)
select_cursor = connection.cursor()
update_cursor = connection.cursor()

# Reformat embedded videos
#
# Every <video type="...">ID</video> tag whose type is listed in
# video_formats is replaced by a plain-text URL built from the tag's
# contents.  Tags with an unlisted type are counted but left in place.
video_formats = {
    'youtube': "http://www.youtube.com/watch?v=%s ",
    'vimeo': "http://vimeo.com/%s ",
    'google': "http://video.google.com/videoplay?docid=%s ",
}

select_query = """
SELECT CommentID, Body
FROM GDN_Comment
WHERE Body LIKE '%video%'
"""
select_cursor.execute(select_query)

# Tally of every video type seen (supported or not), printed afterwards.
video_types = {}
for comment_id, body in select_cursor.fetchall():
    try:
        soup = BeautifulSoup(''.join(body))
    except Exception:
        # Unparseable markup: record the comment and leave it untouched.
        bad_comments.add(comment_id)
        continue
    video_tags = soup.findAll('video')
    if not video_tags:
        # The LIKE filter also matches bodies that only contain the text
        # "video" without an actual <video> tag.
        continue
    for tag in video_tags:
        if not tag.has_key('type'):
            continue
        video_type = tag['type'].lower()
        video_types[video_type] = video_types.get(video_type, 0) + 1
        url_format = video_formats.get(video_type)
        if url_format is not None:
            # Whitespace around the id would end up inside the URL and
            # break it, so strip it before formatting.
            video_id = tag.decodeContents().strip()
            tag.replaceWith(url_format % video_id)
    update_cursor.execute(
        "UPDATE GDN_Comment SET Body = %s WHERE CommentID = %s",
        (soup, comment_id))
print("VIDEO TYPE COUNT: %s" % video_types)

# Remove width attribute from all img tags
#
# Every comment matched by the query is re-serialised and written back,
# whether or not any of its <img> tags carried a width attribute.
select_query = """
SELECT CommentID, Body
FROM GDN_Comment
WHERE Body LIKE '%img%'
"""
select_cursor.execute(select_query)
for row in select_cursor.fetchall():
    row_id, html = row[0], row[1]
    try:
        soup = BeautifulSoup(''.join(html))
    except Exception:
        # Unparseable markup: record the comment and leave it untouched.
        bad_comments.add(row_id)
        continue
    for image in soup.findAll('img'):
        del image['width']
    update_cursor.execute(
        "UPDATE GDN_Comment SET Body = %s WHERE CommentID = %s",
        (soup, row_id))

# Rewrite all quote citations to the new format
#
# For each <blockquote> in which a <cite> is found, the <cite> element is
# removed and its text is stored in the blockquote's rel attribute, with a
# leading "Posted By: " prefix dropped when present.
select_query = """
SELECT CommentID, Body
FROM GDN_Comment
WHERE Body LIKE '%blockquote%'
"""
select_cursor.execute(select_query)

# Compiled once: the pattern is the same for every row.
posted_by = re.compile(r'^Posted By: (.*)')

for comment_id, body in select_cursor.fetchall():
    try:
        soup = BeautifulSoup(''.join(body))
    except Exception:
        # Unparseable markup: record the comment and leave it untouched.
        bad_comments.add(comment_id)
        continue
    for quote in soup.findAll('blockquote'):
        cite = quote.cite
        if not cite:
            continue
        citation = cite.decodeContents()
        match = posted_by.match(citation)
        if match:
            # Keep only the text that follows "Posted By: ".
            citation = match.group(1)
        cite.extract()
        quote['rel'] = citation
    update_cursor.execute(
        "UPDATE GDN_Comment SET Body = %s WHERE CommentID = %s",
        (soup, comment_id))

# MySQLdb opens connections with autocommit disabled, so without an
# explicit commit the UPDATEs issued above are discarded on transactional
# storage engines when the connection closes.  On non-transactional
# engines commit() does nothing.
connection.commit()

update_cursor.close()
select_cursor.close()
connection.close()

# Report the comments that could not be parsed and were left unchanged.
print("BAD COMMENTS: %s" % bad_comments)
	#!/usr/bin/env python
	# Reformat comments for transition from Vanilla 1 to Vanilla 2

	import re
	import MySQLdb

	from BeautifulSoup import BeautifulSoup

	# CommentIDs whose body BeautifulSoup could not parse; printed at the end.
	bad_comments = set()

	# One connection, two cursors: one runs the SELECTs, the other the UPDATEs.
	connection = MySQLdb.connect(
	    user='root',
	    db='newforum',
	    use_unicode=True
	)
	select_cursor = connection.cursor()
	update_cursor = connection.cursor()

	# Reformat embedded videos
	#
	# Every <video type="...">ID</video> tag whose type is listed in
	# video_formats is replaced by a plain-text URL built from the tag's
	# contents.  Tags with an unlisted type are counted but left in place.
	video_formats = {
	    'youtube': "http://www.youtube.com/watch?v=%s ",
	    'vimeo': "http://vimeo.com/%s ",
	    'google': "http://video.google.com/videoplay?docid=%s ",
	}

	select_query = """
	SELECT CommentID, Body
	FROM GDN_Comment
	WHERE Body LIKE '%video%'
	"""
	select_cursor.execute(select_query)

	# Tally of every video type seen (supported or not), printed afterwards.
	video_types = {}
	for comment_id, body in select_cursor.fetchall():
	    try:
	        soup = BeautifulSoup(''.join(body))
	    except Exception:
	        # Unparseable markup: record the comment and leave it untouched.
	        bad_comments.add(comment_id)
	        continue
	    video_tags = soup.findAll('video')
	    if not video_tags:
	        # The LIKE filter also matches bodies that only contain the text
	        # "video" without an actual <video> tag.
	        continue
	    for tag in video_tags:
	        if not tag.has_key('type'):
	            continue
	        video_type = tag['type'].lower()
	        video_types[video_type] = video_types.get(video_type, 0) + 1
	        url_format = video_formats.get(video_type)
	        if url_format is not None:
	            # Whitespace around the id would end up inside the URL and
	            # break it, so strip it before formatting.
	            video_id = tag.decodeContents().strip()
	            tag.replaceWith(url_format % video_id)
	    update_cursor.execute(
	        "UPDATE GDN_Comment SET Body = %s WHERE CommentID = %s",
	        (soup, comment_id))
	print("VIDEO TYPE COUNT: %s" % video_types)

	# Remove width attribute from all img tags
	#
	# Every comment matched by the query is re-serialised and written back,
	# whether or not any of its <img> tags carried a width attribute.
	select_query = """
	SELECT CommentID, Body
	FROM GDN_Comment
	WHERE Body LIKE '%img%'
	"""
	select_cursor.execute(select_query)
	for comment_id, body in select_cursor.fetchall():
	    try:
	        soup = BeautifulSoup(''.join(body))
	    except Exception:
	        # Unparseable markup: record the comment and leave it untouched.
	        bad_comments.add(comment_id)
	        continue
	    for image in soup.findAll('img'):
	        del image['width']
	    update_cursor.execute(
	        "UPDATE GDN_Comment SET Body = %s WHERE CommentID = %s",
	        (soup, comment_id))

	# Rewrite all quote citations to the new format
	#
	# For each <blockquote> in which a <cite> is found, the <cite> element is
	# removed and its text is stored in the blockquote's rel attribute, with a
	# leading "Posted By: " prefix dropped when present.
	select_query = """
	SELECT CommentID, Body
	FROM GDN_Comment
	WHERE Body LIKE '%blockquote%'
	"""
	select_cursor.execute(select_query)

	# Compiled once: the pattern is the same for every row.
	posted_by = re.compile(r'^Posted By: (.*)')

	for comment_id, body in select_cursor.fetchall():
	    try:
	        soup = BeautifulSoup(''.join(body))
	    except Exception:
	        # Unparseable markup: record the comment and leave it untouched.
	        bad_comments.add(comment_id)
	        continue
	    for quote in soup.findAll('blockquote'):
	        cite = quote.cite
	        if not cite:
	            continue
	        citation = cite.decodeContents()
	        match = posted_by.match(citation)
	        if match:
	            # Keep only the text that follows "Posted By: ".
	            citation = match.group(1)
	        cite.extract()
	        quote['rel'] = citation
	    update_cursor.execute(
	        "UPDATE GDN_Comment SET Body = %s WHERE CommentID = %s",
	        (soup, comment_id))

	# MySQLdb opens connections with autocommit disabled, so without an
	# explicit commit the UPDATEs issued above are discarded on transactional
	# storage engines when the connection closes.  On non-transactional
	# engines commit() does nothing.
	connection.commit()

	update_cursor.close()
	select_cursor.close()
	connection.close()

	# Report the comments that could not be parsed and were left unchanged.
	print("BAD COMMENTS: %s" % bad_comments)