Skip to content

Instantly share code, notes, and snippets.

@Apreche
Created October 27, 2011 03:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Apreche/1318716 to your computer and use it in GitHub Desktop.
Save Apreche/1318716 to your computer and use it in GitHub Desktop.
Reformats comments from Vanilla 1 to Vanilla 2 style
#!/usr/bin/env python
# Reformat comments for the transition from Vanilla 1 to Vanilla 2
import re
import MySQLdb
from BeautifulSoup import BeautifulSoup
# IDs of comments whose bodies could not be parsed; printed when the script ends.
bad_comments = set()

# A single connection is used for the whole run, with one cursor for the
# SELECT statements and a second one for the UPDATE statements.
connection = MySQLdb.connect(
    user='root',
    db='newforum',
    use_unicode=True,
)
select_cursor = connection.cursor()
update_cursor = connection.cursor()
# Reformat embedded videos
# Every <video type="..."> tag is replaced by a plain URL built from the
# template for its type and the tag's contents (the video id).
# NOTE(review): the nesting below was reconstructed from a paste that had lost
# its indentation -- confirm against the original script.
video_formats = {
    'youtube': "http://www.youtube.com/watch?v=%s ",
    'vimeo': "http://vimeo.com/%s ",
    'google': "http://video.google.com/videoplay?docid=%s ",
}
select_query = """
SELECT CommentID, Body
FROM GDN_Comment
WHERE Body LIKE '%video%'
"""
select_cursor.execute(select_query)

# How many tags of each type were seen, including types with no URL template.
video_types = {}
for comment_id, body in select_cursor.fetchall():
    try:
        soup = BeautifulSoup(body)
    except Exception:
        # Unparseable body: remember the id and leave the row untouched.
        bad_comments.add(comment_id)
        continue

    video_tags = soup.findAll('video')
    if not video_tags:
        # The LIKE filter also matches the bare word "video"; nothing to do.
        continue

    for tag in video_tags:
        # Tag.has_key() looks at the tag's attributes. It is kept on purpose:
        # `in` on a BeautifulSoup 3 Tag tests its children instead.
        if not tag.has_key('type'):
            continue
        video_type = tag['type'].lower()
        video_id = tag.decodeContents()
        video_types[video_type] = video_types.get(video_type, 0) + 1
        if video_type in video_formats:
            tag.replaceWith(video_formats[video_type] % video_id)

    # The soup object is handed to the driver as-is, exactly as before;
    # presumably it is stringified during parameter escaping.
    update_cursor.execute(
        "UPDATE GDN_Comment SET Body = %s WHERE CommentID = %s",
        (soup, comment_id),
    )

# Parenthesised single argument: prints the same text under Python 2's print
# statement and also parses as a function call.
print("VIDEO TYPE COUNT: %s" % video_types)
# Remove width attribute from all img tags
# NOTE(review): the nesting below was reconstructed from a paste that had lost
# its indentation -- confirm against the original script.
select_query = """
SELECT CommentID, Body
FROM GDN_Comment
WHERE Body LIKE '%img%'
"""
select_cursor.execute(select_query)

for comment_id, body in select_cursor.fetchall():
    try:
        soup = BeautifulSoup(body)
    except Exception:
        # Unparseable body: remember the id and leave the row untouched.
        bad_comments.add(comment_id)
        continue

    img_tags = soup.findAll('img')
    if not img_tags:
        # The LIKE filter matches any body containing the letters "img"
        # (e.g. inside a URL); do not rewrite rows that hold no <img> tag.
        continue

    for tag in img_tags:
        # Presumably a no-op when the tag carries no width attribute
        # (same call as the original script made).
        del tag['width']

    update_cursor.execute(
        "UPDATE GDN_Comment SET Body = %s WHERE CommentID = %s",
        (soup, comment_id),
    )
# Rewrite all quote citations to the new format
# The first <cite> found inside each <blockquote> is removed and its text is
# stored in the blockquote's "rel" attribute instead; a leading "Posted By: "
# prefix is stripped from that text when present.
# NOTE(review): the nesting below was reconstructed from a paste that had lost
# its indentation -- in particular, confirm that the cite is meant to be moved
# even when the "Posted By: " prefix is missing.
select_query = """
SELECT CommentID, Body
FROM GDN_Comment
WHERE Body LIKE '%blockquote%'
"""
select_cursor.execute(select_query)

# Compiled once for the whole pass instead of once per comment.
citation_pattern = re.compile(r'^Posted By: (.*)')

for comment_id, body in select_cursor.fetchall():
    try:
        soup = BeautifulSoup(body)
    except Exception:
        # Unparseable body: remember the id and leave the row untouched.
        bad_comments.add(comment_id)
        continue

    changed = False
    for tag in soup.findAll('blockquote'):
        cite = tag.cite
        if not cite:
            continue
        citation = cite.decodeContents()
        match = citation_pattern.match(citation)
        if match:
            citation = match.group(1)
        cite.extract()
        tag['rel'] = citation
        changed = True

    # Only write back rows in which a citation was actually rewritten.
    if changed:
        update_cursor.execute(
            "UPDATE GDN_Comment SET Body = %s WHERE CommentID = %s",
            (soup, comment_id),
        )
# Make the UPDATEs permanent before disconnecting. MySQLdb leaves autocommit
# off, so on transactional tables closing without a commit discards them; on
# non-transactional tables commit() is harmless.
connection.commit()
update_cursor.close()
select_cursor.close()
connection.close()
# Report the ids of comments that could not be parsed and were left unchanged.
print("BAD COMMENTS: %s" % bad_comments)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment