Skip to content

Instantly share code, notes, and snippets.

@kytu800
Created May 12, 2015 01:51
Show Gist options
  • Save kytu800/25b5e8dc9de14911b65b to your computer and use it in GitHub Desktop.
Save kytu800/25b5e8dc9de14911b65b to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import pymongo
from pymongo import MongoClient
from logbook import Logger
from bson.objectid import ObjectId
import datetime
import re
from datetime import datetime
# log = open('post_regex.log', 'a')
def iso_format(dt):
    """Return *dt* as an ISO-8601 string with millisecond precision and a ``Z`` suffix.

    The result has the shape ``YYYY-MM-DDTHH:MM:SS.mmmZ``.

    * A timezone-aware *dt* is converted to UTC first (local time minus its
      UTC offset).
    * A naive *dt* (``utcoffset()`` returns ``None``) is formatted as-is, so
      the caller is responsible for passing a value that is already in UTC.

    The millisecond field is always three digits wide and is truncated rather
    than rounded, so it can never overflow to ``1000``.
    """
    offset = dt.utcoffset()
    if offset is not None:
        # UTC = local - offset. Drop tzinfo so the value is plain UTC wall time.
        dt = dt.replace(tzinfo=None) - offset
    millis = dt.microsecond // 1000
    return '{0}.{1:03d}Z'.format(dt.strftime('%Y-%m-%dT%H:%M:%S'), millis)
def main():
    """Normalise every post in the ``dcard`` database and push the result as a new version.

    For each document in ``db.posts`` the latest entry of ``post['version']``
    is read and the following rules are applied:

    * Title: any casing/spacing variant of "dcard" becomes ``Dcard``.
    * Title: a leading bracketed prefix is rewritten to ``【...】``.
    * Title: ``【圖】`` is prepended when the content has an i.imgur.com link
      and the title does not already contain that tag.
    * Content: "dcard" variants become ``Dcard``, after which site URLs that
      were capitalised by that substitution are rewritten to
      ``http://www.dcard.tw``.

    A new ``{title, content, createdAt}`` entry is then pushed onto the post's
    ``version`` array. Progress is written through the module-level ``log``.
    """
    # Compile the loop-invariant patterns once. Pattern text is unchanged.
    dcard_re = re.compile(r'([dD]\s*[cC][aA][rR][dD])')
    bracket_re = re.compile(r'^[\[|\{|\(『|\{](.*?)[\]|\}|\)|』|\}]')
    imgur_re = re.compile(r'https?:\/\/i\.imgur\.com\/\w*.\w*')
    image_tag_re = re.compile(r'【圖】')
    site_url_re = re.compile(r'https?:\/\/www\.dcard\.tw')
    capitalised_url_re = re.compile(r'https?:\/\/www\.Dcard\.tw')

    client = MongoClient('localhost', 27017)
    db = client['dcard']

    for post in db.posts.find():
        log.info("")
        post_id = post['id']
        # Logbook formats messages with str.format, so each extra argument
        # needs a '{}' placeholder or it is silently dropped from the log.
        log.info("Checking post id: {}", post['id'])
        log.info("Title: {}", post['version'][-1]['title'])
        post_title = post['version'][-1]['title']
        content = post['version'][-1]['content']

        # --- Title rule 1: canonical spelling of the site name.
        if dcard_re.search(post_title):
            log.info("----------------TITLE-CHANGES-----------------")
            log.info(post_title)
            post_title = dcard_re.sub(r"Dcard", post_title)
            log.info(post_title)

        # --- Title rule 2: normalise a leading bracketed prefix.
        if bracket_re.search(post_title):
            log.info("----------------TITLE-CHANGES-----------------")
            log.info(post_title)
            post_title = bracket_re.sub(r"【\1】", post_title)
            log.info(post_title)

        # --- Title rule 3: tag posts that embed an imgur image.
        has_image = imgur_re.search(content)
        has_image_text = image_tag_re.search(post_title)
        if has_image and not has_image_text:
            log.info("---------------TITLE-CHANGES-----------------")
            log.info(post_title)
            post_title = "【圖】" + post_title
            log.info(post_title)

        # --- Content rule: canonical spelling, then repair site URLs.
        if dcard_re.search(content):
            # This per-line pass only produces the before/after log entries.
            # The stored content is rewritten in one go below.
            for index, line in enumerate(content.split('\n')):
                if dcard_re.search(line) and not site_url_re.search(line):
                    log.info("---------------CONTENT-CHANGES-----------------")
                    log.info('at line {}', index + 1)
                    log.info(line)
                    preview = dcard_re.sub(r"Dcard", line)
                    preview = capitalised_url_re.sub(
                        r"http://www.dcard.tw", preview)
                    log.info(preview)
            content = dcard_re.sub(r"Dcard", content)
            content = capitalised_url_re.sub(r"http://www.dcard.tw", content)

        # NOTE(review): a version is pushed for every post, even when no rule
        # changed anything - confirm this is intended.
        # NOTE(review): datetime.now() is naive local time, which iso_format
        # labels with a 'Z' suffix - confirm the host runs in UTC.
        # NOTE(review): Collection.update is a legacy PyMongo API; check the
        # installed PyMongo version before upgrading.
        result = db.posts.update({"id": post_id}, {
            "$push": {
                "version": {
                    'title': post_title,
                    'content': content,
                    'createdAt': iso_format(datetime.now())
                }
            }
        })
        log.info(
            "-----------------------------------------------------------------")
        log.info(result)
        log.info(
            "-----------------------------------------------------------------")
from logbook import FileHandler

# Bind the logger at module level: main() reads the global ``log``, so creating
# it only under the __main__ guard made main() raise NameError when this
# module was imported and called from elsewhere.
log = Logger('Logbook')
log_handler = FileHandler('post_regex.log')

if __name__ == '__main__':
    # Route every record emitted during the run to post_regex.log.
    with log_handler.applicationbound():
        log.info(
            "======================================================================================================")
        main()
        log.info(
            "======================================================================================================")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment