Created
May 12, 2015 01:51
-
-
Save kytu800/25b5e8dc9de14911b65b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
import sys | |
import pymongo | |
from pymongo import MongoClient | |
from logbook import Logger | |
from bson.objectid import ObjectId | |
import datetime | |
import re | |
from datetime import datetime | |
# log = open('post_regex.log', 'a') | |
def iso_format(dt):
    """Format *dt* as an ISO-8601 UTC timestamp with millisecond precision.

    The result has the shape ``YYYY-MM-DDTHH:MM:SS.mmmZ``, for example
    ``2015-05-12T01:51:00.005Z``.

    Args:
        dt: A ``datetime`` instance. A timezone-aware value is converted to
            UTC before formatting; a naive value carries no offset, so it is
            formatted unchanged.

    Returns:
        The formatted timestamp string, always ending in ``Z``.
    """
    offset = dt.utcoffset()
    if offset is None:
        # Naive datetime: there is no offset to apply.
        utc = dt
    else:
        # UTC is local time *minus* the UTC offset. Drop tzinfo afterwards so
        # the value is a plain UTC wall-clock time.
        utc = (dt - offset).replace(tzinfo=None)
    # Truncate rather than round so the field can never reach "1000", and
    # zero-pad to three digits so 5 ms is rendered ".005", not ".5".
    millis = utc.microsecond // 1000
    return '{0}.{1:03d}Z'.format(utc.strftime('%Y-%m-%dT%H:%M:%S'), millis)
# Patterns used by main(); compiled once instead of on every post and line.
# Any casing of "dcard", optionally with whitespace after the first "d".
_DCARD_RE = re.compile(r'([dD]\s*[cC][aA][rR][dD])')
# A leading bracketed tag in a title. Note that "|" inside a character class
# is a literal, so a leading/closing "|" is accepted as a bracket as well.
_TITLE_TAG_RE = re.compile(r'^[\[|\{|\(『|\{](.*?)[\]|\}|\)|』|\}]')
_IMGUR_RE = re.compile(r'https?:\/\/i\.imgur\.com\/\w*.\w*')
_IMAGE_TAG_RE = re.compile(r'【圖】')
# Site links as written by users (skipped in the per-line change log) ...
_SITE_URL_RE = re.compile(r'https?:\/\/www\.dcard\.tw')
# ... and site links after the "Dcard" rewrite has capitalised them.
_SITE_URL_CAPITALISED_RE = re.compile(r'https?:\/\/www\.Dcard\.tw')


def main():
    """Normalize the newest version of every post in the ``dcard`` database.

    Connects to MongoDB on ``localhost:27017`` and, for each document in
    ``db.posts``, reads the last entry of ``post['version']`` and then:

    * rewrites every match of the "dcard" pattern in the title and the
      content to ``Dcard``;
    * rewrites a leading bracketed tag in the title to ``【tag】``;
    * prepends ``【圖】`` to the title when the content contains an
      ``i.imgur.com`` link and the title does not contain ``【圖】`` yet;
    * rewrites site links that the first step turned into ``www.Dcard.tw``
      to ``http://www.dcard.tw``.

    The resulting title and content are pushed as a new entry onto the
    post's ``version`` array and every step is written to the module-level
    ``log`` logger.

    NOTE(review): the indentation of this file was lost in the copy under
    review; the update is assumed to run once per post (not only when the
    content matched) -- confirm against the original.
    """
    client = MongoClient('localhost', 27017)
    db = client['dcard']

    for post in db.posts.find():
        log.info("")
        post_id = post['id']
        latest = post['version'][-1]
        post_title = latest['title']
        content = latest['content']

        # Logbook formats messages with str.format(); without a "{}"
        # placeholder the extra positional arguments are silently dropped.
        log.info("Checking post id: {}", post_id)
        log.info("Title: {}", post_title)

        # --- title: brand spelling ---
        if _DCARD_RE.search(post_title):
            log.info("----------------TITLE-CHANGES-----------------")
            log.info(post_title)
            post_title = _DCARD_RE.sub(r"Dcard", post_title)
            log.info(post_title)

        # --- title: leading tag brackets ---
        if _TITLE_TAG_RE.search(post_title):
            log.info("----------------TITLE-CHANGES-----------------")
            log.info(post_title)
            post_title = _TITLE_TAG_RE.sub(r"【\1】", post_title)
            log.info(post_title)

        # --- title: image marker ---
        has_image = _IMGUR_RE.search(content)
        has_image_text = _IMAGE_TAG_RE.search(post_title)
        if has_image and not has_image_text:
            log.info("---------------TITLE-CHANGES-----------------")
            log.info(post_title)
            post_title = "【圖】" + post_title
            log.info(post_title)

        # --- content: brand spelling ---
        if _DCARD_RE.search(content):
            # The per-line pass only produces the change log; the actual
            # rewrite is applied to the whole content below.
            for line_number, line in enumerate(content.split('\n'), 1):
                if _DCARD_RE.search(line) and not _SITE_URL_RE.search(line):
                    log.info("---------------CONTENT-CHANGES-----------------")
                    log.info('at line {}', line_number)
                    log.info(line)
                    preview = _DCARD_RE.sub(r"Dcard", line)
                    preview = _SITE_URL_CAPITALISED_RE.sub(
                        r"http://www.dcard.tw", preview)
                    log.info(preview)
            content = _DCARD_RE.sub(r"Dcard", content)
            # The rewrite above also capitalises the host name inside site
            # links; put those back to the canonical lower-case URL.
            content = _SITE_URL_CAPITALISED_RE.sub(
                r"http://www.dcard.tw", content)

        result = db.posts.update({"id": post_id}, {
            "$push": {
                "version": {
                    'title': post_title,
                    'content': content,
                    # iso_format() labels the value with a "Z" (UTC) suffix,
                    # so hand it UTC time rather than naive local time.
                    'createdAt': iso_format(datetime.utcnow()),
                },
            },
        })
        log.info(
            "-----------------------------------------------------------------")
        log.info(result)
        log.info(
            "-----------------------------------------------------------------")
from logbook import FileHandler

# Created at module level so that main() can find ``log`` even when this
# module is imported instead of being run as a script.
log = Logger('Logbook')
log_handler = FileHandler('post_regex.log')

if __name__ == '__main__':
    # Route every record emitted inside the with-block to post_regex.log.
    with log_handler.applicationbound():
        log.info(
            "======================================================================================================")
        main()
        log.info(
            "======================================================================================================")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment