Created
April 1, 2016 19:05
-
-
Save haruo31/0a62526dd49f4c6299dfcdaf599b7e2b to your computer and use it in GitHub Desktop.
pukiwikiに投稿された英字のみからなるエントリを除去してページとバックアップを再構成するスクリプト wiki/ backup/ を読み込み、 wiki_out/ backup_out/ にフィルタ済み結果を吐き出す。 Text.langs を is_spam 的なメソッドに置換すれば、spam判断をとりこむこともできるかも。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# -*- eval: (setq flycheck-python-pylint-executable "/home/haruo31/app/caffe/bin/pylint") -*-

# Standard library
from codecs import decode, encode
from datetime import datetime, timedelta
from glob import glob
import gzip
from itertools import chain, repeat, groupby
import json
import logging
import os

# Third party
import langdetect

logging.basicConfig(level=logging.DEBUG)
def decode_title(filepath):
    """Recover a page title from a pukiwiki data file path.

    pukiwiki names each data file with the hex-encoded page title;
    strip the directory and extension, then hex-decode the stem.

    Returns the raw title bytes (caller handles charset decoding).
    Raises (after logging the offending path) when the stem is not
    valid hex.
    """
    fn = os.path.basename(filepath).split('.')[0]
    try:
        return decode(fn, 'hex')
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are not intercepted; still record which file was unparseable
        # before re-raising.
        logging.error(filepath)
        raise
def decode_timestamp(i):
    """Convert a POSIX timestamp — an int, or anything int() accepts
    (e.g. bytes read from a backup header) — to a local datetime."""
    seconds = i if isinstance(i, int) else int(i)
    return datetime.fromtimestamp(seconds)
# Local-time epoch reference; pairs with datetime.fromtimestamp above.
POSIX_ZERO = datetime.fromtimestamp(0)

def encode_timestamp(d):
    """Inverse of decode_timestamp: datetime -> integer POSIX seconds."""
    elapsed = d - POSIX_ZERO
    return elapsed // timedelta(seconds=1)
def get_mtime(filepath):
    """Return the file's modification time as a local datetime."""
    mtime = os.path.getmtime(filepath)
    return decode_timestamp(mtime)
class Text(object):
    """A byte string decoded by the first codec that accepts it.

    Tries euc_jp (pukiwiki's storage encoding) first, then iso8859
    (latin-1), which accepts any byte sequence and so acts as the
    catch-all. Language detection via langdetect is lazy and cached.
    """
    def __init__(self, data):
        self._langs = None  # lazy cache for langs()
        self.codec = None
        for c in ['euc_jp', 'iso8859']:
            try:
                self.text = decode(data, c)
                self.codec = c
                break
            except UnicodeDecodeError:
                # Narrowed from a bare `except:` — only a decode failure
                # should advance to the next candidate codec.
                continue
        if not self.codec:
            raise Exception('no codec matched.')
    def __str__(self):
        return self.text
    def __repr__(self):
        return u'<Text[codec:"%s", text:"%s"]>' % (self.codec, self.text)
    def encode(self):
        """Re-encode the text with the codec it was decoded with."""
        return encode(self.text, self.codec)
    def langs(self):
        """Return {language code: probability} from langdetect, cached
        after the first call; {'--': 0.0} when detection fails
        (e.g. empty or undecidable text)."""
        if self._langs is None:
            try:
                self._langs = dict((e.lang, e.prob)
                                   for e in langdetect.detect_langs(self.text))
            except langdetect.lang_detect_exception.LangDetectException:
                self._langs = {'--': 0.0}
        return self._langs
    def __eq__(self, other):
        return self.text == other.text
class Entry(object):
    """One revision of a wiki page.

    title/contents are Text instances, timestamp is a datetime, and
    filename is the data file the revision was read from.
    """
    def __init__(self, title, contents, timestamp, filename):
        self.title = title          # Text: decoded page title
        self.contents = contents    # Text: page body for this revision
        self.timestamp = timestamp  # datetime of the revision
        self.filename = filename    # source file path
    def __str__(self):
        # Bug fix: Text objects are not JSON-serializable, so the
        # original json.dumps(dict(title=self.title, ...)) raised
        # TypeError; convert to plain strings first.
        return json.dumps(
            dict(title=str(self.title),
                 contents=str(self.contents),
                 timestamp=self.timestamp.isoformat(),
                 filename=self.filename))
    def __eq__(self, other):
        # A revision's identity is (title, timestamp); contents ignored.
        return self.title == other.title and other.timestamp == self.timestamp
    @staticmethod
    def from_backup(fp):
        """Yield one Entry per revision in a pukiwiki backup .gz file.

        Revisions are delimited by lines beginning with '>' * 10
        followed by the revision's POSIX timestamp.
        """
        t = Text(decode_title(fp))
        with gzip.open(fp, 'rb') as c:
            contents = b''
            ts = None
            for l in c:
                if l.startswith(b'>' * 10):
                    if ts:
                        # Flush the previous revision before starting a new one.
                        yield Entry(t, Text(contents), ts, fp)
                    ts = decode_timestamp(l.strip().split(b' ')[1])
                    contents = b''
                    continue
                contents += l
            if ts:
                # Flush the final revision.
                yield Entry(t, Text(contents), ts, fp)
    @staticmethod
    def from_current(fp):
        """Yield a single Entry for a live wiki page file, using the
        file's mtime as the revision timestamp."""
        with open(fp, 'rb') as c:
            yield Entry(Text(decode_title(fp)), Text(c.read()), get_mtime(fp), fp)
# Input sources: (absolute glob pattern, parser) pairs — gzipped backup
# histories and live wiki pages.
targets = [(os.path.abspath(pattern), parser) for pattern, parser in
           [(os.path.join('backup', '*.gz'), Entry.from_backup),
            (os.path.join('wiki', '*.txt'), Entry.from_current)]]
# Pair every matching file with the parser for its kind.
docs = chain.from_iterable(zip(glob(pattern), repeat(parser))
                           for pattern, parser in targets)
# Expansion happens here: parse every file into revisions, sort them by
# (title, timestamp), reverse so the newest revision of each page comes
# first, then group consecutive revisions by title.
entries = groupby(reversed(sorted(chain.from_iterable(parser(path) for path, parser in docs),
                                  key=lambda e: (e.title.text, e.timestamp))),
                  lambda e: e.title)
WIKIOUT = 'wiki_out'    # filtered current pages are written here
WIKIBAK = 'backup_out'  # filtered backup histories are written here
for _outdir in (WIKIOUT, WIKIBAK):
    # os.makedirs(exist_ok=True) replaces the side-effecting list
    # comprehension over os.mkdir and is race-free if the directory
    # appears between the existence check and creation.
    os.makedirs(_outdir, exist_ok=True)
# pukiwiki deployment time: revisions older than this predate public
# exposure and therefore cannot be spam.
EPOCHTIME = decode_timestamp(1187590436)
def proc(tup):
    """Filter one page's revision history and write the survivors.

    tup is (title, hist) where hist lists Entry objects newest-first.
    A revision survives if it predates EPOCHTIME or langdetect finds
    Japanese in it. The newest survivor becomes the current page under
    WIKIOUT; the rest are re-packed as a gzip backup under WIKIBAK.
    Returns a one-line log string; flags show per-revision fate
    ('O' current, 'o' backup, '-' dropped).
    """
    title, hist = tup
    flags = ['-'] * len(hist)
    keep = [(i, e) for i, e in enumerate(hist)
            if e.timestamp < EPOCHTIME or 'ja' in e.contents.langs()]
    if keep:
        # pukiwiki file name: upper-case hex of the encoded title.
        hexname = decode(encode(title.encode(), 'hex'), 'latin-1').upper()
        n, newest = keep[0]
        flags[n] = 'O'
        fp = os.path.join(WIKIOUT, hexname + '.txt')
        res = (newest.contents.langs(),
               newest.timestamp)
        with open(fp, 'wb') as f:
            f.write(newest.contents.encode())
        mt = newest.timestamp.timestamp()
        os.utime(fp, (mt, mt))
        older = keep[1:]
        if older:
            fp = os.path.join(WIKIBAK, hexname + '.gz')
            with gzip.open(fp, 'wb') as f:
                for n, entry in older:
                    flags[n] = 'o'
                    f.write(encode('>>>>>>>>>> {}\n'.format(encode_timestamp(entry.timestamp)), 'latin-1'))
                    f.write(entry.contents.encode())
            # Stamp the backup with its newest contained revision's time.
            _, entry = older[0]
            mt = entry.timestamp.timestamp()
            os.utime(fp, (mt, mt))
    else:
        # Nothing survived: log the oldest revision's languages and a
        # placeholder timestamp.
        res = (hist[-1].contents.langs(),
               '____:__:__ __:__:__')
    return ('{0:12} -> {3[0]} {1} .. {3[1]} .. {2} [{4}]'
            .format(title.text[:12],
                    hist[0].timestamp,
                    hist[-1].timestamp,
                    res,
                    ''.join(flags)))
from multiprocessing import Pool, cpu_count

if __name__ == '__main__':
    # Guard required for multiprocessing: under the spawn start method
    # worker processes re-import this module, and without the guard each
    # worker would recursively create its own Pool. Histories are
    # materialized with list() because groupby's sub-iterators cannot be
    # pickled across process boundaries.
    pool = Pool()
    try:
        for log in pool.imap(proc,
                             ((t, list(ents)) for t, ents in entries),
                             cpu_count() * 4):
            print(log)
    finally:
        # Ensure workers are reaped even if a task raises.
        pool.close()
        pool.join()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment