Skip to content

Instantly share code, notes, and snippets.

@haruo31
Created April 1, 2016 19:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save haruo31/0a62526dd49f4c6299dfcdaf599b7e2b to your computer and use it in GitHub Desktop.
Save haruo31/0a62526dd49f4c6299dfcdaf599b7e2b to your computer and use it in GitHub Desktop.
pukiwikiに投稿された英字のみからなるエントリを除去してページとバックアップを再構成するスクリプト wiki/ backup/ を読み込み、 wiki_out/ backup_out/ にフィルタ済み結果を吐き出す。 Text.langs を is_spam 的なメソッドに置換すれば、spam判断をとりこむこともできるかも。
# -*- coding: utf-8 -*-
# -*- eval: (setq flycheck-python-pylint-executable "/home/haruo31/app/caffe/bin/pylint") -*-
from codecs import decode, encode
from datetime import datetime, timedelta
from glob import glob
import gzip
from itertools import chain, repeat, groupby
import json
import logging
import os
import langdetect
logging.basicConfig(level=logging.DEBUG)
def decode_title(filepath):
    """Recover the raw page-title bytes from a hex-encoded file name.

    The base name of ``filepath`` up to its first ``.`` is decoded with the
    ``hex`` codec and the resulting bytes are returned.

    Raises whatever the hex codec raises for a malformed name; the offending
    path is logged first so the bad file can be identified.
    """
    fn = os.path.basename(filepath).split('.')[0]
    try:
        return decode(fn, 'hex')
    except Exception:
        # Log-and-re-raise only; ``Exception`` (not a bare ``except``) so that
        # KeyboardInterrupt / SystemExit are not intercepted here.
        logging.error(filepath)
        raise
def decode_timestamp(i):
    """Turn a POSIX timestamp into a ``datetime`` via ``datetime.fromtimestamp``.

    ``i`` may be an ``int`` or anything ``int()`` accepts (digit strings or
    bytes, floats); non-int values are converted with ``int()`` first, so a
    float loses its fractional part.
    """
    if isinstance(i, int):
        seconds = i
    else:
        seconds = int(i)
    return datetime.fromtimestamp(seconds)
# Local-time datetime of POSIX second 0 (kept as a module-level constant).
POSIX_ZERO = datetime.fromtimestamp(0)


def encode_timestamp(d):
    """Return the POSIX timestamp of datetime ``d`` as a whole number of seconds.

    Uses ``datetime.timestamp()`` rather than subtracting ``POSIX_ZERO``:
    the difference of two naive local datetimes is off by the DST shift
    whenever the local UTC offset at ``d`` differs from the offset at the
    epoch, so the subtraction does not round-trip with
    ``datetime.fromtimestamp`` in such time zones.

    Fractional seconds are floored, matching the previous ``//`` behaviour.
    """
    return int(d.timestamp() // 1)
def get_mtime(filepath):
    """Return the modification time of ``filepath`` as a ``datetime``.

    The value from ``os.path.getmtime`` is passed through
    ``decode_timestamp``.
    """
    modified = os.path.getmtime(filepath)
    return decode_timestamp(modified)
class Text(object):
    """A byte string decoded with the first codec that accepts it.

    ``euc_jp`` is tried first, then ``iso8859``.  The decoded string is kept
    in ``text`` and the name of the codec that worked in ``codec`` so the
    value can be re-encoded to the same bytes with ``encode()``.

    Equality and hashing are based on ``text`` only.
    """

    def __init__(self, data):
        """Decode ``data``; raise ``Exception`` if no codec accepts it."""
        self._langs = None  # cache for langs()
        self.codec = None
        for c in ('euc_jp', 'iso8859'):
            try:
                self.text = decode(data, c)
            except Exception:
                # Any decoding failure means "try the next codec";
                # ``Exception`` (not a bare ``except``) lets
                # KeyboardInterrupt / SystemExit propagate.
                continue
            self.codec = c
            break
        if not self.codec:
            raise Exception('no codec matched.')

    def __str__(self):
        return self.text

    def __repr__(self):
        return u'<Text[codec:"%s", text:"%s"]>' % (self.codec, self.text)

    def encode(self):
        """Return ``text`` encoded with the codec it was decoded with."""
        return encode(self.text, self.codec)

    def langs(self):
        """Return ``{language code: probability}`` as reported by langdetect.

        The result is computed once and cached.  When langdetect raises
        ``LangDetectException`` the placeholder ``{'--': 0.0}`` is cached
        instead.
        """
        if self._langs is None:
            try:
                detected = langdetect.detect_langs(self.text)
            except langdetect.lang_detect_exception.LangDetectException:
                self._langs = {'--': 0.0}
            else:
                self._langs = {e.lang: e.prob for e in detected}
        return self._langs

    def __eq__(self, other):
        if not isinstance(other, Text):
            # Let Python fall back to its default comparison instead of
            # failing with AttributeError on a foreign operand.
            return NotImplemented
        return self.text == other.text

    def __hash__(self):
        # Defined alongside __eq__ so instances stay usable as dict/set keys.
        return hash(self.text)
class Entry(object):
    """One revision of a wiki page.

    Holds the page ``title``, the revision ``contents``, its ``timestamp``
    (a ``datetime``) and the ``filename`` it was read from.  Two entries are
    equal when their title and timestamp are equal.
    """

    def __init__(self, title, contents, timestamp, filename):
        self.title = title
        self.contents = contents
        self.timestamp = timestamp
        self.filename = filename

    def __str__(self):
        """Return the entry as a JSON object string."""
        # title/contents are converted with str() first: the loaders below
        # store Text objects there, which json.dumps cannot serialise itself.
        return json.dumps(
            dict(title=str(self.title),
                 contents=str(self.contents),
                 timestamp=self.timestamp.isoformat(),
                 filename=self.filename))

    def __eq__(self, other):
        if not isinstance(other, Entry):
            return NotImplemented
        return self.title == other.title and other.timestamp == self.timestamp

    # Defining __eq__ already makes instances unhashable; state it explicitly.
    __hash__ = None

    @staticmethod
    def from_backup(fp):
        """Yield one ``Entry`` per revision stored in the gzip backup ``fp``.

        A line starting with ten ``>`` characters opens a new revision; the
        second space-separated field on that line is its timestamp.  Lines
        that precede the first such separator are discarded.  All yielded
        entries share one title object decoded from the file name.
        """
        t = Text(decode_title(fp))
        with gzip.open(fp, 'rb') as c:
            chunks = []
            ts = None
            for line in c:
                if line.startswith(b'>' * 10):
                    if ts is not None:
                        yield Entry(t, Text(b''.join(chunks)), ts, fp)
                    ts = decode_timestamp(line.strip().split(b' ')[1])
                    chunks = []
                    continue
                chunks.append(line)
            if ts is not None:
                # Flush the revision that follows the last separator.
                yield Entry(t, Text(b''.join(chunks)), ts, fp)

    @staticmethod
    def from_current(fp):
        """Yield a single ``Entry`` for the page file ``fp``.

        The file's modification time is used as the entry timestamp.
        """
        with open(fp, 'rb') as c:
            yield Entry(Text(decode_title(fp)), Text(c.read()), get_mtime(fp), fp)
# Each target pairs an absolute glob pattern with the Entry loader for the
# files that pattern matches.
targets = [
    (os.path.abspath(os.path.join('backup', '*.gz')), Entry.from_backup),
    (os.path.abspath(os.path.join('wiki', '*.txt')), Entry.from_current),
]

# Lazy stream of (file path, loader) pairs; globbing happens on consumption.
docs = ((path, loader) for pattern, loader in targets for path in glob(pattern))

# This is where everything gets expanded: sorted() is eager, so all matched
# files are read and parsed at this statement.
_ordered = sorted((entry for path, loader in docs for entry in loader(path)),
                  key=lambda entry: (entry.title.text, entry.timestamp))
# Walk the ascending (title text, timestamp) order backwards and group
# consecutive entries that share a title.
entries = groupby(reversed(_ordered), lambda entry: entry.title)

WIKIOUT = 'wiki_out'
WIKIBAK = 'backup_out'
for out_dir in (WIKIOUT, WIKIBAK):
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

# Time at which this pukiwiki was still under development (= anything newer
# than this was created after it went public).
EPOCHTIME = decode_timestamp(1187590436)
def proc(tup):
    """Filter one page's revisions, write the survivors, and return a report line.

    ``tup`` is ``(title, hist)``: the page title (an object providing
    ``.text`` and ``.encode()``) and a non-empty list of that page's
    entries.  An entry is kept when its timestamp is earlier than
    ``EPOCHTIME`` or when ``'ja'`` is among its detected languages.

    The first kept entry is written as ``<HEX TITLE>.txt`` under ``WIKIOUT``;
    any further kept entries are written, in list order, to
    ``<HEX TITLE>.gz`` under ``WIKIBAK``, each preceded by a
    ``>>>>>>>>>> <timestamp>`` line.  Both files get their access and
    modification times set from entry timestamps.

    The returned string shows the (truncated) title, the languages and
    timestamp of the entry chosen as the page (or a placeholder when none
    was kept), the timestamps of the first and last input entries, and one
    flag per input entry: ``O`` = written as the page, ``o`` = written to
    the backup, ``-`` = dropped.
    """
    title, hist = tup
    marks = ['-'] * len(hist)
    kept = [(pos, rev) for pos, rev in enumerate(hist)
            if rev.timestamp < EPOCHTIME or 'ja' in rev.contents.langs()]

    if not kept:
        # Nothing survived the filter: report on the last listed entry only.
        res = (hist[-1].contents.langs(),
               '____:__:__ __:__:__')
    else:
        # Output file names are the upper-case hex of the encoded title.
        stem = decode(encode(title.encode(), 'hex'), 'latin-1').upper()
        (head_pos, head), older = kept[0], kept[1:]

        marks[head_pos] = 'O'
        res = (head.contents.langs(), head.timestamp)
        page_path = os.path.join(WIKIOUT, stem + '.txt')
        with open(page_path, 'wb') as page:
            page.write(head.contents.encode())
        head_time = head.timestamp.timestamp()
        os.utime(page_path, (head_time, head_time))

        if older:
            # NOTE(review): entries are written in the order received; if the
            # caller passes them newest-first, the backup is newest-first too.
            # PukiWiki appears to store backups oldest-first — confirm.
            backup_path = os.path.join(WIKIBAK, stem + '.gz')
            with gzip.open(backup_path, 'wb') as backup:
                for pos, rev in older:
                    marks[pos] = 'o'
                    header = '>>>>>>>>>> {}\n'.format(encode_timestamp(rev.timestamp))
                    backup.write(encode(header, 'latin-1'))
                    backup.write(rev.contents.encode())
            # The backup file is stamped with the time of its first entry.
            backup_time = older[0][1].timestamp.timestamp()
            os.utime(backup_path, (backup_time, backup_time))

    return ('{0:12} -> {3[0]} {1} .. {3[1]} .. {2} [{4}]'
            .format(title.text[:12],
                    hist[0].timestamp,
                    hist[-1].timestamp,
                    res,
                    ''.join(marks)))
from multiprocessing import Pool, cpu_count

# Worker processes may re-import this module (spawn start method); without
# this guard each of them would try to start a pool of its own.
if __name__ == '__main__':
    # The context manager shuts the pool down once every result is consumed.
    with Pool() as pool:
        # Materialise each group before handing it over: groupby's
        # sub-iterators are invalidated when the outer iterator advances.
        for log in pool.imap(proc, ((t, list(ents)) for t, ents in entries), cpu_count() * 4):
            print(log)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment