Created
April 1, 2016 19:05
-
-
Save haruo31/0a62526dd49f4c6299dfcdaf599b7e2b to your computer and use it in GitHub Desktop.
pukiwikiに投稿された英字のみからなるエントリを除去してページとバックアップを再構成するスクリプト wiki/ backup/ を読み込み、 wiki_out/ backup_out/ にフィルタ済み結果を吐き出す。 Text.langs を is_spam 的なメソッドに置換すれば、spam判断をとりこむこともできるかも。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# -*- eval: (setq flycheck-python-pylint-executable "/home/haruo31/app/caffe/bin/pylint") -*-

# Standard library
from codecs import decode, encode
from datetime import datetime, timedelta
from glob import glob
import gzip
from itertools import chain, repeat, groupby
import json
import logging
import os

# Third party
import langdetect

logging.basicConfig(level=logging.DEBUG)
def decode_title(filepath):
    """Recover a page title from a pukiwiki data file path.

    pukiwiki names each data file with the hex-encoded page title;
    strip the directory and extension, then hex-decode the stem.

    Returns the raw title bytes (caller handles charset decoding).
    Raises (after logging the offending path) when the stem is not
    valid hex.
    """
    fn = os.path.basename(filepath).split('.')[0]
    try:
        return decode(fn, 'hex')
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are not intercepted; still record which file was unparseable
        # before re-raising.
        logging.error(filepath)
        raise
def decode_timestamp(i):
    """Convert a POSIX timestamp — an int, or anything int() accepts
    (e.g. bytes read from a backup header) — to a local datetime."""
    seconds = i if isinstance(i, int) else int(i)
    return datetime.fromtimestamp(seconds)
# Local-time epoch reference; pairs with datetime.fromtimestamp above.
POSIX_ZERO = datetime.fromtimestamp(0)

def encode_timestamp(d):
    """Inverse of decode_timestamp: datetime -> integer POSIX seconds."""
    elapsed = d - POSIX_ZERO
    return elapsed // timedelta(seconds=1)
def get_mtime(filepath):
    """Return the file's modification time as a local datetime."""
    mtime = os.path.getmtime(filepath)
    return decode_timestamp(mtime)
class Text(object):
    """A byte string decoded by the first codec that accepts it.

    Tries euc_jp (pukiwiki's storage encoding) first, then iso8859
    (latin-1), which accepts any byte sequence and so acts as the
    catch-all. Language detection via langdetect is lazy and cached.
    """
    def __init__(self, data):
        self._langs = None  # lazy cache for langs()
        self.codec = None
        for c in ['euc_jp', 'iso8859']:
            try:
                self.text = decode(data, c)
                self.codec = c
                break
            except UnicodeDecodeError:
                # Narrowed from a bare `except:` — only a decode failure
                # should advance to the next candidate codec.
                continue
        if not self.codec:
            raise Exception('no codec matched.')
    def __str__(self):
        return self.text
    def __repr__(self):
        return u'<Text[codec:"%s", text:"%s"]>' % (self.codec, self.text)
    def encode(self):
        """Re-encode the text with the codec it was decoded with."""
        return encode(self.text, self.codec)
    def langs(self):
        """Return {language code: probability} from langdetect, cached
        after the first call; {'--': 0.0} when detection fails
        (e.g. empty or undecidable text)."""
        if self._langs is None:
            try:
                self._langs = dict((e.lang, e.prob)
                                   for e in langdetect.detect_langs(self.text))
            except langdetect.lang_detect_exception.LangDetectException:
                self._langs = {'--': 0.0}
        return self._langs
    def __eq__(self, other):
        return self.text == other.text
class Entry(object):
    """One revision of a wiki page.

    title/contents are Text instances, timestamp is a datetime, and
    filename is the data file the revision was read from.
    """
    def __init__(self, title, contents, timestamp, filename):
        self.title = title          # Text: decoded page title
        self.contents = contents    # Text: page body for this revision
        self.timestamp = timestamp  # datetime of the revision
        self.filename = filename    # source file path
    def __str__(self):
        # Bug fix: Text objects are not JSON-serializable, so the
        # original json.dumps(dict(title=self.title, ...)) raised
        # TypeError; convert to plain strings first.
        return json.dumps(
            dict(title=str(self.title),
                 contents=str(self.contents),
                 timestamp=self.timestamp.isoformat(),
                 filename=self.filename))
    def __eq__(self, other):
        # A revision's identity is (title, timestamp); contents ignored.
        return self.title == other.title and other.timestamp == self.timestamp
    @staticmethod
    def from_backup(fp):
        """Yield one Entry per revision in a pukiwiki backup .gz file.

        Revisions are delimited by lines beginning with '>' * 10
        followed by the revision's POSIX timestamp.
        """
        t = Text(decode_title(fp))
        with gzip.open(fp, 'rb') as c:
            contents = b''
            ts = None
            for l in c:
                if l.startswith(b'>' * 10):
                    if ts:
                        # Flush the previous revision before starting a new one.
                        yield Entry(t, Text(contents), ts, fp)
                    ts = decode_timestamp(l.strip().split(b' ')[1])
                    contents = b''
                    continue
                contents += l
            if ts:
                # Flush the final revision.
                yield Entry(t, Text(contents), ts, fp)
    @staticmethod
    def from_current(fp):
        """Yield a single Entry for a live wiki page file, using the
        file's mtime as the revision timestamp."""
        with open(fp, 'rb') as c:
            yield Entry(Text(decode_title(fp)), Text(c.read()), get_mtime(fp), fp)
# Input sources: (absolute glob pattern, parser) pairs — gzipped backup
# histories and live wiki pages.
targets = [(os.path.abspath(pattern), parser) for pattern, parser in
           [(os.path.join('backup', '*.gz'), Entry.from_backup),
            (os.path.join('wiki', '*.txt'), Entry.from_current)]]
# Pair every matching file with the parser for its kind.
docs = chain.from_iterable(zip(glob(pattern), repeat(parser))
                           for pattern, parser in targets)
# Expansion happens here: parse every file into revisions, sort them by
# (title, timestamp), reverse so the newest revision of each page comes
# first, then group consecutive revisions by title.
entries = groupby(reversed(sorted(chain.from_iterable(parser(path) for path, parser in docs),
                                  key=lambda e: (e.title.text, e.timestamp))),
                  lambda e: e.title)
WIKIOUT = 'wiki_out'    # filtered current pages are written here
WIKIBAK = 'backup_out'  # filtered backup histories are written here
for _outdir in (WIKIOUT, WIKIBAK):
    # os.makedirs(exist_ok=True) replaces the side-effecting list
    # comprehension over os.mkdir and is race-free if the directory
    # appears between the existence check and creation.
    os.makedirs(_outdir, exist_ok=True)
# pukiwiki deployment time: revisions older than this predate public
# exposure and therefore cannot be spam.
EPOCHTIME = decode_timestamp(1187590436)
def proc(tup):
    """Filter one page's revision history and write the survivors.

    tup is (title, hist) where hist lists Entry objects newest-first.
    A revision survives if it predates EPOCHTIME or langdetect finds
    Japanese in it. The newest survivor becomes the current page under
    WIKIOUT; the rest are re-packed as a gzip backup under WIKIBAK.
    Returns a one-line log string; flags show per-revision fate
    ('O' current, 'o' backup, '-' dropped).
    """
    title, hist = tup
    flags = ['-'] * len(hist)
    keep = [(i, e) for i, e in enumerate(hist)
            if e.timestamp < EPOCHTIME or 'ja' in e.contents.langs()]
    if keep:
        # pukiwiki file name: upper-case hex of the encoded title.
        hexname = decode(encode(title.encode(), 'hex'), 'latin-1').upper()
        n, newest = keep[0]
        flags[n] = 'O'
        fp = os.path.join(WIKIOUT, hexname + '.txt')
        res = (newest.contents.langs(),
               newest.timestamp)
        with open(fp, 'wb') as f:
            f.write(newest.contents.encode())
        mt = newest.timestamp.timestamp()
        os.utime(fp, (mt, mt))
        older = keep[1:]
        if older:
            fp = os.path.join(WIKIBAK, hexname + '.gz')
            with gzip.open(fp, 'wb') as f:
                for n, entry in older:
                    flags[n] = 'o'
                    f.write(encode('>>>>>>>>>> {}\n'.format(encode_timestamp(entry.timestamp)), 'latin-1'))
                    f.write(entry.contents.encode())
            # Stamp the backup with its newest contained revision's time.
            _, entry = older[0]
            mt = entry.timestamp.timestamp()
            os.utime(fp, (mt, mt))
    else:
        # Nothing survived: log the oldest revision's languages and a
        # placeholder timestamp.
        res = (hist[-1].contents.langs(),
               '____:__:__ __:__:__')
    return ('{0:12} -> {3[0]} {1} .. {3[1]} .. {2} [{4}]'
            .format(title.text[:12],
                    hist[0].timestamp,
                    hist[-1].timestamp,
                    res,
                    ''.join(flags)))
from multiprocessing import Pool, cpu_count

if __name__ == '__main__':
    # Guard required for multiprocessing: under the spawn start method
    # worker processes re-import this module, and without the guard each
    # worker would recursively create its own Pool. Histories are
    # materialized with list() because groupby's sub-iterators cannot be
    # pickled across process boundaries.
    pool = Pool()
    try:
        for log in pool.imap(proc,
                             ((t, list(ents)) for t, ents in entries),
                             cpu_count() * 4):
            print(log)
    finally:
        # Ensure workers are reaped even if a task raises.
        pool.close()
        pool.join()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment