# mooware/codepage_unzip.py

## codepage_unzip.py
# A little script to extract old Japanese zip files
# with the correct text encoding for filenames.
# Most applications assume a US text codepage for
# old zip formats, which is not always correct.

import os
import shutil
import sys
import zipfile
from datetime import datetime

# codepage 437 (aka 'DOS Latin US') seems to be the default
# for old zip files in most applications
_DEFAULT_CODEPAGE = 'cp437'
# codepage 932 is the codec name for the Japanese (Shift JIS based) codepage;
# it is the target codepage used when the caller does not specify one
_JAPANESE_CODEPAGE = 'cp932'


# zipfile module does not support custom filenames,
# so we have to read/write the file ourselves
def extract_one_file(zip_file, entry, path):
    """Extract one archive member to ``path`` and restore its mtime.

    The data is copied by hand instead of using ``ZipFile.extract`` so that
    the member can be written under a different (re-decoded) file name.

    Args:
        zip_file: An open ``zipfile.ZipFile`` to read from.
        entry: The ``zipfile.ZipInfo`` describing the member to extract.
        path: Destination file path; missing parent directories are created.
    """
    # create subdirs if they don't exist yet
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    # open via the ZipInfo itself: a lookup by name would return the wrong
    # member if the archive contains duplicate names
    with zip_file.open(entry) as infile, open(path, 'wb') as outfile:
        shutil.copyfileobj(infile, outfile)
    # restore the modification time stored in the archive
    try:
        mtime = datetime(*entry.date_time).timestamp()
    except (ValueError, OverflowError, OSError):
        # old archives may carry zeroed/invalid DOS timestamps (e.g. month 0);
        # the data is already written, so just leave the file times alone
        return
    # os.utime expects (atime, mtime); as before, the access time is set to
    # the ctime of the freshly written file
    os.utime(path, (os.path.getctime(path), mtime))


def _safe_member_path(name):
    """Return ``name`` as a relative path that stays below the target dir.

    Follows the sanitizing done by ``zipfile.ZipFile.extract``: a drive
    prefix and leading separators are dropped and empty, ``.`` and ``..``
    components are removed. Returns an empty string if nothing is left.
    """
    name = name.replace('/', os.path.sep)
    if os.path.altsep:
        name = name.replace(os.path.altsep, os.path.sep)
    name = os.path.splitdrive(name)[1]
    skipped = ('', os.path.curdir, os.path.pardir)
    return os.path.sep.join(
        part for part in name.split(os.path.sep) if part not in skipped)


def extract_with_codepage(zip_file, verbose=False, target_codepage=None):
    """Extract all files of ``zip_file`` with re-decoded member names.

    Names that ``zipfile`` decoded with its cp437 fallback are converted
    back to bytes and decoded again with ``target_codepage``. If that
    conversion fails, the name reported by ``zipfile`` is used unchanged.
    Files are written relative to the current working directory.

    Args:
        zip_file: An open ``zipfile.ZipFile``.
        verbose: If true, print each extracted name and any conversion error.
        target_codepage: Codec name used to decode the member names;
            defaults to ``_JAPANESE_CODEPAGE``.
    """
    if target_codepage is None:
        target_codepage = _JAPANESE_CODEPAGE
    for entry in zip_file.infolist():
        if entry.is_dir():
            continue
        filename = entry.filename
        # bit 11 of the general purpose flags marks names stored as UTF-8;
        # zipfile decoded those correctly already, so only the other names
        # need to be re-interpreted
        if not entry.flag_bits & 0x800:
            try:
                filename = filename.encode(_DEFAULT_CODEPAGE).decode(target_codepage)
            except UnicodeEncodeError as e:
                if verbose:
                    print('encode error:', e)
            except UnicodeDecodeError as e:
                if verbose:
                    print('decode error:', e)
        if verbose:
            # on python < 3.6, printing to windows console does not work with unicode
            try:
                print(filename)
            except UnicodeEncodeError:
                enc = sys.stdout.encoding or 'ascii'
                print(filename.encode(enc, 'backslashreplace').decode(enc))
        # member names are untrusted input: never let an absolute path or
        # '..' component write outside the extraction directory
        target = _safe_member_path(filename)
        if not target:
            if verbose:
                print('skipping entry without a usable file name')
            continue
        extract_one_file(zip_file, entry, target)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        # __name__ is always '__main__' here, so show the script path instead
        print('usage:', sys.argv[0], 'zipfile [target codepage]')
        sys.exit(1)
    path = sys.argv[1]
    # optional second argument: codec name for the file names in the archive
    codepage = sys.argv[2] if len(sys.argv) > 2 else None
    # the context manager closes the archive even if extraction fails
    with zipfile.ZipFile(path) as zf:
        extract_with_codepage(zf, verbose=True, target_codepage=codepage)
	# A little script to extract old Japanese zip files
	# with the correct text encoding for filenames.
	# Most applications assume a US text codepage for
	# old zip formats, which is not always correct.

	import os
	import shutil
	import sys
	import zipfile
	from datetime import datetime

	# codepage 437 (aka 'DOS Latin US') seems to be the default
	# for old zip files in most applications
	_DEFAULT_CODEPAGE = 'cp437'
	# codepage 932 is the codec name for the Japanese (Shift JIS based) codepage;
	# it is the target codepage used when the caller does not specify one
	_JAPANESE_CODEPAGE = 'cp932'


	# zipfile module does not support custom filenames,
	# so we have to read/write the file ourselves
	def extract_one_file(zip_file, entry, path):
	    """Extract one archive member to ``path`` and restore its mtime.

	    The data is copied by hand instead of using ``ZipFile.extract`` so that
	    the member can be written under a different (re-decoded) file name.

	    Args:
	        zip_file: An open ``zipfile.ZipFile`` to read from.
	        entry: The ``zipfile.ZipInfo`` describing the member to extract.
	        path: Destination file path; missing parent directories are created.
	    """
	    # create subdirs if they don't exist yet
	    parent = os.path.dirname(path)
	    if parent:
	        os.makedirs(parent, exist_ok=True)
	    # open via the ZipInfo itself: a lookup by name would return the wrong
	    # member if the archive contains duplicate names
	    with zip_file.open(entry) as infile, open(path, 'wb') as outfile:
	        shutil.copyfileobj(infile, outfile)
	    # restore the modification time stored in the archive
	    try:
	        mtime = datetime(*entry.date_time).timestamp()
	    except (ValueError, OverflowError, OSError):
	        # old archives may carry zeroed/invalid DOS timestamps (e.g. month 0);
	        # the data is already written, so just leave the file times alone
	        return
	    # os.utime expects (atime, mtime); as before, the access time is set to
	    # the ctime of the freshly written file
	    os.utime(path, (os.path.getctime(path), mtime))


	def _safe_member_path(name):
	    """Return ``name`` as a relative path that stays below the target dir.

	    Follows the sanitizing done by ``zipfile.ZipFile.extract``: a drive
	    prefix and leading separators are dropped and empty, ``.`` and ``..``
	    components are removed. Returns an empty string if nothing is left.
	    """
	    name = name.replace('/', os.path.sep)
	    if os.path.altsep:
	        name = name.replace(os.path.altsep, os.path.sep)
	    name = os.path.splitdrive(name)[1]
	    skipped = ('', os.path.curdir, os.path.pardir)
	    return os.path.sep.join(
	        part for part in name.split(os.path.sep) if part not in skipped)


	def extract_with_codepage(zip_file, verbose=False, target_codepage=None):
	    """Extract all files of ``zip_file`` with re-decoded member names.

	    Names that ``zipfile`` decoded with its cp437 fallback are converted
	    back to bytes and decoded again with ``target_codepage``. If that
	    conversion fails, the name reported by ``zipfile`` is used unchanged.
	    Files are written relative to the current working directory.

	    Args:
	        zip_file: An open ``zipfile.ZipFile``.
	        verbose: If true, print each extracted name and any conversion error.
	        target_codepage: Codec name used to decode the member names;
	            defaults to ``_JAPANESE_CODEPAGE``.
	    """
	    if target_codepage is None:
	        target_codepage = _JAPANESE_CODEPAGE
	    for entry in zip_file.infolist():
	        if entry.is_dir():
	            continue
	        filename = entry.filename
	        # bit 11 of the general purpose flags marks names stored as UTF-8;
	        # zipfile decoded those correctly already, so only the other names
	        # need to be re-interpreted
	        if not entry.flag_bits & 0x800:
	            try:
	                filename = filename.encode(_DEFAULT_CODEPAGE).decode(target_codepage)
	            except UnicodeEncodeError as e:
	                if verbose:
	                    print('encode error:', e)
	            except UnicodeDecodeError as e:
	                if verbose:
	                    print('decode error:', e)
	        if verbose:
	            # on python < 3.6, printing to windows console does not work with unicode
	            try:
	                print(filename)
	            except UnicodeEncodeError:
	                enc = sys.stdout.encoding or 'ascii'
	                print(filename.encode(enc, 'backslashreplace').decode(enc))
	        # member names are untrusted input: never let an absolute path or
	        # '..' component write outside the extraction directory
	        target = _safe_member_path(filename)
	        if not target:
	            if verbose:
	                print('skipping entry without a usable file name')
	            continue
	        extract_one_file(zip_file, entry, target)


	if __name__ == '__main__':
	    if len(sys.argv) < 2:
	        # __name__ is always '__main__' here, so show the script path instead
	        print('usage:', sys.argv[0], 'zipfile [target codepage]')
	        sys.exit(1)
	    path = sys.argv[1]
	    # optional second argument: codec name for the file names in the archive
	    codepage = sys.argv[2] if len(sys.argv) > 2 else None
	    # the context manager closes the archive even if extraction fails
	    with zipfile.ZipFile(path) as zf:
	        extract_with_codepage(zf, verbose=True, target_codepage=codepage)