Skip to content

Instantly share code, notes, and snippets.

@mooware
Last active August 22, 2023 17:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mooware/f84abfdc31dd63a56d1bb7c78cbf781a to your computer and use it in GitHub Desktop.
Save mooware/f84abfdc31dd63a56d1bb7c78cbf781a to your computer and use it in GitHub Desktop.
A little script to extract old Japanese zip files with the correct text encoding for filenames. Most applications assume a US text code page for old zip formats, which is not always correct.
# A little script to extract old Japanese zip files
# with the correct text encoding for filenames.
# Most applications assume a US text code page for
# old zip formats, which is not always correct.
import os
import shutil
import sys
import zipfile
from datetime import datetime
# Code page that filenames are re-decoded into when the caller of
# extract_with_codepage does not name one explicitly.
_JAPANESE_CODEPAGE = "cp932"
# Code page 437 (aka 'DOS Latin US'): what most applications appear to
# assume for filenames stored in old zip files.
_DEFAULT_CODEPAGE = "cp437"
# zipfile module does not support custom filenames,
# so we have to read/write the file ourselves
def extract_one_file(zip_file, entry, path):
    """Write one archive member to ``path`` and restore its modification time.

    Args:
        zip_file: Open ``zipfile.ZipFile`` to read the member from.
        entry: ``zipfile.ZipInfo`` describing the member to extract.
        path: Destination file path. Missing parent directories are created.

    The member's stored timestamp is applied as the file's mtime. If the
    archive holds a timestamp that is not a valid date (DOS timestamps can
    encode month/day 0), the file is still extracted and simply keeps the
    mtime it got when it was written.
    """
    # create subdirs if they don't exist yet
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Open by ZipInfo rather than by name: a name lookup returns the *last*
    # member with that name, which is the wrong one for duplicate entries.
    with zip_file.open(entry) as infile, open(path, 'wb') as outfile:
        shutil.copyfileobj(infile, outfile)

    try:
        mtime = datetime(*entry.date_time).timestamp()
    except (ValueError, OverflowError, OSError):
        # Unrepresentable timestamp in the archive; keep the data, skip utime.
        return

    # os.utime takes (atime, mtime); keep the access time as it is now.
    os.utime(path, (os.path.getatime(path), mtime))
# General-purpose flag bit 11: the member name is stored as UTF-8, so
# zipfile has already decoded it correctly and it must not be re-decoded.
_UTF8_NAME_FLAG = 0x800


def _safe_relative_path(name):
    """Return ``name`` as a relative path that stays below the current directory.

    Drive prefixes, leading separators and ``.``/``..`` components are
    dropped. Returns an empty string when nothing usable remains.
    """
    name = os.path.splitdrive(name.replace('/', os.sep))[1]
    parts = [p for p in name.split(os.sep)
             if p not in ('', os.curdir, os.pardir)]
    return os.path.join(*parts) if parts else ''


def extract_with_codepage(zip_file, verbose=False, target_codepage=None):
    """Extract every file in ``zip_file``, re-decoding legacy filenames.

    Names that zipfile decoded with the default code page are encoded back
    to bytes and decoded again with ``target_codepage``. Names flagged as
    UTF-8 in the archive are used as they are. If the re-decoding fails,
    the name reported by zipfile is used unchanged.

    Args:
        zip_file: Open ``zipfile.ZipFile`` to extract.
        verbose: When true, print each filename and any decoding errors.
        target_codepage: Codec name for the stored filenames; defaults to
            ``_JAPANESE_CODEPAGE`` when ``None``.

    Files are written relative to the current working directory. Directory
    entries are skipped; directories are created only as parents of files.
    Names that would resolve outside the working directory (absolute paths,
    ``..`` components) are stripped of those parts before writing.
    """
    if target_codepage is None:
        target_codepage = _JAPANESE_CODEPAGE

    for info in zip_file.infolist():
        if info.is_dir():
            continue

        # Fall back to zipfile's own decoding unless re-decoding succeeds.
        filename = info.filename
        if not info.flag_bits & _UTF8_NAME_FLAG:
            try:
                filename = info.filename.encode(_DEFAULT_CODEPAGE).decode(target_codepage)
            except UnicodeEncodeError as e:
                if verbose:
                    print('encode error:', e)
            except UnicodeDecodeError as e:
                if verbose:
                    print('decode error:', e)

        if verbose:
            # on python < 3.6, printing to windows console does not work with unicode
            try:
                print(filename)
            except UnicodeEncodeError:
                enc = sys.stdout.encoding or 'ascii'
                print(filename.encode(enc, 'backslashreplace').decode(enc))

        # Archive contents are untrusted: never write outside the cwd.
        target = _safe_relative_path(filename)
        if not target:
            continue
        extract_one_file(zip_file, info, target)
# Command-line entry point: <script> zipfile [target codepage]
if __name__ == '__main__':
    if len(sys.argv) < 2:
        # sys.argv[0] is the script name; __name__ would always print '__main__'
        print('usage:', sys.argv[0], 'zipfile [target codepage]')
        sys.exit(1)
    path = sys.argv[1]
    # None lets extract_with_codepage pick its default code page
    codepage = sys.argv[2] if len(sys.argv) > 2 else None
    # context manager closes the archive even if extraction fails
    with zipfile.ZipFile(path) as zf:
        extract_with_codepage(zf, verbose=True, target_codepage=codepage)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment