# Jookia/shiftjis.py

## shiftjis.py
# Originally written on 2017-04-26 19:03.

# Welcome to my fancy Shift JIS explanation and simple implementation in Python.
#
# If you haven't heard of Shift JIS, it's a single or double-byte encoding for
# the JIS X 0201 and JIS X 0208 character sets, a bit like UTF-8 is for Unicode.
#
# The key feature of Shift JIS is being able to use both character sets at once,
# retaining legacy half-width ASCII and Katakana characters from JIS X 0201.
#
# First we're going to implement the JIS X 0201 standard. ASCII[1] only uses 7
# bits, meaning only 128 characters are defined. JIS X 0201 uses 8 bits like
# most ASCII variants, meaning it has space for 256 instead.[2] ASCII variants
# are generally split to two halves: The 'lower' half and 'upper' half.
#
# For JIS X 0201, the lower half is a modified ASCII layout. It contains control
# characters which are used to change the text layout and make your terminal go
# beep and printable characters which you're reading right now.
#
# The upper half (which gets its name by having its first of eight bits set)
# contains some symbols and Katakana, 63 in total of the available 128 spaces.
# JIS X 0211 adds more control characters[3] but we won't be implementing that.
#
# [1]: https://en.wikipedia.org/wiki/ASCII
# [2]: https://en.wikipedia.org/wiki/JIS_X_0201
# [3]: https://en.wikipedia.org/wiki/JIS_X_0211

# Code time! For simplicity's sake (it'll become important later), characters
# will be specified in hexadecimal. This is useful since you can line them up
# with the Wikipedia graphs but also instantly tell if a number fits in to a
# single byte since a single byte is two hexadecimal digits long. I'll also do
# my best to make any range checks look like a number line. Error checking is
# important, which is why this code throws a lot of exceptions if something
# exceptional happens, as is the Python way.
#
# In addition, most of the functions (if not all) are going to be tested
# using exhaustive test functions. The idea is to knowingly generate some
# invalid inputs and some valid inputs then compare to see if our functions can
# tell the difference properly. In a lot of cases this will be done just by
# quickly looping through all possible characters in a search space, as well as
# invalid ones to verify that those are picked up too. When the tests fail,
# they'll just print out some variables (see the source code to decipher them.)
#
# Overall, this source file is going to be divided on a per-codec basis, so
# let's get started with JIS X 0201.

# You'll notice there's no jisx0201_encode or decode functions. Since JIS X 0201
# can fit in a byte, there's no need for any. It maps to itself nicely.

def jisx0201_valid(char):
  """Return whether `char` is an assigned JIS X 0201 code.

  Accepted values are the lower (ASCII-like) half 0x00-0x7F and the
  0xA1-0xDF part of the upper half.
  """
  if 0x00 <= char <= 0x7F:
    return True
  return 0xA1 <= char <= 0xDF

def test_jisx0201_valid():
  """Compare jisx0201_valid() with Python's own Shift JIS codec.

  Walks every integer from -0x100 to 0x100 inclusive so that values outside
  the byte range are covered too. Prints the first disagreement and returns
  False, or returns True when both sides always agree.
  """
  for char in range(-0x100, 0x100 + 1):
    ours = jisx0201_valid(char)
    try:
      # A lone byte only decodes when it is a complete single-byte character,
      # and bytes() itself rejects values outside the byte range. Both kinds
      # of failure surface as ValueError.
      bytes([char]).decode("shiftjis")
    except ValueError:
      theirs = False
    else:
      theirs = True

    if ours != theirs:
      print("%s %x %s" % (ours, char, theirs))
      return False
  return True
print("test_jisx0201_valid: %s" % (test_jisx0201_valid()))

# JIS X 0208 requires us to handle double bytes, so let's set up some functions
# to be able to convert numbers to and from tuples of bytes like (b1, b2).

def byte_valid(byte):
  """Return True when `byte` lies in the single-byte range 0x00-0xFF."""
  return 0x00 <= byte <= 0xFF

def test_byte_valid():
  """Exhaustively check byte_valid() over -0x100..0x100 inclusive.

  The expected answer is computed independently and any disagreement fails
  the test: an out-of-range value reported as valid, or an in-range value
  reported as invalid. On failure the offending value is printed in
  hexadecimal and False is returned.
  """
  for b in range(-0x100, 0x100 + 1):
    expected = not (b < 0x00 or 0xFF < b)
    if byte_valid(b) != expected:
      print("%x" % (b))
      return False
  return True
print("test_byte_valid: %s" % (test_byte_valid()))

# Now we need some functions to pack and unpack two bytes. In hexadecimal, a
# byte is two digits, so this code is equivalent to the decimal version of
# multiplying the first pair of digits by 100 then adding the second pair.

def db_pack(b1, b2):
  """Pack two bytes into a single number, with `b1` as the high byte.

  Raises ValueError if either argument fails byte_valid().
  """
  if not (byte_valid(b1) and byte_valid(b2)):
    raise ValueError("db_pack: Invalid byte(s) %x %x" % (b1, b2))

  # Moving b1 up by two hexadecimal digits leaves room for b2 below it.
  return (b1 * 0x100) + b2

def db_unpack(bytes):
  """Split a packed double-byte number into its (high, low) byte pair.

  `bytes` is a number such as db_pack() returns. The parameter name shadows
  the builtin but is kept so existing callers keep working.

  Raises ValueError if the value does not fit in two bytes, i.e. it is
  negative or larger than 0xFFFF.
  """
  if bytes < 0x0000 or 0xFFFF < bytes:
    raise ValueError("db_unpack: %x does not fit in two bytes" % (bytes))

  # Floor division drops the low two hexadecimal digits, leaving the first
  # byte; the remainder of that division is the second byte.
  return (bytes // 0x100, bytes % 0x100)

def test_dbpacking():
  """Round-trip every byte pair through db_pack() and db_unpack().

  Both bytes run from -0x10 to 0x110 inclusive so that invalid values on
  either side of the byte range are covered. Packing must raise ValueError
  exactly when one of the bytes is invalid, and unpacking must give back the
  original pair. Prints the failing case and returns False on the first
  problem found, otherwise returns True.
  """
  for b1 in range(-0x10, 0x110 + 1):
    for b2 in range(-0x10, 0x110 + 1):
      valid = (byte_valid(b1) and byte_valid(b2))
      try:
        packed   = db_pack(b1, b2)
        (b3, b4) = db_unpack(packed)
      except ValueError:
        realValid = False
      else:
        realValid = True
        if b1 != b3 or b2 != b4:
          print("%x-%x %x %x-%x" % (b1, b2, packed, b3, b4))
          return False

      # Accepting an invalid pair, or rejecting a valid one, is a failure.
      if valid != realValid:
        print("%x-%x" % (b1, b2))
        return False
  return True
print("test_dbpacking: %s" % (test_dbpacking()))

# So far it's pretty simple. Next up we need to look at implementing some
# functions for JIS X 0208.[4] Unlike Unicode which is one-dimensional with
# sections marked as planes, JIS X 0208 is divided in to a 94x94 grid, with each
# character being put in a particular row in a particular cell.
#
# Why 94? The reason is ISO/IEC 2022.[5] Instead of using 8 bits like JIS X
# 0201, JIS X 0208 and some other standards define their characters by
# fitting their rows and cells in to two bytes compatible with 7-bit printable
# ASCII. Control characters can then be used to switch between different
# encodings in the middle of text. Because of this, a JIS X 0208 character is
# actually two bytes stuck together in the printable ASCII range (0x21 - 0x7E,
# since 0x20 is space and 0x7F is 'delete') rather than being a continuous set
# like Unicode. This means that the character '0x2121' is actually row 1, cell
# 1 and the character '0x7E7E' is row 94, cell 94.
#
# It also means that if we were to lay out the characters continuously for every
# number like Unicode does, we'd find that each row (say, 0x21XX) is actually a
# byte long, containing 256 with only 94 used, ignoring 162. This tradeoff is
# made so that you can take any JIS X 0208 character and display it as mangled
# ASCII text rather than random control characters that make your text mess up.
# For example, row 3, cell 16 is '０' (converted to Unicode) or '#0' in mangled
# ASCII which can then be converted back to JIS X 0208.
#
# There's a table of JIS X 0208 characters online[6] which shows all the rows
# and cells. Don't be fooled, the cells don't have their own rows and columns
# (confusingly the Wikipedia page seems to imply this), that's just for display.
# In it you can see the JIS character code and the Shift JIS encoded character,
# and a few interesting issues: The entire character set is wide, not narrow
# like JIS X 0201, and it contains unallocated blocks. The first issue is why
# people switch between JIS X 0201 and JIS X 0208, and the second issue is why a
# lot of systems still use Shift JIS: They use these blocks for themselves.
#
# [4]: https://en.wikipedia.org/wiki/JIS_X_0208
# [5]: https://en.wikipedia.org/wiki/ISO/IEC_2022
# [6]: http://www.asahi-net.or.jp/~AX2S-KMTN/ref/jisx0208.html

# For understanding's sake, we're going to deal with JIS X 0208 by specifying
# rows and cells rather than encoded characters in hexadecimal.

def jisx0208_valid(row, cell):
  """Return whether (row, cell) is a position on the 94x94 JIS X 0208 grid.

  Rows and cells are both numbered from 1 to 94 inclusive.
  """
  # Both coordinates are checked up front, then combined.
  checks = (1 <= row <= 94, 1 <= cell <= 94)
  return all(checks)

def jisx0208_encode(row, cell):
  """Encode a JIS X 0208 (row, cell) position as a packed two-byte character.

  Raises ValueError when the position is rejected by jisx0208_valid().
  """
  if not jisx0208_valid(row, cell):
    raise ValueError("jisx0208_encode: invalid character %i,%i" % (row, cell))

  # Offsetting both coordinates by 0x20 lands them on printable ASCII bytes.
  high = row + 0x20
  low = cell + 0x20
  return db_pack(high, low)

def jisx0208_decode(character):
  """Split a packed JIS X 0208 character back into its (row, cell) position.

  The result is not range-checked here; callers that need a guarantee check
  it with jisx0208_valid().
  """
  (high, low) = db_unpack(character)

  # Undo the 0x20 offset that jisx0208_encode() applies to each byte.
  row = high - 0x20
  cell = low - 0x20
  return (row, cell)

def test_jisx0208_codec():
  """Round-trip every (row, cell) pair from -10 to 100 through the codec.

  The range deliberately overshoots the 1-94 grid on both sides. Encoding
  must raise ValueError exactly for positions off the grid, and decoding an
  encoded position must give back the same row and cell. Prints the failing
  case and returns False on the first problem found, otherwise returns True.
  """
  for row in range(-10, 100 + 1):
    for cell in range(-10, 100 + 1):
      valid = ((1 <= row and row <= 94) and (1 <= cell and cell <= 94))
      try:
        character = jisx0208_encode(row, cell)
        (row2, cell2) = jisx0208_decode(character)
      except ValueError:
        realValid = False
      else:
        realValid = True
        if row != row2 or cell != cell2:
          print("%i,%i %x %i,%i" % (row, cell, character, row2, cell2))
          return False

      # Accepting an off-grid position, or rejecting a real one, is a failure.
      if valid != realValid:
        print("%i-%i" % (row, cell))
        return False
  return True
print("test_jisx0208_codec: %s" % (test_jisx0208_codec()))

# Ok, now that we can encode both character sets it's time to encode Shift JIS!
# First, let's study the map of Shift JIS.[7] Take a moment to drink it in.
#
# We can see that Shift JIS is a superset of JIS X 0201, rather than 7-bit
# ASCII. The first byte can either be JIS X 0201 or the start of a shifted JIS X
# 0208 character. It gets its name from 'shifting' the first byte of its
# encoding around the upper half of JIS X 0201 in the unallocated 65 characters.
#
# When encoding, the first byte only uses 47 of the 65 characters that could be
# used for encoding, and the second byte maps 188 characters for use. If you
# multiply them together, 47x188 and 94x94 are both 8836, so all together
# there's enough room to store a JIS X 0208 character.
#
# The key to Shift JIS is how it redistributes the bits of a JIS X 0208
# character: You can't fit the row number in to one of the 65 characters, so
# part of the number needs to be stored in the second byte. It does this by
# moving whether the row is odd or even from a bit in the row number to a
# position-based indication in the second byte. This halves the first byte's
# needed characters to 47 and doubles the second byte's to 188.
#
# It's worth noting that unlike ASCII or UTF-8, there's no bit packing or
# masking that allow you to extract the character directly from the bytes.
# Instead, there's some weird offsets that actually affect the code: The first
# byte starts being encoded at 0x81 instead of 0x80. All the other indices start
# at 0, but this one just starts at 1 for some reason. I don't know why.
#
# The second byte is a lot stranger: The section for odd rows starts at 0x40,
# meaning the code now needs to skip the 0x7F 'DEL' control code. It also
# overlaps with JIS X 0211 control codes, meaning it could actually mess up your
# terminal unlike a 7-bit encoding. Moving that section back to 0x20 would solve
# both issues. Additionally, the section for even rows starts straight after at
# 0x9F. Moving that forward to 0xA0 would mean you could determine if the row
# is odd by whether the 8th bit is set instead of checking a range.
#
# All of this hassle gets us an encoding that supports multiple character sets
# without the stateful control codes used in ISO/IEC 2022 and backwards
# compatibility with JIS X 0201 text. As far as I can tell, a much nicer
# encoding (EUC-JP)[8] was available since 1993 but instead Shift JIS was
# standardized by Microsoft in 1997 just to have that backwards compatibility.
# Interestingly enough, Microsoft uses a non-standard extension of Shift JIS.[9]
#
# [7]: https://en.wikipedia.org/wiki/Shift_JIS#Shift_JIS_byte_map
# [8]: https://en.wikipedia.org/wiki/Extended_Unix_Code#EUC-JP
# [9]: https://en.wikipedia.org/wiki/Code_page_943

def shiftjis_encode_jisx0208(character):
  """Convert a packed JIS X 0208 character to its packed Shift JIS form.

  Raises ValueError when `character` does not decode to a valid row and cell.
  """
  (row, cell) = jisx0208_decode(character)

  if not jisx0208_valid(row, cell):
    raise ValueError("shiftjis_encode: invalid character %i,%i" % (row, cell))

  # First byte: two rows share each value. Adding 1 before halving puts rows
  # 1 and 2 on index 1, which is why the encoding starts at 0x81. The odd/even
  # bit lost here is carried by the second byte instead.
  pair = (row + 1) // 2
  if pair < 32:
    # These fit behind the JIS X 0201 symbols.
    lead = 0x80 + pair
  else:
    # The rest continue ahead of the symbols, with index 32 landing on 0xE0.
    lead = 0xE0 + (pair - 32)

  # Second byte: odd rows start at 0x40 and even rows at 0x9F. Cells count
  # from 1, hence the subtraction.
  odd = (row % 2 == 1)
  trail = (0x40 if odd else 0x9F) + cell - 1

  # Odd rows step over the 'DEL' control character at 0x7F.
  if odd and trail >= 0x7F:
    trail += 1

  return db_pack(lead, trail)

def shiftjis_decode_jisx0208(db):
  """Convert a packed Shift JIS double byte to a packed JIS X 0208 character.

  Raises ValueError if the first byte is outside 0x81-0x9F and 0xE0-0xEF, or
  if the second byte is outside 0x40-0x7E and 0x80-0xFC. Any error raised by
  db_unpack() for `db` propagates unchanged.
  """
  (b1, b2) = db_unpack(db)

  # For the first byte...
  if 0x81 <= b1 and b1 <= 0x9F:
    row = b1 - 0x80 # Shift its value from behind JIS X 0201 symbols.
  elif 0xE0 <= b1 and b1 <= 0xEF:
    # 0xEF is the last usable first byte: it holds rows 93 and 94. Anything
    # higher would be a row past 94, so it is rejected here with a message
    # about the first byte rather than later as an invalid character.
    row = b1 - 0xE0 # Shift its value from ahead of the symbols.
    row = row + 32 # Add 32 to compensate for values ahead starting from 0.
  else:
    raise ValueError("shiftjis_decode_jisx0208: invalid first byte %x" % (b1))

  # Expand the row back to an even number. The encoder added 1 before halving,
  # so each first byte stands for an odd row and the even row right after it;
  # doubling gives the even one, and the odd one is the number just below.
  row = row * 2

  # For the second byte...
  if 0x40 <= b2 and b2 <= 0x9E: # Odd row!
    row = row - 1 # Set to corresponding odd number.
    cell = b2 - 0x40

    # Compensate for shifting past the 'DEL' control character.
    if 0x7F < b2:
      cell = cell - 1
    elif b2 == 0x7F:
      raise ValueError("shiftjis_decode_jisx0208: invalid second byte %x" % (b2))
  elif 0x9F <= b2 and b2 <= 0xFC:
    cell = b2 - 0x9F
  else:
    raise ValueError("shiftjis_decode_jisx0208: invalid second byte %x" % (b2))

  # Shift forward since cells start at 1.
  cell = cell + 1

  return jisx0208_encode(row, cell)

# Make sure the decoder doesn't accept rubbish input.
def test_shiftjis_jisx0208_decoder():
  """Check exactly which byte pairs shiftjis_decode_jisx0208() accepts.

  Both bytes run from -0x10 to 0x110 inclusive, which covers every real byte
  plus some values that are not bytes at all. A pair must decode without
  error exactly when the first byte is in 0x81-0x9F or 0xE0-0xEF and the
  second byte is in 0x40-0xFC other than 0x7F. Prints the failing pair and
  returns False on the first disagreement, otherwise returns True.
  """
  for b1 in range(-0x10, 0x110 + 1):
    for b2 in range(-0x10, 0x110 + 1):
      bs_valid = (byte_valid(b1) and byte_valid(b2))
      # First bytes stop at 0xEF; 0xF0 onwards would mean a row past 94.
      b1_valid = ((0x81 <= b1 and b1 <= 0x9F) or (0xE0 <= b1 and b1 <= 0xEF))
      b2_valid = ((0x40 <= b2 and b2 <= 0x9E) or (0x9F <= b2 and b2 <= 0xFC))
      b2_not_del = (b2 != 0x7F)
      valid = (bs_valid and b1_valid and b2_valid and b2_not_del)

      try:
        enc = db_pack(b1, b2)
        shiftjis_decode_jisx0208(enc)
      except ValueError:
        realValid = False
      else:
        realValid = True

      if valid != realValid:
        print("%x-%x %s" % (b1, b2, realValid))
        return False
  return True
print("test_shiftjis_jisx0208_decoder: %s" % (test_shiftjis_jisx0208_decoder()))

# Make sure the encoder and decoder don't mangle the data.
def test_shiftjis_jisx0208_encoder():
  """Round-trip every JIS X 0208 position through the Shift JIS codec.

  Each (row, cell) on the 94x94 grid is encoded to JIS X 0208, shifted to
  Shift JIS, shifted back and decoded again. Prints the failing case and
  returns False if any step raises ValueError or the result differs from
  the input; returns True otherwise.
  """
  for row in range(1, 94 + 1):
    for cell in range(1, 94 + 1):
      try:
        character = jisx0208_encode(row, cell)
        enc = shiftjis_encode_jisx0208(character)
        character2 = shiftjis_decode_jisx0208(enc)
        (row2, cell2) = jisx0208_decode(character2)
      except ValueError:
        # Every position on the grid is valid, so a rejection is a failure.
        print("%i,%i %s" % (row, cell, False))
        return False

      unchanged = (row == row2 and cell == cell2 and character == character2)
      if not unchanged:
        print("%i,%i %x %x %x %i,%i" %
          (row, cell, character, enc, character2, row2, cell2))
        return False
  return True
print("test_shiftjis_jisx0208_encoder: %s" % (test_shiftjis_jisx0208_encoder()))

# Okay, so that's the hard part done. We now have the ability to encode and
# decode text in the two ways Shift JIS does: JIS X 0201 and mangled JIS X 0208.
# Now all we need is to write the Shift JIS bytes encoder! The idea is that the
# accepted input will be encoded characters from either JIS X 0201 or JIS X 0208
# and output will be bytes, and vice-versa for decoding. These are just simple
# wrappers and don't need much error checking since the other functions do that.

def shiftjis_encode(characters):
  """Encode JIS X 0201 and packed JIS X 0208 characters as Shift JIS bytes.

  Returns a bytearray. Raises ValueError when a character is not valid
  JIS X 0201 and does not decode to a valid JIS X 0208 row and cell; errors
  raised by jisx0208_decode() itself propagate unchanged.
  """
  out = bytearray()

  for character in characters:
    # JIS X 0201 maps straight to a single byte.
    if jisx0201_valid(character):
      out.append(character)
      continue

    (row, cell) = jisx0208_decode(character)
    if not jisx0208_valid(row, cell):
      raise ValueError("shiftjis_encode: invalid character %x" % (character))

    # Everything else becomes the two bytes of its shifted JIS X 0208 form.
    out.extend(db_unpack(shiftjis_encode_jisx0208(character)))

  return out

def shiftjis_decode(bytes):
  """Decode Shift JIS bytes into a list of character codes.

  `bytes` is any iterable of byte values (the name shadows the builtin but is
  kept for existing callers). Bytes accepted by jisx0201_valid() are passed
  through as they are; any other byte is taken as the first half of a double
  byte and is decoded together with the byte after it into a packed
  JIS X 0208 character.

  Raises ValueError if the input ends after a first byte, and lets the
  errors from db_pack() and shiftjis_decode_jisx0208() propagate.
  """
  characters = []
  head = None # First byte of a double byte still waiting for its second.

  for byte in bytes:
    if head is None:
      if jisx0201_valid(byte):
        characters.append(byte)
      else:
        head = byte
    else:
      encoded = db_pack(head, byte)
      decoded = shiftjis_decode_jisx0208(encoded)
      characters.append(decoded)
      head = None

  if head is not None: # Trailing first byte?
    raise ValueError("shiftjis_decode: trailing first byte %x" % (head))

  return characters

# Testing the mix of JIS X 0201 and JIS X 0208 characters is a bit hard without
# fuzz testing, and the tests I've been doing so far have just been exhaustive.
# I don't want to write a random testing framework right now, so let's just use
# traditional unit tests with some examples that should shake out bad code.

def test_shiftjis_codec():
  """Spot-check shiftjis_encode() and shiftjis_decode() on mixed input.

  Valid character lists must survive an encode/decode round trip unchanged,
  and malformed byte strings must make the decoder raise ValueError. Prints
  the offending test case and returns False on the first failure, otherwise
  returns True.
  """
  valid_tests = [
    # Just ASCII.
    [ord('H'), ord('e'), ord('l'), ord('l'), ord('o'), 0x00],
    # ASCII with a JIS X 0208 character.
    [ord('H'), ord('e'), ord('y'), jisx0208_encode(0x1, 0xA), 0x00],
    # JIS X 0201 characters with a JIS X 0208 character.
    [0xA2, 0xA3, jisx0208_encode(0x1, 0xA), 0x00]
  ]

  for test in valid_tests:
    try:
      dec = shiftjis_decode(shiftjis_encode(test))
    except ValueError:
      print("%s" % (test))
      return False

    # Encoding then decoding has to hand back exactly what went in.
    if dec != test:
      print("%s %s" % (test, dec))
      return False

  # Have to have invalid tests made up of bytes to avoid all the safety checks
  # in the functions we've created.

  invalid_tests = [
    # Invalid second byte (a control character).
    bytearray(b'hello!\x81\x10\x00'),
    # Unused first bytes.
    bytearray(b'hello!\x80\x10\x21'),
    bytearray(b'hello!\x80\xFE\x21'),
    # First byte past the last pair of rows.
    bytearray(b'hello!\xF0\x40'),
    # Unused second bytes, each after a valid first byte.
    bytearray(b'hello!\x81\x30'),
    bytearray(b'hello!\x81\xFF'),
    # Trailing first byte.
    bytearray(b'hello!\x91')
  ]

  for test in invalid_tests:
    try:
      shiftjis_decode(test)
    except ValueError:
      continue

    # Getting here means the decoder accepted malformed input.
    print("%s" % (test))
    return False
  return True
print("test_shiftjis_codec: %s" % (test_shiftjis_codec()))

# Whew! Everything should be fine, and the encoder should work. Of course, it's
# still possible that it's outputting garbage that it can somehow encode and
# decode losslessly. (I really doubt that, though.) But just in case there's
# more development to do, we're going to do a final test by using Python's built
# in encoder/decoder to do a round trip for comparison.
#
# A small snag is that Python will fail characters that can't be converted to
# Unicode, even if they can be encoded in Shift JIS (this includes private use
# characters like emoji). The solution to this is for our tests to check if the
# character in a row or cell is allocated in the standard, since the standard
# has since been incorporated to Unicode.

def jisx0208_allocated(row, cell):
  """Return whether the standard assigns a character to (row, cell).

  Positions off the 94x94 grid, positions in unassigned rows and positions
  inside an unassigned run of cells all give False.
  """
  if not jisx0208_valid(row, cell):
    return False

  # Rows 9-15 and every row after 84 are unassigned.
  if (8 < row < 16) or (84 < row):
    return False

  # Inclusive (first, last) runs of unassigned cells in rows that are
  # otherwise in use, transcribed from the JIS 0208 chart listed earlier.
  gaps_by_row = {
    '2': [(15, 25), (34, 41), (49, 59), (75, 81), (90, 93)],
    '3': [(1, 15), (26, 32), (59, 64), (91, 94)],
    '4': [(84, 94)],
    '5': [(87, 94)],
    '6': [(25, 32), (57, 94)],
    '7': [(34, 48), (82, 94)],
    '8': [(33, 94)],
    '47': [(52, 94)],
    '84': [(7, 95)]}

  gaps = gaps_by_row.get(str(row), [])
  return not any(first <= cell <= last for (first, last) in gaps)

def test_shiftjis_external():
  """Cross-check shiftjis_encode() against Python's built-in Shift JIS codec.

  For each row, every allocated character plus a few JIS X 0201 characters
  is encoded with our encoder, then decoded and re-encoded by Python. The
  two byte strings must be identical. Prints the failing input and returns
  False on the first error or mismatch, otherwise returns True.
  """
  # A few JIS X 0201 characters to mix in with every row.
  extras = [ord('H'), ord('e'), ord('y'), 0xA2, 0xA3]

  for row in range(1, 94 + 1):
    points = [jisx0208_encode(row, cell)
              for cell in range(1, 94 + 1)
              if jisx0208_allocated(row, cell)]
    points.extend(extras)

    try:
      enc = shiftjis_encode(points)
      enc2 = enc.decode("shift_jis").encode("shift_jis")
    except ValueError:
      # Either our encoder rejected a character or Python could not convert
      # our output; the codec's Unicode errors are ValueError subclasses.
      print("%s" % (points))
      return False

    if enc != enc2:
      print("%s %s %s" % (points, enc, enc2))
      return False
  return True
print("test_shiftjis_external: %s" % (test_shiftjis_external()))

# Well, that's all. I hope you learned something today. If you want to mess with
# the code some more, I suggest adding support for Microsoft's Code Page 943
# variant and other non-standard variants. I think some of them just add more
# rows so it only works in Shift JIS and not ISO/IEC 2022, or use unallocated
# rows. There's also JIS X 0213 which expands the character set in to two
# planes, but I haven't found much documentation on how this works since I can't
# read Japanese. I hope that if I've gotten any point across to you, it's that
# Shift JIS isn't that scary once you stop using formulas and decimal numbers.
# The only bizarre moments come from the choice of offsets. See you later!

# The author of this file has dedicated its contents to the public domain
# using the CC0 Public Domain Dedication 1.0. For full legal information see
# <https://creativecommons.org/publicdomain/zero/1.0/>.