# RoadrunnerWMC/nloc.py

## nloc.py
# 10/29/16
# NLOC haxxor script


# ~~~~~~~~~~
# NLOC: Next level LOCalization? Localizable text strings.
#     The "LOC" probably does stand for localization, since if you get
#     a hash ID wrong, it defaults to "missing loc string", and "loc"
#     certainly looks like "localized" in that context.
# 00-03    "NLOC"
# 04-07    1... always? (TODO: check if it is.)
# 08-0B    Looks like a hash of the language name or ID, in some way.
#          (considering it's the same in both english.data's but different in dutch.data)
# 0C-0F    Number of strings
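# String info table:
#     00-03    String offset, relative to the end of this table
#     04-07    Some sort of hash/ID. Entries are sorted by this.
#              Consistent across languages for equivalent strings.
#     NOTE(review): generateNLOC() and readNLOC() below both treat each
#     entry as (hash/ID, offset) -- the reverse of the order listed
#     here -- and count offsets in UTF-16 code units, not bytes.
#     Confirm which field order is correct.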
# Then UTF-16 (UCS-2?) null-terminated text strings.
# Strings appear to be in whatever order NLG decided to put them in:
#     similar strings are grouped together in locally logical order.


import struct


# NLOCT text files read by main(), in this order. Their contents are
# joined and passed to generateNLOC(), where a key that appears again
# later replaces the earlier entry.
NLOCTS_IN = [
    'nlocts/ukenglish.nloct',
    'patches.nloct',
]
# Existing .dict file that main() reads and patches.
DICT_IN  = 'ukenglish.dict'

# Files written by main(): the rebuilt .data and the patched .dict.
NLOC_OUT = 'ukenglish_modified.data'
DICT_OUT = 'ukenglish_modified.dict'


def nlgHashFunction(data):
    """
    Hash a bytes object the way the game does (reverse-engineered from
    the binary).

    The accumulator starts at -1 and each byte is folded in as
    `acc * 33 + byte`, truncated to 32 bits. Consequences:
    - ASCII 'A'-'Z' are lowered first, so the hash is case-insensitive
    - leading spaces (0x20) leave the accumulator unchanged, so they
      do not affect the result; trailing spaces do
    - empty input returns the untouched seed, -1
    """
    MASK = 0xFFFFFFFF
    acc = -1

    for byte in data:
        # Fold 'A'..'Z' (0x41..0x5A) onto 'a'..'z' by setting bit 5.
        lowered = byte | 0x20 if 0x41 <= byte <= 0x5A else byte
        acc = (33 * acc + lowered) & MASK

    return acc


def strToHash(s):
    """
    Hash a text string: encode it as Latin-1 and feed the bytes to
    nlgHashFunction(). Raises UnicodeEncodeError for characters that
    Latin-1 cannot represent.
    """
    raw = s.encode('latin-1')
    return nlgHashFunction(raw)


def keyAndLine(line):
    """
    Split one NLOCT entry line into its numeric key and its message.

    Two key syntaxes are accepted:
    - `"name" message`: the key is strToHash() of the text between the
      first pair of double quotes
    - `HEX message`: the key is everything before the first space,
      parsed as hexadecimal

    Returns a `(key, message)` tuple; leading whitespace is stripped
    from the message. A line consisting of a key only (for example an
    empty message whose trailing spaces were trimmed by an editor)
    yields an empty message.

    Raises ValueError if a quoted key is not terminated or a hex key
    cannot be parsed.
    """
    if line.startswith('"'):
        # Hash a string
        strEnd = line.find('"', 1)
        if strEnd < 0:
            raise ValueError(
                'Unterminated quoted key in NLOCT line: %r' % line)
        return strToHash(line[1:strEnd]), line[strEnd + 1:].lstrip()

    # partition() tolerates a missing separator, unlike str.index().
    key, _, msg = line.partition(' ')
    return int(key, 16), msg.lstrip()


# Lazily-built cache mapping key hash -> key string; None until the
# first hashToStr() call.
nlocKeyRepo = None
def hashToStr(h):
    """
    Render a key hash for NLOCT output.

    On first use, nlocKeyRepo.txt (one key string per line, UTF-8) is
    loaded from the current directory and each line is indexed by its
    strToHash() value. If the file cannot be read, a notice is printed
    and the lookup table is left empty.

    Returns the known key string wrapped in double quotes if `h` is in
    the table, otherwise `h` as uppercase hexadecimal without a prefix.
    """
    global nlocKeyRepo
    if nlocKeyRepo is None:
        # Load it
        try:
            with open('nlocKeyRepo.txt', 'r', encoding='utf-8') as f:
                keys = f.read().splitlines()
        except (OSError, UnicodeError):
            print('Could not open nlocKeyRepo.txt. This file is used to '
                  'list strings that can be used in place of key hash '
                  'values where possible.')
            # Fall back to an empty table so the lookup below still works.
            keys = []
        nlocKeyRepo = {strToHash(k): k for k in keys}

    if h in nlocKeyRepo:
        return '"%s"' % nlocKeyRepo[h]
    return hex(h).upper()[2:]


def generateNLOC(nloct):
    """
    Build binary NLOC data from NLOCT text and return it as bytes.

    Line handling, in order of precedence:
    - a line starting with `###` toggles a block comment
    - lines inside a block comment, empty lines and lines starting
      with `#` are ignored
    - `langid:` (any case) sets the language ID, either as a quoted
      string that gets hashed or as a hexadecimal number
    - anything else is an entry parsed by keyAndLine()

    A key that occurs more than once keeps its last message, so patches
    can simply be appended to the original text.

    Output layout: a 16-byte header ("NLOC", 1, language ID, entry
    count), then one (key, offset) pair per entry sorted by key, then
    the NUL-terminated UTF-16-LE strings. Offsets are in 2-byte units
    from the start of the string data.
    """
    languageHash = 0
    inBlockComment = False
    table = {}

    for rawLine in nloct.splitlines():
        if rawLine.startswith('###'):
            # Coffeescript-style block comment delimiter
            inBlockComment = not inBlockComment
        elif inBlockComment or not rawLine or rawLine.startswith('#'):
            pass
        elif rawLine.lower().startswith('langid:'):
            value = rawLine[len('langid:'):]
            if value.startswith('"'):
                languageHash = strToHash(value[1:-1])
            else:
                languageHash = int(value, 16)
        else:
            key, text = keyAndLine(rawLine)
            table[key] = text

    count = len(table)
    if count < 1935:
        print('WARNING: The NLOC being generated only contains %d strings, '
            'whereas retail LMDM NLOCs should have at least 1935!' % count)
    elif count > 1935:
        print('WARNING: Your NLOC has more than 1935 strings! (%d)'
              % count)
    if languageHash == 0:
        print('WARNING: Language ID was not set.')

    # Code RE suggests that the second header value is compared against
    # a literal value 1, and fails if it's different.
    header = bytearray(struct.pack('<4s3I', b'NLOC', 1, languageHash, count))
    body = bytearray()
    for key in sorted(table):
        header += struct.pack('<II', key, len(body) // 2)
        body += table[key].encode('utf-16-le')
        body += b'\0\0'
    return bytes(header + body)


def readNLOC(data, *, endian='<'):
    """
    Convert an NLOC file to NLOCT.

    `data` is the raw NLOC bytes. `endian` is a struct byte-order
    character: '<' selects little-endian fields and UTF-16-LE text,
    any other value is passed to struct as-is and selects UTF-16-BE.

    Returns the NLOCT text: a `langid:` line followed by one line per
    table entry, made of the key (see hashToStr()), four spaces, and
    the string.

    Raises struct.error if `data` is too short for the header or table,
    and ValueError if the text cannot be decoded or a string has no
    NUL terminator.
    """
    endianName = 'le' if endian == '<' else 'be'
    output = []

    langId, strCount = struct.unpack_from(endian + 'II', data, 8)
    output.append('langid:' + hashToStr(langId))

    # NOTE(review): the table is read from 0x14 here, whereas
    # generateNLOC() emits a 0x10-byte header before its table --
    # confirm which matches the real file format.
    tableStart = 0x14
    fullStr = data[tableStart + 8 * strCount:].decode('utf-16-' + endianName)

    for i in range(strCount):
        sId, sStart = struct.unpack_from(endian + 'II', data, tableStart + 8 * i)
        # Search from sStart itself: an empty string has its terminator
        # right there, and starting one later would run on into the
        # next string.
        sEnd = fullStr.index('\x00', sStart)
        output.append(hashToStr(sId) + '    ' + fullStr[sStart:sEnd])

    return '\n'.join(output)


def main():
    """
    Rebuild the modified .data and .dict files from the NLOCT sources.

    Reads every file in NLOCTS_IN, converts the joined text with
    generateNLOC(), wraps the result in a 16-byte prefix plus two
    trailing zero bytes and writes it to NLOC_OUT. Then copies DICT_IN
    to DICT_OUT with three size fields overwritten.
    """

    # Gather the NLOCT sources in list order
    sources = []
    for path in NLOCTS_IN:
        with open(path, 'r', encoding='utf-8') as f:
            sources.append(f.read())

    nloc = generateNLOC('\n'.join(sources))

    # Assemble and write the new .data
    prefix = struct.pack('<4I', 0x12027020, len(nloc), 0, 0)
    data = prefix + nloc + b'\0\0'
    with open(NLOC_OUT, 'wb') as f:
        f.write(data)

    # Update the .dict horrifically hackily (sorry): overwrite the
    # 32-bit fields at fixed offsets with the new sizes.
    with open(DICT_IN, 'rb') as f:
        dictBytes = bytearray(f.read())
    total = len(data)
    patches = (
        (0x68, total - 16),
        (0x74, total),
        (0x84, total),
    )
    for offset, value in patches:
        dictBytes[offset:offset + 4] = struct.pack('<I', value)
    with open(DICT_OUT, 'wb') as f:
        f.write(dictBytes)

# Run the conversion as soon as this file is executed. There is no
# __main__ guard, so importing this module also runs it.
main()
	# 10/29/16
	# NLOC haxxor script


	# ~~~~~~~~~~
	# NLOC: Next level LOCalization? Localizable text strings.
	# Probably does, since if you get a hash ID wrong, it defaults to
	# "missing loc string". Which certainly looks like "localized" in
	# that context.
	# 00-03 "NLOC"
	# 04-07 1... always? (TODO: check if it is.)
	# 08-0B Looks like a hash of the language name or ID, in some way.
	# (considering it's the same in both english.data's but different in dutch.data)
	# 0C-0F Number of strings
	# String info table:
	# 00-03 String offset, relative to the end of this table
	# 04-07 Some sort of hash/ID. Entries are sorted by this.
	# Consistent across languages for equivalent strings.
	# Then UTF-16 (UCS-2?) null-terminated text strings.
	# Strings appear to be in whatever order NLG decided to put them in:
	# similar strings are grouped together in locally logical order.


	import struct


	NLOCTS_IN = [
	'nlocts/ukenglish.nloct',
	'patches.nloct',
	]
	DICT_IN = 'ukenglish.dict'

	NLOC_OUT = 'ukenglish_modified.data'
	DICT_OUT = 'ukenglish_modified.dict'


	def nlgHashFunction(data):
	"""
	Reverse-engineered from the binary. `data` must be a bytes object.
	This function has some interesting properties:
	- case-insensitive
	- insensitive to leading whitespace (but not trailing whitespace)
	"""
	h = -1

	for c in data:
	if (c - 65) & 0xFFFFFFFF <= 0x19:
	c \|= 0x20

	h = (h * 33 + c) & 0xFFFFFFFF

	return h


	def strToHash(s):
	return nlgHashFunction(s.encode('latin-1'))


	def keyAndLine(line):
	if line.startswith('"'):
	# Hash a string
	strEnd = line.index('"', 1)
	s = line[1:strEnd]
	return strToHash(s), line[strEnd+1:].lstrip()
	return int(line.split(' ')[0], 16), line[line.index(' ')+1:].lstrip()


	nlocKeyRepo = None
	def hashToStr(h):
	global nlocKeyRepo
	if nlocKeyRepo is None:
	# Load it
	try:
	with open('nlocKeyRepo.txt', 'r', encoding='utf-8') as f:
	keys = f.read().splitlines()
	except Exception:
	print('Could not open nlocKeyRepo.txt. This file is used to '
	'list strings that can be used in place of key hash '
	'values where possible.')
	nlocKeyRepo = {}
	for k in keys:
	nlocKeyRepo[strToHash(k)] = k
	if h in nlocKeyRepo:
	return '"%s"' % nlocKeyRepo[h]
	else:
	return hex(h).upper()[2:]


	def generateNLOC(nloct):
	"""
	Convert nloct to nloc.
	To apply multiple NLOCTs as patches, simply append the patches to
	the original!
	"""
	langId = 0
	blockcomment = False
	strs = {}
	for line in nloct.splitlines():
	# Allow ### to start/end block-comments a la Coffeescript
	if line.startswith('###'):
	blockcomment = not blockcomment
	continue
	if blockcomment: continue

	if line.startswith('#') or not line: continue

	if line.lower().startswith('langid:'):
	# Language ID line; this is handled entirely differently
	langIdStr = line[len('langid:'):]
	if langIdStr.startswith('"'):
	langId = strToHash(langIdStr[1:-1])
	else:
	langId = int(langIdStr, 16)
	continue

	idHash, msg = keyAndLine(line)

	strs[idHash] = msg

	if len(strs) < 1935:
	print('WARNING: The NLOC being generated only contains %d strings, '
	'whereas retail LMDM NLOCs should have at least 1935!' % len(strs))
	elif len(strs) > 1935:
	print('WARNING: Your NLOC has more than 1935 strings! (%d)'
	% len(strs))
	if langId == 0:
	print('WARNING: Language ID was not set.')

	newNlocHeader = bytearray()
	newNlocContents = bytearray()
	newNlocHeader.extend(struct.pack('<4s3I', b'NLOC', 1, langId, len(strs)))
	# Code RE suggests that the second value there is compared against
	# a literal value 1, and fails if it's different.
	for id in sorted(strs):
	newNlocHeader.extend(struct.pack('<II', id, len(newNlocContents) // 2))
	newNlocContents.extend(strs[id].encode('utf-16-le') + b'\0\0')
	return bytes(newNlocHeader + newNlocContents)


	def readNLOC(data, *, endian='<'):
	"""
	Convert an NLOC file to NLOCT.
	"""
	endianName = 'le' if endian == '<' else 'be'
	output = []

	langId, strCount = struct.unpack_from(endian + 'II', data, 8)
	output.append('langid:' + hashToStr(langId))

	fullStr = data[0x14 + 8 * strCount:].decode('utf-16-' + endianName)

	for i in range(strCount):
	sId, sStart = struct.unpack_from(endian + 'II', data, 0x14 + 8 * i)
	s = fullStr[sStart:fullStr.index('\x00', sStart + 1)]
	output.append(hashToStr(sId) + ' ' + s)

	return '\n'.join(output)


	def main():
	"""
	Main function to convert NLOC/NLOCT
	"""

	# Read the NLOCTs
	nlocts = []
	for n in NLOCTS_IN:
	with open(n, 'r', encoding='utf-8') as f:
	nlocts.append(f.read())

	# Generate the NLOC
	nloc = generateNLOC('\n'.join(nlocts))

	# Generate the new .data
	data = struct.pack('<4I', 0x12027020, len(nloc), 0, 0) + nloc + b'\0\0'
	with open(NLOC_OUT, 'wb') as f:
	f.write(data)

	# Update the .dict horrifically hackily (sorry)
	with open(DICT_IN, 'rb') as f:
	dict = bytearray(f.read())
	dict[0x68:0x6C] = struct.pack('<I', len(data) - 16)
	dict[0x74:0x78] = struct.pack('<I', len(data))
	dict[0x84:0x88] = struct.pack('<I', len(data))
	with open(DICT_OUT, 'wb') as f:
	f.write(dict)

	main()