Skip to content

Instantly share code, notes, and snippets.

@choryuidentify
Last active December 19, 2019 13:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save choryuidentify/1e6f3117e4f936e52ff9d8445d08cb6a to your computer and use it in GitHub Desktop.
Save choryuidentify/1e6f3117e4f936e52ff9d8445d08cb6a to your computer and use it in GitHub Desktop.
MobiPocket analyzer; the target file is in KF8 format.
import locale, struct
file_data = None
#################################################################################
# #
# MOBI HEADER PARSER - EXAMPLE #
# #
# 2019 Choryu Park (Kyoungkyu Park) #
# #
#################################################################################
# This dirty source code is not a perfect implementation of a MOBI parser.
# It targets KF8 MOBI eBooks. You may need to modify the code for your needs.
#
# This source code is based on these documents;
# https://wiki.mobileread.com/wiki/MOBI
# https://wiki.mobileread.com/wiki/PDB
# https://www.mobileread.com/forums/showpost.php?p=3114050&postcount=1145
# Read the whole .azw3 (KF8 MOBI) file into memory; every parse step below
# indexes into this single bytes object rather than seeking in the file.
with open("target_file_2.azw3", "rb") as fi:
    file_data = fi.read()
# ---- PDB (Palm Database) header: 78 fixed bytes at the start of the file ----
# Dumps each field with its offset/length; big-endian throughout.
offset = 0
length = 0
print("PalmDOC HEADER")
length = 32  # PalmDOC NAME: NUL-padded database name
# BUGFIX: the slice end is offset + length. The original wrote
# file_data[offset:length], which only worked because offset happened to be 0.
print(" name ->", file_data[offset:offset + length].decode("CP1252"), "/ offset ->", offset, "length ->", length)
offset += length
length = 2  # attribute bit field
print(" attribute ->", struct.unpack_from(">H", file_data, offset), "/ offset ->", offset, "length ->", length)
offset += length
length = 2  # file version
print(" version ->", struct.unpack_from(">H", file_data, offset), "/ offset ->", offset, "length ->", length)
offset += length
# Six consecutive 4-byte big-endian unsigned fields share one dump pattern.
for field_name in ("creation date", "modification date", "last backup date",
                   "modificationNumber", "appInfoID", "sortInfoID"):
    length = 4
    print(" " + field_name + " ->", struct.unpack_from(">L", file_data, offset), "/ offset ->", offset, "length ->", length)
    offset += length
length = 4  # type: 4 ASCII bytes, printed raw (b'BOOK' for eBooks)
print(" type ->", file_data[offset:offset + length], "/ offset ->", offset, "length ->", length)
offset += length
length = 4  # creator: 4 ASCII bytes, printed raw (b'MOBI')
print(" creator ->", file_data[offset:offset + length], "/ offset ->", offset, "length ->", length)
offset += length
length = 4  # uniqueIDseed
print(" uniqueIDseed ->", struct.unpack_from(">L", file_data, offset), "/ offset ->", offset, "length ->", length)
offset += length
length = 4  # nextRecordListID (always 0 in a stored file)
print(" nextRecordListID ->", struct.unpack_from(">L", file_data, offset), "/ offset ->", offset, "length ->", length)
offset += length
length = 2  # number of records in the record list that follows
print(" number of Records ->", struct.unpack_from(">H", file_data, offset), "/ offset ->", offset, "length ->", length)
records, = struct.unpack_from(">H", file_data, offset)
offset += length
# State shared with the record-list and per-record passes below.
record_dict = dict()
last_pdb_record = 0
print()
# Walk the PDB record list: one 8-byte entry per record
# (4-byte data offset, 1 attribute byte, 3-byte unique id).
for record_idx in range(records):
    entry_offset, = struct.unpack_from(">L", file_data, offset)
    offset += 4
    attr_byte = file_data[offset:offset + 1]
    offset += 1
    unique_id = file_data[offset:offset + 3]
    offset += 3
    record_dict[record_idx] = {"OFFSET": entry_offset,
                               "ATTR": attr_byte,
                               "UNIQUE": unique_id}
    last_pdb_record = record_idx
print()
# ---- Per-record pass over the PDB payload ----
# Record 0 holds the PalmDOC compression header, the MOBI header and the
# optional EXTH metadata header; later records are book text, INDX index
# records, images, or bookkeeping markers.
for idx, record_data in record_dict.items():
    current_offset = record_data['OFFSET']
    if idx == 0:
        print("> PalmDOC Record 0 Header")
        length = 2
        print(" Compression ->", struct.unpack_from(">H", file_data, current_offset), "/ offset ->", current_offset, "length ->", length)
        print(" > 1 == no compression, 2 = PalmDOC compression, 17480 = HUFF/CDIC compression")
        current_offset += length
        length = 2
        print(" Unused ->", struct.unpack_from(">H", file_data, current_offset), "/ offset ->", current_offset, "length ->", length)
        current_offset += length
        length = 4
        print(" text length ->", struct.unpack_from(">L", file_data, current_offset), "/ offset ->", current_offset, "length ->", length)
        print(" > Uncompressed length of the entire text of the book")
        current_offset += length
        length = 2
        print(" record count ->", struct.unpack_from(">H", file_data, current_offset), "/ offset ->", current_offset, "length ->", length)
        print(" > Number of PDB records used for the text of the book")
        current_offset += length
        length = 2
        print(" record size ->", struct.unpack_from(">H", file_data, current_offset), "/ offset ->", current_offset, "length ->", length)
        # BUGFIX: the original annotation described the "current reading
        # position" field; per the MOBI spec this field is the text record size.
        print(" > Maximum size of each record in the text of the book (always 4096)")
        current_offset += length
        length = 2
        print(" Encryption Type ->", struct.unpack_from(">H", file_data, current_offset), "/ offset ->", current_offset, "length ->", length)
        print(" > 0 = no encryption, 1 = Old Mobipocket Encryption, 2 = Mobipocket Encryption")
        current_offset += length
        length = 2
        print(" Unknown ->", struct.unpack_from(">H", file_data, current_offset), "/ offset ->", current_offset, "length ->", length)
        current_offset += length
        print()
        print("> MOBI Header")
        length = 4
        print(" identifier ->", struct.unpack_from(">ssss", file_data, current_offset), "/ offset ->", current_offset, "length ->", length)
        current_offset += length
        length = 4
        print(" header length ->", struct.unpack_from(">L", file_data, current_offset), "/ offset ->", current_offset, "length ->", length)
        header_length, = struct.unpack_from(">L", file_data, current_offset)
        # header_length counts from the "MOBI" identifier (4 bytes before this
        # field), so the header ends at current_offset + header_length - 4.
        header_last_offset = current_offset + header_length - 4
        current_offset += length
        length = 4
        print(" Mobi type ->", struct.unpack_from(">L", file_data, current_offset), "/ offset ->", current_offset, "length ->", length)
        print(" > 2 = Mobipocket Book, 232 = mobipocket? generated by kindlegen1.2, 248 = KF8: generated by kindlegen2")
        current_offset += length
        length = 4
        print(" text Encoding ->", struct.unpack_from(">L", file_data, current_offset), "/ offset ->", current_offset, "length ->", length)
        print(" > 1252 = CP1252 (WinLatin1); 65001 = UTF-8")
        encoding_code, = struct.unpack_from(">L", file_data, current_offset)
        if encoding_code == 1252:
            text_encoding = "CP1252"
        elif encoding_code == 65001:
            text_encoding = "UTF-8"
        else:
            # BUGFIX: an unrecognized encoding code used to leave text_encoding
            # unset, crashing with NameError at the book-name decode below.
            text_encoding = "UTF-8"
        current_offset += length
        # Thirteen consecutive 4-byte unsigned fields share one dump pattern.
        for field_name in ("Unique-ID", "File version", "Ortographic index",
                           "Inflection index", "Index names", "Index keys",
                           "Extra index 0", "Extra index 1", "Extra index 2",
                           "Extra index 3", "Extra index 4", "Extra index 5",
                           "First Non-book index?"):
            length = 4
            print(" " + field_name + " ->", struct.unpack_from(">L", file_data, current_offset), "/ offset ->", current_offset, "length ->", length)
            current_offset += length
        length = 4
        print(" Full Name Offset ->", struct.unpack_from(">L", file_data, current_offset), "/ offset ->", current_offset, "length ->", length)
        full_name_offset, = struct.unpack_from(">L", file_data, current_offset)
        current_offset += length
        length = 4
        print(" Full Name Length ->", struct.unpack_from(">L", file_data, current_offset), "/ offset ->", current_offset, "length ->", length)
        full_name_length, = struct.unpack_from(">L", file_data, current_offset)
        current_offset += length
        # The full-name offset is relative to the start of record 0.
        full_name_offset += record_data['OFFSET']
        full_name_of_this_book = file_data[full_name_offset:full_name_offset + full_name_length].decode(text_encoding)
        print(" > Book name -> ", full_name_of_this_book)
        for field_name in ("Locale", "Input Language", "Output Language",
                           "Min version"):
            length = 4
            print(" " + field_name + " ->", struct.unpack_from(">L", file_data, current_offset), "/ offset ->", current_offset, "length ->", length)
            current_offset += length
        length = 4
        print(" First Image index ->", struct.unpack_from(">L", file_data, current_offset), "/ offset ->", current_offset, "length ->", length)
        first_image_index, = struct.unpack_from(">L", file_data, current_offset)
        # Peek at the first 11 bytes of the first image record (eyeball the magic).
        print(struct.unpack_from(">sssssssssss", file_data, record_dict[first_image_index]["OFFSET"]))
        current_offset += length
        # Skip the unparsed remainder of the MOBI header; EXTH (if present)
        # starts immediately after it.
        current_offset = header_last_offset
        length = 4
        if struct.unpack_from(">ssss", file_data, current_offset) == (b'E', b'X', b'T', b'H'):
            print()
            exth_header_exists = True
            print("> EXTH Header")
            print(" identifier ->", struct.unpack_from(">ssss", file_data, current_offset), "/ offset ->", current_offset, "length ->", length)
            current_offset += length
            length = 4
            print(" header length ->", struct.unpack_from(">L", file_data, current_offset), "/ offset ->", current_offset, "length ->", length)
            current_offset += length
            length = 4
            print(" record Count ->", struct.unpack_from(">L", file_data, current_offset), "/ offset ->", current_offset, "length ->", length)
            exth_record_count, = struct.unpack_from(">L", file_data, current_offset)
            current_offset += length
            # EXTH record types carrying UTF-8 text payloads...
            exth_text_labels = {
                123: "book_type", 504: "ASIN", 113: "ASIN",
                508: "Title_Furigana", 517: "Creator_Furigana",
                522: "Publisher_Furigana", 503: "Updated_Title",
                208: "Watermark", 104: "ISBN", 106: "Published",
                100: "Creator", 101: "Publisher", 102: "Imprint",
                103: "Description", 524: "Language", 129: "KF8 cover URI",
                536: "HD Media Containers Info", 542: "Container ID",
            }
            # ...and those carrying a single big-endian uint32.
            exth_uint_labels = {
                201: "Cover record offset from First Image record",
                202: "Thumbnail record offset from First Image record",
                121: "KF8 Boundary offset",
            }
            for exth_record_idx in range(0, exth_record_count):
                print()
                length = 4
                print("", exth_record_idx, "EXTH record type ->", struct.unpack_from(">L", file_data, current_offset), "/ offset ->", current_offset, "length ->", length)
                exth_record_type, = struct.unpack_from(">L", file_data, current_offset)
                current_offset += length
                length = 4
                print("", exth_record_idx, "EXTH record length ->", struct.unpack_from(">L", file_data, current_offset), "/ offset ->", current_offset, "length ->", length)
                exth_header_length, = struct.unpack_from(">L", file_data, current_offset)
                current_offset += length
                # The stored record length includes the 8 bytes of the
                # type + length fields themselves.
                length = exth_header_length - 8
                print("", exth_record_idx, "EXTH record data offset ->", current_offset, "length ->", length)
                if exth_record_type == 501:
                    # cdetype, printed raw (e.g. b'EBOK'); fixed "cotainer" typo.
                    print(" > container type ->", file_data[current_offset:current_offset + length])
                elif exth_record_type in exth_text_labels:
                    print(" > " + exth_text_labels[exth_record_type] + " ->", file_data[current_offset:current_offset + length].decode("utf-8"))
                elif exth_record_type in exth_uint_labels:
                    print(" > " + exth_uint_labels[exth_record_type] + " ->", struct.unpack_from(">L", file_data, current_offset))
                else:
                    print(" > Unknown", exth_record_type, "->", file_data[current_offset:current_offset + length])
                current_offset += length
            print()
    # NOTE(review): imghdr is deprecated since Python 3.11 and removed in 3.13;
    # replace with a magic-byte check when upgrading.
    import imghdr
    # BUGFIX: the last record has no successor; fall back to end-of-file
    # instead of raising KeyError on record_dict[idx + 1].
    if idx + 1 in record_dict:
        next_offset = record_dict[idx + 1]["OFFSET"]
    else:
        next_offset = len(file_data)
    magic4 = struct.unpack_from(">ssss", file_data, record_data["OFFSET"])
    if magic4 == (b'\xe9', b'\x8e', b'\r', b'\n'):
        # End-of-records marker.
        print()
        print("> End of Records")
        print(idx, record_data, "\r\n ", struct.unpack_from(">ssss", file_data, record_data["OFFSET"]))
    elif magic4 == (b'I', b'N', b'D', b'X'):
        # INDX index record: thirteen 4-byte fields after the 4-byte magic.
        print()
        print("> Index Record")
        print(idx, record_data, "\r\n ", struct.unpack_from(">ssssssssss", file_data, record_data["OFFSET"]))
        current_offset = record_data["OFFSET"] + 4
        for field_name in ("header length", "index type", "Unknown 1",
                           "Unknown 2", "IDXT start", "index count",
                           "index encoding", "index language",
                           "total index count", "ordt start", "ligt start",
                           "Unknown 3", "Unknown 4"):
            length = 4
            print(" " + field_name + " ->", struct.unpack_from(">L", file_data, current_offset), "/ offset ->", current_offset, "length ->", length)
            current_offset += length
    elif imghdr.what(None, file_data[record_data["OFFSET"]:next_offset]):
        # Image record: detect the format once and dump the payload to
        # ./<record-index>.<detected-extension>.
        image_kind = imghdr.what(None, file_data[record_data["OFFSET"]:next_offset])
        print(idx, "imghdr ->", image_kind, "\r\n ", file_data[record_data["OFFSET"]:record_data["OFFSET"]+10])
        with open('./' + str(idx) + '.' + image_kind, "wb") as file:
            file.write(file_data[record_data["OFFSET"]:next_offset])
    else:
        # Unrecognized record: show its dict entry and first 8 payload bytes.
        print(idx, record_data, "\r\n ", file_data[record_data["OFFSET"]:record_data["OFFSET"]+8])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment