Last active
December 19, 2019 13:45
-
-
Save choryuidentify/1e6f3117e4f936e52ff9d8445d08cb6a to your computer and use it in GitHub Desktop.
MobiPocket analyzer; the target file is in KF8 format.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import locale, struct  # 'locale' kept from the original file; struct does all the binary parsing
import imghdr  # NOTE(review): deprecated since Python 3.11, removed in 3.13 -- hoisted out of the record loop

#################################################################################
#                                                                               #
#                        MOBI HEADER PARSER - EXAMPLE                           #
#                                                                               #
#                      2019 Choryu Park (Kyoungkyu Park)                        #
#                                                                               #
#################################################################################
# This dirty source code is not a perfect implementation of MOBI parsing.
# It targets KF8 MOBI eBooks.  You may need to modify the code for your needs.
#
# This source code is based on these documents:
#   https://wiki.mobileread.com/wiki/MOBI
#   https://wiki.mobileread.com/wiki/PDB
#   https://www.mobileread.com/forums/showpost.php?p=3114050&postcount=1145

with open("target_file_2.azw3", "rb") as fi:
    file_data = fi.read()


class Cursor:
    """A read position inside file_data.

    Every read helper prints the field in the common
    '<label> -> <value> / offset -> <o> length -> <n>' form used throughout
    this dump, then advances the offset past the field.
    """

    def __init__(self, offset=0):
        self.offset = offset

    def _emit(self, label, length, value):
        # Print before advancing so the reported offset is the field's start.
        print(" " + label, "->", value, "/ offset ->", self.offset, "length ->", length)
        self.offset += length
        return value

    def u16(self, label):
        """Read, print and return a big-endian unsigned 16-bit field."""
        return self._emit(label, 2, struct.unpack_from(">H", file_data, self.offset))[0]

    def u32(self, label):
        """Read, print and return a big-endian unsigned 32-bit field."""
        return self._emit(label, 4, struct.unpack_from(">L", file_data, self.offset))[0]

    def raw(self, label, length):
        """Read, print and return `length` raw bytes."""
        return self._emit(label, length, file_data[self.offset:self.offset + length])

    def text(self, label, length, encoding="CP1252"):
        """Read, print and return `length` bytes decoded as text (NUL padding stripped)."""
        # Fixed: the original sliced file_data[offset:length] instead of
        # [offset:offset+length]; it only worked because the name is at offset 0.
        value = file_data[self.offset:self.offset + length].decode(encoding).rstrip("\x00")
        return self._emit(label, length, value)


# ---------------------------------------------------------------- PDB header --
print("PalmDOC HEADER")
cur = Cursor()
cur.text("name", 32)                 # database name, NUL padded
cur.u16("attribute")
cur.u16("version")
cur.u32("creation date")
cur.u32("modification date")
cur.u32("last backup date")
cur.u32("modificationNumber")
cur.u32("appInfoID")
cur.u32("sortInfoID")
cur.raw("type", 4)                   # b'BOOK' for eBooks
cur.raw("creator", 4)                # b'MOBI' for MobiPocket
cur.u32("uniqueIDseed")
cur.u32("nextRecordListID")
records = cur.u16("number of Records")

# ----------------------------------------------------------- PDB record list --
print()
record_list = []                     # one entry per PDB record: OFFSET / ATTR / UNIQUE
for _ in range(records):
    rec_offset, = struct.unpack_from(">L", file_data, cur.offset)
    cur.offset += 4
    attr = file_data[cur.offset:cur.offset + 1]
    cur.offset += 1
    unique = file_data[cur.offset:cur.offset + 3]
    cur.offset += 3
    record_list.append({"OFFSET": rec_offset, "ATTR": attr, "UNIQUE": unique})
print()


def record_end(idx):
    """Offset one past the end of record `idx`: the next record's start, or EOF.

    Fixes a KeyError in the original, which unconditionally looked up the
    next record even for the last one.
    """
    if idx + 1 < len(record_list):
        return record_list[idx + 1]["OFFSET"]
    return len(file_data)


# EXTH records whose payload is UTF-8 text, keyed by record type.
EXTH_TEXT_FIELDS = {
    100: "Creator", 101: "Publisher", 102: "Imprint", 103: "Description",
    104: "ISBN", 106: "Published", 113: "ASIN", 123: "book_type",
    129: "KF8 cover URI", 208: "Watermark", 503: "Updated_Title",
    504: "ASIN", 508: "Title_Furigana", 517: "Creator_Furigana",
    522: "Publisher_Furigana", 524: "Language",
    536: "HD Media Containers Info", 542: "Container ID",
}
# EXTH records whose payload is a single big-endian uint32.
EXTH_U32_FIELDS = {
    121: "KF8 Boundary offset",      # original output misspelled this "Boundery"
    201: "Cover record offset from First Image record",
    202: "Thumbnail record offset from First Image record",
}

# Default so a missing/unknown encoding field no longer raises NameError later;
# the MOBI header below overwrites this when it names a known encoding.
text_encoding = "UTF-8"

for idx, record_data in enumerate(record_list):
    start = record_data["OFFSET"]
    magic = file_data[start:start + 4]

    if idx == 0:
        # ------------------------------------------- PalmDOC record 0 header --
        print("> PalmDOC Record 0 Header")
        cur = Cursor(start)
        cur.u16("Compression")
        print("  > 1 == no compression, 2 = PalmDOC compression, 17480 = HUFF/CDIC compression")
        cur.u16("Unused")
        cur.u32("text length")
        print("  > Uncompressed length of the entire text of the book")
        cur.u16("record count")
        print("  > Number of PDB records used for the text of the book")
        cur.u16("record size")
        # Fixed explanation: the original described this as the current reading
        # position, which is a different field in the spec.
        print("  > Maximum size of each text record (usually 4096)")
        cur.u16("Encryption Type")
        print("  > 0 = no encryption, 1 = Old Mobipocket Encryption, 2 = Mobipocket Encryption")
        cur.u16("Unknown")
        print()

        # ---------------------------------------------------- MOBI header ----
        print("> MOBI Header")
        mobi_start = cur.offset          # the header length counts from here
        cur.raw("identifier", 4)         # b'MOBI' (original printed a 4-tuple of bytes)
        header_length = cur.u32("header length")
        header_end = mobi_start + header_length   # first byte after the MOBI header
        cur.u32("Mobi type")
        print("  > 2 = Mobipocket Book, 232 = mobipocket? generated by kindlegen1.2, 248 = KF8: generated by kindlegen2")
        encoding_code = cur.u32("text Encoding")
        print("  > 1252 = CP1252 (WinLatin1); 65001 = UTF-8")
        if encoding_code == 1252:
            text_encoding = "CP1252"
        elif encoding_code == 65001:
            text_encoding = "UTF-8"
        cur.u32("Unique-ID")
        cur.u32("File version")
        cur.u32("Ortographic index")
        cur.u32("Inflection index")
        cur.u32("Index names")
        cur.u32("Index keys")
        for extra in range(6):
            cur.u32("Extra index " + str(extra))
        cur.u32("First Non-book index?")
        full_name_offset = cur.u32("Full Name Offset")   # relative to record 0
        full_name_length = cur.u32("Full Name Length")
        name_start = start + full_name_offset
        print("  > Book name -> ", file_data[name_start:name_start + full_name_length].decode(text_encoding))
        cur.u32("Locale")
        cur.u32("Input Language")
        cur.u32("Output Language")
        cur.u32("Min version")
        first_image_index = cur.u32("First Image index")
        # Fixed: guard the lookup -- 0xFFFFFFFF (or any out-of-range value)
        # means the book has no image records.
        if first_image_index < len(record_list):
            img_off = record_list[first_image_index]["OFFSET"]
            print(file_data[img_off:img_off + 11])       # peek at the first image record

        # Skip the remaining MOBI header fields straight to its end, where the
        # optional EXTH header begins.
        cur.offset = header_end

        # ---------------------------------------------------- EXTH header ----
        if file_data[cur.offset:cur.offset + 4] == b"EXTH":
            print()
            print("> EXTH Header")
            cur.raw("identifier", 4)
            cur.u32("header length")
            exth_record_count = cur.u32("record Count")
            for exth_idx in range(exth_record_count):
                print()
                rec_type, = struct.unpack_from(">L", file_data, cur.offset)
                print("", exth_idx, "EXTH record type ->", (rec_type,), "/ offset ->", cur.offset, "length ->", 4)
                cur.offset += 4
                rec_len, = struct.unpack_from(">L", file_data, cur.offset)
                print("", exth_idx, "EXTH record length ->", (rec_len,), "/ offset ->", cur.offset, "length ->", 4)
                cur.offset += 4
                data_len = rec_len - 8   # record length includes the 8-byte type+length prefix
                print("", exth_idx, "EXTH record data offset ->", cur.offset, "length ->", data_len)
                payload = file_data[cur.offset:cur.offset + data_len]
                if rec_type in EXTH_TEXT_FIELDS:
                    print("  >", EXTH_TEXT_FIELDS[rec_type], "->", payload.decode("utf-8"))
                elif rec_type in EXTH_U32_FIELDS:
                    print("  >", EXTH_U32_FIELDS[rec_type], "->", struct.unpack_from(">L", payload))
                elif rec_type == 501:
                    # Container type is not guaranteed to be text; dump raw bytes.
                    # (Also fixes the "cotainer" typo in the original output.)
                    print("  > container type ->", payload)
                else:
                    print("  > Unknown", rec_type, "->", payload)
                cur.offset += data_len
            print()

    elif magic == b"\xe9\x8e\r\n":
        # PDB end-of-records sentinel record.
        print()
        print("> End of Records")
        print(idx, record_data, "\r\n ", magic)

    elif magic == b"INDX":
        # --------------------------------------------------- index record ----
        print()
        print("> Index Record")
        print(idx, record_data, "\r\n ", file_data[start:start + 10])
        cur = Cursor(start + 4)          # skip the 4-byte 'INDX' magic
        cur.u32("header length")
        cur.u32("index type")
        cur.u32("Unknown 1")
        cur.u32("Unknown 2")
        cur.u32("IDXT start")
        cur.u32("index count")
        cur.u32("index encoding")
        cur.u32("index language")
        cur.u32("total index count")
        cur.u32("ordt start")
        cur.u32("ligt start")
        cur.u32("Unknown 3")
        cur.u32("Unknown 4")

    else:
        # Anything else: dump recognizable images to disk, otherwise show the
        # first few bytes so the record can be identified by hand.
        end = record_end(idx)
        kind = imghdr.what(None, file_data[start:end])
        if kind:
            print(idx, "imghdr ->", kind, "\r\n ", file_data[start:start + 10])
            with open("./" + str(idx) + "." + kind, "wb") as out:
                out.write(file_data[start:end])
        else:
            print(idx, record_data, "\r\n ", file_data[start:start + 8])
        # with open('./' + str(idx) + '-sec.dump', "wb") as out:
        #     out.write(file_data[start:end])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment