Last active
March 22, 2020 03:11
-
-
Save squaresmile/cb31fd9d597af56518e5c7a4e72f3201 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai | |
import sys | |
import os, getopt, struct | |
import imghdr | |
def get_image_type(imgname, imgdata=None):
    """Identify the image format of a file or of an in-memory byte string.

    Wraps ``imghdr.what`` and applies two adjustments on top of it:

    * a result of "tiff" is reported as "wdp", because imghdr detects
      jxr/wdp data as tiff;
    * data imghdr cannot identify, but which starts with the bytes FF D8
      and (ignoring trailing NUL bytes) ends with FF D9, is reported as
      "jpeg".

    Args:
        imgname: path of the image file (or an open file object), or None
            when ``imgdata`` is supplied.
        imgdata: raw image bytes.  When None and ``imgname`` is a path, the
            file is read so the raw-marker fallback can still be applied.

    Returns:
        The detected type name as a string, or None when unrecognised.
    """
    imgtype = imghdr.what(imgname, imgdata)

    # horrible hack since imghdr detects jxr/wdp as tiffs
    if imgtype == "tiff":
        return "wdp"
    if imgtype is not None:
        return imgtype

    # imghdr only checks for JFIF or Exif JPEG files. Apparently, there are
    # some with only the magic JPEG bytes out there...
    # ImageMagick handles those, so, do it too.
    if imgdata is None:
        if hasattr(imgname, 'read'):
            # An open stream was sniffed by imghdr; do not consume it here.
            return None
        with open(imgname, 'rb') as stream:
            imgdata = stream.read()

    if imgdata[0:2] != b'\xFF\xD8':
        return None
    # Be extra safe, check the last non-null bytes for the end marker, too.
    if imgdata.rstrip(b'\x00')[-2:] == b'\xFF\xD9':
        return "jpeg"
    return None
def processCRES(i, data):
    """Write the resource held in a CRES section to ./azw6_images.

    The first 12 bytes of the section are skipped; the remainder is saved
    as ``HDimage<section>.<ext>``.  The extension comes from
    ``get_image_type`` ("jpeg" is shortened to "jpg"); unrecognised
    content is saved with the extension "dat" after printing a warning.

    Args:
        i: section number, used in the file name and in the messages.
        data: raw section bytes, including the 12-byte prefix.
    """
    payload = data[12:]
    imgtype = get_image_type(None, payload)
    if imgtype == "jpeg":
        imgtype = "jpg"
    elif imgtype is None:
        print(" Warning: CRES Section %s does not contain a recognised resource" % i)
        imgtype = "dat"

    imgname = "HDimage%05d.%s" % (i, imgtype)
    imgdir = os.path.join(".", "azw6_images")
    if not os.path.exists(imgdir):
        os.mkdir(imgdir)

    print(" Extracting HD image: {0:s} from section {1:d}".format(imgname, i))
    with open(os.path.join(imgdir, imgname), 'wb') as f:
        f.write(payload)
# this is just guesswork so far, making the big assumption that
# metavalue key numbers remain the same in the CONT EXTH
def dump_contexth(codec, extheader):
    """Print every metadata record of a CONT EXTH block to stdout.

    The block is read as: 4 bytes that are skipped, a big-endian 32-bit
    length, a big-endian 32-bit record count, then the records.  Each
    record is a 32-bit id, a 32-bit size (which includes these 8 prefix
    bytes) and ``size - 8`` bytes of content.  Depending on its id a
    record is printed as text, as a number, or as a hex string; unknown
    ids are printed as hex with a warning.

    Args:
        codec: name of the text encoding used to decode string records.
        extheader: raw bytes of the EXTH block; empty or shorter than the
            12-byte prefix means there is nothing to dump.
    """
    if not extheader or len(extheader) < 12:
        return
    id_map_strings = {
        1: 'Drm Server Id (1)',
        2: 'Drm Commerce Id (2)',
        3: 'Drm Ebookbase Book Id(3)',
        100: 'Creator_(100)',
        101: 'Publisher_(101)',
        102: 'Imprint_(102)',
        103: 'Description_(103)',
        104: 'ISBN_(104)',
        105: 'Subject_(105)',
        106: 'Published_(106)',
        107: 'Review_(107)',
        108: 'Contributor_(108)',
        109: 'Rights_(109)',
        110: 'SubjectCode_(110)',
        111: 'Type_(111)',
        112: 'Source_(112)',
        113: 'ASIN_(113)',
        114: 'versionNumber_(114)',
        117: 'Adult_(117)',
        118: 'Price_(118)',
        119: 'Currency_(119)',
        122: 'fixed-layout_(122)',
        123: 'book-type_(123)',
        124: 'orientation-lock_(124)',
        126: 'original-resolution_(126)',
        127: 'zero-gutter_(127)',
        128: 'zero-margin_(128)',
        129: 'K8_Masthead/Cover_Image_(129)',
        132: 'RegionMagnification_(132)',
        200: 'DictShortName_(200)',
        208: 'Watermark_(208)',
        501: 'cdeType_(501)',
        502: 'last_update_time_(502)',
        503: 'Updated_Title_(503)',
        504: 'ASIN_(504)',
        508: 'Unknown_Title_Furigana?_(508)',
        517: 'Unknown_Creator_Furigana?_(517)',
        522: 'Unknown_Publisher_Furigana?_(522)',
        524: 'Language_(524)',
        525: 'primary-writing-mode_(525)',
        526: 'Unknown_(526)',
        527: 'page-progression-direction_(527)',
        528: 'override-kindle_fonts_(528)',
        529: 'Unknown_(529)',
        534: 'Input_Source_Type_(534)',
        535: 'Kindlegen_BuildRev_Number_(535)',
        536: 'Container_Info_(536)',  # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?)
        538: 'Container_Resolution_(538)',
        539: 'Container_Mimetype_(539)',
        542: 'Unknown_but_changes_with_filename_only_(542)',
        543: 'Container_id_(543)',  # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER
        544: 'Unknown_(544)',
    }
    id_map_values = {
        115: 'sample_(115)',
        116: 'StartOffset_(116)',
        121: 'K8(121)_Boundary_Section_(121)',
        125: 'K8_Count_of_Resources_Fonts_Images_(125)',
        131: 'K8_Unidentified_Count_(131)',
        201: 'CoverOffset_(201)',
        202: 'ThumbOffset_(202)',
        203: 'Fake_Cover_(203)',
        204: 'Creator_Software_(204)',
        205: 'Creator_Major_Version_(205)',
        206: 'Creator_Minor_Version_(206)',
        207: 'Creator_Build_Number_(207)',
        401: 'Clipping_Limit_(401)',
        402: 'Publisher_Limit_(402)',
        404: 'Text_to_Speech_Disabled_(404)',
    }
    id_map_hexstrings = {
        209: 'Tamper_Proof_Keys_(209_in_hex)',
        300: 'Font_Signature_(300_in_hex)',
    }
    # Accepted record sizes for numeric values (8-byte prefix included),
    # each with the struct format of its payload and its print template.
    value_layouts = {
        9: ('B', '\n Key: "%s"\n Value: 0x%01x'),
        10: ('>H', '\n Key: "%s"\n Value: 0x%02x'),
        12: ('>L', '\n Key: "%s"\n Value: 0x%04x'),
    }

    def to_hex(raw):
        # bytearray yields ints on both Python 2 and Python 3.
        return ''.join('%02x' % byte for byte in bytearray(raw))

    def to_text(raw):
        text = raw.decode(codec)
        if str is bytes:
            # Python 2: print UTF-8 encoded bytes, as the script always did.
            text = text.encode('utf-8')
        return text

    _length, num_items = struct.unpack('>LL', extheader[4:12])
    records = extheader[12:]
    pos = 0
    for _ in range(num_items):
        if pos + 8 > len(records):
            print("\nError: EXTH data ends before all %d records were read" % num_items)
            break
        rec_id, size = struct.unpack('>LL', records[pos:pos + 8])
        if size < 8 or pos + size > len(records):
            # A size below the prefix length or past the end of the data
            # means the record table is corrupt; stop rather than misparse.
            print("\nError: EXTH record %s has invalid size %s" % (rec_id, size))
            break
        content = records[pos + 8: pos + size]
        if rec_id in id_map_strings:
            name = id_map_strings[rec_id]
            print('\n Key: "%s"\n Value: "%s"' % (name, to_text(content)))
        elif rec_id in id_map_values:
            name = id_map_values[rec_id]
            layout = value_layouts.get(size)
            if layout is None:
                print("\nError: Value for %s has unexpected size of %s" % (name, size))
            else:
                fmt, template = layout
                value, = struct.unpack(fmt, content)
                print(template % (name, value))
        elif rec_id in id_map_hexstrings:
            name = id_map_hexstrings[rec_id]
            print('\n Key: "%s"\n Value: 0x%s' % (name, to_hex(content)))
        else:
            print("\nWarning: Unknown metadata with id %s found" % rec_id)
            name = str(rec_id) + ' (hex)'
            print(' Key: "%s"\n Value: 0x%s' % (name, to_hex(content)))
        pos += size
    return
def sortedHeaderKeys(mheader):
    """Return the keys of ``mheader`` ordered by the first item of each value.

    Every value of ``mheader`` is a sequence whose element 0 is the sort
    key (the header tables in this file store the field offset there).
    """
    def first_item(field):
        return mheader[field][0]

    return sorted(mheader, key=first_item)
class dumpHeaderException(Exception):
    """Raised when the input file is not a valid container to dump."""
class PalmDB(object):
    """Read-only access to the sections (records) of a Palm database image.

    The whole file is held in memory as bytes.  The record count is a
    big-endian 16-bit value at offset 76 and the record table starts at
    offset 78 with 8 bytes per entry, the first 4 of which are the
    big-endian start offset of the section.
    """

    # important palmdb header offsets
    unique_id_seed = 68
    number_of_pdb_records = 76
    first_pdb_record = 78

    def __init__(self, palmdata):
        """Store ``palmdata`` (bytes) and read the number of sections."""
        self.data = palmdata
        self.nsec, = struct.unpack_from('>H', self.data, PalmDB.number_of_pdb_records)

    def getsecaddr(self, secno):
        """Return ``(start, end)`` byte offsets of section ``secno``.

        The end of a section is the start of the next one; the last
        section runs to the end of the data.  ``secno`` is not range
        checked here.
        """
        secstart, = struct.unpack_from('>L', self.data, PalmDB.first_pdb_record + secno * 8)
        if secno == self.nsec - 1:
            secend = len(self.data)
        else:
            secend, = struct.unpack_from('>L', self.data, PalmDB.first_pdb_record + (secno + 1) * 8)
        return secstart, secend

    def readsection(self, secno):
        """Return the bytes of section ``secno``, or ``b''`` if out of range."""
        # Reject negative numbers too: they would otherwise index the record
        # table backwards and return unrelated bytes.
        if 0 <= secno < self.nsec:
            secstart, secend = self.getsecaddr(secno)
            return self.data[secstart:secend]
        return b''

    def getnumsections(self):
        """Return the number of sections recorded in the header."""
        return self.nsec
class HdrParser(object):
    """Parser for the fixed header at the start of a CONT section.

    The first 48 bytes are decoded according to ``cont_header``; everything
    from byte 48 onwards is kept as ``exth``.  The title is sliced out of
    the header using the ``title_offset``/``title_length`` fields and
    decoded with the codec selected by the ``codepage`` field
    (windows-1252 unless the codepage is 65001, which selects utf-8).
    """

    # field name -> (offset within the header, struct format, width in bytes)
    cont_header = {
        'magic': (0x00, '4s', 4),
        'record_size': (0x04, '>L', 4),
        'type': (0x08, '>H', 2),
        'count': (0x0A, '>H', 2),
        'codepage': (0x0C, '>L', 4),
        'unknown0': (0x10, '>L', 4),
        'unknown1': (0x14, '>L', 4),
        'num_resc_recs': (0x18, '>L', 4),
        'num_wo_placeholders': (0x1C, '>L', 4),
        'offset_to_hrefs': (0x20, '>L', 4),
        'unknown2': (0x24, '>L', 4),
        'title_offset': (0x28, '>L', 4),
        'title_length': (0x2C, '>L', 4),
    }
    cont_header_sorted_keys = sortedHeaderKeys(cont_header)

    def __init__(self, header, start):
        """Decode ``header`` (bytes of the section); ``start`` is stored as given."""
        self.header = header
        self.start = start
        self.hdr = {}
        # set it up for the proper header version
        self.header_sorted_keys = HdrParser.cont_header_sorted_keys
        self.cont_header = HdrParser.cont_header
        # parse the header information
        for key in self.header_sorted_keys:
            (pos, fmt, _tot_len) = self.cont_header[key]
            if pos < 48:
                val, = struct.unpack_from(fmt, self.header, pos)
                self.hdr[key] = val
        self.exth = self.header[48:]
        self.title_offset = self.hdr['title_offset']
        self.title_length = self.hdr['title_length']
        raw_title = self.header[self.title_offset: self.title_offset + self.title_length]
        self.codec_map = {
            1252: 'windows-1252',
            65001: 'utf-8',
        }
        self.codec = self.codec_map.get(self.hdr['codepage'], 'windows-1252')
        title = raw_title.decode(self.codec)
        if str is bytes:
            # Python 2: keep the title as UTF-8 encoded bytes, as before.
            title = title.encode('utf-8')
        self.title = title

    def dumpHeaderInfo(self):
        """Print the header fields in offset order, the title and the EXTH records."""
        for key in self.cont_header_sorted_keys:
            (pos, _fmt, tot_len) = self.cont_header[key]
            if pos < 48:
                value = self.hdr[key]
                if key != 'magic':
                    fmt_string = " Field: %20s Offset: 0x%03x Width: %d Value: 0x%0" + str(tot_len) + "x"
                else:
                    fmt_string = " Field: %20s Offset: 0x%03x Width: %d Value: %s"
                    if not isinstance(value, str):
                        # Python 3: show the magic as text, not as b'...'.
                        value = value.decode('latin-1')
                print(fmt_string % (key, pos, tot_len, value))
        print("EXTH Region Length: 0x%0x" % len(self.exth))
        print("EXTH MetaData")
        print(self.title)
        dump_contexth(self.codec, self.exth)
        return
def usage(progname):
    """Print the command-line help text to stdout.

    Args:
        progname: program name shown in the usage line.
    """
    help_lines = [
        "",
        "Description:",
        " Dump the image from an AZW6 HD container file",
        " ",
        "Usage:",
        # -h is optional and on its own only prints this help, so it is
        # shown in brackets rather than as a required argument.
        " %s [-h] infile.azw6" % progname,
        " ",
        "Options:",
        " -h print this help message",
    ]
    print("\n".join(help_lines))
def main(argv=sys.argv):
    """Dump the header, metadata and section map of an AZW6 HD container.

    Prints the CONT header of section 0, then one line per Palm DB section;
    every CRES section is additionally handed to ``processCRES``, which
    extracts its image.

    Args:
        argv: full argument vector, program name first.

    Returns:
        0 on success, 1 when the input is rejected or processing fails.
        Exits through ``sys.exit`` with 2 on a usage error and with 0 after
        printing help for ``-h``.
    """
    print("DumpAZW6 v01")
    progname = os.path.basename(argv[0])
    try:
        # Parse the argv that was passed in, not the process-wide sys.argv.
        opts, args = getopt.getopt(argv[1:], "h")
    except getopt.GetoptError as err:
        print(str(err))
        usage(progname)
        sys.exit(2)

    # Honour -h before validating the positional arguments, so that
    # "prog -h" on its own is a successful help request.
    for opt, _ in opts:
        if opt == "-h":
            usage(progname)
            sys.exit(0)

    if len(args) != 1:
        usage(progname)
        sys.exit(2)

    infile = args[0]
    infileext = os.path.splitext(infile)[1].upper()
    print("%s %s" % (infile, infileext))
    if infileext not in ('.AZW6', '.RES'):
        print("Error: first parameter must be a Kindle AZW6 HD container file.")
        return 1

    def to_hex(raw):
        # bytearray yields ints on both Python 2 and Python 3.
        return ''.join('%02x' % byte for byte in bytearray(raw))

    def to_text(raw):
        # Python 2 str is already printable as-is; on Python 3 map each byte
        # to one character so arbitrary binary data can be shown.
        return raw if isinstance(raw, str) else raw.decode('latin-1')

    try:
        # make sure it is really an hd container file
        with open(infile, 'rb') as f:
            contdata = f.read()
        ident = contdata[0x3C:0x3C + 8]
        if ident != b'RBINCONT':
            raise dumpHeaderException('invalid file format')

        pp = PalmDB(contdata)
        header = pp.readsection(0)
        print("\n\nFirst Header Dump from Section %d" % 0)
        hp = HdrParser(header, 0)
        hp.dumpHeaderInfo()

        # now dump a basic sector map of the palmdb
        n = pp.getnumsections()
        dtmap = {
            b"FONT": "FONT",
            b"RESC": "RESC",
            b"CRES": "CRES",
            b"CONT": "CONT",
            b"\xa0\xa0\xa0\xa0": "Empty_Image/Resource_Placeholder",
            b"\xe9\x8e\r\n": "EOF_RECORD",
        }
        embed_tag = b"kindle:embed"

        print("\nMap of Palm DB Sections")
        print(" Dec - Hex : Description")
        print(" ---- - ---- -----------")
        for i in range(n):
            data = pp.readsection(i)
            dlen = len(data)
            dt = data[0:4]
            dtext = data[0:12]
            if dtext == embed_tag:
                # The section is a '|'-separated list of hrefs; list each
                # non-empty one on its own line.
                linkhrefs = [" " + to_text(href) for href in data.split(b'|') if href]
                desc = "\n" + "\n".join(linkhrefs)
            elif dt in dtmap:
                desc = dtmap[dt]
                if dt == b"CONT":
                    desc = "Cont Header"
                elif dt == b"CRES":
                    processCRES(i, data)
            else:
                desc = to_hex(dtext) + " " + to_text(dtext)
            print(" %04d - %04x: %s [%d]" % (i, i, desc, dlen))
    except Exception as e:
        print("Error: %s" % e)
        return 1
    return 0
if __name__ == '__main__':
    # Run the tool and use main()'s return value as the process exit status.
    raise SystemExit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment