public
Created

Decoding #pdfloc for highlighted text in the Sony PRS-T1

  • Download Gist
highlightedtext.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88
#!/usr/bin/env python
 
"""
%s mount-point
 
Print to a file information on the highlighted text of a selected file on
the reader at mount-point. The output file is tab-separated data of the
form:
page highlight-range mark-start mark-end mark-type marked-text
mark-start and mark-end are the 'pdfloc' data defining where the marked
text starts and stops. Some of this information is extracted into the
highlight-range field, which has format:
c,d,e,g -> C,D,E,G
We think 'c' (and 'C') tell us the text line, 'd' and 'e' tell us the
character, and 'g' is some kind of flag, but we don't understand how this
information is encoded. If you figure something out, please tell us:
https://github.com/rschroll/prsannots/issues/4
"""
 
# Copyright 2012 Robert Schroll
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program. If not, see
# <http://www.gnu.org/licenses/>.
 
import os
import sys
from prsannots.prst1 import Reader
 
# Names written to the output file for the numeric markup_type codes read
# from the reader's annotation table.
MT = {10: 'highlight', 11: 'text', 12: 'drawing'}
 
def u_raw_input(prompt):
    """Prompt the user and return the reply as a unicode string.

    The prompt is encoded with stdout's encoding before it is passed to
    raw_input, and the reply is decoded with stdin's encoding.

    The encoding attribute of sys.stdout / sys.stdin is None when the
    stream is not attached to a terminal (for example when piped or
    redirected); passing None to encode()/decode() raises TypeError, so
    UTF-8 is used as the fallback in that case.
    """
    out_encoding = sys.stdout.encoding or 'utf-8'
    in_encoding = sys.stdin.encoding or 'utf-8'
    reply = raw_input(prompt.encode(out_encoding))
    return reply.decode(in_encoding)
 
def select_book(books):
    """Show a numbered menu of books and return the one the user picks.

    Each entry is listed by its title, or by the last component of its
    file path when the title is empty.  The user answers with the
    1-based number shown in the menu.

    If the answer is not a number, or is outside the range of the menu,
    an error message is printed and the program exits with status 1.
    """
    print("Please select which book to get:")
    for i, book in enumerate(books):
        title = book.title or book.file.split('/')[-1]
        print(" %i. %s" % (i+1, title))
    which = u_raw_input("> ")
    try:
        index = int(which) - 1
        # A reply of 0 or a negative number would otherwise be accepted as
        # a negative list index and silently pick a book counted from the
        # end of the list; treat it as an invalid response instead.
        if index < 0:
            raise IndexError(which)
        return books[index]
    except (ValueError, IndexError):
        print("Could not understand your response. Aborting.")
        sys.exit(1)
 
def main(path):
    """Write the annotations of a user-selected book to a tab-separated file.

    path is handed to Reader; the user then chooses one of reader.books
    and an output file name (defaulting to the book's file name with a
    '.txt' extension, in the current directory).

    One line is written per row of the annotation table for that book,
    ordered by page, with the fields:
        page  highlight-range  mark-start  mark-end  mark-type  marked-text
    where highlight-range has the form 'c,d,e,g -> C,D,E,G', the lower-case
    values coming from the start mark and the upper-case ones from the end
    mark.
    """
    reader = Reader(path)
    book = select_book(reader.books)
    c = reader.db.cursor()
    c.execute('''select page, mark, mark_end, markup_type, marked_text
from annotation
where content_id = ?
order by page''', (book.id,))
    outfn = os.path.splitext(os.path.basename(book.file))[0] + '.txt'
    userfn = u_raw_input("Enter output file name [%s]: " % outfn)
    if userfn:
        outfn = userfn
    # The with-block closes the file even if formatting a row raises.
    with open(outfn, 'w') as f:
        for line in c:
            # Drop the first 8 and last 2 characters of each mark
            # (presumably the '#pdfloc(' wrapper -- TODO confirm) and split
            # the remainder into its comma-separated fields.
            start, end = [s[8:-2].split(',') for s in line[1:3]]
            # Fields 2-4 and 6 of each mark make up c,d,e,g.  The final
            # value must come from the end mark; it was previously taken
            # from the start mark a second time.
            hls = '%s,%s,%s,%s -> %s,%s,%s,%s' % tuple(
                start[2:5] + start[6:7] + end[2:5] + end[6:7])
            # NOTE(review): marked_text may be NULL in the database for
            # some rows -- confirm; an empty field is written in that case
            # rather than failing on None.encode().
            text = line[4] or u''
            f.write('%i\t%s\t%s\t%s\t%s\t%s\n' % (
                int(line[0]+1),      # page as stored in the table, plus one
                hls,
                line[1][:-1],        # marks are written minus their last character
                line[2][:-1],
                MT[line[3]],
                text.encode('utf-8')))
 
if __name__ == '__main__':
    # Exactly one argument is expected: the reader's mount point.
    script_name = sys.argv[0]
    arguments = sys.argv[1:]
    if len(arguments) != 1:
        # The module docstring doubles as the usage text; its %s is filled
        # with the name the script was invoked as.
        print(__doc__ % script_name)
        sys.exit(0)
    mount_point = arguments[0]
    if os.path.ismount(mount_point):
        main(mount_point)
    else:
        print("First argument must be mount point of Sony Reader.")
        print("(%s does not appear to be a mount point.)" % mount_point)
        sys.exit(1)

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.