Skip to content

Instantly share code, notes, and snippets.

@cdrini
Last active July 14, 2023 16:28
Show Gist options
  • Save cdrini/482fad6b0657e366fca3c1a83a259da8 to your computer and use it in GitHub Desktop.
Save cdrini/482fad6b0657e366fca3c1a83a259da8 to your computer and use it in GitHub Desktop.
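"""
Print selected pages of a DjVu XML file, one page per output line.

Examples (the `--` keeps argparse from treating ranges that start with `-`,
such as `-5:`, as options; pass `--truncate N` before the `--`):
# Get first 20 pages
python djvu_to_single_line.py ~/Downloads/isbn_9781531610494_djvu.xml -- :20
# Get first 20 pages and last 5 pages
python djvu_to_single_line.py ~/Downloads/isbn_9781531610494_djvu.xml -- :20 -5:
"""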
import xml.etree.ElementTree as ET
from typing import List
def page_to_line(page_xml_tree):
    r"""
    Convert one DjVu XML page (an ``<OBJECT>`` element) into a single line of text.

    The page number is taken from the ``PARAM`` named ``PAGE``: the part of its
    ``value`` after the last ``_`` and before the first ``.``. Every ``LINE`` is
    rendered as its words joined by spaces, followed by a space and a literal
    ``\n`` (backslash + ``n``, not a newline character); every ``PARAGRAPH`` is
    closed with one additional literal ``\n``.

    Convert e.g.

        <OBJECT data="file://localhost/var/tmp/autoclean/derive/isbn_9781531610494/isbn_9781531610494.djvu" type="image/x.djvu" usemap="isbn_9781531610494_0003.djvu" width="2103" height="3347">
        <PARAM name="PAGE" value="isbn_9781531610494_0003.djvu" />
        <PARAM name="DPI" value="360" />
        <HIDDENTEXT>
        <PAGECOLUMN>
        <REGION>
        <PARAGRAPH>
        <LINE>
        <WORD coords="467,1604,779,1525" x-confidence="96">Digitized</WORD>
        <WORD coords="811,1604,893,1525" x-confidence="96">by</WORD>
        <WORD coords="919,1588,1031,1525" x-confidence="96">the</WORD>
        <WORD coords="1066,1588,1340,1525" x-confidence="94">Internet</WORD>
        <WORD coords="1366,1588,1639,1525" x-confidence="96">Archive</WORD>
        </LINE>
        <LINE>
        <WORD coords="586,1695,641,1634" x-confidence="84">in</WORD>
        <WORD coords="673,1697,854,1635" x-confidence="94">2022</WORD>
        <WORD coords="882,1697,1025,1634" x-confidence="96">with</WORD>
        <WORD coords="1055,1713,1325,1634" x-confidence="96">funding</WORD>
        <WORD coords="1355,1697,1516,1634" x-confidence="96">from</WORD>
        </LINE>
        <LINE>
        <WORD coords="601,1806,1059,1743" x-confidence="91">Kahle/Austin</WORD>
        <WORD coords="1095,1806,1502,1743" x-confidence="96">Foundation</WORD>
        </LINE>
        </PARAGRAPH>
        </REGION>
        </PAGECOLUMN>
        <PAGECOLUMN>
        <REGION>
        <PARAGRAPH>
        <LINE>
        <WORD coords="190,3188,1502,3109" x-confidence="67">https://archive.org/details/ison_</WORD>
        <WORD coords="1517,3172,1916,3110" x-confidence="96">9781531610494</WORD>
        </LINE>
        </PARAGRAPH>
        </REGION>
        </PAGECOLUMN>
        </HIDDENTEXT>
        </OBJECT>

    to:

        0003: Digitized by the Internet Archive \nin 2022 with funding from \nKahle/Austin Foundation \n\nhttps://archive.org/details/ison_ 9781531610494 \n\n

    :param page_xml_tree: The ``OBJECT`` element of one page.
    :return: ``"<page number>: <text>"``. The text is empty when the page has
        no ``HIDDENTEXT`` element.
    :raises AttributeError: If the page has no ``PARAM`` named ``PAGE``.
    """
    page_number = (
        page_xml_tree.find('PARAM[@name="PAGE"]').attrib['value']
        .split('_')[-1]
        .split('.')[0]
    )

    hidden_text = page_xml_tree.find('HIDDENTEXT')
    if hidden_text is None:
        # A page without any text layer still gets its own (empty) line.
        return f'{page_number}: '

    parts = []
    for paragraph in hidden_text.iterfind('PAGECOLUMN/REGION/PARAGRAPH'):
        for line in paragraph.iterfind('LINE'):
            # An empty <WORD/> has text None; skip it so the join cannot fail.
            words = [word.text for word in line.iterfind('WORD') if word.text]
            # '\\n' is a two-character marker, keeping the page on one line.
            parts.append(' '.join(words) + ' ' + '\\n')
        parts.append('\\n')

    text = ''.join(parts)
    return f'{page_number}: {text}'
def extract_pages(xml_file: str, ranges: List[str], truncate=500) -> str:
    """
    Return single-line representations of the pages in the given ranges as a string.

    Pages are the ``OBJECT`` elements of the file in document order. They are
    addressed by 0-based position with Python slice semantics: the end is
    exclusive and negative numbers count from the end.

    Ranges can be of the form:
    - :10 (first 10 pages)
    - 10: (the page at index 10 and everything after it)
    - 10:20 (indexes 10 up to, but not including, 20)
    - 10 (just the page at index 10)
    - -10: (last 10 pages)
    - -1 (just the last page)

    :param xml_file: DjVu XML file to parse (passed to ``ET.parse``).
    :param ranges: Range expressions as described above; pages are emitted in
        the order the ranges are given, so overlapping ranges repeat pages.
    :param truncate: Lines longer than this are cut down to their first and
        last halves (``truncate`` characters in total) joined by ``[...]``.
    :return: One line per selected page, each terminated by a newline.
    :raises ValueError: If a range contains something that is not an integer.
    """
    pages = ET.parse(xml_file).getroot().findall('.//OBJECT')

    # Split the budget so head + tail == truncate, also for odd values.
    head_len = truncate // 2
    tail_len = truncate - head_len

    output_lines = []
    for page_range in ranges:
        if ':' in page_range:
            start, _, end = page_range.partition(':')
            selection = slice(
                int(start) if start else None,
                int(end) if end else None,
            )
        else:
            index = int(page_range)
            # A bare -1 must not turn into the empty slice [-1:0].
            selection = slice(index, index + 1 if index != -1 else None)

        for page in pages[selection]:
            line = page_to_line(page)
            if len(line) > truncate:
                # line[-0:] would be the whole line, so guard a zero-length tail.
                tail = line[-tail_len:] if tail_len else ''
                line = line[:head_len] + '[...]' + tail
            output_lines.append(line + '\n')
    return ''.join(output_lines)
if __name__ == '__main__':
    # Command-line entry point: parse the arguments, then print the selected
    # pages (one per line) to stdout.
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument('xml_file')
    cli.add_argument('ranges', nargs='+')
    cli.add_argument('--truncate', type=int, default=500)
    options = cli.parse_args()

    rendered = extract_pages(
        options.xml_file,
        options.ranges,
        options.truncate,
    )
    print(rendered)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment