Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Part of the xml output from pdftoxml against the NYT crossword PDF download from 2021-08-06
<?xml version="1.0" encoding="utf-8"?>
<!-- This is an excerpt from the output of pdftoxml against the NYT crossword PDF download from 2021-08-06.
-->
<!-- The XML header. Get the page width from the width attribute of the PAGE element; you can use it to separate clues from boxes.
Note that all PDF coordinates are in points (72 points to the inch). -->
<DOCUMENT>
<!-- Conversion provenance: the source PDF file name, the converting tool (PROCESS name) with its
VERSION value, and a CREATIONDATE timestamp. -->
<METADATA>
<PDFFILENAME>Aug0621.pdf</PDFFILENAME>
<PROCESS name="pdftoxml" cmd="">
<VERSION value="2.0">
<COMMENT />
</VERSION>
<CREATIONDATE>Fri Aug 6 09:23:54 2021</CREATIONDATE>
</PROCESS>
</METADATA>
<!-- Page 1 is 612 x 792 (8.5 x 11 inches at 72 points to the inch). All five boxes below
(media, crop, bleed, art, trim) span the full page, from (0, 0) to (612, 792). -->
<PAGE width="612" height="792" number="1" id="p1">
<MEDIABOX x1="0" y1="0" x2="612" y2="792" />
<CROPBOX x1="0" y1="0" x2="612" y2="792" />
<BLEEDBOX x1="0" y1="0" x2="612" y2="792" />
<ARTBOX x1="0" y1="0" x2="612" y2="792" />
<TRIMBOX x1="0" y1="0" x2="612" y2="792" />
<!-- snip many lines -->
<!-- This is clue #1. Its first TOKEN has the text "1" (the clue number) and is in the bold font
"nytfranklin". The remaining TOKENs are the words of the clue in font "arialunicodems".
The clue wraps across three TEXT elements (p1_t59 to p1_t61), one per printed line; the
continuation lines start at x="40.0301", aligned with the first clue word rather than with the
clue number at x="30.1301". -->
<TEXT width="71.038" height="9.64818" id="p1_t59" x="30.1301" y="65.7503">
<TOKEN sid="p1_s998" id="p1_w69" font-name="nytfranklin" bold="yes" italic="no" font-size="9.9" font-color="#000000" rotation="0" angle="0" x="30.1301" y="66.6511" base="73.6702" width="5.643" height="7.1676">1</TOKEN>
<TOKEN sid="p1_s999" id="p1_w70" font-name="arialunicodems" bold="no" italic="no" font-size="11" font-color="#000000" rotation="0" angle="0" x="40.0301" y="65.7503" base="74.0003" width="22" height="10.549">Fruit</TOKEN>
<TOKEN sid="p1_s1000" id="p1_w71" font-name="arialunicodems" bold="no" italic="no" font-size="11" font-color="#000000" rotation="0" angle="0" x="65.0881" y="65.7503" base="74.0003" width="23.848" height="10.549">used</TOKEN>
<TOKEN sid="p1_s1001" id="p1_w72" font-name="arialunicodems" bold="no" italic="no" font-size="11" font-color="#000000" rotation="0" angle="0" x="91.9941" y="65.7503" base="74.0003" width="9.174" height="10.549">to</TOKEN>
</TEXT>
<TEXT width="81.312" height="10.549" id="p1_t60" x="40.0301" y="79.5804">
<TOKEN sid="p1_s1002" id="p1_w73" font-name="arialunicodems" bold="no" italic="no" font-size="11" font-color="#000000" rotation="0" angle="0" x="40.0301" y="79.5804" base="87.8304" width="26.895" height="10.549">flavor</TOKEN>
<TOKEN sid="p1_s1003" id="p1_w74" font-name="arialunicodems" bold="no" italic="no" font-size="11" font-color="#000000" rotation="0" angle="0" x="69.9831" y="79.5804" base="87.8304" width="15.29" height="10.549">the</TOKEN>
<TOKEN sid="p1_s1004" id="p1_w75" font-name="arialunicodems" bold="no" italic="no" font-size="11" font-color="#000000" rotation="0" angle="0" x="88.3311" y="79.5804" base="87.8304" width="33.011" height="10.549">liqueur</TOKEN>
</TEXT>
<TEXT width="42.801" height="10.549" id="p1_t61" x="40.0301" y="93.4105">
<TOKEN sid="p1_s1005" id="p1_w76" font-name="arialunicodems" bold="no" italic="no" font-size="11" font-color="#000000" rotation="0" angle="0" x="40.0301" y="93.4105" base="101.66" width="42.801" height="10.549">patxaran</TOKEN>
</TEXT>
<!-- snip many more lines -->
<!-- This is box #1 (a number label belonging to the grid, as opposed to a clue). The text content
is a plain number and the font is "arialunicodems", here at font-size 7.68 versus 11 for the
clue words above. To confirm, check that the x location is greater than 1/3 of the page width
(here 264.89 > 612 / 3 = 204).
NOTE(review): the ids here (p1_t3, p1_w13) are lower than those of the clue above, so in the
full file this element presumably appears before the clue TEXT elements; confirm against the
complete output. -->
<TEXT width="4.27008" height="7.36512" id="p1_t3" x="264.89" y="50.83">
<TOKEN sid="p1_s20" id="p1_w13" font-name="arialunicodems" bold="no" italic="no" font-size="7.68" font-color="#000000" rotation="0" angle="0" x="264.89" y="50.83" base="56.59" width="4.27008" height="7.36512">1</TOKEN>
</TEXT>
<!-- snip many more lines -->
<!-- This include goes to a "vectorial image" in XML format, which is very similar to SVG.
See https://sourceforge.net/p/pdf2xml/discussion/714681/thread/dd35f74f/
The href is a relative path to a separate .vec file, so the image content is not inlined in this
document. That file needs to be parsed to get the filled and non-numbered empty squares in the grid.
-->
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="Aug0621.xml_data/image-1.vec" />
<!-- The XML footer: closes the PAGE and DOCUMENT elements. -->
</PAGE>
</DOCUMENT>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment