Skip to content

Instantly share code, notes, and snippets.

@rpresser
Created August 6, 2021 13:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rpresser/e7d22b0e7a3e84927fdcd46dec0ee068 to your computer and use it in GitHub Desktop.
Save rpresser/e7d22b0e7a3e84927fdcd46dec0ee068 to your computer and use it in GitHub Desktop.
Part of the xml output from pdftoxml against the NYT crossword PDF download from 2021-08-06
<?xml version="1.0" encoding="utf-8"?>
<!-- This is an excerpt from the output of pdftoxml against the NYT crossword PDF download from 2021-08-06.
-->
<!-- The XML header. Read the page width from the "width" attribute of the PAGE element; it can be used to tell clue text apart from grid box numbers.
Note that all PDF coordinates are in points (72 points to the inch). -->
<DOCUMENT>
<!-- Conversion metadata: the source PDF file name, the producing tool (PROCESS name and VERSION),
and the creation timestamp. -->
<METADATA>
<PDFFILENAME>Aug0621.pdf</PDFFILENAME>
<PROCESS name="pdftoxml" cmd="">
<VERSION value="2.0">
<COMMENT />
</VERSION>
<CREATIONDATE>Fri Aug 6 09:23:54 2021</CREATIONDATE>
</PROCESS>
</METADATA>
<!-- Page 1 is 612 x 792. The five page boxes below (media, crop, bleed, art, trim) all span
that same full rectangle, from (0, 0) to (612, 792). -->
<PAGE width="612" height="792" number="1" id="p1">
<MEDIABOX x1="0" y1="0" x2="612" y2="792" />
<CROPBOX x1="0" y1="0" x2="612" y2="792" />
<BLEEDBOX x1="0" y1="0" x2="612" y2="792" />
<ARTBOX x1="0" y1="0" x2="612" y2="792" />
<TRIMBOX x1="0" y1="0" x2="612" y2="792" />
<!-- snip many lines -->
<!-- This is clue #1. It spans three TEXT elements, each at a different y position (65.7503, 79.5804, 93.4105).
The first TOKEN has the text "1" and is in the bold font "nytfranklin".
The remaining TOKENs are the words of the clue in font "arialunicodems".
The second and third TEXT elements start at x="40.0301", the same x as the first clue word ("Fruit"),
not at the x of the clue number (30.1301). -->
<TEXT width="71.038" height="9.64818" id="p1_t59" x="30.1301" y="65.7503">
<TOKEN sid="p1_s998" id="p1_w69" font-name="nytfranklin" bold="yes" italic="no" font-size="9.9" font-color="#000000" rotation="0" angle="0" x="30.1301" y="66.6511" base="73.6702" width="5.643" height="7.1676">1</TOKEN>
<TOKEN sid="p1_s999" id="p1_w70" font-name="arialunicodems" bold="no" italic="no" font-size="11" font-color="#000000" rotation="0" angle="0" x="40.0301" y="65.7503" base="74.0003" width="22" height="10.549">Fruit</TOKEN>
<TOKEN sid="p1_s1000" id="p1_w71" font-name="arialunicodems" bold="no" italic="no" font-size="11" font-color="#000000" rotation="0" angle="0" x="65.0881" y="65.7503" base="74.0003" width="23.848" height="10.549">used</TOKEN>
<TOKEN sid="p1_s1001" id="p1_w72" font-name="arialunicodems" bold="no" italic="no" font-size="11" font-color="#000000" rotation="0" angle="0" x="91.9941" y="65.7503" base="74.0003" width="9.174" height="10.549">to</TOKEN>
</TEXT>
<TEXT width="81.312" height="10.549" id="p1_t60" x="40.0301" y="79.5804">
<TOKEN sid="p1_s1002" id="p1_w73" font-name="arialunicodems" bold="no" italic="no" font-size="11" font-color="#000000" rotation="0" angle="0" x="40.0301" y="79.5804" base="87.8304" width="26.895" height="10.549">flavor</TOKEN>
<TOKEN sid="p1_s1003" id="p1_w74" font-name="arialunicodems" bold="no" italic="no" font-size="11" font-color="#000000" rotation="0" angle="0" x="69.9831" y="79.5804" base="87.8304" width="15.29" height="10.549">the</TOKEN>
<TOKEN sid="p1_s1004" id="p1_w75" font-name="arialunicodems" bold="no" italic="no" font-size="11" font-color="#000000" rotation="0" angle="0" x="88.3311" y="79.5804" base="87.8304" width="33.011" height="10.549">liqueur</TOKEN>
</TEXT>
<TEXT width="42.801" height="10.549" id="p1_t61" x="40.0301" y="93.4105">
<TOKEN sid="p1_s1005" id="p1_w76" font-name="arialunicodems" bold="no" italic="no" font-size="11" font-color="#000000" rotation="0" angle="0" x="40.0301" y="93.4105" base="101.66" width="42.801" height="10.549">patxaran</TOKEN>
</TEXT>
<!-- snip many more lines -->
<!-- This is box #1. The TEXT holds a single TOKEN whose content is a plain number, in font "arialunicodems"
at font-size 7.68 (the clue words above use the same font at font-size 11).
To confirm it is a box and not a clue, check that the x location is greater than 1/3 of the page width
(here 264.89 > 612 / 3 = 204). -->
<TEXT width="4.27008" height="7.36512" id="p1_t3" x="264.89" y="50.83">
<TOKEN sid="p1_s20" id="p1_w13" font-name="arialunicodems" bold="no" italic="no" font-size="7.68" font-color="#000000" rotation="0" angle="0" x="264.89" y="50.83" base="56.59" width="4.27008" height="7.36512">1</TOKEN>
</TEXT>
<!-- snip many more lines -->
<!-- This include points to a "vectorial image" stored in a separate file in XML format, which is very similar to SVG.
See https://sourceforge.net/p/pdf2xml/discussion/714681/thread/dd35f74f/
That file needs to be parsed to find the filled squares and the non-numbered empty squares in the grid.
Note that the "xi" namespace prefix is declared on the include element itself, not on the root element.
-->
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="Aug0621.xml_data/image-1.vec" />
<!-- The XML footer: closes the PAGE and DOCUMENT elements. -->
</PAGE>
</DOCUMENT>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment