g-simmons/scipdf_coordinates.ipynb

## scipdf_coordinates.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "from scipdf.pdf import parse_pdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "  XMLParsedAsHTMLWarning.MESSAGE, XMLParsedAsHTMLWarning\n"
     ]
    }
   ],
   "source": [
    "# Parse the sample paper and keep the result as a soup object so it can be\n",
    "# queried with `.find(...)` in the following cells.\n",
    "# NOTE(review): `return_coordinates=True` is presumably what yields the `coords`\n",
    "# attribute seen on the <formula> tag displayed further down - confirm in scipdf.\n",
    "soup = parse_pdf(\n",
    "    \"tests/attention_is_all_you_need.pdf\",\n",
    "    soup=True,\n",
    "    fulltext=True,\n",
    "    return_coordinates=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look up a single <formula> element in the parsed document.\n",
    "formula = soup.find(\"formula\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<formula coords=\"4,219.97,436.17,284.03,25.41\" xml:id=\"formula_0\">Attention(Q, K, V ) = softmax( QK T √ d k )V (1)</formula>"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the tag, including its `coords` and `xml:id` attributes.\n",
    "formula"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipdf.pdf import parse_pdf_to_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bound to `pdf_dict` rather than `dict` so the built-in `dict` type is not\n",
    "# shadowed for the rest of the kernel session.\n",
    "pdf_dict = parse_pdf_to_dict(\"tests/attention_is_all_you_need.pdf\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[\n",
      "    {\n",
      "        \"formula_id\": \"formula_0\",\n",
      "        \"formula_text\": \"Attention(Q, K, V ) = softmax( QK T \\u221a d k )V (1)\",\n",
      "        \"formula_coordinates\": [\n",
      "            4.0,\n",
      "            219.97,\n",
      "            436.17,\n",
      "            284.03,\n",
      "            25.41\n",
      "        ]\n",
      "    },\n",
      "    {\n",
      "        \"formula_id\": \"formula_1\",\n",
      "        \"formula_text\": \"1 \\u221a d k .\",\n",
      "        \"formula_coordinates\": [\n",
      "            4.0,\n",
      "            120.06,\n",
      "            493.07,\n",
      "            18.76,\n",
      "            14.92\n",
      "        ]\n",
      "    },\n",
      "    {\n",
      "        \"formula_id\": \"formula_2\",\n",
      "        \"formula_text\": \"1 \\u221a d k .\",\n",
      "        \"formula_coordinates\": [\n",
      "            4.0,\n",
      "            440.73,\n",
      "            577.17,\n",
      "            18.73,\n",
      "            14.92\n",
      "        ]\n",
      "    },\n",
      "    {\n",
      "        \"formula_id\": \"formula_3\",\n",
      "        \"formula_text\": \"MultiHead(Q, K, V ) = Concat(head 1 , ..., head h )W O where head i = Attention(QW Q i , KW K i , V W V i )\",\n",
      "        \"formula_coordinates\": [\n",
      "            5.0,\n",
      "            186.94,\n",
      "            114.25,\n",
      "            238.12,\n",
      "            29.82\n",
      "        ]\n",
      "    },\n",
      "    {\n",
      "        \"formula_id\": \"formula_4\",\n",
      "        \"formula_text\": \"W Q i \\u2208 R dmodel\\u00d7d k , W K i \\u2208 R dmodel\\u00d7d k , W V i \\u2208 R dmodel\\u00d7dv and W O \\u2208 R hdv\\u00d7dmodel .\",\n",
      "        \"formula_coordinates\": [\n",
      "            5.0,\n",
      "            108.0,\n",
      "            166.55,\n",
      "            394.82,\n",
      "            24.27\n",
      "        ]\n",
      "    },\n",
      "    {\n",
      "        \"formula_id\": \"formula_5\",\n",
      "        \"formula_text\": \"d k = d v = d model /h = 64.\",\n",
      "        \"formula_coordinates\": [\n",
      "            5.0,\n",
      "            108.0,\n",
      "            208.83,\n",
      "            106.16,\n",
      "            9.8\n",
      "        ]\n",
      "    },\n",
      "    {\n",
      "        \"formula_id\": \"formula_6\",\n",
      "        \"formula_text\": \"FFN(x) = max(0, xW 1 + b 1 )W 2 + b 2(2)\",\n",
      "        \"formula_coordinates\": [\n",
      "            5.0,\n",
      "            226.9,\n",
      "            520.54,\n",
      "            277.1,\n",
      "            9.65\n",
      "        ]\n",
      "    },\n",
      "    {\n",
      "        \"formula_id\": \"formula_7\",\n",
      "        \"formula_text\": \"-Attention O(n 2 \\u00b7 d) O(1) O(1) Recurrent O(n \\u00b7 d 2 ) O(n) O(n) Convolutional O(k \\u00b7 n \\u00b7 d 2 ) O(1) O(log k (n)) Self-Attention (restricted) O(r \\u00b7 n \\u00b7 d) O(1) O(n/r)\",\n",
      "        \"formula_coordinates\": [\n",
      "            6.0,\n",
      "            124.55,\n",
      "            139.24,\n",
      "            340.27,\n",
      "            44.2\n",
      "        ]\n",
      "    },\n",
      "    {\n",
      "        \"formula_id\": \"formula_8\",\n",
      "        \"formula_text\": \"P E (pos,2i) = sin(pos/10000 2i/dmodel ) P E (pos,2i+1) = cos(pos/10000 2i/dmodel )\",\n",
      "        \"formula_coordinates\": [\n",
      "            6.0,\n",
      "            225.69,\n",
      "            293.92,\n",
      "            160.61,\n",
      "            28.9\n",
      "        ]\n",
      "    },\n",
      "    {\n",
      "        \"formula_id\": \"formula_9\",\n",
      "        \"formula_text\": \"considerably, to O(k \\u00b7 n \\u00b7 d + n \\u00b7 d 2 )\",\n",
      "        \"formula_coordinates\": [\n",
      "            7.0,\n",
      "            108.0,\n",
      "            155.43,\n",
      "            151.27,\n",
      "            10.53\n",
      "        ]\n",
      "    },\n",
      "    {\n",
      "        \"formula_id\": \"formula_10\",\n",
      "        \"formula_text\": \"lrate = d \\u22120.5 model \\u00b7 min(step_num \\u22120.5 , step_num \\u00b7 warmup_steps \\u22121.5 )(3)\",\n",
      "        \"formula_coordinates\": [\n",
      "            7.0,\n",
      "            162.89,\n",
      "            564.84,\n",
      "            341.11,\n",
      "            13.42\n",
      "        ]\n",
      "    }\n",
      "]\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "\n",
    "# Pretty-print the \"formulas\" entry of the parsed result; each item in this\n",
    "# cell's output has `formula_id`, `formula_text` and `formula_coordinates`.\n",
    "print(json.dumps(pdf_dict[\"formulas\"], indent=4))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "papercast",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10 (default, Feb 26 2021, 10:16:00) \n[Clang 10.0.0 ]"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "cebfbe6fdaf5e818be5ff56dbc3aba09cee6fb0310a9b42ef532877cd8382055"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"The autoreload extension is already loaded. To reload it, use:\n",
	" %reload_ext autoreload\n"
	]
	}
	],
	"source": [
	"%load_ext autoreload\n",
	"%autoreload 2\n",
	"from scipdf.pdf import parse_pdf"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"\n",
	" XMLParsedAsHTMLWarning.MESSAGE, XMLParsedAsHTMLWarning\n"
	]
	}
	],
	"source": [
	"# Parse the sample paper and keep the result as a soup object so it can be\n",
	"# queried with `.find(...)` in the following cells.\n",
	"# NOTE(review): `return_coordinates=True` is presumably what yields the `coords`\n",
	"# attribute seen on the <formula> tag displayed further down - confirm in scipdf.\n",
	"soup = parse_pdf(\n",
	"    \"tests/attention_is_all_you_need.pdf\",\n",
	"    soup=True,\n",
	"    fulltext=True,\n",
	"    return_coordinates=True,\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Look up a single <formula> element in the parsed document.\n",
	"formula = soup.find(\"formula\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"<formula coords=\"4,219.97,436.17,284.03,25.41\" xml:id=\"formula_0\">Attention(Q, K, V ) = softmax( QK T √ d k )V (1)</formula>"
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Display the tag, including its `coords` and `xml:id` attributes.\n",
	"formula"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [],
	"source": [
	"from scipdf.pdf import parse_pdf_to_dict"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Bound to `pdf_dict` rather than `dict` so the built-in `dict` type is not\n",
	"# shadowed for the rest of the kernel session.\n",
	"pdf_dict = parse_pdf_to_dict(\"tests/attention_is_all_you_need.pdf\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 25,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[\n",
	" {\n",
	" \"formula_id\": \"formula_0\",\n",
	" \"formula_text\": \"Attention(Q, K, V ) = softmax( QK T \\u221a d k )V (1)\",\n",
	" \"formula_coordinates\": [\n",
	" 4.0,\n",
	" 219.97,\n",
	" 436.17,\n",
	" 284.03,\n",
	" 25.41\n",
	" ]\n",
	" },\n",
	" {\n",
	" \"formula_id\": \"formula_1\",\n",
	" \"formula_text\": \"1 \\u221a d k .\",\n",
	" \"formula_coordinates\": [\n",
	" 4.0,\n",
	" 120.06,\n",
	" 493.07,\n",
	" 18.76,\n",
	" 14.92\n",
	" ]\n",
	" },\n",
	" {\n",
	" \"formula_id\": \"formula_2\",\n",
	" \"formula_text\": \"1 \\u221a d k .\",\n",
	" \"formula_coordinates\": [\n",
	" 4.0,\n",
	" 440.73,\n",
	" 577.17,\n",
	" 18.73,\n",
	" 14.92\n",
	" ]\n",
	" },\n",
	" {\n",
	" \"formula_id\": \"formula_3\",\n",
	" \"formula_text\": \"MultiHead(Q, K, V ) = Concat(head 1 , ..., head h )W O where head i = Attention(QW Q i , KW K i , V W V i )\",\n",
	" \"formula_coordinates\": [\n",
	" 5.0,\n",
	" 186.94,\n",
	" 114.25,\n",
	" 238.12,\n",
	" 29.82\n",
	" ]\n",
	" },\n",
	" {\n",
	" \"formula_id\": \"formula_4\",\n",
	" \"formula_text\": \"W Q i \\u2208 R dmodel\\u00d7d k , W K i \\u2208 R dmodel\\u00d7d k , W V i \\u2208 R dmodel\\u00d7dv and W O \\u2208 R hdv\\u00d7dmodel .\",\n",
	" \"formula_coordinates\": [\n",
	" 5.0,\n",
	" 108.0,\n",
	" 166.55,\n",
	" 394.82,\n",
	" 24.27\n",
	" ]\n",
	" },\n",
	" {\n",
	" \"formula_id\": \"formula_5\",\n",
	" \"formula_text\": \"d k = d v = d model /h = 64.\",\n",
	" \"formula_coordinates\": [\n",
	" 5.0,\n",
	" 108.0,\n",
	" 208.83,\n",
	" 106.16,\n",
	" 9.8\n",
	" ]\n",
	" },\n",
	" {\n",
	" \"formula_id\": \"formula_6\",\n",
	" \"formula_text\": \"FFN(x) = max(0, xW 1 + b 1 )W 2 + b 2(2)\",\n",
	" \"formula_coordinates\": [\n",
	" 5.0,\n",
	" 226.9,\n",
	" 520.54,\n",
	" 277.1,\n",
	" 9.65\n",
	" ]\n",
	" },\n",
	" {\n",
	" \"formula_id\": \"formula_7\",\n",
	" \"formula_text\": \"-Attention O(n 2 \\u00b7 d) O(1) O(1) Recurrent O(n \\u00b7 d 2 ) O(n) O(n) Convolutional O(k \\u00b7 n \\u00b7 d 2 ) O(1) O(log k (n)) Self-Attention (restricted) O(r \\u00b7 n \\u00b7 d) O(1) O(n/r)\",\n",
	" \"formula_coordinates\": [\n",
	" 6.0,\n",
	" 124.55,\n",
	" 139.24,\n",
	" 340.27,\n",
	" 44.2\n",
	" ]\n",
	" },\n",
	" {\n",
	" \"formula_id\": \"formula_8\",\n",
	" \"formula_text\": \"P E (pos,2i) = sin(pos/10000 2i/dmodel ) P E (pos,2i+1) = cos(pos/10000 2i/dmodel )\",\n",
	" \"formula_coordinates\": [\n",
	" 6.0,\n",
	" 225.69,\n",
	" 293.92,\n",
	" 160.61,\n",
	" 28.9\n",
	" ]\n",
	" },\n",
	" {\n",
	" \"formula_id\": \"formula_9\",\n",
	" \"formula_text\": \"considerably, to O(k \\u00b7 n \\u00b7 d + n \\u00b7 d 2 )\",\n",
	" \"formula_coordinates\": [\n",
	" 7.0,\n",
	" 108.0,\n",
	" 155.43,\n",
	" 151.27,\n",
	" 10.53\n",
	" ]\n",
	" },\n",
	" {\n",
	" \"formula_id\": \"formula_10\",\n",
	" \"formula_text\": \"lrate = d \\u22120.5 model \\u00b7 min(step_num \\u22120.5 , step_num \\u00b7 warmup_steps \\u22121.5 )(3)\",\n",
	" \"formula_coordinates\": [\n",
	" 7.0,\n",
	" 162.89,\n",
	" 564.84,\n",
	" 341.11,\n",
	" 13.42\n",
	" ]\n",
	" }\n",
	"]\n"
	]
	}
	],
	"source": [
	"import json\n",
	"\n",
	"# Pretty-print the \"formulas\" entry of the parsed result; each item in this\n",
	"# cell's output has `formula_id`, `formula_text` and `formula_coordinates`.\n",
	"print(json.dumps(pdf_dict[\"formulas\"], indent=4))"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "papercast",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.10 (default, Feb 26 2021, 10:16:00) \n[Clang 10.0.0 ]"
	},
	"orig_nbformat": 4,
	"vscode": {
	"interpreter": {
	"hash": "cebfbe6fdaf5e818be5ff56dbc3aba09cee6fb0310a9b42ef532877cd8382055"
	}
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}