Last active
December 15, 2022 03:33
-
-
Save g-simmons/373d61066f4e60cec30e4a524af57b68 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The autoreload extension is already loaded. To reload it, use:\n", | |
" %reload_ext autoreload\n" | |
] | |
} | |
], | |
"source": [ | |
"%load_ext autoreload\n", | |
"%autoreload 2\n", | |
"from scipdf.pdf import parse_pdf" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
" XMLParsedAsHTMLWarning.MESSAGE, XMLParsedAsHTMLWarning\n" | |
] | |
} | |
], | |
"source": [ | |
"# Parse the test paper; the flags below are passed through to scipdf's parse_pdf unchanged.\n", | |
"soup = parse_pdf(\n", | |
"    \"tests/attention_is_all_you_need.pdf\",\n", | |
"    soup=True,\n", | |
"    fulltext=True,\n", | |
"    return_coordinates=True,\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"formula = soup.find(\"formula\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<formula coords=\"4,219.97,436.17,284.03,25.41\" xml:id=\"formula_0\">Attention(Q, K, V ) = softmax( QK T √ d k )V (1)</formula>" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"formula" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from scipdf.pdf import parse_pdf_to_dict" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Deliberately not named `dict`: that would shadow the builtin type for the rest of the kernel session.\n", | |
"paper_dict = parse_pdf_to_dict(\"tests/attention_is_all_you_need.pdf\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[\n", | |
" {\n", | |
" \"formula_id\": \"formula_0\",\n", | |
" \"formula_text\": \"Attention(Q, K, V ) = softmax( QK T \\u221a d k )V (1)\",\n", | |
" \"formula_coordinates\": [\n", | |
" 4.0,\n", | |
" 219.97,\n", | |
" 436.17,\n", | |
" 284.03,\n", | |
" 25.41\n", | |
" ]\n", | |
" },\n", | |
" {\n", | |
" \"formula_id\": \"formula_1\",\n", | |
" \"formula_text\": \"1 \\u221a d k .\",\n", | |
" \"formula_coordinates\": [\n", | |
" 4.0,\n", | |
" 120.06,\n", | |
" 493.07,\n", | |
" 18.76,\n", | |
" 14.92\n", | |
" ]\n", | |
" },\n", | |
" {\n", | |
" \"formula_id\": \"formula_2\",\n", | |
" \"formula_text\": \"1 \\u221a d k .\",\n", | |
" \"formula_coordinates\": [\n", | |
" 4.0,\n", | |
" 440.73,\n", | |
" 577.17,\n", | |
" 18.73,\n", | |
" 14.92\n", | |
" ]\n", | |
" },\n", | |
" {\n", | |
" \"formula_id\": \"formula_3\",\n", | |
" \"formula_text\": \"MultiHead(Q, K, V ) = Concat(head 1 , ..., head h )W O where head i = Attention(QW Q i , KW K i , V W V i )\",\n", | |
" \"formula_coordinates\": [\n", | |
" 5.0,\n", | |
" 186.94,\n", | |
" 114.25,\n", | |
" 238.12,\n", | |
" 29.82\n", | |
" ]\n", | |
" },\n", | |
" {\n", | |
" \"formula_id\": \"formula_4\",\n", | |
" \"formula_text\": \"W Q i \\u2208 R dmodel\\u00d7d k , W K i \\u2208 R dmodel\\u00d7d k , W V i \\u2208 R dmodel\\u00d7dv and W O \\u2208 R hdv\\u00d7dmodel .\",\n", | |
" \"formula_coordinates\": [\n", | |
" 5.0,\n", | |
" 108.0,\n", | |
" 166.55,\n", | |
" 394.82,\n", | |
" 24.27\n", | |
" ]\n", | |
" },\n", | |
" {\n", | |
" \"formula_id\": \"formula_5\",\n", | |
" \"formula_text\": \"d k = d v = d model /h = 64.\",\n", | |
" \"formula_coordinates\": [\n", | |
" 5.0,\n", | |
" 108.0,\n", | |
" 208.83,\n", | |
" 106.16,\n", | |
" 9.8\n", | |
" ]\n", | |
" },\n", | |
" {\n", | |
" \"formula_id\": \"formula_6\",\n", | |
" \"formula_text\": \"FFN(x) = max(0, xW 1 + b 1 )W 2 + b 2(2)\",\n", | |
" \"formula_coordinates\": [\n", | |
" 5.0,\n", | |
" 226.9,\n", | |
" 520.54,\n", | |
" 277.1,\n", | |
" 9.65\n", | |
" ]\n", | |
" },\n", | |
" {\n", | |
" \"formula_id\": \"formula_7\",\n", | |
" \"formula_text\": \"-Attention O(n 2 \\u00b7 d) O(1) O(1) Recurrent O(n \\u00b7 d 2 ) O(n) O(n) Convolutional O(k \\u00b7 n \\u00b7 d 2 ) O(1) O(log k (n)) Self-Attention (restricted) O(r \\u00b7 n \\u00b7 d) O(1) O(n/r)\",\n", | |
" \"formula_coordinates\": [\n", | |
" 6.0,\n", | |
" 124.55,\n", | |
" 139.24,\n", | |
" 340.27,\n", | |
" 44.2\n", | |
" ]\n", | |
" },\n", | |
" {\n", | |
" \"formula_id\": \"formula_8\",\n", | |
" \"formula_text\": \"P E (pos,2i) = sin(pos/10000 2i/dmodel ) P E (pos,2i+1) = cos(pos/10000 2i/dmodel )\",\n", | |
" \"formula_coordinates\": [\n", | |
" 6.0,\n", | |
" 225.69,\n", | |
" 293.92,\n", | |
" 160.61,\n", | |
" 28.9\n", | |
" ]\n", | |
" },\n", | |
" {\n", | |
" \"formula_id\": \"formula_9\",\n", | |
" \"formula_text\": \"considerably, to O(k \\u00b7 n \\u00b7 d + n \\u00b7 d 2 )\",\n", | |
" \"formula_coordinates\": [\n", | |
" 7.0,\n", | |
" 108.0,\n", | |
" 155.43,\n", | |
" 151.27,\n", | |
" 10.53\n", | |
" ]\n", | |
" },\n", | |
" {\n", | |
" \"formula_id\": \"formula_10\",\n", | |
" \"formula_text\": \"lrate = d \\u22120.5 model \\u00b7 min(step_num \\u22120.5 , step_num \\u00b7 warmup_steps \\u22121.5 )(3)\",\n", | |
" \"formula_coordinates\": [\n", | |
" 7.0,\n", | |
" 162.89,\n", | |
" 564.84,\n", | |
" 341.11,\n", | |
" 13.42\n", | |
" ]\n", | |
" }\n", | |
"]\n" | |
] | |
} | |
], | |
"source": [ | |
"import json\n", | |
"# Pretty-print only the \"formulas\" entry of the parsed result.\n", | |
"print(json.dumps(paper_dict[\"formulas\"], indent=4))" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "papercast", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.10 (default, Feb 26 2021, 10:16:00) \n[Clang 10.0.0 ]" | |
}, | |
"orig_nbformat": 4, | |
"vscode": { | |
"interpreter": { | |
"hash": "cebfbe6fdaf5e818be5ff56dbc3aba09cee6fb0310a9b42ef532877cd8382055" | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment