Created
October 4, 2025 09:35
-
-
Save chottokun/9bfe17bbd9c0499479e49363dd37617e to your computer and use it in GitHub Desktop.
Docling Reference Embedding for Markdown
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/chottokun/9bfe17bbd9c0499479e49363dd37617e/docling-reference-embedding-for-markdown.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "## サンプルデータのダウンロード" | |
| ], | |
| "metadata": { | |
| "id": "OEGK9LWHe7zo" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Download a sample PDF from arXiv (default: \"Attention Is All You Need\").\n", | |
| "# Point ARXIV_PDF_URL at a different arXiv paper if needed.\n", | |
| "from pathlib import Path\n", | |
| "\n", | |
| "ARXIV_PDF_URL = \"https://arxiv.org/pdf/1706.03762.pdf\"\n", | |
| "PDF_PATH = Path(\"sample_arxiv.pdf\")\n", | |
| "\n", | |
| "# wget is available out of the box on Colab.\n", | |
| "!wget -q -O \"{PDF_PATH}\" \"{ARXIV_PDF_URL}\"\n", | |
| "\n", | |
| "# `wget -O` creates the output file even when the request fails, so a bare\n", | |
| "# exists() check would report success for an empty or non-PDF file.\n", | |
| "# Verify that the file really starts with the PDF magic bytes instead.\n", | |
| "download_ok = False\n", | |
| "if PDF_PATH.is_file():\n", | |
| "    with PDF_PATH.open(\"rb\") as pdf_file:\n", | |
| "        download_ok = pdf_file.read(5) == b\"%PDF-\"\n", | |
| "print(\"Downloaded:\", download_ok, \"->\", PDF_PATH)" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "cBAYKoxQZ-2E", | |
| "outputId": "49c78810-bb82-4e26-82c3-fa4b2806cd3d" | |
| }, | |
| "execution_count": 1, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Downloaded: True -> sample_arxiv.pdf\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "## Docling による Markdown 変換(画像は外部ファイル参照)" | |
| ], | |
| "metadata": { | |
| "id": "aFXAuZVTfCos" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Install the required libraries into the kernel's own environment\n", | |
| "# (%pip rather than !pip). docling is pinned to the release this notebook\n", | |
| "# was last executed with so that re-runs stay reproducible.\n", | |
| "%pip install -q docling==2.55.1 pillow\n", | |
| "\n", | |
| "import logging\n", | |
| "import time\n", | |
| "from pathlib import Path\n", | |
| "from docling_core.types.doc import ImageRefMode, PictureItem, TableItem\n", | |
| "from docling.datamodel.base_models import InputFormat\n", | |
| "from docling.datamodel.pipeline_options import PdfPipelineOptions\n", | |
| "from docling.document_converter import DocumentConverter, PdfFormatOption\n", | |
| "\n", | |
| "# =================================================================\n", | |
| "# 1. Configuration\n", | |
| "# =================================================================\n", | |
| "\n", | |
| "# Input PDF to convert. Despite the variable name this is a local path:\n", | |
| "# the file written by the download cell (sample_arxiv.pdf).\n", | |
| "# To process your own document, upload it to Colab and set its path here.\n", | |
| "INPUT_PDF_URL = \"sample_arxiv.pdf\"\n", | |
| "\n", | |
| "# Output directory on the Colab file system; the Markdown file and the\n", | |
| "# extracted images are written below it.\n", | |
| "OUTPUT_DIR = Path(\"output\")\n", | |
| "OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n", | |
| "\n", | |
| "# Image resolution scale (2.0 is roughly 144 DPI); raise it for sharper images.\n", | |
| "IMAGE_RESOLUTION_SCALE = 2.0\n", | |
| "\n", | |
| "# Logging. basicConfig() does nothing when the root logger already has\n", | |
| "# handlers, which is what happened in the recorded Colab run (none of the\n", | |
| "# INFO messages below were shown). force=True replaces the existing\n", | |
| "# handlers so the INFO level actually takes effect.\n", | |
| "logging.basicConfig(level=logging.INFO, force=True)\n", | |
| "_log = logging.getLogger(__name__)\n", | |
| "\n", | |
| "# =================================================================\n", | |
| "# 2. Pipeline options\n", | |
| "# =================================================================\n", | |
| "\n", | |
| "# Build the PDF pipeline options in a single call:\n", | |
| "#   images_scale            - rendering resolution of the generated images\n", | |
| "#   generate_page_images    - render whole pages (not embedded in the Markdown,\n", | |
| "#                             but needed to extract the element images)\n", | |
| "#   generate_picture_images - extract an image for each figure (Picture)\n", | |
| "#   generate_table_images   - extract an image for each table (Table)\n", | |
| "pipeline_options = PdfPipelineOptions(\n", | |
| "    images_scale=IMAGE_RESOLUTION_SCALE,\n", | |
| "    generate_page_images=True,\n", | |
| "    generate_picture_images=True,\n", | |
| "    generate_table_images=True,\n", | |
| ")\n", | |
| "\n", | |
| "# Attach the options to the PDF input format and create the converter.\n", | |
| "pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)\n", | |
| "doc_converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})\n", | |
| "\n", | |
| "# =================================================================\n", | |
| "# 3. Convert and save\n", | |
| "# =================================================================\n", | |
| "\n", | |
| "_log.info(f\"PDFファイルの変換を開始: {INPUT_PDF_URL}\")\n", | |
| "start_time = time.time()\n", | |
| "\n", | |
| "# Run the conversion and keep the resulting document.\n", | |
| "conversion_result = doc_converter.convert(INPUT_PDF_URL)\n", | |
| "doc = conversion_result.document\n", | |
| "\n", | |
| "end_time = time.time()\n", | |
| "_log.info(f\"変換完了。所要時間: {end_time - start_time:.2f}秒\")\n", | |
| "\n", | |
| "# Name the Markdown file after the input's base name (extension dropped).\n", | |
| "pdf_stem = Path(INPUT_PDF_URL).stem\n", | |
| "md_filename = OUTPUT_DIR / f\"{pdf_stem}-referenced.md\"\n", | |
| "\n", | |
| "_log.info(f\"Markdownと画像参照ファイルを {OUTPUT_DIR} に保存中...\")\n", | |
| "\n", | |
| "# ImageRefMode.REFERENCED writes the images to an artifacts directory and\n", | |
| "# links them from the Markdown by relative path.\n", | |
| "# artifacts_dir is passed as a bare directory name on purpose: with the\n", | |
| "# default (None) and a relative md_filename, the recorded run placed the\n", | |
| "# images under output/output/<stem>_artifacts, i.e. OUTPUT_DIR was applied twice.\n", | |
| "# NOTE(review): this assumes a relative artifacts_dir is resolved against the\n", | |
| "# Markdown file's parent directory (docling-core 2.48.x) - confirm after upgrades.\n", | |
| "doc.save_as_markdown(\n", | |
| "    md_filename,\n", | |
| "    artifacts_dir=Path(f\"{md_filename.stem}_artifacts\"),\n", | |
| "    image_mode=ImageRefMode.REFERENCED,\n", | |
| ")\n", | |
| "\n", | |
| "# Sanity check: count the figures and tables for which an image is available.\n", | |
| "# Nothing is saved here; save_as_markdown(image_mode=REFERENCED) has already\n", | |
| "# written the image files. In the recorded run they are named\n", | |
| "# image_<index>_<hash>.png inside the '<markdown stem>_artifacts' directory.\n", | |
| "\n", | |
| "picture_counter = 0\n", | |
| "table_counter = 0\n", | |
| "\n", | |
| "# The second tuple element is not used; it is not bound to `_` because that\n", | |
| "# name holds the last cell output in IPython.\n", | |
| "for item, _level in doc.iterate_items():\n", | |
| "    if isinstance(item, PictureItem):\n", | |
| "        # Explicit None check: only a missing image should exclude the item.\n", | |
| "        if item.get_image(doc) is not None:\n", | |
| "            picture_counter += 1\n", | |
| "    elif isinstance(item, TableItem):\n", | |
| "        if item.get_image(doc) is not None:\n", | |
| "            table_counter += 1\n", | |
| "\n", | |
| "_log.info(f\"処理された図(Picture)の数: {picture_counter}\")\n", | |
| "_log.info(f\"処理された表(Table)の数: {table_counter}\")\n", | |
| "_log.info(f\"Markdownファイル: {md_filename}\")\n", | |
| "_log.info(\"出力ディレクトリの内容:\")\n", | |
| "# List the output directory recursively so the generated files are easy to\n", | |
| "# find in the Colab file browser.\n", | |
| "!ls -R {OUTPUT_DIR}\n", | |
| "\n", | |
| "# Print the beginning of the generated Markdown as a quick check.\n", | |
| "print(\"\\n--- 抽出されたMarkdown(抜粋) ---\")\n", | |
| "with open(md_filename, 'r', encoding='utf-8') as f:\n", | |
| "    print(f.read(2000) + \"\\n...\")\n", | |
| "\n", | |
| "# In the Markdown file each image is referenced by a relative path, e.g.\n", | |
| "#   ![Image](<artifacts dir>/image_000000_<hash>.png)\n", | |
| "# NOTE(review): the exact link text may differ between docling versions." | |
| ], | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Collecting docling\n", | |
| " Downloading docling-2.55.1-py3-none-any.whl.metadata (11 kB)\n", | |
| "Requirement already satisfied: pillow in /usr/local/lib/python3.12/dist-packages (11.3.0)\n", | |
| "Requirement already satisfied: pydantic<3.0.0,>=2.0.0 in /usr/local/lib/python3.12/dist-packages (from docling) (2.11.9)\n", | |
| "Collecting docling-core<3.0.0,>=2.48.2 (from docling-core[chunking]<3.0.0,>=2.48.2->docling)\n", | |
| " Downloading docling_core-2.48.4-py3-none-any.whl.metadata (6.5 kB)\n", | |
| "Collecting docling-parse<5.0.0,>=4.4.0 (from docling)\n", | |
| " Downloading docling_parse-4.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.8 kB)\n", | |
| "Collecting docling-ibm-models<4,>=3.9.1 (from docling)\n", | |
| " Downloading docling_ibm_models-3.9.1-py3-none-any.whl.metadata (6.7 kB)\n", | |
| "Collecting filetype<2.0.0,>=1.2.0 (from docling)\n", | |
| " Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)\n", | |
| "Collecting pypdfium2!=4.30.1,<5.0.0,>=4.30.0 (from docling)\n", | |
| " Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.5/48.5 kB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hRequirement already satisfied: pydantic-settings<3.0.0,>=2.3.0 in /usr/local/lib/python3.12/dist-packages (from docling) (2.11.0)\n", | |
| "Requirement already satisfied: huggingface_hub<1,>=0.23 in /usr/local/lib/python3.12/dist-packages (from docling) (0.35.3)\n", | |
| "Requirement already satisfied: requests<3.0.0,>=2.32.2 in /usr/local/lib/python3.12/dist-packages (from docling) (2.32.4)\n", | |
| "Collecting easyocr<2.0,>=1.7 (from docling)\n", | |
| " Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)\n", | |
| "Requirement already satisfied: certifi>=2024.7.4 in /usr/local/lib/python3.12/dist-packages (from docling) (2025.8.3)\n", | |
| "Collecting rtree<2.0.0,>=1.3.0 (from docling)\n", | |
| " Downloading rtree-1.4.1-py3-none-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (2.1 kB)\n", | |
| "Requirement already satisfied: typer<0.20.0,>=0.12.5 in /usr/local/lib/python3.12/dist-packages (from docling) (0.19.2)\n", | |
| "Collecting python-docx<2.0.0,>=1.1.2 (from docling)\n", | |
| " Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)\n", | |
| "Collecting python-pptx<2.0.0,>=1.0.2 (from docling)\n", | |
| " Downloading python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)\n", | |
| "Requirement already satisfied: beautifulsoup4<5.0.0,>=4.12.3 in /usr/local/lib/python3.12/dist-packages (from docling) (4.13.5)\n", | |
| "Requirement already satisfied: pandas<3.0.0,>=2.1.4 in /usr/local/lib/python3.12/dist-packages (from docling) (2.2.2)\n", | |
| "Collecting marko<3.0.0,>=2.1.2 (from docling)\n", | |
| " Downloading marko-2.2.0-py3-none-any.whl.metadata (4.5 kB)\n", | |
| "Requirement already satisfied: openpyxl<4.0.0,>=3.1.5 in /usr/local/lib/python3.12/dist-packages (from docling) (3.1.5)\n", | |
| "Requirement already satisfied: lxml<6.0.0,>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from docling) (5.4.0)\n", | |
| "Requirement already satisfied: tqdm<5.0.0,>=4.65.0 in /usr/local/lib/python3.12/dist-packages (from docling) (4.67.1)\n", | |
| "Requirement already satisfied: pluggy<2.0.0,>=1.0.0 in /usr/local/lib/python3.12/dist-packages (from docling) (1.6.0)\n", | |
| "Collecting pylatexenc<3.0,>=2.10 (from docling)\n", | |
| " Downloading pylatexenc-2.10.tar.gz (162 kB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m162.6/162.6 kB\u001b[0m \u001b[31m11.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
| "Requirement already satisfied: scipy<2.0.0,>=1.6.0 in /usr/local/lib/python3.12/dist-packages (from docling) (1.16.2)\n", | |
| "Requirement already satisfied: accelerate<2,>=1.0.0 in /usr/local/lib/python3.12/dist-packages (from docling) (1.10.1)\n", | |
| "Collecting polyfactory>=2.22.2 (from docling)\n", | |
| " Downloading polyfactory-2.22.2-py3-none-any.whl.metadata (27 kB)\n", | |
| "Requirement already satisfied: numpy<3.0.0,>=1.17 in /usr/local/lib/python3.12/dist-packages (from accelerate<2,>=1.0.0->docling) (2.0.2)\n", | |
| "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from accelerate<2,>=1.0.0->docling) (25.0)\n", | |
| "Requirement already satisfied: psutil in /usr/local/lib/python3.12/dist-packages (from accelerate<2,>=1.0.0->docling) (5.9.5)\n", | |
| "Requirement already satisfied: pyyaml in /usr/local/lib/python3.12/dist-packages (from accelerate<2,>=1.0.0->docling) (6.0.3)\n", | |
| "Requirement already satisfied: torch>=2.0.0 in /usr/local/lib/python3.12/dist-packages (from accelerate<2,>=1.0.0->docling) (2.8.0+cu126)\n", | |
| "Requirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.12/dist-packages (from accelerate<2,>=1.0.0->docling) (0.6.2)\n", | |
| "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4<5.0.0,>=4.12.3->docling) (2.8)\n", | |
| "Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4<5.0.0,>=4.12.3->docling) (4.15.0)\n", | |
| "Requirement already satisfied: jsonschema<5.0.0,>=4.16.0 in /usr/local/lib/python3.12/dist-packages (from docling-core<3.0.0,>=2.48.2->docling-core[chunking]<3.0.0,>=2.48.2->docling) (4.25.1)\n", | |
| "Collecting jsonref<2.0.0,>=1.1.0 (from docling-core<3.0.0,>=2.48.2->docling-core[chunking]<3.0.0,>=2.48.2->docling)\n", | |
| " Downloading jsonref-1.1.0-py3-none-any.whl.metadata (2.7 kB)\n", | |
| "Requirement already satisfied: tabulate<0.10.0,>=0.9.0 in /usr/local/lib/python3.12/dist-packages (from docling-core<3.0.0,>=2.48.2->docling-core[chunking]<3.0.0,>=2.48.2->docling) (0.9.0)\n", | |
| "Collecting latex2mathml<4.0.0,>=3.77.0 (from docling-core<3.0.0,>=2.48.2->docling-core[chunking]<3.0.0,>=2.48.2->docling)\n", | |
| " Downloading latex2mathml-3.78.1-py3-none-any.whl.metadata (15 kB)\n", | |
| "Collecting semchunk<3.0.0,>=2.2.0 (from docling-core[chunking]<3.0.0,>=2.48.2->docling)\n", | |
| " Downloading semchunk-2.2.2-py3-none-any.whl.metadata (10 kB)\n", | |
| "Requirement already satisfied: transformers<5.0.0,>=4.34.0 in /usr/local/lib/python3.12/dist-packages (from docling-core[chunking]<3.0.0,>=2.48.2->docling) (4.56.2)\n", | |
| "Requirement already satisfied: torchvision<1,>=0 in /usr/local/lib/python3.12/dist-packages (from docling-ibm-models<4,>=3.9.1->docling) (0.23.0+cu126)\n", | |
| "Collecting jsonlines<4.0.0,>=3.1.0 (from docling-ibm-models<4,>=3.9.1->docling)\n", | |
| " Downloading jsonlines-3.1.0-py3-none-any.whl.metadata (1.7 kB)\n", | |
| "Requirement already satisfied: opencv-python-headless<5.0.0.0,>=4.6.0.66 in /usr/local/lib/python3.12/dist-packages (from docling-ibm-models<4,>=3.9.1->docling) (4.12.0.88)\n", | |
| "Requirement already satisfied: scikit-image in /usr/local/lib/python3.12/dist-packages (from easyocr<2.0,>=1.7->docling) (0.25.2)\n", | |
| "Collecting python-bidi (from easyocr<2.0,>=1.7->docling)\n", | |
| " Downloading python_bidi-0.6.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)\n", | |
| "Requirement already satisfied: Shapely in /usr/local/lib/python3.12/dist-packages (from easyocr<2.0,>=1.7->docling) (2.1.2)\n", | |
| "Collecting pyclipper (from easyocr<2.0,>=1.7->docling)\n", | |
| " Downloading pyclipper-1.3.0.post6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)\n", | |
| "Collecting ninja (from easyocr<2.0,>=1.7->docling)\n", | |
| " Downloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.1 kB)\n", | |
| "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from huggingface_hub<1,>=0.23->docling) (3.19.1)\n", | |
| "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.12/dist-packages (from huggingface_hub<1,>=0.23->docling) (2025.3.0)\n", | |
| "Requirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /usr/local/lib/python3.12/dist-packages (from huggingface_hub<1,>=0.23->docling) (1.1.10)\n", | |
| "Requirement already satisfied: et-xmlfile in /usr/local/lib/python3.12/dist-packages (from openpyxl<4.0.0,>=3.1.5->docling) (2.0.0)\n", | |
| "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas<3.0.0,>=2.1.4->docling) (2.9.0.post0)\n", | |
| "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas<3.0.0,>=2.1.4->docling) (2025.2)\n", | |
| "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas<3.0.0,>=2.1.4->docling) (2025.2)\n", | |
| "Collecting faker>=5.0.0 (from polyfactory>=2.22.2->docling)\n", | |
| " Downloading faker-37.8.0-py3-none-any.whl.metadata (15 kB)\n", | |
| "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.0.0->docling) (0.7.0)\n", | |
| "Requirement already satisfied: pydantic-core==2.33.2 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.0.0->docling) (2.33.2)\n", | |
| "Requirement already satisfied: typing-inspection>=0.4.0 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.0.0->docling) (0.4.2)\n", | |
| "Requirement already satisfied: python-dotenv>=0.21.0 in /usr/local/lib/python3.12/dist-packages (from pydantic-settings<3.0.0,>=2.3.0->docling) (1.1.1)\n", | |
| "Collecting XlsxWriter>=0.5.7 (from python-pptx<2.0.0,>=1.0.2->docling)\n", | |
| " Downloading xlsxwriter-3.2.9-py3-none-any.whl.metadata (2.7 kB)\n", | |
| "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests<3.0.0,>=2.32.2->docling) (3.4.3)\n", | |
| "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests<3.0.0,>=2.32.2->docling) (3.10)\n", | |
| "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests<3.0.0,>=2.32.2->docling) (2.5.0)\n", | |
| "Requirement already satisfied: click>=8.0.0 in /usr/local/lib/python3.12/dist-packages (from typer<0.20.0,>=0.12.5->docling) (8.3.0)\n", | |
| "Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.12/dist-packages (from typer<0.20.0,>=0.12.5->docling) (1.5.4)\n", | |
| "Requirement already satisfied: rich>=10.11.0 in /usr/local/lib/python3.12/dist-packages (from typer<0.20.0,>=0.12.5->docling) (13.9.4)\n", | |
| "Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.12/dist-packages (from jsonlines<4.0.0,>=3.1.0->docling-ibm-models<4,>=3.9.1->docling) (25.3.0)\n", | |
| "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.16.0->docling-core<3.0.0,>=2.48.2->docling-core[chunking]<3.0.0,>=2.48.2->docling) (2025.9.1)\n", | |
| "Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.16.0->docling-core<3.0.0,>=2.48.2->docling-core[chunking]<3.0.0,>=2.48.2->docling) (0.36.2)\n", | |
| "Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.16.0->docling-core<3.0.0,>=2.48.2->docling-core[chunking]<3.0.0,>=2.48.2->docling) (0.27.1)\n", | |
| "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas<3.0.0,>=2.1.4->docling) (1.17.0)\n", | |
| "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.12/dist-packages (from rich>=10.11.0->typer<0.20.0,>=0.12.5->docling) (4.0.0)\n", | |
| "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.12/dist-packages (from rich>=10.11.0->typer<0.20.0,>=0.12.5->docling) (2.19.2)\n", | |
| "Collecting mpire[dill] (from semchunk<3.0.0,>=2.2.0->docling-core[chunking]<3.0.0,>=2.48.2->docling)\n", | |
| " Downloading mpire-2.10.2-py3-none-any.whl.metadata (14 kB)\n", | |
| "Requirement already satisfied: setuptools in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate<2,>=1.0.0->docling) (75.2.0)\n", | |
| "Requirement already satisfied: sympy>=1.13.3 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate<2,>=1.0.0->docling) (1.13.3)\n", | |
| "Requirement already satisfied: networkx in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate<2,>=1.0.0->docling) (3.5)\n", | |
| "Requirement already satisfied: jinja2 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate<2,>=1.0.0->docling) (3.1.6)\n", | |
| "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate<2,>=1.0.0->docling) (12.6.77)\n", | |
| "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate<2,>=1.0.0->docling) (12.6.77)\n", | |
| "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.6.80 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate<2,>=1.0.0->docling) (12.6.80)\n", | |
| "Requirement already satisfied: nvidia-cudnn-cu12==9.10.2.21 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate<2,>=1.0.0->docling) (9.10.2.21)\n", | |
| "Requirement already satisfied: nvidia-cublas-cu12==12.6.4.1 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate<2,>=1.0.0->docling) (12.6.4.1)\n", | |
| "Requirement already satisfied: nvidia-cufft-cu12==11.3.0.4 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate<2,>=1.0.0->docling) (11.3.0.4)\n", | |
| "Requirement already satisfied: nvidia-curand-cu12==10.3.7.77 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate<2,>=1.0.0->docling) (10.3.7.77)\n", | |
| "Requirement already satisfied: nvidia-cusolver-cu12==11.7.1.2 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate<2,>=1.0.0->docling) (11.7.1.2)\n", | |
| "Requirement already satisfied: nvidia-cusparse-cu12==12.5.4.2 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate<2,>=1.0.0->docling) (12.5.4.2)\n", | |
| "Requirement already satisfied: nvidia-cusparselt-cu12==0.7.1 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate<2,>=1.0.0->docling) (0.7.1)\n", | |
| "Requirement already satisfied: nvidia-nccl-cu12==2.27.3 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate<2,>=1.0.0->docling) (2.27.3)\n", | |
| "Requirement already satisfied: nvidia-nvtx-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate<2,>=1.0.0->docling) (12.6.77)\n", | |
| "Requirement already satisfied: nvidia-nvjitlink-cu12==12.6.85 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate<2,>=1.0.0->docling) (12.6.85)\n", | |
| "Requirement already satisfied: nvidia-cufile-cu12==1.11.1.6 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate<2,>=1.0.0->docling) (1.11.1.6)\n", | |
| "Requirement already satisfied: triton==3.4.0 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate<2,>=1.0.0->docling) (3.4.0)\n", | |
| "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.12/dist-packages (from transformers<5.0.0,>=4.34.0->docling-core[chunking]<3.0.0,>=2.48.2->docling) (2024.11.6)\n", | |
| "Requirement already satisfied: tokenizers<=0.23.0,>=0.22.0 in /usr/local/lib/python3.12/dist-packages (from transformers<5.0.0,>=4.34.0->docling-core[chunking]<3.0.0,>=2.48.2->docling) (0.22.1)\n", | |
| "Requirement already satisfied: imageio!=2.35.0,>=2.33 in /usr/local/lib/python3.12/dist-packages (from scikit-image->easyocr<2.0,>=1.7->docling) (2.37.0)\n", | |
| "Requirement already satisfied: tifffile>=2022.8.12 in /usr/local/lib/python3.12/dist-packages (from scikit-image->easyocr<2.0,>=1.7->docling) (2025.9.30)\n", | |
| "Requirement already satisfied: lazy-loader>=0.4 in /usr/local/lib/python3.12/dist-packages (from scikit-image->easyocr<2.0,>=1.7->docling) (0.4)\n", | |
| "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.12/dist-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<0.20.0,>=0.12.5->docling) (0.1.2)\n", | |
| "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.12/dist-packages (from sympy>=1.13.3->torch>=2.0.0->accelerate<2,>=1.0.0->docling) (1.3.0)\n", | |
| "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2->torch>=2.0.0->accelerate<2,>=1.0.0->docling) (3.0.3)\n", | |
| "Requirement already satisfied: multiprocess>=0.70.15 in /usr/local/lib/python3.12/dist-packages (from mpire[dill]->semchunk<3.0.0,>=2.2.0->docling-core[chunking]<3.0.0,>=2.48.2->docling) (0.70.16)\n", | |
| "Requirement already satisfied: dill>=0.3.8 in /usr/local/lib/python3.12/dist-packages (from multiprocess>=0.70.15->mpire[dill]->semchunk<3.0.0,>=2.2.0->docling-core[chunking]<3.0.0,>=2.48.2->docling) (0.3.8)\n", | |
| "Downloading docling-2.55.1-py3-none-any.whl (239 kB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m239.4/239.4 kB\u001b[0m \u001b[31m21.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hDownloading docling_core-2.48.4-py3-none-any.whl (164 kB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m164.4/164.4 kB\u001b[0m \u001b[31m16.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hDownloading docling_ibm_models-3.9.1-py3-none-any.whl (86 kB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hDownloading docling_parse-4.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.1 MB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.1/15.1 MB\u001b[0m \u001b[31m99.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hDownloading easyocr-1.7.2-py3-none-any.whl (2.9 MB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.9/2.9 MB\u001b[0m \u001b[31m100.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hDownloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)\n", | |
| "Downloading marko-2.2.0-py3-none-any.whl (42 kB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.7/42.7 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hDownloading polyfactory-2.22.2-py3-none-any.whl (63 kB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m63.7/63.7 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hDownloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.8/2.8 MB\u001b[0m \u001b[31m90.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hDownloading python_docx-1.2.0-py3-none-any.whl (252 kB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m253.0/253.0 kB\u001b[0m \u001b[31m24.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hDownloading python_pptx-1.0.2-py3-none-any.whl (472 kB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m472.8/472.8 kB\u001b[0m \u001b[31m31.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hDownloading rtree-1.4.1-py3-none-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (507 kB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m507.6/507.6 kB\u001b[0m \u001b[31m35.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hDownloading faker-37.8.0-py3-none-any.whl (2.0 MB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m73.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hDownloading jsonlines-3.1.0-py3-none-any.whl (8.6 kB)\n", | |
| "Downloading jsonref-1.1.0-py3-none-any.whl (9.4 kB)\n", | |
| "Downloading latex2mathml-3.78.1-py3-none-any.whl (73 kB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m73.9/73.9 kB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hDownloading semchunk-2.2.2-py3-none-any.whl (10 kB)\n", | |
| "Downloading xlsxwriter-3.2.9-py3-none-any.whl (175 kB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m175.3/175.3 kB\u001b[0m \u001b[31m16.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hDownloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (180 kB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m180.7/180.7 kB\u001b[0m \u001b[31m15.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hDownloading pyclipper-1.3.0.post6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (963 kB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m963.8/963.8 kB\u001b[0m \u001b[31m53.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hDownloading python_bidi-0.6.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (292 kB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m292.1/292.1 kB\u001b[0m \u001b[31m20.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hDownloading mpire-2.10.2-py3-none-any.whl (272 kB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m272.8/272.8 kB\u001b[0m \u001b[31m22.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hBuilding wheels for collected packages: pylatexenc\n", | |
| " Building wheel for pylatexenc (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
| " Created wheel for pylatexenc: filename=pylatexenc-2.10-py3-none-any.whl size=136817 sha256=077332a08ec5073e99aa1613449ed218256b979c5bac2cf3bfb478f2f557d7a9\n", | |
| " Stored in directory: /root/.cache/pip/wheels/06/3e/78/fa1588c1ae991bbfd814af2bcac6cef7a178beee1939180d46\n", | |
| "Successfully built pylatexenc\n", | |
| "Installing collected packages: python-bidi, pylatexenc, pyclipper, filetype, XlsxWriter, rtree, python-docx, pypdfium2, ninja, mpire, marko, latex2mathml, jsonref, jsonlines, faker, python-pptx, polyfactory, semchunk, docling-core, easyocr, docling-parse, docling-ibm-models, docling\n", | |
| "Successfully installed XlsxWriter-3.2.9 docling-2.55.1 docling-core-2.48.4 docling-ibm-models-3.9.1 docling-parse-4.5.0 easyocr-1.7.2 faker-37.8.0 filetype-1.2.0 jsonlines-3.1.0 jsonref-1.1.0 latex2mathml-3.78.1 marko-2.2.0 mpire-2.10.2 ninja-1.13.0 polyfactory-2.22.2 pyclipper-1.3.0.post6 pylatexenc-2.10 pypdfium2-4.30.0 python-bidi-0.6.6 python-docx-1.2.0 python-pptx-1.0.2 rtree-1.4.1 semchunk-2.2.2\n" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "name": "stderr", | |
| "text": [ | |
| "WARNING:easyocr.easyocr:Downloading detection model, please wait. This may take several minutes depending upon your network connection.\n", | |
| "WARNING:easyocr.easyocr:Downloading recognition model, please wait. This may take several minutes depending upon your network connection.\n" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "output:\n", | |
| "output\tsample_arxiv-referenced.md\n", | |
| "\n", | |
| "output/output:\n", | |
| "sample_arxiv-referenced_artifacts\n", | |
| "\n", | |
| "output/output/sample_arxiv-referenced_artifacts:\n", | |
| "image_000000_536d6dc5957170c29984f94ad0ddf7c2faaaf2cd88b962d1b385f13acf5ba66f.png\n", | |
| "image_000001_ab6952f60f9937a6cad7e523765f8a1d968ec2953937416f7596a50aeb0a4c63.png\n", | |
| "image_000002_35b5fe76ef3c23cddd7788e62be4d03daa9e37cdae00a416b15cbfce314a544d.png\n", | |
| "image_000003_a59ae69f61d5b9ff2a45d99aee95527084debe0f3266f5f61df061f2622c3523.png\n", | |
| "image_000004_bdd23a5bd56a22c91f196ea3e78377d8e7eee842e1ba64020910524801c7db64.png\n", | |
| "image_000005_327640766cc26f2688c87a83103ecb39cc75036c36d13b044a9c4ad8e45a88f3.png\n", | |
| "\n", | |
| "--- 抽出されたMarkdown(抜粋) ---\n", | |
| "Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.\n", | |
| "\n", | |
| "## Attention Is All You Need\n", | |
| "\n", | |
| "Ashish Vaswani ∗ Google Brain avaswani@google.com\n", | |
| "\n", | |
| "Noam Shazeer ∗ Google Brain noam@google.com\n", | |
| "\n", | |
| "Llion Jones ∗ Google Research llion@google.com\n", | |
| "\n", | |
| "Niki Parmar ∗ Google Research nikip@google.com\n", | |
| "\n", | |
| "Aidan N. Gomez ∗ † University of Toronto aidan@cs.toronto.edu\n", | |
| "\n", | |
| "Jakob Uszkoreit ∗ Google Research usz@google.com\n", | |
| "\n", | |
| "Łukasz Kaiser ∗ Google Brain lukaszkaiser@google.com\n", | |
| "\n", | |
| "Illia Polosukhin ∗ ‡\n", | |
| "\n", | |
| "illia.polosukhin@gmail.com\n", | |
| "\n", | |
| "## Abstract\n", | |
| "\n", | |
| "The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 Englishto-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.\n", | |
| "\n", | |
| "∗ Equal contribution. Listing order is random. Jakob proposed replacing RNNs with self-attention and started the effort to evaluate this idea. Ashish, with Illia, designed and implemented the first Transformer models and has been cru\n", | |
| "...\n" | |
| ] | |
| } | |
| ], | |
| "execution_count": 2, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "T8Z5Xj9SZ8k8", | |
| "outputId": "48129ff6-0366-4199-9c3e-54dbbf6efa85" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 199 | |
| }, | |
| "id": "49c7de8f", | |
| "outputId": "6ec23ca7-4da7-4df8-d47f-6203e993f190" | |
| }, | |
| "source": [ | |
| "# 出力ディレクトリをZIP圧縮\n", | |
| "zip_filename = \"output.zip\"\n", | |
| "!zip -r \"{zip_filename}\" \"{OUTPUT_DIR}\"\n", | |
| "\n", | |
| "# ZIPファイルをダウンロード\n", | |
| "from google.colab import files\n", | |
| "files.download(zip_filename)" | |
| ], | |
| "execution_count": 3, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| " adding: output/ (stored 0%)\n", | |
| " adding: output/sample_arxiv-referenced.md (deflated 69%)\n", | |
| " adding: output/output/ (stored 0%)\n", | |
| " adding: output/output/sample_arxiv-referenced_artifacts/ (stored 0%)\n", | |
| " adding: output/output/sample_arxiv-referenced_artifacts/image_000000_536d6dc5957170c29984f94ad0ddf7c2faaaf2cd88b962d1b385f13acf5ba66f.png (deflated 2%)\n", | |
| " adding: output/output/sample_arxiv-referenced_artifacts/image_000004_bdd23a5bd56a22c91f196ea3e78377d8e7eee842e1ba64020910524801c7db64.png (deflated 4%)\n", | |
| " adding: output/output/sample_arxiv-referenced_artifacts/image_000002_35b5fe76ef3c23cddd7788e62be4d03daa9e37cdae00a416b15cbfce314a544d.png (deflated 2%)\n", | |
| " adding: output/output/sample_arxiv-referenced_artifacts/image_000005_327640766cc26f2688c87a83103ecb39cc75036c36d13b044a9c4ad8e45a88f3.png (deflated 2%)\n", | |
| " adding: output/output/sample_arxiv-referenced_artifacts/image_000001_ab6952f60f9937a6cad7e523765f8a1d968ec2953937416f7596a50aeb0a4c63.png (deflated 0%)\n", | |
| " adding: output/output/sample_arxiv-referenced_artifacts/image_000003_a59ae69f61d5b9ff2a45d99aee95527084debe0f3266f5f61df061f2622c3523.png (deflated 7%)\n" | |
| ] | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/plain": [ | |
| "<IPython.core.display.Javascript object>" | |
| ], | |
| "application/javascript": [ | |
| "\n", | |
| " async function download(id, filename, size) {\n", | |
| " if (!google.colab.kernel.accessAllowed) {\n", | |
| " return;\n", | |
| " }\n", | |
| " const div = document.createElement('div');\n", | |
| " const label = document.createElement('label');\n", | |
| " label.textContent = `Downloading \"${filename}\": `;\n", | |
| " div.appendChild(label);\n", | |
| " const progress = document.createElement('progress');\n", | |
| " progress.max = size;\n", | |
| " div.appendChild(progress);\n", | |
| " document.body.appendChild(div);\n", | |
| "\n", | |
| " const buffers = [];\n", | |
| " let downloaded = 0;\n", | |
| "\n", | |
| " const channel = await google.colab.kernel.comms.open(id);\n", | |
| " // Send a message to notify the kernel that we're ready.\n", | |
| " channel.send({})\n", | |
| "\n", | |
| " for await (const message of channel.messages) {\n", | |
| " // Send a message to notify the kernel that we're ready.\n", | |
| " channel.send({})\n", | |
| " if (message.buffers) {\n", | |
| " for (const buffer of message.buffers) {\n", | |
| " buffers.push(buffer);\n", | |
| " downloaded += buffer.byteLength;\n", | |
| " progress.value = downloaded;\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " const blob = new Blob(buffers, {type: 'application/binary'});\n", | |
| " const a = document.createElement('a');\n", | |
| " a.href = window.URL.createObjectURL(blob);\n", | |
| " a.download = filename;\n", | |
| " div.appendChild(a);\n", | |
| " a.click();\n", | |
| " div.remove();\n", | |
| " }\n", | |
| " " | |
| ] | |
| }, | |
| "metadata": {} | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/plain": [ | |
| "<IPython.core.display.Javascript object>" | |
| ], | |
| "application/javascript": [ | |
| "download(\"download_f3c76071-de73-46d0-9cd2-d4e5174b7150\", \"output.zip\", 760067)" | |
| ] | |
| }, | |
| "metadata": {} | |
| } | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "provenance": [], | |
| "gpuType": "T4", | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "name": "python3" | |
| }, | |
| "accelerator": "GPU" | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment