ritwikraha/pdf-extractor.ipynb

## pdf-extractor.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyOM1MK/wsnFQZ9IDhe5wqdw",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/ritwikraha/cedaa0304099b68947ea14fdda538dff/pdf-extractor.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "3W6PqL7fnKoE",
        "outputId": "d1e0d8b6-d930-440d-f6b5-e8b444d83d63"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Collecting PyPDF2\n",
            "  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m232.6/232.6 kB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hInstalling collected packages: PyPDF2\n",
            "Successfully installed PyPDF2-3.0.1\n"
          ]
        }
      ],
      "source": [
        "# Pinned to the version recorded in this cell's output so re-runs are reproducible\n",
        "%pip install PyPDF2==3.0.1"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Page window as 0-based indices into reader.pages: start is inclusive, end is exclusive\n",
        "start_page = 12\n",
        "end_page = 94"
      ],
      "metadata": {
        "id": "GowGZUpWrUDo"
      },
      "execution_count": 4,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import csv\n",
        "from PyPDF2 import PdfReader\n",
        "\n",
        "# Load the PDF once; its pages are addressed by 0-based index\n",
        "reader = PdfReader('puzzles.pdf')\n",
        "\n",
        "# newline='' leaves line-ending handling to the csv module\n",
        "with open('questions.csv', 'w', newline='', encoding='utf-8') as file:\n",
        "    writer = csv.writer(file)\n",
        "    writer.writerow(['page_number', 'questions'])\n",
        "\n",
        "    # One row per page in [start_page, end_page), labelled with its 1-based page number\n",
        "    writer.writerows(\n",
        "        [page_number + 1, reader.pages[page_number].extract_text()]\n",
        "        for page_number in range(start_page, end_page)\n",
        "    )\n"
      ],
      "metadata": {
        "id": "6G7OW0_qnaTT"
      },
      "execution_count": 5,
      "outputs": []
    }
  ]
}
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"provenance": [],
	"authorship_tag": "ABX9TyOM1MK/wsnFQZ9IDhe5wqdw",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/ritwikraha/cedaa0304099b68947ea14fdda538dff/pdf-extractor.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "3W6PqL7fnKoE",
	"outputId": "d1e0d8b6-d930-440d-f6b5-e8b444d83d63"
	},
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Collecting PyPDF2\n",
	" Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)\n",
	"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m232.6/232.6 kB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[?25hInstalling collected packages: PyPDF2\n",
	"Successfully installed PyPDF2-3.0.1\n"
	]
	}
	],
	"source": [
	"# Pinned to the version recorded in this cell's output so re-runs are reproducible\n",
	"%pip install PyPDF2==3.0.1"
	]
	},
	{
	"cell_type": "code",
	"source": [
	"# Page window as 0-based indices into reader.pages: start is inclusive, end is exclusive\n",
	"start_page = 12\n",
	"end_page = 94"
	],
	"metadata": {
	"id": "GowGZUpWrUDo"
	},
	"execution_count": 4,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"import csv\n",
	"from PyPDF2 import PdfReader\n",
	"\n",
	"# Creating a pdf reader object\n",
	"reader = PdfReader('puzzles.pdf')\n",
	"\n",
	"# Open a new CSV file for writing\n",
	"with open('questions.csv', 'w', newline='', encoding='utf-8') as file:\n",
	"    writer = csv.writer(file)\n",
	"\n",
	"    # Writing the header row\n",
	"    writer.writerow(['page_number', 'questions'])\n",
	"\n",
	"    # Looping through the pages in [start_page, end_page) (0-based indices)\n",
	"    for page_number in range(start_page, end_page):\n",
	"        page = reader.pages[page_number]\n",
	"        text = page.extract_text()\n",
	"\n",
	"        # Writing the 1-based page number and the page text to the CSV file\n",
	"        writer.writerow([page_number + 1, text])  # Adding 1 because page_number starts from 0\n"
	],
	"metadata": {
	"id": "6G7OW0_qnaTT"
	},
	"execution_count": 5,
	"outputs": []
	}
	]
	}