Skip to content

Instantly share code, notes, and snippets.

@ritwikraha
Created December 5, 2023 06:48
Show Gist options
  • Save ritwikraha/cedaa0304099b68947ea14fdda538dff to your computer and use it in GitHub Desktop.
Save ritwikraha/cedaa0304099b68947ea14fdda538dff to your computer and use it in GitHub Desktop.
PDF-Extractor.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyOM1MK/wsnFQZ9IDhe5wqdw",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/ritwikraha/cedaa0304099b68947ea14fdda538dff/pdf-extractor.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3W6PqL7fnKoE",
"outputId": "d1e0d8b6-d930-440d-f6b5-e8b444d83d63"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting PyPDF2\n",
" Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m232.6/232.6 kB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: PyPDF2\n",
"Successfully installed PyPDF2-3.0.1\n"
]
}
],
"source": [
"!pip install PyPDF2"
]
},
{
"cell_type": "code",
"source": [
"start_page = 12\n",
"end_page =94"
],
"metadata": {
"id": "GowGZUpWrUDo"
},
"execution_count": 4,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import csv\n",
"from PyPDF2 import PdfReader\n",
"\n",
"# Creating a pdf reader object\n",
"reader = PdfReader('puzzles.pdf')\n",
"\n",
"# Open a new CSV file for writing\n",
"with open('questions.csv', 'w', newline='', encoding='utf-8') as file:\n",
" writer = csv.writer(file)\n",
"\n",
" # Writing the header row\n",
" writer.writerow(['page_number', 'questions'])\n",
"\n",
" # Looping through the pages\n",
" for page_number in range(start_page, end_page):\n",
" page = reader.pages[page_number]\n",
" text = page.extract_text()\n",
"\n",
" # Writing the page number and text to the CSV file\n",
" writer.writerow([page_number + 1, text]) # Adding 1 because page_number starts from 0\n"
],
"metadata": {
"id": "6G7OW0_qnaTT"
},
"execution_count": 5,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment