# Skip to content
#
# Instantly share code, notes, and snippets.
#
# @myselfshravan
# Last active September 17, 2023 11:07
# Show Gist options
# Save myselfshravan/53c09c95b40f974d34d853799767288f to your computer and use it in GitHub Desktop.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import fitz
import requests
import json
from urllib.parse import urljoin  # used below to turn relative hrefs into absolute URLs

# Listing page whose tables are scanned for document links.
url = "https://mines.gov.in/webportal/rules"

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Run in headless mode

# The context manager quits the browser even if navigation raises.
with webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options) as driver:
    # An implicit wait only applies to element lookups, so it must be set
    # before navigating and followed by a lookup to have any effect.
    driver.implicitly_wait(4)
    driver.get(url)
    # Gives the page up to 4 seconds to render a table; the result itself is
    # unused and an empty list (no table appeared) is not an error.
    driver.find_elements(By.TAG_NAME, 'table')
    page_url = driver.current_url
    print("Page URL:", page_url)
    page_source = driver.page_source

soup = BeautifulSoup(page_source, 'html.parser')

# Collected link targets, one per table cell that contains an anchor with an
# href. NOTE: nothing here checks that a link actually points to a PDF.
pdf_links = []
table = soup.find_all('table', {'class': 'table table-bordered table-striped mb-0'})
for t in table:
    for tr in t.find_all('tr'):
        for td in tr.find_all('td'):
            a = td.find('a')
            # Anchors without an href (e.g. named anchors) are skipped
            # instead of raising KeyError.
            href = a.get('href') if a else None
            if href:
                # Relative hrefs are resolved against the page URL so they
                # can be fetched later; absolute hrefs pass through unchanged.
                pdf_links.append(urljoin(page_url, href))
print(pdf_links)
def extract_text_from_pdf_url(pdf_url_ap, timeout=30):
    """Download a PDF and return the text of all of its pages.

    Args:
        pdf_url_ap: URL of the PDF document to download.
        timeout: Seconds to wait for the server, passed to ``requests.get``
            so that an unresponsive host cannot block the script forever.

    Returns:
        The concatenated text of every page (possibly an empty string when
        the PDF has no extractable text), or ``None`` when the server does
        not answer with status 200 or any error occurs along the way.
    """
    try:
        response = requests.get(pdf_url_ap, timeout=timeout)
        if response.status_code != 200:
            # Report the URL that was actually requested (the parameter),
            # not a variable that happens to exist in the calling script.
            print(f"Failed to download PDF: {pdf_url_ap}")
            return None

        pdf_document = fitz.open(stream=response.content, filetype="pdf")
        try:
            return "".join(
                pdf_document.load_page(page_num).get_text()
                for page_num in range(pdf_document.page_count)
            )
        finally:
            # Release the parsed document even if a page fails to load.
            pdf_document.close()
    except Exception as e:
        # Deliberately broad: this is a best-effort helper, so a single bad
        # download or malformed PDF is reported and skipped, not fatal.
        print(f"An error occurred: {str(e)}")
    return None
# One {"url", "text"} record per PDF that yielded non-empty text.
extracted_text_list = []
json_file_path = 'extracted_text_all.json'

for pdf_url in pdf_links:
    extracted_text = extract_text_from_pdf_url(pdf_url)
    if not extracted_text:
        # None or an empty string: nothing to record for this link.
        continue
    record = {"url": pdf_url, 'text': extracted_text}
    extracted_text_list.append(record)
    print(f"Text extracted from {pdf_url}")

print("Text extraction complete.")

# ensure_ascii=False writes non-ASCII characters as-is rather than as \u escapes.
serialized = json.dumps(extracted_text_list, ensure_ascii=False, indent=4)
with open(json_file_path, 'w', encoding='utf-8') as json_file:
    json_file.write(serialized)

print(f"Text extraction and JSON file creation complete. Data saved to {json_file_path}")
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment