Scraping The Gazette of India with Selenium + ChromeDriver in Python
#!/usr/bin/python3

from selenium import webdriver
import time
import random
import os
import csv

url = 'http://egazette.bih.nic.in/SearchAdvanceGazette.aspx'
# Launching Chrome and loading the search page
driver = webdriver.Chrome()
driver.get(url)
time.sleep(0.4) # Short pause to make sure the page is finished loading
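# Note (untested assumption): the PDF links clicked below are saved to Chrome's default download
# folder (~/Downloads). To send them to a dedicated folder instead, the driver above could be
# launched with a download-directory preference, e.g.:
#
#     options = webdriver.ChromeOptions()
#     options.add_experimental_option('prefs', {'download.default_directory': '/path/to/pdf/folder'})
#     driver = webdriver.Chrome(options=options)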
# Clicking 'Search' button with all fields empty, which returns all records
search_button = driver.find_element_by_xpath('//input[@value="Search"]')
search_button.click()
time.sleep(0.4)
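# Note (untested alternative): the fixed time.sleep() calls in this script could be replaced with
# Selenium's explicit waits, which poll until an element is actually present. The 10-second timeout
# below is arbitrary; the element ID comes from the results table targeted later in the script, e.g.:
#
#     from selenium.webdriver.common.by import By
#     from selenium.webdriver.support.ui import WebDriverWait
#     from selenium.webdriver.support import expected_conditions as EC
#     WebDriverWait(driver, 10).until(
#         EC.presence_of_element_located((By.ID, 'ctl00_ContentPlaceHolder1_DetailView')))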
# At this point we're on the first page of search results.
master_metadata_table = []
errors = []
page_number = 1
for i in range(650): # There should be 615 pages of results; overshooting just in case.
    try:
        table_body = driver.find_element_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_DetailView"]/tbody') # Finding the relevant table with an XPath query
        for row_number in range(3, 18): # Looping through the 15 rows that contain PDF links
            row = table_body.find_element_by_xpath("tr[{}]".format(row_number))
            print(row.text)
            row_cells = []
            for cell_number in range(1, 8): # Looping through 7 columns, beginning with index 1
                try: # Try/except here is a precaution. If something goes wrong, we'll record the error and continue the scrape.
                    row_cells.append(row.find_element_by_xpath('td[{}]'.format(cell_number)).text) # Adding cell text to temporary row list
                except Exception as e:
                    print(row.text)
                    print(e)
                    errors.append([e, row.text, page_number]) # Recording the error so we can deal with it later
            master_metadata_table.append(row_cells) # Adding the current row (as a list) to our running metadata table (a list of lists)
            row.find_element_by_xpath('td[1]/a').click() # Clicking the PDF link in cell 1, which will download to ~/Downloads
            time.sleep(1.5 + random.random()) # Waiting between 1.5 and 2.5 seconds before proceeding to the next row (as a courtesy, to avoid triggering a potential rate limit, and to avoid hitting Chrome's cap on simultaneous downloads)
        time.sleep(11 + random.random()) # Waiting between 11 and 12 seconds before we load the next page. No rush.
        page_number += 1 # Incrementing the page number variable
        # The JS call below navigates to a given page number. Note that this only works for pages linked from the current page.
        # (We're doing this at the end of the loop because you can't navigate to page 1 from page 1.)
        driver.execute_script("javascript:__doPostBack('ctl00$ContentPlaceHolder1$DetailView','Page${}')".format(page_number))
        time.sleep(0.4) # Short pause to make sure the page is finished loading
    except Exception as e:
        errors.append(e)
        print(e)
        print('Stopped on page: ' + str(page_number)) # In case the browser crashes, we can restart the scrape from here. This would require amending the code to start at page 1 and navigate through pages 2, 3, 4, etc. (without downloading PDFs) until we get to the new start page; see the resume sketch after the script.
        break # Breaking the loop when we reach the last page (or when the browser crashes)
print(errors)
# Creating new metadata table with a column that reflects downloaded PDFs' filenames
filename_prefix = 'D__Websites_eGazette_GazettePublished_'
output_metadata_table = [row + [filename_prefix + row[-1]] for row in master_metadata_table]
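# (For example, a hypothetical listed filename of '1234-2021.pdf' would be recorded here as
# 'D__Websites_eGazette_GazettePublished_1234-2021.pdf', matching the name the downloaded PDF gets.)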
# Writing metadata table (a list of lists) to CSV file in ~/Downloads
os.chdir(os.path.expanduser('~/Downloads'))
header = ['Gazette No.', 'Published Date', 'Type', 'Page Start', 'Page End', 'Year', 'Filename (listed)', 'Filename (download)']
with open('egazette.bih.nic.in_metadata.csv', 'w', newline='') as file_out: # newline='' as recommended for csv.writer
    csv_writer = csv.writer(file_out)
    csv_writer.writerow(header)
    csv_writer.writerows(output_metadata_table)
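If Chrome crashes partway through, the script prints 'Stopped on page: N' before breaking. Below is a minimal, untested sketch of the restart procedure described in that comment: rerun the setup and Search steps above, then page forward (without clicking any PDF links) until the saved page number is reached. The name resume_page is a placeholder for the printed page number.

# Untested resume sketch: fast-forward to the page where the previous run stopped.
resume_page = 100 # placeholder: use the number printed by 'Stopped on page'
for target_page in range(2, resume_page + 1):
    driver.execute_script("javascript:__doPostBack('ctl00$ContentPlaceHolder1$DetailView','Page${}')".format(target_page))
    time.sleep(1 + random.random()) # Brief pause so each page can render before the next postback
page_number = resume_page
# The main scraping loop above can then be rerun, starting from this page.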