@Seraph2000
Created September 17, 2018 11:26
Extension of a scrapy spider which automates the process of [i] downloading PDFs from a website, [ii] converting the PDFs to plain text, [iii] extracting info using regular expressions, [iv] collating the info into individual CSV files, and [v] loading the data to an AWS server.
# -*- coding: utf-8 -*-
import scrapy
import os
import re
import json
import csv
import time
import sys
import glob
import shutil
import argparse
import configparser
import requests
import urllib
import subprocess
import logging
from os.path import abspath
from langdetect import detect
from os import listdir
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from scrapy import Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from gujarat_gov.items import GujaratGovItem
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ProductSpider(scrapy.Spider):
    name = "gujarat"
    start_urls = ["https://fir.gujarat.gov.in/"]

    def __init__(self):
        print("initialising the driver...")
        options = webdriver.ChromeOptions()
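        # a possible hardening step, not in the original gist: force Chrome to
        # download PDFs rather than open them in the built-in viewer, so the
        # pdf.click() calls in parse() reliably land in the Downloads directory
        # (the path below is an assumption - adjust to your machine):
        # options.add_experimental_option("prefs", {
        #     "plugins.always_open_pdf_externally": True,
        #     "download.default_directory": "/home/seraphina/Downloads",
        # })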
        # replace with your web driver path
        self.driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver', chrome_options=options)
        self.driver.wait = WebDriverWait(self.driver, 5)
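
    # not in the original gist: scrapy calls closed() automatically when the
    # spider finishes, which is a convenient place to shut the browser down
    def closed(self, reason):
        self.driver.quit()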

    def parse(self, response):
        self.driver.get(response.url)
        # options 1: the district dropdown (skip the placeholder first entry)
        drpdist = Select(self.driver.find_element_by_id("drpdist"))
        drpdist_options = [option.text for option in drpdist.options]
        drpdist_options = drpdist_options[1:]
        for district in drpdist_options:
            print("about to process option...", district)
            drpdist.select_by_visible_text(district)
            time.sleep(1)
            # options 2: the police-station dropdown for this district
            drppolicestation = Select(self.driver.find_element_by_id("drppolicestation"))
            drppolicestation_options = [option.text for option in drppolicestation.options]
            drppolicestation_options = drppolicestation_options[1:]
            for station in drppolicestation_options:
                print("about to process station...", station)
                drppolicestation.select_by_visible_text(station)
                time.sleep(1)
                # from date: open the calendar widget
                self.driver.find_element_by_xpath('//input[@id="txtfd"]').click()
                # target date - set to your requirements
                year = "2018"
                month = "Jan"
                day = "12"
                get_title = self.driver.find_element_by_xpath('//div[@id="CalendarExtender2_title"]')
                title = get_title.text
                get_title.click()
                # page back through the calendar title until the target year shows
                while year not in title:
                    self.driver.find_element_by_xpath('//div[@id="CalendarExtender2_prevArrow"]').click()
                    get_title = self.driver.find_element_by_xpath('//div[@id="CalendarExtender2_title"]')
                    title = get_title.text
                # re-open the calendar
                self.driver.find_element_by_xpath('//input[@id="txtfd"]').click()
                time.sleep(2)
print("testing month...", month)
month_expr = '//tbody[@id="CalendarExtender2_monthsBody"]//div[contains(text(), ' + '"' + month + '"' + ')]'
month = self.driver.find_element_by_xpath(month_expr)
month.click()
print("selected month!")
time.sleep(2)
self.driver.find_element_by_xpath('//input[@id="txtfd"]').click()
print("testing day...", day)
day_expr = '//tbody[@id="CalendarExtender2_daysBody"]//div[contains(text(), ' + '"' + day + '"' + ')]'
day = self.driver.find_element_by_xpath(day_expr)
day.click()
print("selected day!")
                ### (B) download pdfs ###
                # click on search to get results
                search = self.driver.find_element_by_xpath('//input[@value="Search / શોધો"]')
                search.click()
                # collect the per-FIR pdf links from the results table
                pdfs = self.driver.find_elements_by_xpath('//table[@id="getdata"]//tr/td/a')
                ### test pdfs: limit to the first four; remove this slice to process all ###
                pdfs = pdfs[:4]
                # replace with your paths
                # PATH = '/home/seraphina/Documents/CONTRACTS/UPWORK/JAY/gujarat_gov/gujarat_gov/TEXT/'
                for i, pdf in enumerate(pdfs):
                    item = GujaratGovItem()
                    # clear artefacts left over from the previous iteration
                    if os.path.isfile("file.tiff"):
                        print("deleting file.tiff")
                        os.remove("file.tiff")
                    if os.path.isfile("output.txt"):
                        print("deleting output.txt")
                        os.remove("output.txt")
                    if os.path.isfile("new_file.pdf"):
                        print("deleting new_file.pdf")
                        os.remove("new_file.pdf")
                    print("file number...", i)
                    pdf.click()
                    time.sleep(10)
                    # get most recent download - replace with your download directory
                    downloads = glob.glob('/home/seraphina/Downloads/*')
                    latest_file = max(downloads, key=os.path.getctime)
                    ### (C) convert pdf to text ###
                    # wait (up to 60s) while the browser is still writing the
                    # .crdownload file, re-checking the newest download each second
                    count = 0
                    while '.crdownload' in latest_file and count < 60:
                        print(count, end=' ')
                        count += 1
                        time.sleep(1)
                        downloads = glob.glob('/home/seraphina/Downloads/*')
                        latest_file = max(downloads, key=os.path.getctime)
                    if os.path.isfile(latest_file):
                        print("we have downloaded the following file...", latest_file)
                        new_name = 'new_file.pdf'
                        os.rename(latest_file, new_name)
                        print("start processing pdf...")
                        subprocess.call("./run_tesseract.sh", shell=True)
                        print("end processing pdf")
                    else:
                        raise ValueError("%s isn't a file!" % latest_file)
                    time.sleep(3)
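                    # run_tesseract.sh is not part of this gist. A minimal
                    # sketch of what it is assumed to do (pdf -> tiff -> text,
                    # producing the output.txt consumed below) might be:
                    #   #!/bin/bash
                    #   gs -q -dNOPAUSE -dBATCH -r300 -sDEVICE=tiffg4 -sOutputFile=file.tiff new_file.pdf
                    #   tesseract file.tiff output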
                    ## (D) process text and output to csv ##
                    txt_file = abspath('output.txt')
                    # rename the pdf after district, station and index i
                    pdf_name = district + '-' + station + '-' + str(i) + '.pdf'
                    os.rename(new_name, pdf_name)
                    print("processing file...", txt_file)
                    # 1. read the OCR output into a list of lines
                    with open(txt_file, 'r') as f:
                        text = list(f)
                    if text != []:
                        print("testing text output...")
                        item['district_name'] = district
                        item['police_station'] = station
                        # 2. parse the unstructured text: the header line carries
                        # the FIR date, number and year
                        info1 = [e for e in text if re.match(r'District.*Police.*?Date\s*\d{2}/\d{2}/(?:19|20)\d{2}.*', e)]
                        if info1 != []:
                            info1 = info1[0]
                            m = re.match(r'.*?Date\s*((?:[0-2][0-9]|3[0-1])/(?:[0-2][0-9]|3[0-1])/(?:19|20)\d{2}).*', info1)
                            item['fir_date'] = m.group(1) if m else ''
                            m = re.match(r'.*?FIR\s*No\.?\s*(.*?)\s*Date.*', info1)
                            item['fir_no'] = m.group(1) if m else ''
                            m = re.match(r'.*Police\s*Stati\s*.*\s*Year\s*((?:20|19)\d{2})\s*FIR.*', info1)
                            item['fir_year'] = m.group(1) if m else ''
                        else:
                            item['fir_date'] = ''
                            item['fir_no'] = ''
                            item['fir_year'] = ''
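                        # for reference, the header line the regexes above target
                        # is assumed to look roughly like (reconstructed from the
                        # patterns, not taken from a real FIR):
                        #   "District ... Police Station ... Year 2018 FIR No. 011/2018 Date 12/01/2018"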
                        # complainant / petitioner details
                        beg = [text.index(e) for e in text if 'Complai' in e]
                        if beg != []:
                            beg = beg[0]
                            end = [text.index(e) for e in text if 'Details of known' in e]
                            # fall back to the end of the document if the closing marker is missing
                            end = end[0] if end != [] else len(text)
                            pet_info = ''.join(text[beg:end])
                            m = re.match(r'(?:.*\n)+.*?\(a\)\s*Name\s*((?:.*\n)+.*?)\(c\).*', pet_info)
                            item['petitioner_name'] = m.group(1) if m else ''
                            m = re.match(r'(?:.*\n)+.*?\(c\)\s*Date/Year\s*of\s*Birth\s*\(d\)\s*((?:.*\n)+.*)\(e\).*', pet_info)
                            item['petitioner_dob'] = m.group(1).replace('\n', '') if m else ''
                            m = re.match(r'(?:.*\n)+.*?Occupation((?:.*\n)+.*?)\(g\).*', pet_info)
                            item['petitioner_occupation'] = m.group(1).strip() if m else ''
                            m = re.match(r'(?:.*\n)+.*?Nationality\s*(.*\n.*?)\n+.*', pet_info)
                            item['petitioner_dob_nationality'] = m.group(1) if m else ''
                        else:
                            item['petitioner_name'] = ''
                            item['petitioner_dob'] = ''
                            item['petitioner_occupation'] = ''
                            item['petitioner_dob_nationality'] = ''
                        beg = [text.index(e) for e in text if 'Details of known/suspected' in e]
                        if beg != []:
                            beg = beg[0]
                            end = [text.index(e) for e in text if 'Accused Name' in e]
                            if end != []:
                                end = end[0]
                                details = ''.join(text[beg + 1:end])
                                item['respondent_details'] = details.replace('(Attach seperate sheet, if necessary)', '').strip()
                            else:
                                item['respondent_details'] = ''
                        else:
                            item['respondent_details'] = ''
                        beg = [text.index(e) for e in text if 'Accused Name Age' in e]
                        if beg != []:
                            beg = beg[0]
                            end = [text.index(e) for e in text if 'Reasons for delay' in e]
                            if end != []:
                                end = end[0]
                                item['respondent_name_age_address'] = ''.join(text[beg + 1:end]).strip()
                            else:
                                item['respondent_name_age_address'] = ''
                        else:
                            item['respondent_name_age_address'] = ''
                        beg = [text.index(e) for e in text if 'Signature of Officer' in e]
                        if beg != []:
                            beg = beg[0]
                            end = [text.index(e) for e in text if 'Signature/Thumb' in e]
                            if end != []:
                                end = end[0]
                                sig = ''.join(text[beg:end])
                                m = re.match(r'(?:.*\n)+.*?(\(.*\)).*', sig)
                                item['officer_signature'] = m.group(1) if m else ''
                            else:
                                item['officer_signature'] = ''
                        else:
                            item['officer_signature'] = ''
                        beg = [text.index(e) for e in text if 'Signature/Thumb' in e]
                        if beg != []:
                            beg = beg[0]
                            end = [text.index(e) for e in text if 'Rank' in e]
                            if end != []:
                                end = end[0]
                                thumb = ''.join(text[beg:end])
                                m = re.match(r'Signature/Thumb.*?Name\s*((?:.*\n)+.*)', thumb)
                                item['officer_thumb'] = m.group(1).strip() if m else ''
                            else:
                                item['officer_thumb'] = ''
                        else:
                            item['officer_thumb'] = ''
                        beg = [text.index(e) for e in text if 'Rank' in e]
                        if beg != []:
                            beg = beg[0]
                            end = [text.index(e) for e in text if 'Date and time of dispatch' in e]
                            if end != []:
                                end = end[0]
                                rank_no = ''.join(text[beg:end])
                                m = re.match(r'Rank\s*(.*?)\s*GPF.*', rank_no)
                                item['officer_rank'] = m.group(1) if m else ''
                                m = re.match(r'.*?GPF\s*No\s*(.*?)\n.*', rank_no)
                                item['officer_number'] = m.group(1) if m else ''
                            else:
                                item['officer_rank'] = ''
                                item['officer_number'] = ''
                        else:
                            item['officer_rank'] = ''
                            item['officer_number'] = ''
                        dispatch = [m.group(1) for m in (re.match(r'.*?Date\s*and\s*time\s*of\s*dispatch\s*to\s*the\s*cour\s*(.*)', e) for e in text) if m]
                        item['date_dispatch_to_court'] = dispatch[0] if dispatch != [] else ''
                        beg = [text.index(e) for e in text if 'case No.' in e]
                        if beg != []:
                            beg = beg[0]
                            item['case_number'] = text[beg + 1].strip()
                        else:
                            item['case_number'] = ''
                        beg = [text.index(e) for e in text if 'received at PS' in e]
                        if beg != []:
                            beg = beg[0]
                            m = re.match(r'.*?PS:\s*(.*)\n*', text[beg])
                            item['date_received_at_station'] = m.group(1) if m else ''
                        else:
                            item['date_received_at_station'] = ''
                        beg = [text.index(e) for e in text if 'Direction and distance' in e]
                        if beg != []:
                            beg = beg[0]
                            item['direction_distance_beat_no'] = text[beg + 1].strip()
                        else:
                            item['direction_distance_beat_no'] = ''
                        index = [text.index(e) for e in text if 'Type of Information:' in e]
                        if index != []:
                            index = index[0]
                            type_info = ''.join(text[index:index + 10])
                            m = re.match(r'Type of Information:\s*(.*\n+.*)\n+', type_info)
                            item['case_type'] = m.group(1) if m else ''
                        else:
                            item['case_type'] = ''
                        index = [text.index(e) for e in text if 'Details of known/suspected' in e]
                        if index != []:
                            index = index[0]
                            case_details = text[index:index + 20]
                            # note: text.index(e) returns positions in the full text,
                            # so the slice below is taken from `text`, not case_details
                            end = [text.index(e) for e in case_details if 'Reasons for delay' in e]
                            if end != []:
                                end = end[0]
                                item['case_details'] = ''.join(text[index:end])
                            else:
                                item['case_details'] = ''
                        else:
                            item['case_details'] = ''
                    # post the renamed pdf to the api
                    upload_file_path = '/home/seraphina/Documents/CONTRACTS/UPWORK/JAY/gujarat_gov/gujarat_gov/' + pdf_name
                    try:
                        with open(upload_file_path, 'rb') as upload:
                            files = {
                                'state': (None, 'NJ'),
                                'district': (None, 'nyc'),
                                'file_type': (None, 'pdf'),
                                'uploadfile': (upload_file_path, upload),
                                'user': (None, 'Seraphina'),
                            }
                            # use a separate name so we don't clobber parse()'s
                            # `response` argument
                            api_response = requests.post('https://verify24x7.in/live/dump_pdf/api.php', files=files)
                        logger.info('Response status: %s', api_response.status_code)
                        logger.info(api_response.text)
                    except requests.exceptions.RequestException as err:
                        logger.error('There was an error posting the data: %s', err)
                    yield item
drpdist = Select(self.driver.find_element_by_id("drpdist"))
drpdist_options = drpdist.options
drpdist_options = [option.text for option in drpdist_options]
drpdist_options = drpdist_options[1:]
drppolicestation = Select(self.driver.find_element_by_id("drppolicestation"))
drppolicestation_options = drppolicestation.options
drppolicestation_options = [option.text for option in drppolicestation_options]
drppolicestation_options = drppolicestation_options[1:]

# equivalent manual upload for testing:
# curl -X POST -F 'state=NJ' -F 'district=nyc' -F 'file_type=pdf' -F 'uploadfile=@/Users/ssss/Downloads/fir.pdf' -F 'user=Seraphina' https://verify24x7.in/live/dump_pdf/api.php
#
# to run from a terminal, enter:
#   scrapy crawl gujarat
# (add e.g. -o firs.csv to have scrapy export the yielded items to a CSV file)
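
# gujarat_gov/items.py is not included in this gist. A minimal definition
# matching the fields the spider populates (names taken from the code above)
# would look like:
#
#   import scrapy
#
#   class GujaratGovItem(scrapy.Item):
#       district_name = scrapy.Field()
#       police_station = scrapy.Field()
#       fir_date = scrapy.Field()
#       fir_no = scrapy.Field()
#       fir_year = scrapy.Field()
#       petitioner_name = scrapy.Field()
#       petitioner_dob = scrapy.Field()
#       petitioner_occupation = scrapy.Field()
#       petitioner_dob_nationality = scrapy.Field()
#       respondent_details = scrapy.Field()
#       respondent_name_age_address = scrapy.Field()
#       officer_signature = scrapy.Field()
#       officer_thumb = scrapy.Field()
#       officer_rank = scrapy.Field()
#       officer_number = scrapy.Field()
#       date_dispatch_to_court = scrapy.Field()
#       case_number = scrapy.Field()
#       date_received_at_station = scrapy.Field()
#       direction_distance_beat_no = scrapy.Field()
#       case_type = scrapy.Field()
#       case_details = scrapy.Field()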