Created
September 17, 2018 11:26
-
-
Save Seraph2000/5fc30cb9356e18cac6cf5fc29576c942 to your computer and use it in GitHub Desktop.
Extension of a scrapy spider, which automates the process of [i] downloading PDFs from a website, [ii] converting the PDFs to plain text, [iii] extracting information using regular expressions, [iv] collating the information into individual CSV files, and [v] loading the data to an AWS server
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import scrapy | |
import os | |
import re | |
import json | |
import csv | |
import time | |
import sys | |
import glob | |
import shutil | |
import argparse | |
import ConfigParser | |
import requests | |
import urllib | |
import subprocess | |
import logging | |
from os.path import abspath | |
from langdetect import detect | |
from os import listdir | |
from datetime import datetime | |
from selenium import webdriver | |
from selenium.webdriver.support.ui import Select | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
from selenium.webdriver.common.action_chains import ActionChains | |
from selenium.webdriver.common.keys import Keys | |
from selenium.common.exceptions import TimeoutException | |
from selenium.common.exceptions import NoSuchElementException | |
from scrapy import Spider | |
from scrapy.spiders import CrawlSpider, Rule | |
from scrapy.linkextractors import LinkExtractor | |
from scrapy.selector import Selector | |
from scrapy.http import Request | |
from gujarat_gov.items import GujaratGovItem | |
logging.basicConfig(level=logging.INFO) | |
logger = logging.getLogger(__name__) | |
class ProductSpider(scrapy.Spider):
    """Crawl https://fir.gujarat.gov.in/.

    For every district / police-station pair the spider picks a fixed
    'from' date in the ASP.NET calendar widget, downloads the FIR PDFs
    listed in the result table, OCRs each one to ``output.txt`` (via the
    external ``run_tesseract.sh`` script), extracts the FIR fields with
    regular expressions, POSTs the PDF to a remote dump API, and yields
    one ``GujaratGovItem`` per PDF.

    NOTE(review): the site is driven through Selenium, so ``parse`` is a
    long side-effecting generator; all pure text extraction lives in the
    private helpers below so it can be reasoned about separately.
    """

    name = "gujarat"
    start_urls = ["https://fir.gujarat.gov.in/"]

    # Environment-specific paths -- adjust to your machine.
    CHROMEDRIVER_PATH = '/usr/local/bin/chromedriver'
    DOWNLOAD_DIR = '/home/seraphina/Downloads/'
    UPLOAD_DIR = '/home/seraphina/Documents/CONTRACTS/UPWORK/JAY/gujarat_gov/gujarat_gov/'

    # Every field _populate_item may fill.  All are defaulted to '' so the
    # exported rows always carry the full column set even when a marker or
    # regex fails to match (FIX: the original left fields unset in several
    # missing-marker paths, and L329's bare ``item['case_type']`` was a
    # no-op expression where an assignment was intended).
    _TEXT_FIELDS = (
        'fir_date', 'fir_no', 'fir_year',
        'petitioner_name', 'petitioner_dob', 'petitioner_occupation',
        'petitioner_dob_nationality',
        'respondent_details', 'respondent_name_age_address',
        'officer_signature', 'officer_thumb',
        'officer_rank', 'officer_number',
        'date_dispatch_to_court', 'case_number',
        'date_received_at_station', 'direction_distance_beat_no',
        'case_type', 'case_details',
    )

    def __init__(self, *args, **kwargs):
        # FIX: forward to scrapy.Spider.__init__ so spider arguments and
        # crawler wiring keep working (the original skipped it entirely).
        super(ProductSpider, self).__init__(*args, **kwargs)
        print("initialising the driver...")
        options = webdriver.ChromeOptions()
        self.driver = webdriver.Chrome(executable_path=self.CHROMEDRIVER_PATH,
                                       chrome_options=options)
        self.driver.wait = WebDriverWait(self.driver, 5)

    def parse(self, response):
        """Iterate district -> station -> result PDFs, yielding one item
        per downloaded FIR."""
        self.driver.get(response.url)
        # Option set 1: districts (first entry is the placeholder).
        drpdist = Select(self.driver.find_element_by_id("drpdist"))
        districts = [opt.text for opt in drpdist.options][1:]
        for district in districts:
            print("about to process option... %s" % district)
            drpdist.select_by_visible_text(district)
            time.sleep(1)
            # Option set 2: stations for the selected district.
            drppolicestation = Select(
                self.driver.find_element_by_id("drppolicestation"))
            stations = [opt.text for opt in drppolicestation.options][1:]
            for station in stations:
                print("about to process station... %s" % station)
                drppolicestation.select_by_visible_text(station)
                time.sleep(1)
                # (A) pick the 'from' date -- set to your requirements.
                # FIX: the original used ``global year/month/day`` and then
                # rebound those names to WebElements / regex groups, breaking
                # every iteration after the first; now plain parameters.
                self._pick_from_date("2018", "Jan", "12")
                # (B) search and collect the result links.
                self.driver.find_element_by_xpath(
                    '//input[@value="Search / શોધો"]').click()
                pdfs = self.driver.find_elements_by_xpath(
                    '//table[@id="getdata"]//tr/td/a')
                pdfs = pdfs[:4]  # limit while testing
                for i, pdf in enumerate(pdfs):
                    print("file number... %d" % i)
                    item = GujaratGovItem()
                    # (C) download the pdf and OCR it to output.txt.
                    self._download_and_convert(pdf)
                    # (D) rename pdf, read the OCR text, extract fields.
                    pdf_name = '%s-%s-%d.pdf' % (district, station, i)
                    os.rename('new_file.pdf', pdf_name)
                    txt_path = abspath('output.txt')
                    print("processing file... %s" % txt_path)
                    # FIX: close the text file (original leaked the handle).
                    with open(txt_path, 'r') as handle:
                        lines = list(handle)
                    self._populate_item(item, lines, district, station)
                    self._upload_pdf(pdf_name)
                    yield item
                # Re-locate the dropdowns: the search/download round trips
                # can leave the previously fetched elements stale.
                drpdist = Select(self.driver.find_element_by_id("drpdist"))
                drppolicestation = Select(
                    self.driver.find_element_by_id("drppolicestation"))

    def _pick_from_date(self, year, month, day):
        """Drive the CalendarExtender2 widget attached to the ``txtfd``
        ('from date') input until *day*/*month*/*year* is selected.

        year  -- 4-digit string, e.g. "2018"
        month -- 3-letter month label as shown by the widget, e.g. "Jan"
        day   -- day-of-month string, e.g. "12"
        """
        self.driver.find_element_by_xpath('//input[@id="txtfd"]').click()
        title_el = self.driver.find_element_by_xpath(
            '//div[@id="CalendarExtender2_title"]')
        title = title_el.text
        title_el.click()
        # Page back through the year view until the wanted year shows.
        while year not in title:
            self.driver.find_element_by_xpath(
                '//div[@id="CalendarExtender2_prevArrow"]').click()
            title = self.driver.find_element_by_xpath(
                '//div[@id="CalendarExtender2_title"]').text
        # Refresh the widget, then pick month and day.
        self.driver.find_element_by_xpath('//input[@id="txtfd"]').click()
        time.sleep(2)
        print("testing month... %s" % month)
        self.driver.find_element_by_xpath(
            '//tbody[@id="CalendarExtender2_monthsBody"]'
            '//div[contains(text(), "%s")]' % month).click()
        print("selected month!")
        time.sleep(2)
        self.driver.find_element_by_xpath('//input[@id="txtfd"]').click()
        print("testing day... %s" % day)
        self.driver.find_element_by_xpath(
            '//tbody[@id="CalendarExtender2_daysBody"]'
            '//div[contains(text(), "%s")]' % day).click()
        print("selected day!")

    def _download_and_convert(self, pdf_link):
        """Click *pdf_link*, wait for Chrome to finish the download, move
        it to ./new_file.pdf and run the OCR script (writes output.txt).

        Raises ValueError when the newest entry in DOWNLOAD_DIR is not a
        regular file.
        """
        # Remove intermediates left over from the previous PDF.
        for stale in ('file.tiff', 'output.txt', 'new_file.pdf'):
            if os.path.isfile(stale):
                print("deleting %s" % stale)
                os.remove(stale)
        pdf_link.click()
        time.sleep(10)
        latest_file = max(glob.glob(self.DOWNLOAD_DIR + '*'),
                          key=os.path.getctime)
        # Poll until the .crdownload placeholder is replaced (max 60s).
        # FIX: the original never re-globbed inside the loop, so
        # ``latest_file`` could never change and the wait was a no-op spin.
        count = 0
        while '.crdownload' in latest_file and count < 60:
            print(count)
            count += 1
            time.sleep(1)
            latest_file = max(glob.glob(self.DOWNLOAD_DIR + '*'),
                              key=os.path.getctime)
        if not os.path.isfile(latest_file):
            # FIX: report the offending path, not the whole glob listing.
            raise ValueError("%s isn't a file!" % latest_file)
        print("we have downloaded the following file... %s" % latest_file)
        # NOTE(review): os.rename fails across filesystems -- assumes the
        # download dir and the cwd share one; confirm for your setup.
        os.rename(latest_file, 'new_file.pdf')
        print("start processing pdf...")
        subprocess.call("./run_tesseract.sh", shell=True)
        print("end processing pdf")
        time.sleep(3)

    def _upload_pdf(self, pdf_name):
        """POST UPLOAD_DIR/pdf_name to the dump_pdf API.

        Failures are logged, never raised, so one bad upload does not
        abort the crawl (FIX: the original's bare ``except:`` logged via a
        malformed ``logging.info(msg, response.status_code)`` call that
        itself blew up, and ``response = requests.post`` shadowed parse's
        scrapy response).
        """
        upload_file_path = self.UPLOAD_DIR + pdf_name
        handle = open(upload_file_path, 'rb')
        try:
            files = {
                'state': (None, 'NJ'),
                'district': (None, 'nyc'),
                'file_type': (None, 'pdf'),
                'uploadfile': (upload_file_path, handle),
                'user': (None, 'Seraphina'),
            }
            resp = requests.post(
                'https://verify24x7.in/live/dump_pdf/api.php', files=files)
            logger.info('Response status successfull')
            logger.info(resp)
            logger.info(resp.text)
        except requests.RequestException:
            logger.exception('There was an error posting the data')
        finally:
            # FIX: the original never closed the upload handle.
            handle.close()

    @staticmethod
    def _find_line(lines, marker):
        """Index of the first line containing *marker*, or None."""
        for idx, line in enumerate(lines):
            if marker in line:
                return idx
        return None

    @staticmethod
    def _group(pattern, string, default=''):
        """group(1) of re.match(pattern, string), or *default* on no match."""
        m = re.match(pattern, string)
        return m.group(1) if m else default

    def _section(self, lines, start_marker, end_marker, skip=0, window=None):
        """Join ``lines[beg+skip:end]`` where beg/end index the first lines
        containing the two markers; None when either marker is missing.

        When *window* is given, the end marker is only searched within
        that many lines after the start (mirrors the original's 20-line
        case-details window).  As in the original, without *window* the
        end marker is searched over the whole text, so an end before the
        start simply yields an empty section.
        """
        beg = self._find_line(lines, start_marker)
        if beg is None:
            return None
        if window is None:
            end = self._find_line(lines, end_marker)
        else:
            end = self._find_line(lines[beg:beg + window], end_marker)
            if end is not None:
                end += beg
        if end is None:
            return None
        return ''.join(lines[beg + skip:end])

    def _populate_item(self, item, text, district, station):
        """Fill *item* from the OCR'd FIR text.

        item     -- GujaratGovItem to populate
        text     -- list of lines read from output.txt
        district -- district label (stored verbatim)
        station  -- police-station label (stored verbatim)

        All _TEXT_FIELDS default to '' so missing markers produce empty
        columns instead of absent keys.
        """
        for field in self._TEXT_FIELDS:
            item[field] = ''
        item['district_name'] = district
        item['police_station'] = station
        if not text:
            return

        # Header line: "District ... Police ... Date dd/mm/yyyy ...".
        header = None
        for line in text:
            if re.match(r'District.*Police.*?Date\s*\d{2}\/\d{2}\/(?:19|20)\d{2}.*', line):
                header = line
                break
        if header is not None:
            item['fir_date'] = self._group(
                r'.*?Date\s*((?:[0-2][0-9]|3[0-1])\/(?:[0-2][0-9]|3[0-1])\/(?:19|20)\d{2}).*',
                header)
            item['fir_no'] = self._group(r'.*?FIR\s*No\.?\s*(.*?)\s*Date.*', header)
            item['fir_year'] = self._group(
                r'.*Police\s*Stati\s*.*\s*Year\s*((?:20|19)\d{2})\s*FIR.*', header)

        # Complainant ("petitioner") block.
        pet_info = self._section(text, 'Complai', 'Details of known')
        if pet_info is not None:
            item['petitioner_name'] = self._group(
                r'(?:.*\n)+.*?\(a\)\s*Name\s*((?:.*\n)+.*?)\(c\).*', pet_info)
            item['petitioner_dob'] = self._group(
                r'(?:.*\n)+.*?\(c\)\s*Date\/Year\s*of\s*Birth\s*\(d\)\s*((?:.*\n)+.*)\(e\).*',
                pet_info).replace('\n', '')
            item['petitioner_occupation'] = self._group(
                r'(?:.*\n)+.*?Occupation((?:.*\n)+.*?)\(g\).*', pet_info).strip()
            item['petitioner_dob_nationality'] = self._group(
                r'(?:.*\n)+.*?Nationality\s*(.*\n.*?)\n+.*', pet_info)

        details = self._section(text, 'Details of known/suspected',
                                'Accused Name', skip=1)
        if details is not None:
            item['respondent_details'] = details.replace(
                '(Attach seperate sheet, if necessary)', '').strip()

        naa = self._section(text, 'Accused Name Age', 'Reasons for delay', skip=1)
        if naa is not None:
            item['respondent_name_age_address'] = naa.strip()

        sig = self._section(text, 'Signature of Officer', 'Signature/Thumb')
        if sig is not None:
            item['officer_signature'] = self._group(r'(?:.*\n)+.*?(\(.*\)).*', sig)

        thumb = self._section(text, 'Signature/Thumb', 'Rank')
        if thumb is not None:
            item['officer_thumb'] = self._group(
                r'Signature/Thumb.*?Name\s*((?:.*\n)+.*)', thumb).strip()

        rank_no = self._section(text, 'Rank', 'Date and time of dispatch')
        if rank_no is not None:
            item['officer_rank'] = self._group(r'Rank\s*(.*?)\s*GPF.*', rank_no)
            item['officer_number'] = self._group(r'.*?GPF\s*No\s*(.*?)\n.*', rank_no)

        # First line carrying the dispatch-to-court timestamp.
        for line in text:
            m = re.match(
                r'.*?Date\s*and\s*time\s*of\s*dispatch\s*to\s*the\s*cour\s*(.*)', line)
            if m:
                item['date_dispatch_to_court'] = m.group(1)
                break

        beg = self._find_line(text, 'case No.')
        # FIX: guard beg+1 against running off the end of the text.
        if beg is not None and beg + 1 < len(text):
            item['case_number'] = text[beg + 1].strip()

        beg = self._find_line(text, 'received at PS')
        if beg is not None:
            item['date_received_at_station'] = self._group(
                r'.*?PS:\s*(.*)\n*', text[beg])

        beg = self._find_line(text, 'Direction and distance')
        if beg is not None and beg + 1 < len(text):
            item['direction_distance_beat_no'] = text[beg + 1].strip()

        idx = self._find_line(text, 'Type of Information:')
        if idx is not None:
            # FIX: the original's else-branch was ``item['case_type']``
            # (no assignment); the '' default above covers it now.
            item['case_type'] = self._group(
                r'Type of Information:\s*(.*\n+.*)\n+', ''.join(text[idx:idx + 10]))

        case_details = self._section(text, 'Details of known/suspected',
                                     'Reasons for delay', window=20)
        if case_details is not None:
            item['case_details'] = case_details
# curl -X POST -F 'state=NJ' -F 'district=nyc' -F 'file_type=pdf' -F 'uploadfile=@/Users/ssss/Downloads/fir.pdf' -F 'user=Seraphina' https://verify24x7.in/live/dump_pdf/api.php | |
# note - enter | |
# scrapy crawl gujarat | |
# to run from terminal |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment