Created
September 17, 2018 11:26
-
-
Save Seraph2000/5fc30cb9356e18cac6cf5fc29576c942 to your computer and use it in GitHub Desktop.
Extension of a scrapy spider, which automates the process of [i] downloading PDFs from a website, [ii] converting the PDFs to plain text, [iii] extracting information using regular expressions, [iv] collating the information into individual CSV files, and [v] loading the data to an AWS server
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import scrapy | |
import os | |
import re | |
import json | |
import csv | |
import time | |
import sys | |
import glob | |
import shutil | |
import argparse | |
import ConfigParser | |
import requests | |
import urllib | |
import subprocess | |
import logging | |
from os.path import abspath | |
from langdetect import detect | |
from os import listdir | |
from datetime import datetime | |
from selenium import webdriver | |
from selenium.webdriver.support.ui import Select | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
from selenium.webdriver.common.action_chains import ActionChains | |
from selenium.webdriver.common.keys import Keys | |
from selenium.common.exceptions import TimeoutException | |
from selenium.common.exceptions import NoSuchElementException | |
from scrapy import Spider | |
from scrapy.spiders import CrawlSpider, Rule | |
from scrapy.linkextractors import LinkExtractor | |
from scrapy.selector import Selector | |
from scrapy.http import Request | |
from gujarat_gov.items import GujaratGovItem | |
logging.basicConfig(level=logging.INFO) | |
logger = logging.getLogger(__name__) | |
class ProductSpider(scrapy.Spider):
    """Crawl https://fir.gujarat.gov.in/.

    For every district / police-station pair the spider picks a fixed
    'from' date in the ASP.NET calendar widget, downloads the FIR PDFs
    listed in the result table, OCRs each one to ``output.txt`` (via the
    external ``run_tesseract.sh`` script), extracts the FIR fields with
    regular expressions, POSTs the PDF to a remote dump API, and yields
    one ``GujaratGovItem`` per PDF.

    NOTE(review): the site is driven through Selenium, so ``parse`` is a
    long side-effecting generator; all pure text extraction lives in the
    private helpers below so it can be reasoned about separately.
    """

    name = "gujarat"
    start_urls = ["https://fir.gujarat.gov.in/"]

    # Environment-specific paths -- adjust to your machine.
    CHROMEDRIVER_PATH = '/usr/local/bin/chromedriver'
    DOWNLOAD_DIR = '/home/seraphina/Downloads/'
    UPLOAD_DIR = '/home/seraphina/Documents/CONTRACTS/UPWORK/JAY/gujarat_gov/gujarat_gov/'

    # Every field _populate_item may fill.  All are defaulted to '' so the
    # exported rows always carry the full column set even when a marker or
    # regex fails to match (FIX: the original left fields unset in several
    # missing-marker paths, and L329's bare ``item['case_type']`` was a
    # no-op expression where an assignment was intended).
    _TEXT_FIELDS = (
        'fir_date', 'fir_no', 'fir_year',
        'petitioner_name', 'petitioner_dob', 'petitioner_occupation',
        'petitioner_dob_nationality',
        'respondent_details', 'respondent_name_age_address',
        'officer_signature', 'officer_thumb',
        'officer_rank', 'officer_number',
        'date_dispatch_to_court', 'case_number',
        'date_received_at_station', 'direction_distance_beat_no',
        'case_type', 'case_details',
    )

    def __init__(self, *args, **kwargs):
        # FIX: forward to scrapy.Spider.__init__ so spider arguments and
        # crawler wiring keep working (the original skipped it entirely).
        super(ProductSpider, self).__init__(*args, **kwargs)
        print("initialising the driver...")
        options = webdriver.ChromeOptions()
        self.driver = webdriver.Chrome(executable_path=self.CHROMEDRIVER_PATH,
                                       chrome_options=options)
        self.driver.wait = WebDriverWait(self.driver, 5)

    def parse(self, response):
        """Iterate district -> station -> result PDFs, yielding one item
        per downloaded FIR."""
        self.driver.get(response.url)
        # Option set 1: districts (first entry is the placeholder).
        drpdist = Select(self.driver.find_element_by_id("drpdist"))
        districts = [opt.text for opt in drpdist.options][1:]
        for district in districts:
            print("about to process option... %s" % district)
            drpdist.select_by_visible_text(district)
            time.sleep(1)
            # Option set 2: stations for the selected district.
            drppolicestation = Select(
                self.driver.find_element_by_id("drppolicestation"))
            stations = [opt.text for opt in drppolicestation.options][1:]
            for station in stations:
                print("about to process station... %s" % station)
                drppolicestation.select_by_visible_text(station)
                time.sleep(1)
                # (A) pick the 'from' date -- set to your requirements.
                # FIX: the original used ``global year/month/day`` and then
                # rebound those names to WebElements / regex groups, breaking
                # every iteration after the first; now plain parameters.
                self._pick_from_date("2018", "Jan", "12")
                # (B) search and collect the result links.
                self.driver.find_element_by_xpath(
                    '//input[@value="Search / શોધો"]').click()
                pdfs = self.driver.find_elements_by_xpath(
                    '//table[@id="getdata"]//tr/td/a')
                pdfs = pdfs[:4]  # limit while testing
                for i, pdf in enumerate(pdfs):
                    print("file number... %d" % i)
                    item = GujaratGovItem()
                    # (C) download the pdf and OCR it to output.txt.
                    self._download_and_convert(pdf)
                    # (D) rename pdf, read the OCR text, extract fields.
                    pdf_name = '%s-%s-%d.pdf' % (district, station, i)
                    os.rename('new_file.pdf', pdf_name)
                    txt_path = abspath('output.txt')
                    print("processing file... %s" % txt_path)
                    # FIX: close the text file (original leaked the handle).
                    with open(txt_path, 'r') as handle:
                        lines = list(handle)
                    self._populate_item(item, lines, district, station)
                    self._upload_pdf(pdf_name)
                    yield item
                # Re-locate the dropdowns: the search/download round trips
                # can leave the previously fetched elements stale.
                drpdist = Select(self.driver.find_element_by_id("drpdist"))
                drppolicestation = Select(
                    self.driver.find_element_by_id("drppolicestation"))

    def _pick_from_date(self, year, month, day):
        """Drive the CalendarExtender2 widget attached to the ``txtfd``
        ('from date') input until *day*/*month*/*year* is selected.

        year  -- 4-digit string, e.g. "2018"
        month -- 3-letter month label as shown by the widget, e.g. "Jan"
        day   -- day-of-month string, e.g. "12"
        """
        self.driver.find_element_by_xpath('//input[@id="txtfd"]').click()
        title_el = self.driver.find_element_by_xpath(
            '//div[@id="CalendarExtender2_title"]')
        title = title_el.text
        title_el.click()
        # Page back through the year view until the wanted year shows.
        while year not in title:
            self.driver.find_element_by_xpath(
                '//div[@id="CalendarExtender2_prevArrow"]').click()
            title = self.driver.find_element_by_xpath(
                '//div[@id="CalendarExtender2_title"]').text
        # Refresh the widget, then pick month and day.
        self.driver.find_element_by_xpath('//input[@id="txtfd"]').click()
        time.sleep(2)
        print("testing month... %s" % month)
        self.driver.find_element_by_xpath(
            '//tbody[@id="CalendarExtender2_monthsBody"]'
            '//div[contains(text(), "%s")]' % month).click()
        print("selected month!")
        time.sleep(2)
        self.driver.find_element_by_xpath('//input[@id="txtfd"]').click()
        print("testing day... %s" % day)
        self.driver.find_element_by_xpath(
            '//tbody[@id="CalendarExtender2_daysBody"]'
            '//div[contains(text(), "%s")]' % day).click()
        print("selected day!")

    def _download_and_convert(self, pdf_link):
        """Click *pdf_link*, wait for Chrome to finish the download, move
        it to ./new_file.pdf and run the OCR script (writes output.txt).

        Raises ValueError when the newest entry in DOWNLOAD_DIR is not a
        regular file.
        """
        # Remove intermediates left over from the previous PDF.
        for stale in ('file.tiff', 'output.txt', 'new_file.pdf'):
            if os.path.isfile(stale):
                print("deleting %s" % stale)
                os.remove(stale)
        pdf_link.click()
        time.sleep(10)
        latest_file = max(glob.glob(self.DOWNLOAD_DIR + '*'),
                          key=os.path.getctime)
        # Poll until the .crdownload placeholder is replaced (max 60s).
        # FIX: the original never re-globbed inside the loop, so
        # ``latest_file`` could never change and the wait was a no-op spin.
        count = 0
        while '.crdownload' in latest_file and count < 60:
            print(count)
            count += 1
            time.sleep(1)
            latest_file = max(glob.glob(self.DOWNLOAD_DIR + '*'),
                              key=os.path.getctime)
        if not os.path.isfile(latest_file):
            # FIX: report the offending path, not the whole glob listing.
            raise ValueError("%s isn't a file!" % latest_file)
        print("we have downloaded the following file... %s" % latest_file)
        # NOTE(review): os.rename fails across filesystems -- assumes the
        # download dir and the cwd share one; confirm for your setup.
        os.rename(latest_file, 'new_file.pdf')
        print("start processing pdf...")
        subprocess.call("./run_tesseract.sh", shell=True)
        print("end processing pdf")
        time.sleep(3)

    def _upload_pdf(self, pdf_name):
        """POST UPLOAD_DIR/pdf_name to the dump_pdf API.

        Failures are logged, never raised, so one bad upload does not
        abort the crawl (FIX: the original's bare ``except:`` logged via a
        malformed ``logging.info(msg, response.status_code)`` call that
        itself blew up, and ``response = requests.post`` shadowed parse's
        scrapy response).
        """
        upload_file_path = self.UPLOAD_DIR + pdf_name
        handle = open(upload_file_path, 'rb')
        try:
            files = {
                'state': (None, 'NJ'),
                'district': (None, 'nyc'),
                'file_type': (None, 'pdf'),
                'uploadfile': (upload_file_path, handle),
                'user': (None, 'Seraphina'),
            }
            resp = requests.post(
                'https://verify24x7.in/live/dump_pdf/api.php', files=files)
            logger.info('Response status successfull')
            logger.info(resp)
            logger.info(resp.text)
        except requests.RequestException:
            logger.exception('There was an error posting the data')
        finally:
            # FIX: the original never closed the upload handle.
            handle.close()

    @staticmethod
    def _find_line(lines, marker):
        """Index of the first line containing *marker*, or None."""
        for idx, line in enumerate(lines):
            if marker in line:
                return idx
        return None

    @staticmethod
    def _group(pattern, string, default=''):
        """group(1) of re.match(pattern, string), or *default* on no match."""
        m = re.match(pattern, string)
        return m.group(1) if m else default

    def _section(self, lines, start_marker, end_marker, skip=0, window=None):
        """Join ``lines[beg+skip:end]`` where beg/end index the first lines
        containing the two markers; None when either marker is missing.

        When *window* is given, the end marker is only searched within
        that many lines after the start (mirrors the original's 20-line
        case-details window).  As in the original, without *window* the
        end marker is searched over the whole text, so an end before the
        start simply yields an empty section.
        """
        beg = self._find_line(lines, start_marker)
        if beg is None:
            return None
        if window is None:
            end = self._find_line(lines, end_marker)
        else:
            end = self._find_line(lines[beg:beg + window], end_marker)
            if end is not None:
                end += beg
        if end is None:
            return None
        return ''.join(lines[beg + skip:end])

    def _populate_item(self, item, text, district, station):
        """Fill *item* from the OCR'd FIR text.

        item     -- GujaratGovItem to populate
        text     -- list of lines read from output.txt
        district -- district label (stored verbatim)
        station  -- police-station label (stored verbatim)

        All _TEXT_FIELDS default to '' so missing markers produce empty
        columns instead of absent keys.
        """
        for field in self._TEXT_FIELDS:
            item[field] = ''
        item['district_name'] = district
        item['police_station'] = station
        if not text:
            return

        # Header line: "District ... Police ... Date dd/mm/yyyy ...".
        header = None
        for line in text:
            if re.match(r'District.*Police.*?Date\s*\d{2}\/\d{2}\/(?:19|20)\d{2}.*', line):
                header = line
                break
        if header is not None:
            item['fir_date'] = self._group(
                r'.*?Date\s*((?:[0-2][0-9]|3[0-1])\/(?:[0-2][0-9]|3[0-1])\/(?:19|20)\d{2}).*',
                header)
            item['fir_no'] = self._group(r'.*?FIR\s*No\.?\s*(.*?)\s*Date.*', header)
            item['fir_year'] = self._group(
                r'.*Police\s*Stati\s*.*\s*Year\s*((?:20|19)\d{2})\s*FIR.*', header)

        # Complainant ("petitioner") block.
        pet_info = self._section(text, 'Complai', 'Details of known')
        if pet_info is not None:
            item['petitioner_name'] = self._group(
                r'(?:.*\n)+.*?\(a\)\s*Name\s*((?:.*\n)+.*?)\(c\).*', pet_info)
            item['petitioner_dob'] = self._group(
                r'(?:.*\n)+.*?\(c\)\s*Date\/Year\s*of\s*Birth\s*\(d\)\s*((?:.*\n)+.*)\(e\).*',
                pet_info).replace('\n', '')
            item['petitioner_occupation'] = self._group(
                r'(?:.*\n)+.*?Occupation((?:.*\n)+.*?)\(g\).*', pet_info).strip()
            item['petitioner_dob_nationality'] = self._group(
                r'(?:.*\n)+.*?Nationality\s*(.*\n.*?)\n+.*', pet_info)

        details = self._section(text, 'Details of known/suspected',
                                'Accused Name', skip=1)
        if details is not None:
            item['respondent_details'] = details.replace(
                '(Attach seperate sheet, if necessary)', '').strip()

        naa = self._section(text, 'Accused Name Age', 'Reasons for delay', skip=1)
        if naa is not None:
            item['respondent_name_age_address'] = naa.strip()

        sig = self._section(text, 'Signature of Officer', 'Signature/Thumb')
        if sig is not None:
            item['officer_signature'] = self._group(r'(?:.*\n)+.*?(\(.*\)).*', sig)

        thumb = self._section(text, 'Signature/Thumb', 'Rank')
        if thumb is not None:
            item['officer_thumb'] = self._group(
                r'Signature/Thumb.*?Name\s*((?:.*\n)+.*)', thumb).strip()

        rank_no = self._section(text, 'Rank', 'Date and time of dispatch')
        if rank_no is not None:
            item['officer_rank'] = self._group(r'Rank\s*(.*?)\s*GPF.*', rank_no)
            item['officer_number'] = self._group(r'.*?GPF\s*No\s*(.*?)\n.*', rank_no)

        # First line carrying the dispatch-to-court timestamp.
        for line in text:
            m = re.match(
                r'.*?Date\s*and\s*time\s*of\s*dispatch\s*to\s*the\s*cour\s*(.*)', line)
            if m:
                item['date_dispatch_to_court'] = m.group(1)
                break

        beg = self._find_line(text, 'case No.')
        # FIX: guard beg+1 against running off the end of the text.
        if beg is not None and beg + 1 < len(text):
            item['case_number'] = text[beg + 1].strip()

        beg = self._find_line(text, 'received at PS')
        if beg is not None:
            item['date_received_at_station'] = self._group(
                r'.*?PS:\s*(.*)\n*', text[beg])

        beg = self._find_line(text, 'Direction and distance')
        if beg is not None and beg + 1 < len(text):
            item['direction_distance_beat_no'] = text[beg + 1].strip()

        idx = self._find_line(text, 'Type of Information:')
        if idx is not None:
            # FIX: the original's else-branch was ``item['case_type']``
            # (no assignment); the '' default above covers it now.
            item['case_type'] = self._group(
                r'Type of Information:\s*(.*\n+.*)\n+', ''.join(text[idx:idx + 10]))

        case_details = self._section(text, 'Details of known/suspected',
                                     'Reasons for delay', window=20)
        if case_details is not None:
            item['case_details'] = case_details
# curl -X POST -F 'state=NJ' -F 'district=nyc' -F 'file_type=pdf' -F 'uploadfile=@/Users/ssss/Downloads/fir.pdf' -F 'user=Seraphina' https://verify24x7.in/live/dump_pdf/api.php | |
# note - enter | |
# scrapy crawl gujarat | |
# to run from terminal |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment