@firstworldproblems
Created November 23, 2022 01:05
Scrapes the NSW Caselaw website to extract case data for analysis.
import furl
import requests
import htmlmin
from bs4 import BeautifulSoup
from datetime import date, datetime
from dateutil import parser
import datefinder
import dateparser
from difflib import SequenceMatcher as SM
from fuzzywuzzy import fuzz, process
from nltk.util import ngrams
import codecs
import pandas as pd
import re
import json
import locale
locale.setlocale(locale.LC_ALL, '')
# scrapes the NSW Caselaw website and calculates the decision time for each case
# could be used to examine the efficiency of the court system;
# in my case it was used to see just how abnormal a wait time of 756 days is
# takes a case title as argument, e.g. ALZ v WorkCover NSW [2014] NSWCATAD 93
# returns the medium neutral citation, e.g. [2014] NSWCATAD 93
def medium_neutral_citation(title):
    sliced = re.split(r'(\[\d{4}\])', title)
    return "".join(sliced[1:]).strip()
# takes a string that may contain one or more hearing or submission dates
# returns "" if the matter was decided on the papers
# first tries to parse the last date in the string against a list of explicit formats,
# then falls back to datefinder and returns the latest date found
def date_parser(text, key = "Hearing"):
print(f"\n\nkey: {key}, checking string for date: {text}")
if text.lower() == "on the papers":
print("returning nothing")
return ""
stringsplit = re.split(',|&|and|\n',text)
for fmt in ('%d %B %Y', '%d/%m/%Y', '%d/%m/%y','%d/%#m/%Y','%d.%m.%y','%m/%d/%Y'):
try:
x = datetime.strptime(stringsplit[-1].rstrip('.').strip(), fmt)
return str(x.strftime("%#d %B %Y"))
except Exception as e:
print(f"Error parsing: {stringsplit[-1].rstrip('.').strip()} into {fmt} e: {e}")
pass
try:
print("checking if string has any dates in it...")
matches = datefinder.find_dates(text)
parsed = []
for m in matches:
parsed.append(m)
if len(parsed) > 0:
print(f'oldest date is: {max(parsed).strftime("%#d %B %Y")}')
return str(max(parsed).strftime("%#d %B %Y"))
except Exception as e:
print(f"Error parsing: {max(parsed)} e: {e}")
pass
return f"invalid: {text}"
# create dataframe columns for scraper
columns = ['Citation','Title','URL','Corrected','Papers','Jurisdiction','Submissions','Hearing','Decision','Days','Missing','Catchwords','Content']
df = pd.DataFrame(columns = columns)
all_columns = list(df)
df[all_columns] = df[all_columns].astype(str)
# base url of the caselaw website
baseurl = "https://www.caselaw.nsw.gov.au"
# url of the advanced search query used (cases citing the PPIP Act or the HRIP Act, across all NSW courts and tribunals)
search = "https://www.caselaw.nsw.gov.au/search/advanced?sort=decisionDate%2Casc&body=&title=&before=&catchwords=&party=&mnc=&startDate=&endDate=&fileNumber=&legislationCited=%22Privacy+and+Personal+Information+Protection+Act%22+OR+%22Health+Records+and+Information+Privacy+Act%22&casesCited=&courts=54a634063004de94513d827a&_courts=on&courts=54a634063004de94513d827b&_courts=on&courts=54a634063004de94513d8278&_courts=on&courts=54a634063004de94513d8279&_courts=on&courts=54a634063004de94513d827c&_courts=on&courts=54a634063004de94513d827d&_courts=on&courts=54a634063004de94513d828e&_courts=on&courts=54a634063004de94513d8285&_courts=on&courts=54a634063004de94513d827e&_courts=on&courts=54a634063004de94513d827f&_courts=on&courts=54a634063004de94513d8286&_courts=on&courts=54a634063004de94513d8280&_courts=on&courts=54a634063004de94513d8281&_courts=on&tribunals=54a634063004de94513d8282&_tribunals=on&tribunals=54a634063004de94513d8287&_tribunals=on&tribunals=54a634063004de94513d8289&_tribunals=on&tribunals=54a634063004de94513d828d&_tribunals=on&tribunals=54a634063004de94513d828b&_tribunals=on&tribunals=173b71a8beab2951cc1fab8d&_tribunals=on&tribunals=54a634063004de94513d828c&_tribunals=on&tribunals=54a634063004de94513d828a&_tribunals=on&tribunals=54a634063004de94513d8283&_tribunals=on&tribunals=1723173e41f6b6d63f2105d3&_tribunals=on&tribunals=5e5c92e1e4b0c8604babc749&_tribunals=on&tribunals=5e5c92c5e4b0c8604babc748&_tribunals=on&tribunals=54a634063004de94513d8284&_tribunals=on&tribunals=54a634063004de94513d8288&_tribunals=on"
pages = 32  # number of pages of search results to iterate through
data = []
# iterates through each row on each page of the search results and extracts the basic details of each case:
# case title, medium neutral citation, url, decision date, catchwords
for count in range(0, pages):
    try:
        print(f"Search results page: {count}")
        page = requests.get(search, params={'page': count})
        soup = BeautifulSoup(page.text, 'html.parser')
        results = soup.select_one("div.container.searchresults")
        cases = results.find_all("div", {"class": ["row", "result"]})
        for row in cases:
            details = row.find("h4").find_next('a')
            published = baseurl + details.get('href')
            title = details.get_text()
            citation = medium_neutral_citation(title)
            decision = row.find(lambda tag: tag.name == "li" and "Decision date" in tag.text).find_next('li').get_text().strip()
            try:
                catchwords = row.find(lambda tag: tag.name == "p" and "Catchwords" in tag.text).find_next('p').get_text().strip() or "n/a"
            except Exception:
                catchwords = "n/a"  # avoid carrying catchwords over from the previous case
            data.append({'Title': title, 'Citation': citation, 'URL': published, 'Decision': decision, 'Catchwords': catchwords})
    except Exception as e:
        print(f"Error on results page {count}: {e}")
# creates a dataframe containing every case from the search results
df = pd.concat([df, pd.DataFrame(data)], ignore_index=True)
# iterates through each case in the dataframe and performs an http get request on the case url
# extracts the html holding the case details and does a fuzzy search for tags containing submission and hearing info
# extracts the dates and checks whether the matter was decided on the papers, then calculates the decision time in days
# the print statements are left in for debugging and can be removed to reduce noise
for index, row in df.iterrows():
    try:
        print(f"\n\n----- {df.loc[index,'Citation']} -----\n\n")
        response = requests.get(df.loc[index,'URL'])
        page_html = response.text
        soup = BeautifulSoup(page_html, "html.parser")
        # the case details sit either in a table containing "HEARING DATE" or in a coversheet div
        table = soup.find(lambda tag: tag.name == "table" and "HEARING DATE" in tag.text) or soup.find("div", {"class": "coversheet"})
        df.loc[index,'Content'] = htmlmin.minify(str(table))
        keywords = ['Hearing date', 'SUBMISSIONS CLOSED', 'HEARING DATE', 'Submissions close', 'Papers', 'papers', 'DATE OF DECISION', 'Jurisdiction', 'JURISDICTION']
        v1 = table.find_all("dt", string=re.compile('|'.join(keywords)))
        v2 = table.find_all("td", string=re.compile('|'.join(keywords)))
        matches = {'appeal': "", 'hearing': "", 'submissions': "", 'papers': "", 'jurisdiction': ""}
        # fuzzy search match options: process.extractOne returns a (choice, score) tuple,
        # so each label found on the page is mapped to its best-scoring canonical key
        choices = ["appeal", "hearing", "submissions", "papers", "jurisdiction"]
        for tag in v1:
            print(tag.get_text())
            print(process.extract(tag.get_text(), choices))
            fuzzymatch = process.extractOne(tag.get_text(), choices)
            matches[fuzzymatch[0]] = tag.find_next('dd').get_text().strip()
            print(tag.find_next('dd').get_text().strip().lower())
        for tag in v2:
            print(tag)
            print(tag.get_text())
            print(process.extract(tag.get_text(), choices))
            fuzzymatch = process.extractOne(tag.get_text(), choices)
            matches[fuzzymatch[0]] = tag.find_next('td').get_text().strip()
            print(tag.find_next('td').get_text().strip().lower())
        # prints the dates extracted from the page before they are parsed
        print(json.dumps(matches, indent=4, sort_keys=True, default=str))
        print(f"values extracted: \n\n{list(matches.values())}")
        # checks whether the matter was decided on the papers
        df.loc[index,'Papers'] = "Yes" if any("paper" in string.lower() for string in list(matches.values())) else ""
        df.loc[index,'Jurisdiction'] = matches.pop('jurisdiction', "n/a")
        for key, value in matches.items():
            print(f"key: {key}, value: {value}")
            matches[key] = str(date_parser(value, key)) if len(value) > 0 else ""
        print(json.dumps(matches, indent=4, sort_keys=True, default=str))
        df.loc[index,'Hearing'] = pd.to_datetime(matches['hearing'], errors='coerce')
        df.loc[index,'Submissions'] = pd.to_datetime(matches['submissions'], errors='coerce')
        df.loc[index,'Decision'] = pd.to_datetime(df.loc[index, 'Decision'], errors='coerce')
        df.loc[index,'Days'] = pd.to_datetime(df.loc[index, 'Days'], errors='coerce')  # starts as NaT
        # checks whether a valid date was extracted for either the hearing or the submissions;
        # if so, the decision time is the decision date minus the latest of those dates
        valid = [d for d in [df.loc[index,'Hearing'], df.loc[index,'Submissions']] if type(d) is pd.Timestamp]
        if len(valid) > 0:
            df.loc[index,'Days'] = df.loc[index,'Decision'] - max(valid)
        # flags the case if the decision time could not be calculated, unless it was decided on the papers
        if pd.isnull(df.loc[index,'Days']) and df.loc[index,'Papers'] != "Yes":
            df.loc[index,'Missing'] = "Yes"
        # prints the final dataframe row constructed for the case
        print(df.loc[index])
    except Exception as e:
        print(f"Error: {e}")
# outputs the entire dataframe to a spreadsheet file
with pd.ExcelWriter(r'caselawscraper.ppip.hrip.act.v2.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='results', index=False)
print("Saved")