@firstworldproblems
Created November 23, 2022 01:05
Scrapes the NSW Caselaw website to extract case data for analysis.
import furl
import requests
import htmlmin
from bs4 import BeautifulSoup
from datetime import date, datetime
from dateutil import parser
import datefinder
import dateparser
from difflib import SequenceMatcher as SM
from fuzzywuzzy import fuzz, process
from nltk.util import ngrams
import codecs
import pandas as pd
import re
import json
import locale
locale.setlocale(locale.LC_ALL, '')
# scrapes the NSW Caselaw website and calculates the decision time for each case
# could be used to examine the efficiency of the court system;
# in my case it was used to see just how abnormal a wait time of 756 days is
# takes a case title as argument, e.g. ALZ v WorkCover NSW [2014] NSWCATAD 93
# returns the medium neutral citation, e.g. [2014] NSWCATAD 93
def medium_neutral_citation(title):
    sliced = re.split(r'(\[\d{4}\])', title)
    return "".join(sliced[1:]).strip()
# takes a string that may contain one or more hearing or submission dates
# returns "" if the matter was decided on the papers
# first tries to parse the last date in the string against a list of explicit formats,
# then falls back to datefinder and returns the latest date found
def date_parser(text, key = "Hearing"):
print(f"\n\nkey: {key}, checking string for date: {text}")
if text.lower() == "on the papers":
print("returning nothing")
return ""
stringsplit = re.split(',|&|and|\n',text)
for fmt in ('%d %B %Y', '%d/%m/%Y', '%d/%m/%y','%d/%#m/%Y','%d.%m.%y','%m/%d/%Y'):
try:
x = datetime.strptime(stringsplit[-1].rstrip('.').strip(), fmt)
return str(x.strftime("%#d %B %Y"))
except Exception as e:
print(f"Error parsing: {stringsplit[-1].rstrip('.').strip()} into {fmt} e: {e}")
pass
try:
print("checking if string has any dates in it...")
matches = datefinder.find_dates(text)
parsed = []
for m in matches:
parsed.append(m)
if len(parsed) > 0:
print(f'oldest date is: {max(parsed).strftime("%#d %B %Y")}')
return str(max(parsed).strftime("%#d %B %Y"))
except Exception as e:
print(f"Error parsing: {max(parsed)} e: {e}")
pass
return f"invalid: {text}"
# create dataframe columns for scraper
columns = ['Citation','Title','URL','Corrected','Papers','Jurisdiction','Submissions','Hearing','Decision','Days','Missing','Catchwords','Content']
df = pd.DataFrame(columns = columns)
all_columns = list(df)
df[all_columns] = df[all_columns].astype(str)
# base url of the caselaw website
baseurl = "https://www.caselaw.nsw.gov.au"
# url of the advanced search query used (cases citing the PPIP Act or the HRIP Act, across all NSW courts and tribunals)
search = "https://www.caselaw.nsw.gov.au/search/advanced?sort=decisionDate%2Casc&body=&title=&before=&catchwords=&party=&mnc=&startDate=&endDate=&fileNumber=&legislationCited=%22Privacy+and+Personal+Information+Protection+Act%22+OR+%22Health+Records+and+Information+Privacy+Act%22&casesCited=&courts=54a634063004de94513d827a&_courts=on&courts=54a634063004de94513d827b&_courts=on&courts=54a634063004de94513d8278&_courts=on&courts=54a634063004de94513d8279&_courts=on&courts=54a634063004de94513d827c&_courts=on&courts=54a634063004de94513d827d&_courts=on&courts=54a634063004de94513d828e&_courts=on&courts=54a634063004de94513d8285&_courts=on&courts=54a634063004de94513d827e&_courts=on&courts=54a634063004de94513d827f&_courts=on&courts=54a634063004de94513d8286&_courts=on&courts=54a634063004de94513d8280&_courts=on&courts=54a634063004de94513d8281&_courts=on&tribunals=54a634063004de94513d8282&_tribunals=on&tribunals=54a634063004de94513d8287&_tribunals=on&tribunals=54a634063004de94513d8289&_tribunals=on&tribunals=54a634063004de94513d828d&_tribunals=on&tribunals=54a634063004de94513d828b&_tribunals=on&tribunals=173b71a8beab2951cc1fab8d&_tribunals=on&tribunals=54a634063004de94513d828c&_tribunals=on&tribunals=54a634063004de94513d828a&_tribunals=on&tribunals=54a634063004de94513d8283&_tribunals=on&tribunals=1723173e41f6b6d63f2105d3&_tribunals=on&tribunals=5e5c92e1e4b0c8604babc749&_tribunals=on&tribunals=5e5c92c5e4b0c8604babc748&_tribunals=on&tribunals=54a634063004de94513d8284&_tribunals=on&tribunals=54a634063004de94513d8288&_tribunals=on"
pages = 32  # number of pages of search results to iterate through
data = []
# iterates through each row on each page of the search results and extracts the basic details of each case:
# case title, medium neutral citation, url, decision date, catchwords
for count in range(0, pages):
    try:
        print(f"Search results page: {count}")
        page = requests.get(search, params={'page': count})
        soup = BeautifulSoup(page.text, 'html.parser')
        results = soup.select_one("div.container.searchresults")
        cases = results.find_all("div", {"class": ["row", "result"]})
        for row in cases:
            details = row.find("h4").find_next('a')
            published = baseurl + details.get('href')
            title = details.get_text()
            citation = medium_neutral_citation(title)
            decision = row.find(lambda tag: tag.name == "li" and "Decision date" in tag.text).find_next('li').get_text().strip()
            try:
                catchwords = row.find(lambda tag: tag.name == "p" and "Catchwords" in tag.text).find_next('p').get_text().strip() or "n/a"
            except Exception:
                catchwords = "n/a"  # avoid carrying catchwords over from the previous case
            data.append({'Title': title, 'Citation': citation, 'URL': published, 'Decision': decision, 'Catchwords': catchwords})
    except Exception as e:
        print(f"Error on results page {count}: {e}")
# creates a dataframe containing every case from the search results
df = pd.concat([df, pd.DataFrame(data)], ignore_index=True)
# iterates through each case in the dataframe and performs an http get request on the case url
# extracts the html holding the case details and does a fuzzy search for tags containing submission and hearing info
# extracts the dates and checks whether the matter was decided on the papers, then calculates the decision time in days
# the print statements are left in for debugging and can be removed to reduce noise
for index, row in df.iterrows():
    try:
        print(f"\n\n----- {df.loc[index,'Citation']} -----\n\n")
        response = requests.get(df.loc[index,'URL'])
        page_html = response.text
        soup = BeautifulSoup(page_html, "html.parser")
        # the case details sit either in a table containing "HEARING DATE" or in a coversheet div
        table = soup.find(lambda tag: tag.name == "table" and "HEARING DATE" in tag.text) or soup.find("div", {"class": "coversheet"})
        df.loc[index,'Content'] = htmlmin.minify(str(table))
        keywords = ['Hearing date', 'SUBMISSIONS CLOSED', 'HEARING DATE', 'Submissions close', 'Papers', 'papers', 'DATE OF DECISION', 'Jurisdiction', 'JURISDICTION']
        v1 = table.find_all("dt", string=re.compile('|'.join(keywords)))
        v2 = table.find_all("td", string=re.compile('|'.join(keywords)))
        matches = {'appeal': "", 'hearing': "", 'submissions': "", 'papers': "", 'jurisdiction': ""}
        # fuzzy search match options: process.extractOne returns a (choice, score) tuple,
        # so each label found on the page is mapped to its best-scoring canonical key
        choices = ["appeal", "hearing", "submissions", "papers", "jurisdiction"]
        for tag in v1:
            print(tag.get_text())
            print(process.extract(tag.get_text(), choices))
            fuzzymatch = process.extractOne(tag.get_text(), choices)
            matches[fuzzymatch[0]] = tag.find_next('dd').get_text().strip()
            print(tag.find_next('dd').get_text().strip().lower())
        for tag in v2:
            print(tag)
            print(tag.get_text())
            print(process.extract(tag.get_text(), choices))
            fuzzymatch = process.extractOne(tag.get_text(), choices)
            matches[fuzzymatch[0]] = tag.find_next('td').get_text().strip()
            print(tag.find_next('td').get_text().strip().lower())
        # prints the dates extracted from the page before they are parsed
        print(json.dumps(matches, indent=4, sort_keys=True, default=str))
        print(f"values extracted: \n\n{list(matches.values())}")
        # checks whether the matter was decided on the papers
        df.loc[index,'Papers'] = "Yes" if any("paper" in string.lower() for string in list(matches.values())) else ""
        df.loc[index,'Jurisdiction'] = matches.pop('jurisdiction', "n/a")
        for key, value in matches.items():
            print(f"key: {key}, value: {value}")
            matches[key] = str(date_parser(value, key)) if len(value) > 0 else ""
        print(json.dumps(matches, indent=4, sort_keys=True, default=str))
        df.loc[index,'Hearing'] = pd.to_datetime(matches['hearing'], errors='coerce')
        df.loc[index,'Submissions'] = pd.to_datetime(matches['submissions'], errors='coerce')
        df.loc[index,'Decision'] = pd.to_datetime(df.loc[index, 'Decision'], errors='coerce')
        df.loc[index,'Days'] = pd.to_datetime(df.loc[index, 'Days'], errors='coerce')  # starts as NaT
        # checks whether a valid date was extracted for either the hearing or the submissions;
        # if so, the decision time is the decision date minus the latest of those dates
        valid = [d for d in [df.loc[index,'Hearing'], df.loc[index,'Submissions']] if type(d) is pd.Timestamp]
        if len(valid) > 0:
            df.loc[index,'Days'] = df.loc[index,'Decision'] - max(valid)
        # flags the case if the decision time could not be calculated, unless it was decided on the papers
        if pd.isnull(df.loc[index,'Days']) and df.loc[index,'Papers'] != "Yes":
            df.loc[index,'Missing'] = "Yes"
        # prints the final dataframe row constructed for the case
        print(df.loc[index])
    except Exception as e:
        print(f"Error: {e}")
# outputs the entire dataframe to a spreadsheet file
with pd.ExcelWriter(r'caselawscraper.ppip.hrip.act.v2.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='results', index=False)
print("Saved")