-
-
Save kcecireyes/07db9ca13b8ab5cce83e4d567b2c1251 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# TODO: output something meaningful if parser fails on caught exceptions
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Load the table of cases from ids.csv. read_csv already returns a DataFrame,
# so no extra pd.DataFrame(...) wrapping is needed.
e = pd.read_csv('ids.csv')
def parseCase(case):
    """Split a case-number string into (year, division code, case number).

    Assumptions baked into the slicing:
      * the first 2 characters are a year in the 2000s;
      * the DIVCODE is the next 2 characters (MM);
      * the case number is the last 6 characters, even if that digs into
        the DIVCODE on shorter inputs.

    Args:
        case: the case identifier as a string.

    Returns:
        A tuple ``("20" + yy, divcode, number)`` of strings.
    """
    yr = case[0:2]
    divcode = case[2:4]
    no = case[-6:]
    return "20" + yr, divcode, no
# Scraped results keyed by each row's 'id'; get_info fills this in as a side
# effect while the rows are being processed.
new = {}


def get_info(case):
    """Fetch the page for one row and record two table cells from it in ``new``.

    Args:
        case: a row exposing 'CaseNum' and 'id' entries.

    Returns:
        ``case`` unchanged on success; ``None`` (implicitly) when one of the
        handled errors below occurs, in which case nothing is stored in
        ``new`` for that row.
    """
    # NOTE(review): year/divcode/no are presumably meant to be interpolated
    # into the URL below, which is empty in this copy of the script -- TODO
    # restore the real URL before running.
    year, divcode, no = parseCase(case['CaseNum'])
    try:
        url = ''
        r = requests.get(url)
        answer = r.text
        page = BeautifulSoup(answer, "lxml")
        # Rows of the element at index 2 among the siblings that follow the
        # page's first <table>.
        trs = page.table.find_next_siblings()[2].findAll('tr')
        # When row 6 holds the 'Defendant(s)' heading, the two wanted cells
        # are one row further down.
        if trs[6].td.text == 'Defendant(s)':
            d = [trs[7].td.text, trs[8].td.text]
        else:
            d = [trs[6].td.text, trs[7].td.text]
        # Single-argument print: identical output on Python 2 and Python 3.
        print('%s %s' % (d, case['id']))
        new[case['id']] = d
        return case
    except requests.exceptions.ConnectionError:
        print('connection error')
        print('waiting...')
        # Pause before returning; the failed row itself is not retried.
        time.sleep(5)
    except AttributeError:
        # Typically an expected tag was missing, so a lookup yielded None.
        print('none type, bro')
    except IndexError:
        # Fewer siblings or rows than the fixed indices above expect.
        print('index out of range, yo')
# Run get_info on every row whose 'id' is greater than 1995; get_info stores
# what it scrapes in `new`.
e = e[e['id'] > 1995].apply(get_info, axis=1)
# Materialise keys/values as lists so both columns are built from plain
# sequences on Python 2 and Python 3 (on Python 3 they are dict views).
m = pd.DataFrame({'ID': list(new.keys()), 'info': list(new.values())})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment