Created
October 24, 2019 04:08
-
-
Save simonw/bbde42c1e42d13107a17a050c3b5993f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Importing Louisiana bills for a Big Local project\n", | |
"\n", | |
"We have a collection of zip files containing \"bills\" - where each bill is described using JSON.\n", | |
"\n", | |
"I'm going to import them into a SQLite database using https://sqlite-utils.readthedocs.io/ so they can be explored using Datasette https://datasette.readthedocs.io/\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import json\n", | |
"import sqlite_utils" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"It looks to me like the interesting stuff is in the bills JSON files inside the zips.\n", | |
"\n", | |
"e.g. `/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/LA 2/2019-2019_Regular_Session/bill/HB4.json`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"bill = json.load(open(\"/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/LA 2/2019-2019_Regular_Session/bill/HB4.json\"))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"dict_keys(['bill'])" | |
] | |
}, | |
"execution_count": 42, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"bill.keys()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'bill_id': 1187160,\n", | |
" 'change_hash': 'a3f129234f61f5f54285aef24149bed2',\n", | |
" 'session_id': 1634,\n", | |
" 'session': {'session_id': 1634,\n", | |
" 'session_name': '2019 Regular Session',\n", | |
" 'session_title': 'Regular Session',\n", | |
" 'year_start': 2019,\n", | |
" 'year_end': 2019,\n", | |
" 'special': 0},\n", | |
" 'url': 'https://legiscan.com/LA/bill/HB4/2019',\n", | |
" 'state_link': 'http://www.legis.la.gov/legis/BillInfo.aspx?s=19rs&b=HB4&sbi=y',\n", | |
" 'completed': 1,\n", | |
" 'status': 4,\n", | |
" 'status_date': '2019-06-11',\n", | |
" 'progress': [{'date': '2019-01-28', 'event': 1},\n", | |
" {'date': '2019-01-28', 'event': 9},\n", | |
" {'date': '2019-04-08', 'event': 9},\n", | |
" {'date': '2019-05-08', 'event': 10},\n", | |
" {'date': '2019-05-21', 'event': 2},\n", | |
" {'date': '2019-05-22', 'event': 9},\n", | |
" {'date': '2019-05-28', 'event': 10},\n", | |
" {'date': '2019-06-03', 'event': 3},\n", | |
" {'date': '2019-06-11', 'event': 8},\n", | |
" {'date': '2019-06-11', 'event': 4}],\n", | |
" 'state': 'LA',\n", | |
" 'state_id': 18,\n", | |
" 'bill_number': 'HB4',\n", | |
" 'bill_type': 'B',\n", | |
" 'bill_type_id': '1',\n", | |
" 'body': 'H',\n", | |
" 'body_id': 45,\n", | |
" 'current_body': 'H',\n", | |
" 'current_body_id': 45,\n", | |
" 'title': \"Provides relative to the procedures for sheriff's sales of property\",\n", | |
" 'description': \"Provides relative to the procedures for sheriff's sales of property\",\n", | |
" 'committee': [],\n", | |
" 'pending_committee_id': 0,\n", | |
" 'history': [{'date': '2019-01-28',\n", | |
" 'action': 'Prefiled.',\n", | |
" 'chamber': 'H',\n", | |
" 'chamber_id': 45,\n", | |
" 'importance': 1},\n", | |
" {'date': '2019-01-28',\n", | |
" 'action': 'Under the rules, provisionally referred to the Committee on Judiciary.',\n", | |
" 'chamber': 'H',\n", | |
" 'chamber_id': 45,\n", | |
" 'importance': 1},\n", | |
" {'date': '2019-02-01',\n", | |
" 'action': 'First appeared in the Interim Calendar on 2/1/2019.',\n", | |
" 'chamber': 'H',\n", | |
" 'chamber_id': 45,\n", | |
" 'importance': 0},\n", | |
" {'date': '2019-04-08',\n", | |
" 'action': 'Read by title, under the rules, referred to the Committee on Judiciary.',\n", | |
" 'chamber': 'H',\n", | |
" 'chamber_id': 45,\n", | |
" 'importance': 1},\n", | |
" {'date': '2019-05-08',\n", | |
" 'action': 'Reported with amendments (13-0).',\n", | |
" 'chamber': 'H',\n", | |
" 'chamber_id': 45,\n", | |
" 'importance': 1},\n", | |
" {'date': '2019-05-09',\n", | |
" 'action': 'Read by title, amended, ordered engrossed, passed to 3rd reading.',\n", | |
" 'chamber': 'H',\n", | |
" 'chamber_id': 45,\n", | |
" 'importance': 0},\n", | |
" {'date': '2019-05-09',\n", | |
" 'action': 'Scheduled for floor debate on 05/21/19.',\n", | |
" 'chamber': 'H',\n", | |
" 'chamber_id': 45,\n", | |
" 'importance': 0},\n", | |
" {'date': '2019-05-21',\n", | |
" 'action': 'Read third time by title, roll called on final passage, yeas 96, nays 0. Finally passed, title adopted, ordered to the Senate.',\n", | |
" 'chamber': 'H',\n", | |
" 'chamber_id': 45,\n", | |
" 'importance': 0},\n", | |
" {'date': '2019-05-21',\n", | |
" 'action': 'Received in the Senate. Rules suspended. Read first time by title and placed on the Calendar for a second reading.',\n", | |
" 'chamber': 'S',\n", | |
" 'chamber_id': 46,\n", | |
" 'importance': 1},\n", | |
" {'date': '2019-05-22',\n", | |
" 'action': 'Read second time by title and referred to the Committee on Judiciary B.',\n", | |
" 'chamber': 'S',\n", | |
" 'chamber_id': 46,\n", | |
" 'importance': 1},\n", | |
" {'date': '2019-05-28',\n", | |
" 'action': 'Rules suspended. Reported favorably.',\n", | |
" 'chamber': 'S',\n", | |
" 'chamber_id': 46,\n", | |
" 'importance': 1},\n", | |
" {'date': '2019-05-29',\n", | |
" 'action': 'Read by title and referred to the Legislative Bureau.',\n", | |
" 'chamber': 'S',\n", | |
" 'chamber_id': 46,\n", | |
" 'importance': 0},\n", | |
" {'date': '2019-05-30',\n", | |
" 'action': 'Reported without Legislative Bureau amendments. Read by title and passed to third reading and final passage.',\n", | |
" 'chamber': 'S',\n", | |
" 'chamber_id': 46,\n", | |
" 'importance': 0},\n", | |
" {'date': '2019-06-02',\n", | |
" 'action': 'Rules suspended. Read by title, passed by a vote of 35 yeas and 1 nays, and ordered returned to the House. Motion to reconsider tabled.',\n", | |
" 'chamber': 'S',\n", | |
" 'chamber_id': 46,\n", | |
" 'importance': 0},\n", | |
" {'date': '2019-06-02',\n", | |
" 'action': 'Received from the Senate without amendments.',\n", | |
" 'chamber': 'H',\n", | |
" 'chamber_id': 45,\n", | |
" 'importance': 0},\n", | |
" {'date': '2019-06-03',\n", | |
" 'action': 'Enrolled and signed by the Speaker of the House.',\n", | |
" 'chamber': 'H',\n", | |
" 'chamber_id': 45,\n", | |
" 'importance': 1},\n", | |
" {'date': '2019-06-04',\n", | |
" 'action': 'Signed by the President of the Senate.',\n", | |
" 'chamber': 'S',\n", | |
" 'chamber_id': 46,\n", | |
" 'importance': 0},\n", | |
" {'date': '2019-06-05',\n", | |
" 'action': 'Sent to the Governor for executive approval.',\n", | |
" 'chamber': 'H',\n", | |
" 'chamber_id': 45,\n", | |
" 'importance': 0},\n", | |
" {'date': '2019-06-11',\n", | |
" 'action': 'Signed by the Governor. Becomes Act No. 415.',\n", | |
" 'chamber': 'H',\n", | |
" 'chamber_id': 45,\n", | |
" 'importance': 1}],\n", | |
" 'sponsors': [{'people_id': 17824,\n", | |
" 'person_hash': '0cys0f51',\n", | |
" 'party_id': 2,\n", | |
" 'party': 'R',\n", | |
" 'role_id': 1,\n", | |
" 'role': 'Rep',\n", | |
" 'name': 'Stephen Dwight',\n", | |
" 'first_name': 'Stephen',\n", | |
" 'middle_name': 'C.',\n", | |
" 'last_name': 'Dwight',\n", | |
" 'suffix': '',\n", | |
" 'nickname': '',\n", | |
" 'district': 'HD-035',\n", | |
" 'ftm_eid': 32373968,\n", | |
" 'votesmart_id': 165523,\n", | |
" 'opensecrets_id': '',\n", | |
" 'ballotpedia': 'Stephen_Dwight',\n", | |
" 'sponsor_type_id': 1,\n", | |
" 'sponsor_order': 1,\n", | |
" 'committee_sponsor': 0,\n", | |
" 'committee_id': '0'}],\n", | |
" 'sasts': [],\n", | |
" 'subjects': [{'subject_id': 3827, 'subject_name': 'PROPERTY/IMMOVABLE'}],\n", | |
" 'texts': [{'doc_id': 1888068,\n", | |
" 'date': '0000-00-00',\n", | |
" 'type': 'Introduced',\n", | |
" 'type_id': 1,\n", | |
" 'mime': 'application/pdf',\n", | |
" 'mime_id': 2,\n", | |
" 'url': 'https://legiscan.com/LA/text/HB4/id/1888068',\n", | |
" 'state_link': 'http://www.legis.la.gov/Legis/ViewDocument.aspx?d=1111410',\n", | |
" 'text_size': 38480},\n", | |
" {'doc_id': 2012585,\n", | |
" 'date': '0000-00-00',\n", | |
" 'type': 'Engrossed',\n", | |
" 'type_id': 4,\n", | |
" 'mime': 'application/pdf',\n", | |
" 'mime_id': 2,\n", | |
" 'url': 'https://legiscan.com/LA/text/HB4/id/2012585',\n", | |
" 'state_link': 'http://www.legis.la.gov/Legis/ViewDocument.aspx?d=1132572',\n", | |
" 'text_size': 40755},\n", | |
" {'doc_id': 2031989,\n", | |
" 'date': '0000-00-00',\n", | |
" 'type': 'Enrolled',\n", | |
" 'type_id': 5,\n", | |
" 'mime': 'application/pdf',\n", | |
" 'mime_id': 2,\n", | |
" 'url': 'https://legiscan.com/LA/text/HB4/id/2031989',\n", | |
" 'state_link': 'http://www.legis.la.gov/Legis/ViewDocument.aspx?d=1141455',\n", | |
" 'text_size': 36084},\n", | |
" {'doc_id': 2042583,\n", | |
" 'date': '0000-00-00',\n", | |
" 'type': 'Chaptered',\n", | |
" 'type_id': 6,\n", | |
" 'mime': 'application/pdf',\n", | |
" 'mime_id': 2,\n", | |
" 'url': 'https://legiscan.com/LA/text/HB4/id/2042583',\n", | |
" 'state_link': 'http://www.legis.la.gov/Legis/ViewDocument.aspx?d=1144887',\n", | |
" 'text_size': 42881}],\n", | |
" 'votes': [{'roll_call_id': 868132,\n", | |
" 'date': '2019-05-21',\n", | |
" 'desc': 'House Vote on HB 4 FINAL PASSAGE (#451)',\n", | |
" 'yea': 96,\n", | |
" 'nay': 0,\n", | |
" 'nv': 0,\n", | |
" 'absent': 9,\n", | |
" 'total': 105,\n", | |
" 'passed': 1,\n", | |
" 'chamber': 'H',\n", | |
" 'chamber_id': 45,\n", | |
" 'url': 'https://legiscan.com/LA/rollcall/HB4/id/868132',\n", | |
" 'state_link': 'http://www.legis.la.gov/Legis/ViewDocument.aspx?d=1136368'},\n", | |
" {'roll_call_id': 876289,\n", | |
" 'date': '2019-06-02',\n", | |
" 'desc': 'Senate Vote on HB 4 FINAL PASSAGE (#772)',\n", | |
" 'yea': 35,\n", | |
" 'nay': 1,\n", | |
" 'nv': 0,\n", | |
" 'absent': 3,\n", | |
" 'total': 39,\n", | |
" 'passed': 1,\n", | |
" 'chamber': 'S',\n", | |
" 'chamber_id': 46,\n", | |
" 'url': 'https://legiscan.com/LA/rollcall/HB4/id/876289',\n", | |
" 'state_link': 'http://www.legis.la.gov/Legis/ViewDocument.aspx?d=1141000'}],\n", | |
" 'amendments': [{'amendment_id': 73639,\n", | |
" 'adopted': 0,\n", | |
" 'chamber': 'H',\n", | |
" 'chamber_id': 45,\n", | |
" 'date': '0000-00-00',\n", | |
" 'title': 'House Committee Amendment #1017 JUD Draft',\n", | |
" 'description': 'House Committee Amendment #1017 JUD Draft',\n", | |
" 'mime': 'application/pdf',\n", | |
" 'mime_id': 2,\n", | |
" 'url': 'https://legiscan.com/LA/amendment/HB4/id/73639',\n", | |
" 'state_link': 'http://www.legis.la.gov/Legis/ViewDocument.aspx?d=1123678'},\n", | |
" {'amendment_id': 75936,\n", | |
" 'adopted': 0,\n", | |
" 'chamber': 'H',\n", | |
" 'chamber_id': 45,\n", | |
" 'date': '0000-00-00',\n", | |
" 'title': 'House Committee Amendment #2398 JUD Proposed',\n", | |
" 'description': 'House Committee Amendment #2398 JUD Proposed',\n", | |
" 'mime': 'application/pdf',\n", | |
" 'mime_id': 2,\n", | |
" 'url': 'https://legiscan.com/LA/amendment/HB4/id/75936',\n", | |
" 'state_link': 'http://www.legis.la.gov/Legis/ViewDocument.aspx?d=1132459'}],\n", | |
" 'supplements': [{'supplement_id': 94072,\n", | |
" 'date': '0000-00-00',\n", | |
" 'type': 'Analysis',\n", | |
" 'type_id': 2,\n", | |
" 'title': 'Analysis',\n", | |
" 'description': 'Digest of HB4 Original',\n", | |
" 'mime': 'application/pdf',\n", | |
" 'mime_id': 2,\n", | |
" 'url': 'https://legiscan.com/LA/supplement/HB4/id/94072',\n", | |
" 'state_link': 'http://www.legis.la.gov/Legis/ViewDocument.aspx?d=1111412'},\n", | |
" {'supplement_id': 109022,\n", | |
" 'date': '0000-00-00',\n", | |
" 'type': 'Analysis',\n", | |
" 'type_id': 2,\n", | |
" 'title': 'Analysis',\n", | |
" 'description': 'Digest of HB4 Engrossed',\n", | |
" 'mime': 'application/pdf',\n", | |
" 'mime_id': 2,\n", | |
" 'url': 'https://legiscan.com/LA/supplement/HB4/id/109022',\n", | |
" 'state_link': 'http://www.legis.la.gov/Legis/ViewDocument.aspx?d=1132897'},\n", | |
" {'supplement_id': 110800,\n", | |
" 'date': '0000-00-00',\n", | |
" 'type': 'Analysis',\n", | |
" 'type_id': 2,\n", | |
" 'title': 'Analysis',\n", | |
" 'description': 'Resume Digest for HB4',\n", | |
" 'mime': 'application/pdf',\n", | |
" 'mime_id': 2,\n", | |
" 'url': 'https://legiscan.com/LA/supplement/HB4/id/110800',\n", | |
" 'state_link': 'http://www.legis.la.gov/Legis/ViewDocument.aspx?d=1145650'}],\n", | |
" 'calendar': [{'type_id': 1,\n", | |
" 'type': 'Hearing',\n", | |
" 'date': '2019-05-08',\n", | |
" 'time': '09:30',\n", | |
" 'location': 'Room 1',\n", | |
" 'description': 'House Committee on Judiciary'},\n", | |
" {'type_id': 1,\n", | |
" 'type': 'Hearing',\n", | |
" 'date': '2019-05-28',\n", | |
" 'time': '09:00',\n", | |
" 'location': 'Room E',\n", | |
" 'description': 'Senate Committee on Judiciary B'}]}" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"bill['bill']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 74, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def save_bill(db, bill):\n", | |
" bill = dict(bill)\n", | |
" session = bill.pop(\"session\")\n", | |
" session[\"name\"] = session.pop(\"session_name\")\n", | |
" bill_id = bill[\"bill_id\"]\n", | |
" db[\"sessions\"].upsert(session, pk=\"session_id\")\n", | |
" \n", | |
" # Does it have a committee?\n", | |
" committee = bill.pop(\"committee\", None)\n", | |
" if committee:\n", | |
" db[\"committees\"].upsert(committee, pk=\"committee_id\")\n", | |
" bill[\"committee\"] = committee[\"committee_id\"]\n", | |
" \n", | |
" # Bill types\n", | |
" bill_type = {\n", | |
" \"bill_type_id\": bill[\"bill_type_id\"],\n", | |
" \"name\": bill.pop(\"bill_type\"),\n", | |
" }\n", | |
" db[\"bill_types\"].upsert(bill_type, pk=\"bill_type_id\")\n", | |
" # history, sponser etc items will be handled separately\n", | |
" history = bill.pop(\"history\", None) or []\n", | |
" sponsors = bill.pop(\"sponsors\", None) or []\n", | |
" progress = bill.pop(\"progress\", None) or []\n", | |
" subjects = bill.pop(\"subjects\", None) or []\n", | |
" texts = bill.pop(\"texts\", None) or []\n", | |
" votes = bill.pop(\"votes\", None) or []\n", | |
" amendments = bill.pop(\"amendments\", None) or []\n", | |
" supplements = bill.pop(\"supplements\", None) or []\n", | |
" calendar = bill.pop(\"calendar\", None) or []\n", | |
" sasts = bill.pop(\"sasts\", None) or []\n", | |
" \n", | |
" db[\"bills\"].upsert(bill, pk=\"bill_id\", foreign_keys=(\"session_id\", \"bill_type_id\", \"committee\"), column_order=(\"bill_id\", \"bill_number\", \"session_id\", \"title\", \"description\", \"url\"))\n", | |
" \n", | |
" # Now save all the stuff we stashed earlier\n", | |
" for item in history:\n", | |
" item[\"bill_id\"] = bill_id\n", | |
" item[\"id\"] = id_for_history(item)\n", | |
" db[\"history\"].upsert_all(history, pk=\"id\", foreign_keys=(\"bill_id\",))\n", | |
" \n", | |
" for item in sponsors:\n", | |
" db[\"people\"].upsert(item, pk=\"people_id\")\n", | |
" db[\"sponsors\"].upsert({\n", | |
" \"bill_id\": bill_id,\n", | |
" \"people_id\": item[\"people_id\"],\n", | |
" },\n", | |
" pk=(\"bill_id\", \"people_id\"),\n", | |
" foreign_keys=((\"bill_id\", \"bills\", \"bill_id\"), (\"people_id\", \"people\", \"people_id\")),\n", | |
" )\n", | |
"\n", | |
" for subject in subjects:\n", | |
" db[\"subjects\"].upsert({\n", | |
" \"subject_id\": subject[\"subject_id\"],\n", | |
" \"name\": subject[\"subject_name\"],\n", | |
" }, pk=\"subject_id\")\n", | |
" db[\"bill_subjects\"].upsert({\n", | |
" \"bill_id\": bill_id,\n", | |
" \"subject_id\": subject[\"subject_id\"],\n", | |
" }, pk=(\"bill_id\", \"subject_id\"), foreign_keys=(\"bill_id\", \"subject_id\"))\n", | |
"\n", | |
" for item in texts:\n", | |
" item[\"bill_id\"] = bill_id\n", | |
" db[\"texts\"].upsert_all(texts, pk=\"doc_id\", foreign_keys=(\"bill_id\",))\n", | |
"\n", | |
" for item in votes:\n", | |
" item[\"bill_id\"] = bill_id\n", | |
" db[\"votes\"].upsert_all(votes, pk=\"roll_call_id\", foreign_keys=(\"bill_id\",))\n", | |
"\n", | |
" for item in amendments:\n", | |
" item[\"bill_id\"] = bill_id\n", | |
" db[\"amendments\"].upsert_all(amendments, pk=\"amendment_id\", foreign_keys=(\"bill_id\",))\n", | |
"\n", | |
" for item in supplements:\n", | |
" item[\"bill_id\"] = bill_id\n", | |
" db[\"supplements\"].upsert_all(supplements, pk=\"supplement_id\", foreign_keys=(\"bill_id\",))\n", | |
"\n", | |
" for item in calendar:\n", | |
" item[\"bill_id\"] = bill_id\n", | |
" db[\"calendars\"].upsert_all(calendar, hash_id=\"id\", foreign_keys=(\"bill_id\",))\n", | |
"\n", | |
" for item in progress:\n", | |
" item[\"bill_id\"] = bill_id\n", | |
" db[\"progress\"].upsert_all(progress, hash_id=\"id\", foreign_keys=(\"bill_id\",))\n", | |
"\n", | |
" for item in sasts:\n", | |
" item[\"bill_id\"] = bill_id\n", | |
" db[\"sasts\"].upsert_all(sasts, hash_id=\"id\", foreign_keys=(\"bill_id\",))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import glob\n", | |
"import hashlib" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def id_for_history(history):\n", | |
" return history[\"date\"] + '-' + hashlib.sha1(\n", | |
" json.dumps(history, separators=(\",\", \":\"), sort_keys=True).encode(\"utf8\")\n", | |
" ).hexdigest()\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Now do it for ALL of the zip files" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 51, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/LA_2017-2017_1st_Special_Session_JSON_20170303_1c6c379a0812d16cc6e37c58edd49b07.zip'" | |
] | |
}, | |
"execution_count": 51, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"path = glob.glob(\"/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/*.zip\")[0]\n", | |
"path" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 77, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import zipfile\n", | |
"from tqdm import tqdm_notebook" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 75, | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/LA_2017-2017_1st_Special_Session_JSON_20170303_1c6c379a0812d16cc6e37c58edd49b07.zip\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "844d63d16f59497c8ff0a3b6838bdee0", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=63), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/LA_2016-2016_1st_Special_Session_JSON_20160323_54f6629215b7154f6016bbc6916d07c8.zip\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "25817160ff5d4374a7bb3fbbfd0ec9cb", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=208), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/LA_2010-2010_Regular_Session_JSON_20100709_b48471018582304f8fea4310bca02252.zip\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "65b8444d8a074beabd499e2ea144d0f3", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=3150), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/LA_2018-2018_1st_Special_Session_JSON_20180326_0f61f2b1744734fc449aedac4e63ba8b.zip\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "8cc8648c5cf34a72b39676e0e348db32", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=71), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/LA_2016-2016_Regular_Session_JSON_20160622_44a138d299546438ecdd49ac30f8b404.zip\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "ff9fa6fbe50d4480a843ab5739510113", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=2404), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/LA_2014-2014_Regular_Session_JSON_20140623_75d27dc7b6cb8d01313debe183d9a404.zip\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "3bdc8d6fce20426c9873e435f9428f90", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=2823), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/LA_2018-2018_Regular_Session_JSON_20180607_be428bc5dc290eadb68ccfe4da9ee16e.zip\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "39f3eb471aaf4d98b41f9fa947180e54", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=2216), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/LA_2017-2017_2nd_Special_Session_JSON_20170629_ff2878ba3ef09842473fe77c9489d54e.zip\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "968b351e428847bf9a67ed94e9c064da", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=37), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/LA_2011-2011_1st_Special_Session_JSON_20110415_5977d4e451f6d1f267cf996a85dcab8f.zip\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "fefe0b5195d540ae9e6f454d64494450", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=179), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/LA_2015-2015_Regular_Session_JSON_20150701_41b9a0c9e02faf3331fe9b27474e768b.zip\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "0a44692a615445a49a9e666cb8311e6c", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=1974), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/LA_2012-2012_Regular_Session_JSON_20120615_86b9c3de62acdf989e847ffd88f9bf17.zip\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "3cdf1358d6944d8e89a9602295cff177", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=2734), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/LA_2013-2013_Regular_Session_JSON_20130626_cc2bcba14d9205cddc803b1bf0f6b8ba.zip\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "18818d7ab99349d49bb6a775c40e436e", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=1763), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/LA_2017-2017_Regular_Session_JSON_20170630_625263f55b207673251bfd0fa17c8b11.zip\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "06b6e13e98084183921631ec01f2f15c", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=1670), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/LA_2018-2018_2nd_Special_Session_JSON_20180618_bb419aa09bc14b6b8ff5968196a24b35.zip\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "b69e2bae574349c2a28018de713cf5e7", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=172), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/LA_2019-2019_Regular_Session_JSON_20190730_4edbf0b284d5bf72c6da92b76c626573.zip\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "7b1b7e14e2bd40779f8478d5510ff2de", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=1735), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/LA_2018-2018_3rd_Special_Session_JSON_20180627_700e121dfd5a2320844ba34f0cb998ca.zip\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "47426fbc35a24b86b986c319ab3c92ea", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=79), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/LA_2016-2016_2nd_Special_Session_JSON_20160711_e04cf88c7b13e09c4594eb02af5b8249.zip\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "fb5d2ead2b4c4a5cb1ad16be8644142b", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=169), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/LA_2011-2011_Regular_Session_JSON_20110713_27c4c635ef1d569710775409f8a7a04b.zip\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "2e02e650476c4057adb8dd9ec6240c73", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=1583), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"db = sqlite_utils.Database(\"/tmp/la_bills.db\")\n", | |
"\n", | |
"for zippath in glob.glob(\"/Users/simonw/Dropbox/Development/big-local-2019/drive-download-20191023T183757Z-001/*.zip\"):\n", | |
" zf = zipfile.ZipFile(zippath)\n", | |
" print(zippath)\n", | |
" filenames = [f.filename for f in zf.filelist]\n", | |
" bills = [f for f in filenames if \"/bill/\" in f and f.endswith(\".json\")]\n", | |
" for bill in tqdm_notebook(bills):\n", | |
" bill = json.loads(zf.open(bill).read())['bill']\n", | |
" save_bill(db, bill)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 76, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<Table bills (bill_id, bill_number, session_id, title, description, url, change_hash, state_link, completed, status, status_date, state, state_id, bill_type_id, body, body_id, current_body, current_body_id, pending_committee_id, committee)>" | |
] | |
}, | |
"execution_count": 76, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Make it so you can search against the title and decsription fields:\n", | |
"db[\"bills\"].enable_fts([\"title\", \"description\"], create_triggers=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment