Skip to content

Instantly share code, notes, and snippets.

@careduz
Last active February 13, 2018 21:32
Show Gist options
  • Save careduz/ae6e09ccd6197015eb320e9d931c1138 to your computer and use it in GitHub Desktop.
Save careduz/ae6e09ccd6197015eb320e9d931c1138 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 0. Context\n",
"The purpose of this script is to transform the Lobbyist Registry XML data and load it into a Neo4j graph database. This transformation is performed through the following steps:\n",
"1. Importing XML data\n",
"2. Parsing XML data\n",
"3. Creating Neo4j loader\n",
"4. Loading Neo4j database"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import time\n",
"import traceback\n",
"from xml.dom import minidom\n",
"from tqdm import tqdm\n",
"from py2neo import Graph, Node, Relationship, authenticate, ConstraintError\n",
"\n",
"directory = \"./\"\n",
"input_files_dir, output_files_dir = \"input_files/\", \"output_files/\"\n",
"\n",
"neo4j = {'host_port': 'localhost:7474', 'user_name': 'neo4j',\n",
" 'password': 'open-data', 'db_uri': 'http://localhost:7474/db/data/'}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. Import XML data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def extractXMLdata(files):\n",
" extracts = {}\n",
" for xml_filename in tqdm(files, desc='XMLs'):\n",
" xml_row_list = []\n",
" xml_file = open(\"{0}{1}{2}\".format(directory, input_files_dir, xml_filename))\n",
" xml_doc = minidom.parse(xml_file)\n",
" rows = xml_doc.getElementsByTagName('ROW')\n",
" xml_row_list.append(rows)\n",
" extracts[xml_filename] = xml_row_list\n",
" \n",
" return extracts "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"XMLs: 100%|██████████| 2/2 [00:29<00:00, 14.68s/it]\n"
]
}
],
"source": [
"files = [ filename for filename in os.listdir(directory + input_files_dir) if (filename.startswith( \"lobbyactivity\" ) and filename.endswith(\".xml\"))]\n",
"\n",
"extracts = extractXMLdata(files)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2. Parse XML data\n",
"After loading the XML data, it needed to be parsed so values could be loaded into a tool for exploration. To parse this data, created a simple function that takes two inputs:\n",
"1. XML object, and\n",
"2. List of tags to extract from the object"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def getTagValues(xml_elem, fields=[]):\n",
" \"\"\"\n",
" returns tag values for XML element if in field list provided\n",
" \"\"\"\n",
" values = {}\n",
" for node in xml_elem.childNodes:\n",
" if node.nodeName in fields and len(node.childNodes) == 1:\n",
" values[node.nodeName] = node.firstChild.data\n",
" return values\n",
"\n",
"tag_list = {\n",
" 'ROW': ['SMNumber', 'Status', 'Type', 'SubjectMatter', 'Particulars', 'InitialApprovalDate'\n",
" , 'EffectiveDate', 'ProposedStartDate', 'ProposedEndDate']\n",
" , 'Registrant': ['RegistrationNUmber', 'RegistrationNUmberWithSoNum', 'Status'\n",
" , 'EffectiveDate', 'Type', 'Prefix', 'FirstName', 'MiddleInitials'\n",
" , 'LastName', 'Suffix', 'PositionTitle', 'PreviousPublicOfficeHolder'\n",
" , 'PreviousPublicOfficeHoldPosition'\n",
" , 'PreviousPublicOfficeHoldPositionProgramName'\n",
" , 'PreviousPublicOfficeHoldLastDate']\n",
" , 'BusinessAddress': ['AddressLine1', 'AddressLine2', 'City', 'Country'\n",
" , 'PostalCode', 'Province', 'Phone']\n",
" , 'Communication': ['CommunicationDate', 'CommunicationGroupId', 'CommunicationMethod'\n",
" , 'LobbyistBusiness', 'LobbyistBusinessAddress', 'LobbyistFirstName'\n",
" , 'LobbyistLastName', 'LobbyistMiddleInitials', 'LobbyistNumber'\n",
" , 'LobbyistPositionTitle', 'LobbyistPrefix', 'LobbyistSuffix'\n",
" , 'LobbyistType', 'POH_Name', 'POH_Office', 'POH_Position', 'POH_Type'\n",
" , 'PreviousPublicOfficeHoldLastDate', 'PreviousPublicOfficeHoldPosition'\n",
" , 'PreviousPublicOfficeHolder'\n",
" , 'PreviousPublicOfficePositionProgramName']\n",
" , 'Meeting': ['Committee', 'Desc', 'Date']\n",
" , 'POH': ['Name', 'Office', 'Title', 'Type']\n",
" , 'Lobbyist': ['Number', 'Prefix', 'FirstName', 'MiddleInitials'\n",
" , 'LastName', 'Suffix', 'Business', 'Type']\n",
" , 'Beneficiary': ['Type', 'Name', 'TradeName', 'FiscalStart'\n",
" , 'FiscalEnd', 'BusinessAddress']\n",
" , 'Privatefunding': ['Funding', 'Contact', 'Agent', 'AgentContact']\n",
" , 'GmtFunding': ['GMTName', 'Program']\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3. Create Neo4j loader\n",
"Decided to use Neo4j (https://neo4j.com) due to the highly connected nature of the data. In order to load it systematically, defined functions to create the elements of the graph database - nodes and properties.\n",
"\n",
"## 3.1. General graph database load functions"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def createNode(name, properties):\n",
" \"\"\"\n",
" takes node name (str) and dictionary with node properties, adding:\n",
" 1. a FullName property if there is a FirstName and/or LastName property in props\n",
" 2. writes node to graph database\n",
" 3. returns py2neo Node object\n",
" returns py2neo Node object\n",
" \"\"\"\n",
" keys = properties.keys()\n",
" \n",
" if (\"FirstName\" in keys) and (\"LastName\" in keys): properties[\"FullName\"] = properties[\"FirstName\"] + \" \" + properties[\"LastName\"]\n",
" elif (\"FirstName\" in keys) and (\"LastName\" not in keys): properties[\"FullName\"] = properties[\"FirstName\"] + \" UNK\"\n",
" elif (\"FirstName\" not in keys) and (\"LastName\" in keys): properties[\"FullName\"] = \"UNK \" + properties[\"LastName\"]\n",
" \n",
" node = Node(name, **properties)\n",
" if len(node.keys()) > 0: return node\n",
" else: return None\n",
"\n",
"\n",
"def createRelationship(node1, rlship, node2, properties={}):\n",
" \"\"\"\n",
" takes 2 py2neo Node object and the relationship between them, as str,\n",
" and writes Relationship to graph database\n",
" \"\"\"\n",
" if (node1 is not None) and (node2 is not None):\n",
" if (len(node1.keys()) > 0) and (len(node2.keys()) > 0):\n",
" node1_to_node2 = Relationship(node1, rlship, node2, **properties)\n",
" return node1_to_node2\n",
" else: return None\n",
" else: return None\n",
" \n",
"\n",
"def getNodeProperties(tag_values={}, props=[], prefixes=[], exclusion=[]):\n",
" \"\"\"\n",
" creates Node properties by taking dictionrytag/values, adding prefix to each tag, \n",
" excluding undesired tags, and returning a dictionary of node properties without prefixes\n",
" that contains only tags in properties\n",
" \"\"\"\n",
" prefixes += [\"\"]\n",
" values = {}\n",
" for p in prefixes:\n",
" for t in [ (p + f) for f in props if (f != p) and (f not in exclusion) ]:\n",
" if (t in tag_values) and (tag_values[t] != \".\"): values[t.replace(p, \"\")] = tag_values[t]\n",
" return values"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3.2. Specific XML tag parser\n",
"Defined functions to parse an XML row. Each function parses a tag, and creates the proper nodes (with properties) and relationships between them."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def parseRow(row, properties): \n",
" r = getTagValues(row, tag_list['ROW'])\n",
" nodes, relationships = [], []\n",
" \n",
" LobbyingActivity = getNodeProperties(r, properties['LobbyingActivity'])\n",
" LobbyingActivity = createNode('LobbyingActivity', LobbyingActivity)\n",
"\n",
" matters = getNodeProperties(r, properties['SubjectMatter'])\n",
"\n",
" nodes.append(LobbyingActivity)\n",
" \n",
" for matter_name in matters['SubjectMatter'].split(\";\"):\n",
" SubjectMatter = createNode('SubjectMatter', {'Name': matter_name})\n",
" LA_SME_rlship = createRelationship(LobbyingActivity, \"CONCERNED\", SubjectMatter)\n",
" nodes.append(SubjectMatter)\n",
" relationships.append(LA_SME_rlship)\n",
"\n",
" return nodes, relationships\n",
"\n",
"\n",
"def parseRegistrant(row, LobbyingActivity, properties):\n",
" r = row.getElementsByTagName(\"Registrant\")[0]\n",
" registrant = getTagValues(r, tag_list['Registrant'])\n",
" nodes, relationships = [], []\n",
" \n",
" Registration = getNodeProperties(registrant, properties['Registration'])\n",
" Registration = createNode('Registration', Registration)\n",
"\n",
" Lobbyist = getNodeProperties(registrant, properties['Lobbyist'])\n",
" Lobbyist = createNode('Lobbyist', Lobbyist)\n",
" \n",
" PublicOfficeHolder_Previous = getNodeProperties(registrant, properties['PublicOfficeHolder'],\n",
" ['PreviousPublicOfficeHold',\n",
" 'PreviousPublicOfficePosition'])\n",
" PublicOfficeHolder_Previous = createNode('PublicOfficeHolder', PublicOfficeHolder_Previous) \n",
" \n",
" L_R_rlship = createRelationship(Lobbyist, \"REGISTERED\", Registration)\n",
" L_LA_rlship = createRelationship(Lobbyist, \"RECORDED\", LobbyingActivity)\n",
" L_POH_rlship = createRelationship(Lobbyist, \"WAS_A\", PublicOfficeHolder_Previous)\n",
" addr_nodes, addr_rlships = parseAddresses(r.getElementsByTagName(\"BusinessAddress\"), Lobbyist, properties)\n",
" \n",
" relationships.extend([L_R_rlship, L_LA_rlship, L_POH_rlship] + addr_rlships)\n",
" nodes.extend([Registration, Lobbyist, PublicOfficeHolder_Previous] + addr_nodes)\n",
" \n",
" return nodes, relationships\n",
"\n",
"\n",
"def parseCommunications(row, LobbyingActivity, properties):\n",
" comms = row.getElementsByTagName(\"Communication\")\n",
" nodes, relationships = [], []\n",
" \n",
" for c in comms:\n",
" Ind_nodes, Ind_rlships = parseIndividualCommunication(c, properties)\n",
" nodes.extend(Ind_nodes)\n",
" relationships.extend(Ind_rlships)\n",
" \n",
" return nodes, relationships\n",
"\n",
"\n",
"def parseIndividualCommunication(c, properties):\n",
" comm = getTagValues(c, tag_list['Communication'])\n",
" nodes, relationships = [], []\n",
" \n",
" Lobbyist = getNodeProperties(comm, properties['Lobbyist'], ['Lobbyist'])\n",
" Lobbyist = createNode('Lobbyist', Lobbyist)\n",
"\n",
" PublicOfficeHolder_Previous = getNodeProperties(comm, properties['PublicOfficeHolder'],\n",
" ['PreviousPublicOfficeHold',\n",
" 'PreviousPublicOfficePosition'], ['Type'] )\n",
" PublicOfficeHolder_Previous = createNode('PublicOfficeHolder', PublicOfficeHolder_Previous)\n",
"\n",
" PublicOfficeHolder = getNodeProperties(comm, properties['PublicOfficeHolder'], ['POH_'])\n",
" PublicOfficeHolder = createNode('PublicOfficeHolder', PublicOfficeHolder)\n",
"\n",
" addr_nodes, addr_rlships = parseAddresses(c.getElementsByTagName(\"LobbyistBusinessAddress\"), Lobbyist, properties)\n",
"\n",
" Communication = getNodeProperties(comm, properties['Communication'], ['Communication'])\n",
"\n",
" meeting_flag = False\n",
" if 'Method' in Communication.keys(): \n",
" if \"meet\" in Communication['Method'].lower(): meeting_flag = True \n",
"\n",
" if meeting_flag == False:\n",
" L_POH_rlship = createRelationship(Lobbyist, \"COMMUNICATED_WITH\", PublicOfficeHolder, Communication)\n",
" else:\n",
" L_POH_rlship = createRelationship(Lobbyist, \"MET_WITH\", PublicOfficeHolder, Communication)\n",
" \n",
" L_POHP_rlship = createRelationship(Lobbyist, \"WAS_A\", PublicOfficeHolder_Previous)\n",
" \n",
" nodes.extend([Lobbyist, PublicOfficeHolder_Previous, PublicOfficeHolder] + addr_nodes)\n",
" relationships.extend([L_POHP_rlship, L_POH_rlship] + addr_rlships)\n",
" \n",
" return nodes, relationships\n",
"\n",
" \n",
"def parseBeneficiaries(row, LobbyingActivity, properties):\n",
" beneficiaries = row.getElementsByTagName(\"BENEFICIARY\")\n",
" nodes, relationships = [], []\n",
" \n",
" for b in beneficiaries:\n",
" Beneficiary = getTagValues( b, tag_list['Beneficiary'] )\n",
" Beneficiary = createNode('Beneficiary', Beneficiary)\n",
" \n",
" addr_nodes, addr_rlships = parseAddresses(b.getElementsByTagName(\"BusinessAddress\"), Beneficiary, properties)\n",
" LA_Ben_rlship = createRelationship(LobbyingActivity, \"BENEFITTED\", Beneficiary)\n",
" \n",
" nodes.extend([Beneficiary] + addr_nodes)\n",
" relationships.extend([LA_Ben_rlship] + addr_rlships)\n",
" \n",
" return nodes, relationships\n",
" \n",
" \n",
"def parseMeetings(row, LobbyingActivity, individual_matters, properties):\n",
" meetings = row.getElementsByTagName(\"Meeting\")\n",
" nodes, relationships = [], []\n",
" \n",
" for m in meetings:\n",
" Mtn_nodes, Mtn_relationships = parseIndividualMeeting(m, LobbyingActivity, individual_matters, properties)\n",
" nodes.extend(Mtn_nodes)\n",
" relationships.extend(Mtn_relationships)\n",
" \n",
" return nodes, relationships\n",
"\n",
"\n",
"def parseIndividualMeeting(m, LobbyingActivity, individual_matters, properties):\n",
" Meeting = getTagValues(m, properties['Meeting'])\n",
" pohs = m.getElementsByTagName(\"POH\")\n",
" lobbyists = m.getElementsByTagName(\"Lobbyist\")\n",
" \n",
" Meeting[\"LobbyingActivity\"] = LobbyingActivity['SMNumber']\n",
" nodes, relationships = [], []\n",
" \n",
" for lob in lobbyists:\n",
" Lobbyist = getTagValues(lob, properties['Lobbyist'])\n",
" Lobbyist = createNode('Lobbyist', Lobbyist)\n",
" nodes.append(Lobbyist)\n",
" \n",
" for poh in pohs:\n",
" PublicOfficeHolder = getTagValues(poh, properties['PublicOfficeHolder'])\n",
" PublicOfficeHolder = createNode('PublicOfficeHolder', PublicOfficeHolder)\n",
" nodes.append(PublicOfficeHolder)\n",
" \n",
" L_POH_rlship = createRelationship(Lobbyist, \"MET_WITH\", PublicOfficeHolder, Meeting)\n",
" L_LA_rlship = createRelationship(Lobbyist, \"MET_FOR\", LobbyingActivity, Meeting)\n",
" POH_LA_rlship = createRelationship(PublicOfficeHolder, \"MET_FOR\", LobbyingActivity, Meeting)\n",
" relationships.extend([L_POH_rlship, L_LA_rlship, POH_LA_rlship])\n",
" \n",
" for SubjectMatter in individual_matters:\n",
" L_SME_rlship = createRelationship(Lobbyist, \"INTERESTED_IN\", SubjectMatter)\n",
" POH_SME_rlship = createRelationship(PublicOfficeHolder, \"INTERESTED_IN\", SubjectMatter)\n",
" relationships.extend([L_SME_rlship, POH_SME_rlship])\n",
" \n",
" return nodes, relationships\n",
"\n",
"\n",
"def parseGrassroots(row, LobbyingActivity, SubjectMatter, properties):\n",
" grassroots = row.getElementsByTagName('GRASSROOT')\n",
" nodes, relationships = [], []\n",
" \n",
" for g in grassroots:\n",
" Ind_nodes, Ind_rlships = parseIndividualGrassroots(g, LobbyingActivity, SubjectMatter, properties)\n",
" nodes.extend(Ind_nodes)\n",
" relationships.extend(Ind_rlships)\n",
" \n",
" return nodes, relationships\n",
"\n",
"\n",
"def parseIndividualGrassroots(g, LobbyingActivity, SubjectMatter, properties):\n",
" Grassroot = getTagValues(g, properties['Grassroot'])\n",
" nodes, relationships = [], []\n",
"\n",
" CommunityGroup = getTagValues(g, properties['CommunityGroup'])\n",
" CommunityGroup = createNode('CommunityGroup', CommunityGroup)\n",
"\n",
" CG_LA_rlship = createRelationship(CommunityGroup, 'ADVOCATED_IN', LobbyingActivity, Grassroot)\n",
" CG_SME_rlship = createRelationship(CommunityGroup, 'INTERESTED_IN', SubjectMatter)\n",
" \n",
" nodes.append(CommunityGroup)\n",
" relationships.extend([CG_LA_rlship, CG_SME_rlship])\n",
" \n",
" return nodes, relationships\n",
" \n",
" \n",
"def parseFirms(row, LobbyingActivity, Lobbyist, properties):\n",
" firms = row.getElementsByTagName('Firm')\n",
" nodes, relationships = [], []\n",
"\n",
" for f in firms:\n",
" Ind_nodes, Ind_rlships = parseIndividualFirms(f, LobbyingActivity, Lobbyist, properties)\n",
" nodes.extend(Ind_nodes)\n",
" relationships.extend(Ind_rlships)\n",
" \n",
" return nodes, relationships\n",
"\n",
"\n",
"def parseIndividualFirms(f, LobbyingActivity, Lobbyist, properties):\n",
" Firm = getTagValues(f, properties['Firm'])\n",
" nodes, relationships = [], []\n",
" \n",
" Firm = createNode('Firm', Firm)\n",
" \n",
" addr_nodes, addr_rlships = parseAddresses(f.getElementsByTagName(\"BusinessAddress\"), Firm, properties)\n",
"\n",
" F_LA_rlship = createRelationship(Firm, 'LOBBIED_IN', LobbyingActivity)\n",
" L_F_rlship = createRelationship(Lobbyist, 'WORKED_FOR', Firm)\n",
" \n",
" nodes.extend([Firm] + addr_nodes)\n",
" relationships.extend([F_LA_rlship, L_F_rlship] + addr_rlships)\n",
" \n",
" return nodes, relationships\n",
"\n",
"\n",
"def parseGmtFunding(row, LobbyingActivity, SubjectMatter, properties):\n",
" tags = [tag.nodeName for tag in row.childNodes if tag.nodeName != \"#text\"]\n",
" nodes, relationships = [], []\n",
" \n",
" if \"Gmtfundings\" in tags: gmtfundings_tag = \"Gmtfundings\"; gmt = True\n",
" elif \"GMTFUNDINGS\" in tags: gmtfundings_tag = \"GMTFUNDINGS\"; gmt = True\n",
" else: gmt = False\n",
" \n",
" if gmt:\n",
" gmt_funds = row.getElementsByTagName(gmtfundings_tag[:-1])\n",
" for g in gmt_funds:\n",
" Ind_nodes, Ind_rlships = parseIndividualGmtFunding(g, LobbyingActivity, SubjectMatter, properties)\n",
" nodes.extend(Ind_nodes)\n",
" relationships.extend(Ind_rlships)\n",
" \n",
" return nodes, relationships\n",
"\n",
"\n",
"def parseIndividualGmtFunding(g, LobbyingActivity, SubjectMatter, properties):\n",
" gmt = getTagValues(g, tag_list['GmtFunding'])\n",
" nodes, relationships = [], []\n",
"\n",
" GMTName = getNodeProperties(gmt, properties['GMTName'])\n",
" GMTName = createNode('GMTFunding', GMTName)\n",
"\n",
" GMTProgram = getNodeProperties(gmt, properties['GMTProgram'])\n",
" GMT_LA_rlship = createRelationship(GMTName, \"SPONSORED\", LobbyingActivity, GMTProgram)\n",
" GMT_SME_rlship = createRelationship(GMTName, \"INTERESTED_IN\", SubjectMatter)\n",
" \n",
" nodes.append(GMTName)\n",
" relationships.extend([GMT_LA_rlship, GMT_SME_rlship])\n",
"\n",
" return nodes, relationships\n",
" \n",
"\n",
"def parsePrivateFunding(row, LobbyingActivity, SubjectMatter, properties):\n",
" priv_funds = row.getElementsByTagName('Privatefunding')\n",
" nodes, relationships = [], []\n",
" \n",
" for pf in priv_funds:\n",
" Ind_nodes, Ind_rlships = parseIndividualPrivateFunding(pf, LobbyingActivity, SubjectMatter, properties)\n",
" nodes.extend(Ind_nodes)\n",
" relationships.extend(Ind_rlships)\n",
" \n",
" return nodes, relationships\n",
"\n",
"\n",
"def parseIndividualPrivateFunding(pf, LobbyingActivity, SubjectMatter, properties):\n",
" priv = getTagValues(pf, tag_list['Privatefunding'])\n",
" nodes, relationships = [], []\n",
"\n",
" PrivateFundingContact = getNodeProperties(priv, properties['PrivateFundingContact'])\n",
"\n",
" Agent = getNodeProperties(priv, properties['Agent'])\n",
" Agent = createNode('Agent', Agent)\n",
"\n",
" FundingBody = getNodeProperties(priv, properties['FundingBody'])\n",
" FundingBody = createNode('PrivateFundingBody', FundingBody)\n",
"\n",
" Fund_LA_rlship = createRelationship(FundingBody, \"SPONSORED\", LobbyingActivity, PrivateFundingContact)\n",
" Agent_Fund_rlship = createRelationship(Agent, \"REPRESENTED\", FundingBody)\n",
" Fund_SME_rlship = createRelationship(FundingBody, \"INTERESTED_IN\", SubjectMatter)\n",
" \n",
" nodes.extend([Agent, FundingBody])\n",
" relationships.extend([Fund_LA_rlship, Agent_Fund_rlship, Fund_SME_rlship])\n",
" \n",
" return nodes, relationships\n",
" \n",
"\n",
"def parseAddresses(addresses, Owner, properties):\n",
" nodes, relationships = [], []\n",
" for addr in addresses:\n",
" BusinessAddress = getTagValues(addr, properties['BusinessAddress'])\n",
" BusinessAddress = createNode('BusinessAddress', BusinessAddress)\n",
" \n",
" PhoneNumber = getTagValues(addr, properties['PhoneNumber'])\n",
" PhoneNumber = createNode('PhoneNumber', PhoneNumber)\n",
" \n",
" Own_BA_rlship = createRelationship(Owner, \"HAS\", BusinessAddress)\n",
" PN_BA_rlship = createRelationship(PhoneNumber, \"REACHES\", BusinessAddress)\n",
" \n",
" nodes.extend([BusinessAddress, PhoneNumber])\n",
" relationships.extend([Own_BA_rlship, PN_BA_rlship])\n",
" \n",
" return nodes, relationships\n",
" \n",
"\n",
"def loadDatabase(row, properties, graph):\n",
" LA_nodes, LA_rlship = parseRow(row, properties)\n",
" LA_node = LA_nodes[0]\n",
" matters = LA_nodes[1:]\n",
" \n",
" Reg_nodes, Reg_rlships = parseRegistrant(row, LA_node, properties)\n",
" Lobbyist = Reg_nodes[1]\n",
" \n",
" Comm_nodes, Comm_rlships = parseCommunications(row, LA_node, properties)\n",
" Ben_nodes, Ben_rlships = parseBeneficiaries(row, LA_node, properties)\n",
" Mtn_nodes, Mtn_rlships = parseMeetings(row, LA_node, matters, properties)\n",
" Fi_nodes, Fi_rlships = parseFirms(row, LA_node, Lobbyist, properties)\n",
" \n",
" for SubjectMatter in matters:\n",
" Grass_nodes, Grass_rlships = parseGrassroots(row, LA_node, SubjectMatter, properties)\n",
" Priv_nodes, Priv_rlships = parsePrivateFunding(row, LA_node, SubjectMatter, properties)\n",
" GMT_nodes, GMT_rlships = parseGmtFunding(row, LA_node, SubjectMatter, properties)\n",
" \n",
" nodes = LA_nodes + Reg_nodes + Comm_nodes + Ben_nodes + Mtn_nodes \\\n",
" + Fi_nodes + Grass_nodes + Priv_nodes + GMT_nodes\n",
" \n",
" relationships = LA_rlship + Reg_rlships + Comm_rlships + Ben_rlships \\\n",
" + Mtn_rlships + Fi_rlships + Grass_rlships + Priv_rlships + GMT_rlships\n",
" \n",
" load_nodes = [n for n in nodes if n is not None]\n",
" load_relationships = [r for r in relationships if r is not None]\n",
" \n",
" for n in load_nodes: graph.merge(n)\n",
" for r in load_relationships: graph.create(r)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 4. Load Neo4j database\n",
"This is the list of properties per node, to assign from XML tag values"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Active: 100%|██████████| 2489/2489 [44:03<00:00, 1.06s/it] \n",
"Closed: 100%|██████████| 4803/4803 [1:55:40<00:00, 1.45s/it]\n"
]
}
],
"source": [
"authenticate(neo4j[\"host_port\"], neo4j[\"user_name\"], neo4j[\"password\"])\n",
"graph = Graph()\n",
"\n",
"properties = {\n",
" 'LobbyingActivity': ['SMNumber', 'Status', 'Type', 'Particulars', 'InitialApprovalDate'\n",
" , 'EffectiveDate', 'ProposedStartDate', 'ProposedEndDate']\n",
" , 'SubjectMatter': ['SubjectMatter']\n",
" , 'Registration': ['RegistrationNUmber', 'RegistrationNUmberWithSoNum', 'Status', 'EffectiveDate', 'Type']\n",
" , 'Lobbyist': ['Prefix', 'FirstName', 'MiddleInitials', 'LastName', 'Suffix', 'PositionTitle', 'Number', 'Type']\n",
" , 'PublicOfficeHolder': ['Name', 'Office', 'Position', 'LastDate', 'ProgramName', 'Type', 'Title']\n",
" , 'BusinessAddress': ['AddressLine1', 'AddressLine2', 'City', 'Country', 'PostalCode', 'Province']\n",
" , 'Grassroot': ['EndDate', 'StartDate', 'Target']\n",
" , 'CommunityGroup': ['Community']\n",
" , 'PrivateFundingContact': ['Contact']\n",
" , 'GMTName': ['GMTName']\n",
" , 'GMTProgram': ['Program']\n",
" , 'Agent': ['Agent', 'AgentContact']\n",
" , 'PhoneNumber': ['Phone']\n",
" , 'Beneficiary': ['FiscalEnd', 'FiscalStart', 'Name', 'TradeName', 'Type']\n",
" , 'Firm': ['BusinessType', 'Description', 'FiscalEnd', 'FiscalStart', 'Name', 'TradeName', 'Type']\n",
" , 'Meeting': ['Committee', 'Date', 'Desc']\n",
" , 'Communication': ['Date', 'GroupId', 'Method']\n",
" , 'FundingBody': ['Funding']\n",
"}\n",
"\n",
"start = time.time()\n",
"\n",
"for f in list(extracts):\n",
" if \"closed\" in f: desc = \"Closed\"\n",
" elif \"active\" in f: desc= \"Active\"\n",
" \n",
" for element in tqdm(extracts[f][0], desc=desc): loadDatabase(element, properties, graph)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.4"
},
"widgets": {
"state": {
"3cc57b4dc28b4a5bab71a24716718ee9": {
"views": [
{
"cell_index": 0
}
]
},
"b06fb8b74f6944c28eb03a1de65ea7a0": {
"views": [
{
"cell_index": 6
}
]
}
},
"version": "1.2.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment