Last active
July 18, 2024 15:31
-
-
Save jexp/74bd5a43305550236321eab8f0c723c0 to your computer and use it in GitHub Desktop.
Quick Neo4j Loaders for GraphRAG Parquet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.1.2\u001b[0m\n", | |
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n", | |
"Note: you may need to restart the kernel to use updated packages.\n" | |
] | |
} | |
], | |
"source": [ | |
"%pip install --quiet pandas neo4j-rust-ext" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"from neo4j import GraphDatabase\n", | |
"import time" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"NEO4J_URI=\"bolt://localhost\"\n", | |
"NEO4J_USERNAME=\"neo4j\"\n", | |
"NEO4J_PASSWORD=\"password\"\n", | |
"NEO4J_DATABASE=\"graphrag\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def batched_import(statement, df, batch_size=1000):\n", | |
"    \"\"\"Run a Cypher fragment against Neo4j once per batch of DataFrame rows.\n", | |
"\n", | |
"    Each batch is sent as the `rows` query parameter and unwound as `value`,\n", | |
"    so `statement` must be a Cypher fragment that refers to `value`.\n", | |
"\n", | |
"    statement  -- Cypher appended after 'UNWIND $rows AS value '\n", | |
"    df         -- pandas DataFrame; each row becomes one `value` map\n", | |
"    batch_size -- maximum number of rows sent per query\n", | |
"\n", | |
"    Returns the total number of rows in `df`.\n", | |
"    \"\"\"\n", | |
"    total = len(df)\n", | |
"    start_s = time.time()\n", | |
"    for start in range(0, total, batch_size):\n", | |
"        # iloc slicing clamps at the end of the frame, so the last batch may be shorter\n", | |
"        batch = df.iloc[start:start + batch_size]\n", | |
"        result = driver.execute_query(\"UNWIND $rows AS value \" + statement,\n", | |
"                                      rows=batch.to_dict('records'),\n", | |
"                                      database_=NEO4J_DATABASE)\n", | |
"        # printed per batch, inside the loop, so an empty DataFrame never touches `result`\n", | |
"        print(result.summary.counters)\n", | |
"    print(f'{total} rows in { time.time() - start_s} s.')\n", | |
"    return total" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique\n", | |
"\n", | |
"create constraint document_id if not exists for (d:__Document__) require d.id is unique\n", | |
"\n", | |
"create constraint community_id if not exists for (c:__Community__) require c.community is unique\n", | |
"\n", | |
"create constraint entity_id if not exists for (e:__Entity__) require e.id is unique\n", | |
"\n", | |
"create constraint entity_title if not exists for (e:__Entity__) require e.title is unique\n", | |
"\n", | |
"create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique\n" | |
] | |
} | |
], | |
"source": [ | |
"# create constraints\n", | |
"# Every constraint needs its own name: with `if not exists`, a statement that\n", | |
"# reuses an existing constraint name is skipped, so the community constraint is\n", | |
"# named community_id instead of sharing entity_id with the __Entity__ one.\n", | |
"# If an earlier run already created entity_id on __Community__, drop it first.\n", | |
"\n", | |
"statements = \"\"\"\n", | |
"create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique;\n", | |
"create constraint document_id if not exists for (d:__Document__) require d.id is unique;\n", | |
"create constraint community_id if not exists for (c:__Community__) require c.community is unique;\n", | |
"create constraint entity_id if not exists for (e:__Entity__) require e.id is unique;\n", | |
"create constraint entity_title if not exists for (e:__Entity__) require e.title is unique;\n", | |
"create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique;\n", | |
"\"\"\".split(\";\")\n", | |
"\n", | |
"# run each non-empty statement on its own; the split leaves a blank trailing entry\n", | |
"for s in statements:\n", | |
"    if len((s or \"\").strip()) > 0:\n", | |
"        print(s)\n", | |
"        driver.execute_query(query_=s,database_=NEO4J_DATABASE)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"GRAPHRAG_FOLDER=\"/Users/mh/d/llm/graphrag/ragtest/output/20240703-144633/artifacts\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>level</th>\n", | |
" <th>clustered_graph</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0</td>\n", | |
" <td><graphml xmlns=\"http://graphml.graphdrawing.or...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td><graphml xmlns=\"http://graphml.graphdrawing.or...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2</td>\n", | |
" <td><graphml xmlns=\"http://graphml.graphdrawing.or...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" level clustered_graph\n", | |
"0 0 <graphml xmlns=\"http://graphml.graphdrawing.or...\n", | |
"1 1 <graphml xmlns=\"http://graphml.graphdrawing.or...\n", | |
"2 2 <graphml xmlns=\"http://graphml.graphdrawing.or..." | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_base_entity_graph.parquet')\n", | |
"df.head()\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'_contains_updates': True, 'properties_set': 1}\n", | |
"1 rows in 0.0053479671478271484 s.\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>title</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>c305886e4aa2f6efcf64b57762777055</td>\n", | |
" <td>book.txt</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id title\n", | |
"0 c305886e4aa2f6efcf64b57762777055 book.txt" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# import documents\n", | |
"statement = \"\"\"\n", | |
"MERGE (d:__Document__ {id:value.id})\n", | |
"SET d += value {.title}\n", | |
"// , text_unit_ids:value.text_unit_ids, raw_content:substring(value.raw_content,0,1000)};\n", | |
"\"\"\"\n", | |
"df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_documents.parquet', columns=[\"id\", \"title\"])\n", | |
"\n", | |
"batched_import(statement, df)\n", | |
"\n", | |
"df.head()\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'_contains_updates': True, 'properties_set': 462}\n", | |
"231 rows in 0.02997303009033203 s.\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>chunk_id</th>\n", | |
" <th>chunk</th>\n", | |
" <th>n_tokens</th>\n", | |
" <th>document_ids</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>680dd6d2a970a49082fa4f34bf63a34e</td>\n", | |
" <td>The Project Gutenberg eBook of A Christmas Ca...</td>\n", | |
" <td>300</td>\n", | |
" <td>[c305886e4aa2f6efcf64b57762777055]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>95f1f8f5bdbf0bee3a2c6f2f4a4907f6</td>\n", | |
" <td>THE PROJECT GUTENBERG EBOOK A CHRISTMAS CAROL...</td>\n", | |
" <td>300</td>\n", | |
" <td>[c305886e4aa2f6efcf64b57762777055]</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" chunk_id \\\n", | |
"0 680dd6d2a970a49082fa4f34bf63a34e \n", | |
"1 95f1f8f5bdbf0bee3a2c6f2f4a4907f6 \n", | |
"\n", | |
" chunk n_tokens \\\n", | |
"0 The Project Gutenberg eBook of A Christmas Ca... 300 \n", | |
"1 THE PROJECT GUTENBERG EBOOK A CHRISTMAS CAROL... 300 \n", | |
"\n", | |
" document_ids \n", | |
"0 [c305886e4aa2f6efcf64b57762777055] \n", | |
"1 [c305886e4aa2f6efcf64b57762777055] " | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# import text units\n", | |
"statement = \"\"\"\n", | |
"MERGE (c:__Chunk__ {id:value.chunk_id})\n", | |
"SET c += value {.chunk, .n_tokens}\n", | |
"WITH *\n", | |
"UNWIND value.document_ids as doc_id\n", | |
"MATCH (d:__Document__ {id:doc_id})\n", | |
"MERGE (d)<-[:PART_OF]-(c)\n", | |
"RETURN count(distinct c) as chunksCreated\n", | |
"\"\"\"\n", | |
"\n", | |
"df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_base_text_units.parquet', \n", | |
" columns=[\"chunk_id\",\"chunk\",\"n_tokens\",\"document_ids\"])\n", | |
"\n", | |
"batched_import(statement, df)\n", | |
"\n", | |
"df.head(2)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'_contains_updates': True, 'properties_set': 4155}\n", | |
"831 rows in 0.13263273239135742 s.\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>level</th>\n", | |
" <th>title</th>\n", | |
" <th>type</th>\n", | |
" <th>description</th>\n", | |
" <th>source_id</th>\n", | |
" <th>human_readable_id</th>\n", | |
" <th>id</th>\n", | |
" <th>top_level_node_id</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0</td>\n", | |
" <td>\"PROJECT GUTENBERG\"</td>\n", | |
" <td>\"ORGANIZATION\"</td>\n", | |
" <td>Project Gutenberg is a pioneering organization...</td>\n", | |
" <td>01e84646075b255eab0a34d872336a89,10bab8e9773ee...</td>\n", | |
" <td>0</td>\n", | |
" <td>b45241d70f0e43fca764df95b2b81f77</td>\n", | |
" <td>b45241d70f0e43fca764df95b2b81f77</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>0</td>\n", | |
" <td>\"UNITED STATES\"</td>\n", | |
" <td>\"GEO\"</td>\n", | |
" <td>The United States is prominently recognized fo...</td>\n", | |
" <td>01e84646075b255eab0a34d872336a89,28f242c451594...</td>\n", | |
" <td>1</td>\n", | |
" <td>4119fd06010c494caa07f439b333f4c5</td>\n", | |
" <td>4119fd06010c494caa07f439b333f4c5</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" level title type \\\n", | |
"0 0 \"PROJECT GUTENBERG\" \"ORGANIZATION\" \n", | |
"1 0 \"UNITED STATES\" \"GEO\" \n", | |
"\n", | |
" description \\\n", | |
"0 Project Gutenberg is a pioneering organization... \n", | |
"1 The United States is prominently recognized fo... \n", | |
"\n", | |
" source_id human_readable_id \\\n", | |
"0 01e84646075b255eab0a34d872336a89,10bab8e9773ee... 0 \n", | |
"1 01e84646075b255eab0a34d872336a89,28f242c451594... 1 \n", | |
"\n", | |
" id top_level_node_id \n", | |
"0 b45241d70f0e43fca764df95b2b81f77 b45241d70f0e43fca764df95b2b81f77 \n", | |
"1 4119fd06010c494caa07f439b333f4c5 4119fd06010c494caa07f439b333f4c5 " | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# import nodes\n", | |
"\n", | |
"statement = \"\"\"\n", | |
"MERGE (n:__Entity__ {id:value.id})\n", | |
"SET n += value {.level, .top_level_node_id, .human_readable_id, .description, \n", | |
" title:replace(value.title,'\"','')}\n", | |
"WITH n, value\n", | |
"CALL apoc.create.addLabels(n, case when value.type is null then [] else [apoc.text.upperCamelCase(replace(value.type,'\"',''))] end) yield node\n", | |
"UNWIND split(value.source_id,\",\") as source_id\n", | |
"MATCH (c:__Chunk__ {id:source_id})\n", | |
"RETURN count(distinct n) as createdNodes\n", | |
"\"\"\"\n", | |
"\n", | |
"df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_nodes.parquet',\n", | |
" columns=[\"level\",\"title\",\"type\",\"description\",\"source_id\",\"human_readable_id\",\"id\",\"top_level_node_id\"])\n", | |
"\n", | |
"batched_import(statement, df)\n", | |
"df.head(2)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'_contains_updates': True, 'properties_set': 2052}\n", | |
"342 rows in 0.013482093811035156 s.\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>source</th>\n", | |
" <th>target</th>\n", | |
" <th>id</th>\n", | |
" <th>rank</th>\n", | |
" <th>weight</th>\n", | |
" <th>human_readable_id</th>\n", | |
" <th>description</th>\n", | |
" <th>text_unit_ids</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>\"PROJECT GUTENBERG\"</td>\n", | |
" <td>\"A CHRISTMAS CAROL\"</td>\n", | |
" <td>b84d71ed9c3b45819eb3205fd28e13a0</td>\n", | |
" <td>20</td>\n", | |
" <td>1.0</td>\n", | |
" <td>0</td>\n", | |
" <td>\"Project Gutenberg is responsible for releasin...</td>\n", | |
" <td>[680dd6d2a970a49082fa4f34bf63a34e]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>\"PROJECT GUTENBERG\"</td>\n", | |
" <td>\"SUZANNE SHELL\"</td>\n", | |
" <td>b0b464bc92a541e48547fe9738378dab</td>\n", | |
" <td>15</td>\n", | |
" <td>1.0</td>\n", | |
" <td>1</td>\n", | |
" <td>\"Suzanne Shell produced the eBook version of '...</td>\n", | |
" <td>[680dd6d2a970a49082fa4f34bf63a34e]</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" source target id \\\n", | |
"0 \"PROJECT GUTENBERG\" \"A CHRISTMAS CAROL\" b84d71ed9c3b45819eb3205fd28e13a0 \n", | |
"1 \"PROJECT GUTENBERG\" \"SUZANNE SHELL\" b0b464bc92a541e48547fe9738378dab \n", | |
"\n", | |
" rank weight human_readable_id \\\n", | |
"0 20 1.0 0 \n", | |
"1 15 1.0 1 \n", | |
"\n", | |
" description \\\n", | |
"0 \"Project Gutenberg is responsible for releasin... \n", | |
"1 \"Suzanne Shell produced the eBook version of '... \n", | |
"\n", | |
" text_unit_ids \n", | |
"0 [680dd6d2a970a49082fa4f34bf63a34e] \n", | |
"1 [680dd6d2a970a49082fa4f34bf63a34e] " | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# import relationships\n", | |
"\n", | |
"statement = \"\"\"\n", | |
" MATCH (source:__Entity__ {title:replace(value.source,'\"','')})\n", | |
" MATCH (target:__Entity__ {title:replace(value.target,'\"','')})\n", | |
" // todo rel-type from source-target labels?\n", | |
" MERGE (source)-[rel:RELATED]->(target)\n", | |
" SET rel += value {.id, .rank, .weight, .human_readable_id, .description, text_unit_ids:value.text_unit_ids}\n", | |
" RETURN count(*) as createdRels\n", | |
"\"\"\"\n", | |
"\n", | |
"df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_relationships.parquet',\n", | |
" columns=[\"source\",\"target\",\"id\",\"rank\",\"weight\",\"human_readable_id\",\"description\",\"text_unit_ids\"])\n", | |
"\n", | |
"batched_import(statement, df)\n", | |
"\n", | |
"df.head(2)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'_contains_updates': True, 'properties_set': 94}\n", | |
"47 rows in 0.021432161331176758 s.\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>level</th>\n", | |
" <th>title</th>\n", | |
" <th>text_unit_ids</th>\n", | |
" <th>relationship_ids</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" <td>Community 2</td>\n", | |
" <td>[0546d296a4d3bb0486bd0c94c01dc9be,0d6bc6e701a0...</td>\n", | |
" <td>[ba481175ee1d4329bf07757a30abd3a1, 8d8da35190b...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>4</td>\n", | |
" <td>0</td>\n", | |
" <td>Community 4</td>\n", | |
" <td>[054bdcba0a3690b43609d9226a47f84d,3a450ed2b7fb...</td>\n", | |
" <td>[929f30875e1744b49e7b416eaf5a790c, 4920fda0318...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id level title text_unit_ids \\\n", | |
"0 2 0 Community 2 [0546d296a4d3bb0486bd0c94c01dc9be,0d6bc6e701a0... \n", | |
"1 4 0 Community 4 [054bdcba0a3690b43609d9226a47f84d,3a450ed2b7fb... \n", | |
"\n", | |
" relationship_ids \n", | |
"0 [ba481175ee1d4329bf07757a30abd3a1, 8d8da35190b... \n", | |
"1 [929f30875e1744b49e7b416eaf5a790c, 4920fda0318... " | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# import communities\n", | |
"\n", | |
"statement = \"\"\"\n", | |
"MERGE (c:__Community__ {community:value.id})\n", | |
"SET c += value {.level, .title}\n", | |
"/*\n", | |
"UNWIND value.text_unit_ids as text_unit_id\n", | |
"MATCH (t:__Chunk__ {id:text_unit_id})\n", | |
"MERGE (c)-[:HAS_CHUNK]->(t)\n", | |
"WITH distinct c, value\n", | |
"*/\n", | |
"WITH *\n", | |
"UNWIND value.relationship_ids as rel_id\n", | |
"MATCH (start:__Entity__)-[:RELATED {id:rel_id}]->(end:__Entity__)\n", | |
"MERGE (start)-[:IN_COMMUNITY]->(c)\n", | |
"MERGE (end)-[:IN_COMMUNITY]->(c)\n", | |
"RETURn count(distinct c) as createdCommunities\n", | |
"\"\"\"\n", | |
"\n", | |
"df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_communities.parquet', \n", | |
" columns=[\"id\",\"level\",\"title\",\"text_unit_ids\",\"relationship_ids\"])\n", | |
"batched_import(statement, df)\n", | |
"\n", | |
"df.head(2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'_contains_updates': True, 'properties_set': 329}\n", | |
"47 rows in 0.022797107696533203 s.\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>community</th>\n", | |
" <th>level</th>\n", | |
" <th>title</th>\n", | |
" <th>summary</th>\n", | |
" <th>findings</th>\n", | |
" <th>rank</th>\n", | |
" <th>rank_explanation</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>e7822326-4da8-4954-afa9-be7f4f5791a5</td>\n", | |
" <td>42</td>\n", | |
" <td>2</td>\n", | |
" <td>Scrooge's Supernatural Encounters: Marley's Gh...</td>\n", | |
" <td>This report delves into the pivotal supernatur...</td>\n", | |
" <td>[{'explanation': 'Marley's Ghost plays a cruci...</td>\n", | |
" <td>8.0</td>\n", | |
" <td>The impact severity rating is high due to the ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>8a5afac1-99ef-4f01-a1b1-f044ce392ff9</td>\n", | |
" <td>43</td>\n", | |
" <td>2</td>\n", | |
" <td>The Ghost's Influence on Scrooge's Transformation</td>\n", | |
" <td>This report delves into the pivotal role of 'T...</td>\n", | |
" <td>[{'explanation': 'The Ghost, identified at tim...</td>\n", | |
" <td>8.5</td>\n", | |
" <td>The impact severity rating is high due to the ...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id community level \\\n", | |
"0 e7822326-4da8-4954-afa9-be7f4f5791a5 42 2 \n", | |
"1 8a5afac1-99ef-4f01-a1b1-f044ce392ff9 43 2 \n", | |
"\n", | |
" title \\\n", | |
"0 Scrooge's Supernatural Encounters: Marley's Gh... \n", | |
"1 The Ghost's Influence on Scrooge's Transformation \n", | |
"\n", | |
" summary \\\n", | |
"0 This report delves into the pivotal supernatur... \n", | |
"1 This report delves into the pivotal role of 'T... \n", | |
"\n", | |
" findings rank \\\n", | |
"0 [{'explanation': 'Marley's Ghost plays a cruci... 8.0 \n", | |
"1 [{'explanation': 'The Ghost, identified at tim... 8.5 \n", | |
"\n", | |
" rank_explanation \n", | |
"0 The impact severity rating is high due to the ... \n", | |
"1 The impact severity rating is high due to the ... " | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# import communities\n", | |
"\n", | |
"statement = \"\"\"\n", | |
"MERGE (c:__Community__ {community:value.community})\n", | |
"// we can also extract findings as separate nodes\n", | |
"WITH c, value, [f in value.findings | apoc.text.join([k in keys(f) | k+\": \"+f[k]],',\\n')] as findings\n", | |
"SET c += value {.level, .title, .summary, findings, .rank, .rank_explanation, .id}\n", | |
"RETURn count(distinct c) as createdCommunities\n", | |
"\"\"\"\n", | |
"\n", | |
"df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_community_reports.parquet',\n", | |
" columns=[\"id\",\"community\",\"level\",\"title\",\"summary\", \"findings\",\"rank\",\"rank_explanation\"])\n", | |
"\n", | |
"batched_import(statement, df)\n", | |
"df.head(2)\n" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
cp ragtest/output/*/artifacts/*.parquet $NEO4J_HOME/import | |
echo 'apoc.import.file.enabled=true' >> $NEO4J_HOME/conf/apoc.conf | |
cd $NEO4J_HOME/plugins | |
cp ../labs/*apoc*.jar . | |
curl -OL https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases/download/5.21.0/apoc-5.21.0-extended.jar | |
curl -OL https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases/download/5.21.0/apoc-hadoop-dependencies-5.21.0-all.jar | |
cd .. | |
bin/neo4j console | |
*/ | |
// TODO - Load documents, text-chunks, claims, communities and connect them | |
// Preview of the entity-node parquet file: returns the first 5 rows with | |
// double quotes removed from type and title, and the comma-separated | |
// source_id split into a list. Read-only; nothing is written to the graph. | |
call apoc.load.parquet("create_final_nodes.parquet") yield value | |
// return keys(value), value limit 5 | |
return | |
replace(value.type,'"','') as type, value.id, value.level, | |
replace(value.title,'"','') as title, | |
value.top_level_node_id, | |
value.human_readable_id as nr, | |
split(value.source_id,",") as sources, | |
value.description | |
LIMIT 5; | |
/* | |
{ | |
"source_id": "01e84646075b255eab0a34d872336a89,10bab8e9773ee6dfbb465bfa45794c34,28f242c45159426edb8589f5ca3c10e6,2f918cd94d1825eb5cbdc2a9d3ce094e,34c3d4a02c4a7e3b8ec57f41075aeeea,3fedcfeffb43c689a33ffa06897ad045,50160bdfa976f5b946c699722c81b412,535f6bed392a62760401b1d4f2aa5e2f,608db27bee139aaab8ded9989997d00a,680dd6d2a970a49082fa4f34bf63a34e,6968390fb201fda828835d2d1fd4e953,6ea022365de9ab0d226801de90139c8a,879b3fc36c9a2427cdb8d5d41b60e11b,972bb34ddd371530f06d006480526d3e,9e59af410db84b25757e3bf90e036f39,da3ca9f93aac15c67f6acf3cca2fc229,e8cf7d2eec5c3bcbeefc60d9f15941ed,f96b5ddf7fae853edbc4d916f66c623f", | |
"type": ""ORGANIZATION"", | |
"size": 13, | |
"id": "b45241d70f0e43fca764df95b2b81f77", | |
"title": ""PROJECT GUTENBERG"", | |
"level": 0, | |
"degree": 13, | |
"description": "Project Gutenberg is a pioneering organization dedicated to the free distribution of electronic works, with a focus on those not protected by U.S. copyright law. It was initiated by Professor Michael S. Hart and is supported by a network of volunteers and the Gutenberg Literary Archive Foundation. The organization's mission is to increase the number of public domain and licensed works freely distributed in machine-readable form, thereby promoting free access to literature and electronic works. Project Gutenberg owns a compilation copyright in its collection of electronic works, ensuring their accessibility while requiring compliance with specific copyright and distribution guidelines outlined in their license agreement. | |
For over forty years, Project Gutenberg has been creating and distributing eBooks, offering a vast array of works in various formats, including 'Plain Vanilla ASCII'. Its collection includes notable titles like 'A Christmas Carol', available for free under a license that allows copying, giving away, and re-using with almost no restrictions. The organization operates globally, emphasizing copyright status and adherence to its license, which includes a system of royalty payments and refunds under certain conditions. Project Gutenberg's main search facility is accessible through its website, www.gutenberg.org, facilitating easy access to its extensive library. | |
Project Gutenberg is committed to keeping its collection freely available for future generations, supported by donations and the efforts of its volunteer network. It promotes the creation, modification, and redistribution of eBooks, especially focusing on works that allow for free copying and distribution in the United States under specific terms. The organization is described as being focused on promoting free access to electronic works, ensuring that literature remains accessible to the public while keeping its name associated with shared works in compliance with its agreement.", | |
"top_level_node_id": "b45241d70f0e43fca764df95b2b81f77", | |
"human_readable_id": 0, | |
"__index_level_0__": 0, | |
"y": 0, | |
"x": 0 | |
} | |
*/ | |
// Uniqueness constraints for the keys the loaders below MERGE and MATCH on. | |
// Every constraint needs its own name: with "if not exists", a statement that | |
// reuses an existing constraint name is skipped, so the community constraint | |
// is named community_id instead of sharing entity_id with the __Entity__ one. | |
// If an earlier run already created entity_id on __Community__, drop it first. | |
create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique; | |
create constraint document_id if not exists for (d:__Document__) require d.id is unique; | |
create constraint community_id if not exists for (c:__Community__) require c.community is unique; | |
create constraint entity_id if not exists for (e:__Entity__) require e.id is unique; | |
create constraint entity_title if not exists for (e:__Entity__) require e.title is unique; | |
create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique; | |
// Inspect the documents parquet file: column names and content of the first row. | |
call apoc.load.parquet("create_final_documents.parquet") yield value | |
return keys(value),value limit 1; | |
// Columns observed: | |
// ["__index_level_0__", "raw_content", "id", "title", "text_unit_ids"] | |
// Load documents: one __Document__ node per row, keyed by id. | |
// Stores title and text_unit_ids, and only the first 1000 characters of raw_content. | |
call apoc.load.parquet("create_final_documents.parquet") yield value | |
MERGE (d:__Document__ {id:value.id}) | |
SET d += value {.title, text_unit_ids:value.text_unit_ids, raw_content:substring(value.raw_content,0,1000)}; | |
// Inspect the text-unit parquet file: column names and content of the first row. | |
call apoc.load.parquet("create_base_text_units.parquet") yield value | |
return keys(value),value limit 1; | |
// Columns observed: | |
// ["document_ids", "chunk", "n_tokens", "id", "chunk_id"] | |
:auto | |
// Load text chunks, committing every 1000 input rows. | |
// Each row becomes a __Chunk__ node keyed by chunk_id, carrying chunk and n_tokens, | |
// and gets a PART_OF relationship to every already-loaded __Document__ listed in | |
// document_ids (so documents must be loaded first). | |
call apoc.load.parquet("create_base_text_units.parquet") yield value | |
CALL { with value | |
MERGE (c:__Chunk__ {id:value.chunk_id}) | |
SET c += value {.chunk, .n_tokens} | |
WITH * | |
UNWIND value.document_ids as doc_id | |
MATCH (d:__Document__ {id:doc_id}) | |
MERGE (d)<-[:PART_OF]-(c) | |
RETURN count(distinct c) as chunksCreated | |
} in transactions of 1000 rows | |
RETURN sum(chunksCreated) as chunksCreated; | |
:auto | |
// Load entities, committing every 25000 input rows. | |
// Each row becomes an __Entity__ node keyed by id; title is stored with its | |
// double quotes removed. When type is not null, an extra label is added whose | |
// name is the quote-stripped type passed through apoc.text.upperCamelCase. | |
// The entity is then linked via HAS_ENTITY from every already-loaded __Chunk__ | |
// whose id appears in the comma-separated source_id column. | |
call apoc.load.parquet("create_final_nodes.parquet") yield value | |
call { with value | |
MERGE (n:__Entity__ {id:value.id}) | |
SET n += value {.level, .top_level_node_id, .human_readable_id, .description, | |
title:replace(value.title,'"','')} | |
WITH n, value | |
CALL apoc.create.addLabels(n, case when value.type is null then [] else [apoc.text.upperCamelCase(replace(value.type,'"',''))] end) yield node | |
UNWIND split(value.source_id,",") as source_id | |
MATCH (c:__Chunk__ {id:source_id}) | |
MERGE (c)-[:HAS_ENTITY]->(n) | |
RETURN count(distinct n) as created | |
} in transactions of 25000 rows | |
return sum(created) as createdNodes; | |
// Inspect the relationships parquet file: column names and content of the first 5 rows. | |
call apoc.load.parquet("create_final_relationships.parquet") yield value | |
return keys(value), value limit 5; | |
:auto | |
// Load entity-to-entity relationships, committing every 25000 input rows. | |
// Source and target are looked up by quote-stripped title, so entities must be | |
// loaded first; rows whose source or target is not found create nothing. | |
// NOTE(review): the MERGE keys only on the (source)-[:RELATED]->(target) pair, not | |
// on id, so several rows for the same pair would share one relationship and the | |
// last row's properties would win - confirm pairs are unique in the input. | |
call apoc.load.parquet("create_final_relationships.parquet") yield value | |
call { with value | |
MATCH (source:__Entity__ {title:replace(value.source,'"','')}) | |
MATCH (target:__Entity__ {title:replace(value.target,'"','')}) | |
// todo rel-type from source-target labels? | |
MERGE (source)-[rel:RELATED]->(target) | |
SET rel += value {.id, .rank, .weight, .human_readable_id, .description, text_unit_ids:value.text_unit_ids} | |
RETURN count(*) as created | |
} in transactions of 25000 rows | |
return sum(created) as createdRels; | |
/* | |
{ | |
"id": "b84d71ed9c3b45819eb3205fd28e13a0", | |
"target_degree": 7, | |
"rank": 20, | |
"source_degree": 13, | |
"weight": 1.0, | |
"source": ""PROJECT GUTENBERG"", | |
"description": ""Project Gutenberg is responsible for releasing 'A Christmas Carol' as an eBook."", | |
"target": ""A CHRISTMAS CAROL"", | |
"human_readable_id": "0", | |
"text_unit_ids": [ | |
"680dd6d2a970a49082fa4f34bf63a34e" | |
] | |
} | |
*/ | |
:auto | |
// Load communities, committing every 1000 input rows. | |
// Each row becomes a __Community__ node keyed by community (taken from the row's id) | |
// with level and title. Both endpoints of every RELATED relationship whose id is | |
// listed in relationship_ids are then attached to the community via IN_COMMUNITY, | |
// so entities and relationships must be loaded first. | |
call apoc.load.parquet("create_final_communities.parquet") yield value | |
// return keys(value), value limit 5; | |
CALL { with value | |
MERGE (c:__Community__ {community:value.id}) | |
SET c += value {.level, .title} | |
/* | |
UNWIND value.text_unit_ids as text_unit_id | |
MATCH (t:__Chunk__ {id:text_unit_id}) | |
MERGE (c)-[:HAS_CHUNK]->(t) | |
WITH distinct c, value | |
*/ | |
WITH * | |
UNWIND value.relationship_ids as rel_id | |
MATCH (start:__Entity__)-[:RELATED {id:rel_id}]->(end:__Entity__) | |
MERGE (start)-[:IN_COMMUNITY]->(c) | |
MERGE (end)-[:IN_COMMUNITY]->(c) | |
RETURN count(distinct c) as created | |
} in transactions of 1000 rows | |
RETURN sum(created) as createdCommunities; | |
// Columns observed: | |
// ["level", "text_unit_ids", "relationship_ids", "id", "title", "raw_community"] | |
:auto | |
// Load community reports, committing every 1000 input rows. | |
// Each row is merged onto the __Community__ node keyed by community and adds | |
// level, title, summary, findings, rank, rank_explanation and the report id. | |
// NOTE(review): findings is stored as loaded here, whereas the notebook loader | |
// flattens each finding to a "key: value" string first - confirm that the value | |
// apoc.load.parquet yields for findings can be stored as a property. | |
call apoc.load.parquet("create_final_community_reports.parquet") yield value | |
CALL { with value | |
MERGE (c:__Community__ {community:value.community}) | |
SET c += value {.level, .title, .summary, .findings, .rank, .rank_explanation, .id} | |
RETURN count(distinct c) as created | |
} in transactions of 1000 rows | |
RETURN sum(created) as createdReports; | |
// Columns observed: | |
// ["summary", "full_content_json", "level", "findings", "full_content", "rank", "id", "rank_explanation", "title", "community"] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Load of the default example (Charles Dickens)