jexp/graphrag-load-neo4j-1.ipynb

## graphrag-load-neo4j-1.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              graphrag-load-neo4j-1.ipynb
            
          
      Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## graphrag-load-neo4j-2.cypher
// Setup (run in a shell before executing the Cypher below).
// Line comments are used on purpose: the glob "output/*/artifacts" contains "*/",
// which would terminate a block comment early.
//
// cp ragtest/output/*/artifacts/*.parquet $NEO4J_HOME/import
//
// echo 'apoc.import.file.enabled=true' >> $NEO4J_HOME/conf/apoc.conf
//
// cd $NEO4J_HOME/plugins
// cp ../labs/*apoc*.jar .
// curl -OL https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases/download/5.21.0/apoc-5.21.0-extended.jar
// curl -OL https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases/download/5.21.0/apoc-hadoop-dependencies-5.21.0-all.jar
// cd ..
// bin/neo4j console

// TODO - Load documents, text-chunks, claims, communities and connect them

// Preview five rows of the nodes file, stripping the double quotes embedded in type/title
// and splitting the comma-separated source_id into a list.
CALL apoc.load.parquet("create_final_nodes.parquet") YIELD value
// return keys(value), value limit 5
RETURN
  replace(value.type, '"', '') AS type,
  value.id,
  value.level,
  replace(value.title, '"', '') AS title,
  value.top_level_node_id,
  value.human_readable_id AS nr,
  split(value.source_id, ",") AS sources,
  value.description
LIMIT 5;

/*
{
  "source_id": "01e84646075b255eab0a34d872336a89,10bab8e9773ee6dfbb465bfa45794c34,28f242c45159426edb8589f5ca3c10e6,2f918cd94d1825eb5cbdc2a9d3ce094e,34c3d4a02c4a7e3b8ec57f41075aeeea,3fedcfeffb43c689a33ffa06897ad045,50160bdfa976f5b946c699722c81b412,535f6bed392a62760401b1d4f2aa5e2f,608db27bee139aaab8ded9989997d00a,680dd6d2a970a49082fa4f34bf63a34e,6968390fb201fda828835d2d1fd4e953,6ea022365de9ab0d226801de90139c8a,879b3fc36c9a2427cdb8d5d41b60e11b,972bb34ddd371530f06d006480526d3e,9e59af410db84b25757e3bf90e036f39,da3ca9f93aac15c67f6acf3cca2fc229,e8cf7d2eec5c3bcbeefc60d9f15941ed,f96b5ddf7fae853edbc4d916f66c623f",
  "type": ""ORGANIZATION"",
  "size": 13,
  "id": "b45241d70f0e43fca764df95b2b81f77",
  "title": ""PROJECT GUTENBERG"",
  "level": 0,
  "degree": 13,
  "description": "Project Gutenberg is a pioneering organization dedicated to the free distribution of electronic works, with a focus on those not protected by U.S. copyright law. It was initiated by Professor Michael S. Hart and is supported by a network of volunteers and the Gutenberg Literary Archive Foundation. The organization's mission is to increase the number of public domain and licensed works freely distributed in machine-readable form, thereby promoting free access to literature and electronic works. Project Gutenberg owns a compilation copyright in its collection of electronic works, ensuring their accessibility while requiring compliance with specific copyright and distribution guidelines outlined in their license agreement.

For over forty years, Project Gutenberg has been creating and distributing eBooks, offering a vast array of works in various formats, including 'Plain Vanilla ASCII'. Its collection includes notable titles like 'A Christmas Carol', available for free under a license that allows copying, giving away, and re-using with almost no restrictions. The organization operates globally, emphasizing copyright status and adherence to its license, which includes a system of royalty payments and refunds under certain conditions. Project Gutenberg's main search facility is accessible through its website, www.gutenberg.org, facilitating easy access to its extensive library.

Project Gutenberg is committed to keeping its collection freely available for future generations, supported by donations and the efforts of its volunteer network. It promotes the creation, modification, and redistribution of eBooks, especially focusing on works that allow for free copying and distribution in the United States under specific terms. The organization is described as being focused on promoting free access to electronic works, ensuring that literature remains accessible to the public while keeping its name associated with shared works in compliance with its agreement.",
  "top_level_node_id": "b45241d70f0e43fca764df95b2b81f77",
  "human_readable_id": 0,
  "__index_level_0__": 0,
  "y": 0,
  "x": 0
}
*/
// Uniqueness constraints backing the MERGE keys used by the load statements in this script.
// Every constraint needs its own name: with IF NOT EXISTS, a CREATE whose name is already
// taken is skipped without error. The __Community__ constraint was previously also named
// entity_id, which caused the __Entity__.id constraint below to be silently skipped.
create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique;
create constraint document_id if not exists for (d:__Document__) require d.id is unique;
create constraint community_id if not exists for (c:__Community__) require c.community is unique;

create constraint entity_id if not exists for (e:__Entity__) require e.id is unique;
create constraint entity_title if not exists for (e:__Entity__) require e.title is unique;
create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique;


// Preview one document row to see which columns are available.
CALL apoc.load.parquet("create_final_documents.parquet") YIELD value
RETURN keys(value), value LIMIT 1;

// ["__index_level_0__", "raw_content", "id", "title", "text_unit_ids"]

// Upsert one __Document__ per row, keyed by id; only the first 1000 characters
// of raw_content are stored on the node.
CALL apoc.load.parquet("create_final_documents.parquet") YIELD value AS doc
MERGE (d:__Document__ {id: doc.id})
SET d += doc {
  .title,
  text_unit_ids: doc.text_unit_ids,
  raw_content: substring(doc.raw_content, 0, 1000)
};

// Preview one text-unit row to see which columns are available.
CALL apoc.load.parquet("create_base_text_units.parquet") YIELD value
RETURN keys(value), value LIMIT 1;
// ["document_ids", "chunk", "n_tokens", "id", "chunk_id"]

:auto
CALL apoc.load.parquet("create_base_text_units.parquet") YIELD value AS unit
// Upsert a __Chunk__ per row (keyed by chunk_id) and attach it to every already-loaded
// __Document__ listed in document_ids; batched in transactions of 1000 rows.
CALL {
  WITH unit
  MERGE (chunk:__Chunk__ {id: unit.chunk_id})
  SET chunk += unit {.chunk, .n_tokens}
  WITH *
  UNWIND unit.document_ids AS doc_id
  MATCH (doc:__Document__ {id: doc_id})
  MERGE (chunk)-[:PART_OF]->(doc)
  RETURN count(DISTINCT chunk) AS chunksCreated
} IN TRANSACTIONS OF 1000 ROWS
RETURN sum(chunksCreated) AS chunksCreated;


:auto
CALL apoc.load.parquet("create_final_nodes.parquet") YIELD value AS row
// Upsert an __Entity__ per row (keyed by id), add the quote-stripped, UpperCamelCased type
// as an extra label when a type is present, and link the entity to each matching __Chunk__
// named in the comma-separated source_id.
CALL {
    WITH row
    MERGE (entity:__Entity__ {id: row.id})
    SET entity += row {
        .level, .top_level_node_id, .human_readable_id, .description,
        title: replace(row.title, '"', '')
    }
    WITH entity, row
    CALL apoc.create.addLabels(
        entity,
        CASE
            WHEN row.type IS NULL THEN []
            ELSE [apoc.text.upperCamelCase(replace(row.type, '"', ''))]
        END
    ) YIELD node
    UNWIND split(row.source_id, ",") AS source_id
    MATCH (chunk:__Chunk__ {id: source_id})
    MERGE (chunk)-[:HAS_ENTITY]->(entity)
    RETURN count(DISTINCT entity) AS created
} IN TRANSACTIONS OF 25000 ROWS
RETURN sum(created) AS createdNodes;


// Preview five relationship rows to see which columns are available.
CALL apoc.load.parquet("create_final_relationships.parquet") YIELD value
RETURN keys(value), value LIMIT 5;

:auto
CALL apoc.load.parquet("create_final_relationships.parquet") YIELD value AS row
// Look up both endpoints by their quote-stripped title, upsert a single RELATED
// relationship between them, and copy the row's properties onto it.
CALL {
    WITH row
    MATCH (source:__Entity__ {title: replace(row.source, '"', '')})
    MATCH (target:__Entity__ {title: replace(row.target, '"', '')})
    // todo rel-type from source-target labels?
    MERGE (source)-[r:RELATED]->(target)
    SET r += row {
        .id, .rank, .weight, .human_readable_id, .description,
        text_unit_ids: row.text_unit_ids
    }
    RETURN count(*) AS created
} IN TRANSACTIONS OF 25000 ROWS
RETURN sum(created) AS createdRels;

/*
{
  "id": "b84d71ed9c3b45819eb3205fd28e13a0",
  "target_degree": 7,
  "rank": 20,
  "source_degree": 13,
  "weight": 1.0,
  "source": ""PROJECT GUTENBERG"",
  "description": ""Project Gutenberg is responsible for releasing 'A Christmas Carol' as an eBook."",
  "target": ""A CHRISTMAS CAROL"",
  "human_readable_id": "0",
  "text_unit_ids": [
    "680dd6d2a970a49082fa4f34bf63a34e"
  ]
}
*/

:auto
CALL apoc.load.parquet("create_final_communities.parquet") YIELD value
// return keys(value), value limit 5;
// Upsert a __Community__ per row (keyed by the row id) and mark both endpoints of every
// listed RELATED relationship as members via IN_COMMUNITY.
CALL {
    WITH value
    MERGE (community:__Community__ {community: value.id})
    SET community += value {.level, .title}
    /*
    UNWIND value.text_unit_ids as text_unit_id
    MATCH (t:__Chunk__ {id:text_unit_id})
    MERGE (community)-[:HAS_CHUNK]->(t)
    WITH distinct community, value
    */
    WITH *
    UNWIND value.relationship_ids AS rel_id
    MATCH (start:__Entity__)-[:RELATED {id: rel_id}]->(end:__Entity__)
    MERGE (start)-[:IN_COMMUNITY]->(community)
    MERGE (end)-[:IN_COMMUNITY]->(community)
    RETURN count(DISTINCT community) AS created
} IN TRANSACTIONS OF 1000 ROWS
RETURN sum(created) AS createdCommunities;

// ["level", "text_unit_ids", "relationship_ids", "id", "title", "raw_community"]

:auto
CALL apoc.load.parquet("create_final_community_reports.parquet") YIELD value AS report
// Upsert the __Community__ keyed by the report's community value and copy the
// selected report fields onto it.
CALL {
    WITH report
    MERGE (community:__Community__ {community: report.community})
    SET community += report {
        .level, .title, .summary, .findings, .rank, .rank_explanation, .id
    }
    RETURN count(DISTINCT community) AS created
} IN TRANSACTIONS OF 1000 ROWS
RETURN sum(created) AS createdReports;
// ["summary", "full_content_json", "level", "findings", "full_content", "rank", "id", "rank_explanation", "title", "community"]
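	// Setup (run in a shell before executing the Cypher below).
	// Line comments are used on purpose: the glob "output/*/artifacts" contains "*/",
	// which would terminate a block comment early.
	//
	// cp ragtest/output/*/artifacts/*.parquet $NEO4J_HOME/import
	//
	// echo 'apoc.import.file.enabled=true' >> $NEO4J_HOME/conf/apoc.conf
	//
	// cd $NEO4J_HOME/plugins
	// cp ../labs/*apoc*.jar .
	// curl -OL https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases/download/5.21.0/apoc-5.21.0-extended.jar
	// curl -OL https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases/download/5.21.0/apoc-hadoop-dependencies-5.21.0-all.jar
	// cd ..
	// bin/neo4j console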

	// TODO - Load documents, text-chunks, claims, communities and connect them

	// Preview five rows of the nodes file, stripping the double quotes embedded in type/title
	// and splitting the comma-separated source_id into a list.
	CALL apoc.load.parquet("create_final_nodes.parquet") YIELD value
	// return keys(value), value limit 5
	RETURN
	  replace(value.type, '"', '') AS type,
	  value.id,
	  value.level,
	  replace(value.title, '"', '') AS title,
	  value.top_level_node_id,
	  value.human_readable_id AS nr,
	  split(value.source_id, ",") AS sources,
	  value.description
	LIMIT 5;

	/*
	{
	"source_id": "01e84646075b255eab0a34d872336a89,10bab8e9773ee6dfbb465bfa45794c34,28f242c45159426edb8589f5ca3c10e6,2f918cd94d1825eb5cbdc2a9d3ce094e,34c3d4a02c4a7e3b8ec57f41075aeeea,3fedcfeffb43c689a33ffa06897ad045,50160bdfa976f5b946c699722c81b412,535f6bed392a62760401b1d4f2aa5e2f,608db27bee139aaab8ded9989997d00a,680dd6d2a970a49082fa4f34bf63a34e,6968390fb201fda828835d2d1fd4e953,6ea022365de9ab0d226801de90139c8a,879b3fc36c9a2427cdb8d5d41b60e11b,972bb34ddd371530f06d006480526d3e,9e59af410db84b25757e3bf90e036f39,da3ca9f93aac15c67f6acf3cca2fc229,e8cf7d2eec5c3bcbeefc60d9f15941ed,f96b5ddf7fae853edbc4d916f66c623f",
	"type": ""ORGANIZATION"",
	"size": 13,
	"id": "b45241d70f0e43fca764df95b2b81f77",
	"title": ""PROJECT GUTENBERG"",
	"level": 0,
	"degree": 13,
	"description": "Project Gutenberg is a pioneering organization dedicated to the free distribution of electronic works, with a focus on those not protected by U.S. copyright law. It was initiated by Professor Michael S. Hart and is supported by a network of volunteers and the Gutenberg Literary Archive Foundation. The organization's mission is to increase the number of public domain and licensed works freely distributed in machine-readable form, thereby promoting free access to literature and electronic works. Project Gutenberg owns a compilation copyright in its collection of electronic works, ensuring their accessibility while requiring compliance with specific copyright and distribution guidelines outlined in their license agreement.

	For over forty years, Project Gutenberg has been creating and distributing eBooks, offering a vast array of works in various formats, including 'Plain Vanilla ASCII'. Its collection includes notable titles like 'A Christmas Carol', available for free under a license that allows copying, giving away, and re-using with almost no restrictions. The organization operates globally, emphasizing copyright status and adherence to its license, which includes a system of royalty payments and refunds under certain conditions. Project Gutenberg's main search facility is accessible through its website, www.gutenberg.org, facilitating easy access to its extensive library.

	Project Gutenberg is committed to keeping its collection freely available for future generations, supported by donations and the efforts of its volunteer network. It promotes the creation, modification, and redistribution of eBooks, especially focusing on works that allow for free copying and distribution in the United States under specific terms. The organization is described as being focused on promoting free access to electronic works, ensuring that literature remains accessible to the public while keeping its name associated with shared works in compliance with its agreement.",
	"top_level_node_id": "b45241d70f0e43fca764df95b2b81f77",
	"human_readable_id": 0,
	"__index_level_0__": 0,
	"y": 0,
	"x": 0
	}
	*/
	// Uniqueness constraints backing the MERGE keys used by the load statements in this script.
	// Every constraint needs its own name: with IF NOT EXISTS, a CREATE whose name is already
	// taken is skipped without error. The __Community__ constraint was previously also named
	// entity_id, which caused the __Entity__.id constraint below to be silently skipped.
	create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique;
	create constraint document_id if not exists for (d:__Document__) require d.id is unique;
	create constraint community_id if not exists for (c:__Community__) require c.community is unique;

	create constraint entity_id if not exists for (e:__Entity__) require e.id is unique;
	create constraint entity_title if not exists for (e:__Entity__) require e.title is unique;
	create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique;


	// Preview one document row to see which columns are available.
	CALL apoc.load.parquet("create_final_documents.parquet") YIELD value
	RETURN keys(value), value LIMIT 1;

	// ["__index_level_0__", "raw_content", "id", "title", "text_unit_ids"]

	// Upsert one __Document__ per row, keyed by id; only the first 1000 characters
	// of raw_content are stored on the node.
	CALL apoc.load.parquet("create_final_documents.parquet") YIELD value AS doc
	MERGE (d:__Document__ {id: doc.id})
	SET d += doc {
	  .title,
	  text_unit_ids: doc.text_unit_ids,
	  raw_content: substring(doc.raw_content, 0, 1000)
	};

	// Preview one text-unit row to see which columns are available.
	CALL apoc.load.parquet("create_base_text_units.parquet") YIELD value
	RETURN keys(value), value LIMIT 1;
	// ["document_ids", "chunk", "n_tokens", "id", "chunk_id"]

	:auto
	CALL apoc.load.parquet("create_base_text_units.parquet") YIELD value AS unit
	// Upsert a __Chunk__ per row (keyed by chunk_id) and attach it to every already-loaded
	// __Document__ listed in document_ids; batched in transactions of 1000 rows.
	CALL {
	  WITH unit
	  MERGE (chunk:__Chunk__ {id: unit.chunk_id})
	  SET chunk += unit {.chunk, .n_tokens}
	  WITH *
	  UNWIND unit.document_ids AS doc_id
	  MATCH (doc:__Document__ {id: doc_id})
	  MERGE (chunk)-[:PART_OF]->(doc)
	  RETURN count(DISTINCT chunk) AS chunksCreated
	} IN TRANSACTIONS OF 1000 ROWS
	RETURN sum(chunksCreated) AS chunksCreated;


	:auto
	CALL apoc.load.parquet("create_final_nodes.parquet") YIELD value AS row
	// Upsert an __Entity__ per row (keyed by id), add the quote-stripped, UpperCamelCased type
	// as an extra label when a type is present, and link the entity to each matching __Chunk__
	// named in the comma-separated source_id.
	CALL {
	    WITH row
	    MERGE (entity:__Entity__ {id: row.id})
	    SET entity += row {
	        .level, .top_level_node_id, .human_readable_id, .description,
	        title: replace(row.title, '"', '')
	    }
	    WITH entity, row
	    CALL apoc.create.addLabels(
	        entity,
	        CASE
	            WHEN row.type IS NULL THEN []
	            ELSE [apoc.text.upperCamelCase(replace(row.type, '"', ''))]
	        END
	    ) YIELD node
	    UNWIND split(row.source_id, ",") AS source_id
	    MATCH (chunk:__Chunk__ {id: source_id})
	    MERGE (chunk)-[:HAS_ENTITY]->(entity)
	    RETURN count(DISTINCT entity) AS created
	} IN TRANSACTIONS OF 25000 ROWS
	RETURN sum(created) AS createdNodes;


	// Preview five relationship rows to see which columns are available.
	CALL apoc.load.parquet("create_final_relationships.parquet") YIELD value
	RETURN keys(value), value LIMIT 5;

	:auto
	CALL apoc.load.parquet("create_final_relationships.parquet") YIELD value AS row
	// Look up both endpoints by their quote-stripped title, upsert a single RELATED
	// relationship between them, and copy the row's properties onto it.
	CALL {
	    WITH row
	    MATCH (source:__Entity__ {title: replace(row.source, '"', '')})
	    MATCH (target:__Entity__ {title: replace(row.target, '"', '')})
	    // todo rel-type from source-target labels?
	    MERGE (source)-[r:RELATED]->(target)
	    SET r += row {
	        .id, .rank, .weight, .human_readable_id, .description,
	        text_unit_ids: row.text_unit_ids
	    }
	    RETURN count(*) AS created
	} IN TRANSACTIONS OF 25000 ROWS
	RETURN sum(created) AS createdRels;

	/*
	{
	"id": "b84d71ed9c3b45819eb3205fd28e13a0",
	"target_degree": 7,
	"rank": 20,
	"source_degree": 13,
	"weight": 1.0,
	"source": ""PROJECT GUTENBERG"",
	"description": ""Project Gutenberg is responsible for releasing 'A Christmas Carol' as an eBook."",
	"target": ""A CHRISTMAS CAROL"",
	"human_readable_id": "0",
	"text_unit_ids": [
	"680dd6d2a970a49082fa4f34bf63a34e"
	]
	}
	*/

	:auto
	CALL apoc.load.parquet("create_final_communities.parquet") YIELD value
	// return keys(value), value limit 5;
	// Upsert a __Community__ per row (keyed by the row id) and mark both endpoints of every
	// listed RELATED relationship as members via IN_COMMUNITY.
	CALL {
	    WITH value
	    MERGE (community:__Community__ {community: value.id})
	    SET community += value {.level, .title}
	    /*
	    UNWIND value.text_unit_ids as text_unit_id
	    MATCH (t:__Chunk__ {id:text_unit_id})
	    MERGE (community)-[:HAS_CHUNK]->(t)
	    WITH distinct community, value
	    */
	    WITH *
	    UNWIND value.relationship_ids AS rel_id
	    MATCH (start:__Entity__)-[:RELATED {id: rel_id}]->(end:__Entity__)
	    MERGE (start)-[:IN_COMMUNITY]->(community)
	    MERGE (end)-[:IN_COMMUNITY]->(community)
	    RETURN count(DISTINCT community) AS created
	} IN TRANSACTIONS OF 1000 ROWS
	RETURN sum(created) AS createdCommunities;

	// ["level", "text_unit_ids", "relationship_ids", "id", "title", "raw_community"]

	:auto
	CALL apoc.load.parquet("create_final_community_reports.parquet") YIELD value AS report
	// Upsert the __Community__ keyed by the report's community value and copy the
	// selected report fields onto it.
	CALL {
	    WITH report
	    MERGE (community:__Community__ {community: report.community})
	    SET community += report {
	        .level, .title, .summary, .findings, .rank, .rank_explanation, .id
	    }
	    RETURN count(DISTINCT community) AS created
	} IN TRANSACTIONS OF 1000 ROWS
	RETURN sum(created) AS createdReports;
	// ["summary", "full_content_json", "level", "findings", "full_content", "rank", "id", "rank_explanation", "title", "community"]