Skip to content

Instantly share code, notes, and snippets.

@jexp
Created September 1, 2023 08:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save jexp/ec7449889054280d6cb9a5a9c41f4a73 to your computer and use it in GitHub Desktop.
Save jexp/ec7449889054280d6cb9a5a9c41f4a73 to your computer and use it in GitHub Desktop.
Neo4j Cypher Script to import slideshare data, create embeddings, a vector index, similarity search and extract authors and keywords
// Neo4j Browser parameters consumed as $projectId and $apiKey by the apoc.ml.vertexai.* calls in this script.
// Google Cloud project id; list the available ones with: gcloud projects list
:param projectId => 'xxxx'
// Access token sent to the API; print one with: gcloud auth print-access-token
:param apiKey => 'xxx'
// Preview: show the first five slideshow entries of the remote JSON document.
CALL apoc.load.json("https://data.neo4j.com/slideshare-neo4j.json") YIELD value
UNWIND value.User.Slideshow AS slide
RETURN slide
LIMIT 5;

// Count the slideshow entries contained in the document.
CALL apoc.load.json("https://data.neo4j.com/slideshare-neo4j.json") YIELD value
UNWIND value.User.Slideshow AS slide
RETURN count(*);
// One Content node per slideshow id. The constraint makes the import repeatable
// (merge instead of create) and backs the {id: ...} lookups used by the CSV imports.
create constraint Content_id if not exists for (c:Content) require (c.id) is unique;

// Import every slideshow entry as a (:Content) node keyed by its slideshare ID.
// Re-running the statement updates the existing node instead of creating a duplicate.
call apoc.load.json("https://data.neo4j.com/slideshare-neo4j.json") yield value
unwind value.User.Slideshow as slide
merge (s:Content {id: slide.ID})
// += only touches the listed keys; other properties on s (e.g. embedding) are kept.
set s += { description: slide.Description, title: slide.Title, url:slide.URL,
format:slide.Format, language:slide.Language, thumbnail:slide.ThumbnailURL,
created: apoc.temporal.toZonedTemporal(slide.Created),
updated: apoc.temporal.toZonedTemporal(slide.Updated) };
// Standalone embedding call with a fixed sentence.
CALL apoc.ml.vertexai.embedding(['This is a test'], $apiKey, $projectId);

:auto
// Trial run: embed at most 10 Content nodes that have no embedding yet.
// The text sent to the API is "<title> <description>", with missing parts replaced by ''.
MATCH (s:Content)
WHERE s.embedding IS NULL
WITH * LIMIT 10
CALL {
  WITH s
  WITH s, coalesce(s.title,'') + " " + coalesce(s.description,'') AS input
  CALL apoc.ml.vertexai.embedding([input], $apiKey, $projectId) YIELD embedding
  SET s.embedding = embedding
} IN TRANSACTIONS OF 10 ROWS;
// Vector index named "content" on the "embedding" property of "Content" nodes:
// 768 dimensions, cosine similarity.
CALL db.index.vector.createNodeIndex(
  "content",
  "Content",
  "embedding",
  768,
  "cosine"
);
// Index search for the top 5 similar vectors, with additional graph matching.
WITH "decks about knowledge graphs and generative AI" AS question
// Generate the question's vector embedding from the API.
// Uses $apiKey / $projectId, the parameters declared with :param at the top of the script.
CALL apoc.ml.vertexai.embedding([question], $apiKey, $projectId) YIELD embedding
// Use the vector index.
CALL db.index.vector.queryNodes('content', 5, embedding) YIELD node AS content, score
// Only hits that have at least one author AND one keyword survive this MATCH.
MATCH (keyword)<-[:TAGGED]-(content)<-[:AUTHORED]-(author)
// The question is returned under the column name "text".
RETURN question AS text, content.title, content.description,
       collect(DISTINCT author.name) AS authors, collect(DISTINCT keyword.name) AS keywords;
// Keyword names are unique; "if not exists" lets the script be re-run without an error here.
create constraint Keyword_name if not exists for (k:Keyword) require (k.name) is unique;
:auto
// Ask the completion API for the authors of every Content node that has no AUTHORED
// relationship yet, then merge one Author node per returned name and link it.
match (s:Content) where not exists { (s)<-[:AUTHORED]-() }
call { with s
  // Pause 100 ms before each request (presumably to stay under API rate limits).
  call apoc.util.sleep(100)
  // The prompt parts are joined with explicit separators so sentences and the
  // title / description fields do not run into each other.
  call apoc.ml.vertexai.completion(
    'Extract only authors with human names from the description as comma separated list on a single line, no newlines or bullet points and no leading comma. '+
    'Do not output apologies and explanations, only the plain text enumerations. '+
    'If you do not follow the instructions people will be hurt.\n' +
    'Title: '+coalesce(s.title,'')+'\n'+
    'Description: '+coalesce(s.description,''), $apiKey, $projectId) yield value
  // debugging aid: with value.content, s.title, s.description
  unwind split(value.content,',') as name
  // Drop empty fragments such as those produced by stray commas.
  with trim(name) as name, s where coalesce(name,'') <> ''
  merge (a:Author {name: name})
  merge (s)<-[:AUTHORED]-(a)
} in transactions of 10 rows;
// Import keyword tags from slides-keywords.csv (columns: id, keyword).
// Double quotes are stripped from both columns. Keywords are trimmed and blank ones
// skipped, matching the LLM-based keyword extraction in this script, because
// merging a node on a null property value fails.
load csv with headers from 'file:///slides-keywords.csv' as row
with replace(row.id,'"','') as id, trim(replace(row.keyword,'"','')) as keyword
where coalesce(keyword,'') <> ''
match (s:Content {id:id})
merge (k:Keyword {name:keyword})
merge (s)-[:TAGGED]->(k);
// Author names are unique; "if not exists" lets the script be re-run without an error here.
create constraint Author_name if not exists for (a:Author) require (a.name) is unique;

// Import authorship from slides-authors.csv (columns: slide, author).
// Double quotes are stripped from both columns. Author names are trimmed and blank
// ones skipped, matching the LLM-based author extraction in this script, because
// merging a node on a null property value fails.
load csv with headers from 'file:///slides-authors.csv' as row
with replace(row.slide,'"','') as id, trim(replace(row.author,'"','')) as author
where coalesce(author,'') <> ''
match (s:Content {id:id})
merge (a:Author {name:author})
merge (a)-[:AUTHORED]->(s);
// Vector search using an embedding stored in input-embeddings.csv
// (columns used: index, text, embedding as a JSON list). Only the row with index '5' is used.
load csv with headers from 'file:///input-embeddings.csv' as row
with row where row.index = '5'
with row.text as text, apoc.convert.fromJsonList(row.embedding) as embedding
CALL db.index.vector.queryNodes('content',5, embedding) yield node as content, score
// Only hits that have at least one author AND one keyword survive this MATCH.
MATCH (keyword)<-[:TAGGED]-(content)<-[:AUTHORED]-(author)
// The MATCH yields one row per (author, keyword) pair, so DISTINCT is needed to
// avoid repeating each author per keyword and each keyword per author.
RETURN text, content.title, content.description,
collect(distinct author.name) as authors, collect(distinct keyword.name) as keywords;
// virtual k-NN relationships <1s
// For every Content node with an embedding, look up its 5 nearest index entries and
// return a virtual (not stored) SIMILAR_TO relationship carrying the score.
// Nodes without an embedding are skipped so no null vector is passed to the index query.
match (c:Content) where c.embedding is not null
CALL db.index.vector.queryNodes('content',5, c.embedding) yield node, score
// Exclude the node itself; NOTE(review): if c is among its own 5 hits this leaves 4 neighbours.
where c <> node
call apoc.create.vRelationship(c,'SIMILAR_TO',{score:score},node) yield rel
return c,node, rel;
// Initially create a vector index named "content" on the "embedding" property of "Content" nodes,
// using the cosine similarity function and vectors of width 768.
// NOTE(review): the same call with the same index name appears earlier in this script;
// running both presumably fails because the index already exists — confirm and keep only one.
CALL db.index.vector.createNodeIndex( "content","Content","embedding", 768,"cosine");
// Index search for the top 5 similar vectors, with additional graph matching.
WITH "decks about knowledge graphs and generative AI" AS question
// Generate the question's vector embedding from the API.
// Uses $apiKey / $projectId, the parameters declared with :param at the top of the script.
CALL apoc.ml.vertexai.embedding([question], $apiKey, $projectId) YIELD embedding
// Use the vector index.
CALL db.index.vector.queryNodes('content', 5, embedding) YIELD node AS content, score
// Only hits that have at least one author AND one keyword survive this MATCH.
MATCH (keyword)<-[:TAGGED]-(content)<-[:AUTHORED]-(author)
// The question is returned under the column name "text".
RETURN question AS text, content.title, content.description,
       collect(DISTINCT author.name) AS authors, collect(DISTINCT keyword.name) AS keywords;
// Standalone embedding call with a fixed sentence.
CALL apoc.ml.vertexai.embedding(
  ['This is a test'],
  $apiKey,
  $projectId
);
:auto
// Ask the completion API for keywords of every Content node that has no TAGGED
// relationship yet, then merge one Keyword node per returned keyword and link it.
match (s:Content) where not exists { (s)-[:TAGGED]->() }
call { with s
  // Pause 100 ms before each request (presumably to stay under API rate limits).
  call apoc.util.sleep(100)
  // The prompt parts are joined with explicit separators so sentences and the
  // title / description fields do not run into each other.
  call apoc.ml.vertexai.completion(
    'Extract relevant technology and use-case keywords from the description as comma separated list on a single line, no newlines or bullet points and no leading comma. '+
    'Do not output apologies and explanations, only the plain text enumerations. '+
    'If you do not follow the instructions people will be hurt.\n' +
    'Title: '+coalesce(s.title,'')+'\n'+
    'Description: '+coalesce(s.description,''), $apiKey, $projectId) yield value
  // debugging aid: with value.content, s.title, s.description
  unwind split(value.content,',') as keyword
  // Drop empty fragments such as those produced by stray commas.
  with trim(keyword) as keyword, s where coalesce(keyword,'') <> ''
  merge (k:Keyword {name: keyword})
  merge (s)-[:TAGGED]->(k)
} in transactions of 10 rows;
// Sample for visualisation: take 100 Content nodes and return up to 200
// of their outgoing one-hop paths.
MATCH (s:Content)
WITH s
LIMIT 100
MATCH p = (s)-->()
RETURN p
LIMIT 200;
// Load embeddings from embeddings.csv (columns `s.ID` and `s.embedding`, the latter
// a JSON list) and store each one on the Content node with the matching id.
LOAD CSV WITH HEADERS FROM 'file:///embeddings.csv' AS row
WITH row.`s.ID` AS id,
     apoc.convert.fromJsonList(row.`s.embedding`) AS embedding
MATCH (c:Content {id: id})
SET c.embedding = embedding;
:auto
// Embed every Content node that has no embedding yet, committing every 10 rows.
// The text sent to the API is "<title> <description>", with missing parts replaced by ''.
MATCH (s:Content)
WHERE s.embedding IS NULL
CALL {
  WITH s
  WITH s, coalesce(s.title,'') + " " + coalesce(s.description,'') AS input
  CALL apoc.ml.vertexai.embedding([input], $apiKey, $projectId) YIELD embedding
  SET s.embedding = embedding
} IN TRANSACTIONS OF 10 ROWS;
// Brute-force similarity search without the vector index: embed the query text,
// compute cosine similarity against every stored embedding and keep the 5 best.
call apoc.ml.vertexai.embedding(['knowledge graph for fraud detection'], $apiKey, $projectId) yield embedding
match (s:Content)
// Skip nodes that have no embedding yet so they cannot disturb the similarity
// computation or the descending sort.
where s.embedding is not null
with s, gds.similarity.cosine(s.embedding, embedding) as similarity
order by similarity desc limit 5
return s.title, s.description, s.url, similarity;
// Top-5 vector index search for a natural-language question.
WITH "decks about knowledge graphs and generative AI" AS question
// Turn the question into an embedding via the API.
CALL apoc.ml.vertexai.embedding([question], $apiKey, $projectId)
  YIELD embedding
// Query the "content" vector index with that embedding.
CALL db.index.vector.queryNodes('content', 5, embedding)
  YIELD node AS slide, score
RETURN slide.title,
       slide.description,
       slide.url,
       score;
/*
{
"Status": "2",
"ThumbnailXLargeURL": "https://cdn.slidesharecdn.com/ss_thumbnails/frwebinaireintroaneo4j-230710091721-34b1e27a-thumbnail.jpg?width=640&amp;height=640&amp;fit=bounds",
"Description": "Pierre Halftermeyer, Neo4j",
"ThumbnailURL": "https://cdn.slidesharecdn.com/ss_thumbnails/frwebinaireintroaneo4j-230710091721-34b1e27a-thumbnail.jpg?width=320&amp;height=320&amp;fit=bounds",
"Updated": "2023-07-10 09:22:46 UTC",
"Embed": "<iframe src="https://www.slideshare.net/slideshow/embed_code/key/b8GESwzh3vWqrj" width="427" height="356" frameborder="0" marginwidth="0" marginheight="0" scrolling="no" style="border:1px solid #CCC; border-width:1px; margin-bottom:5px; max-width: 100%;" allowfullscreen> </iframe> <div style="margin-bottom:5px"> <strong> <a href="https://www.slideshare.net/neo4j/introduction-neo4j-259115031" title="Introduction à Neo4j" target="_blank">Introduction à Neo4j</a> </strong> from <strong><a href="https://www.slideshare.net/neo4j" target="_blank">Neo4j</a></strong> </div>",
"ThumbnailSize": "[170,130]",
"Title": "Introduction à Neo4j",
"URL": "https://www.slideshare.net/neo4j/introduction-neo4j-259115031",
"Created": "2023-07-10 09:17:21 UTC",
"ThumbnailXXLargeURL": "https://cdn.slidesharecdn.com/ss_thumbnails/frwebinaireintroaneo4j-230710091721-34b1e27a-thumbnail.jpg?width=640&amp;height=640&amp;fit=bounds",
"SlideshowType": "0",
"Format": "pdf",
"Language": "fr",
"Username": "neo4j",
"ThumbnailSmallURL": "https://cdn.slidesharecdn.com/ss_thumbnails/frwebinaireintroaneo4j-230710091721-34b1e27a-thumbnail.jpg?width=120&amp;height=120&amp;fit=bounds",
"SlideshowEmbedUrl": "https://www.slideshare.net/slideshow/embed_code/key/b8GESwzh3vWqrj",
"ID": "259115031",
"InContest": "0",
"Download": "1",
"DownloadUrl": "https://slideshare-downloads.s3.amazonaws.com/frwebinaireintroaneo4j-230710091721-34b1e27a.pdf?response-content-disposition=attachment&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIATZMST4DYZS7SJPXU%2F20230804%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230804T094222Z&X-Amz-Expires=300&X-Amz-SignedHeaders=host&X-Amz-Signature=295076fc4efd131980e14f4be24960dbff41d80ce65c66f69841fa21753ad0b8"
}
{
"Status": "2",
"ThumbnailXLargeURL": "https://cdn.slidesharecdn.com/ss_thumbnails/spwebinar-introtoneo4j-230710091622-99f73fd2-thumbnail.jpg?width=640&amp;height=640&amp;fit=bounds",
"Description": "Luis Salvador, Neo4j",
"ThumbnailURL": "https://cdn.slidesharecdn.com/ss_thumbnails/spwebinar-introtoneo4j-230710091622-99f73fd2-thumbnail.jpg?width=320&amp;height=320&amp;fit=bounds",
"Updated": "2023-07-10 09:21:37 UTC",
"Embed": "<iframe src="https://www.slideshare.net/slideshow/embed_code/key/DaOdEJwCpxbHdh" width="427" height="356" frameborder="0" marginwidth="0" marginheight="0" scrolling="no" style="border:1px solid #CCC; border-width:1px; margin-bottom:5px; max-width: 100%;" allowfullscreen> </iframe> <div style="margin-bottom:5px"> <strong> <a href="https://www.slideshare.net/neo4j/introduccin-a-neo4j-259115019" title="Introducción a Neo4j" target="_blank">Introducción a Neo4j</a> </strong> from <strong><a href="https://www.slideshare.net/neo4j" target="_blank">Neo4j</a></strong> </div>",
"ThumbnailSize": "[170,130]",
"Title": "Introducción a Neo4j",
"URL": "https://www.slideshare.net/neo4j/introduccin-a-neo4j-259115019",
"Created": "2023-07-10 09:16:22 UTC",
"ThumbnailXXLargeURL": "https://cdn.slidesharecdn.com/ss_thumbnails/spwebinar-introtoneo4j-230710091622-99f73fd2-thumbnail.jpg?width=640&amp;height=640&amp;fit=bounds",
"SlideshowType": "0",
"Format": "pdf",
"Language": "en",
"Username": "neo4j",
"ThumbnailSmallURL": "https://cdn.slidesharecdn.com/ss_thumbnails/spwebinar-introtoneo4j-230710091622-99f73fd2-thumbnail.jpg?width=120&amp;height=120&amp;fit=bounds",
"SlideshowEmbedUrl": "https://www.slideshare.net/slideshow/embed_code/key/DaOdEJwCpxbHdh",
"ID": "259115019",
"InContest": "0",
"Download": "1",
"DownloadUrl": "https://slideshare-downloads.s3.amazonaws.com/spwebinar-introtoneo4j-230710091622-99f73fd2.pdf?response-content-disposition=attachment&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIATZMST4DYZS7SJPXU%2F20230804%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230804T094222Z&X-Amz-Expires=300&X-Amz-SignedHeaders=host&X-Amz-Signature=828f68aa605886e0a790613011fe0bd5bbd0884c0b6c87495d881b5cc6de2267"
}
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment