Skip to content

Instantly share code, notes, and snippets.

@gregoryking
Last active July 13, 2023 12:32
Show Gist options
  • Save gregoryking/d59ff1b6c1b433b505c61b7bf246dc5c to your computer and use it in GitHub Desktop.
Save gregoryking/d59ff1b6c1b433b505c61b7bf246dc5c to your computer and use it in GitHub Desktop.
Cypher based PK / FK detection - tested on GitScehma dataset
// Academic paper on Foreign Key Detection - First step includes calculating inclusions which even with SPIDER algorithm is computationally expensive and requires access to full table data.
// https://www.semanticscholar.org/paper/A-Machine-Learning-Approach-to-Foreign-Key-Rostin-Albrecht/691f6eeabbe00451567c9facc282ce9820e9792b
// Article on more lightweight approach - Based solely on column name analysis
// https://www.tonic.ai/blog/foreign-key-detection
// Data for testing efficacy of algorithms
// https://github.com/tdoehmen/gitschemas
// Quick pything snippet to transform download from the above from a large json map into array of maps
//
// import json
// f = open('./GitSchemas_Permissive_Licenses.json')
// dict = json.load(f)
// array = [{'key': k, 'value': dict[k]} for k in dict]
// with open('data.json', 'w') as f:
// json.dump(array, f)
// Create Indexes
CREATE INDEX FOR (s:Schema) ON (s.filename);
CREATE INDEX FOR (t:Table) ON (t.schemaId);
CREATE INDEX FOR (c:Column) ON (c.schemaId);
CREATE INDEX FOR (c:Column) ON (c.filename, c.name);
// Cypher json load of the json array generated above
// Wrapped in APOC periodic iterate because it seems offer better parallelisation than CALL {}
CALL apoc.periodic.iterate(
"
CALL apoc.load.json('file: ///data.json')
YIELD value AS schema
RETURN schema
"
,
"
// Create Schema
WITH schema
WITH schema.key AS key, schema.value AS schema
MERGE (schemaNode:Schema { name: key })
SET schemaNode+= schema.INFO
WITH schemaNode, schema
// Create associated Tables
WITH keys(schema.TABLES) AS tableKeys, schemaNode, schema
UNWIND tableKeys AS tableKey
MERGE (tableNode:Table { name: tableKey, schemaId: schemaNode.name })<-[:CONTAINS_TABLE]-(schemaNode)
// Create linked Columns
WITH schema.TABLES[tableKey] AS table, tableNode, schemaNode
WITH tableNode, table, table.COLUMNS AS columns, schemaNode
UNWIND columns AS column
MERGE (columnNode:Column { name: column[0], type: column[1], schemaId:schemaNode.name })<-[:HAS_COLUMN]-(tableNode)
// Create Primary Keys
WITH tableNode, table, table.PRIMARY_KEYS AS primaryKeys, schemaNode
UNWIND primaryKeys AS primaryKey
MATCH (tableNode)-[:HAS_COLUMN]-(pkc:Column { name: primaryKey })
MERGE (pk:PrimaryKey { table:tableNode.name, schemaId:schemaNode.name })
MERGE (pkc)<-[:PK_COLUMN]-(pk)
// Create Foreign Keys
WITH tableNode, table.FOREIGN_KEYS AS foreignKeys, schemaNode
UNWIND foreignKeys AS foreignKey
UNWIND foreignKey.FOREIGN_KEY AS fk_component
MATCH (tableNode)-[:HAS_COLUMN]-(fkc:Column { name: fk_component })
MERGE (fk:ForeignKey { table:tableNode.name, schemaId:schemaNode.name, fkName:foreignKey.FOREIGN_KEY })
MERGE (fkc)<-[:FK_COLUMN]-(fk)
WITH foreignKey, schemaNode, tableNode, fk
UNWIND foreignKey.REFERENCE_COLUMN AS fk_reference_column
MATCH (schemaNode)-[:CONTAINS_TABLE]->(:Table { name: foreignKey.REFERENCE_TABLE })-[:HAS_COLUMN]->(fkrc:Column {name: fk_reference_column})
MERGE (fkrc)<-[:FK_REFERENCE_COLUMN]-(fk)
",
{ batchSize:100, parallel: true });
// Show an example schema
:param schemaKey=>"003200_tables_oracle.sql";
MATCH (p:Schema { name: $schemaKey })
CALL apoc.path.subgraphAll(p, {
minLevel: 0,
maxLevel: 25
})
YIELD nodes, relationships
RETURN nodes, relationships;
// Initial Foreign Key analysis
MATCH (fk:ForeignKey)
WITH fk
MATCH (fkc:Column)<-[:FK_COLUMN]-(fk)-[:FK_REFERENCE_COLUMN]->(fkrc:Column)
WITH collect( DISTINCT fkc) AS collFkc, collect( DISTINCT fkrc) AS collFkrc, fk
WITH fk, apoc.coll.sort([x IN collFkc | x.name]) AS collFkc, apoc.coll.sort([x IN collFkrc | x.name]) AS collFkrc
WITH fk, collFkc, collFkrc, apoc.coll.intersection(collFkc, collFkrc) AS intersection
WITH *, 1.0 * size(intersection) / size(collFkc) AS propExactlyShared
WHERE propExactlyShared < 1
RETURN fk, collFkc, collFkrc
LIMIT 10
// Analysis of how keys are used in different schemas, inc prevalence of columns use as PK or FK or compounds thereof and how frequently the columns appear in other columns in schemas on average
MATCH (c:Column)<-[:HAS_COLUMN]-(t:Table)<-[:CONTAINS_TABLE]-(s:Schema)
// WITH * // Testing line
// WHERE s.name="024573_Ambari-DDL-Oracle-CREATE.sql" // Testing line
OPTIONAL MATCH (c)<-[:PK_COLUMN]-(pk:PrimaryKey)
OPTIONAL MATCH (pk)-[:PK_COLUMN]->(pkco:Column)
OPTIONAL MATCH (c)<-[:FK_COLUMN]-(fk:ForeignKey)
OPTIONAL MATCH (fk)-[:FK_COLUMN]->(fkco:Column)
MATCH (co:Column { schemaId: s.name, name: c.name })
WHERE co<>c
WITH c, t, s,
fk is NOT null AS isForeignKey,
size(collect( DISTINCT fkco))>1 AS compoundForeign,
pk is NOT null AS isPrimaryKey,
size(collect( DISTINCT pkco))>1 AS compoundPrimary,
count( DISTINCT co) AS otherColumnsInTablesInSchemaWithColumnName
// WHERE isPrimaryKey=True AND compoundPrimary=False // Testing line
WITH s,
t,
c,
CASE WHEN isPrimaryKey= true THEN 1 ELSE 0 END AS isPrimaryKey,
CASE WHEN compoundPrimary= true THEN 1 ELSE 0 END AS compoundPrimary,
CASE WHEN isForeignKey= true THEN 1 ELSE 0 END AS isForeignKey,
CASE WHEN compoundForeign= true THEN 1 ELSE 0 END AS compoundForeign,
otherColumnsInTablesInSchemaWithColumnName
// , [c,t,s] as nodes // Testing line
// LIMIT 3 // Testing line
WITH s, t, c, sum(isPrimaryKey) AS numTimesPkInTableInSchema, sum(compoundPrimary) AS numTimesCompoundPkInTableInSchema, sum(isForeignKey) AS numTimesFkInTableInSchema, sum(compoundForeign) AS numTimesCompoundFkInTableInSchema, avg(otherColumnsInTablesInSchemaWithColumnName) AS avgOtherColumnsInTablesInSchemaWithColumnName
RETURN c.name,
count( DISTINCT t) AS appearsInNumTables,
count( DISTINCT s) AS appearsAcrossNumSchema,
sum(numTimesPkInTableInSchema) AS totalTimesSeenAsPk,
sum(numTimesCompoundPkInTableInSchema) AS totalTimesSeenAsCompoundPk,
sum(numTimesFkInTableInSchema) AS totalTimesSeenAsFk,
sum(numTimesCompoundFkInTableInSchema) AS totalTimesSeenAsCompoundFk,
avg(avgOtherColumnsInTablesInSchemaWithColumnName) AS avgTimeSeenInOtherTables
// Finding schema with certain characteristics
MATCH (s:Schema)
// Find a schema which has at least 2 tables and at least one PK and one FK
WHERE (s)-->(:Table)-->(:Column)-[:FK_COLUMN]-()
AND
(s)-->(:Table)-->(:Column)-[:PK_COLUMN]-()
AND
count { (s)-->() } > 1
WITH s
SKIP 101
LIMIT 1
RETURN s
// Proof of concept simple string based key detection
MATCH (s:Schema)
// Find a schema which has at least 2 tables and at least one PK and one FK
WHERE (s)-->(:Table)-->(:Column)-[:FK_COLUMN]-()
AND
(s)-->(:Table)-->(:Column)-[:PK_COLUMN]-()
AND
count { (s)-->() } > 1
WITH s
SKIP 152
LIMIT 1
MATCH (s)--(t:Table)-->(c:Column)
WITH t, c, s
WHERE (c.name='id' OR c.name ENDSWITH '_id')
WITH s, t, c,
CASE WHEN c.name ENDS WITH '_id' THEN substring(c.name, 0, size(c.name)-3) END AS potentialReferenceTableName
OPTIONAL MATCH (s)-->(rt:Table)
WHERE rt.name CONTAINS potentialReferenceTableName AND rt<>t
WITH *,
CASE WHEN rt IS null THEN 'PrimaryKey' ELSE 'ForeignKey' END AS keyType
RETURN s.name AS schemaName, t.name AS tableName, c.name AS columnName, keyType, rt.name AS refereneTableName
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment