gregoryking/pk_fk_detection_GitSchema.cypher

## pk_fk_detection_GitSchema.cypher
// Academic paper on Foreign Key Detection - First step includes calculating inclusions which even with SPIDER algorithm is computationally expensive and requires access to full table data.
// https://www.semanticscholar.org/paper/A-Machine-Learning-Approach-to-Foreign-Key-Rostin-Albrecht/691f6eeabbe00451567c9facc282ce9820e9792b

// Article on more lightweight approach - Based solely on column name analysis
// https://www.tonic.ai/blog/foreign-key-detection

// Data for testing efficacy of algorithms
// https://github.com/tdoehmen/gitschemas

// Quick pything snippet to transform download from the above from a large json map into array of maps
//
// import json

// f = open('./GitSchemas_Permissive_Licenses.json')
// dict = json.load(f)
// array = [{'key': k, 'value': dict[k]} for k in dict]
// with open('data.json', 'w') as f:
//     json.dump(array, f)

// Create Indexes
CREATE INDEX FOR (s:Schema) ON (s.filename);
CREATE INDEX FOR (t:Table) ON (t.schemaId);
CREATE INDEX FOR (c:Column) ON (c.schemaId);
CREATE INDEX FOR (c:Column) ON (c.filename, c.name);

// Cypher json load of the json array generated above
// Wrapped in APOC periodic iterate because it seems offer better parallelisation than CALL {}
CALL apoc.periodic.iterate(
"
CALL apoc.load.json('file: ///data.json')
YIELD value AS schema
RETURN schema
"
,
"
// Create Schema
WITH schema
WITH schema.key AS key, schema.value AS schema
MERGE (schemaNode:Schema { name: key })
SET schemaNode+= schema.INFO
WITH schemaNode, schema
// Create associated Tables
WITH keys(schema.TABLES) AS tableKeys, schemaNode, schema
UNWIND tableKeys AS tableKey
MERGE (tableNode:Table { name: tableKey, schemaId: schemaNode.name })<-[:CONTAINS_TABLE]-(schemaNode)
// Create linked Columns
WITH schema.TABLES[tableKey] AS table, tableNode, schemaNode
WITH tableNode, table, table.COLUMNS AS columns, schemaNode
UNWIND columns AS column
MERGE (columnNode:Column { name: column[0], type: column[1], schemaId:schemaNode.name })<-[:HAS_COLUMN]-(tableNode)
// Create Primary Keys
WITH tableNode, table, table.PRIMARY_KEYS AS primaryKeys, schemaNode
UNWIND primaryKeys AS primaryKey
MATCH (tableNode)-[:HAS_COLUMN]-(pkc:Column { name: primaryKey })
MERGE (pk:PrimaryKey { table:tableNode.name, schemaId:schemaNode.name })
MERGE (pkc)<-[:PK_COLUMN]-(pk)
// Create Foreign Keys
WITH tableNode, table.FOREIGN_KEYS AS foreignKeys, schemaNode
UNWIND foreignKeys AS foreignKey
UNWIND foreignKey.FOREIGN_KEY AS fk_component
MATCH (tableNode)-[:HAS_COLUMN]-(fkc:Column { name: fk_component })
MERGE (fk:ForeignKey { table:tableNode.name, schemaId:schemaNode.name, fkName:foreignKey.FOREIGN_KEY })
MERGE (fkc)<-[:FK_COLUMN]-(fk)
WITH foreignKey, schemaNode, tableNode, fk
UNWIND foreignKey.REFERENCE_COLUMN AS fk_reference_column
MATCH (schemaNode)-[:CONTAINS_TABLE]->(:Table { name: foreignKey.REFERENCE_TABLE })-[:HAS_COLUMN]->(fkrc:Column {name: fk_reference_column})
MERGE (fkrc)<-[:FK_REFERENCE_COLUMN]-(fk)
",
{ batchSize:100, parallel: true });


// Show an example schema
:param schemaKey=>"003200_tables_oracle.sql";

MATCH (p:Schema { name: $schemaKey })
CALL apoc.path.subgraphAll(p, {
  minLevel: 0,
  maxLevel: 25
  })
  YIELD nodes, relationships
  RETURN nodes, relationships;

// Initial Foreign Key analysis
MATCH (fk:ForeignKey)
WITH fk
MATCH (fkc:Column)<-[:FK_COLUMN]-(fk)-[:FK_REFERENCE_COLUMN]->(fkrc:Column)
WITH collect( DISTINCT fkc) AS collFkc, collect( DISTINCT fkrc) AS collFkrc, fk
WITH fk, apoc.coll.sort([x IN collFkc | x.name]) AS collFkc, apoc.coll.sort([x IN collFkrc | x.name]) AS collFkrc
WITH fk, collFkc, collFkrc, apoc.coll.intersection(collFkc, collFkrc) AS intersection
WITH *, 1.0 * size(intersection) / size(collFkc) AS propExactlyShared
WHERE propExactlyShared < 1
RETURN fk, collFkc, collFkrc
LIMIT 10

// Analysis of how keys are used in different schemas, inc prevalence of columns use as PK or FK or compounds thereof and how frequently the columns appear in other columns in schemas on average

MATCH (c:Column)<-[:HAS_COLUMN]-(t:Table)<-[:CONTAINS_TABLE]-(s:Schema)
// WITH * // Testing line
// WHERE s.name="024573_Ambari-DDL-Oracle-CREATE.sql" // Testing line
OPTIONAL MATCH (c)<-[:PK_COLUMN]-(pk:PrimaryKey)
OPTIONAL MATCH (pk)-[:PK_COLUMN]->(pkco:Column)
OPTIONAL MATCH (c)<-[:FK_COLUMN]-(fk:ForeignKey)
OPTIONAL MATCH (fk)-[:FK_COLUMN]->(fkco:Column)
MATCH (co:Column { schemaId: s.name, name: c.name })
WHERE co<>c
WITH c, t, s,
fk is NOT null AS isForeignKey,
size(collect( DISTINCT fkco))>1 AS compoundForeign,
pk is NOT null AS isPrimaryKey,
size(collect( DISTINCT pkco))>1 AS compoundPrimary,
count( DISTINCT co) AS otherColumnsInTablesInSchemaWithColumnName
// WHERE isPrimaryKey=True AND compoundPrimary=False  // Testing line
WITH s,
t,
c,
CASE WHEN isPrimaryKey= true THEN 1 ELSE 0 END AS isPrimaryKey,
CASE WHEN compoundPrimary= true THEN 1 ELSE 0 END AS compoundPrimary,
CASE WHEN isForeignKey= true THEN 1 ELSE 0 END AS isForeignKey,
CASE WHEN compoundForeign= true THEN 1 ELSE 0 END AS compoundForeign,
otherColumnsInTablesInSchemaWithColumnName
// , [c,t,s] as nodes // Testing line
// LIMIT 3  // Testing line
WITH s, t, c, sum(isPrimaryKey) AS numTimesPkInTableInSchema, sum(compoundPrimary) AS numTimesCompoundPkInTableInSchema, sum(isForeignKey) AS numTimesFkInTableInSchema, sum(compoundForeign) AS numTimesCompoundFkInTableInSchema, avg(otherColumnsInTablesInSchemaWithColumnName) AS avgOtherColumnsInTablesInSchemaWithColumnName
RETURN c.name,
count( DISTINCT t) AS appearsInNumTables,
count( DISTINCT s) AS appearsAcrossNumSchema,
sum(numTimesPkInTableInSchema) AS totalTimesSeenAsPk,
sum(numTimesCompoundPkInTableInSchema) AS totalTimesSeenAsCompoundPk,
sum(numTimesFkInTableInSchema) AS totalTimesSeenAsFk,
sum(numTimesCompoundFkInTableInSchema) AS totalTimesSeenAsCompoundFk,
avg(avgOtherColumnsInTablesInSchemaWithColumnName) AS avgTimeSeenInOtherTables

// Finding schema with certain characteristics
MATCH (s:Schema)
// Find a schema which has at least 2 tables and at least one PK and one FK
WHERE (s)-->(:Table)-->(:Column)-[:FK_COLUMN]-()
 AND
(s)-->(:Table)-->(:Column)-[:PK_COLUMN]-()
 AND
count { (s)-->() } > 1
WITH s
SKIP 101
LIMIT 1
RETURN s

// Proof of concept simple string based key detection
MATCH (s:Schema)
// Find a schema which has at least 2 tables and at least one PK and one FK
WHERE (s)-->(:Table)-->(:Column)-[:FK_COLUMN]-()
      AND
      (s)-->(:Table)-->(:Column)-[:PK_COLUMN]-()
      AND
      count { (s)-->() } > 1
WITH s
SKIP 152
LIMIT 1
MATCH (s)--(t:Table)-->(c:Column)
WITH t, c, s
WHERE (c.name='id' OR c.name ENDSWITH '_id')
WITH s, t, c,
     CASE WHEN c.name ENDS WITH '_id' THEN substring(c.name, 0, size(c.name)-3) END AS potentialReferenceTableName
OPTIONAL MATCH (s)-->(rt:Table)
WHERE rt.name CONTAINS potentialReferenceTableName AND rt<>t
WITH *,
     CASE WHEN rt IS null THEN 'PrimaryKey' ELSE 'ForeignKey' END AS keyType
RETURN s.name AS schemaName, t.name AS tableName, c.name AS columnName, keyType, rt.name AS refereneTableName
	// Academic paper on Foreign Key Detection - First step includes calculating inclusions which even with SPIDER algorithm is computationally expensive and requires access to full table data.
	// https://www.semanticscholar.org/paper/A-Machine-Learning-Approach-to-Foreign-Key-Rostin-Albrecht/691f6eeabbe00451567c9facc282ce9820e9792b

	// Article on more lightweight approach - Based solely on column name analysis
	// https://www.tonic.ai/blog/foreign-key-detection

	// Data for testing efficacy of algorithms
	// https://github.com/tdoehmen/gitschemas

	// Quick pything snippet to transform download from the above from a large json map into array of maps
	//
	// import json

	// f = open('./GitSchemas_Permissive_Licenses.json')
	// dict = json.load(f)
	// array = [{'key': k, 'value': dict[k]} for k in dict]
	// with open('data.json', 'w') as f:
	// json.dump(array, f)

	// Create Indexes
	CREATE INDEX FOR (s:Schema) ON (s.filename);
	CREATE INDEX FOR (t:Table) ON (t.schemaId);
	CREATE INDEX FOR (c:Column) ON (c.schemaId);
	CREATE INDEX FOR (c:Column) ON (c.filename, c.name);

	// Cypher json load of the json array generated above
	// Wrapped in APOC periodic iterate because it seems offer better parallelisation than CALL {}
	CALL apoc.periodic.iterate(
	"
	CALL apoc.load.json('file: ///data.json')
	YIELD value AS schema
	RETURN schema
	"
	,
	"
	// Create Schema
	WITH schema
	WITH schema.key AS key, schema.value AS schema
	MERGE (schemaNode:Schema { name: key })
	SET schemaNode+= schema.INFO
	WITH schemaNode, schema
	// Create associated Tables
	WITH keys(schema.TABLES) AS tableKeys, schemaNode, schema
	UNWIND tableKeys AS tableKey
	MERGE (tableNode:Table { name: tableKey, schemaId: schemaNode.name })<-[:CONTAINS_TABLE]-(schemaNode)
	// Create linked Columns
	WITH schema.TABLES[tableKey] AS table, tableNode, schemaNode
	WITH tableNode, table, table.COLUMNS AS columns, schemaNode
	UNWIND columns AS column
	MERGE (columnNode:Column { name: column[0], type: column[1], schemaId:schemaNode.name })<-[:HAS_COLUMN]-(tableNode)
	// Create Primary Keys
	WITH tableNode, table, table.PRIMARY_KEYS AS primaryKeys, schemaNode
	UNWIND primaryKeys AS primaryKey
	MATCH (tableNode)-[:HAS_COLUMN]-(pkc:Column { name: primaryKey })
	MERGE (pk:PrimaryKey { table:tableNode.name, schemaId:schemaNode.name })
	MERGE (pkc)<-[:PK_COLUMN]-(pk)
	// Create Foreign Keys
	WITH tableNode, table.FOREIGN_KEYS AS foreignKeys, schemaNode
	UNWIND foreignKeys AS foreignKey
	UNWIND foreignKey.FOREIGN_KEY AS fk_component
	MATCH (tableNode)-[:HAS_COLUMN]-(fkc:Column { name: fk_component })
	MERGE (fk:ForeignKey { table:tableNode.name, schemaId:schemaNode.name, fkName:foreignKey.FOREIGN_KEY })
	MERGE (fkc)<-[:FK_COLUMN]-(fk)
	WITH foreignKey, schemaNode, tableNode, fk
	UNWIND foreignKey.REFERENCE_COLUMN AS fk_reference_column
	MATCH (schemaNode)-[:CONTAINS_TABLE]->(:Table { name: foreignKey.REFERENCE_TABLE })-[:HAS_COLUMN]->(fkrc:Column {name: fk_reference_column})
	MERGE (fkrc)<-[:FK_REFERENCE_COLUMN]-(fk)
	",
	{ batchSize:100, parallel: true });


	// Show an example schema
	:param schemaKey=>"003200_tables_oracle.sql";

	MATCH (p:Schema { name: $schemaKey })
	CALL apoc.path.subgraphAll(p, {
	minLevel: 0,
	maxLevel: 25
	})
	YIELD nodes, relationships
	RETURN nodes, relationships;

	// Initial Foreign Key analysis
	MATCH (fk:ForeignKey)
	WITH fk
	MATCH (fkc:Column)<-[:FK_COLUMN]-(fk)-[:FK_REFERENCE_COLUMN]->(fkrc:Column)
	WITH collect( DISTINCT fkc) AS collFkc, collect( DISTINCT fkrc) AS collFkrc, fk
	WITH fk, apoc.coll.sort([x IN collFkc \| x.name]) AS collFkc, apoc.coll.sort([x IN collFkrc \| x.name]) AS collFkrc
	WITH fk, collFkc, collFkrc, apoc.coll.intersection(collFkc, collFkrc) AS intersection
	WITH , 1.0 size(intersection) / size(collFkc) AS propExactlyShared
	WHERE propExactlyShared < 1
	RETURN fk, collFkc, collFkrc
	LIMIT 10

	// Analysis of how keys are used in different schemas, inc prevalence of columns use as PK or FK or compounds thereof and how frequently the columns appear in other columns in schemas on average

	MATCH (c:Column)<-[:HAS_COLUMN]-(t:Table)<-[:CONTAINS_TABLE]-(s:Schema)
	// WITH * // Testing line
	// WHERE s.name="024573_Ambari-DDL-Oracle-CREATE.sql" // Testing line
	OPTIONAL MATCH (c)<-[:PK_COLUMN]-(pk:PrimaryKey)
	OPTIONAL MATCH (pk)-[:PK_COLUMN]->(pkco:Column)
	OPTIONAL MATCH (c)<-[:FK_COLUMN]-(fk:ForeignKey)
	OPTIONAL MATCH (fk)-[:FK_COLUMN]->(fkco:Column)
	MATCH (co:Column { schemaId: s.name, name: c.name })
	WHERE co<>c
	WITH c, t, s,
	fk is NOT null AS isForeignKey,
	size(collect( DISTINCT fkco))>1 AS compoundForeign,
	pk is NOT null AS isPrimaryKey,
	size(collect( DISTINCT pkco))>1 AS compoundPrimary,
	count( DISTINCT co) AS otherColumnsInTablesInSchemaWithColumnName
	// WHERE isPrimaryKey=True AND compoundPrimary=False // Testing line
	WITH s,
	t,
	c,
	CASE WHEN isPrimaryKey= true THEN 1 ELSE 0 END AS isPrimaryKey,
	CASE WHEN compoundPrimary= true THEN 1 ELSE 0 END AS compoundPrimary,
	CASE WHEN isForeignKey= true THEN 1 ELSE 0 END AS isForeignKey,
	CASE WHEN compoundForeign= true THEN 1 ELSE 0 END AS compoundForeign,
	otherColumnsInTablesInSchemaWithColumnName
	// , [c,t,s] as nodes // Testing line
	// LIMIT 3 // Testing line
	WITH s, t, c, sum(isPrimaryKey) AS numTimesPkInTableInSchema, sum(compoundPrimary) AS numTimesCompoundPkInTableInSchema, sum(isForeignKey) AS numTimesFkInTableInSchema, sum(compoundForeign) AS numTimesCompoundFkInTableInSchema, avg(otherColumnsInTablesInSchemaWithColumnName) AS avgOtherColumnsInTablesInSchemaWithColumnName
	RETURN c.name,
	count( DISTINCT t) AS appearsInNumTables,
	count( DISTINCT s) AS appearsAcrossNumSchema,
	sum(numTimesPkInTableInSchema) AS totalTimesSeenAsPk,
	sum(numTimesCompoundPkInTableInSchema) AS totalTimesSeenAsCompoundPk,
	sum(numTimesFkInTableInSchema) AS totalTimesSeenAsFk,
	sum(numTimesCompoundFkInTableInSchema) AS totalTimesSeenAsCompoundFk,
	avg(avgOtherColumnsInTablesInSchemaWithColumnName) AS avgTimeSeenInOtherTables

	// Finding schema with certain characteristics
	MATCH (s:Schema)
	// Find a schema which has at least 2 tables and at least one PK and one FK
	WHERE (s)-->(:Table)-->(:Column)-[:FK_COLUMN]-()
	AND
	(s)-->(:Table)-->(:Column)-[:PK_COLUMN]-()
	AND
	count { (s)-->() } > 1
	WITH s
	SKIP 101
	LIMIT 1
	RETURN s

	// Proof of concept simple string based key detection
	MATCH (s:Schema)
	// Find a schema which has at least 2 tables and at least one PK and one FK
	WHERE (s)-->(:Table)-->(:Column)-[:FK_COLUMN]-()
	AND
	(s)-->(:Table)-->(:Column)-[:PK_COLUMN]-()
	AND
	count { (s)-->() } > 1
	WITH s
	SKIP 152
	LIMIT 1
	MATCH (s)--(t:Table)-->(c:Column)
	WITH t, c, s
	WHERE (c.name='id' OR c.name ENDSWITH '_id')
	WITH s, t, c,
	CASE WHEN c.name ENDS WITH '_id' THEN substring(c.name, 0, size(c.name)-3) END AS potentialReferenceTableName
	OPTIONAL MATCH (s)-->(rt:Table)
	WHERE rt.name CONTAINS potentialReferenceTableName AND rt<>t
	WITH *,
	CASE WHEN rt IS null THEN 'PrimaryKey' ELSE 'ForeignKey' END AS keyType
	RETURN s.name AS schemaName, t.name AS tableName, c.name AS columnName, keyType, rt.name AS refereneTableName