Last active
July 13, 2023 12:32
-
-
Save gregoryking/d59ff1b6c1b433b505c61b7bf246dc5c to your computer and use it in GitHub Desktop.
Cypher based PK / FK detection - tested on GitScehma dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Academic paper on Foreign Key Detection - First step includes calculating inclusions which even with SPIDER algorithm is computationally expensive and requires access to full table data. | |
// https://www.semanticscholar.org/paper/A-Machine-Learning-Approach-to-Foreign-Key-Rostin-Albrecht/691f6eeabbe00451567c9facc282ce9820e9792b | |
// Article on more lightweight approach - Based solely on column name analysis | |
// https://www.tonic.ai/blog/foreign-key-detection | |
// Data for testing efficacy of algorithms | |
// https://github.com/tdoehmen/gitschemas | |
// Quick pything snippet to transform download from the above from a large json map into array of maps | |
// | |
// import json | |
// f = open('./GitSchemas_Permissive_Licenses.json') | |
// dict = json.load(f) | |
// array = [{'key': k, 'value': dict[k]} for k in dict] | |
// with open('data.json', 'w') as f: | |
// json.dump(array, f) | |
// Create Indexes | |
CREATE INDEX FOR (s:Schema) ON (s.filename); | |
CREATE INDEX FOR (t:Table) ON (t.schemaId); | |
CREATE INDEX FOR (c:Column) ON (c.schemaId); | |
CREATE INDEX FOR (c:Column) ON (c.filename, c.name); | |
// Cypher json load of the json array generated above | |
// Wrapped in APOC periodic iterate because it seems offer better parallelisation than CALL {} | |
CALL apoc.periodic.iterate( | |
" | |
CALL apoc.load.json('file: ///data.json') | |
YIELD value AS schema | |
RETURN schema | |
" | |
, | |
" | |
// Create Schema | |
WITH schema | |
WITH schema.key AS key, schema.value AS schema | |
MERGE (schemaNode:Schema { name: key }) | |
SET schemaNode+= schema.INFO | |
WITH schemaNode, schema | |
// Create associated Tables | |
WITH keys(schema.TABLES) AS tableKeys, schemaNode, schema | |
UNWIND tableKeys AS tableKey | |
MERGE (tableNode:Table { name: tableKey, schemaId: schemaNode.name })<-[:CONTAINS_TABLE]-(schemaNode) | |
// Create linked Columns | |
WITH schema.TABLES[tableKey] AS table, tableNode, schemaNode | |
WITH tableNode, table, table.COLUMNS AS columns, schemaNode | |
UNWIND columns AS column | |
MERGE (columnNode:Column { name: column[0], type: column[1], schemaId:schemaNode.name })<-[:HAS_COLUMN]-(tableNode) | |
// Create Primary Keys | |
WITH tableNode, table, table.PRIMARY_KEYS AS primaryKeys, schemaNode | |
UNWIND primaryKeys AS primaryKey | |
MATCH (tableNode)-[:HAS_COLUMN]-(pkc:Column { name: primaryKey }) | |
MERGE (pk:PrimaryKey { table:tableNode.name, schemaId:schemaNode.name }) | |
MERGE (pkc)<-[:PK_COLUMN]-(pk) | |
// Create Foreign Keys | |
WITH tableNode, table.FOREIGN_KEYS AS foreignKeys, schemaNode | |
UNWIND foreignKeys AS foreignKey | |
UNWIND foreignKey.FOREIGN_KEY AS fk_component | |
MATCH (tableNode)-[:HAS_COLUMN]-(fkc:Column { name: fk_component }) | |
MERGE (fk:ForeignKey { table:tableNode.name, schemaId:schemaNode.name, fkName:foreignKey.FOREIGN_KEY }) | |
MERGE (fkc)<-[:FK_COLUMN]-(fk) | |
WITH foreignKey, schemaNode, tableNode, fk | |
UNWIND foreignKey.REFERENCE_COLUMN AS fk_reference_column | |
MATCH (schemaNode)-[:CONTAINS_TABLE]->(:Table { name: foreignKey.REFERENCE_TABLE })-[:HAS_COLUMN]->(fkrc:Column {name: fk_reference_column}) | |
MERGE (fkrc)<-[:FK_REFERENCE_COLUMN]-(fk) | |
", | |
{ batchSize:100, parallel: true }); | |
// Show an example schema | |
:param schemaKey=>"003200_tables_oracle.sql"; | |
MATCH (p:Schema { name: $schemaKey }) | |
CALL apoc.path.subgraphAll(p, { | |
minLevel: 0, | |
maxLevel: 25 | |
}) | |
YIELD nodes, relationships | |
RETURN nodes, relationships; | |
// Initial Foreign Key analysis | |
MATCH (fk:ForeignKey) | |
WITH fk | |
MATCH (fkc:Column)<-[:FK_COLUMN]-(fk)-[:FK_REFERENCE_COLUMN]->(fkrc:Column) | |
WITH collect( DISTINCT fkc) AS collFkc, collect( DISTINCT fkrc) AS collFkrc, fk | |
WITH fk, apoc.coll.sort([x IN collFkc | x.name]) AS collFkc, apoc.coll.sort([x IN collFkrc | x.name]) AS collFkrc | |
WITH fk, collFkc, collFkrc, apoc.coll.intersection(collFkc, collFkrc) AS intersection | |
WITH *, 1.0 * size(intersection) / size(collFkc) AS propExactlyShared | |
WHERE propExactlyShared < 1 | |
RETURN fk, collFkc, collFkrc | |
LIMIT 10 | |
// Analysis of how keys are used in different schemas, inc prevalence of columns use as PK or FK or compounds thereof and how frequently the columns appear in other columns in schemas on average | |
MATCH (c:Column)<-[:HAS_COLUMN]-(t:Table)<-[:CONTAINS_TABLE]-(s:Schema) | |
// WITH * // Testing line | |
// WHERE s.name="024573_Ambari-DDL-Oracle-CREATE.sql" // Testing line | |
OPTIONAL MATCH (c)<-[:PK_COLUMN]-(pk:PrimaryKey) | |
OPTIONAL MATCH (pk)-[:PK_COLUMN]->(pkco:Column) | |
OPTIONAL MATCH (c)<-[:FK_COLUMN]-(fk:ForeignKey) | |
OPTIONAL MATCH (fk)-[:FK_COLUMN]->(fkco:Column) | |
MATCH (co:Column { schemaId: s.name, name: c.name }) | |
WHERE co<>c | |
WITH c, t, s, | |
fk is NOT null AS isForeignKey, | |
size(collect( DISTINCT fkco))>1 AS compoundForeign, | |
pk is NOT null AS isPrimaryKey, | |
size(collect( DISTINCT pkco))>1 AS compoundPrimary, | |
count( DISTINCT co) AS otherColumnsInTablesInSchemaWithColumnName | |
// WHERE isPrimaryKey=True AND compoundPrimary=False // Testing line | |
WITH s, | |
t, | |
c, | |
CASE WHEN isPrimaryKey= true THEN 1 ELSE 0 END AS isPrimaryKey, | |
CASE WHEN compoundPrimary= true THEN 1 ELSE 0 END AS compoundPrimary, | |
CASE WHEN isForeignKey= true THEN 1 ELSE 0 END AS isForeignKey, | |
CASE WHEN compoundForeign= true THEN 1 ELSE 0 END AS compoundForeign, | |
otherColumnsInTablesInSchemaWithColumnName | |
// , [c,t,s] as nodes // Testing line | |
// LIMIT 3 // Testing line | |
WITH s, t, c, sum(isPrimaryKey) AS numTimesPkInTableInSchema, sum(compoundPrimary) AS numTimesCompoundPkInTableInSchema, sum(isForeignKey) AS numTimesFkInTableInSchema, sum(compoundForeign) AS numTimesCompoundFkInTableInSchema, avg(otherColumnsInTablesInSchemaWithColumnName) AS avgOtherColumnsInTablesInSchemaWithColumnName | |
RETURN c.name, | |
count( DISTINCT t) AS appearsInNumTables, | |
count( DISTINCT s) AS appearsAcrossNumSchema, | |
sum(numTimesPkInTableInSchema) AS totalTimesSeenAsPk, | |
sum(numTimesCompoundPkInTableInSchema) AS totalTimesSeenAsCompoundPk, | |
sum(numTimesFkInTableInSchema) AS totalTimesSeenAsFk, | |
sum(numTimesCompoundFkInTableInSchema) AS totalTimesSeenAsCompoundFk, | |
avg(avgOtherColumnsInTablesInSchemaWithColumnName) AS avgTimeSeenInOtherTables | |
// Finding schema with certain characteristics | |
MATCH (s:Schema) | |
// Find a schema which has at least 2 tables and at least one PK and one FK | |
WHERE (s)-->(:Table)-->(:Column)-[:FK_COLUMN]-() | |
AND | |
(s)-->(:Table)-->(:Column)-[:PK_COLUMN]-() | |
AND | |
count { (s)-->() } > 1 | |
WITH s | |
SKIP 101 | |
LIMIT 1 | |
RETURN s | |
// Proof of concept simple string based key detection | |
MATCH (s:Schema) | |
// Find a schema which has at least 2 tables and at least one PK and one FK | |
WHERE (s)-->(:Table)-->(:Column)-[:FK_COLUMN]-() | |
AND | |
(s)-->(:Table)-->(:Column)-[:PK_COLUMN]-() | |
AND | |
count { (s)-->() } > 1 | |
WITH s | |
SKIP 152 | |
LIMIT 1 | |
MATCH (s)--(t:Table)-->(c:Column) | |
WITH t, c, s | |
WHERE (c.name='id' OR c.name ENDSWITH '_id') | |
WITH s, t, c, | |
CASE WHEN c.name ENDS WITH '_id' THEN substring(c.name, 0, size(c.name)-3) END AS potentialReferenceTableName | |
OPTIONAL MATCH (s)-->(rt:Table) | |
WHERE rt.name CONTAINS potentialReferenceTableName AND rt<>t | |
WITH *, | |
CASE WHEN rt IS null THEN 'PrimaryKey' ELSE 'ForeignKey' END AS keyType | |
RETURN s.name AS schemaName, t.name AS tableName, c.name AS columnName, keyType, rt.name AS refereneTableName |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment