Skip to content

Instantly share code, notes, and snippets.

@CJHwong
Last active March 27, 2024 17:22
Show Gist options
  • Save CJHwong/6b7ba77512ad321e8b4f16d97f7f2d44 to your computer and use it in GitHub Desktop.
Save CJHwong/6b7ba77512ad321e8b4f16d97f7f2d44 to your computer and use it in GitHub Desktop.
MySQL Schema Diagram Generator
#!/usr/bin/awk -f
# Authors: @esperlu, @artemyk, @gkuenning, @dumblob
# FIXME detect empty input file and issue a warning
function printerr( s ){ print s | "cat >&2" }
BEGIN {
if( ARGC != 2 ){
printerr( \
"USAGE:\n"\
" mysql2sqlite dump_mysql.sql > dump_sqlite3.sql\n" \
" OR\n" \
" mysql2sqlite dump_mysql.sql | sqlite3 sqlite.db\n" \
"\n" \
"NOTES:\n" \
" Dash in filename is not supported, because dash (-) means stdin." )
no_END = 1
exit 1
}
# Find INT_MAX supported by both this AWK (usually an ISO C signed int)
# and SQlite.
# On non-8bit-based architectures, the additional bits are safely ignored.
# 8bit (lower precision should not exist)
s="127"
# "63" + 0 avoids potential parser misbehavior
if( (s + 0) "" == s ){ INT_MAX_HALF = "63" + 0 }
# 16bit
s="32767"
if( (s + 0) "" == s ){ INT_MAX_HALF = "16383" + 0 }
# 32bit
s="2147483647"
if( (s + 0) "" == s ){ INT_MAX_HALF = "1073741823" + 0 }
# 64bit (as INTEGER in SQlite3)
s="9223372036854775807"
if( (s + 0) "" == s ){ INT_MAX_HALF = "4611686018427387904" + 0 }
# # 128bit
# s="170141183460469231731687303715884105728"
# if( (s + 0) "" == s ){ INT_MAX_HALF = "85070591730234615865843651857942052864" + 0 }
# # 256bit
# s="57896044618658097711785492504343953926634992332820282019728792003956564819968"
# if( (s + 0) "" == s ){ INT_MAX_HALF = "28948022309329048855892746252171976963317496166410141009864396001978282409984" + 0 }
# # 512bit
# s="6703903964971298549787012499102923063739682910296196688861780721860882015036773488400937149083451713845015929093243025426876941405973284973216824503042048"
# if( (s + 0) "" == s ){ INT_MAX_HALF = "3351951982485649274893506249551461531869841455148098344430890360930441007518386744200468574541725856922507964546621512713438470702986642486608412251521024" + 0 }
# # 1024bit
# s="89884656743115795386465259539451236680898848947115328636715040578866337902750481566354238661203768010560056939935696678829394884407208311246423715319737062188883946712432742638151109800623047059726541476042502884419075341171231440736956555270413618581675255342293149119973622969239858152417678164812112068608"
# if( (s + 0) "" == s ){ INT_MAX_HALF = "44942328371557897693232629769725618340449424473557664318357520289433168951375240783177119330601884005280028469967848339414697442203604155623211857659868531094441973356216371319075554900311523529863270738021251442209537670585615720368478277635206809290837627671146574559986811484619929076208839082406056034304" + 0 }
# # higher precision probably not needed
FS=",$"
print "PRAGMA synchronous = OFF;"
print "PRAGMA journal_mode = MEMORY;"
print "BEGIN TRANSACTION;"
}
# historically 3 spaces separate non-argument local variables
function bit_to_int( str_bit, powtwo, i, res, bit, overflow ){
powtwo = 1
overflow = 0
# 011101 = 1*2^0 + 0*2^1 + 1*2^2 ...
for( i = length( str_bit ); i > 0; --i ){
bit = substr( str_bit, i, 1 )
if( overflow || ( bit == 1 && res > INT_MAX_HALF ) ){
printerr( \
NR ": WARN Bit field overflow, number truncated (LSBs saved, MSBs ignored)." )
break
}
res = res + bit * powtwo
# no warning here as it might be the last iteration
if( powtwo > INT_MAX_HALF ){ overflow = 1; continue }
powtwo = powtwo * 2
}
return res
}
# CREATE TRIGGER statements have funny commenting. Remember we are in trigger.
/^\/\*.*(CREATE.*TRIGGER|create.*trigger)/ {
gsub( /^.*(TRIGGER|trigger)/, "CREATE TRIGGER" )
print
inTrigger = 1
next
}
# The end of CREATE TRIGGER has a stray comment terminator
/(END|end) \*\/;;/ { gsub( /\*\//, "" ); print; inTrigger = 0; next }
# The rest of triggers just get passed through
inTrigger != 0 { print; next }
# CREATE VIEW looks like a TABLE in comments
/^\/\*.*(CREATE.*TABLE|create.*table)/ {
inView = 1
next
}
# end of CREATE VIEW
/^(\).*(ENGINE|engine).*\*\/;)/ {
inView = 0
next
}
# content of CREATE VIEW
inView != 0 { next }
# skip comments
/^\/\*/ { next }
# skip PARTITION statements
/^ *[(]?(PARTITION|partition) +[^ ]+/ { next }
# print all INSERT lines
( /^ *\(/ && /\) *[,;] *$/ ) || /^(INSERT|insert|REPLACE|replace)/ {
prev = ""
# first replace \\ by \_ that mysqldump never generates to deal with
# sequnces like \\n that should be translated into \n, not \<LF>.
# After we convert all escapes we replace \_ by backslashes.
gsub( /\\\\/, "\\_" )
# single quotes are escaped by another single quote
gsub( /\\'/, "''" )
gsub( /\\n/, "\n" )
gsub( /\\r/, "\r" )
gsub( /\\"/, "\"" )
gsub( /\\\032/, "\032" ) # substitute char
gsub( /\\_/, "\\" )
# sqlite3 is limited to 16 significant digits of precision
while( match( $0, /0x[0-9a-fA-F]{17}/ ) ){
hexIssue = 1
sub( /0x[0-9a-fA-F]+/, substr( $0, RSTART, RLENGTH-1 ), $0 )
}
if( hexIssue ){
printerr( \
NR ": WARN Hex number trimmed (length longer than 16 chars)." )
hexIssue = 0
}
print
next
}
# CREATE DATABASE is not supported
/^(CREATE DATABASE|create database)/ { next }
# print the CREATE line as is and capture the table name
/^(CREATE|create)/ {
if( $0 ~ /IF NOT EXISTS|if not exists/ || $0 ~ /TEMPORARY|temporary/ ){
caseIssue = 1
printerr( \
NR ": WARN Potential case sensitivity issues with table/column naming\n" \
" (see INFO at the end)." )
}
if( match( $0, /`[^`]+/ ) ){
tableName = substr( $0, RSTART+1, RLENGTH-1 )
}
aInc = 0
prev = ""
firstInTable = 1
print
next
}
# Replace `FULLTEXT KEY` (probably other `XXXXX KEY`)
/^ (FULLTEXT KEY|fulltext key)/ { gsub( /[A-Za-z ]+(KEY|key)/, " KEY" ) }
# Get rid of field lengths in KEY lines
/ (PRIMARY |primary )?(KEY|key)/ { gsub( /\([0-9]+\)/, "" ) }
aInc == 1 && /PRIMARY KEY|primary key/ { next }
# Replace COLLATE xxx_xxxx_xx statements with COLLATE BINARY
/ (COLLATE|collate) [a-z0-9_]*/ { gsub( /(COLLATE|collate) [a-z0-9_]*/, "COLLATE BINARY" ) }
# Print all fields definition lines except the `KEY` lines.
/^ / && !/^( (KEY|key)|\);)/ {
if( match( $0, /[^"`]AUTO_INCREMENT|auto_increment[^"`]/) ){
aInc = 1
gsub( /AUTO_INCREMENT|auto_increment/, "PRIMARY KEY AUTOINCREMENT" )
}
gsub( /(UNIQUE KEY|unique key) (`.*`|".*") /, "UNIQUE " )
gsub( /(CHARACTER SET|character set) [^ ]+[ ,]/, "" )
# FIXME
# CREATE TRIGGER [UpdateLastTime]
# AFTER UPDATE
# ON Package
# FOR EACH ROW
# BEGIN
# UPDATE Package SET LastUpdate = CURRENT_TIMESTAMP WHERE ActionId = old.ActionId;
# END
gsub( /(ON|on) (UPDATE|update) (CURRENT_TIMESTAMP|current_timestamp)(\(\))?/, "" )
gsub( /(DEFAULT|default) (CURRENT_TIMESTAMP|current_timestamp)(\(\))?/, "DEFAULT current_timestamp")
gsub( /(COLLATE|collate) [^ ]+ /, "" )
gsub( /(ENUM|enum)[^)]+\)/, "text " )
gsub( /(SET|set)\([^)]+\)/, "text " )
gsub( /UNSIGNED|unsigned/, "" )
gsub( /_utf8mb3/, "" )
gsub( /` [^ ]*(INT|int|BIT|bit)[^ ]*/, "` integer" )
gsub( /" [^ ]*(INT|int|BIT|bit)[^ ]*/, "\" integer" )
ere_bit_field = "[bB]'[10]+'"
if( match($0, ere_bit_field) ){
sub( ere_bit_field, bit_to_int( substr( $0, RSTART +2, RLENGTH -2 -1 ) ) )
}
# remove USING BTREE and other suffixes for USING, for example: "UNIQUE KEY
# `hostname_domain` (`hostname`,`domain`) USING BTREE,"
gsub( / USING [^, ]+/, "" )
# field comments are not supported
gsub( / (COMMENT|comment).+$/, "" )
# Get commas off end of line
gsub( /,.?$/, "" )
if( prev ){
if( firstInTable ){
print prev
firstInTable = 0
}
else {
print "," prev
}
}
else {
# FIXME check if this is correct in all cases
if( match( $1,
/(CONSTRAINT|constraint) ["].*["] (FOREIGN KEY|foreign key)/ ) ){
print ","
}
}
prev = $1
}
/ ENGINE| engine/ {
if( prev ){
if( firstInTable ){
print prev
firstInTable = 0
}
else {
print "," prev
}
}
prev=""
print ");"
next
}
# `KEY` lines are extracted from the `CREATE` block and stored in array for later print
# in a separate `CREATE KEY` command. The index name is prefixed by the table name to
# avoid a sqlite error for duplicate index name.
/^( (KEY|key)|\);)/ {
if( prev ){
if( firstInTable ){
print prev
firstInTable = 0
}
else {
print "," prev
}
}
prev = ""
if( $0 == ");" ){
print
}
else {
if( match( $0, /`[^`]+/ ) ){
indexName = substr( $0, RSTART+1, RLENGTH-1 )
}
if( match( $0, /\([^()]+/ ) ){
indexKey = substr( $0, RSTART+1, RLENGTH-1 )
}
# idx_ prefix to avoid name clashes (they really happen!)
key[tableName] = key[tableName] "CREATE INDEX \"idx_" \
tableName "_" indexName "\" ON \"" tableName "\" (" indexKey ");\n"
}
}
END {
if( no_END ){ exit 1}
# print all KEY creation lines.
for( table in key ){ printf key[table] }
print "END TRANSACTION;"
if( caseIssue ){
printerr( \
"INFO Pure sqlite identifiers are case insensitive (even if quoted\n" \
" or if ASCII) and doesnt cross-check TABLE and TEMPORARY TABLE\n" \
" identifiers. Thus expect errors like \"table T has no column named F\".")
}
}
-- We start a GraphViz graph
SELECT '
digraph structs {
'
UNION ALL
-- Normally, GraphViz' "dot" command lays out a hierarchical graph from
-- top to bottom. However, we aren't just laying out individual nodes,
-- each node is a vertical list of database fields. To prevent GraphViz
-- from snaking arrows all over the place, we constrain it to draw
-- incoming references on the left of each field, and outgoing references
-- on the right. Since that's the way references flow for each database
-- table, we tell GraphViz to lay the whole graph out left-to-right,
-- which makes its job much easier and produces prettier output.
SELECT '
rankdir="LR"
'
UNION ALL
-- By default, nodes have circles around them. We will draw our own
-- tables below, we do not want the circles.
SELECT '
node [shape=none]
'
UNION ALL
-- This is the big query that renders a node complete with field names
-- for each table in the database. Because we want raw GraphViz output,
-- our query returns rows with a single string field, whose value is a
-- complex calculation using SQL as a templating engine. This is kind
-- of an abuse, but works nicely nevertheless.
SELECT
CASE
-- When the previous row's table name is the same as this one,
-- do nothing.
WHEN LAG(t.name, 1) OVER (ORDER BY t.name) = t.name THEN ''
-- Otherwise, this is the first row of a new table, so start
-- the node markup and add a header row. Normally in GraphViz,
-- the table name would *be* the label of the node, but since
-- we're using the label to represent the entire node, we have
-- to make our own header.
--
-- GraphViz does have a "record" label shape, but it seems tricky
-- to work with and I found the HTML-style label markup easier
-- to get working the way I wanted.
ELSE
t.name || ' [label=<
<TABLE BORDER="0" CELLSPACING="0" CELLBORDER="1">
<TR>
<TD COLSPAN="2"><B>' || t.name || '</B></TD>
</TR>
'
-- After the header (if needed), we have rows for each field in
-- the table.
--
-- The "pk" metadata field is zero for table fields that are not part
-- of the primary key. If the "pk" metadata field is 1 or more, that
-- tells you that table field's order in the (potentially composite)
-- primary key.
--
-- We also add ports to each of the table cells, so that we can
-- later tell GraphViz to specifically connect the ports representing
-- specific fields in each table, instead of connecting the tables
-- generally.
END || '
<TR>
<TD PORT="' || i.name || '_to">' ||
CASE i.pk WHEN 0 THEN '&nbsp;' ELSE '🔑' END ||
'</TD>
<TD PORT="' || i.name || '_from">' || i.name || '</TD>
</TR>
' ||
CASE
-- When the next row's table name is the same as this one,
-- do nothing.
WHEN LEAD(t.name, 1) OVER (ORDER BY t.name) = t.name THEN ''
-- Otherwise, this is the last row of a database table, so end
-- the table markup.
ELSE '
</TABLE>
>];
'
END
-- This is how you get nice relational data out of SQLite's metadata
-- pragmas.
FROM pragma_table_list() AS t
JOIN pragma_table_info(t.name, t.schema) AS i
WHERE
-- SQLite has a bunch of metadata tables in each schema, which
-- are hidden from .tables and .schema but which are reported
-- in pragma_table_list(). They're not user-created and almost
-- certainly user databases don't have foreign keys to them, so
-- let's just filter them out.
t.name NOT LIKE 'sqlite_%'
-- Despite its name, pragma_table_list() also includes views.
-- Since those don't store any information or have any correctness
-- constraints, they're just distracting if you're trying to quickly
-- understand a database's schema, so we'll filter them out too.
AND t.type = 'table'
UNION ALL
-- Now we have all the database tables set up, we can draw the links
-- between them. SQLite gives us the pragma_foreign_key_list() function
-- which (for a given source table) lists all the source fields that are
-- part of a foreign key reference, the target table they refer to, and
-- (if it was created with "REFERENCES table_name(column_name)" syntax,
-- the target column names too. Unfortunately, if the reference was
-- created with "REFERENCES table_name" syntax, the pragma does *not*
-- figure out what the corresponding target fields are, so we'll also need
-- pragma_table_info() to look up the primary key(s) for the target table.
--
-- Once we have everything we need, we just do a bit more string
-- concatenation to build up the GraphViz syntax equivalent.
--
-- Note that we use the ports we defined above, as well as the directional
-- overrides :e and :w, to force GraphViz to give us a layout that's
-- likely to be readable.
SELECT
-- We left-join every foreign key field against pragma_table_info
-- looking for primary keys, and the target table may have a composite
-- primary key even if the foreign key does not reference the primary
-- key, so we may wind up with multiple results describing the same
-- foreign key reference. DISTINCT makes sure we only describe each
-- reference once.
DISTINCT
t.name || ':' || f."from" || '_from:e -> ' ||
-- If the constraint was created with "REFERENCES
-- table_name(column_name)", then f.to will contain 'column_name'.
-- Otherwise, f.to is NULL, and we need to grab the corresponding
-- field from the primary key in i.name.
f."table" || ':' || COALESCE(f."to", i.name) || '_to:w'
FROM pragma_table_list() AS t
JOIN pragma_foreign_key_list(t.name, t.schema) AS f
-- We look up all the fields in the target table, just in case
-- pragma_foreign_key_list() doesn't tell us what the target field
-- name is. SQLite doesn't allow foreign-key references to cross
-- schemas, so it's OK to use the source table's schema name to look
-- up the target table.
--
-- Strictly speaking, we shouldn't need to LEFT JOIN here, a basic
-- JOIN should do. This works around a bug in SQLite 3.16.0 to
-- version 3.45.1: https://sqlite.org/forum/forumpost/b1656fcb39
LEFT JOIN pragma_table_info(f."table", t.schema) AS i
-- f.seq represents the order of fields in a source table's composite foreign key
-- reference, starting at 0. In "FOREIGN KEY (a, b)", "a" would have
-- seq=0 and "b" would have seq=1. i.pk represents the order of fields
-- in a primary key, where "0" means "not part of the primary key".
-- In "PRIMARY KEY (a, b)", "a" would have pk=1 and "b" would have pk=2.
-- For a foreign key reference that specifies the target field name,
-- none of this matters, but if the target field name is missing, then
-- this makes sure that each field of the foreign key reference is joined
-- with the corresponding primary key field of the target table.
WHERE f.seq + 1 = i.pk
UNION ALL
-- Lastly, we close the GraphViz graph.
SELECT '
}';
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment