Created
April 27, 2015 19:16
-
-
Save mauricioszabo/b240835453b83e6b9cd9 to your computer and use it in GitHub Desktop.
Converter from mysql to pg
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Fixes a MySQL dump made with the right format so it can be directly | |
imported to a new PostgreSQL database. | |
Dump using: | |
mysqldump --compatible=postgresql --default-character-set=utf8 -r databasename.mysql -u root databasename | |
""" | |
import re | |
import sys | |
import os | |
import time | |
import subprocess | |
def parse(input_filename, output_filename): | |
"Feed it a file, and it'll output a fixed one" | |
# State storage | |
if input_filename == "-": | |
num_lines = -1 | |
else: | |
num_lines = int(subprocess.check_output(["wc", "-l", input_filename]).strip().split()[0]) | |
tables = {} | |
current_table = None | |
creation_lines = [] | |
enum_types = [] | |
foreign_key_lines = [] | |
fulltext_key_lines = [] | |
sequence_lines = [] | |
cast_lines = [] | |
num_inserts = 0 | |
started = time.time() | |
# Open output file and write header. Logging file handle will be stdout | |
# unless we're writing output to stdout, in which case NO PROGRESS FOR YOU. | |
if output_filename == "-": | |
output = sys.stdout | |
logging = open(os.devnull, "w") | |
else: | |
output = open(output_filename, "w") | |
logging = sys.stdout | |
if input_filename == "-": | |
input_fh = sys.stdin | |
else: | |
input_fh = open(input_filename) | |
output.write("-- Converted by db_converter\n") | |
output.write("START TRANSACTION;\n") | |
output.write("SET standard_conforming_strings=off;\n") | |
output.write("SET escape_string_warning=off;\n") | |
output.write("SET CONSTRAINTS ALL DEFERRED;\n\n") | |
for i, line in enumerate(input_fh): | |
time_taken = time.time() - started | |
percentage_done = (i+1) / float(num_lines) | |
secs_left = (time_taken / percentage_done) - time_taken | |
logging.write("\rLine %i (of %s: %.2f%%) [%s tables] [%s inserts] [ETA: %i min %i sec]" % ( | |
i + 1, | |
num_lines, | |
((i+1)/float(num_lines))*100, | |
len(tables), | |
num_inserts, | |
secs_left // 60, | |
secs_left % 60, | |
)) | |
logging.flush() | |
line = line.decode("utf8").strip().replace(r"\\", "WUBWUBREALSLASHWUB").replace(r"\'", "''").replace("WUBWUBREALSLASHWUB", r"\\") | |
# Ignore comment lines | |
if line.startswith("--") or line.startswith("/*") or line.startswith("LOCK TABLES") or line.startswith("DROP TABLE") or line.startswith("UNLOCK TABLES") or not line: | |
continue | |
# Outside of anything handling | |
if current_table is None: | |
# Start of a table creation statement? | |
if line.startswith("CREATE TABLE"): | |
current_table = line.split('"')[1] | |
tables[current_table] = {"columns": []} | |
creation_lines = [] | |
# Inserting data into a table? | |
elif line.startswith("INSERT INTO"): | |
output.write(line.encode("utf8").replace("'0000-00-00 00:00:00'", "NULL") + "\n") | |
num_inserts += 1 | |
# ??? | |
else: | |
print("\n ! Unknown line in main body: %s" % line) | |
# Inside-create-statement handling | |
else: | |
# Is it a column? | |
if line.startswith('"'): | |
useless, name, definition = line.strip(",").split('"',2) | |
try: | |
type, extra = definition.strip().split(" ", 1) | |
# This must be a tricky enum | |
if ')' in extra: | |
type, extra = definition.strip().split(")") | |
except ValueError: | |
type = definition.strip() | |
extra = "" | |
extra = re.sub("CHARACTER SET [\w\d]+\s*", "", extra.replace("unsigned", "")) | |
extra = re.sub("COLLATE [\w\d]+\s*", "", extra.replace("unsigned", "")) | |
# See if it needs type conversion | |
final_type = None | |
set_sequence = None | |
if type == "tinyint(1)": | |
type = "int4" | |
set_sequence = True | |
final_type = "boolean" | |
elif type.startswith("int("): | |
type = "integer" | |
set_sequence = True | |
elif type.startswith("bigint("): | |
type = "bigint" | |
set_sequence = True | |
elif type == "longtext": | |
type = "text" | |
elif type == "mediumtext": | |
type = "text" | |
elif type == "tinytext": | |
type = "text" | |
elif type.startswith("varchar("): | |
size = int(type.split("(")[1].rstrip(")")) | |
type = "varchar(%s)" % (size * 2) | |
elif type.startswith("smallint("): | |
type = "int2" | |
set_sequence = True | |
elif type == "datetime": | |
type = "timestamp with time zone" | |
elif type == "double": | |
type = "double precision" | |
elif type == "blob": | |
type = "bytea" | |
elif type.startswith("enum(") or type.startswith("set("): | |
types_str = type.split("(")[1].rstrip(")").rstrip('"') | |
types_arr = [type_str.strip('\'') for type_str in types_str.split(",")] | |
# Considered using values to make a name, but its dodgy | |
# enum_name = '_'.join(types_arr) | |
enum_name = "{0}_{1}".format(current_table, name) | |
if enum_name not in enum_types: | |
output.write("CREATE TYPE {0} AS ENUM ({1}); \n".format(enum_name, types_str)); | |
enum_types.append(enum_name) | |
type = enum_name | |
if final_type: | |
cast_lines.append("ALTER TABLE \"%s\" ALTER COLUMN \"%s\" DROP DEFAULT, ALTER COLUMN \"%s\" TYPE %s USING CAST(\"%s\" as %s)" % (current_table, name, name, final_type, name, final_type)) | |
# ID fields need sequences [if they are integers?] | |
if name == "id" and set_sequence is True: | |
sequence_lines.append("CREATE SEQUENCE %s_id_seq" % (current_table)) | |
sequence_lines.append("SELECT setval('%s_id_seq', max(id)) FROM %s" % (current_table, current_table)) | |
sequence_lines.append("ALTER TABLE \"%s\" ALTER COLUMN \"id\" SET DEFAULT nextval('%s_id_seq')" % (current_table, current_table)) | |
# Record it | |
creation_lines.append('"%s" %s %s' % (name, type, extra)) | |
tables[current_table]['columns'].append((name, type, extra)) | |
# Is it a constraint or something? | |
elif line.startswith("PRIMARY KEY"): | |
creation_lines.append(line.rstrip(",")) | |
elif line.startswith("CONSTRAINT"): | |
foreign_key_lines.append("ALTER TABLE \"%s\" ADD CONSTRAINT %s DEFERRABLE INITIALLY DEFERRED" % (current_table, line.split("CONSTRAINT")[1].strip().rstrip(","))) | |
foreign_key_lines.append("CREATE INDEX ON \"%s\" %s" % (current_table, line.split("FOREIGN KEY")[1].split("REFERENCES")[0].strip().rstrip(","))) | |
elif line.startswith("UNIQUE KEY"): | |
creation_lines.append("UNIQUE (%s)" % line.split("(")[1].split(")")[0]) | |
elif line.startswith("FULLTEXT KEY"): | |
fulltext_keys = " || ' ' || ".join( line.split('(')[-1].split(')')[0].replace('"', '').split(',') ) | |
fulltext_key_lines.append("CREATE INDEX ON %s USING gin(to_tsvector('english', %s))" % (current_table, fulltext_keys)) | |
elif line.startswith("KEY"): | |
pass | |
# Is it the end of the table? | |
elif line == ");": | |
output.write("CREATE TABLE \"%s\" (\n" % current_table) | |
for i, line in enumerate(creation_lines): | |
#output.write(" %s%s\n" % (line, "," if i != (len(creation_lines) - 1) else "")) | |
try: | |
output.write(" %s%s\n" % (line, "," if i != (len(creation_lines) - 1) else "")) | |
except: | |
output.write(" %s%s\n" % (line.encode('utf-8'), "," if i != (len(creation_lines) - 1) else "")) | |
output.write(');\n\n') | |
current_table = None | |
# ??? | |
else: | |
print("\n ! Unknown line inside table creation: %s" % line) | |
# Finish file | |
output.write("\n-- Post-data save --\n") | |
output.write("COMMIT;\n") | |
output.write("START TRANSACTION;\n") | |
# Write typecasts out | |
output.write("\n-- Typecasts --\n") | |
for line in cast_lines: | |
output.write("%s;\n" % line) | |
# Write FK constraints out | |
output.write("\n-- Foreign keys --\n") | |
for line in foreign_key_lines: | |
output.write("%s;\n" % line) | |
# Write sequences out | |
output.write("\n-- Sequences --\n") | |
for line in sequence_lines: | |
output.write("%s;\n" % line) | |
# Write full-text indexkeyses out | |
output.write("\n-- Full Text keys --\n") | |
for line in fulltext_key_lines: | |
output.write("%s;\n" % line) | |
# Finish file | |
output.write("\n") | |
output.write("COMMIT;\n") | |
print("") | |
if __name__ == "__main__": | |
parse(sys.argv[1], sys.argv[2]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Usar com:
mysqldump --compatible=postgresql -u root --default-character-set=utf8 -r /tmp/dump.mysql <db_name>
python db_converter.py /tmp/dump.mysql /tmp/dump.psql