bnekolny/migtrate_data.py

## migrate_data.py
import os
import yaml

## Execution for me was:
## `python migrate_data.py > ./migration_inserts.sql`
## `sed -i '' 's/None/NULL/' ./migration_inserts.sql`
## `psql -f ./migration_inserts.sql`

## NOTE: A few things to know about the script here:
## - Artifacts were stored remotely, so no artifact migration
## - run source_type is always LOCAL for us, I avoided the mapping from int -> str
## - run status is always FINISHED for us, I avoided the mapping from int -> str
## - run source_version is never set
## - experiment lifecycle_stage is always active for us, I avoided the mapping from int -> str
## - metric timestamp is made up, since it was tracked as an array in filesystem and as an epoch in the DB

rootDir = 'MLFLOW_DATA_DIRECTORY_ROOT'


def _sql_literal(value, quoted=True):
    """Render ``value`` as a literal for the generated INSERT statements.

    ``None`` is emitted as a bare ``NULL``. Any other value is converted with
    ``str``; when ``quoted`` is true it is wrapped in single quotes with every
    embedded single quote doubled, so text such as ``it's`` cannot terminate
    the literal early. ``quoted=False`` is for the columns the statements emit
    unquoted (experiment ids and epoch times).
    """
    if value is None:
        return 'NULL'
    text = str(value)
    if not quoted:
        return text
    return "'{0}'".format(text.replace("'", "''"))


def _load_meta(directory):
    """Return the parsed contents of ``<directory>/meta.yaml``.

    The file is closed as soon as it has been parsed. ``yaml.safe_load`` is
    used because ``yaml.load`` without an explicit ``Loader`` is rejected by
    recent PyYAML releases and can construct arbitrary objects on older ones.
    """
    with open(os.path.join(directory, 'meta.yaml'), 'r') as meta_file:
        return yaml.safe_load(meta_file)


def _entries(directory):
    """Return the names inside ``directory``, or an empty list if it is absent."""
    if not os.path.isdir(directory):
        return []
    return os.listdir(directory)


## A wrong rootDir should fail loudly, so the top level is listed directly.
for experiment_id in os.listdir(rootDir):
    experiment_dir = os.path.join(rootDir, experiment_id)
    ## '.trash' is skipped as before; stray files (e.g. .DS_Store) are skipped
    ## too instead of crashing on a missing meta.yaml.
    if experiment_id == '.trash' or not os.path.isdir(experiment_dir):
        continue

    experiment = _load_meta(experiment_dir)
    ## lifecycle_stage is hard-coded to 'active' (see the notes at the top of the file).
    print("INSERT INTO experiments (experiment_id, name, artifact_location, lifecycle_stage) VALUES ({0}, {1}, {2}, {3});".format(
        _sql_literal(experiment_id, quoted=False),
        _sql_literal(experiment['name']),
        _sql_literal(experiment['artifact_location']),
        _sql_literal('active')))

    for run_uuid in os.listdir(experiment_dir):
        run_dir = os.path.join(experiment_dir, run_uuid)
        ## Only directories are runs; this skips the experiment's own meta.yaml.
        if not os.path.isdir(run_dir):
            continue

        run = _load_meta(run_dir)
        ## source_type, status, source_version and lifecycle_stage are constants.
        print(
            "INSERT INTO runs ("
            "run_uuid, name, source_type, source_name, entry_point_name, user_id, status, start_time, end_time, source_version, lifecycle_stage, artifact_uri, experiment_id"
            ") VALUES ( {0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11}, {12});".format(
                _sql_literal(run_uuid),
                _sql_literal(run['name']),
                _sql_literal('LOCAL'),
                _sql_literal(run['source_name']),
                _sql_literal(run['entry_point_name']),
                _sql_literal(run['user_id']),
                _sql_literal('FINISHED'),
                _sql_literal(run['start_time'], quoted=False),
                _sql_literal(run['end_time'], quoted=False),
                _sql_literal(''),
                _sql_literal('active'),
                _sql_literal(run['artifact_uri']),
                _sql_literal(experiment_id, quoted=False)))

        # Metrics
        metrics_dir = os.path.join(run_dir, 'metrics')
        for metric in _entries(metrics_dir):
            with open(os.path.join(metrics_dir, metric), 'r') as metric_file:
                for line in metric_file:
                    ## A blank line carries no measurement; skip it rather
                    ## than fail the two-field unpack below.
                    if not line.strip():
                        continue
                    counter, val = line.split()
                    ## The timestamp is synthetic: run start plus the first field.
                    print(
                        "INSERT INTO metrics ("
                        "key, value, timestamp, run_uuid"
                        ") VALUES ( {0}, {1}, {2}, {3} );".format(
                            _sql_literal(metric),
                            _sql_literal(val),
                            int(run['start_time']) + int(counter),
                            _sql_literal(run_uuid)))

        # Params
        params_dir = os.path.join(run_dir, 'params')
        for param in _entries(params_dir):
            with open(os.path.join(params_dir, param), 'r') as param_file:
                ## One INSERT per line of the param file, as before.
                for line in param_file:
                    print(
                        "INSERT INTO params ("
                        "key, value, run_uuid"
                        ") VALUES ( {0}, {1}, {2} );".format(
                            _sql_literal(param),
                            _sql_literal(line.strip()),
                            _sql_literal(run_uuid)))
	import os
	import yaml

	## Execution for me was:
	## `python migrate_data.py > ./migration_inserts.sql`
	## `sed -i '' 's/None/NULL/' ./migration_inserts.sql`
	## `psql -f ./migration_inserts.sql`

	## NOTE: A few things to know about the script here:
	## - Artifacts were stored remotely, so no artifact migration
	## - run source_type is always LOCAL for us, I avoided the mapping from int -> str
	## - run status is always FINISHED for us, I avoided the mapping from int -> str
	## - run source_version is never set
	## - experiment lifecycle_stage is always active for us, I avoided the mapping from int -> str
	## - metric timestamp is made up, since it was tracked as an array in filesystem and as an epoch in the DB

	## NOTE(review): from here to the end of this span the statements repeat the
	## migration script that appears earlier in this file, but every line has been
	## flattened to a single leading tab. Loop and `if` bodies are therefore no
	## longer nested, so this copy is not valid Python as written. It looks like a
	## paste artifact -- TODO confirm and remove it rather than re-indent it, since
	## a working second copy would print every INSERT statement a second time.
	rootDir = 'MLFLOW_DATA_DIRECTORY_ROOT'

	## One experiment per entry under rootDir; '.trash' is skipped.
	for experiment_id in os.listdir(rootDir):
	if experiment_id in ['.trash']:
	continue

	## Read the experiment's meta.yaml and print an INSERT for the experiments table.
	## NOTE(review): yaml.load is called without a Loader argument, and this file
	## handle is never closed.
	f = open("{root}/{experiment}/meta.yaml".format(root=rootDir, experiment=experiment_id), "r")
	experiment = yaml.load(f)
	experiment['experiment_id'] = experiment_id
	experiment['lifecycle_stage'] = 'active'
	experiment_insert = "INSERT INTO experiments (experiment_id, name, artifact_location, lifecycle_stage) VALUES ({0}, '{1}', '{2}', '{3}');".format(
	experiment['experiment_id'],
	experiment['name'],
	experiment['artifact_location'],
	experiment['lifecycle_stage'])
	print(experiment_insert)
	## Every entry in the experiment directory other than meta.yaml is read as a run.
	for run_uuid in os.listdir("{root}/{experiment}".format(root=rootDir, experiment=experiment_id)):
	if run_uuid in ['meta.yaml']:
	continue
	## NOTE(review): rf is never closed.
	rf = open("{root}/{experiment}/{run}/meta.yaml".format(root=rootDir, experiment=experiment_id, run=run_uuid), "r")
	run = yaml.load(rf)
	run['run_uuid'] = run_uuid
	run['lifecycle_stage'] = 'active'
	## source_type, status, source_version and lifecycle_stage are hard-coded
	## below as 'LOCAL', 'FINISHED', '' and 'active'.
	run_insert = "INSERT INTO runs (" \
	"run_uuid, name, source_type, source_name, entry_point_name, user_id, status, start_time, end_time, source_version, lifecycle_stage, artifact_uri, experiment_id" \
	") VALUES ( '{0}', '{1}', '{2}', '{3}', '{4}', '{5}', '{6}', {7}, {8}, '{9}', '{10}', '{11}', {12});".format(
	run['run_uuid'],
	run['name'],
	'LOCAL',
	run['source_name'],
	run['entry_point_name'],
	run['user_id'],
	'FINISHED',
	run['start_time'],
	run['end_time'],
	'',
	'active',
	run['artifact_uri'],
	experiment_id)
	print(run_insert)

	# Metrics
	## Each line of a metric file is split into two whitespace-separated fields;
	## the first is added to the run's start_time to form the timestamp column.
	for metric in os.listdir("{root}/{experiment}/{run}/metrics".format(root=rootDir, experiment=experiment_id, run=run_uuid)):
	f = open("{root}/{experiment}/{run}/metrics/{metric}".format(root=rootDir, experiment=experiment_id, run=run_uuid, metric=metric), "r")
	line = f.readline()
	while line:
	#split
	counter, val = line.split()
	metric_insert = "INSERT INTO metrics (" \
	"key, value, timestamp, run_uuid" \
	") VALUES ( '{0}', '{1}', {2}, '{3}' );".format(
	metric,
	val,
	int(run['start_time']) + int(counter),
	run_uuid)
	print(metric_insert)
	line = f.readline()
	f.close()
	# Params
	## Each line of a param file is printed as its own INSERT, with the stripped
	## line as the value and the file name as the key.
	for param in os.listdir("{root}/{experiment}/{run}/params".format(root=rootDir, experiment=experiment_id, run=run_uuid)):
	f = open("{root}/{experiment}/{run}/params/{param}".format(root=rootDir, experiment=experiment_id, run=run_uuid, param=param), "r")
	line = f.readline()
	while line:
	param_insert = "INSERT INTO params (" \
	"key, value, run_uuid" \
	") VALUES ( '{0}', '{1}', '{2}' );".format(
	param,
	line.strip(),
	run_uuid)
	print(param_insert)
	line = f.readline()
	f.close()