# linnil1/remove_files_in_database_galaxy.py

## remove_files_in_database_galaxy.py
import csv
import os
import shutil


def hash_id(id):
    """Return the relative storage path of the dataset file for ``id``.

    The decimal id is left-padded with zeros to a multiple of three digits,
    its last three digits are dropped, and the remaining digits are cut into
    three-character directory names.  Ids of one to three digits all map to
    directory ``000``.

    Examples:
        12      -> "000/dataset_12.dat"
        1234567 -> "001/234/dataset_1234567.dat"
    """
    name = str(id)
    # Short ids get an extra "000" group so one directory level is still
    # left after the final group is dropped below.
    digits = name if len(name) >= 4 else "000" + name
    digits = "0" * (-len(name) % 3) + digits
    groups = []
    for start in range(0, len(digits), 3):
        groups.append(digits[start:start + 3])
    directories = groups[:-1]
    return "{0}/dataset_{1}.dat".format("/".join(directories), name)


def deleteFile(f):
    """Delete the regular file or directory tree at path ``f``, if present.

    Removal goes through ``os.remove`` / ``shutil.rmtree`` rather than a
    string-built ``rm`` shell command, so paths containing spaces or shell
    metacharacters are taken literally.  A removal that fails is reported
    and skipped instead of raised, so one bad path does not stop a batch.

    Args:
        f: path of the file or directory to delete.
    """
    if not os.path.exists(f):
        return
    try:
        if os.path.isfile(f):
            print("file " + f)
            os.remove(f)
        elif os.path.isdir(f):
            print("dir " + f)
            if os.path.islink(f):
                # rmtree refuses symlinks; remove only the link itself,
                # which is what ``rm -r`` does for a symlinked directory.
                os.remove(f)
            else:
                shutil.rmtree(f)
    except OSError as err:
        print("failed to delete {0}: {1}".format(f, err))


def deleteFiles(id):
    """Delete every path recorded in ``id_files`` for dataset ``id``.

    Nothing happens when the id has no entry, or when the entry's
    "files" list is missing or empty.
    """
    entry = id_files.get(id)
    paths = entry.get("files") if entry else None
    for path in paths or ():
        deleteFile(path)


def scan_dataset_files(root):
    """Index the dataset files found one level below ``root``.

    Only first-level directories whose name starts with "0" are scanned.
    Inside them, an entry whose name (up to the first ".") contains an
    underscore is filed under the text between the first and second
    underscore, e.g. "dataset_12.dat" and "dataset_12_files" both go
    under "12".

    Args:
        root: directory to scan.

    Returns:
        dict mapping id string -> {'files': [paths relative to ``root``]}.
    """
    found = {}
    for top in os.listdir(root):
        # A regular file whose name starts with "0" would make the inner
        # os.listdir() raise, so non-directories are skipped.
        if not top.startswith('0') or not os.path.isdir(os.path.join(root, top)):
            continue
        for name in os.listdir(os.path.join(root, top)):
            parts = name.split(".")[0].split("_")
            if len(parts) > 1:
                entry = found.setdefault(parts[1], {'files': []})
                entry['files'].append(top + "/" + name)
    return found


# Paths are relative to the current directory, which is where the
# deletion helpers resolve them.
id_files = scan_dataset_files('.')


# Dump every history/dataset association, joined with its dataset, history
# and owning user, to a CSV file through psql's client-side \copy.
export_status = os.system("psql -h localhost -d galaxy -U galaxy -c \"\\copy (SELECT * FROM history_dataset_association JOIN dataset ON dataset.id=history_dataset_association.dataset_id JOIN history ON history.id=history_dataset_association.history_id JOIN galaxy_user ON history.user_id=galaxy_user.id) TO '/tmp/gu' with csv header\"")
if export_status != 0:
    # Stop here: carrying on would pick files to delete from a missing or
    # left-over backup_db.csv.
    raise SystemExit("psql export failed (status {0})".format(export_status))
# The export is a regular file, so it is moved without a trailing slash.
shutil.move("/tmp/gu", "./backup_db.csv")
with open("backup_db.csv") as backup:
    reader = csv.reader(backup)
    header = next(reader)
    # ['id', 'history_id', 'dataset_id', 'create_time', 'update_time', 'copied_from_history_dataset_association_id', 'hid', 'name', 'info', 'blurb', 'peek', 'extension', 'metadata', 'parent_id', 'designation', 'deleted', 'visible', 'copied_from_library_dataset_dataset_association_id', 'state', 'purged', 'tool_version', 'extended_metadata_id', 'id', 'create_time', 'update_time', 'state', 'deleted', 'purged', 'purgable', 'external_filename', '_extra_files_path', 'file_size', 'total_size', 'object_store_id', 'uuid', 'id', 'create_time', 'update_time', 'user_id', 'name', 'hid_counter', 'deleted', 'purged', 'genome_build', 'importable', 'slug', 'published', 'importing', 'id', 'create_time', 'update_time', 'email', 'password', 'external', 'deleted', 'purged', 'username', 'form_values_id', 'disk_usage', 'active', 'activation_token']
    data = list(reader)
email_id = header.index("email")

# Pick the users whose datasets will be removed.  With the column list
# above, row[-10] is the user's email and row[-11] the user's update_time.
users = set((row[-10], row[-11]) for row in data)
emails = []
for email, t in users:
    stamp = t.split('-')
    year = int(stamp[0])
    if (2015 <= year < 2018                                   # last updated 2015-2017
            or (year == 2018 and int(stamp[1]) < 7)           # or January-June 2018
            or email.endswith("@gmail.com")
            or (email.endswith("@ntu.edu.tw") and email.startswith("b0"))):
        emails.append(email)

# Any address placed in this (currently empty) set is dropped from the list.
emails = list(set(emails) - set([]))

# Show the selected addresses four per line.  A single-argument print()
# behaves the same on Python 2 and Python 3.
for start in range(0, len(emails), 4):
    print(" ".join("{0:25}".format(address) for address in emails[start:start + 4]))

selected = set(emails)  # set membership instead of scanning a list per row
data2 = [row for row in data if row[email_id] in selected]
# Unique (dataset_id, total_size) pairs, per the column list above.
want_delete = list(set((row[2], row[32]) for row in data2))
total_bytes = sum(int(size) for _, size in want_delete if size)
print("{0} GB".format(total_bytes / 1024.0 ** 3))
# NOTE(review): hash_id() yields two directory levels for ids of seven or
# more digits, while id_files is built from a one-level scan -- confirm
# such datasets do not need deleting here.
for dataset_id, _ in want_delete:
    path = hash_id(dataset_id)
    print("{0} {1}".format(dataset_id, path))
    deleteFiles(dataset_id)
    # Re-create an empty file at the dataset's hashed path (the former
    # ``mkdir -p`` + ``touch``), without going through a shell.
    parent = os.path.dirname(path)
    if not os.path.isdir(parent):
        os.makedirs(parent)
    with open(path, "a"):
        os.utime(path, None)
	import csv
	import os
	import shutil


	def hash_id(id):
		"""Return the relative storage path of the dataset file for ``id``.

		The decimal id is left-padded with zeros to a multiple of three
		digits, its last three digits are dropped, and the remaining digits
		are cut into three-character directory names.  Ids of one to three
		digits all map to directory ``000``.

		Examples:
			12      -> "000/dataset_12.dat"
			1234567 -> "001/234/dataset_1234567.dat"
		"""
		name = str(id)
		# Short ids get an extra "000" group so one directory level is
		# still left after the final group is dropped below.
		digits = name if len(name) >= 4 else "000" + name
		digits = "0" * (-len(name) % 3) + digits
		groups = [digits[start:start + 3] for start in range(0, len(digits), 3)]
		return "/".join(groups[:-1]) + "/dataset_" + name + '.dat'


	def deleteFile(f):
		"""Delete the regular file or directory tree at path ``f``, if present.

		Removal goes through ``os.remove`` / ``shutil.rmtree`` rather than a
		string-built ``rm`` shell command, so paths containing spaces or
		shell metacharacters are taken literally.  A removal that fails is
		reported and skipped instead of raised.

		Args:
			f: path of the file or directory to delete.
		"""
		if not os.path.exists(f):
			return
		try:
			if os.path.isfile(f):
				print("file " + f)
				os.remove(f)
			elif os.path.isdir(f):
				print("dir " + f)
				if os.path.islink(f):
					# rmtree refuses symlinks; remove only the link itself,
					# which is what ``rm -r`` does for a symlinked directory.
					os.remove(f)
				else:
					shutil.rmtree(f)
		except OSError as err:
			print("failed to delete {0}: {1}".format(f, err))


	def deleteFiles(id):
		"""Delete every path recorded in ``id_files`` for dataset ``id``.

		Nothing happens when the id has no entry, or when the entry's
		"files" list is missing or empty.
		"""
		files = id_files.get(id)
		if files:
			files = files.get("files")
		if not files:
			return
		for f in files:
			deleteFile(f)


	def scan_dataset_files(root):
		"""Index the dataset files found one level below ``root``.

		Only first-level directories whose name starts with "0" are scanned.
		Inside them, an entry whose name (up to the first ".") contains an
		underscore is filed under the text between the first and second
		underscore, e.g. "dataset_12.dat" and "dataset_12_files" both go
		under "12".

		Args:
			root: directory to scan.

		Returns:
			dict mapping id string -> {'files': [paths relative to ``root``]}.
		"""
		found = {}
		for top in os.listdir(root):
			# A regular file whose name starts with "0" would make the
			# inner os.listdir() raise, so non-directories are skipped.
			if not top.startswith('0') or not os.path.isdir(os.path.join(root, top)):
				continue
			for name in os.listdir(os.path.join(root, top)):
				parts = name.split(".")[0].split("_")
				if len(parts) > 1:
					entry = found.setdefault(parts[1], {'files': []})
					entry['files'].append(top + "/" + name)
		return found


	# Paths are relative to the current directory, which is where the
	# deletion helpers resolve them.
	id_files = scan_dataset_files('.')


	# Dump every history/dataset association, joined with its dataset,
	# history and owning user, to a CSV file through psql's client-side \copy.
	export_status = os.system("psql -h localhost -d galaxy -U galaxy -c \"\\copy (SELECT * FROM history_dataset_association JOIN dataset ON dataset.id=history_dataset_association.dataset_id JOIN history ON history.id=history_dataset_association.history_id JOIN galaxy_user ON history.user_id=galaxy_user.id) TO '/tmp/gu' with csv header\"")
	if export_status != 0:
		# Stop here: carrying on would pick files to delete from a missing
		# or left-over backup_db.csv.
		raise SystemExit("psql export failed (status {0})".format(export_status))
	# The export is a regular file, so it is moved without a trailing slash.
	shutil.move("/tmp/gu", "./backup_db.csv")
	with open("backup_db.csv") as backup:
		reader = csv.reader(backup)
		header = next(reader)
		# ['id', 'history_id', 'dataset_id', 'create_time', 'update_time', 'copied_from_history_dataset_association_id', 'hid', 'name', 'info', 'blurb', 'peek', 'extension', 'metadata', 'parent_id', 'designation', 'deleted', 'visible', 'copied_from_library_dataset_dataset_association_id', 'state', 'purged', 'tool_version', 'extended_metadata_id', 'id', 'create_time', 'update_time', 'state', 'deleted', 'purged', 'purgable', 'external_filename', '_extra_files_path', 'file_size', 'total_size', 'object_store_id', 'uuid', 'id', 'create_time', 'update_time', 'user_id', 'name', 'hid_counter', 'deleted', 'purged', 'genome_build', 'importable', 'slug', 'published', 'importing', 'id', 'create_time', 'update_time', 'email', 'password', 'external', 'deleted', 'purged', 'username', 'form_values_id', 'disk_usage', 'active', 'activation_token']
		data = list(reader)
	email_id = header.index("email")

	# Pick the users whose datasets will be removed.  With the column list
	# above, row[-10] is the user's email and row[-11] the user's update_time.
	users = set((row[-10], row[-11]) for row in data)
	emails = []
	for email, t in users:
		stamp = t.split('-')
		year = int(stamp[0])
		if 2015 <= year < 2018:
			# last updated 2015-2017
			emails.append(email)
		elif year == 2018 and int(stamp[1]) < 7:
			# last updated January-June 2018
			emails.append(email)
		elif email.endswith("@gmail.com"):
			emails.append(email)
		elif email.endswith("@ntu.edu.tw") and email.startswith("b0"):
			emails.append(email)

	# Any address placed in this (currently empty) set is dropped from the list.
	emails = list(set(emails) - set([]))

	# Show the selected addresses four per line.  A single-argument print()
	# behaves the same on Python 2 and Python 3.
	for start in range(0, len(emails), 4):
		print(" ".join("{0:25}".format(address) for address in emails[start:start + 4]))

	selected = set(emails)  # set membership instead of scanning a list per row
	data2 = [row for row in data if row[email_id] in selected]
	# Unique (dataset_id, total_size) pairs, per the column list above.
	want_delete = list(set((row[2], row[32]) for row in data2))
	total_bytes = sum(int(size) for _, size in want_delete if size)
	print("{0} GB".format(total_bytes / 1024.0 ** 3))
	for dataset_id, _ in want_delete:
		path = hash_id(dataset_id)
		print("{0} {1}".format(dataset_id, path))
		deleteFiles(dataset_id)
		# Re-create an empty file at the dataset's hashed path (the former
		# ``mkdir -p`` + ``touch``), without going through a shell.
		parent = os.path.dirname(path)
		if not os.path.isdir(parent):
			os.makedirs(parent)
		with open(path, "a"):
			os.utime(path, None)