Skip to content

Instantly share code, notes, and snippets.

@linnil1
Last active November 13, 2021 15:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save linnil1/32b8a95b84b4f517b7df3534d431322d to your computer and use it in GitHub Desktop.
Save linnil1/32b8a95b84b4f517b7df3534d431322d to your computer and use it in GitHub Desktop.
import csv
import os
import shutil
def hash_id(id):
    """Return the relative storage path for dataset ``id``.

    The decimal form of ``id`` is zero-padded on the left to a multiple of
    three characters (ids shorter than four digits additionally get a "000"
    prefix) and cut into three-character groups.  Every group except the
    last one becomes a directory level; the file name always carries the
    unpadded id.

    Examples: ``12`` -> ``"000/dataset_12.dat"``,
    ``1234567`` -> ``"001/234/dataset_1234567.dat"``.
    """
    digits = str(id)
    body = digits if len(digits) >= 4 else "000" + digits
    # -len % 3 is the number of zeros needed to reach a multiple of three.
    padded = "0" * (-len(digits) % 3) + body
    # The final three characters never form a directory, so stop before them.
    directories = [padded[pos:pos + 3] for pos in range(0, len(padded) - 3, 3)]
    return "/".join(directories) + "/dataset_" + digits + '.dat'
def deleteFile(f):
    """Delete the file or directory tree at path ``f``.

    A path that does not exist is silently ignored.  The removal is done
    with ``os.remove`` / ``shutil.rmtree`` rather than by pasting the path
    into an ``rm`` shell command, so names containing spaces or shell
    metacharacters are handled correctly and cannot inject commands.

    Removal stays best-effort: a failure is reported and the function
    returns normally so that a bulk clean-up can carry on with the next
    path.
    """
    if not os.path.exists(f):
        return
    try:
        if os.path.isfile(f):
            print("file", f)
            os.remove(f)
        elif os.path.isdir(f):
            print("dir", f)
            if os.path.islink(f):
                # rmtree refuses symlinks; drop only the link itself, which
                # is what "rm -r" does for a link to a directory.
                os.remove(f)
            else:
                shutil.rmtree(f)
    except OSError as err:
        print("failed", f, err)
def deleteFiles(id):
    """Delete every path recorded for dataset ``id`` in ``id_files``.

    ``id_files`` maps an id to a dict whose "files" entry lists paths.
    Ids that are unknown, or whose entry has no paths, are a no-op.
    """
    entry = id_files.get(id)
    if not entry:
        return
    for path in entry.get("files") or ():
        deleteFile(path)
# Index what is on disk: id (as a string) -> {'files': [relative paths]}.
# Only directories directly under the current working directory whose name
# starts with "0" are scanned, and only one level deep.
id_files = {}
for top in os.listdir('.'):
    # Skip regular files that merely start with "0"; os.listdir() on them
    # would raise and abort the whole run.
    if not (top.startswith('0') and os.path.isdir(top)):
        continue
    for entry in os.listdir(top):
        # "dataset_12.dat" and "dataset_12_files" both yield id "12":
        # the id is the second "_"-separated field of the name before the
        # first ".".
        fields = entry.split(".")[0].split("_")
        if len(fields) > 1:
            record = id_files.setdefault(fields[1], {'files': []})
            record['files'].append(top + "/" + entry)
# Dump every history-dataset association joined with its dataset, history
# and owning user to /tmp/gu via psql's client-side \copy.  The backslash is
# escaped explicitly ("\\copy") so the literal does not rely on an invalid
# escape sequence.
export_command = (
    "psql -h localhost -d galaxy -U galaxy -c \"\\copy ("
    "SELECT * FROM history_dataset_association "
    "JOIN dataset ON dataset.id=history_dataset_association.dataset_id "
    "JOIN history ON history.id=history_dataset_association.history_id "
    "JOIN galaxy_user ON history.user_id=galaxy_user.id"
    ") TO '/tmp/gu' with csv header\""
)
# Abort when the export fails: continuing would select files to delete from
# whatever stale backup_db.csv happens to be lying around.
if os.system(export_command) != 0:
    raise SystemExit("psql export failed; nothing was deleted")
# "/tmp/gu" is a regular file, so it must be named without a trailing slash.
shutil.move("/tmp/gu", "backup_db.csv")

with open("backup_db.csv") as backup:
    rows = csv.reader(backup)
    header = next(rows)
    data = list(rows)
# ['id', 'history_id', 'dataset_id', 'create_time', 'update_time', 'copied_from_history_dataset_association_id', 'hid', 'name', 'info', 'blurb', 'peek', 'extension', 'metadata', 'parent_id', 'designation', 'deleted', 'visible', 'copied_from_library_dataset_dataset_association_id', 'state', 'purged', 'tool_version', 'extended_metadata_id', 'id', 'create_time', 'update_time', 'state', 'deleted', 'purged', 'purgable', 'external_filename', '_extra_files_path', 'file_size', 'total_size', 'object_store_id', 'uuid', 'id', 'create_time', 'update_time', 'user_id', 'name', 'hid_counter', 'deleted', 'purged', 'genome_build', 'importable', 'slug', 'published', 'importing', 'id', 'create_time', 'update_time', 'email', 'password', 'external', 'deleted', 'purged', 'username', 'form_values_id', 'disk_usage', 'active', 'activation_token']

# Column positions, taken from the header listed above.  "email",
# "dataset_id" and "total_size" each occur once; the user's update_time is
# the column immediately before the user's email.
email_id = header.index("email")
user_updated_id = email_id - 1
dataset_id_col = header.index("dataset_id")
total_size_col = header.index("total_size")


def _is_purge_candidate(email, updated):
    """Tell whether the account's datasets are selected for deletion.

    Selected are accounts whose update timestamp ("YYYY-MM-...") falls in
    2015 through June 2018, any @gmail.com address, and @ntu.edu.tw
    addresses beginning with "b0".
    """
    parts = updated.split('-')
    year = int(parts[0])
    if 2015 <= year < 2018:
        return True
    if year == 2018 and int(parts[1]) < 7:
        return True
    if email.endswith("@gmail.com"):
        return True
    return email.endswith("@ntu.edu.tw") and email.startswith("b0")


users = set((row[email_id], row[user_updated_id]) for row in data)
# Addresses listed here are kept even when they match the rules above.
keep_emails = set([])
selected_emails = set(
    email for email, updated in users if _is_purge_candidate(email, updated)
) - keep_emails
emails = sorted(selected_emails)

# Show the selected accounts four per line, each padded to 25 columns.
for start in range(0, len(emails), 4):
    print(" ".join("{0:25}".format(email) for email in emails[start:start + 4]))

data2 = [row for row in data if row[email_id] in selected_emails]
# One (dataset id, total size) pair per dataset, however many histories
# reference it.
want_delete = list(set((row[dataset_id_col], row[total_size_col]) for row in data2))
print(sum(int(size) for _, size in want_delete if size) / 1024 ** 3, "GB")

for dataset_id, _ in want_delete:
    placeholder = hash_id(dataset_id)
    print(dataset_id, placeholder)
    deleteFiles(dataset_id)
    # Leave an empty file at the dataset's canonical path (the equivalent of
    # "mkdir -p" + "touch") -- presumably so the path still resolves for
    # whatever reads it later; TODO confirm.
    parent = os.path.dirname(placeholder)
    if not os.path.isdir(parent):
        os.makedirs(parent)
    with open(placeholder, "a"):
        os.utime(placeholder, None)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment