Skip to content

Instantly share code, notes, and snippets.

@heborras
Created May 30, 2022 13:43
Show Gist options
  • Save heborras/fc2e54ea56abaf291ee8dab17b5e5f19 to your computer and use it in GitHub Desktop.
Save heborras/fc2e54ea56abaf291ee8dab17b5e5f19 to your computer and use it in GitHub Desktop.
Script to handle GridFS files orphaned by seml
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "7fa08b24",
"metadata": {},
"source": [
"# Script to handle GridFS files orphaned by seml\n",
"The script uses the local seml installation and credentials to connect to a given MongoDB database. It then collects the IDs of all files referenced by seml/sacred and of all files stored in GridFS.\n",
"Subsequently, files contained in GridFS but not referenced by seml/sacred are deleted. This removes orphaned source files and artifacts, which are not deleted by `seml <collection> delete`.\n",
"\n",
"**Note: The script cannot distinguish between files orphaned by seml and files created by other tools, since they look the same to the script. Thus, this script should not be used when the database is used by tools other than seml.**\n",
"\n",
"**Please take the time to verify the script does what you think it does. I will not take responsibility if this script deletes valuable data that you did not intend to delete.**"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "446bc759",
"metadata": {},
"outputs": [],
"source": [
"import seml\n",
"from seml.database import get_database, get_mongodb_config\n",
"import gridfs\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f0488bee",
"metadata": {},
"outputs": [],
"source": [
"# Open the database using the settings returned by seml's get_mongodb_config(),\n",
"# then create a GridFS handle on top of that database\n",
"mongodb_config = get_mongodb_config()\n",
"db = get_database(**mongodb_config)\n",
"fs = gridfs.GridFS(db)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bc6935b6",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Get all object IDs referenced by seml or sacred.\n",
"# Every document of every (non-ignored) collection is read directly via pymongo, so that\n",
"# references are collected regardless of the experiment status and a document that lacks\n",
"# one of the fields is tolerated instead of raising a KeyError.\n",
"# NOTE(review): seml.get_results appears to return only completed experiments unless\n",
"# `states` is passed - confirm against the seml documentation.\n",
"seml_object_ids = set()\n",
"\n",
"colls_to_ignore = ['metrics', 'fs.files', 'fs.chunks', 'omniboard.metric.columns', 'omniboard.custom.columns', 'omniboard.settings']\n",
"fields_to_get = ['seml', 'artifacts', 'experiment']\n",
"for coll in db.list_collection_names():\n",
"    if coll in colls_to_ignore:\n",
"        continue\n",
"    print(f\"Getting object IDs from collection: {coll}\")\n",
"    # A list projection restricts the returned documents to the listed fields (plus _id)\n",
"    for res in db[coll].find({}, fields_to_get):\n",
"        artifacts = res.get('artifacts') or []\n",
"        sources = (res.get('experiment') or {}).get('sources') or []\n",
"        source_files = (res.get('seml') or {}).get('source_files') or []\n",
"        seml_object_ids.update(artifact['file_id'] for artifact in artifacts)\n",
"        # Source entries are indexed with [1] to obtain the file ID, as in the original script\n",
"        seml_object_ids.update(entry[1] for entry in sources)\n",
"        seml_object_ids.update(entry[1] for entry in source_files)\n",
"print(f\"Got {len(seml_object_ids)} object IDs referenced by seml or sacred\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4d431ada",
"metadata": {},
"outputs": [],
"source": [
"# Collect the _id of every document in the 'fs.files' collection (the GridFS file index)\n",
"all_object_ids = {file_doc['_id'] for file_doc in db['fs.files'].find()}\n",
"print(f\"Found {len(all_object_ids)} object IDs in the GridFS\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1652114c",
"metadata": {},
"outputs": [],
"source": [
"# IDs that exist in GridFS but were not found among the seml/sacred references\n",
"dereferenced_object_ids = all_object_ids.difference(seml_object_ids)\n",
"print(f'Number of object IDs in GridFS, but not referenced by seml or sacred: {len(dereferenced_object_ids)}')"
]
},
{
"cell_type": "markdown",
"id": "2191dcb5",
"metadata": {},
"source": [
"# CAUTION!\n",
"### The next cell will delete all files in GridFS, which are not referenced by seml or sacred. If the database is used by other tools than just seml, then unrelated files created by these tools will be deleted as well!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dd808755",
"metadata": {},
"outputs": [],
"source": [
"# Delete all dereferenced files.\n",
"# Only files whose GridFS metadata is None are removed; files that carry metadata are kept.\n",
"# Counters are tracked so that the final report states what was actually deleted.\n",
"start_time = time.time()\n",
"num_candidates = len(dereferenced_object_ids)\n",
"num_deleted = 0\n",
"num_kept = 0\n",
"num_missing = 0\n",
"for i, file_id in enumerate(dereferenced_object_ids):\n",
"    if fs.exists(file_id):\n",
"        if fs.get(file_id).metadata is None:\n",
"            fs.delete(file_id)\n",
"            num_deleted += 1\n",
"        else:\n",
"            num_kept += 1\n",
"    else:\n",
"        num_missing += 1\n",
"        print(f\"Warning: The file with ID {file_id}, does not exist in GridFS.\")\n",
"    # Report progress every 100 files; skipped at i == 0, where no rate can be estimated yet\n",
"    if i > 0 and i % 100 == 0:\n",
"        estimate = (time.time() - start_time) / i * (num_candidates - i)\n",
"        # divmod yields whole minutes plus the remaining seconds, so 90 s is shown as 1:30\n",
"        minutes, seconds = divmod(int(estimate), 60)\n",
"        print(f\"{i / num_candidates * 100:.1f} % complete. Time left: {minutes}:{seconds:02d} [min:s] \", end='\\r')\n",
"\n",
"print(f'COMPLETE: Deleted {num_deleted} of {num_candidates} dereferenced files from GridFS ({num_kept} kept because they carry metadata, {num_missing} not found).')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b49254c2",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "35cbb564",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "4a363c2c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "d707bd9f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "21c7c14a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "96dab087",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "75ccc91d",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "80cb530d",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "111b8720",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "582d6491",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "51dde62f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "f48a0cf5",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "46727095",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "ac5af8dd",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment