ajdavis/import_other_db.cpp Secret

## import_other_db.cpp
#include <string>
#include <vector>
#include <wiredtiger.h>

#include "mongo/bson/bsonmisc.h"
#include "mongo/bson/bsonobj.h"
#include "mongo/bson/bsonobjbuilder.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/storage/bson_collection_catalog_entry.h"
#include "mongo/db/storage/durable_catalog.h"
#include "mongo/db/storage/wiredtiger/wiredtiger_util.h"

namespace mongo {

// The pieces of WiredTiger state describing one table (a collection or an index) that is to be
// imported from another dbpath.
struct WTImportArguments {
    // Path of the table's data file: "<dbpath>/<ident>.wt".
    std::string filepath;

    // Bare ident, e.g. "collection-123-456": neither a "table:" / "file:" prefix nor a ".wt"
    // suffix.
    std::string ident;

    // Value WT's metadata holds under the key "table:<ident>".
    std::string tableMetadata;

    // Value WT's metadata holds under the key "file:<ident>.wt".
    std::string fileMetadata;
};


// Everything exported about a single collection found in the `_mdb_catalog` of the dbpath being
// imported.
struct CollectionMetadata {
    // WT import arguments for the collection's own table.
    WTImportArguments collection;

    // The collection's namespace, as recorded in its catalog entry.
    std::string namespaceString;

    // The collection's `_mdb_catalog` document.
    BSONObj catalogObject;

    // The collection's size storer document. NOTE(review): nothing visible in this file fills
    // this in yet -- confirm before relying on it.
    BSONObj sizeStorerObject;

    // WT import arguments for the collection's indexes, one element per index.
    std::vector<WTImportArguments> indexes;
};

// Appends one field to `toAddTo`, named after `fields.ident`, whose value is the sub-document
// { "tableMetadata": <fields.tableMetadata>, "fileMetadata": <fields.fileMetadata> }.
void buildStorageMetadata(BSONObjBuilder& toAddTo, const WTImportArguments& fields) {
    const BSONObj perIdentMetadata =
        BSON("tableMetadata" << fields.tableMetadata << "fileMetadata" << fields.fileMetadata);
    toAddTo.append(fields.ident, perIdentMetadata);
}

// Declared ahead of its definition so that importFromBackupCursor() can call it. Opens WiredTiger
// on `importingDbpath` and reads `table:_mdb_catalog` to describe the collections stored there.
std::vector<CollectionMetadata> rollbackToStableAndExportConfigs(std::string importingDbpath);

// Exports the metadata of every collection stored under `importingDbpath` and registers each one
// with the DurableCatalog reachable from `opCtxFromRealDatabase`. Uasserts as soon as one import
// reports a non-OK status.
void importFromBackupCursor(OperationContext* opCtxFromRealDatabase, std::string importingDbpath) {
    const std::vector<CollectionMetadata> exportedCollections =
        rollbackToStableAndExportConfigs(importingDbpath);

    for (const CollectionMetadata& exported : exportedCollections) {
        // The exporting side keeps things structured. The import, however, takes WT's metadata
        // as a single BSONObj blob per collection. Judging from a quick read of
        // `WTKVEngine::importRecordStore`, `WTKVEngine::importSortedDataInterface` ->
        // `WiredTigerUtil::generateImportString`, the expected shape is believed to be:
        //
        // { "collection-123...": { "tableMetadata": "<tableMetadata>",
        //                          "fileMetadata": "<fileMetadata>" },
        //   "index-124...":      { "tableMetadata": "<tableMetadata>",
        //                          "fileMetadata": "<fileMetadata>" },
        //   ... }
        BSONObjBuilder wtMetadataBlob;
        buildStorageMetadata(wtMetadataBlob, exported.collection);
        for (const WTImportArguments& exportedIndex : exported.indexes) {
            buildStorageMetadata(wtMetadataBlob, exportedIndex);
        }

        auto&& durableCatalog = DurableCatalog::get(opCtxFromRealDatabase);
        uassertStatusOK(durableCatalog->importCollection(
            opCtxFromRealDatabase,
            NamespaceString(exported.namespaceString),
            exported.catalogObject,
            wtMetadataBlob.done(),
            DurableCatalog::ImportCollectionUUIDOption::kKeepOld /* or kGenerateNew? */));
    }

    // Still to do at this point: construct the collection objects and initialize counts. See
    // `live_import/import_collection.cpp::importCollection()` for how those MDB classes are
    // constructed and initialized.
}

std::vector<CollectionMetadata> rollbackToStableAndExportConfigs(std::string importingDbpath) {
    WT_CONNECTION* conn;
    // Open WT where the backed up data files were placed. Tell WT that the journal files are
    // compressed with snappy and inside "<importingDbpath>/journal".
    //
    // Note that by not passing any configuration regarding `checkpoint=`, WT defaults to not taking
    // checkpoints outside of opening and closing.
    invariantWTOK(
        wiredtiger_open(importingDbpath.c_str(),
                        nullptr,
                        "config_base=false,log=(enabled=true,path=journal,compressor=snappy)",
                        &conn));

    // WT is open. `WiredTiger.backup` is converted into a fresh `WiredTiger.wt`
    // file. RollbackToStable was implicitly performed and a checkpoint was taken on completion. We
    // rely on checkpoints being disabled to make exporting the WT metadata (byte offset to the root
    // node) consistent with the new file that was written out.
    WT_SESSION* session;
    invariantWTOK(conn->open_session(conn, nullptr, nullptr, &session));

    WT_CURSOR* mdbCatalogCursor;
    WT_CURSOR* sizeStorerCursor;
    invariantWTOK(
        session->open_cursor(session, "table:_mdb_catalog", nullptr, nullptr, &mdbCatalogCursor));
    invariantWTOK(
        session->open_cursor(session, "table:sizeStorer", nullptr, nullptr, &sizeStorerCursor));
    while (true) {
        int ret = mdbCatalogCursor->next(mdbCatalogCursor);
        if (ret == WT_NOTFOUND) {
            break;
        }
        invariantWTOK(ret);

        WT_ITEM value;
        invariantWTOK(mdbCatalogCursor->get_value(mdbCatalogCursor, &value));
        BSONObj rawCatalogEntry(value.data);
        if (DurableCatalogImpl::isFeatureDocument(rawCatalogEntry)) {
            // Not actually a public method.
            //
            // Re: this "feature document", we may "technically" need to copy its flag bits over,
            // but realistically it's not meaningful. I defer to storage execution on the best thing
            // to do.
            continue;
        }

        // Lots of options here. I'm providing something that demonstrates we can avoid some of the
        // higher level classes such as `Collection` or `IndexCatalog` that `exportCollection`
        // otherwise depends on. I'm pessimistic those classes will work out of the box when we're
        // trying to point them at catalog stuff in a different mdb_catalog file.
        //
        // Can also attempt to make a version of `enterprise/live_import/exportCollection` that
        // doesn't depend on these classes and share the code.

        // Please refactor this BSONCollectionCatalogEntry stuff if you feel the urge.
        auto catalogEntry = BSONCollectionCatalogEntry::MetaData::parse(rawCatalogEntry);

        // If exportCollection() is refactored so this code can hook into that -- the "output" of
        // this loop would presumably become the bson object we can pass into the `importCollection`
        // command.
        //
        // The `importCollection` command though (unfortunately) does all the replication bits of
        // importing. I presume secondaries on a merging recipient will be independently copying the
        // donor files and getting them consistent with a rollback to stable step. The primaries WT
        // metadata cannot be correctly used by a secondary in this case.
        //
        // Instead what I'll opt for is calling `DurableCatalog::importCollection()` which basically
        // takes these inputs.
        CollectionMetadata toAdd;
        toAdd.collection.filepath =
            importingDbpath + "/" + rawCatalogEntry["ident"].String() + ".wt";
        // Ident === `collection-123-456`.
        toAdd.collection.ident = rawCatalogEntry["ident"].String();
        toAdd.namespaceString = catalogEntry.ns;
        // I'm pretty sure it's fine to use the same session for querying WT's metadata (i.e:
        // WiredTiger.wt).
        toAdd.collection.tableMetadata = uassertStatusOK(WiredTigerUtil::getMetadata(
            session, "table:" + toAdd.collection.ident));  // table:collection-...
        toAdd.collection.fileMetadata = uassertStatusOK(WiredTigerUtil::getMetadata(
            session, "file:" + toAdd.collection.ident));  // file:collection-...
        toAdd.catalogObject = rawCatalogEntry.getOwned();
        /* do a sizeStorerCursor->search on I believe the `toAdd.collUri` string, get an owned bson
         * obj and shove it in here:
         * toAdd.sizeStorerObject = ownedSizeBSONObj;
         */

        for (const BSONCollectionCatalogEntry::IndexMetaData& index : catalogEntry.indexes) {
            // At this point I learned that IndexMetaData does not provide the "ident" string for
            // indexes either. E.g: `index-123-456`. Thus my plea to refactor the
            // BSONCollectionCatalogEntry file...
            //
            // For now we'll pretend it does...
            WTImportArguments indexImport;
            indexImport.filepath = importingDbpath + "/" + index.ident + ".wt";
            // Ident === `index-124-456`.
            indexImport.ident = index.ident;
            indexImport.tableMetadata =
                WiredTigerUtil::getMetadata(session, "table:" + index.ident);
            indexImport.fileMetadata =
                WiredTigerUtil::getMetadata(session, "file:" + index.ident + ".wt");
        }

        // Closing WT I believe will take a checkpoint. Even though we did no logical writes, it's
        // not obvious to me that the root node offsets we saved as part of the WT metadata sit
        // still. The most conservative thing to do is copy the files at this step before closing
        // WT. Obviously that's less desirable than just moving the files. Moving the files
        // underneath WT while running will almost certainly result WT taking down the whole mongod
        // process.
        //
        // Worth clarifying with WT how these files can be safely and optimally transfered to the
        // "real" dbpath. If my hunch is right that moving files after closing is not safe today --
        // my guess is it's easy for WT to add configuration string to `WT_CONNECTION::close` that
        // skips taking a checkpoint or find some equivalent solution.
        conn->close(conn, nullptr);
    }
}
}  // namespace mongo
	#include <string>
	#include <vector>
	#include <wiredtiger.h>

	#include "mongo/bson/bsonmisc.h"
	#include "mongo/bson/bsonobj.h"
	#include "mongo/bson/bsonobjbuilder.h"
	#include "mongo/db/operation_context.h"
	#include "mongo/db/storage/bson_collection_catalog_entry.h"
	#include "mongo/db/storage/durable_catalog.h"
	#include "mongo/db/storage/wiredtiger/wiredtiger_util.h"

	namespace mongo {

	// The pieces of WiredTiger state describing one table (a collection or an index) that is to
	// be imported from another dbpath.
	struct WTImportArguments {
	    // Path of the table's data file: "<dbpath>/<ident>.wt".
	    std::string filepath;

	    // Bare ident, e.g. "collection-123-456": neither a "table:" / "file:" prefix nor a ".wt"
	    // suffix.
	    std::string ident;

	    // Value WT's metadata holds under the key "table:<ident>".
	    std::string tableMetadata;

	    // Value WT's metadata holds under the key "file:<ident>.wt".
	    std::string fileMetadata;
	};


	// Everything exported about a single collection found in the `_mdb_catalog` of the dbpath
	// being imported.
	struct CollectionMetadata {
	    // WT import arguments for the collection's own table.
	    WTImportArguments collection;

	    // The collection's namespace, as recorded in its catalog entry.
	    std::string namespaceString;

	    // The collection's `_mdb_catalog` document.
	    BSONObj catalogObject;

	    // The collection's size storer document. NOTE(review): nothing visible in this file
	    // fills this in yet -- confirm before relying on it.
	    BSONObj sizeStorerObject;

	    // WT import arguments for the collection's indexes, one element per index.
	    std::vector<WTImportArguments> indexes;
	};

	// Appends one field to `toAddTo`, named after `fields.ident`, whose value is the sub-document
	// { "tableMetadata": <fields.tableMetadata>, "fileMetadata": <fields.fileMetadata> }.
	void buildStorageMetadata(BSONObjBuilder& toAddTo, const WTImportArguments& fields) {
	    const BSONObj perIdentMetadata =
	        BSON("tableMetadata" << fields.tableMetadata << "fileMetadata" << fields.fileMetadata);
	    toAddTo.append(fields.ident, perIdentMetadata);
	}

	// Declared ahead of its definition so that importFromBackupCursor() can call it. Opens
	// WiredTiger on `importingDbpath` and reads `table:_mdb_catalog` to describe the collections
	// stored there.
	std::vector<CollectionMetadata> rollbackToStableAndExportConfigs(std::string importingDbpath);

	// Exports the metadata of every collection stored under `importingDbpath` and registers each
	// one with the DurableCatalog reachable from `opCtxFromRealDatabase`. Uasserts as soon as one
	// import reports a non-OK status.
	void importFromBackupCursor(OperationContext* opCtxFromRealDatabase, std::string importingDbpath) {
	    const std::vector<CollectionMetadata> exportedCollections =
	        rollbackToStableAndExportConfigs(importingDbpath);

	    for (const CollectionMetadata& exported : exportedCollections) {
	        // The exporting side keeps things structured. The import, however, takes WT's
	        // metadata as a single BSONObj blob per collection. Judging from a quick read of
	        // `WTKVEngine::importRecordStore`, `WTKVEngine::importSortedDataInterface` ->
	        // `WiredTigerUtil::generateImportString`, the expected shape is believed to be:
	        //
	        // { "collection-123...": { "tableMetadata": "<tableMetadata>",
	        //                          "fileMetadata": "<fileMetadata>" },
	        //   "index-124...":      { "tableMetadata": "<tableMetadata>",
	        //                          "fileMetadata": "<fileMetadata>" },
	        //   ... }
	        BSONObjBuilder wtMetadataBlob;
	        buildStorageMetadata(wtMetadataBlob, exported.collection);
	        for (const WTImportArguments& exportedIndex : exported.indexes) {
	            buildStorageMetadata(wtMetadataBlob, exportedIndex);
	        }

	        auto&& durableCatalog = DurableCatalog::get(opCtxFromRealDatabase);
	        uassertStatusOK(durableCatalog->importCollection(
	            opCtxFromRealDatabase,
	            NamespaceString(exported.namespaceString),
	            exported.catalogObject,
	            wtMetadataBlob.done(),
	            DurableCatalog::ImportCollectionUUIDOption::kKeepOld /* or kGenerateNew? */));
	    }

	    // Still to do at this point: construct the collection objects and initialize counts. See
	    // `live_import/import_collection.cpp::importCollection()` for how those MDB classes are
	    // constructed and initialized.
	}

	std::vector<CollectionMetadata> rollbackToStableAndExportConfigs(std::string importingDbpath) {
	WT_CONNECTION* conn;
	// Open WT where the backed up data files were placed. Tell WT that the journal files are
	// compressed with snappy and inside "<importingDbpath>/journal".
	//
	// Note that by not passing any configuration regarding `checkpoint=`, WT defaults to not taking
	// checkpoints outside of opening and closing.
	invariantWTOK(
	wiredtiger_open(importingDbpath.c_str(),
	nullptr,
	"config_base=false,log=(enabled=true,path=journal,compressor=snappy)",
	&conn));

	// WT is open. `WiredTiger.backup` is converted into a fresh `WiredTiger.wt`
	// file. RollbackToStable was implicitly performed and a checkpoint was taken on completion. We
	// rely on checkpoints being disabled to make exporting the WT metadata (byte offset to the root
	// node) consistent with the new file that was written out.
	WT_SESSION* session;
	invariantWTOK(conn->open_session(conn, nullptr, nullptr, &session));

	WT_CURSOR* mdbCatalogCursor;
	WT_CURSOR* sizeStorerCursor;
	invariantWTOK(
	session->open_cursor(session, "table:_mdb_catalog", nullptr, nullptr, &mdbCatalogCursor));
	invariantWTOK(
	session->open_cursor(session, "table:sizeStorer", nullptr, nullptr, &sizeStorerCursor));
	while (true) {
	int ret = mdbCatalogCursor->next(mdbCatalogCursor);
	if (ret == WT_NOTFOUND) {
	break;
	}
	invariantWTOK(ret);

	WT_ITEM value;
	invariantWTOK(mdbCatalogCursor->get_value(mdbCatalogCursor, &value));
	BSONObj rawCatalogEntry(value.data);
	if (DurableCatalogImpl::isFeatureDocument(rawCatalogEntry)) {
	// Not actually a public method.
	//
	// Re: this "feature document", we may "technically" need to copy its flag bits over,
	// but realistically it's not meaningful. I defer to storage execution on the best thing
	// to do.
	continue;
	}

	// Lots of options here. I'm providing something that demonstrates we can avoid some of the
	// higher level classes such as `Collection` or `IndexCatalog` that `exportCollection`
	// otherwise depends on. I'm pessimistic those classes will work out of the box when we're
	// trying to point them at catalog stuff in a different mdb_catalog file.
	//
	// Can also attempt to make a version of `enterprise/live_import/exportCollection` that
	// doesn't depend on these classes and share the code.

	// Please refactor this BSONCollectionCatalogEntry stuff if you feel the urge.
	auto catalogEntry = BSONCollectionCatalogEntry::MetaData::parse(rawCatalogEntry);

	// If exportCollection() is refactored so this code can hook into that -- the "output" of
	// this loop would presumably become the bson object we can pass into the `importCollection`
	// command.
	//
	// The `importCollection` command though (unfortunately) does all the replication bits of
	// importing. I presume secondaries on a merging recipient will be independently copying the
	// donor files and getting them consistent with a rollback to stable step. The primaries WT
	// metadata cannot be correctly used by a secondary in this case.
	//
	// Instead what I'll opt for is calling `DurableCatalog::importCollection()` which basically
	// takes these inputs.
	CollectionMetadata toAdd;
	toAdd.collection.filepath =
	importingDbpath + "/" + rawCatalogEntry["ident"].String() + ".wt";
	// Ident === `collection-123-456`.
	toAdd.collection.ident = rawCatalogEntry["ident"].String();
	toAdd.namespaceString = catalogEntry.ns;
	// I'm pretty sure it's fine to use the same session for querying WT's metadata (i.e:
	// WiredTiger.wt).
	toAdd.collection.tableMetadata = uassertStatusOK(WiredTigerUtil::getMetadata(
	session, "table:" + toAdd.collection.ident)); // table:collection-...
	toAdd.collection.fileMetadata = uassertStatusOK(WiredTigerUtil::getMetadata(
	session, "file:" + toAdd.collection.ident)); // file:collection-...
	toAdd.catalogObject = rawCatalogEntry.getOwned();
	/* do a sizeStorerCursor->search on I believe the `toAdd.collUri` string, get an owned bson
	* obj and shove it in here:
	* toAdd.sizeStorerObject = ownedSizeBSONObj;
	*/

	for (const BSONCollectionCatalogEntry::IndexMetaData& index : catalogEntry.indexes) {
	// At this point I learned that IndexMetaData does not provide the "ident" string for
	// indexes either. E.g: `index-123-456`. Thus my plea to refactor the
	// BSONCollectionCatalogEntry file...
	//
	// For now we'll pretend it does...
	WTImportArguments indexImport;
	indexImport.filepath = importingDbpath + "/" + index.ident + ".wt";
	// Ident === `index-124-456`.
	indexImport.ident = index.ident;
	indexImport.tableMetadata =
	WiredTigerUtil::getMetadata(session, "table:" + index.ident);
	indexImport.fileMetadata =
	WiredTigerUtil::getMetadata(session, "file:" + index.ident + ".wt");
	}

	// Closing WT I believe will take a checkpoint. Even though we did no logical writes, it's
	// not obvious to me that the root node offsets we saved as part of the WT metadata sit
	// still. The most conservative thing to do is copy the files at this step before closing
	// WT. Obviously that's less desirable than just moving the files. Moving the files
	// underneath WT while running will almost certainly result WT taking down the whole mongod
	// process.
	//
	// Worth clarifying with WT how these files can be safely and optimally transfered to the
	// "real" dbpath. If my hunch is right that moving files after closing is not safe today --
	// my guess is it's easy for WT to add configuration string to `WT_CONNECTION::close` that
	// skips taking a checkpoint or find some equivalent solution.
	conn->close(conn, nullptr);
	}
	}
	} // namespace mongo