Skip to content

Instantly share code, notes, and snippets.

@niujiabenben
Last active January 3, 2020 10:39
Show Gist options
  • Save niujiabenben/f79b438bdb9f8001fd9a784a086b43f7 to your computer and use it in GitHub Desktop.
Save niujiabenben/f79b438bdb9f8001fd9a784a086b43f7 to your computer and use it in GitHub Desktop.
lmdb使用示例
#include <lmdb.h>
#include "common.h"
#include "util.h"
#include "timer.h"
/// Writes the files listed in ./data/samples.txt into an LMDB database at
/// ./data/sample_lmdb and logs the average time spent per mdb_put call.
///
/// Each line of the sample list is read as "<path> <name>". Every line is
/// expanded into 10 records that share the same file content and are keyed
/// "0_<name>" ... "9_<name>". Any failing LMDB call aborts through CHECK_EQ.
int main(int argc, char *argv[]) {
  google::InitGoogleLogging(argv[0]);
  google::LogToStderr();

  const std::string lmdb_file("./data/sample_lmdb");
  const std::string sample_file("./data/samples.txt");

  std::ifstream infile(sample_file);
  CHECK(infile.is_open()) << "Failed to open file: " << sample_file;

  // samples[i] is the file to read, names[i] is the key it is stored under.
  std::vector<std::string> samples;
  std::vector<std::string> names;
  std::string path;
  std::string name;
  while (infile >> path >> name) {
    for (int i = 0; i < 10; ++i) {
      samples.push_back(path);
      names.push_back(std::to_string(i) + "_" + name);
    }
  }
  infile.close();
  LOG(INFO) << "Total samples: " << samples.size();

  Timer timer;
  MDB_env* env = nullptr;
  MDB_txn* txn = nullptr;
  MDB_dbi dbi;
  MDB_val key;
  MDB_val data;

  // 1 TiB map size. The product is formed in size_t so that it does not
  // overflow on platforms where `long` is only 32 bits wide.
  const size_t map_size = static_cast<size_t>(1073741824) * 1024;

  CHECK_EQ(mdb_env_create(&env), 0);
  CHECK_EQ(mdb_env_set_maxreaders(env, 1), 0);
  CHECK_EQ(mdb_env_set_mapsize(env, map_size), 0);
  const int option = MDB_FIXEDMAP | MDB_NOLOCK;
  CHECK_EQ(mdb_env_open(env, lmdb_file.c_str(), option, 0664), 0);
  CHECK_EQ(mdb_txn_begin(env, nullptr, 0, &txn), 0);
  CHECK_EQ(mdb_dbi_open(txn, nullptr, 0, &dbi), 0);

  for (size_t i = 0; i < samples.size(); ++i) {
    // NOTE(review): the meaning of ReadFile's second argument is defined in
    // util.h and is not visible here -- confirm it is what is intended.
    const auto content = ReadFile(samples[i], true);
    CHECK(!content.empty()) << "Failed to read file: " << samples[i];

    // Only the mdb_put call is timed; reading the file is excluded.
    timer.Start();
    key.mv_size = names[i].length();
    // MDB_val holds a non-const void*, hence the const_cast on both buffers.
    key.mv_data = const_cast<char*>(names[i].data());
    data.mv_size = content.length();
    data.mv_data =
        const_cast<void*>(static_cast<const void*>(content.data()));
    CHECK_EQ(mdb_put(txn, dbi, &key, &data, 0), 0);
    timer.Accumulate();

    // Commit in batches of 1000 puts and continue in a fresh transaction.
    if (i % 1000 == 0) {
      CHECK_EQ(mdb_txn_commit(txn), 0);
      CHECK_EQ(mdb_txn_begin(env, nullptr, 0, &txn), 0);
      CHECK_EQ(mdb_dbi_open(txn, nullptr, 0, &dbi), 0);
      LOG(INFO) << i << ": " << timer.AverageMilliSeconds();
    }
  }

  // Commit the transaction that is still open. Without this, every record
  // written since the last batch commit would be discarded when the
  // environment is closed.
  // NOTE(review): presumably MilliSeconds() reports the time elapsed since
  // Start(), so "final" is the duration of this last commit -- confirm
  // against timer.h.
  timer.Start();
  CHECK_EQ(mdb_txn_commit(txn), 0);
  LOG(INFO) << "final: " << timer.MilliSeconds();

  mdb_dbi_close(env, dbi);
  mdb_env_close(env);
  return 0;
}
#! /home/chenli/Documents/tools/anaconda3/envs/pytorch/bin/python
# coding: utf-8
import os
import lmdb
import time
import logging
import init
import lib.util
import PIL.Image
def run_convert_to_lmdb(args):
    """Store every image listed in a sample file into an LMDB database.

    Args:
        args: namespace providing
            sample_file: text file whose first whitespace-separated token on
                each line is an image name relative to ``sample_root``.
            sample_root: directory the image names are joined onto.
            lmdb_file: path passed to ``lmdb.open``.

    Each image is converted to RGB and stored under its name (encoded to
    bytes) as the output of ``tobytes()``. Nothing else, such as the image
    size, is written alongside it. All records go into a single write
    transaction.
    """
    with open(args.sample_file, "r") as srcfile:
        # Blank lines are skipped; they would otherwise raise IndexError.
        samples = [l.split()[0] for l in srcfile if l.strip()]
    ### map_size is given in bytes; the value below is 256 GiB.
    map_size = 256 * 1024 * 1024 * 1024
    env = lmdb.open(args.lmdb_file, map_size=map_size)
    try:
        with env.begin(write=True) as txn:
            start_time = time.time()
            for i, name in enumerate(samples):
                path = os.path.join(args.sample_root, name)
                # The context manager closes the underlying file as soon as
                # the converted copy has been produced.
                with PIL.Image.open(path) as source:
                    image = source.convert("RGB")
                txn.put(key=name.encode(), value=image.tobytes())
                # Report progress at most once every 5 seconds.
                if time.time() - start_time > 5:
                    logging.info("Progress: {}/{}".format(i, len(samples)))
                    start_time = time.time()
    finally:
        # Close the environment even when a read or a put fails.
        env.close()
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, set up logging and run
    # the conversion.
    import argparse
    parser = argparse.ArgumentParser(
        description="Convert dataset to lmdb format.")
    parser.add_argument(
        "--lmdb_file", type=str,
        help="directory where the lmdb-formated database is saved.")
    # Registers --sample_file / --sample_root through the project helper.
    # NOTE(review): the dict values look like defaults -- confirm in lib.util.
    lib.util.add_common_argument(parser, {
        "sample_file": "",
        "sample_root": ""
    })
    args = parser.parse_args()
    # Validate explicitly instead of with `assert`, which is stripped when
    # Python runs with -O and would let a missing path reach lmdb.open.
    if args.lmdb_file is None:
        parser.error("--lmdb_file is required")
    lib.util.initialize_logger()
    run_convert_to_lmdb(args)
    logging.info("Done!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment