Created
January 30, 2012 13:34
-
-
Save oleiade/1704423 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <fstream> | |
#include <string> | |
#include <boost/algorithm/string.hpp> | |
#include "dump.hh" | |
#include "constants.hh" | |
#include "leveldb/db.h" | |
#include "leveldb/write_batch.h" | |
/*
** Opens the LevelDB database located at "./db" (creating it when it
** does not exist yet) and keeps both the handle and the open status
** on the instance.
*/
Dump::Dump()
{
    /* give every scalar member a defined value before anything reads it */
    this->_db = 0;
    this->_op_count = 0;

    leveldb::Options options;
    options.create_if_missing = true;
    this->_status = leveldb::DB::Open(options, "./db", &this->_db);

    /* report why the open failed; the assert alone prints no reason */
    if (!this->_status.ok())
        std::cerr << "Failed to open database ./db: "
                  << this->_status.ToString() << std::endl;
    assert(this->_status.ok());
}
/*
** Closes the database: the handle handed out by leveldb::DB::Open in
** the constructor is heap-allocated and must be deleted by its owner.
** NOTE(review): assumes a Dump is never copied and that a handle passed
** to setDb() is owned by this object -- confirm against dump.hh.
*/
Dump::~Dump()
{
    delete this->_db;
    this->_db = 0;
}
/*
** Fills `node` from the fields of `ast` and returns `node`.
** With fewer than three fields the node is returned untouched.
** Otherwise field 1 becomes the property and field 2 the destination;
** field 3 becomes the value only when there are exactly four fields.
*/
freebase::Node* Dump::_assertion_to_node(freebase::Node* node, assertion &ast)
{
    /* too short to carry a property and a destination */
    if (ast.size() < 3)
        return node;

    node->set_property(ast[1]);
    node->set_destination(ast[2]);

    if (ast.size() == 4)
        node->set_value(ast[3]);

    return node;
}
/*
** Builds a write batch holding one Put per entry of the cycle buffer:
** the entry's key mapped to the serialized form of its entity.
** Entries whose entity cannot be serialized are reported on stderr and
** left out of the batch.
*/
leveldb::WriteBatch Dump::_buffer_to_batch()
{
    leveldb::WriteBatch batch;
    assertion_buffer& cycle_buffer = this->getCycleBuffer();

    for (assertion_buffer::iterator it = cycle_buffer.begin();
         it != cycle_buffer.end(); ++it)
    {
        std::string value;

        if (!it->second.SerializeToString(&value))
        {
            std::cerr << "Failed to write entity to batch" << std::endl;
            /* do not store a value that failed to serialize */
            continue;
        }
        batch.Put(it->first, value);
    }
    return batch;
}
/*
** Records one assertion under its key, the first field (ast[0]).
**
** - Key already met: the assertion becomes a new node of the entity
**   kept for that key in the cycle buffer (created on first use).
** - New key: a one-node entity is serialized and Put into `batch`,
**   and the key is added to the set of met mids.
**
** An empty assertion is ignored. An entity that fails to serialize is
** reported on stderr and not added to `batch`.
**
** NOTE(review): dump_freebase_to_db() writes the buffer batch after
** `batch`, so for a key present in both, the buffered entity replaces
** the value Put here -- confirm this is intended.
*/
void Dump::_process_assertion(leveldb::WriteBatch& batch,
                              assertion& ast)
{
    /* nothing to key on */
    if (ast.empty())
        return;

    std::string const key = ast[0];

    /* met mid */
    if (Dump::in_set(this->getMetMids(), key))
    {
        assertion_buffer& cycle_buffer = this->getCycleBuffer();

        /* mid not found in buffer */
        if (!Dump::in_assertion_buffer(cycle_buffer, key))
            cycle_buffer[key] = freebase::Entity();

        /* the node must go into the buffered entity, not a temporary */
        freebase::Entity& buffered_entity = cycle_buffer[key];
        this->_assertion_to_node(buffered_entity.add_node(), ast);
    }
    /* new mid */
    else
    {
        freebase::Entity entity;
        std::string str_value;

        this->_assertion_to_node(entity.add_node(), ast);
        if (entity.SerializeToString(&str_value))
            batch.Put(key, str_value);
        else
            std::cerr << "Failed to serialize entity for key " << key << std::endl;
        this->getMetMids().insert(key);
    }
}
/* | |
** Function launching a freebase dump routine to | |
** a (ondisk operations) LevelDB database. | |
*/ | |
bool Dump::dump_freebase_to_db() | |
{ | |
// std::ifstream fb_fd("../dumps/freebase-datadump-quadruples.tsv"); | |
std::ifstream fb_fd(FREEBASE_QD_PATH); | |
std::set<std::string> met_mids; | |
leveldb::WriteBatch batch; | |
int i = 0; | |
if (fb_fd.is_open()) | |
{ | |
while (fb_fd.good()) | |
{ | |
std::string line; | |
getline(fb_fd, line); | |
assertion line_elems; | |
boost::split(line_elems, line, boost::is_any_of("\t"));; | |
this->_process_assertion(batch, line_elems); | |
if (i == 5000000) | |
{ | |
leveldb::WriteBatch buffer_batch = this->_buffer_to_batch(); | |
this->getDb()->Write(leveldb::WriteOptions(), &batch); | |
this->getDb()->Write(leveldb::WriteOptions(), &buffer_batch); | |
std::cout << "Done." << std::endl; | |
break; | |
} | |
else | |
i += 1; | |
} | |
} | |
return true; | |
} | |
/* Replaces the stored database handle with `newDb`. */
void Dump::setDb(leveldb::DB* const newDb)
{
    _db = newDb;
}
/* Overwrites the stored LevelDB status with `st`. */
void Dump::setStatus(leveldb::Status const& st)
{
    _status = st;
}
/* Assigns `ast_buf` to the stored cycle buffer. */
void Dump::setCycleBuffer(assertion_buffer const& ast_buf)
{
    _cycle_buffer = ast_buf;
}
void Dump::setMetMids(std::set<std::string> const& met_mids) | |
{ | |
this->_met_mids = met_mids; | |
} | |
void Dump::setOpCount(unsigned int const& op_count) | |
{ | |
this->_op_count = op_count; | |
} | |
/* Returns the stored database handle. */
leveldb::DB* Dump::getDb() const
{
    return _db;
}
/* Returns a mutable reference to the stored LevelDB status. */
leveldb::Status& Dump::getStatus()
{
    return _status;
}
/* Returns a mutable reference to the stored cycle buffer. */
assertion_buffer& Dump::getCycleBuffer()
{
    return _cycle_buffer;
}
std::set<std::string>& Dump::getMetMids() | |
{ | |
return (this->_met_mids); | |
} | |
unsigned int const Dump::getOpCount() const | |
{ | |
return (this->_op_count); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment