Skip to content

Instantly share code, notes, and snippets.

@oleiade
Created January 30, 2012 13:34
Show Gist options
  • Save oleiade/1704423 to your computer and use it in GitHub Desktop.
Save oleiade/1704423 to your computer and use it in GitHub Desktop.
#include <cassert>
#include <fstream>
#include <iostream>
#include <set>
#include <string>
#include <boost/algorithm/string.hpp>
#include "dump.hh"
#include "constants.hh"
#include "leveldb/db.h"
#include "leveldb/write_batch.h"
/*
** Opens (creating it if missing) the LevelDB database located at "./db"
** and stores both the handle and the resulting status on the instance.
**
** A failed open is reported on stderr with the LevelDB status text before
** the assertion fires, so the cause is visible in debug builds and the
** failure is not silent when assertions are compiled out (NDEBUG).
*/
Dump::Dump()
{
    /* Make the handle well-defined even if the open below fails. */
    this->_db = 0;

    leveldb::Options options;
    options.create_if_missing = true;

    this->_status = leveldb::DB::Open(options, "./db", &this->_db);
    if (!this->_status.ok())
    {
        std::cerr << "Failed to open LevelDB database ./db: "
                  << this->_status.ToString() << std::endl;
    }
    assert(this->_status.ok());
}
/*
** Releases the LevelDB handle held by this instance.
**
** LevelDB databases are closed by deleting the DB object; without this the
** handle opened in the constructor is leaked and the database stays locked
** until the process exits. Deleting a null handle is a no-op.
**
** NOTE(review): this assumes the instance owns whatever handle it holds,
** including one installed through setDb() - confirm no caller keeps
** ownership of a handle passed to setDb(), and that Dump is never copied.
*/
Dump::~Dump()
{
    delete this->_db;
    this->_db = 0;
}
/*
** Copies the fields of an assertion onto the given node.
**
** With at least three fields, ast[1] becomes the node property and ast[2]
** its destination; when the assertion has exactly four fields, ast[3] is
** stored as the node value. Shorter assertions leave the node untouched.
**
** Returns the node pointer it was given.
*/
freebase::Node* Dump::_assertion_to_node(freebase::Node* node, assertion &ast)
{
    /* Not enough fields to describe a property/destination pair. */
    if (ast.size() < 3)
        return node;

    node->set_property(ast[1]);
    node->set_destination(ast[2]);

    const bool carries_value = (ast.size() == 4);
    if (carries_value)
    {
        node->set_value(ast[3]);
    }
    return node;
}
/*
** Builds a LevelDB write batch from the cycle buffer: every buffered
** entity is serialized and queued as a Put under its buffer key.
**
** An entity whose SerializeToString() call fails is reported on stderr and
** left out of the batch, so the output of a failed serialization is never
** stored in the database.
**
** Returns the batch by value; the cycle buffer itself is not modified.
*/
leveldb::WriteBatch Dump::_buffer_to_batch()
{
    leveldb::WriteBatch batch;
    assertion_buffer& cycle_buffer = this->getCycleBuffer();

    for (assertion_buffer::iterator it = cycle_buffer.begin();
         it != cycle_buffer.end(); ++it)
    {
        std::string value;

        if (!it->second.SerializeToString(&value))
        {
            std::cerr << "Failed to write entity to batch" << std::endl;
            continue; /* do not queue the result of a failed serialization */
        }
        batch.Put(it->first, value);
    }
    return (batch);
}
/*
** Folds one parsed assertion into the pending writes.
**
** ast[0] is used as the entity key (mid); the remaining fields are turned
** into a node by _assertion_to_node().
**
**  - Key already met: the node is appended to the entity stored under that
**    key in the cycle buffer (the entity is created on first use).
**  - Key met for the first time: a one-node entity is serialized, queued in
**    `batch` under the key, and the key is recorded in the met-mids set.
**
** Assertions with fewer than three fields are ignored: they carry no
** property/destination pair for _assertion_to_node() to copy, and reading
** ast[0] on an empty assertion would be out of bounds.
**
** If serializing a first-seen entity fails, the error is reported on
** stderr, nothing is queued and the key is not marked as met.
**
** NOTE(review): dump_freebase_to_db() writes `batch` first and the batch
** built from the cycle buffer second, both with Put on the same keys, so
** the buffered entity replaces the value queued here - confirm whether the
** first assertion of a mid should also be part of the buffered entity.
*/
void Dump::_process_assertion(leveldb::WriteBatch& batch,
                              assertion& ast)
{
    if (ast.size() < 3)
        return;

    std::string const key = ast[0];

    /* met mid */
    if (Dump::in_set(this->getMetMids(), key))
    {
        /*
        ** operator[] creates an empty entity when the mid is not buffered
        ** yet. The node must be added to the buffered entity itself:
        ** adding it to a local entity would drop the assertion.
        */
        freebase::Entity& buffered_entity = this->getCycleBuffer()[key];
        this->_assertion_to_node(buffered_entity.add_node(), ast);
        return;
    }

    /* new mid */
    freebase::Entity entity;
    this->_assertion_to_node(entity.add_node(), ast);

    std::string str_value;
    if (!entity.SerializeToString(&str_value))
    {
        std::cerr << "Failed to serialize entity " << key << std::endl;
        return;
    }
    batch.Put(key, str_value);
    this->getMetMids().insert(key);
}
/*
** Function launching a freebase dump routine to
** a (ondisk operations) LevelDB database.
*/
bool Dump::dump_freebase_to_db()
{
// std::ifstream fb_fd("../dumps/freebase-datadump-quadruples.tsv");
std::ifstream fb_fd(FREEBASE_QD_PATH);
std::set<std::string> met_mids;
leveldb::WriteBatch batch;
int i = 0;
if (fb_fd.is_open())
{
while (fb_fd.good())
{
std::string line;
getline(fb_fd, line);
assertion line_elems;
boost::split(line_elems, line, boost::is_any_of("\t"));;
this->_process_assertion(batch, line_elems);
if (i == 5000000)
{
leveldb::WriteBatch buffer_batch = this->_buffer_to_batch();
this->getDb()->Write(leveldb::WriteOptions(), &batch);
this->getDb()->Write(leveldb::WriteOptions(), &buffer_batch);
std::cout << "Done." << std::endl;
break;
}
else
i += 1;
}
}
return true;
}
/*
** Replaces the stored database handle with newDb.
** The previous handle is overwritten; it is not released here.
*/
void Dump::setDb(leveldb::DB* const newDb)
{
    _db = newDb;
}
/*
** Stores a copy of st as the instance's LevelDB status.
*/
void Dump::setStatus(leveldb::Status const& st)
{
    _status = st;
}
/*
** Replaces the cycle buffer with a copy of ast_buf.
*/
void Dump::setCycleBuffer(assertion_buffer const& ast_buf)
{
    _cycle_buffer = ast_buf;
}
void Dump::setMetMids(std::set<std::string> const& met_mids)
{
this->_met_mids = met_mids;
}
void Dump::setOpCount(unsigned int const& op_count)
{
this->_op_count = op_count;
}
/*
** Returns the stored database handle.
*/
leveldb::DB* Dump::getDb() const
{
    return _db;
}
/*
** Returns a mutable reference to the stored LevelDB status.
*/
leveldb::Status& Dump::getStatus()
{
    return _status;
}
/*
** Returns a mutable reference to the cycle buffer.
*/
assertion_buffer& Dump::getCycleBuffer()
{
    return _cycle_buffer;
}
std::set<std::string>& Dump::getMetMids()
{
return (this->_met_mids);
}
unsigned int const Dump::getOpCount() const
{
return (this->_op_count);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment