Skip to content

Instantly share code, notes, and snippets.

@rimms
Last active December 15, 2015 06:48
Show Gist options
  • Save rimms/5218540 to your computer and use it in GitHub Desktop.
Save rimms/5218540 to your computer and use it in GitHub Desktop.
Save/Load Ideas

[1] Format of File

Ideas:

  • pfi::data::serialization (currently using)
  • MessagePack
  • Protocol Buffer
  • Thrift
  • Avro
  • JSON ... handling of big integers (> 53 bits) depends on the parser
  • XML
  • YAML

View Points:

  • Interoperability with other software implemented in various language (other than C++)
  • Ease of handling nested structured data (container data such as arrays and maps)
  • Compact size
  • Smaller overhead to load

I think MessagePack seems to be a good choice, for the following reasons:

  • Already used in Jubatus (no new dependencies needed)
  • Supports many languages (Ruby, Python, Perl, Java, Node, Erlang, and more).
  • Compact compared to JSON, XML, and YAML.

1) Extend the Current Implementation

  • Use pfi::data::serialization
  • Add the information that must be included, using a new struct and mixable_holder
struct snap_info {
  template <class Ar>
  void serialize(Ar& ar) {
    ar & MEMBER(version)
       & MEMBER(config)
       & MEMBER(type)
       & MEMBER(timestamp)

  std::string version;
  pfi::text::json::json config;
  std::string type;
  std::string timestamp;
}

bool server_base::save(const std::string& id) {
  (.. snip ..)

  snap_info info;
  info.config = config_;

  (.. snip ..)

  std::ofstream ofs(path.c_str(), std::ios::trunc|std::ios::binary);
  pfi::data::serialization::binary_oarchive oa(ofs);
  oa << info;
  std::vector<mixable0*> mixables = get_mixable_holder()->get_mixables();
  for (size_t i = 0; i < mixables.size(); ++i) {
    mixables[i]->save(ofs);
  }

  (.. snip ..)
}

2) Replace the Current Implementation with MessagePack

  • Use msgpack::pack for interoperability
  • Add the information that must be included, using a new struct and mixable_holder
// Snapshot header for a saved file; server_base::save() packs it with
// MessagePack before the mixable data.
struct snap_info {
  std::string version;
  pfi::text::json::json config;
  std::string type;
  std::string timestamp;

  // NOTE(review): assumes a MessagePack adaptor is available for
  // pfi::text::json::json -- confirm.
  MSGPACK_DEFINE(version, config, type, timestamp);
};

bool server_base::save(const std::string& id) {
  (.. snip ..)

  snap_info info;
  info.config = config_;

  (.. snip ..)

  std::ofstream ofs(path.c_str(), std::ios::trunc|std::ios::binary);
  msgpack::packer<std::ofstream> packer(&ofs);
  packer.pack(info);
  std::vector<mixable0*> mixables = get_mixable_holder()->get_mixables();
  for (size_t i = 0; i < mixables.size(); ++i) {
    mixables[i]->save(ofs);
  }

  (.. snip ..)
}

[2] Handling of Target

  • Currently, the targets of save are the mixable classes held by mixable_holder. Only mixables? Is that not redundant?
  • The targets of save and the mixables must be separated.

1) Divide mixable and savable

  • Add an externalizable class; each class that is a target of save inherits from it
// Interface for objects that can be written to and read back from a stream.
class externalizable {
 public:
  // Virtual so that implementations can be destroyed through a base pointer.
  virtual ~externalizable() {}

  // Writes this object to the given stream; the bool result is defined by
  // the implementation (presumably true on success -- confirm).
  virtual bool write(std::ostream&) = 0;

  // Reads this object from the given stream; the bool result is defined by
  // the implementation (presumably true on success -- confirm).
  virtual bool read(std::istream&) = 0;
};

class snap_info : public externalizable {
 private:
  std::string version;
  pfi::text::json::json config;
  std::string type;
  std::string timestamp;

 public:
  MSGPACK_DEFINE(version, config, type, timestamp);
}

// Example of a class that is both a save target (externalizable) and a
// mixable.
class save_target_class : public externalizable, public mixable<> {
 private:
  // NOTE(review): the key/value template arguments of these maps are elided
  // in this sketch.
  std::unordered_map columns;
  std::unordered_map rows;

 public:
  MSGPACK_DEFINE(columns,rows);

  (.. snip ..)

  // Packs this object (not the `this` pointer) into `ofs`; returns false if
  // the stream is in a failed state afterwards.
  bool write(std::ostream& ofs) {
    msgpack::pack(ofs, *this);
    return !ofs.fail();
  }

  (.. snip ..)
};

// Writes a snapshot to `path` by letting every registered externalizable
// write itself to the output stream, in the order the holder returns them.
// Returns false as soon as one of them reports a failed write.
bool server_base::save(const std::string& id) {
  (.. snip ..)

  std::ofstream ofs(path.c_str(), std::ios::trunc|std::ios::binary);
  msgpack::packer<std::ofstream> packer(&ofs);
  std::vector<externalizable*> externalizables = get_externalizable_holder()->get_externalizables();
  for (size_t i = 0; i < externalizables.size(); ++i) {
    // write() already emits the object into `ofs`; its bool result is a
    // status and must not be packed into the file.
    if (!externalizables[i]->write(ofs)) {
      return false;
    }
  }

  (.. snip ..)
}

2) Divide common and task-dependent parts

We have to consider the details further.

class server_base {
  (.. snip ..)

  static const std::string VERSION(JUBATUS_VERSION);
  static const uint64_t FORMAT_VERSION = 1;
  std::string system_data_containor_sum;
  system_data_containor s_containor;
  std::string user_data_containors_sum;
  vector<data_containor*> u_containors;

  MSGPACK_DEFINE(VERSION, FORMAT_VERSION, system_data_containor_sum, s_containor, user_data_containors_sum, u_containors);

  void register_user_data_containor(data_containor* d) {
    u_containors.push_back(d);
  }

  (.. snip ..)

  bool server_base::save(const std::string& id) {
  (.. snip ..)

    std::ofstream ofs(path.c_str(), std::ios::trunc|std::ios::binary);
    msgpack::packer<std::ofstream> packer(&ofs)

    // fixed order
    packer.pack(VERSION);
    packer.pack(FORMAT_VERSION);
    system_data_containor(s_containor);
    packer.pack(md5sum::md5sum(&s_containor))
    packer.pack(s_containor);
    packer.pack(md5sum::md5sum(&u_containors))
    packer.pack(u_containors);

  (.. snip ..)
  }

}

// Base class of every data container stored in a snapshot.
class data_containor {
 public:
  // Initializes version_ to 1.
  data_containor() : version_(1) {}

  // Virtual so that containers can be destroyed through a base pointer.
  virtual ~data_containor() {}

  uint64_t version_;

  // Version number reported by the concrete container.
  virtual uint64_t version() = 0;

  MSGPACK_DEFINE(version_);
};

class system_data_containor : public data_containor {
  time_t timestamp;
  std::string type;     // task(engine) type name
  std::string id;       // unique name
  std::string config;   // if impossible to compare, may be use JSON object

  uint64_t version() { return 1 ; }
  MSGPACK_DEFINE(version, timestamp, type, id, config);
}

class user_data_containor : public data_containor {
  model_t model;
  storage_t storage;
  weight_mgr_t wm;
  fv_converter_t fv;
  id_generator_t idgen;
  update_cnt_t update_cnt;

  uint64_t version() { return 1 ; }
  MSGPACK_DEFINE(version, model, storage, wm, fv, idgen, update_cnt);
}

// Sketch of how a task server fills a user data container and registers it
// with server_base. Member declaration and setup statements are shown
// together; the enclosing function is elided.
class classifier_serv : public framework::server_base {
  (.. snip ..)

  shared_ptr<user_data_containor> d_;
  // Allocate the container before touching it; a default-constructed
  // shared_ptr is empty.
  d_.reset(new user_data_containor);
  d_->model = model_;
  d_->storage = storage_;
  d_->wm = wm_;
     :

  // register_user_data_containor() takes a raw data_containor*; d_ keeps
  // ownership of the object.
  register_user_data_containor(d_.get());

  (.. snip ..)
}
#
# Jubatus Format
#
- jubatus_version # check with running Jubatus version, WARNING
- format_version # check with running Jubatus version, ERROR
- system_data_containor_sum # check with system_data_containor, ERROR
- system_data_containor # extends data_containor
-
- version # check with running Jubatus version, ERROR
- system_data
-
- config # check with running Jubatus's config, WARNING
- timestamp # meta-data, print for log
- type # meta-data, print for log
- id # meta-data, print for log
- user_data_containors_sum # check with user_data_containors, ERROR
- user_data_containors
-
- user_data_containor1 # extends data_containor
- version # check with running Jubatus version, ERROR
- user_data1
-
- model
- storage
- fv_converter
- global_id
- weight_manager
- update count
- user-defined type
- user_data_containor2 # extends data_containor
- version # check with running Jubatus version, ERROR
- user_data2
-
- user-defined type
#
# data_containor
#
- data_containor
- version
@rimms
Copy link
Author

rimms commented Mar 25, 2013

  • mode(standalone/distributed)
  • mix type
  • command line parameter

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment