Skip to content

Instantly share code, notes, and snippets.

@knoguchi
Last active December 6, 2018 21:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save knoguchi/d16f6f2d7536608d8e93867235d73018 to your computer and use it in GitHub Desktop.
Save knoguchi/d16f6f2d7536608d8e93867235d73018 to your computer and use it in GitHub Desktop.
dynamic protobuf parsing
#include <unistd.h>
#include <iostream>
#define BOOST_FILESYSTEM_NO_DEPRECATED
#define BOOST_FILESYSTEM_VERSION 3
#include <boost/filesystem.hpp>
#include <google/protobuf/compiler/importer.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include <google/protobuf/dynamic_message.h>
#include <google/protobuf/text_format.h>
#include "myprj.h"
using namespace std;
using namespace google::protobuf::compiler;
using namespace google::protobuf;
using namespace boost::filesystem;
// Prints diagnostics raised while importing and validating .proto files.
//
// A single instance acts as the sink for three protobuf collector
// interfaces (MultiFileErrorCollector, io::ErrorCollector and
// DescriptorPool::ErrorCollector). Errors go to cerr, warnings go to clog.
// FoundErrors() reports whether at least one *error* (not warning) was seen.
class ErrorPrinter: public MultiFileErrorCollector,
                    public io::ErrorCollector,
                    public DescriptorPool::ErrorCollector {
public:
    // `format` and `tree` are accepted so existing call sites keep working;
    // this implementation does not use them.
    ErrorPrinter(ErrorFormat format, DiskSourceTree *tree = NULL):
        found_errors_(false) {
        (void)format;
        (void)tree;
    }
    ~ErrorPrinter() {}

    // implements MultiFileErrorCollector
    void AddError(const string& filename, int line, int column, const string& message) {
        found_errors_ = true;
        AddErrorOrWarning(filename, line, column, message, "error", cerr);
    }
    void AddWarning(const string& filename, int line, int column, const string& message) {
        // A warning must not flip the error flag.
        AddErrorOrWarning(filename, line, column, message, "warning", clog);
    }

    // implements io::ErrorCollector
    void AddError(int line, int column, const string& message) {
        // No file name is available on this interface; "input" is used as a
        // placeholder. The call below also sets found_errors_.
        AddError("input", line, column, message);
    }
    void AddWarning(int line, int column, const string& message) {
        AddErrorOrWarning("input", line, column, message, "warning", clog);
    }

    // implements DescriptorPool::ErrorCollector
    void AddError(
        const string& filename,
        const string& element_name,
        const Message* descriptor,
        ErrorLocation location,
        const string& message) {
        (void)element_name;
        (void)descriptor;
        (void)location;
        found_errors_ = true;
        // No source position is supplied here, so -1 suppresses it.
        AddErrorOrWarning(filename, -1, -1, message, "error", cerr);
    }
    void AddWarning(
        const string& filename,
        const string& element_name,
        const Message* descriptor,
        ErrorLocation location,
        const string& message) {
        (void)element_name;
        (void)descriptor;
        (void)location;
        AddErrorOrWarning(filename, -1, -1, message, "warning", clog);
    }

    // True once any AddError overload has been called.
    bool FoundErrors() const { return found_errors_;}

private:
    // Writes one diagnostic line to `out` in the form
    //   filename[:line:column]: type: message
    // The position is omitted when `line` is -1. The protobuf collector
    // interfaces document line/column as zero-based, so both are shifted
    // by one for display.
    void AddErrorOrWarning(const string& filename, int line, int column, const string& message, const string& type, ostream& out) {
        out << filename;
        if (line != -1) {
            out << ":" << (line + 1) << ":" << (column + 1);
        }
        // endl flushes so the diagnostic is visible even while the program
        // is blocked reading its input.
        out << ": " << type << ": " << message << endl;
    }

    bool found_errors_;
};
// Registers every entry of the global proto_path_ list with `source_tree`.
// For each pair, `.first` is passed to MapPath() as its first argument and
// `.second` as its second. Always returns true.
bool InitializeDiskSourceTree(DiskSourceTree* source_tree) {
    for (const auto& mapping : proto_path_) {
        source_tree->MapPath(mapping.first, mapping.second);
    }
    // Note: rewriting the input files to proto-path-relative names
    // (MakeInputsBeProtoPathRelative) is not performed here.
    return true;
}
// Rewrites `*proto` from a disk path to the matching virtual path of
// `source_tree`, when such a mapping exists.
//
// Parameters:
//   source_tree        - tree used to translate between disk and virtual paths.
//   proto              - in/out: path of the .proto file; replaced by the
//                        virtual path when DiskFileToVirtualFile() succeeds.
//   fallback_database  - optional (may be nullptr); a file known to this
//                        database is accepted even if it is not on disk.
//
// Returns true when the name is usable (mapped, already virtual, or present
// in the fallback database). Returns false after printing a diagnostic to
// cerr otherwise.
bool MakeProtoProtoPathRelative(
    DiskSourceTree* source_tree, string* proto,
    DescriptorDatabase* fallback_database) {
    FileDescriptorProto fallback_file;
    const bool in_fallback_database =
        fallback_database != nullptr &&
        fallback_database->FindFileByName(*proto, &fallback_file);

    // The path does not exist on disk as given: it may already be a virtual
    // path, or only be known to the fallback database.
    if (access(proto->c_str(), F_OK) < 0) {
        string disk_file;
        if (source_tree->VirtualFileToDiskFile(*proto, &disk_file) ||
            in_fallback_database) {
            return true;
        }
        cerr << *proto << ": " << strerror(ENOENT) << endl;
        return false;
    }

    string virtual_file, shadowing_disk_file;
    switch (source_tree->DiskFileToVirtualFile(
                *proto, &virtual_file, &shadowing_disk_file)) {
    case DiskSourceTree::SUCCESS:
        *proto = virtual_file;
        break;
    case DiskSourceTree::SHADOWED:
        cerr << *proto
             << ": Input is shadowed in the --proto_path by \""
             << shadowing_disk_file
             << "\". Either use the latter file as your input or reorder "
                "the --proto_path so that the former file's location comes first."
             << endl;
        return false;
    case DiskSourceTree::CANNOT_OPEN:
        if (in_fallback_database) {
            return true;
        }
        cerr << *proto << ": " << strerror(errno) << endl;
        return false;
    case DiskSourceTree::NO_MAPPING:
        {
            // Try to interpret the path as a virtual path.
            string disk_file;
            if (source_tree->VirtualFileToDiskFile(*proto, &disk_file) ||
                in_fallback_database) {
                return true;
            }
            // The input path can't be mapped to any --proto_path and it also
            // can't be interpreted as a virtual path. Say so instead of
            // failing silently.
            cerr << *proto
                 << ": File does not reside within any path specified using "
                    "--proto_path. You must specify a --proto_path which "
                    "encompasses this file."
                 << endl;
            return false;
        }
    }
    return true;
}
// Applies MakeProtoProtoPathRelative() to each entry of the global
// input_files_ list, in order, letting it rewrite the entry in place.
// Stops and returns false at the first entry that fails; returns true when
// every entry was accepted.
bool MakeInputsBeProtoPathRelative(
    DiskSourceTree* source_tree, DescriptorDatabase* fallback_database) {
    for (auto entry = input_files_.begin(); entry != input_files_.end(); ++entry) {
        const bool accepted =
            MakeProtoProtoPathRelative(source_tree, &*entry, fallback_database);
        if (!accepted) {
            return false;
        }
    }
    return true;
}
// Looks up every name in the global input_files_ list through
// `descriptor_pool` and appends the resulting FileDescriptor pointers to
// `parsed_files`, in order.
//
// Returns false as soon as one lookup yields NULL. In that case an ENOENT
// message is printed only when descriptor_set_in_names_ is non-empty.
// Returns true when all files were found.
bool ParseInputFiles(
    DescriptorPool* descriptor_pool,
    vector<const FileDescriptor*>* parsed_files) {
    for (const string& proto_name : input_files_) {
        // Track the file for unused-import reporting only while it is
        // being looked up.
        descriptor_pool->AddUnusedImportTrackFile(proto_name);
        const FileDescriptor* descriptor = descriptor_pool->FindFileByName(proto_name);
        descriptor_pool->ClearUnusedImportTrackFiles();

        if (descriptor != NULL) {
            parsed_files->push_back(descriptor);
            continue;
        }

        if (!descriptor_set_in_names_.empty()) {
            cerr <<proto_name <<": " <<strerror(ENOENT) << endl;
        }
        return false;
    }
    return true;
}
// Decodes one binary protobuf message of type codec_type_ from standard
// input and prints it in text format on standard output.
//
// `pool` must be able to resolve codec_type_. Returns false (after printing
// a message to cerr) when the type is unknown, the input cannot be parsed,
// or the output cannot be written. Missing required fields only produce a
// warning.
bool EncodeOrDecode(const DescriptorPool* pool) {
    const Descriptor* type = pool->FindMessageTypeByName(codec_type_);
    if (type == NULL) {
        cerr << "Type not defined: " << codec_type_ << endl;
        return false;
    }

    DynamicMessageFactory dynamic_factory(pool);
    unique_ptr<Message> message(dynamic_factory.GetPrototype(type)->New());

    // Diagnostic only. It must not go to stdout: stdout carries the decoded
    // message, and `out` below writes to the same descriptor without going
    // through cout's buffer.
    cerr << codec_type_ << ": " << type->field_count() << " field(s)" << endl;

    io::FileInputStream in(STDIN_FILENO);
    io::FileOutputStream out(STDOUT_FILENO);

    // Partial parse: required-field checking is done separately below so
    // that an incomplete message can still be printed.
    if (!message->ParsePartialFromZeroCopyStream(&in)) {
        cerr << "Failed to parse input." << endl;
        return false;
    }
    if (!message->IsInitialized()) {
        cerr << "warning Input message is missing required fields: "
             << message->InitializationErrorString() << endl;
    }

    // Flush explicitly so a write failure is reported rather than being
    // lost when `out` is destroyed.
    if (!TextFormat::Print(*message, &out) || !out.Flush()) {
        cerr << "output: I/O error" << endl;
        return false;
    }
    return true;
}
int main(int argc, char *argv[]) {
vector<const FileDescriptor*> parsed_files;
unique_ptr<DiskSourceTree> disk_source_tree;
unique_ptr<ErrorPrinter> error_collector;
unique_ptr<DescriptorPool> descriptor_pool;
unique_ptr<DescriptorDatabase> descriptor_database;
unique_ptr<SourceTreeDescriptorDatabase> source_tree_database;
proto_path_.clear();
const path format_schema_dir = "format_schema";
if(!exists(format_schema_dir) || !is_directory(format_schema_dir)) {
cout << "format_schema directory does not exist: " << format_schema_dir;
return 2;
}
recursive_directory_iterator it(format_schema_dir);
recursive_directory_iterator endit;
while(it != endit)
{
if(is_regular_file(*it)) {
if( it->path().extension().c_str() == string(".proto")) {
cout << "Adding " << it->path().c_str() << endl;
search_path_.insert(it->path().parent_path().c_str());
input_files_.push_back(it->path().filename().c_str());
}
}
++it;
}
for (auto const& element : search_path_) {
proto_path_.push_back(pair<string, string>("", element.c_str()));
}
// add the top directory in case no search path provided
if (proto_path_.empty() && descriptor_set_in_names_.empty()) {
proto_path_.push_back(pair<string, string>("", "."));
}
if (descriptor_set_in_names_.empty()) {
// add proto files
disk_source_tree.reset(new DiskSourceTree());
if(!InitializeDiskSourceTree(disk_source_tree.get())) {
return 1;
}
error_collector.reset(new ErrorPrinter(error_format_, disk_source_tree.get()));
// at this point, it knows the dirs to look for proto files
SourceTreeDescriptorDatabase* database = new SourceTreeDescriptorDatabase(disk_source_tree.get());
database->RecordErrorsTo(error_collector.get());
descriptor_database.reset(database);
descriptor_pool.reset(new DescriptorPool(descriptor_database.get(), database->GetValidationErrorCollector()));
}
descriptor_pool->EnforceWeakDependencies(true);
if(!ParseInputFiles(descriptor_pool.get(), &parsed_files)) {
return 1;
}
// check the type exists
codec_type_ = argv[1];
const Descriptor* type = descriptor_pool->FindMessageTypeByName(codec_type_);
if (type == NULL) {
cerr << "Type not defined: " << codec_type_ << endl;
return 1;
}
// ready to parse
cout << "Ready to parse. Waiting for input\n";
if (!EncodeOrDecode(descriptor_pool.get())) {
return 1;
}
return 0;
}
// Program-wide state shared by the functions of the decoder.
// NOTE(review): this looks like the contents of "myprj.h" — confirm.
//
// The guard makes repeated inclusion harmless, and the standard headers
// below make this file usable without relying on what the includer pulled
// in first. These are still non-inline definitions, so include this from
// exactly one translation unit.
#ifndef MYPRJ_H_
#define MYPRJ_H_

#include <set>
#include <string>
#include <utility>
#include <vector>

// Kept for backward compatibility: code including this file uses
// unqualified standard names.
using namespace std;

// File names of the .proto files to import.
vector<string> input_files_;
// Pairs handed to DiskSourceTree::MapPath() as (first, second).
vector<pair<string, string>> proto_path_;
// Names of descriptor-set inputs; on-disk .proto loading is used when empty.
vector<string> descriptor_set_in_names_;
// Distinct directories that contain .proto files.
set<string> search_path_;
// Name of the message type to decode.
string codec_type_;

// File extension of protobuf schema files.
#define PROTO_EXT ".proto"

// Diagnostic output format selector passed to ErrorPrinter.
enum ErrorFormat {
    ERROR_FORMAT_BLAH
};
ErrorFormat error_format_ = ERROR_FORMAT_BLAH;

#endif  // MYPRJ_H_
@knoguchi
Copy link
Author

knoguchi commented Nov 28, 2018

What does this gist do?

Reads *.proto files from multiple directories, then deserializes a protobuf message that is fed from STDIN.
Based on protobuf v3.0.0 protoc code.

Why so complicated?

Kenton Varda, the original author of Protobuf once said

A DescriptorPool actually sits on top of a DescriptorDatabase. In theory, a DescriptorDatabase could contain an infinite number of types. In real life, we have DescriptorDatabase implementations backed by remote servers containing large databases of types. There's also SourceTreeDescriptorDatabase, which searches for files on the filesystem on-demand.

Apparently there are protobuf type databases that are accessed remotely at Google. The database is backed by the disk file system and possibly by other remote back-end databases. The client accesses the database via a thread-safe pool. These enterprise features were removed from the open-source version of protobuf. We don't need all that complexity when we use protobuf locally, but it helps greatly to know the design.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment