Skip to content

Instantly share code, notes, and snippets.

@odashi
Last active January 26, 2021 07:11
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save odashi/3aa72dc5cbe1abc73739f7625e281299 to your computer and use it in GitHub Desktop.
Save odashi/3aa72dc5cbe1abc73739f7625e281299 to your computer and use it in GitHub Desktop.
Simple KyTea word segmenter-only pipeline.
// Wrapper class of KyTea word segmenter.
// Author: odashi
// Date: 2021-01-26
// License: MIT
#include <memory>
#include <string>
#include <vector>
#include "kytea/kytea.h"
#include "kytea/string-util.h"
using kytea::Kytea;
using kytea::KyteaConfig;
using kytea::KyteaSentence;
class WordSegmenter {
public:
// ctor. Currently this takes only a model path.
WordSegmenter(std::string const& model_path) {
auto config = std::make_unique<KyteaConfig>();
config->setEncoding("utf8");
config->setDoTags(false);
kytea_ = std::make_unique<Kytea>(config.release());
kytea_->readModel(model_path.c_str());
}
// Performs segmentation.
std::vector<std::string> Perform(std::string const& input) {
auto* str_util = kytea_->getStringUtil();
auto sentence = std::make_unique<KyteaSentence>();
sentence->surface = str_util->mapString(input);
sentence->norm = str_util->normalize(sentence->surface);
if (sentence->surface.length() != 0) {
sentence->wsConfs.resize(sentence->surface.length() - 1, 0.0);
}
kytea_->calculateWS(*sentence);
std::vector<std::string> words;
words.reserve(sentence->words.size());
for (auto const& word : sentence->words) {
words.push_back(str_util->showString(word.surface));
}
return words;
}
private:
std::unique_ptr<Kytea> kytea_;
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment