Skip to content

Instantly share code, notes, and snippets.

@inferrna
Last active July 28, 2017 10:54
Show Gist options
  • Save inferrna/c8d22af97e03db5855372b55397024a2 to your computer and use it in GitHub Desktop.
Save inferrna/c8d22af97e03db5855372b55397024a2 to your computer and use it in GitHub Desktop.
Patch to produce the morph-label-set, morphology-map and char_ngram_map files from the syntaxnet training process.
diff --git a/syntaxnet/syntaxnet/lexicon_builder.cc b/syntaxnet/syntaxnet/lexicon_builder.cc
index f1346a8..e9fb413 100644
--- a/syntaxnet/syntaxnet/lexicon_builder.cc
+++ b/syntaxnet/syntaxnet/lexicon_builder.cc
@@ -18,6 +18,7 @@ limitations under the License.
#include "syntaxnet/affix.h"
#include "syntaxnet/dictionary.pb.h"
+#include "syntaxnet/morphology_label_set.h"
#include "syntaxnet/feature_extractor.h"
#include "syntaxnet/segmenter_utils.h"
#include "syntaxnet/sentence.pb.h"
@@ -27,6 +28,7 @@ limitations under the License.
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/env.h"
+#include "syntaxnet/segmenter_utils.h"
// A task that collects term statistics over a corpus and saves a set of
// term maps; these saved mappings are used to map strings to ints in both the
@@ -77,6 +79,11 @@ class LexiconBuilder : public OpKernel {
TermFrequencyMap categories;
TermFrequencyMap labels;
TermFrequencyMap chars;
+ TermFrequencyMap morphs;
+ MorphologyLabelSet morph_label_set;
+ TermFrequencyMap char_ngram_map;
+ int max_char_ngram_length = task_context_.Get("lexicon_max_char_ngram_length", 3);
+ bool use_terminators = task_context_.Get("lexicon_char_ngram_include_terminators", false);
// Affix tables to be populated by the corpus.
AffixTable prefixes(AffixTable::PREFIX, max_prefix_length_);
@@ -124,6 +131,33 @@ class LexiconBuilder : public OpKernel {
if (!c_str.empty() && !HasSpaces(c_str)) chars.Increment(c_str);
}
+ const TokenMorphology &token_morphology = token.GetExtension(TokenMorphology::morphology);
+ for (const TokenMorphology::Attribute &att : token_morphology.attribute()) {
+ morphs.Increment(tensorflow::strings::StrCat(att.name(), "=", att.value()));
+ }
+
+ // Only add non-empty morphologies
+ if(token_morphology.attribute_size() > 0)
+ morph_label_set.Add(token_morphology);
+
+ //Char-ngram generation below
+ vector<tensorflow::StringPiece> char_spN;
+ if (use_terminators) char_spN.push_back("^");
+ SegmenterUtils::GetUTF8Chars(token.word(), &char_spN);
+ if (use_terminators) char_spN.push_back("$");
+ for (int start = 0; start < char_spN.size(); ++start) {
+ string char_ngram;
+ for (int index = 0;
+ index < max_char_ngram_length && start + index < char_spN.size();
+ ++index) {
+ tensorflow::StringPiece c = char_spN[start + index];
+ if (c == " ") break; // Never add char ngrams containing spaces.
+ tensorflow::strings::StrAppend(&char_ngram, c);
+
+ char_ngram_map.Increment(char_ngram);
+ }
+ }
+
// Update the number of processed tokens.
++num_tokens;
}
@@ -142,6 +176,9 @@ class LexiconBuilder : public OpKernel {
TaskContext::InputFile(*task_context_.GetInput("category-map")));
labels.Save(TaskContext::InputFile(*task_context_.GetInput("label-map")));
chars.Save(TaskContext::InputFile(*task_context_.GetInput("char-map")));
+ morphs.Save(TaskContext::InputFile(*task_context_.GetInput("morphology-map")));
+ morph_label_set.Write(TaskContext::InputFile(*task_context_.GetInput("morph-label-set")));
+ char_ngram_map.Save(TaskContext::InputFile(*task_context_.GetInput("char-ngram-map")));
// Write affixes to disk.
WriteAffixTable(prefixes, TaskContext::InputFile(
@fiammettaAtGit
Copy link

Hi, I'm trying to get my re-trained syntaxnet for the Italian language working.
Like many of us, I had a successful training process but a painful execution: my test failed because the morphology-map and char-ngram-map files were missing.
Could you please tell me how I should use your patch for lexicon_builder.cc?
Thanks in advance,
Regards,
Fiammetta

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment