@jiqiujia
Created September 4, 2019 06:17
berkeley lm
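Two small classes built on BerkeleyLM (edu.berkeley.nlp.lm): BuildLM estimates a Kneser-Ney n-gram model from pre-segmented text files and writes it in ARPA format; LoadLM reads that ARPA file back, scores a HanLP-segmented sentence, and prints a pruned distribution over next words.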
BuildLM.java

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import edu.berkeley.nlp.lm.ConfigOptions;
import edu.berkeley.nlp.lm.StringWordIndexer;
import edu.berkeley.nlp.lm.io.ArpaLmReader;
import edu.berkeley.nlp.lm.io.LmReaders;
import edu.berkeley.nlp.lm.util.Logger;

/**
 * Builds a Kneser-Ney n-gram language model from whitespace-tokenized text
 * files and writes it out in ARPA format using BerkeleyLM.
 */
public class BuildLM {

    private static void usage() {
        System.err.println("Usage: <lmOrder> <ARPA lm output file> <textfiles>*");
        System.exit(1);
    }

    public void makelml(String[] argv) {
        if (argv.length < 2) {
            usage();
        }
        final int lmOrder = Integer.parseInt(argv[0]);
        final String outputFile = argv[1];
        final List<String> inputFiles = new ArrayList<>();
        for (int i = 2; i < argv.length; ++i) {
            inputFiles.add(argv[i]);
        }
        // "-" tells BerkeleyLM to read the training text from stdin.
        if (inputFiles.isEmpty()) inputFiles.add("-");
        Logger.setGlobalLogger(new Logger.SystemLogger(System.out, System.err));
        Logger.startTrack("Reading text files " + inputFiles + " and writing to file " + outputFile);
        // Map words to integer ids, reserving the standard <s>, </s> and <unk> symbols.
        final StringWordIndexer wordIndexer = new StringWordIndexer();
        wordIndexer.setStartSymbol(ArpaLmReader.START_SYMBOL);
        wordIndexer.setEndSymbol(ArpaLmReader.END_SYMBOL);
        wordIndexer.setUnkSymbol(ArpaLmReader.UNK_SYMBOL);
        // Estimate a Kneser-Ney smoothed model of the given order and write it as ARPA.
        LmReaders.createKneserNeyLmFromTextFiles(inputFiles, wordIndexer, lmOrder, new File(outputFile), new ConfigOptions());
        Logger.endTrack();
    }

    public static void main(String[] args) {
        BuildLM d = new BuildLM();
        // String inputfile = "local_data/cloth/segdata/wxf_seg_cloth_top_part_noauthor.txt";
        // String outputfile = "local_data/cloth/segdata/cloth.arpa";
        // String[] s = {"3", outputfile, inputfile};
        // d.makelml(s);
        String inputfile = "local_data/cloth/segdata/segSentences.txt";
        String outputfile = "local_data/cloth/segdata/sentence.arpa";
        // Build a trigram (order 3) model from the pre-segmented sentences.
        String[] s = {"3", outputfile, inputfile};
        d.makelml(s);
    }
}
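The ARPA file written above is plain text and is re-parsed on every load. As a minimal sketch (not part of the original gist), and assuming the writeLmBinary/readLmBinary helpers that LmReaders also provides, the model can be dumped once in BerkeleyLM's binary format for much faster startup; the paths are the example paths used in this gist.

import edu.berkeley.nlp.lm.NgramLanguageModel;
import edu.berkeley.nlp.lm.io.LmReaders;

public class ConvertToBinary {
    public static void main(String[] args) {
        // Parse the textual ARPA model once (compress = false, matching LoadLM below).
        NgramLanguageModel<String> lm =
                LmReaders.readArrayEncodedLmFromArpa("local_data/cloth/segdata/cloth.arpa", false);
        // Dump it in BerkeleyLM's binary format for fast reloading...
        LmReaders.writeLmBinary(lm, "local_data/cloth/segdata/cloth.binary");
        // ...and read it back through the generic NgramLanguageModel interface.
        NgramLanguageModel<String> reloaded =
                LmReaders.readLmBinary("local_data/cloth/segdata/cloth.binary");
        System.out.println("order = " + reloaded.getLmOrder());
    }
}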
LoadLM.java

import java.io.*;
import java.util.List;
import java.util.Map;

import edu.berkeley.nlp.lm.ArrayEncodedProbBackoffLm;
import edu.berkeley.nlp.lm.ConfigOptions;
import edu.berkeley.nlp.lm.NgramLanguageModel;
import edu.berkeley.nlp.lm.StringWordIndexer;
import edu.berkeley.nlp.lm.collections.Counter;
import edu.berkeley.nlp.lm.io.LmReaders;

/**
 * Loads the ARPA model written by BuildLM and queries it: scores a segmented
 * sentence and prints a pruned distribution over next words.
 *
 * Segmentor is a HanLP-based word segmenter wrapper that is not included in
 * this gist.
 */
public class LoadLM {

    public static ArrayEncodedProbBackoffLm<String> getLm(boolean compress, String file) {
        final File lmFile = new File(file);
        final ConfigOptions configOptions = new ConfigOptions();
        // Give unknown words log-prob 0 so OOV tokens do not dominate the score.
        configOptions.unknownWordLogProb = 0.0f;
        final ArrayEncodedProbBackoffLm<String> lm = LmReaders.readArrayEncodedLmFromArpa(
                lmFile.getPath(), compress, new StringWordIndexer(), configOptions, Integer.MAX_VALUE);
        return lm;
    }

    public static void main(String[] args) throws IOException {
        ArrayEncodedProbBackoffLm<String> model = LoadLM.getLm(false, "local_data/cloth/segdata/cloth.arpa");
        // Roughly: "cotton feel, with a line of stitching on the front".
        String sentence = "棉质的手感,前面有一条压线";
        Segmentor.loadCustomDictionary(new FileInputStream("server_data/customDictionary.txt"), false);
        List<String> terms = Segmentor.seg(sentence);
        // getLogProb scores the last word of `terms` given the preceding words
        // (truncated to the model order); see the whole-sentence sketch below.
        float score = model.getLogProb(terms);
        System.out.println(score);
        // Distribution over possible next words after `terms`, pruned below 1e-3.
        Counter<String> c = NgramLanguageModel.StaticMethods.getDistributionOverNextWords(model, terms);
        c.pruneKeysBelowThreshold(0.001);
        for (Map.Entry<String, Double> entry : c.getEntriesSortedByDecreasingCount()) {
            System.out.println(entry.getKey() + "\t" + entry.getValue());
        }
    }
}
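model.getLogProb(terms) above scores only the final word of the token list given its context. For a whole-sentence log-probability, here is a minimal sketch (not part of the original gist), assuming the scoreSentence helper in NgramLanguageModel.StaticMethods, which sums per-position scores and also pads with the sentence-boundary symbols.

import java.util.Arrays;
import java.util.List;

import edu.berkeley.nlp.lm.ArrayEncodedProbBackoffLm;
import edu.berkeley.nlp.lm.NgramLanguageModel;

public class ScoreSentence {
    public static void main(String[] args) {
        // Reuse LoadLM's loader (same default package as the classes above).
        ArrayEncodedProbBackoffLm<String> model =
                LoadLM.getLm(false, "local_data/cloth/segdata/cloth.arpa");
        // Stand-in tokens; in the gist these would come from Segmentor.seg(...).
        List<String> terms = Arrays.asList("棉质", "的", "手感");
        // Sums getLogProb over every position of the sentence, including the
        // <s>/</s> boundary symbols, rather than scoring a single n-gram.
        float total = NgramLanguageModel.StaticMethods.scoreSentence(terms, model);
        System.out.println("sentence log-prob: " + total);
    }
}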