@jiqiujia
Created September 4, 2019 06:17
berkeley lm
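Two small classes built on BerkeleyLM (edu.berkeley.nlp.lm): BuildLM estimates a Kneser-Ney n-gram model from pre-segmented text files and writes it in ARPA format; LoadLM reads that ARPA file back, scores a HanLP-segmented sentence, and prints a pruned distribution over next words.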
BuildLM.java

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import edu.berkeley.nlp.lm.ConfigOptions;
import edu.berkeley.nlp.lm.StringWordIndexer;
import edu.berkeley.nlp.lm.io.ArpaLmReader;
import edu.berkeley.nlp.lm.io.LmReaders;
import edu.berkeley.nlp.lm.util.Logger;

/**
 * Builds a Kneser-Ney n-gram language model from whitespace-tokenized text
 * files and writes it out in ARPA format using BerkeleyLM.
 */
public class BuildLM {

    private static void usage() {
        System.err.println("Usage: <lmOrder> <ARPA lm output file> <textfiles>*");
        System.exit(1);
    }

    public void makelml(String[] argv) {
        if (argv.length < 2) {
            usage();
        }
        final int lmOrder = Integer.parseInt(argv[0]);
        final String outputFile = argv[1];
        final List<String> inputFiles = new ArrayList<>();
        for (int i = 2; i < argv.length; ++i) {
            inputFiles.add(argv[i]);
        }
        // "-" tells BerkeleyLM to read the training text from stdin.
        if (inputFiles.isEmpty()) inputFiles.add("-");
        Logger.setGlobalLogger(new Logger.SystemLogger(System.out, System.err));
        Logger.startTrack("Reading text files " + inputFiles + " and writing to file " + outputFile);
        // Map words to integer ids, reserving the standard <s>, </s> and <unk> symbols.
        final StringWordIndexer wordIndexer = new StringWordIndexer();
        wordIndexer.setStartSymbol(ArpaLmReader.START_SYMBOL);
        wordIndexer.setEndSymbol(ArpaLmReader.END_SYMBOL);
        wordIndexer.setUnkSymbol(ArpaLmReader.UNK_SYMBOL);
        // Estimate a Kneser-Ney smoothed model of the given order and write it as ARPA.
        LmReaders.createKneserNeyLmFromTextFiles(inputFiles, wordIndexer, lmOrder, new File(outputFile), new ConfigOptions());
        Logger.endTrack();
    }

    public static void main(String[] args) {
        BuildLM d = new BuildLM();
        // String inputfile = "local_data/cloth/segdata/wxf_seg_cloth_top_part_noauthor.txt";
        // String outputfile = "local_data/cloth/segdata/cloth.arpa";
        // String[] s = {"3", outputfile, inputfile};
        // d.makelml(s);
        String inputfile = "local_data/cloth/segdata/segSentences.txt";
        String outputfile = "local_data/cloth/segdata/sentence.arpa";
        // Build a trigram (order 3) model from the pre-segmented sentences.
        String[] s = {"3", outputfile, inputfile};
        d.makelml(s);
    }
}
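The ARPA file written above is plain text and is re-parsed on every load. As a minimal sketch (not part of the original gist), and assuming the writeLmBinary/readLmBinary helpers that LmReaders also provides, the model can be dumped once in BerkeleyLM's binary format for much faster startup; the paths are the example paths used in this gist.

import edu.berkeley.nlp.lm.NgramLanguageModel;
import edu.berkeley.nlp.lm.io.LmReaders;

public class ConvertToBinary {
    public static void main(String[] args) {
        // Parse the textual ARPA model once (compress = false, matching LoadLM below).
        NgramLanguageModel<String> lm =
                LmReaders.readArrayEncodedLmFromArpa("local_data/cloth/segdata/cloth.arpa", false);
        // Dump it in BerkeleyLM's binary format for fast reloading...
        LmReaders.writeLmBinary(lm, "local_data/cloth/segdata/cloth.binary");
        // ...and read it back through the generic NgramLanguageModel interface.
        NgramLanguageModel<String> reloaded =
                LmReaders.readLmBinary("local_data/cloth/segdata/cloth.binary");
        System.out.println("order = " + reloaded.getLmOrder());
    }
}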
LoadLM.java

import java.io.*;
import java.util.List;
import java.util.Map;

import edu.berkeley.nlp.lm.ArrayEncodedProbBackoffLm;
import edu.berkeley.nlp.lm.ConfigOptions;
import edu.berkeley.nlp.lm.NgramLanguageModel;
import edu.berkeley.nlp.lm.StringWordIndexer;
import edu.berkeley.nlp.lm.collections.Counter;
import edu.berkeley.nlp.lm.io.LmReaders;

/**
 * Loads the ARPA model written by BuildLM and queries it: scores a segmented
 * sentence and prints a pruned distribution over next words.
 *
 * Segmentor is a HanLP-based word segmenter wrapper that is not included in
 * this gist.
 */
public class LoadLM {

    public static ArrayEncodedProbBackoffLm<String> getLm(boolean compress, String file) {
        final File lmFile = new File(file);
        final ConfigOptions configOptions = new ConfigOptions();
        // Give unknown words log-prob 0 so OOV tokens do not dominate the score.
        configOptions.unknownWordLogProb = 0.0f;
        final ArrayEncodedProbBackoffLm<String> lm = LmReaders.readArrayEncodedLmFromArpa(
                lmFile.getPath(), compress, new StringWordIndexer(), configOptions, Integer.MAX_VALUE);
        return lm;
    }

    public static void main(String[] args) throws IOException {
        ArrayEncodedProbBackoffLm<String> model = LoadLM.getLm(false, "local_data/cloth/segdata/cloth.arpa");
        // Roughly: "cotton feel, with a line of stitching on the front".
        String sentence = "棉质的手感,前面有一条压线";
        Segmentor.loadCustomDictionary(new FileInputStream("server_data/customDictionary.txt"), false);
        List<String> terms = Segmentor.seg(sentence);
        // getLogProb scores the last word of `terms` given the preceding words
        // (truncated to the model order); see the whole-sentence sketch below.
        float score = model.getLogProb(terms);
        System.out.println(score);
        // Distribution over possible next words after `terms`, pruned below 1e-3.
        Counter<String> c = NgramLanguageModel.StaticMethods.getDistributionOverNextWords(model, terms);
        c.pruneKeysBelowThreshold(0.001);
        for (Map.Entry<String, Double> entry : c.getEntriesSortedByDecreasingCount()) {
            System.out.println(entry.getKey() + "\t" + entry.getValue());
        }
    }
}
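model.getLogProb(terms) above scores only the final word of the token list given its context. For a whole-sentence log-probability, here is a minimal sketch (not part of the original gist), assuming the scoreSentence helper in NgramLanguageModel.StaticMethods, which sums per-position scores and also pads with the sentence-boundary symbols.

import java.util.Arrays;
import java.util.List;

import edu.berkeley.nlp.lm.ArrayEncodedProbBackoffLm;
import edu.berkeley.nlp.lm.NgramLanguageModel;

public class ScoreSentence {
    public static void main(String[] args) {
        // Reuse LoadLM's loader (same default package as the classes above).
        ArrayEncodedProbBackoffLm<String> model =
                LoadLM.getLm(false, "local_data/cloth/segdata/cloth.arpa");
        // Stand-in tokens; in the gist these would come from Segmentor.seg(...).
        List<String> terms = Arrays.asList("棉质", "的", "手感");
        // Sums getLogProb over every position of the sentence, including the
        // <s>/</s> boundary symbols, rather than scoring a single n-gram.
        float total = NgramLanguageModel.StaticMethods.scoreSentence(terms, model);
        System.out.println("sentence log-prob: " + total);
    }
}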