@chetanmeh
Created February 24, 2016 05:10
Lucene Token breakdown via Analyzer

Example invocation:

dumpTokenized("中文标题", new CJKAnalyzer(Version.LUCENE_47));

Sample output (here the helper was handed a LimitTokenCountAnalyzer, as it reports in the first line):

Text to tokenize [中文标题] via LimitTokenCountAnalyzer
[中] [文] [标] [题]
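
For comparison, the same helper can be pointed at stock Lucene analyzers. A rough sketch, assuming Lucene 4.7 on the classpath (StandardAnalyzer splits CJK text into single-character tokens, while CJKAnalyzer emits overlapping bigrams; the expected tokens in the comments are illustrative):

// Comparing analyzers with the same helper
dumpTokenized("中文标题", new StandardAnalyzer(Version.LUCENE_47)); // expected: [中] [文] [标] [题]
dumpTokenized("中文标题", new CJKAnalyzer(Version.LUCENE_47));      // expected: [中文] [文标] [标题]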
The helper methods below feed the text through the given Analyzer and print each emitted term. FieldNames.FULLTEXT is a field-name constant from the surrounding project; the field name typically only matters for per-field analyzers.

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

private void dumpTokenized(String text, Analyzer analyzer) throws IOException {
    List<String> tokens = tokenize(text, analyzer);
    System.out.printf("Text to tokenize [%s] via %s %n", text, analyzer.getClass().getSimpleName());
    for (String t : tokens) {
        System.out.printf("[%s] ", t);
    }
    System.out.println();
}

private List<String> tokenize(String text, Analyzer analyzer) throws IOException {
    List<String> tokens = new ArrayList<String>();
    TokenStream stream = analyzer.tokenStream(FieldNames.FULLTEXT, new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class); // registered but not used below
    stream.reset();
    while (stream.incrementToken()) {
        tokens.add(termAtt.toString());
    }
    stream.end();   // signal end-of-stream before closing
    stream.close();
    return tokens;
}
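
To try this outside the original project (which supplies FieldNames.FULLTEXT), a minimal standalone sketch is below; the class name TokenDumpMain and the field name "text" are placeholders, and lucene-core plus lucene-analyzers-common 4.7 are assumed on the classpath:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class TokenDumpMain {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_47);
        // The field name is arbitrary here; CJKAnalyzer does not use it.
        TokenStream stream = analyzer.tokenStream("text", new StringReader("中文标题"));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.printf("[%s] ", termAtt.toString());
        }
        System.out.println();
        stream.end();
        stream.close();
        analyzer.close();
    }
}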