khaosans/gist:fe16099c8f3bdbde3f2f49a3432946f7

## gistfile1.txt
package com.jamasoftware.search.util.analyzer;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*

WARNING: if you change JamaTokenizerImpl.jflex and need to regenerate
      the tokenizer, use JFlex 1.5.0!

***JAMA Note***: this file is based on ClassicTokenizerImpl.jflex
*/

import java.io.Reader;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.standard.StandardTokenizerInterface;

%%

// Name of the generated scanner class.
%class JamaTokenizerImpl
// The generated class implements Lucene's StandardTokenizerInterface.
%implements StandardTokenizerInterface
// NOTE(review): the explicit Unicode version below is disabled, so the scanner
// uses whatever character set JFlex selects by default -- confirm that it
// covers the \u ranges referenced by the macros in this file.
//%unicode 3.0
// The scanning method returns int token codes.
%integer
// The scanning method is named getNextToken.
%function getNextToken
// Use JFlex's packed code-generation method for the DFA tables.
%pack
// Enable character counting; the count is exposed through yychar.
%char

%{

// Token type codes returned by the lexical rules of this scanner. Every value
// is taken from the constant of the same name on JamaTokenizer.
public static final int ALPHANUM = JamaTokenizer.ALPHANUM;
public static final int APOSTROPHE = JamaTokenizer.APOSTROPHE;
public static final int ACRONYM = JamaTokenizer.ACRONYM;
public static final int COMPANY = JamaTokenizer.COMPANY;
public static final int EMAIL = JamaTokenizer.EMAIL;
public static final int HOST = JamaTokenizer.HOST;
public static final int NUM = JamaTokenizer.NUM;
public static final int CJ = JamaTokenizer.CJ;

// ***JAMA Changes*** -- token types added on top of the classic set.
public static final int COMPOUND = JamaTokenizer.COMPOUND;
public static final int ATMENTION = JamaTokenizer.ATMENTION;
public static final int EXTRACHARS = JamaTokenizer.EXTRACHARS;

/**
 * @deprecated this solves a bug where HOSTs that end with '.' are identified
 *             as ACRONYMs.
 */
@Deprecated
public static final int ACRONYM_DEP = JamaTokenizer.ACRONYM_DEP;

/** Token type names, shared with JamaTokenizer. */
public static final String[] TOKEN_TYPES = JamaTokenizer.TOKEN_TYPES;

/**
 * Returns the scanner's current {@code yychar} value, the character counter
 * that JFlex maintains because the {@code %char} directive is enabled.
 *
 * @return the current value of the {@code yychar} field
 */
public final int yychar() {
  return this.yychar;
}

/**
 * Copies the text of the current token into the given attribute.
 *
 * @param t attribute that receives the characters between the start of the
 *          current match and the marked end position in the scanner buffer
 */
public final void getText(CharTermAttribute t) {
  final int tokenLength = zzMarkedPos - zzStartRead;
  t.copyBuffer(zzBuffer, zzStartRead, tokenLength);
}

/**
 * Not supported by this scanner.
 *
 * @param i ignored
 * @throws UnsupportedOperationException always
 */
@Override
public void setBufferSize(int i) {
  throw new UnsupportedOperationException();
}


%}

// Thai characters, U+0E00 through U+0E59
THAI       = [\u0E00-\u0E59]

// basic word: one or more letters, Thai characters, digits, underscores or
// EXTRA_NON_MENTION_CHARS (includes Thai to enable ThaiAnalyzer to function)
//***JAMA Changes*** "_" and the EXTRA_NON_MENTION_CHARS are treated as letters
ALPHANUM   = ({LETTER}|{THAI}|{EXTRA_NON_MENTION_CHARS}|[:digit:]|["_"])+

//***JAMA Changes*** Two or more ALPHANUM runs joined by "_" or "-"
COMPOUND   = {ALPHANUM} (("_"|"-") {ALPHANUM})+

//***JAMA Changes*** Keep @ and # mentions intact
// NOTE(review): the mention body must be a COMPOUND, so a marker followed by a
// word without any "_" or "-" (e.g. "@john") does not match -- confirm intended.
ATMENTION = {ATMENTION_CHARS} {COMPOUND}

// characters that introduce a mention
ATMENTION_CHARS = ("@"|"#")

// internal apostrophes: O'Reilly, you're, O'Reilly's
// use a post-filter to remove possessives
APOSTROPHE =  {ALPHA} ("'" {ALPHA})+

// acronyms: U.S.A., I.B.M., etc. (single letters, each followed by a dot)
// use a post-filter to remove dots
ACRONYM    =  {LETTER} "." ({LETTER} ".")+

// deprecated acronym form: two or more ALPHANUM segments, each followed by a dot
// (see the @deprecated note on the ACRONYM_DEP constant)
ACRONYM_DEP	= {ALPHANUM} "." ({ALPHANUM} ".")+

// company names like AT&T and Excite@Home.
// exactly two ALPHA runs joined by a single "&" or "@"
COMPANY    =  {ALPHA} ("&"|"@") {ALPHA}

// email addresses: ALPHANUM segments separated by ".", "-" or "_", then "@",
// then at least two ALPHANUM segments separated by "." or "-"
EMAIL      =  {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+

// hostname: two or more ALPHANUM segments separated by "."
HOST       =  {ALPHANUM} ((".") {ALPHANUM})+

// floating point, serial, model numbers, ip addresses, etc.
// segments are separated by a single P character;
// every other segment must have at least one digit
NUM        = ({ALPHANUM} {P} {HAS_DIGIT}
           | {HAS_DIGIT} {P} {ALPHANUM}
           | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
           | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
           | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
           | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)

// punctuation allowed between the segments of a NUM
P	         = ("_"|"-"|"/"|"."|",")

// at least one digit, surrounded by any number of letters and digits
HAS_DIGIT  = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*

// one or more letters
ALPHA      = ({LETTER})+

// every [:letter:] that is not a CJ character
// From the JFlex manual: "the expression that matches everything of <a> not matched by <b> is !(!<a>|<b>)"
LETTER     = !(![:letter:]|{CJ})

// Chinese and Japanese (but NOT Korean, which is included in [:letter:])
CJ         = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]

//***JAMA Changes*** Care about certain characters:
// a single special character, either a mention marker or one of the extras below
EXTRACHARS = ({EXTRA_NON_MENTION_CHARS}|{ATMENTION_CHARS})

// special characters that are also accepted inside ALPHANUM
EXTRA_NON_MENTION_CHARS = ("!"|"?"|"^"|"$"|"&"|"%")

%%

// Rule order matters: JFlex takes the longest match, and when several rules
// match the same length the rule listed first wins. Each rule returns the
// token type code of the same name.
//***JAMA CHANGES***
{COMPOUND}                                                     { return COMPOUND; }
//***JAMA CHANGES***
{ATMENTION}                                                    { return ATMENTION; }
{APOSTROPHE}                                                   { return APOSTROPHE; }
{ACRONYM}                                                      { return ACRONYM; }
{COMPANY}                                                      { return COMPANY; }
{EMAIL}                                                        { return EMAIL; }
{HOST}                                                         { return HOST; }
{NUM}                                                          { return NUM; }
{CJ}                                                           { return CJ; }
{ACRONYM_DEP}                                                  { return ACRONYM_DEP; }
{ALPHANUM}                                                     { return ALPHANUM; }
//***JAMA CHANGES***
{EXTRACHARS}                                                   { return EXTRACHARS; }

/** Ignore the rest: any other single character is consumed without returning a token */
[^]                                              { /* ignore */ }
	package com.jamasoftware.search.util.analyzer;

	/**
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/*

	WARNING: if you change JamaTokenizerImpl.jflex and need to regenerate
	the tokenizer, use JFlex 1.5.0!

	*JAMA Note*: this file is based on ClassicTokenizerImpl.jflex
	*/

	import java.io.Reader;
	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
	import org.apache.lucene.analysis.standard.StandardTokenizerInterface;

	%%

	// Name of the generated scanner class.
	%class JamaTokenizerImpl
	// The generated class implements Lucene's StandardTokenizerInterface.
	%implements StandardTokenizerInterface
	// NOTE(review): the explicit Unicode version below is disabled, so the scanner
	// uses whatever character set JFlex selects by default -- confirm that it
	// covers the \u ranges referenced by the macros in this file.
	//%unicode 3.0
	// The scanning method returns int token codes.
	%integer
	// The scanning method is named getNextToken.
	%function getNextToken
	// Use JFlex's packed code-generation method for the DFA tables.
	%pack
	// Enable character counting; the count is exposed through yychar.
	%char

	%{

	// Token type codes returned by the lexical rules of this scanner. Every value
	// is taken from the constant of the same name on JamaTokenizer.
	public static final int ALPHANUM   = JamaTokenizer.ALPHANUM;
	public static final int APOSTROPHE = JamaTokenizer.APOSTROPHE;
	public static final int ACRONYM    = JamaTokenizer.ACRONYM;
	public static final int COMPANY    = JamaTokenizer.COMPANY;
	public static final int EMAIL      = JamaTokenizer.EMAIL;
	public static final int HOST       = JamaTokenizer.HOST;
	public static final int NUM        = JamaTokenizer.NUM;
	public static final int CJ         = JamaTokenizer.CJ;

	// *JAMA Changes* -- token types added on top of the classic set.
	public static final int COMPOUND   = JamaTokenizer.COMPOUND;
	public static final int ATMENTION  = JamaTokenizer.ATMENTION;
	public static final int EXTRACHARS = JamaTokenizer.EXTRACHARS;

	/**
	 * @deprecated this solves a bug where HOSTs that end with '.' are identified
	 *             as ACRONYMs.
	 */
	@Deprecated
	public static final int ACRONYM_DEP = JamaTokenizer.ACRONYM_DEP;

	/** Token type names, shared with JamaTokenizer. */
	public static final String[] TOKEN_TYPES = JamaTokenizer.TOKEN_TYPES;

	/**
	 * Returns the scanner's current {@code yychar} value, the character counter
	 * that JFlex maintains because the {@code %char} directive is enabled.
	 *
	 * @return the current value of the {@code yychar} field
	 */
	public final int yychar() {
		return this.yychar;
	}

	/**
	 * Copies the text of the current token into the given attribute.
	 *
	 * @param t attribute that receives the characters between the start of the
	 *          current match and the marked end position in the scanner buffer
	 */
	public final void getText(CharTermAttribute t) {
		final int tokenLength = zzMarkedPos - zzStartRead;
		t.copyBuffer(zzBuffer, zzStartRead, tokenLength);
	}

	/**
	 * Not supported by this scanner.
	 *
	 * @param i ignored
	 * @throws UnsupportedOperationException always
	 */
	@Override
	public void setBufferSize(int i) {
		throw new UnsupportedOperationException();
	}


	%}

	// Thai characters, U+0E00 through U+0E59
	THAI = [\u0E00-\u0E59]

	// basic word: one or more letters, Thai characters, digits, underscores or
	// EXTRA_NON_MENTION_CHARS (includes Thai to enable ThaiAnalyzer to function)
	//***JAMA Changes*** "_" and the EXTRA_NON_MENTION_CHARS are treated as letters
	ALPHANUM = ({LETTER}|{THAI}|{EXTRA_NON_MENTION_CHARS}|[:digit:]|["_"])+

	//***JAMA Changes*** Two or more ALPHANUM runs joined by "_" or "-"
	COMPOUND = {ALPHANUM} (("_"|"-") {ALPHANUM})+

	//***JAMA Changes*** Keep @ and # mentions intact
	// NOTE(review): the mention body must be a COMPOUND, so a marker followed by a
	// word without any "_" or "-" (e.g. "@john") does not match -- confirm intended.
	ATMENTION = {ATMENTION_CHARS} {COMPOUND}

	// characters that introduce a mention
	ATMENTION_CHARS = ("@"|"#")

	// internal apostrophes: O'Reilly, you're, O'Reilly's
	// use a post-filter to remove possessives
	APOSTROPHE = {ALPHA} ("'" {ALPHA})+

	// acronyms: U.S.A., I.B.M., etc. (single letters, each followed by a dot)
	// use a post-filter to remove dots
	ACRONYM = {LETTER} "." ({LETTER} ".")+

	// deprecated acronym form: two or more ALPHANUM segments, each followed by a dot
	// (see the @deprecated note on the ACRONYM_DEP constant)
	ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+

	// company names like AT&T and Excite@Home.
	// exactly two ALPHA runs joined by a single "&" or "@"
	COMPANY = {ALPHA} ("&"|"@") {ALPHA}

	// email addresses: ALPHANUM segments separated by ".", "-" or "_", then "@",
	// then at least two ALPHANUM segments separated by "." or "-"
	EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+

	// hostname: two or more ALPHANUM segments separated by "."
	HOST = {ALPHANUM} ((".") {ALPHANUM})+

	// floating point, serial, model numbers, ip addresses, etc.
	// segments are separated by a single P character;
	// every other segment must have at least one digit
	NUM = ({ALPHANUM} {P} {HAS_DIGIT}
	| {HAS_DIGIT} {P} {ALPHANUM}
	| {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
	| {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
	| {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
	| {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)

	// punctuation allowed between the segments of a NUM
	P = ("_"|"-"|"/"|"."|",")

	// at least one digit, surrounded by any number of letters and digits
	HAS_DIGIT = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*

	// one or more letters
	ALPHA = ({LETTER})+

	// every [:letter:] that is not a CJ character
	// From the JFlex manual: "the expression that matches everything of <a> not matched by <b> is !(!<a>|<b>)"
	LETTER = !(![:letter:]|{CJ})

	// Chinese and Japanese (but NOT Korean, which is included in [:letter:])
	CJ = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]

	//***JAMA Changes*** Care about certain characters:
	// a single special character, either a mention marker or one of the extras below
	EXTRACHARS = ({EXTRA_NON_MENTION_CHARS}|{ATMENTION_CHARS})

	// special characters that are also accepted inside ALPHANUM
	EXTRA_NON_MENTION_CHARS = ("!"|"?"|"^"|"$"|"&"|"%")

	%%

	// Rule order matters: JFlex takes the longest match, and when several rules
	// match the same length the rule listed first wins. Each rule returns the
	// token type code of the same name.
	//*JAMA CHANGES*
	{COMPOUND} { return COMPOUND; }
	//*JAMA CHANGES*
	{ATMENTION} { return ATMENTION; }
	{APOSTROPHE} { return APOSTROPHE; }
	{ACRONYM} { return ACRONYM; }
	{COMPANY} { return COMPANY; }
	{EMAIL} { return EMAIL; }
	{HOST} { return HOST; }
	{NUM} { return NUM; }
	{CJ} { return CJ; }
	{ACRONYM_DEP} { return ACRONYM_DEP; }
	{ALPHANUM} { return ALPHANUM; }
	//*JAMA CHANGES*
	{EXTRACHARS} { return EXTRACHARS; }

	/** Ignore the rest: any other single character is consumed without returning a token */
	[^] { /* ignore */ }