Skip to content

Instantly share code, notes, and snippets.

@khaosans
Created April 26, 2016 04:23
Show Gist options
  • Save khaosans/fe16099c8f3bdbde3f2f49a3432946f7 to your computer and use it in GitHub Desktop.
Save khaosans/fe16099c8f3bdbde3f2f49a3432946f7 to your computer and use it in GitHub Desktop.
Ignore the rest
package com.jamasoftware.search.util.analyzer;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
WARNING: if you change JamaTokenizerImpl.jflex and need to regenerate
the tokenizer, use JFlex 1.5.0!
***JAMA Note***: this file is based on ClassicTokenizerImpl.jflex
*/
import java.io.Reader;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.standard.StandardTokenizerInterface;
%%
// ---- JFlex options for the generated scanner ----
// Name of the generated scanner class.
%class JamaTokenizerImpl
// The generated class declares that it implements this Lucene interface.
%implements StandardTokenizerInterface
// NOTE(review): the %unicode directive is commented out, yet the macros below
// reference code points beyond ASCII (Thai, CJ ranges). Confirm which input
// character set the required JFlex version falls back to without it.
//%unicode 3.0
// The scanning method returns int (the token type constants declared below).
%integer
// The scanning method is named getNextToken.
%function getNextToken
// Code-generation option for the DFA tables (packed encoding, per the JFlex manual).
%pack
// Turns on character counting; the count is exposed through yychar() below.
%char
%{
  // ------------------------------------------------------------------
  // Token type ids. Every value is taken from JamaTokenizer, and each
  // one is returned by the lexical rule that carries the same name.
  // ------------------------------------------------------------------

  /** Token type id returned by the ALPHANUM rule. */
  public static final int ALPHANUM = JamaTokenizer.ALPHANUM;

  /** Token type id returned by the APOSTROPHE rule. */
  public static final int APOSTROPHE = JamaTokenizer.APOSTROPHE;

  /** Token type id returned by the ACRONYM rule. */
  public static final int ACRONYM = JamaTokenizer.ACRONYM;

  /** Token type id returned by the COMPANY rule. */
  public static final int COMPANY = JamaTokenizer.COMPANY;

  /** Token type id returned by the EMAIL rule. */
  public static final int EMAIL = JamaTokenizer.EMAIL;

  /** Token type id returned by the HOST rule. */
  public static final int HOST = JamaTokenizer.HOST;

  /** Token type id returned by the NUM rule. */
  public static final int NUM = JamaTokenizer.NUM;

  /** Token type id returned by the CJ rule. */
  public static final int CJ = JamaTokenizer.CJ;

  //***JAMA Changes***

  /** Token type id returned by the COMPOUND rule. */
  public static final int COMPOUND = JamaTokenizer.COMPOUND;

  /** Token type id returned by the ATMENTION rule. */
  public static final int ATMENTION = JamaTokenizer.ATMENTION;

  /** Token type id returned by the EXTRACHARS rule. */
  public static final int EXTRACHARS = JamaTokenizer.EXTRACHARS;

  /**
   * Token type id returned by the ACRONYM_DEP rule.
   *
   * @deprecated this solves a bug where HOSTs that end with '.' are identified
   * as ACRONYMs.
   */
  @Deprecated
  public static final int ACRONYM_DEP = JamaTokenizer.ACRONYM_DEP;

  /** Token type names, shared with JamaTokenizer. */
  public static final String [] TOKEN_TYPES = JamaTokenizer.TOKEN_TYPES;

  /**
   * Exposes the scanner's {@code yychar} counter, which JFlex maintains
   * because the {@code %char} option is enabled for this scanner.
   *
   * @return the current value of {@code yychar}
   */
  public final int yychar() {
    return this.yychar;
  }

  /**
   * Copies the text of the current token into the given attribute.
   * The token occupies the scanner buffer from {@code zzStartRead}
   * (inclusive) up to {@code zzMarkedPos} (exclusive).
   *
   * @param t attribute that receives the token characters
   */
  public final void getText(CharTermAttribute t) {
    final int tokenStart = zzStartRead;
    final int tokenLength = zzMarkedPos - tokenStart;
    t.copyBuffer(zzBuffer, tokenStart, tokenLength);
  }

  /**
   * Not supported by this scanner.
   *
   * @param i requested buffer size (ignored)
   * @throws UnsupportedOperationException always
   */
  @Override
  public void setBufferSize(int i) {
    throw new UnsupportedOperationException();
  }
%}
// ---- Macro definitions. Macros may reference macros that are defined further down. ----
// code points U+0E00 through U+0E59
THAI = [\u0E00-\u0E59]
// basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function)
// This variant additionally admits "_" and every EXTRA_NON_MENTION_CHARS character,
// so the macro also matches text such as "a_b" or "100%".
ALPHANUM = ({LETTER}|{THAI}|{EXTRA_NON_MENTION_CHARS}|[:digit:]|["_"])+
//***JAMA Changes*** Treat "_" as a letter
// compound word: two or more ALPHANUM runs joined by "_" or "-"
COMPOUND = {ALPHANUM} (("_"|"-") {ALPHANUM})+
//***JAMA Changes*** Keep @ mentions intact
// NOTE(review): the body after the "@"/"#" must be a COMPOUND, i.e. it needs at least
// one "_" or "-" separator; a bare "@name" does not match ATMENTION. Confirm this is intended.
ATMENTION = {ATMENTION_CHARS} {COMPOUND}
// the single characters that may introduce a mention
ATMENTION_CHARS = ("@"|"#")
// internal apostrophes: O'Reilly, you're, O'Reilly's
// use a post-filter to remove possessives
APOSTROPHE = {ALPHA} ("'" {ALPHA})+
// acronyms: U.S.A., I.B.M., etc.
// use a post-filter to remove dots
ACRONYM = {LETTER} "." ({LETTER} ".")+
// dot-terminated sequence of two or more ALPHANUM segments (each segment followed by ".")
ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+
// company names like AT&T and Excite@Home.
COMPANY = {ALPHA} ("&"|"@") {ALPHA}
// email addresses
EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
// hostname
HOST = {ALPHANUM} ((".") {ALPHANUM})+
// floating point, serial, model numbers, ip addresses, etc.
// every other segment must have at least one digit
NUM = ({ALPHANUM} {P} {HAS_DIGIT}
| {HAS_DIGIT} {P} {ALPHANUM}
| {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
| {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
| {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
| {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
// punctuation (segment separators used by NUM)
P = ("_"|"-"|"/"|"."|",")
// at least one digit
HAS_DIGIT = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*
// one or more letters
ALPHA = ({LETTER})+
// From the JFlex manual: "the expression that matches everything of <a> not matched by <b> is !(!<a>|<b>)"
// Here: everything in [:letter:] that is not in CJ.
LETTER = !(![:letter:]|{CJ})
// Chinese and Japanese (but NOT Korean, which is included in [:letter:])
CJ = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
//***JAMA Changes*** Care about certain characters
// a single special character: either a mention introducer or one of EXTRA_NON_MENTION_CHARS
EXTRACHARS = ({EXTRA_NON_MENTION_CHARS}|{ATMENTION_CHARS})
// special characters that are also accepted inside ALPHANUM
EXTRA_NON_MENTION_CHARS = ("!"|"?"|"^"|"$"|"&"|"%")
%%
// ---- Lexical rules. Each action returns the int token type constant of the same name. ----
// Per the JFlex manual the scanner takes the longest match, and when several rules
// match the same length the rule listed first wins, so the order below is significant.
//***JAMA CHANGES***
// Listed first so that text which is both a COMPOUND and an equally long NUM or
// ALPHANUM (e.g. "abc-123", "a_b") is typed COMPOUND.
{COMPOUND} { return COMPOUND; }
//***JAMA CHANGES***
{ATMENTION} { return ATMENTION; }
{APOSTROPHE} { return APOSTROPHE; }
{ACRONYM} { return ACRONYM; }
{COMPANY} { return COMPANY; }
{EMAIL} { return EMAIL; }
{HOST} { return HOST; }
{NUM} { return NUM; }
{CJ} { return CJ; }
{ACRONYM_DEP} { return ACRONYM_DEP; }
// Placed after the more specific rules above, so it only wins a tie against EXTRACHARS.
{ALPHANUM} { return ALPHANUM; }
//***JAMA CHANGES***
// Because {ALPHANUM} precedes this rule and already accepts every EXTRA_NON_MENTION_CHARS
// character, this rule is only reached for a lone "@" or "#".
{EXTRACHARS} { return EXTRACHARS; }
/** Ignore the rest */
// [^] matches any single character; the action returns nothing, so scanning continues.
[^] { /* ignore */ }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment