Created
April 26, 2016 04:23
-
-
Save khaosans/fe16099c8f3bdbde3f2f49a3432946f7 to your computer and use it in GitHub Desktop.
Ignore the rest
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.jamasoftware.search.util.analyzer; | |
/** | |
* Licensed to the Apache Software Foundation (ASF) under one or more | |
* contributor license agreements. See the NOTICE file distributed with | |
* this work for additional information regarding copyright ownership. | |
* The ASF licenses this file to You under the Apache License, Version 2.0 | |
* (the "License"); you may not use this file except in compliance with | |
* the License. You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, software | |
* distributed under the License is distributed on an "AS IS" BASIS, | |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
* See the License for the specific language governing permissions and | |
* limitations under the License. | |
*/ | |
/* | |
WARNING: if you change JamaTokenizerImpl.jflex and need to regenerate | |
the tokenizer, use JFlex 1.5.0! | |
***JAMA Note***: this file is based on ClassicTokenizerImpl.jflex
*/ | |
import java.io.Reader; | |
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | |
import org.apache.lucene.analysis.standard.StandardTokenizerInterface; | |
%% | |
// Name of the generated scanner class.
%class JamaTokenizerImpl
// Interface the generated scanner class is declared to implement.
%implements StandardTokenizerInterface
//%unicode 3.0
// The scanning method returns an int (the token type constants declared below).
%integer
// Name of the generated scanning method.
%function getNextToken
// Use JFlex's packed DFA table code generation.
%pack
// Turn on character counting; JFlex then maintains the yychar field that
// the yychar() accessor in the user code exposes.
%char
%{ | |
public static final int ALPHANUM = JamaTokenizer.ALPHANUM; | |
public static final int APOSTROPHE = JamaTokenizer.APOSTROPHE; | |
public static final int ACRONYM = JamaTokenizer.ACRONYM; | |
public static final int COMPANY = JamaTokenizer.COMPANY; | |
public static final int EMAIL = JamaTokenizer.EMAIL; | |
public static final int HOST = JamaTokenizer.HOST; | |
public static final int NUM = JamaTokenizer.NUM; | |
public static final int CJ = JamaTokenizer.CJ; | |
//***JAMA Changes*** | |
public static final int COMPOUND = JamaTokenizer.COMPOUND; | |
public static final int ATMENTION = JamaTokenizer.ATMENTION; | |
public static final int EXTRACHARS = JamaTokenizer.EXTRACHARS; | |
/** | |
* @deprecated this solves a bug where HOSTs that end with '.' are identified | |
* as ACRONYMs. | |
*/ | |
@Deprecated | |
public static final int ACRONYM_DEP = JamaTokenizer.ACRONYM_DEP; | |
public static final String [] TOKEN_TYPES = JamaTokenizer.TOKEN_TYPES; | |
public final int yychar() | |
{ | |
return yychar; | |
} | |
/** | |
* Fills CharTermAttribute with the current token text. | |
*/ | |
public final void getText(CharTermAttribute t) { | |
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); | |
} | |
@Override | |
public void setBufferSize(int i) { | |
throw new UnsupportedOperationException(); | |
} | |
%} | |
THAI = [\u0E00-\u0E59]

// basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function)
//***JAMA Changes*** Treat "_" and the EXTRA_NON_MENTION_CHARS as word characters
ALPHANUM = ({LETTER}|{THAI}|{EXTRA_NON_MENTION_CHARS}|[:digit:]|["_"])+

//***JAMA Changes*** Two or more words joined by "_" or "-" form one compound token
COMPOUND = {ALPHANUM} (("_"|"-") {ALPHANUM})+

//***JAMA Changes*** Keep "@" and "#" mentions intact
// NOTE(review): the text after the marker must be a COMPOUND, so a mention
// without an inner "_" or "-" (e.g. "@bob") is not matched by this macro —
// confirm this is intended.
ATMENTION = {ATMENTION_CHARS} {COMPOUND}
ATMENTION_CHARS = ("@"|"#")
// internal apostrophes: O'Reilly, you're, O'Reilly's
// use a post-filter to remove possessives
APOSTROPHE = {ALPHA} ("'" {ALPHA})+

// acronyms: U.S.A., I.B.M., etc.
// use a post-filter to remove dots
ACRONYM = {LETTER} "." ({LETTER} ".")+

// deprecated acronym form: dot-terminated ALPHANUM segments; reported as the
// deprecated ACRONYM_DEP token type declared in the user code
ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+

// company names like AT&T and Excite@Home.
COMPANY = {ALPHA} ("&"|"@") {ALPHA}

// email addresses
EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+

// hostname
HOST = {ALPHANUM} ((".") {ALPHANUM})+
// floating point, serial, model numbers, ip addresses, etc.
// every other segment must have at least one digit
NUM = ({ALPHANUM} {P} {HAS_DIGIT}
    | {HAS_DIGIT} {P} {ALPHANUM}
    | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
    | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
    | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
    | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)

// punctuation allowed between the segments of a NUM
P = ("_"|"-"|"/"|"."|",")

// at least one digit, surrounded by any number of letters or digits
HAS_DIGIT = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*
// one or more letters (no digits)
ALPHA = ({LETTER})+

// From the JFlex manual: "the expression that matches everything of <a> not matched by <b> is !(!<a>|<b>)"
// i.e. LETTER is every [:letter:] character that is not in CJ
LETTER = !(![:letter:]|{CJ})

// Chinese and Japanese (but NOT Korean, which is included in [:letter:])
CJ = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]

//***JAMA Changes*** Care about certain characters
// EXTRACHARS is a single character: either a mention marker ("@", "#") or one
// of the extra punctuation characters listed below
EXTRACHARS = ({EXTRA_NON_MENTION_CHARS}|{ATMENTION_CHARS})
EXTRA_NON_MENTION_CHARS = ("!"|"?"|"^"|"$"|"&"|"%")
%% | |
// JFlex returns the longest match; when several rules match the same length,
// the rule listed first wins. The JAMA rules COMPOUND and ATMENTION are
// therefore listed ahead of the classic token rules.

//***JAMA CHANGES***
{COMPOUND}     { return COMPOUND; }
//***JAMA CHANGES***
{ATMENTION}    { return ATMENTION; }
{APOSTROPHE}   { return APOSTROPHE; }
{ACRONYM}      { return ACRONYM; }
{COMPANY}      { return COMPANY; }
{EMAIL}        { return EMAIL; }
{HOST}         { return HOST; }
{NUM}          { return NUM; }
{CJ}           { return CJ; }
{ACRONYM_DEP}  { return ACRONYM_DEP; }
{ALPHANUM}     { return ALPHANUM; }
//***JAMA CHANGES***
{EXTRACHARS}   { return EXTRACHARS; }

/** Ignore the rest */
[^]            { /* ignore */ }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment