Created
July 26, 2023 08:48
-
-
Save KANATAKA/ad92778084685ca9b8cd670169a66ddc to your computer and use it in GitHub Desktop.
Solr-8.11.2でKuromoji + UniDic をビルドするためのパッチ
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff -Naur a/lucene/analysis/kuromoji/build.xml b/lucene/analysis/kuromoji/build.xml | |
--- a/lucene/analysis/kuromoji/build.xml 2022-06-14 01:26:41.000000000 +0900 | |
+++ b/lucene/analysis/kuromoji/build.xml 2023-07-20 18:41:44.533480829 +0900 | |
@@ -31,24 +31,39 @@ | |
<property name="ivy.default.configuration" value="default"/> | |
<import file="../analysis-module-build.xml"/> | |
- <!-- default configuration: uses mecab-ipadic --> | |
- <property name="ipadic.type" value="ipadic"/> | |
- <property name="ipadic.version" value="mecab-ipadic-2.7.0-20070801" /> | |
+ <!-- alternative configuration: uses mecab-ipadic | |
+ <property name="dict.type" value="ipadic"/> | |
+ <property name="dict.version" value="mecab-ipadic-2.7.0-20070801" /> | |
+ <property name="dict.is.ipadic" value="true"/> | |
+ --> | |
<!-- alternative configuration: uses mecab-naist-jdic | |
- <property name="ipadic.type" value="naist"/> | |
- <property name="ipadic.version" value="mecab-naist-jdic-0.6.3b-20111013" /> | |
+ <property name="dict.type" value="naist"/> | |
+ <property name="dict.version" value="mecab-naist-jdic-0.6.3b-20111013" /> | |
+ <property name="dict.is.ipadic" value="true"/> | |
--> | |
- <property name="dict.src.file" value="${ipadic.version}.tar.gz" /> | |
- <property name="dict.src.dir" value="${build.dir}/${ipadic.version}" /> | |
+ <!-- default configuration: uses UniDic --> | |
+ <property name="dict.type" value="unidic"/> | |
+ <property name="dict.version" value="unidic-mecab-2.1.2_src"/> | |
+ <property name="dict.src.file" value="${dict.version}.zip" /> | |
+ <property name="dict.is.unidic" value="true"/> | |
+ | |
+ <!-- for ipadic | |
+ <property name="dict.src.file" value="${dict.version}.tar.gz" /> | |
<property name="dict.encoding" value="euc-jp"/> | |
<property name="dict.format" value="ipadic"/> | |
+ --> | |
+ | |
+ <!-- for UniDic --> | |
+ <property name="dict.encoding" value="utf-8"/> | |
+ <property name="dict.format" value="unidic"/> | |
+ | |
+ <property name="dict.src.dir" value="${build.dir}/${dict.version}" /> | |
<property name="dict.normalize" value="false"/> | |
<property name="dict.target.dir" location="${resources.dir}"/> | |
- | |
- <available type="dir" file="${build.dir}/${ipadic.version}" property="dict.available"/> | |
+ <available type="dir" file="${build.dir}/${dict.version}" property="dict.available"/> | |
<path id="classpath"> | |
<dirset dir="${build.dir}"> | |
@@ -59,14 +74,20 @@ | |
</path> | |
<target name="compile-core" depends="jar-analyzers-common, common.compile-core" /> | |
- <target name="download-dict" depends="ivy-availability-check,ivy-fail,ivy-configure" unless="dict.available"> | |
- <ivy:retrieve pattern="${build.dir}/${dict.src.file}" conf="${ipadic.type}" symlink="${ivy.symlink}"/> | |
- <!-- TODO: we should checksum too --> | |
- <gunzip src="${build.dir}/${dict.src.file}"/> | |
- <untar src="${build.dir}/${ipadic.version}.tar" dest="${build.dir}"/> | |
+ <target name="download-dict" depends="ivy-availability-check,ivy-fail,ivy-configure" unless="dict.available" if="dict.is.ipadic"> | |
+ <ivy:retrieve pattern="${build.dir}/${dict.src.file}" conf="${dict.type}" symlink="${ivy.symlink}"/> | |
+ <!-- TODO: we should checksum too --> | |
+ <gunzip src="${build.dir}/${dict.src.file}"/> | |
+ <untar src="${build.dir}/${dict.version}.tar" dest="${build.dir}"/> | |
+ </target> | |
+ | |
+ <target name="download-dict-unidic" depends="ivy-availability-check,ivy-fail,ivy-configure" unless="dict.available" if="dict.is.unidic"> | |
+ <ivy:retrieve pattern="${build.dir}/${dict.src.file}" conf="${dict.type}" symlink="${ivy.symlink}"/> | |
+ <!-- TODO: we should checksum too --> | |
+ <unzip src="${build.dir}/${dict.src.file}" dest="${build.dir}"/> | |
</target> | |
- <target name="patch-dict" depends="download-dict"> | |
+ <target name="patch-dict" depends="download-dict,download-dict-unidic" if="dict.is.ipadic"> | |
<patch patchfile="src/tools/patches/Noun.proper.csv.patch" | |
originalfile="${dict.src.dir}/Noun.proper.csv"/> | |
</target> | |
diff -Naur a/lucene/analysis/kuromoji/ivy.xml b/lucene/analysis/kuromoji/ivy.xml | |
--- a/lucene/analysis/kuromoji/ivy.xml 2022-06-14 01:26:41.000000000 +0900 | |
+++ b/lucene/analysis/kuromoji/ivy.xml 2023-07-20 18:50:49.848619771 +0900 | |
@@ -19,10 +19,11 @@ | |
<ivy-module version="2.0"> | |
<info organisation="org.apache.lucene" module="analyzers-kuromoji"/> | |
- <configurations defaultconfmapping="ipadic->default;naist->default"> <!-- 'master' conf not available to map to --> | |
+ <configurations defaultconfmapping="ipadic->default;naist->default;unidic->default"> <!-- 'master' conf not available to map to --> | |
<conf name="default" description="explicitly declare this configuration in order to not download dictionaries unless explicitly called for"/> | |
<conf name="ipadic" description="ipadic dictionary" transitive="false"/> | |
<conf name="naist" description="naist-jdic dictionary" transitive="false"/> | |
+ <conf name="unidic" description="unidic dictionary" transitive="false"/> | |
</configurations> | |
<dependencies> | |
@@ -32,6 +33,9 @@ | |
<dependency org="mecab" name="mecab-naist-jdic" rev="${/mecab/mecab-naist-jdic}" conf="naist"> | |
<artifact name="mecab-naist-jdic" type=".tar.gz" url=" https://rwthaachen.dl.osdn.jp/naist-jdic/53500/mecab-naist-jdic-0.6.3b-20111013.tar.gz"/> | |
</dependency> | |
+ <dependency org="mecab" name="mecab-unidic" rev="${/mecab/mecab-unidic}" conf="unidic"> | |
+ <artifact name="unidic" type=".zip" url=" http://ja.osdn.net/frs/redir.php?m=iij&amp;f=unidic%2F58338%2Funidic-mecab-2.1.2_src.zip"/> | |
+ </dependency> | |
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/> | |
</dependencies> | |
</ivy-module> | |
diff -Naur a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java | |
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java 2022-06-14 01:26:41.000000000 +0900 | |
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java 2023-07-20 18:56:52.181437721 +0900 | |
@@ -58,7 +58,7 @@ | |
.build(inputDir) | |
.write(outputDir); | |
- new UnknownDictionaryBuilder(encoding) | |
+ new UnknownDictionaryBuilder(format, encoding) | |
.build(inputDir) | |
.write(outputDir); | |
diff -Naur a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java | |
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java 2022-06-14 01:26:41.000000000 +0900 | |
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java 2023-07-20 18:59:51.606919901 +0900 | |
@@ -73,8 +73,10 @@ | |
while ((line = reader.readLine()) != null) { | |
String[] entry = CSVUtil.parse(line); | |
- if (entry.length < 13) { | |
+ if (this.format == DictionaryFormat.IPADIC && entry.length < 13) { | |
throw new IllegalArgumentException("Entry in CSV is not valid (13 field values expected): " + line); | |
+ } else if (this.format == DictionaryFormat.UNIDIC && entry.length < 21) { | |
+ throw new IllegalArgumentException("Entry in CSV is not valid (21 field values expected): " + line); | |
} | |
lines.add(formatEntry(entry)); | |
@@ -149,9 +151,10 @@ | |
* 3 - word cost | |
* 4-9 - pos | |
* 10 - base form reading | |
- * 11 - base form | |
+ * 11 - lexeme - not used | |
* 12 - surface form | |
* 13 - surface reading | |
+ * 14 - orth form | |
*/ | |
private String[] formatEntry(String[] features) { | |
@@ -169,7 +172,7 @@ | |
features2[7] = features[7]; | |
features2[8] = features[8]; | |
features2[9] = features[9]; | |
- features2[10] = features[11]; | |
+ features2[10] = features[14]; | |
// If the surface reading is non-existent, use surface form for reading and pronunciation. | |
// This happens with punctuation in UniDic and there are possibly other cases as well | |
diff -Naur a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryBuilder.java | |
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryBuilder.java 2022-06-14 01:26:41.000000000 +0900 | |
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryBuilder.java 2023-07-20 19:04:50.218408259 +0900 | |
@@ -28,14 +28,17 @@ | |
import java.util.List; | |
import org.apache.lucene.analysis.ja.dict.CharacterDefinition; | |
+import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat; | |
class UnknownDictionaryBuilder { | |
private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*"; | |
private final String encoding; | |
+ private final DictionaryFormat format; | |
- UnknownDictionaryBuilder(String encoding) { | |
+ UnknownDictionaryBuilder(DictionaryFormat format, String encoding) { | |
this.encoding = encoding; | |
+ this.format = format; | |
} | |
public UnknownDictionaryWriter build(Path dir) throws IOException { | |
@@ -61,7 +64,12 @@ | |
while ((line = lineReader.readLine()) != null) { | |
// note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation, | |
// even though the unknown dictionary returns hardcoded null here. | |
- final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry | |
+ final String[] parsed; | |
+ if (this.format == DictionaryFormat.UNIDIC) { | |
+ parsed = CSVUtil.parse(line + ",*,*,*"); // UniDic needs one more column | |
+ } else { | |
+ parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry | |
+ } | |
lines.add(parsed); | |
} | |
} | |
diff -Naur a/lucene/ivy-versions.properties b/lucene/ivy-versions.properties | |
--- a/lucene/ivy-versions.properties 2022-06-14 01:26:41.000000000 +0900 | |
+++ b/lucene/ivy-versions.properties 2023-07-20 19:06:27.587328559 +0900 | |
@@ -155,6 +155,7 @@ | |
/mecab/mecab-ipadic = 2.7.0-20070801 | |
/mecab/mecab-ko-dic = 2.0.3-20170922 | |
/mecab/mecab-naist-jdic = 0.6.3b-20111013 | |
+/mecab/mecab-unidic = 2.1.2 | |
/net.arnx/jsonic = 1.2.7 | |
/net.bytebuddy/byte-buddy = 1.9.3 | |
/net.hydromatic/eigenbase-properties = 1.1.5 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment