Skip to content

Instantly share code, notes, and snippets.

@KANATAKA
Created July 26, 2023 08:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save KANATAKA/ad92778084685ca9b8cd670169a66ddc to your computer and use it in GitHub Desktop.
Save KANATAKA/ad92778084685ca9b8cd670169a66ddc to your computer and use it in GitHub Desktop.
Solr-8.11.2でKuromoji + UniDic をビルドするためのパッチ
diff -Naur a/lucene/analysis/kuromoji/build.xml b/lucene/analysis/kuromoji/build.xml
--- a/lucene/analysis/kuromoji/build.xml 2022-06-14 01:26:41.000000000 +0900
+++ b/lucene/analysis/kuromoji/build.xml 2023-07-20 18:41:44.533480829 +0900
@@ -31,24 +31,39 @@
<property name="ivy.default.configuration" value="default"/>
<import file="../analysis-module-build.xml"/>
- <!-- default configuration: uses mecab-ipadic -->
- <property name="ipadic.type" value="ipadic"/>
- <property name="ipadic.version" value="mecab-ipadic-2.7.0-20070801" />
+ <!-- alternative configuration: uses mecab-ipadic
+ <property name="dict.type" value="ipadic"/>
+ <property name="dict.version" value="mecab-ipadic-2.7.0-20070801" />
+ <property name="dict.is.ipadic" value="true"/>
+ -->
<!-- alternative configuration: uses mecab-naist-jdic
- <property name="ipadic.type" value="naist"/>
- <property name="ipadic.version" value="mecab-naist-jdic-0.6.3b-20111013" />
+ <property name="dict.type" value="naist"/>
+ <property name="dict.version" value="mecab-naist-jdic-0.6.3b-20111013" />
+ <property name="dict.is.ipadic" value="true"/>
-->
- <property name="dict.src.file" value="${ipadic.version}.tar.gz" />
- <property name="dict.src.dir" value="${build.dir}/${ipadic.version}" />
+ <!-- default configuration: uses UniDic -->
+ <property name="dict.type" value="unidic"/>
+ <property name="dict.version" value="unidic-mecab-2.1.2_src"/>
+ <property name="dict.src.file" value="${dict.version}.zip" />
+ <property name="dict.is.unidic" value="true"/>
+
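+ <!-- for ipadic / naist-jdic: when enabling this block, also comment out the UniDic default configuration above; Ant properties are immutable (first definition wins), so its dict.src.file and dict.is.unidic would otherwise stay in effect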
+ <property name="dict.src.file" value="${dict.version}.tar.gz" />
<property name="dict.encoding" value="euc-jp"/>
<property name="dict.format" value="ipadic"/>
+ -->
+
+ <!-- for UniDic -->
+ <property name="dict.encoding" value="utf-8"/>
+ <property name="dict.format" value="unidic"/>
+
+ <property name="dict.src.dir" value="${build.dir}/${dict.version}" />
<property name="dict.normalize" value="false"/>
<property name="dict.target.dir" location="${resources.dir}"/>
-
- <available type="dir" file="${build.dir}/${ipadic.version}" property="dict.available"/>
+ <available type="dir" file="${build.dir}/${dict.version}" property="dict.available"/>
<path id="classpath">
<dirset dir="${build.dir}">
@@ -59,14 +74,20 @@
</path>
<target name="compile-core" depends="jar-analyzers-common, common.compile-core" />
- <target name="download-dict" depends="ivy-availability-check,ivy-fail,ivy-configure" unless="dict.available">
- <ivy:retrieve pattern="${build.dir}/${dict.src.file}" conf="${ipadic.type}" symlink="${ivy.symlink}"/>
- <!-- TODO: we should checksum too -->
- <gunzip src="${build.dir}/${dict.src.file}"/>
- <untar src="${build.dir}/${ipadic.version}.tar" dest="${build.dir}"/>
+ <target name="download-dict" depends="ivy-availability-check,ivy-fail,ivy-configure" unless="dict.available" if="dict.is.ipadic">
+ <ivy:retrieve pattern="${build.dir}/${dict.src.file}" conf="${dict.type}" symlink="${ivy.symlink}"/>
+ <!-- TODO: we should checksum too -->
+ <gunzip src="${build.dir}/${dict.src.file}"/>
+ <untar src="${build.dir}/${dict.version}.tar" dest="${build.dir}"/>
+ </target>
+
+ <target name="download-dict-unidic" depends="ivy-availability-check,ivy-fail,ivy-configure" unless="dict.available" if="dict.is.unidic">
+ <ivy:retrieve pattern="${build.dir}/${dict.src.file}" conf="${dict.type}" symlink="${ivy.symlink}"/>
+ <!-- TODO: we should checksum too -->
+ <unzip src="${build.dir}/${dict.src.file}" dest="${build.dir}"/>
</target>
- <target name="patch-dict" depends="download-dict">
+ <target name="patch-dict" depends="download-dict,download-dict-unidic" if="dict.is.ipadic">
<patch patchfile="src/tools/patches/Noun.proper.csv.patch"
originalfile="${dict.src.dir}/Noun.proper.csv"/>
</target>
diff -Naur a/lucene/analysis/kuromoji/ivy.xml b/lucene/analysis/kuromoji/ivy.xml
--- a/lucene/analysis/kuromoji/ivy.xml 2022-06-14 01:26:41.000000000 +0900
+++ b/lucene/analysis/kuromoji/ivy.xml 2023-07-20 18:50:49.848619771 +0900
@@ -19,10 +19,11 @@
<ivy-module version="2.0">
<info organisation="org.apache.lucene" module="analyzers-kuromoji"/>
- <configurations defaultconfmapping="ipadic->default;naist->default"> <!-- 'master' conf not available to map to -->
+ <configurations defaultconfmapping="ipadic->default;naist->default;unidic->default"> <!-- 'master' conf not available to map to -->
<conf name="default" description="explicitly declare this configuration in order to not download dictionaries unless explicitly called for"/>
<conf name="ipadic" description="ipadic dictionary" transitive="false"/>
<conf name="naist" description="naist-jdic dictionary" transitive="false"/>
+ <conf name="unidic" description="unidic dictionary" transitive="false"/>
</configurations>
<dependencies>
@@ -32,6 +33,9 @@
<dependency org="mecab" name="mecab-naist-jdic" rev="${/mecab/mecab-naist-jdic}" conf="naist">
<artifact name="mecab-naist-jdic" type=".tar.gz" url=" https://rwthaachen.dl.osdn.jp/naist-jdic/53500/mecab-naist-jdic-0.6.3b-20111013.tar.gz"/>
</dependency>
+ <dependency org="mecab" name="mecab-unidic" rev="${/mecab/mecab-unidic}" conf="unidic">
+ <artifact name="unidic" type=".zip" url=" http://ja.osdn.net/frs/redir.php?m=iij&amp;f=unidic%2F58338%2Funidic-mecab-2.1.2_src.zip"/>
+ </dependency>
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
</dependencies>
</ivy-module>
diff -Naur a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java 2022-06-14 01:26:41.000000000 +0900
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java 2023-07-20 18:56:52.181437721 +0900
@@ -58,7 +58,7 @@
.build(inputDir)
.write(outputDir);
- new UnknownDictionaryBuilder(encoding)
+ new UnknownDictionaryBuilder(format, encoding)
.build(inputDir)
.write(outputDir);
diff -Naur a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java 2022-06-14 01:26:41.000000000 +0900
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java 2023-07-20 18:59:51.606919901 +0900
@@ -73,8 +73,10 @@
while ((line = reader.readLine()) != null) {
String[] entry = CSVUtil.parse(line);
- if (entry.length < 13) {
+ if (this.format == DictionaryFormat.IPADIC && entry.length < 13) {
throw new IllegalArgumentException("Entry in CSV is not valid (13 field values expected): " + line);
+ } else if (this.format == DictionaryFormat.UNIDIC && entry.length < 21) {
+ throw new IllegalArgumentException("Entry in CSV is not valid (21 field values expected): " + line);
}
lines.add(formatEntry(entry));
@@ -149,9 +151,10 @@
* 3 - word cost
* 4-9 - pos
* 10 - base form reading
- * 11 - base form
+ * 11 - lexeme - not used
* 12 - surface form
* 13 - surface reading
+ * 14 - orth base form (UniDic "orthBase"); stored as the base form in place of feature 11
*/
private String[] formatEntry(String[] features) {
@@ -169,7 +172,7 @@
features2[7] = features[7];
features2[8] = features[8];
features2[9] = features[9];
- features2[10] = features[11];
+ features2[10] = features[14];
// If the surface reading is non-existent, use surface form for reading and pronunciation.
// This happens with punctuation in UniDic and there are possibly other cases as well
diff -Naur a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryBuilder.java
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryBuilder.java 2022-06-14 01:26:41.000000000 +0900
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryBuilder.java 2023-07-20 19:04:50.218408259 +0900
@@ -28,14 +28,17 @@
import java.util.List;
import org.apache.lucene.analysis.ja.dict.CharacterDefinition;
+import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
class UnknownDictionaryBuilder {
private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*";
private final String encoding;
+ private final DictionaryFormat format;
- UnknownDictionaryBuilder(String encoding) {
+ UnknownDictionaryBuilder(DictionaryFormat format, String encoding) {
this.encoding = encoding;
+ this.format = format;
}
public UnknownDictionaryWriter build(Path dir) throws IOException {
@@ -61,7 +64,12 @@
while ((line = lineReader.readLine()) != null) {
// note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation,
// even though the unknown dictionary returns hardcoded null here.
- final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
+ final String[] parsed;
+ if (this.format == DictionaryFormat.UNIDIC) {
+ parsed = CSVUtil.parse(line + ",*,*,*"); // UniDic needs one more column
+ } else {
+ parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
+ }
lines.add(parsed);
}
}
diff -Naur a/lucene/ivy-versions.properties b/lucene/ivy-versions.properties
--- a/lucene/ivy-versions.properties 2022-06-14 01:26:41.000000000 +0900
+++ b/lucene/ivy-versions.properties 2023-07-20 19:06:27.587328559 +0900
@@ -155,6 +155,7 @@
/mecab/mecab-ipadic = 2.7.0-20070801
/mecab/mecab-ko-dic = 2.0.3-20170922
/mecab/mecab-naist-jdic = 0.6.3b-20111013
+/mecab/mecab-unidic = 2.1.2
/net.arnx/jsonic = 1.2.7
/net.bytebuddy/byte-buddy = 1.9.3
/net.hydromatic/eigenbase-properties = 1.1.5
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment