timrae/FuriganaTools.java

## FuriganaTools.java
/****************************************************************************************
 * Copyright (c) 2015, Timothy Rae                                                      *
 * All rights reserved.                                                                 *
 *                                                                                      *
 * Redistribution and use in source and binary forms, with or without                   *
 * modification, are permitted provided that the following conditions are met:          *
 *     * Redistributions of source code must retain the above copyright                 *
 *       notice, this list of conditions and the following disclaimer.                  *
 *     * Redistributions in binary form must reproduce the above copyright              *
 *       notice, this list of conditions and the following disclaimer in the            *
 *       documentation and/or other materials provided with the distribution.           *
 *     * Neither the name of the copyright holders nor the                              *
 *       names of its contributors may be used to endorse or promote products           *
 *       derived from this software without specific prior written permission.          *
 *                                                                                      *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND      *
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED        *
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE               *
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY                *
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES           *
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;         *
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND          *
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        *
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                         *
 ****************************************************************************************/

/**
 * Utility for annotating Japanese words with their readings using the Anki
 * ruby syntax ({@code " kanji[reading]"}).
 */
public class FuriganaTools {
    /** Matches a run of hiragana (U+3041-U+309E), half-width katakana (U+FF66-U+FF9D) or katakana (U+30A1-U+30FE). */
    private static final Pattern KANA_REGEXP = Pattern.compile("[\u3041-\u309e\uff66-\uff9d\u30a1-\u30fe]+");
    /** Anki ruby template; the leading space separates the annotated text from whatever precedes it. */
    private static final String RUBY = " %s[%s]";

    /**
     * Add the reading to a kanji as Ruby furigana in the Anki format,
     * ensuring that there is only furigana above the kanji,
     * not above any of the kana included in the word.
     *
     * Example: ("振り返る", "ふりかえる") -> "振[ふ]り 返[かえ]る"
     *
     * <p>The kana runs found in {@code kanji} are located in {@code reading} and used as
     * anchors; the text between two anchors is the reading of the kanji between them.
     * Whenever the two strings cannot be aligned (an anchor is missing from the reading,
     * the reading is shorter than a leading kana run, a trailing kanji group is left without
     * any reading, or part of the reading is left over), the method falls back to annotating
     * the whole word with the whole reading.
     *
     * <p>A successfully aligned result is trimmed. The whole-word form (used both when the
     * word contains no kana and as the fallback) is returned exactly as produced by the
     * ruby template, i.e. with its leading space.
     *
     * @param kanji the word to which furigana should be applied; expected to be non-null
     * @param reading the kana reading corresponding to the kanji word; expected to be non-null
     * @return the furigana corresponding to the input parameters
     */
    public static String makeFurigana(String kanji, String reading) {
        Matcher kanaMatcher = KANA_REGEXP.matcher(kanji);
        // All characters are kanji; simple replacement will work
        if (!kanaMatcher.find()) {
            return naiveFurigana(kanji, reading);
        }
        StringBuilder output = new StringBuilder();
        // Work on separate variables so that the fallback can always use the untouched inputs
        String word = kanji;
        String wordReading = reading;
        // Strip off any kana from the beginning of the word
        if (kanaMatcher.start() == 0) {
            String prefix = kanaMatcher.group();
            if (reading.length() < prefix.length()) {
                // The reading cannot even cover the leading kana
                return naiveFurigana(kanji, reading);
            }
            word = kanji.substring(prefix.length());
            wordReading = reading.substring(prefix.length());
            output.append(prefix);
            kanaMatcher = KANA_REGEXP.matcher(word);
        } else {
            // Rewind so that the loop below sees the first kana run again
            kanaMatcher.reset();
        }
        // End offsets of the previously handled kana run in the word and in the reading
        int lastKanaEnd = 0;
        int lastReadingKanaEnd = 0;

        while (kanaMatcher.find()) {
            String currentKana = kanaMatcher.group();
            // The kanji in-between the current kana and the previous kana
            String currentKanji = word.substring(lastKanaEnd, kanaMatcher.start());
            lastKanaEnd = kanaMatcher.end();
            // Not perfect: take the first occurrence of the kana that leaves at least
            // one reading character per kanji since the previous kana
            int readingKanaStart = wordReading.indexOf(currentKana, lastReadingKanaEnd + currentKanji.length());
            if (readingKanaStart < 0) {
                // The kana of the word does not appear in the reading; alignment is impossible
                return naiveFurigana(kanji, reading);
            }
            String currentReading = wordReading.substring(lastReadingKanaEnd, readingKanaStart);
            lastReadingKanaEnd = readingKanaStart + currentKana.length();
            // Furigana goes above the kanji only; the kana itself is appended outside of it
            output.append(String.format(RUBY, currentKanji, currentReading)).append(currentKana);
        }

        if (lastKanaEnd < word.length()) {
            // The word ends in kanji: everything left in the reading belongs to it
            String currentKanji = word.substring(lastKanaEnd);
            String currentReading = wordReading.substring(lastReadingKanaEnd);
            if (currentReading.isEmpty()) {
                return naiveFurigana(kanji, reading);
            }
            output.append(String.format(RUBY, currentKanji, currentReading));
        } else if (lastReadingKanaEnd < wordReading.length()) {
            // Sanity check: part of the reading was never assigned to anything
            return naiveFurigana(kanji, reading);
        }
        return output.toString().trim();
    }

    /** Annotates the whole word with the whole reading, without any alignment. */
    private static String naiveFurigana(String kanji, String reading) {
        return String.format(RUBY, kanji, reading);
    }
}
	/****************************************************************************************
	* Copyright (c) 2015, Timothy Rae *
	* All rights reserved. *
	* *
	* Redistribution and use in source and binary forms, with or without *
	* modification, are permitted provided that the following conditions are met: *
	* * Redistributions of source code must retain the above copyright *
	* notice, this list of conditions and the following disclaimer. *
	* * Redistributions in binary form must reproduce the above copyright *
	* notice, this list of conditions and the following disclaimer in the *
	* documentation and/or other materials provided with the distribution. *
	* * Neither the name of the copyright holders nor the *
	* names of its contributors may be used to endorse or promote products *
	* derived from this software without specific prior written permission. *
	* *
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND *
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED *
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE *
	* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY *
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; *
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
	****************************************************************************************/

	/**
	 * Utility for annotating Japanese words with their readings using the Anki
	 * ruby syntax ({@code " kanji[reading]"}).
	 */
	public class FuriganaTools {
	    /** Matches a run of hiragana (U+3041-U+309E), half-width katakana (U+FF66-U+FF9D) or katakana (U+30A1-U+30FE). */
	    private static final Pattern KANA_REGEXP = Pattern.compile("[\u3041-\u309e\uff66-\uff9d\u30a1-\u30fe]+");
	    /** Anki ruby template; the leading space separates the annotated text from whatever precedes it. */
	    private static final String RUBY = " %s[%s]";

	    /**
	     * Add the reading to a kanji as Ruby furigana in the Anki format,
	     * ensuring that there is only furigana above the kanji,
	     * not above any of the kana included in the word.
	     *
	     * Example: ("振り返る", "ふりかえる") -> "振[ふ]り 返[かえ]る"
	     *
	     * <p>The kana runs found in {@code kanji} are located in {@code reading} and used as
	     * anchors; the text between two anchors is the reading of the kanji between them.
	     * Whenever the two strings cannot be aligned (an anchor is missing from the reading,
	     * the reading is shorter than a leading kana run, a trailing kanji group is left without
	     * any reading, or part of the reading is left over), the method falls back to annotating
	     * the whole word with the whole reading.
	     *
	     * <p>A successfully aligned result is trimmed. The whole-word form (used both when the
	     * word contains no kana and as the fallback) is returned exactly as produced by the
	     * ruby template, i.e. with its leading space.
	     *
	     * @param kanji the word to which furigana should be applied; expected to be non-null
	     * @param reading the kana reading corresponding to the kanji word; expected to be non-null
	     * @return the furigana corresponding to the input parameters
	     */
	    public static String makeFurigana(String kanji, String reading) {
	        Matcher kanaMatcher = KANA_REGEXP.matcher(kanji);
	        // All characters are kanji; simple replacement will work
	        if (!kanaMatcher.find()) {
	            return wholeWordFurigana(kanji, reading);
	        }
	        StringBuilder output = new StringBuilder();
	        // Work on separate variables so that the fallback can always use the untouched inputs
	        String word = kanji;
	        String wordReading = reading;
	        // Strip off any kana from the beginning of the word
	        if (kanaMatcher.start() == 0) {
	            String prefix = kanaMatcher.group();
	            if (reading.length() < prefix.length()) {
	                // The reading cannot even cover the leading kana
	                return wholeWordFurigana(kanji, reading);
	            }
	            word = kanji.substring(prefix.length());
	            wordReading = reading.substring(prefix.length());
	            output.append(prefix);
	            kanaMatcher = KANA_REGEXP.matcher(word);
	        } else {
	            // Rewind so that the loop below sees the first kana run again
	            kanaMatcher.reset();
	        }
	        // End offsets of the previously handled kana run in the word and in the reading
	        int lastKanaEnd = 0;
	        int lastReadingKanaEnd = 0;

	        while (kanaMatcher.find()) {
	            String currentKana = kanaMatcher.group();
	            // The kanji in-between the current kana and the previous kana
	            String currentKanji = word.substring(lastKanaEnd, kanaMatcher.start());
	            lastKanaEnd = kanaMatcher.end();
	            // Not perfect: take the first occurrence of the kana that leaves at least
	            // one reading character per kanji since the previous kana
	            int readingKanaStart = wordReading.indexOf(currentKana, lastReadingKanaEnd + currentKanji.length());
	            if (readingKanaStart < 0) {
	                // The kana of the word does not appear in the reading; alignment is impossible
	                return wholeWordFurigana(kanji, reading);
	            }
	            String currentReading = wordReading.substring(lastReadingKanaEnd, readingKanaStart);
	            lastReadingKanaEnd = readingKanaStart + currentKana.length();
	            // Furigana goes above the kanji only; the kana itself is appended outside of it
	            output.append(String.format(RUBY, currentKanji, currentReading)).append(currentKana);
	        }

	        if (lastKanaEnd < word.length()) {
	            // The word ends in kanji: everything left in the reading belongs to it
	            String currentKanji = word.substring(lastKanaEnd);
	            String currentReading = wordReading.substring(lastReadingKanaEnd);
	            if (currentReading.isEmpty()) {
	                return wholeWordFurigana(kanji, reading);
	            }
	            output.append(String.format(RUBY, currentKanji, currentReading));
	        } else if (lastReadingKanaEnd < wordReading.length()) {
	            // Sanity check: part of the reading was never assigned to anything
	            return wholeWordFurigana(kanji, reading);
	        }
	        return output.toString().trim();
	    }

	    /** Annotates the whole word with the whole reading, without any alignment. */
	    private static String wholeWordFurigana(String kanji, String reading) {
	        return String.format(RUBY, kanji, reading);
	    }
	}