Skip to content

Instantly share code, notes, and snippets.

@tenten0213
Last active August 29, 2015 14:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tenten0213/dda64e3bec39a069f775 to your computer and use it in GitHub Desktop.
Save tenten0213/dda64e3bec39a069f775 to your computer and use it in GitHub Desktop.
GoSenのビルド
<?xml version="1.0"?>
<!--
Usage:
To use a web proxy to download the dictionary data, do the following:
$ ant -Dproxy.host=proxy.hoehoe.jp -Dproxy.port=8080
-->
<project name="ipadic" default="compile" basedir=".">
<!-- Remote location of the ipadic dictionary source archive.
     NOTE(review): this URL already ends in the archive filename, yet the
     (commented-out) download target appends ${ipadic.archive} to it again -
     verify the URL before re-enabling that target. -->
<property name="ipadic.home" value="http://sourceforge.jp/projects/ipadic/downloads/24435/ipadic-2.7.0.tar.gz"/>
<property name="ipadic.version" value="2.7.0"/>
<!-- Local archive filename and unpack directory, both derived from the version -->
<property name="ipadic.archive" value="ipadic-${ipadic.version}.tar.gz"/>
<property name="ipadic.dir" value="ipadic-${ipadic.version}"/>
<!-- Checks the current build status -->
<!-- Sets the property "ipadic.archive.present" if the ipadic archive is already present -->
<!-- Sets the property "dics.unpacked" if the dictionary is already unpacked
     (detected via a single representative file, Noun.dic) -->
<!-- Sets the property "dics.preprocessed" if the dictionary is already preprocessed -->
<!-- Sets the property "dics.complete" if the dictionary is already compiled.
     NOTE(review): the compiled filenames checked here (da.sen, matrix.sen,
     posInfo.sen, token.sen) must match what the dictionary compiler actually
     writes - confirm against the compiler in use. -->
<target name="check-build-status">
<available file="${ipadic.archive}" property="ipadic.archive.present"/>
<condition property="dics.unpacked">
<and>
<available file="ipadic-${ipadic.version}/Noun.dic"/>
</and>
</condition>
<condition property="dics.preprocessed">
<and>
<available file="dic.csv"/>
<available file="connect.csv"/>
</and>
</condition>
<condition property="dics.complete">
<and>
<available file="da.sen"/>
<available file="matrix.sen"/>
<available file="posInfo.sen"/>
<available file="token.sen"/>
</and>
</condition>
</target>
<!-- Download target, currently disabled; the archive must be fetched manually.
<target name="download" depends="prepare-proxy,check-build-status" unless="ipadic.archive.present">
<get src="${ipadic.home}/${ipadic.archive}" dest="${ipadic.archive}" />
</target>
-->
<!-- Unpacks the ipadic dictionary.
     NOTE(review): with the download target disabled this assumes
     ${ipadic.archive} is already present; gunzip fails otherwise. -->
<target name="unpack" depends="check-build-status" unless="dics.unpacked">
<gunzip src="${ipadic.archive}"/>
<untar src="${ipadic.dir}.tar" dest="." />
<!-- The intermediate .tar is only needed between gunzip and untar -->
<delete file="${ipadic.dir}.tar"/>
</target>
<!-- Deletes the ipadic dictionary, intermediate CSVs, compiled files and archive -->
<target name="clean">
<delete>
<fileset dir="." includes="*.sen"/>
<fileset dir="." includes="*.csv"/>
</delete>
<delete dir="ipadic-${ipadic.version}"/>
<delete file="${ipadic.archive}" />
</target>
<!-- Preprocesses the ipadic dictionary for compilation: converts the
     X-EUC-JISX0213 encoded source in ipadic-${ipadic.version} into CSV files
     in the current directory -->
<target name="preprocess" depends="unpack" unless="dics.preprocessed">
<java classname="net.java.sen.tools.DictionaryPreprocessor"
fork="true">
<classpath>
<pathelement location="."/>
<pathelement location="../../bin"/>
<pathelement location="../../jisx0213-1.0.jar"/>
<pathelement path="${java.class.path}"/>
</classpath>
<arg line="X-EUC-JISX0213" />
<arg line="ipadic-${ipadic.version}" />
<arg line="." />
</java>
</target>
<!-- Default task - compiles the ipadic dictionary from the preprocessed CSVs -->
<target name="compile" depends="preprocess" unless="dics.complete">
<java classname="net.java.sen.tools.DictionaryCompiler"
fork="true">
<classpath>
<pathelement location="."/>
<pathelement location="../../bin"/>
<pathelement path="${java.class.path}"/>
</classpath>
</java>
</target>
<!-- Downloads and compiles the ipadic dictionary from scratch -->
<target name="all" depends="clean,unpack,compile"/>
</project>
/*
* Copyright (C) 2001-2007
* Taku Kudoh <taku-ku@is.aist-nara.ac.jp>
* Takashi Okamoto <tora@debian.org>
* Matt Francis <asbel@neosheffield.co.uk>
*
* This library is free software; you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library; if not, write to the Free Software Foundation, Inc.,
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
package net.java.sen.compiler;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.Vector;
/**
* Builds an axis of the Connection Cost matrix from supplied part-of-speech /
* conjugation data
*
* TODO The workings of this class are relatively simple but somewhat magical.
* It could use some explanation from someone who understands what exactly it's
* doing.
*/
class CostMatrixBuilder {
/**
 * Set of the unique rule strings (one column of the Connection Cost CSV
 * file) collected by {@link #add(String)}, in insertion order. Emptied by
 * {@link #build()} once its contents have been indexed into ruleList.
 */
private LinkedHashSet<String> ruleSet = new LinkedHashSet<String>();
/**
 * The input rules (from ruleSet) split into individual comma-separated
 * fields. A rule's position in this list is its integer ID.
 */
private Vector<String[]> ruleList = new Vector<String[]>();
/**
 * For each rule ID, the precomputed IDs of every rule compatible with it
 * (wildcard matching as performed by getIdList with parent == false).
 * Populated by {@link #build()}, queried via {@link #getRuleIdList(String)}.
 */
private Vector<Vector<Integer>> idList = new Vector<Vector<Integer>>();
/**
 * Cache for {@link #getDicId(String)}: maps a rule's part-of-speech prefix
 * (the rule minus its final field) to (rule ID + 1). The +1 offset lets
 * 0 / absent both mean "not cached yet".
 */
private Map<String, Integer> dicIndex = new HashMap<String, Integer>();
/**
 * A map containing a unique integer ID for each rule added
 */
private Map<String, Integer> ruleIndex = new HashMap<String, Integer>();
/**
 * Contains the set of the rules' last fields where the field is not equal
 * to '*' (lexicalised entries). Rules ending in one of these bypass the
 * dicIndex cache in {@link #getDicId(String)}.
 */
private Set<String> lexicalized = new HashSet<String>();
/**
 * Returns the IDs of all rules in ruleList that are compatible with the
 * given split rule. Compatibility is tested field by field:
 *
 *   parent == false: a '*' in csv[j] matches any value of the rule's field
 *   parent == true:  a '*' in the rule's field matches any value of csv[j]
 *   otherwise:       the two fields must be exactly equal
 *
 * @param csv The part-of-speech / conjugation strings, split into fields
 * @param parent Selects which side's '*' acts as the wildcard (see above)
 * @return A vector of the IDs of all compatible rules
 */
private Vector<Integer> getIdList(String csv[], boolean parent) {
// Start with every rule ID as a candidate
Vector<Integer> results = new Vector<Integer>(this.ruleList.size());
results.setSize(this.ruleList.size());
for (int j = 0; j < this.ruleList.size(); j++) {
results.set(j, j);
}
// Narrow the candidates one field at a time, compacting the survivors
// towards the front of the vector in place
for (int j = 0; j < csv.length; j++) {
int k = 0;
for (int n = 0; n < results.size(); n++) {
int i = results.get(n);
String rl_ij = this.ruleList.get(i)[j];
if (
((!parent) && (csv[j].charAt(0) == '*'))
|| ((parent) && (rl_ij.charAt(0) == '*'))
|| rl_ij.equals(csv[j])
)
{
results.set(k++, results.get(n));
}
}
// Keep only the candidates that survived this field
results.setSize(k);
}
return results;
}
/**
 * Finds the ID of the most specific rule compatible with the given split
 * rule, where compatibility treats '*' in a stored rule's field as a
 * wildcard (getIdList with parent == true), and specificity is the number
 * of non-wildcard fields in the candidate rule.
 *
 * @param csv The split rule
 * @return The ID of the most specific compatible rule, or 0 if none match
 */
private int getDicIdNoCache(String csv[]) {
Vector<Integer> results = getIdList(csv, true);
/* if (results.size() == 0) {
throw new IllegalArgumentException();
}*/
// Score each candidate by its number of non-wildcard fields, tracking the
// highest-scoring candidate (ties keep the earliest one)
int priority[] = new int[results.size()];
int max = 0;
for (int i = 0; i < results.size(); i++) {
String csvValues[] = this.ruleList.get(results.get(i));
for (int j = 0; j < csvValues.length; j++) {
if (csvValues[j].charAt(0) != '*') {
priority[i]++;
}
}
if (priority[max] < priority[i]) {
max = i;
}
}
return results.size() > 0 ? results.get(max) : 0;
}
/**
 * Adds a Connection Cost CSV value to the builder
 *
 * @param rule The rule to add
 */
public void add(String rule) {
this.ruleSet.add(rule);
}
/**
 * Builds the matrix axis based on the data passed to {@link #add(String)}:
 * assigns each unique rule an ID, splits it into its comma-separated
 * fields, records lexicalised final fields, and precomputes each rule's
 * compatibility list. It is an error to call {@link #add(String)} after
 * calling {@link #build()}.
 */
public void build() {
int i = 0;
this.ruleList.setSize(this.ruleSet.size());
for (Iterator<String> iterator = this.ruleSet.iterator(); iterator.hasNext();) {
String str = iterator.next();
this.ruleIndex.put(str, i);
String tokenList[] = str.split(",");
this.ruleList.set(i, tokenList);
// A non-'*' final field marks the rule as lexicalised
if (tokenList[tokenList.length - 1].charAt(0) != '*') {
this.lexicalized.add(tokenList[tokenList.length - 1]);
}
i++;
}
// The raw rule strings are no longer needed once indexed
this.ruleSet.clear();
// Precompute, for every rule, the IDs of all rules compatible with it
this.idList.setSize(this.ruleList.size());
for (int j = 0; j < this.ruleList.size(); j++) {
Vector<Integer> results = getIdList(this.ruleList.get(j), false);
this.idList.set(j, results);
}
}
/**
 * Returns the size of the built matrix axis
 *
 * @return The size of the built matrix axis
 */
public int size() {
return this.ruleList.size();
}
/**
 * Returns the ID of the most specific rule compatible with the given rule
 * string. Results for non-lexicalised rules (final field '*') are cached
 * by their part-of-speech prefix; lexicalised rules are always recomputed.
 *
 * @param rule The rule, as a comma-separated string
 * @return The ID of the most specific compatible rule, or 0 if none match
 */
public int getDicId(String rule) {
String csv[] = rule.split(",");
String lex = csv[csv.length - 1];
if (this.lexicalized.contains(lex)) {
return getDicIdNoCache(csv);
}
// Remove end field
String partOfSpeech = rule.substring(0, rule.lastIndexOf(","));
Integer r = this.dicIndex.get(partOfSpeech);
if ((r != null) && (r != 0)) {
// Cached values are stored offset by +1 so that 0 means "absent"
return r - 1;
}
int rg = getDicIdNoCache(csv);
this.dicIndex.put(partOfSpeech, rg + 1);
return rg;
}
/**
 * Returns the precomputed compatibility list for a rule: the IDs of all
 * rules compatible with it (see getIdList with parent == false). The rule
 * must be one previously passed to {@link #add(String)} before
 * {@link #build()} was called; otherwise ruleIndex.get returns null and
 * this method throws a NullPointerException.
 *
 * @param rule The rule
 * @return A vector of the IDs of all compatible rules
 */
public Vector<Integer> getRuleIdList(String rule) {
return this.idList.get(this.ruleIndex.get(rule));
}
}
/*
* Copyright (C) 2002-2007
* Taku Kudoh <taku-ku@is.aist-nara.ac.jp>
* Takashi Okamoto <tora@debian.org>
* Matt Francis <asbel@neosheffield.co.uk>
*
* This library is free software; you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library; if not, write to the Free Software Foundation, Inc.,
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
package net.java.sen.compiler;
import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.MappedByteBuffer;
import java.nio.ShortBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;
import net.java.sen.dictionary.CToken;
import net.java.sen.trie.TrieBuilder;
import net.java.sen.util.CSVData;
import net.java.sen.util.CSVParser;
/**
* Compiles CSV source data into the data files used for analysis
*/
public class DictionaryBuilder {
/**
 * Input dictionary CSV filename
 */
private static final String DICTIONARY_CSV_FILENAME = "dictionary.csv";
/**
 * Input connection CSV filename
 */
private static final String CONNECTION_CSV_FILENAME = "connection.csv";
/**
 * Compiled connection cost data filename
 */
private static final String CONNECTION_COST_DATA_FILENAME = "connectionCost.sen";
/**
 * Compiled part of speech data filename
 */
private static final String PART_OF_SPEECH_DATA_FILENAME = "partOfSpeech.sen";
/**
 * Compiled token data filename
 */
private static final String TOKEN_DATA_FILENAME = "token.sen";
/**
 * Compiled trie data filename
 */
private static final String TRIE_DATA_FILENAME = "trie.sen";
/**
 * Connection cost written for every matrix cell not covered by a rule in
 * the connection CSV
 */
private static final short DEFAULT_CONNECTION_COST = 10000;
/**
 * Start of part-of-speech data within the dictionary CSV (fields 0 and 1
 * are the surface form and the cost)
 */
private static final int PART_OF_SPEECH_START = 2;
/**
 * Size of part-of-speech data within the dictionary CSV
 */
private static final int PART_OF_SPEECH_SIZE = 7;
/**
 * Beginning-of-string token part-of-speech
 */
private static final String BOS_PART_OF_SPEECH = "文頭,*,*,*,*,*,*";
/**
 * End-of-string token part-of-speech
 */
private static final String EOS_PART_OF_SPEECH = "文末,*,*,*,*,*,*";
/**
 * Unknown token part-of-speech
 */
private static final String UNKNOWN_PART_OF_SPEECH = "名詞,サ変接続,*,*,*,*,*";
/**
 * Precursor data for the Trie file
 */
private static class TrieData {
/**
 * Trie keys
 */
public String keys[];
/**
 * Trie values
 */
public int values[];
/**
 * The actual number of entries in the keys/values arrays
 */
public int size;
}
/**
 * Increases the size of an array of <code>short</code>s by 50%, copying
 * the existing contents
 *
 * @param current The existing array
 * @return The resized array
 */
private static short[] resize(short current[]) {
short tmp[] = new short[(int) (current.length * 1.5)];
System.arraycopy(current, 0, tmp, 0, current.length);
return tmp;
}
/**
 * Splits a compound reading or pronunciation field into a list
 *
 * Compound fields are of the form:
 *
 * "{head1/head2[/head3 ...]}tail"
 *
 * The returned list will consist of:
 *
 * "head1tail",
 * "head2tail",
 * "head3tail",
 * ...
 *
 * A field not starting with '{' is returned unchanged as a single-element
 * list. A field of exactly "{}" yields an empty list.
 *
 * @param compoundField The field to split
 * @return The split list
 */
private List<String> splitCompoundField(String compoundField) {
List<String> splitFieldList;
if ((compoundField.length() == 0) || (compoundField.charAt(0) != '{')) {
// No alternatives
splitFieldList = new ArrayList<String>(1);
splitFieldList.add(compoundField);
} else {
// 1 or more alternatives. No existing entry in Ipadic has more than 4
splitFieldList = new ArrayList<String>(4);
// Splitting "{a/b}tail" on braces gives ["", "a/b", "tail"]; a missing
// tail gives only two parts
String[] parts = compoundField.split("[{}]");
String tail = (parts.length == 3) ? parts[2] : "";
String[] heads = parts.length > 0 ? parts[1].split("/") : new String[0] ;
for (int i = 0; i < heads.length; i++) {
splitFieldList.add(heads[i] + tail);
}
}
return splitFieldList;
}
/**
 * Creates the part-of-speech data file, and as side products fills in the
 * (surface form, CToken) tuple list used to build the token file and the
 * three standard CTokens
 *
 * @param dictionaryCSVFilenames The filenames of the dictionary CSV data file and any additional dictionaries
 * @param partOfSpeechDataFilename The filename for the part-of-speech data file
 * @param matrixBuilders The three <code>CostMatrixBuilder</code>s
 * @param partOfSpeechStart The starting index of the part-of-speech data within a CSV line
 * @param partOfSpeechSize The number of part-of-speech values within a CSV line
 * @param charset The charset of the CSV data
 * @param bosPartOfSpeech The beginning-of-string part-of-speech code
 * @param eosPartOfSpeech The end-of-string part-of-speech code
 * @param unknownPartOfSpeech The unknown-morpheme part-of-speech code
 * @param dictionaryList Populated by this method with the String/CToken tuples that will be used to create the Token file
 * @param standardCTokens Populated by this method with the three standard CTokens ("bos", "eos" and "unknown")
 *
 * @throws IOException
 */
private void createPartOfSpeechDataFile(List<String> dictionaryCSVFilenames, String partOfSpeechDataFilename,
CostMatrixBuilder[] matrixBuilders, int partOfSpeechStart, int partOfSpeechSize, String charset,
String bosPartOfSpeech, String eosPartOfSpeech, String unknownPartOfSpeech, VirtualTupleList dictionaryList, CToken[] standardCTokens) throws IOException
{
String[] csvValues = null;
CSVData key_b = new CSVData();
CSVData pos_b = new CSVData();
DataOutputStream outputStream = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(partOfSpeechDataFilename)));
for (String dictionaryCSVFilename : dictionaryCSVFilenames) {
// NOTE(review): the FileInputStream handed to CSVParser is never
// explicitly closed here - confirm CSVParser closes it, or close it in
// a finally block
CSVParser parser = new CSVParser(new FileInputStream(dictionaryCSVFilename), charset);
while ((csvValues = parser.nextTokens()) != null) {
if (csvValues.length < (partOfSpeechSize + partOfSpeechStart)) {
throw new RuntimeException("format error:" + parser.currentLine());
}
// key_b holds only the part-of-speech fields (used to look up matrix
// IDs); pos_b additionally collects the remaining trailing fields
key_b.clear();
pos_b.clear();
for (int i = partOfSpeechStart; i < (partOfSpeechStart + partOfSpeechSize); i++) {
key_b.append(csvValues[i]);
pos_b.append(csvValues[i]);
}
for (int i = partOfSpeechStart + partOfSpeechSize; i < csvValues.length; i++) {
pos_b.append(csvValues[i]);
}
CToken ctoken = new CToken();
ctoken.rcAttr2 = (short) matrixBuilders[0].getDicId(key_b.toString());
ctoken.rcAttr1 = (short) matrixBuilders[1].getDicId(key_b.toString());
ctoken.lcAttr = (short) matrixBuilders[2].getDicId(key_b.toString());
// Offset of this entry within the part-of-speech file in chars:
// size() is in bytes, and everything below is written as 2-byte chars
ctoken.partOfSpeechIndex = outputStream.size() >> 1;
ctoken.length = (short) csvValues[0].length();
try {
ctoken.cost = (short) Integer.parseInt(csvValues[1]);
} catch (NumberFormatException ex) {
// A malformed cost field is treated as cost 0 rather than aborting
ctoken.cost = (short) 0;
}
dictionaryList.add(csvValues[0], ctoken);
// Write to part of speech data file: the first four part-of-speech
// fields are joined with '-', skipping '*' placeholders
StringBuilder partOfSpeechBuilder = new StringBuilder();
for (int i = partOfSpeechStart; i < (partOfSpeechStart + 4); i++) {
if (!csvValues[i].equals("*")) {
partOfSpeechBuilder.append(csvValues[i]);
partOfSpeechBuilder.append("-");
}
}
// Drop the trailing '-' appended above
String partOfSpeech = partOfSpeechBuilder.substring(0, partOfSpeechBuilder.length() - 1);
String conjugationalType = csvValues[partOfSpeechStart + 4];
String conjugationalForm = csvValues[partOfSpeechStart + 5];
String basicForm = csvValues[partOfSpeechStart + 6];
List<String> readings = splitCompoundField(csvValues[partOfSpeechStart + 7]);
List<String> pronunciations = splitCompoundField(csvValues[partOfSpeechStart + 8]);
// Each string is written as a char length followed by its chars
outputStream.writeChar(partOfSpeech.length());
outputStream.writeChars(partOfSpeech);
outputStream.writeChar(conjugationalType.length());
outputStream.writeChars(conjugationalType);
outputStream.writeChar(conjugationalForm.length());
outputStream.writeChars(conjugationalForm);
outputStream.writeChar(basicForm.length());
outputStream.writeChars(basicForm);
outputStream.writeChar(readings.size());
for (String reading : readings) {
outputStream.writeChar(reading.length());
outputStream.writeChars(reading);
}
// NOTE(review): unlike the readings above, no element count is written
// before the pronunciations - presumably the file reader assumes
// pronunciations.size() == readings.size(); verify against the reader
// before changing this format
for (String pronunciation : pronunciations) {
outputStream.writeChar(pronunciation.length());
outputStream.writeChars(pronunciation);
}
}
}
outputStream.close();
// Sort the tuples by surface form so identical keys are adjacent for
// createTokenFile
dictionaryList.sort();
// Build the three standard CTokens; only their matrix attributes (and,
// for the unknown token, partOfSpeechIndex = -1) are meaningful
CToken bosCToken = new CToken();
bosCToken.rcAttr2 = (short) matrixBuilders[0].getDicId(bosPartOfSpeech);
bosCToken.rcAttr1 = (short) matrixBuilders[1].getDicId(bosPartOfSpeech);
bosCToken.lcAttr = (short) matrixBuilders[2].getDicId(bosPartOfSpeech);
standardCTokens[0] = bosCToken;
CToken eosCToken = new CToken();
eosCToken.rcAttr2 = (short) matrixBuilders[0].getDicId(eosPartOfSpeech);
eosCToken.rcAttr1 = (short) matrixBuilders[1].getDicId(eosPartOfSpeech);
eosCToken.lcAttr = (short) matrixBuilders[2].getDicId(eosPartOfSpeech);
standardCTokens[1] = eosCToken;
CToken unknownCToken = new CToken();
unknownCToken.rcAttr2 = (short) matrixBuilders[0].getDicId(unknownPartOfSpeech);
unknownCToken.rcAttr1 = (short) matrixBuilders[1].getDicId(unknownPartOfSpeech);
unknownCToken.lcAttr = (short) matrixBuilders[2].getDicId(unknownPartOfSpeech);
unknownCToken.partOfSpeechIndex = -1;
standardCTokens[2] = unknownCToken;
}
/**
 * Creates the connection cost matrix file: a 3-short header (the three
 * axis sizes) followed by a size1 x size2 x size3 matrix of short costs
 *
 * @param connectionCSVFilename The filename of the connection CSV data
 * @param connectionCostDataFilename The filename for the connection cost matrix
 * @param defaultCost The default connection cost
 * @param charset The charset of the connection CSV data
 * @return An array of three <code>CostMatrixBuilder</code>s
 * @throws IOException
 */
private CostMatrixBuilder[] createConnectionCostFile(String connectionCSVFilename, String connectionCostDataFilename, short defaultCost, String charset) throws IOException {
// One CostMatrixBuilder per rule column of the connection CSV
CostMatrixBuilder[] matrixBuilders = new CostMatrixBuilder[3];
matrixBuilders[0] = new CostMatrixBuilder();
matrixBuilders[1] = new CostMatrixBuilder();
matrixBuilders[2] = new CostMatrixBuilder();
Vector<String> rule1 = new Vector<String>();
Vector<String> rule2 = new Vector<String>();
Vector<String> rule3 = new Vector<String>();
// The approximate length of the file, plus a bit. If we're wrong it'll be expanded during processing
short[] scores = new short[30000];
// Read connection cost CSV data
// NOTE(review): the FileInputStream handed to CSVParser is never
// explicitly closed - confirm CSVParser closes it
CSVParser parser = new CSVParser(new FileInputStream(connectionCSVFilename), charset);
String t[];
int line = 0;
while ((t = parser.nextTokens()) != null) {
if (t.length < 4) {
throw new IOException("Connection cost CSV format error");
}
matrixBuilders[0].add(t[0]);
rule1.add(t[0]);
matrixBuilders[1].add(t[1]);
rule2.add(t[1]);
matrixBuilders[2].add(t[2]);
rule3.add(t[2]);
// Grow the score array on demand (see resize())
if (line == scores.length) {
scores = resize(scores);
}
scores[line++] = (short) Integer.parseInt(t[3]);
}
// Compile CostMatrixBuilders
matrixBuilders[0].build();
matrixBuilders[1].build();
matrixBuilders[2].build();
int size1 = matrixBuilders[0].size();
int size2 = matrixBuilders[1].size();
int size3 = matrixBuilders[2].size();
int ruleSize = rule1.size();
// Write connection cost data: a 3-short header followed by the
// memory-mapped matrix body
MappedByteBuffer buffer = null;
ShortBuffer shortBuffer = null;
int matrixSizeBytes = (size1 * size2 * size3 * 2);
int headerSizeBytes = (3 * 2);
RandomAccessFile file = new RandomAccessFile(connectionCostDataFilename, "rw");
file.setLength(0);
file.writeShort(size1);
file.writeShort(size2);
file.writeShort(size3);
file.setLength(headerSizeBytes + matrixSizeBytes);
FileChannel indexChannel = file.getChannel();
buffer = indexChannel.map(FileChannel.MapMode.READ_WRITE, headerSizeBytes, matrixSizeBytes);
shortBuffer = buffer.asShortBuffer();
// Closing the channel (which also closes the RandomAccessFile) is safe:
// a MappedByteBuffer remains valid after its channel is closed
indexChannel.close();
// Fill the whole matrix with the default cost...
for (int i = 0; i < (size1 * size2 * size3); i++) {
shortBuffer.put(i, defaultCost);
}
// ...then overwrite every cell covered by a rule. Each rule expands to
// the cross product of the three columns' compatibility ID lists; later
// rules overwrite earlier ones
for (int i = 0; i < ruleSize; i++) {
Vector<Integer> r1 = matrixBuilders[0].getRuleIdList(rule1.get(i));
Vector<Integer> r2 = matrixBuilders[1].getRuleIdList(rule2.get(i));
Vector<Integer> r3 = matrixBuilders[2].getRuleIdList(rule3.get(i));
for (Iterator<Integer> i1 = r1.iterator(); i1.hasNext();) {
int ii1 = i1.next();
for (Iterator<Integer> i2 = r2.iterator(); i2.hasNext();) {
int ii2 = i2.next();
for (Iterator<Integer> i3 = r3.iterator(); i3.hasNext();) {
int ii3 = i3.next();
// Row-major position within the 3-dimensional matrix
int position = size3 * (size2 * ii1 + ii2) + ii3;
shortBuffer.put(position, scores[i]);
}
}
}
}
// Flush the mapped region to disk
buffer.force();
return matrixBuilders;
}
/**
 * Create the token data file, and build the Trie precursor data: runs of
 * tuples sharing the same (sorted) key are collapsed into one Trie entry
 * whose value packs the run start index and length as (start << 8) + length
 *
 * @param tokenDataFilename The filename for the token data file
 * @param standardCTokens The beginning-of-string, end-of-string, and unknown-morpheme CTokens
 * @param tupleList The (String,CToken) tuples, sorted by key.
 *                  NOTE(review): an empty list would make the final
 *                  tupleList.get(spos) below fail - assumes at least one
 *                  dictionary entry exists
 *
 * @return The Trie precursor data
 * @throws IOException
 */
private TrieData createTokenFile(String tokenDataFilename, CToken[] standardCTokens, VirtualTupleList tupleList)
throws IOException
{
TrieData trieData = new TrieData();
trieData.values = new int[tupleList.size()];
trieData.keys = new String[tupleList.size()];
trieData.size = 0;
// spos/bsize track the start index and length of the current run of
// identical keys
int spos = 0;
int bsize = 0;
String prev = "";
DataOutputStream out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(tokenDataFilename)));
// Write beginning-of-string, end-of-string, unknown-morpheme tokens
CToken.write(out, standardCTokens[0]);
CToken.write(out, standardCTokens[1]);
CToken.write(out, standardCTokens[2]);
// Write token data
for (int i = 0; i < trieData.keys.length; i++) {
StringCTokenTuple tuple = tupleList.get(i);
String k = tuple.key;
if (!prev.equals(k) && i != 0) {
// Key changed: emit the completed run and start a new one here
trieData.keys[trieData.size] = tupleList.get(spos).key;
trieData.values[trieData.size] = bsize + (spos << 8);
trieData.size++;
bsize = 1;
spos = i;
} else {
bsize++;
}
prev = tuple.key;
CToken.write(out, tuple.value);
}
out.flush();
out.close();
// Emit the final run (the loop above only emits on a key change)
trieData.keys[trieData.size] = tupleList.get(spos).key;
trieData.values[trieData.size] = bsize + (spos << 8);
trieData.size++;
return trieData;
}
/**
 * Create Trie file
 *
 * @param trieDataFilename The filename for the Trie file
 * @param trieData The Trie precursor data
 * @throws IOException
 */
private void createTrieFile(String trieDataFilename, TrieData trieData) throws IOException {
TrieBuilder builder = new TrieBuilder(trieData.keys, trieData.values, trieData.size);
builder.build(trieDataFilename);
}
/**
 * Compiles CSV source data into the data files used for analysis
 *
 * @param customDictionaryCSVFilenames The filenames of custom dictionaries, or <code>null</code>.
 *        NOTE(review): passing <code>null</code> actually throws a
 *        NullPointerException at Arrays.asList below - confirm whether
 *        callers rely on null being accepted
 * @throws IOException
 */
public DictionaryBuilder(String[] customDictionaryCSVFilenames) throws IOException {
List<String> dictionaryCSVFilenames = new ArrayList<String>();
dictionaryCSVFilenames.add(DICTIONARY_CSV_FILENAME);
dictionaryCSVFilenames.addAll(Arrays.asList(customDictionaryCSVFilenames));
String charset = "UTF-8";
// Create connection cost file (connectionCost.sen)
CostMatrixBuilder[] matrixBuilders = createConnectionCostFile(
CONNECTION_CSV_FILENAME,
CONNECTION_COST_DATA_FILENAME,
DEFAULT_CONNECTION_COST,
charset
);
// Create part-of-speech data file (partOfSpeech.sen)
VirtualTupleList dictionaryList = new VirtualTupleList();
CToken[] standardCTokens = new CToken[3];
createPartOfSpeechDataFile(
dictionaryCSVFilenames,
PART_OF_SPEECH_DATA_FILENAME,
matrixBuilders,
PART_OF_SPEECH_START,
PART_OF_SPEECH_SIZE,
charset,
BOS_PART_OF_SPEECH,
EOS_PART_OF_SPEECH,
UNKNOWN_PART_OF_SPEECH,
dictionaryList,
standardCTokens
);
// Free temporary object for GC
matrixBuilders = null;
// Create Token file (token.sen)
TrieData trieData = createTokenFile(
TOKEN_DATA_FILENAME,
standardCTokens,
dictionaryList
);
// Free temporary object for GC
dictionaryList = null;
// Create Trie file (trie.sen)
createTrieFile(TRIE_DATA_FILENAME, trieData);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment