Skip to content

Instantly share code, notes, and snippets.

@tenten0213
Last active August 29, 2015 14:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tenten0213/dda64e3bec39a069f775 to your computer and use it in GitHub Desktop.
Save tenten0213/dda64e3bec39a069f775 to your computer and use it in GitHub Desktop.
GoSenのビルド
<?xml version="1.0"?>
<!--
Usage:
To use a web proxy to download the dictionary data, do the following:
$ ant -Dproxy.host=proxy.hoehoe.jp -Dproxy.port=8080
-->
<project name="ipadic" default="compile" basedir=".">
<!-- Remote location of the ipadic dictionary source archive.
     NOTE(review): this URL already ends in the archive filename, yet the
     (commented-out) download target appends ${ipadic.archive} to it again -
     verify the URL before re-enabling that target. -->
<property name="ipadic.home" value="http://sourceforge.jp/projects/ipadic/downloads/24435/ipadic-2.7.0.tar.gz"/>
<property name="ipadic.version" value="2.7.0"/>
<!-- Local archive filename and unpack directory, both derived from the version -->
<property name="ipadic.archive" value="ipadic-${ipadic.version}.tar.gz"/>
<property name="ipadic.dir" value="ipadic-${ipadic.version}"/>
<!-- Checks the current build status -->
<!-- Sets the property "ipadic.archive.present" if the ipadic archive is already present -->
<!-- Sets the property "dics.unpacked" if the dictionary is already unpacked
     (detected via a single representative file, Noun.dic) -->
<!-- Sets the property "dics.preprocessed" if the dictionary is already preprocessed -->
<!-- Sets the property "dics.complete" if the dictionary is already compiled.
     NOTE(review): the compiled filenames checked here (da.sen, matrix.sen,
     posInfo.sen, token.sen) must match what the dictionary compiler actually
     writes - confirm against the compiler in use. -->
<target name="check-build-status">
<available file="${ipadic.archive}" property="ipadic.archive.present"/>
<condition property="dics.unpacked">
<and>
<available file="ipadic-${ipadic.version}/Noun.dic"/>
</and>
</condition>
<condition property="dics.preprocessed">
<and>
<available file="dic.csv"/>
<available file="connect.csv"/>
</and>
</condition>
<condition property="dics.complete">
<and>
<available file="da.sen"/>
<available file="matrix.sen"/>
<available file="posInfo.sen"/>
<available file="token.sen"/>
</and>
</condition>
</target>
<!-- Download target, currently disabled; the archive must be fetched manually.
<target name="download" depends="prepare-proxy,check-build-status" unless="ipadic.archive.present">
<get src="${ipadic.home}/${ipadic.archive}" dest="${ipadic.archive}" />
</target>
-->
<!-- Unpacks the ipadic dictionary.
     NOTE(review): with the download target disabled this assumes
     ${ipadic.archive} is already present; gunzip fails otherwise. -->
<target name="unpack" depends="check-build-status" unless="dics.unpacked">
<gunzip src="${ipadic.archive}"/>
<untar src="${ipadic.dir}.tar" dest="." />
<!-- The intermediate .tar is only needed between gunzip and untar -->
<delete file="${ipadic.dir}.tar"/>
</target>
<!-- Deletes the ipadic dictionary, intermediate CSVs, compiled files and archive -->
<target name="clean">
<delete>
<fileset dir="." includes="*.sen"/>
<fileset dir="." includes="*.csv"/>
</delete>
<delete dir="ipadic-${ipadic.version}"/>
<delete file="${ipadic.archive}" />
</target>
<!-- Preprocesses the ipadic dictionary for compilation: converts the
     X-EUC-JISX0213 encoded source in ipadic-${ipadic.version} into CSV files
     in the current directory -->
<target name="preprocess" depends="unpack" unless="dics.preprocessed">
<java classname="net.java.sen.tools.DictionaryPreprocessor"
fork="true">
<classpath>
<pathelement location="."/>
<pathelement location="../../bin"/>
<pathelement location="../../jisx0213-1.0.jar"/>
<pathelement path="${java.class.path}"/>
</classpath>
<arg line="X-EUC-JISX0213" />
<arg line="ipadic-${ipadic.version}" />
<arg line="." />
</java>
</target>
<!-- Default task - compiles the ipadic dictionary from the preprocessed CSVs -->
<target name="compile" depends="preprocess" unless="dics.complete">
<java classname="net.java.sen.tools.DictionaryCompiler"
fork="true">
<classpath>
<pathelement location="."/>
<pathelement location="../../bin"/>
<pathelement path="${java.class.path}"/>
</classpath>
</java>
</target>
<!-- Downloads and compiles the ipadic dictionary from scratch -->
<target name="all" depends="clean,unpack,compile"/>
</project>
/*
* Copyright (C) 2001-2007
* Taku Kudoh <taku-ku@is.aist-nara.ac.jp>
* Takashi Okamoto <tora@debian.org>
* Matt Francis <asbel@neosheffield.co.uk>
*
* This library is free software; you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library; if not, write to the Free Software Foundation, Inc.,
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
package net.java.sen.compiler;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.Vector;
/**
* Builds an axis of the Connection Cost matrix from supplied part-of-speech /
* conjugation data
*
* TODO The workings of this class are relatively simple but somewhat magical.
* It could use some explanation from someone who understands what exactly it's
* doing.
*/
class CostMatrixBuilder {
/**
 * Set of the unique rule strings (one column of the Connection Cost CSV
 * file) collected by {@link #add(String)}, in insertion order. Emptied by
 * {@link #build()} once its contents have been indexed into ruleList.
 */
private LinkedHashSet<String> ruleSet = new LinkedHashSet<String>();
/**
 * The input rules (from ruleSet) split into individual comma-separated
 * fields. A rule's position in this list is its integer ID.
 */
private Vector<String[]> ruleList = new Vector<String[]>();
/**
 * For each rule ID, the precomputed IDs of every rule compatible with it
 * (wildcard matching as performed by getIdList with parent == false).
 * Populated by {@link #build()}, queried via {@link #getRuleIdList(String)}.
 */
private Vector<Vector<Integer>> idList = new Vector<Vector<Integer>>();
/**
 * Cache for {@link #getDicId(String)}: maps a rule's part-of-speech prefix
 * (the rule minus its final field) to (rule ID + 1). The +1 offset lets
 * 0 / absent both mean "not cached yet".
 */
private Map<String, Integer> dicIndex = new HashMap<String, Integer>();
/**
 * A map containing a unique integer ID for each rule added
 */
private Map<String, Integer> ruleIndex = new HashMap<String, Integer>();
/**
 * Contains the set of the rules' last fields where the field is not equal
 * to '*' (lexicalised entries). Rules ending in one of these bypass the
 * dicIndex cache in {@link #getDicId(String)}.
 */
private Set<String> lexicalized = new HashSet<String>();
/**
 * Returns the IDs of all rules in ruleList that are compatible with the
 * given split rule. Compatibility is tested field by field:
 *
 *   parent == false: a '*' in csv[j] matches any value of the rule's field
 *   parent == true:  a '*' in the rule's field matches any value of csv[j]
 *   otherwise:       the two fields must be exactly equal
 *
 * @param csv The part-of-speech / conjugation strings, split into fields
 * @param parent Selects which side's '*' acts as the wildcard (see above)
 * @return A vector of the IDs of all compatible rules
 */
private Vector<Integer> getIdList(String csv[], boolean parent) {
// Start with every rule ID as a candidate
Vector<Integer> results = new Vector<Integer>(this.ruleList.size());
results.setSize(this.ruleList.size());
for (int j = 0; j < this.ruleList.size(); j++) {
results.set(j, j);
}
// Narrow the candidates one field at a time, compacting the survivors
// towards the front of the vector in place
for (int j = 0; j < csv.length; j++) {
int k = 0;
for (int n = 0; n < results.size(); n++) {
int i = results.get(n);
String rl_ij = this.ruleList.get(i)[j];
if (
((!parent) && (csv[j].charAt(0) == '*'))
|| ((parent) && (rl_ij.charAt(0) == '*'))
|| rl_ij.equals(csv[j])
)
{
results.set(k++, results.get(n));
}
}
// Keep only the candidates that survived this field
results.setSize(k);
}
return results;
}
/**
 * Finds the ID of the most specific rule compatible with the given split
 * rule, where compatibility treats '*' in a stored rule's field as a
 * wildcard (getIdList with parent == true), and specificity is the number
 * of non-wildcard fields in the candidate rule.
 *
 * @param csv The split rule
 * @return The ID of the most specific compatible rule, or 0 if none match
 */
private int getDicIdNoCache(String csv[]) {
Vector<Integer> results = getIdList(csv, true);
/* if (results.size() == 0) {
throw new IllegalArgumentException();
}*/
// Score each candidate by its number of non-wildcard fields, tracking the
// highest-scoring candidate (ties keep the earliest one)
int priority[] = new int[results.size()];
int max = 0;
for (int i = 0; i < results.size(); i++) {
String csvValues[] = this.ruleList.get(results.get(i));
for (int j = 0; j < csvValues.length; j++) {
if (csvValues[j].charAt(0) != '*') {
priority[i]++;
}
}
if (priority[max] < priority[i]) {
max = i;
}
}
return results.size() > 0 ? results.get(max) : 0;
}
/**
 * Adds a Connection Cost CSV value to the builder
 *
 * @param rule The rule to add
 */
public void add(String rule) {
this.ruleSet.add(rule);
}
/**
 * Builds the matrix axis based on the data passed to {@link #add(String)}:
 * assigns each unique rule an ID, splits it into its comma-separated
 * fields, records lexicalised final fields, and precomputes each rule's
 * compatibility list. It is an error to call {@link #add(String)} after
 * calling {@link #build()}.
 */
public void build() {
int i = 0;
this.ruleList.setSize(this.ruleSet.size());
for (Iterator<String> iterator = this.ruleSet.iterator(); iterator.hasNext();) {
String str = iterator.next();
this.ruleIndex.put(str, i);
String tokenList[] = str.split(",");
this.ruleList.set(i, tokenList);
// A non-'*' final field marks the rule as lexicalised
if (tokenList[tokenList.length - 1].charAt(0) != '*') {
this.lexicalized.add(tokenList[tokenList.length - 1]);
}
i++;
}
// The raw rule strings are no longer needed once indexed
this.ruleSet.clear();
// Precompute, for every rule, the IDs of all rules compatible with it
this.idList.setSize(this.ruleList.size());
for (int j = 0; j < this.ruleList.size(); j++) {
Vector<Integer> results = getIdList(this.ruleList.get(j), false);
this.idList.set(j, results);
}
}
/**
 * Returns the size of the built matrix axis
 *
 * @return The size of the built matrix axis
 */
public int size() {
return this.ruleList.size();
}
/**
 * Returns the ID of the most specific rule compatible with the given rule
 * string. Results for non-lexicalised rules (final field '*') are cached
 * by their part-of-speech prefix; lexicalised rules are always recomputed.
 *
 * @param rule The rule, as a comma-separated string
 * @return The ID of the most specific compatible rule, or 0 if none match
 */
public int getDicId(String rule) {
String csv[] = rule.split(",");
String lex = csv[csv.length - 1];
if (this.lexicalized.contains(lex)) {
return getDicIdNoCache(csv);
}
// Remove end field
String partOfSpeech = rule.substring(0, rule.lastIndexOf(","));
Integer r = this.dicIndex.get(partOfSpeech);
if ((r != null) && (r != 0)) {
// Cached values are stored offset by +1 so that 0 means "absent"
return r - 1;
}
int rg = getDicIdNoCache(csv);
this.dicIndex.put(partOfSpeech, rg + 1);
return rg;
}
/**
 * Returns the precomputed compatibility list for a rule: the IDs of all
 * rules compatible with it (see getIdList with parent == false). The rule
 * must be one previously passed to {@link #add(String)} before
 * {@link #build()} was called; otherwise ruleIndex.get returns null and
 * this method throws a NullPointerException.
 *
 * @param rule The rule
 * @return A vector of the IDs of all compatible rules
 */
public Vector<Integer> getRuleIdList(String rule) {
return this.idList.get(this.ruleIndex.get(rule));
}
}
/*
* Copyright (C) 2002-2007
* Taku Kudoh <taku-ku@is.aist-nara.ac.jp>
* Takashi Okamoto <tora@debian.org>
* Matt Francis <asbel@neosheffield.co.uk>
*
* This library is free software; you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library; if not, write to the Free Software Foundation, Inc.,
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
package net.java.sen.compiler;
import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.MappedByteBuffer;
import java.nio.ShortBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;
import net.java.sen.dictionary.CToken;
import net.java.sen.trie.TrieBuilder;
import net.java.sen.util.CSVData;
import net.java.sen.util.CSVParser;
/**
* Compiles CSV source data into the data files used for analysis
*/
public class DictionaryBuilder {
/**
 * Input dictionary CSV filename
 */
private static final String DICTIONARY_CSV_FILENAME = "dictionary.csv";
/**
 * Input connection CSV filename
 */
private static final String CONNECTION_CSV_FILENAME = "connection.csv";
/**
 * Compiled connection cost data filename
 */
private static final String CONNECTION_COST_DATA_FILENAME = "connectionCost.sen";
/**
 * Compiled part of speech data filename
 */
private static final String PART_OF_SPEECH_DATA_FILENAME = "partOfSpeech.sen";
/**
 * Compiled token data filename
 */
private static final String TOKEN_DATA_FILENAME = "token.sen";
/**
 * Compiled trie data filename
 */
private static final String TRIE_DATA_FILENAME = "trie.sen";
/**
 * Connection cost written for every matrix cell not covered by a rule in
 * the connection CSV
 */
private static final short DEFAULT_CONNECTION_COST = 10000;
/**
 * Start of part-of-speech data within the dictionary CSV (fields 0 and 1
 * are the surface form and the cost)
 */
private static final int PART_OF_SPEECH_START = 2;
/**
 * Size of part-of-speech data within the dictionary CSV
 */
private static final int PART_OF_SPEECH_SIZE = 7;
/**
 * Beginning-of-string token part-of-speech
 */
private static final String BOS_PART_OF_SPEECH = "文頭,*,*,*,*,*,*";
/**
 * End-of-string token part-of-speech
 */
private static final String EOS_PART_OF_SPEECH = "文末,*,*,*,*,*,*";
/**
 * Unknown token part-of-speech
 */
private static final String UNKNOWN_PART_OF_SPEECH = "名詞,サ変接続,*,*,*,*,*";
/**
 * Precursor data for the Trie file
 */
private static class TrieData {
/**
 * Trie keys
 */
public String keys[];
/**
 * Trie values
 */
public int values[];
/**
 * The actual number of entries in the keys/values arrays
 */
public int size;
}
/**
 * Increases the size of an array of <code>short</code>s by 50%, copying
 * the existing contents
 *
 * @param current The existing array
 * @return The resized array
 */
private static short[] resize(short current[]) {
short tmp[] = new short[(int) (current.length * 1.5)];
System.arraycopy(current, 0, tmp, 0, current.length);
return tmp;
}
/**
 * Splits a compound reading or pronunciation field into a list
 *
 * Compound fields are of the form:
 *
 * "{head1/head2[/head3 ...]}tail"
 *
 * The returned list will consist of:
 *
 * "head1tail",
 * "head2tail",
 * "head3tail",
 * ...
 *
 * A field not starting with '{' is returned unchanged as a single-element
 * list. A field of exactly "{}" yields an empty list.
 *
 * @param compoundField The field to split
 * @return The split list
 */
private List<String> splitCompoundField(String compoundField) {
List<String> splitFieldList;
if ((compoundField.length() == 0) || (compoundField.charAt(0) != '{')) {
// No alternatives
splitFieldList = new ArrayList<String>(1);
splitFieldList.add(compoundField);
} else {
// 1 or more alternatives. No existing entry in Ipadic has more than 4
splitFieldList = new ArrayList<String>(4);
// Splitting "{a/b}tail" on braces gives ["", "a/b", "tail"]; a missing
// tail gives only two parts
String[] parts = compoundField.split("[{}]");
String tail = (parts.length == 3) ? parts[2] : "";
String[] heads = parts.length > 0 ? parts[1].split("/") : new String[0] ;
for (int i = 0; i < heads.length; i++) {
splitFieldList.add(heads[i] + tail);
}
}
return splitFieldList;
}
/**
 * Creates the part-of-speech data file, and as side products fills in the
 * (surface form, CToken) tuple list used to build the token file and the
 * three standard CTokens
 *
 * @param dictionaryCSVFilenames The filenames of the dictionary CSV data file and any additional dictionaries
 * @param partOfSpeechDataFilename The filename for the part-of-speech data file
 * @param matrixBuilders The three <code>CostMatrixBuilder</code>s
 * @param partOfSpeechStart The starting index of the part-of-speech data within a CSV line
 * @param partOfSpeechSize The number of part-of-speech values within a CSV line
 * @param charset The charset of the CSV data
 * @param bosPartOfSpeech The beginning-of-string part-of-speech code
 * @param eosPartOfSpeech The end-of-string part-of-speech code
 * @param unknownPartOfSpeech The unknown-morpheme part-of-speech code
 * @param dictionaryList Populated by this method with the String/CToken tuples that will be used to create the Token file
 * @param standardCTokens Populated by this method with the three standard CTokens ("bos", "eos" and "unknown")
 *
 * @throws IOException
 */
private void createPartOfSpeechDataFile(List<String> dictionaryCSVFilenames, String partOfSpeechDataFilename,
CostMatrixBuilder[] matrixBuilders, int partOfSpeechStart, int partOfSpeechSize, String charset,
String bosPartOfSpeech, String eosPartOfSpeech, String unknownPartOfSpeech, VirtualTupleList dictionaryList, CToken[] standardCTokens) throws IOException
{
String[] csvValues = null;
CSVData key_b = new CSVData();
CSVData pos_b = new CSVData();
DataOutputStream outputStream = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(partOfSpeechDataFilename)));
for (String dictionaryCSVFilename : dictionaryCSVFilenames) {
// NOTE(review): the FileInputStream handed to CSVParser is never
// explicitly closed here - confirm CSVParser closes it, or close it in
// a finally block
CSVParser parser = new CSVParser(new FileInputStream(dictionaryCSVFilename), charset);
while ((csvValues = parser.nextTokens()) != null) {
if (csvValues.length < (partOfSpeechSize + partOfSpeechStart)) {
throw new RuntimeException("format error:" + parser.currentLine());
}
// key_b holds only the part-of-speech fields (used to look up matrix
// IDs); pos_b additionally collects the remaining trailing fields
key_b.clear();
pos_b.clear();
for (int i = partOfSpeechStart; i < (partOfSpeechStart + partOfSpeechSize); i++) {
key_b.append(csvValues[i]);
pos_b.append(csvValues[i]);
}
for (int i = partOfSpeechStart + partOfSpeechSize; i < csvValues.length; i++) {
pos_b.append(csvValues[i]);
}
CToken ctoken = new CToken();
ctoken.rcAttr2 = (short) matrixBuilders[0].getDicId(key_b.toString());
ctoken.rcAttr1 = (short) matrixBuilders[1].getDicId(key_b.toString());
ctoken.lcAttr = (short) matrixBuilders[2].getDicId(key_b.toString());
// Offset of this entry within the part-of-speech file in chars:
// size() is in bytes, and everything below is written as 2-byte chars
ctoken.partOfSpeechIndex = outputStream.size() >> 1;
ctoken.length = (short) csvValues[0].length();
try {
ctoken.cost = (short) Integer.parseInt(csvValues[1]);
} catch (NumberFormatException ex) {
// A malformed cost field is treated as cost 0 rather than aborting
ctoken.cost = (short) 0;
}
dictionaryList.add(csvValues[0], ctoken);
// Write to part of speech data file: the first four part-of-speech
// fields are joined with '-', skipping '*' placeholders
StringBuilder partOfSpeechBuilder = new StringBuilder();
for (int i = partOfSpeechStart; i < (partOfSpeechStart + 4); i++) {
if (!csvValues[i].equals("*")) {
partOfSpeechBuilder.append(csvValues[i]);
partOfSpeechBuilder.append("-");
}
}
// Drop the trailing '-' appended above
String partOfSpeech = partOfSpeechBuilder.substring(0, partOfSpeechBuilder.length() - 1);
String conjugationalType = csvValues[partOfSpeechStart + 4];
String conjugationalForm = csvValues[partOfSpeechStart + 5];
String basicForm = csvValues[partOfSpeechStart + 6];
List<String> readings = splitCompoundField(csvValues[partOfSpeechStart + 7]);
List<String> pronunciations = splitCompoundField(csvValues[partOfSpeechStart + 8]);
// Each string is written as a char length followed by its chars
outputStream.writeChar(partOfSpeech.length());
outputStream.writeChars(partOfSpeech);
outputStream.writeChar(conjugationalType.length());
outputStream.writeChars(conjugationalType);
outputStream.writeChar(conjugationalForm.length());
outputStream.writeChars(conjugationalForm);
outputStream.writeChar(basicForm.length());
outputStream.writeChars(basicForm);
outputStream.writeChar(readings.size());
for (String reading : readings) {
outputStream.writeChar(reading.length());
outputStream.writeChars(reading);
}
// NOTE(review): unlike the readings above, no element count is written
// before the pronunciations - presumably the file reader assumes
// pronunciations.size() == readings.size(); verify against the reader
// before changing this format
for (String pronunciation : pronunciations) {
outputStream.writeChar(pronunciation.length());
outputStream.writeChars(pronunciation);
}
}
}
outputStream.close();
// Sort the tuples by surface form so identical keys are adjacent for
// createTokenFile
dictionaryList.sort();
// Build the three standard CTokens; only their matrix attributes (and,
// for the unknown token, partOfSpeechIndex = -1) are meaningful
CToken bosCToken = new CToken();
bosCToken.rcAttr2 = (short) matrixBuilders[0].getDicId(bosPartOfSpeech);
bosCToken.rcAttr1 = (short) matrixBuilders[1].getDicId(bosPartOfSpeech);
bosCToken.lcAttr = (short) matrixBuilders[2].getDicId(bosPartOfSpeech);
standardCTokens[0] = bosCToken;
CToken eosCToken = new CToken();
eosCToken.rcAttr2 = (short) matrixBuilders[0].getDicId(eosPartOfSpeech);
eosCToken.rcAttr1 = (short) matrixBuilders[1].getDicId(eosPartOfSpeech);
eosCToken.lcAttr = (short) matrixBuilders[2].getDicId(eosPartOfSpeech);
standardCTokens[1] = eosCToken;
CToken unknownCToken = new CToken();
unknownCToken.rcAttr2 = (short) matrixBuilders[0].getDicId(unknownPartOfSpeech);
unknownCToken.rcAttr1 = (short) matrixBuilders[1].getDicId(unknownPartOfSpeech);
unknownCToken.lcAttr = (short) matrixBuilders[2].getDicId(unknownPartOfSpeech);
unknownCToken.partOfSpeechIndex = -1;
standardCTokens[2] = unknownCToken;
}
/**
 * Creates the connection cost matrix file: a 3-short header (the three
 * axis sizes) followed by a size1 x size2 x size3 matrix of short costs
 *
 * @param connectionCSVFilename The filename of the connection CSV data
 * @param connectionCostDataFilename The filename for the connection cost matrix
 * @param defaultCost The default connection cost
 * @param charset The charset of the connection CSV data
 * @return An array of three <code>CostMatrixBuilder</code>s
 * @throws IOException
 */
private CostMatrixBuilder[] createConnectionCostFile(String connectionCSVFilename, String connectionCostDataFilename, short defaultCost, String charset) throws IOException {
// One CostMatrixBuilder per rule column of the connection CSV
CostMatrixBuilder[] matrixBuilders = new CostMatrixBuilder[3];
matrixBuilders[0] = new CostMatrixBuilder();
matrixBuilders[1] = new CostMatrixBuilder();
matrixBuilders[2] = new CostMatrixBuilder();
Vector<String> rule1 = new Vector<String>();
Vector<String> rule2 = new Vector<String>();
Vector<String> rule3 = new Vector<String>();
// The approximate length of the file, plus a bit. If we're wrong it'll be expanded during processing
short[] scores = new short[30000];
// Read connection cost CSV data
// NOTE(review): the FileInputStream handed to CSVParser is never
// explicitly closed - confirm CSVParser closes it
CSVParser parser = new CSVParser(new FileInputStream(connectionCSVFilename), charset);
String t[];
int line = 0;
while ((t = parser.nextTokens()) != null) {
if (t.length < 4) {
throw new IOException("Connection cost CSV format error");
}
matrixBuilders[0].add(t[0]);
rule1.add(t[0]);
matrixBuilders[1].add(t[1]);
rule2.add(t[1]);
matrixBuilders[2].add(t[2]);
rule3.add(t[2]);
// Grow the score array on demand (see resize())
if (line == scores.length) {
scores = resize(scores);
}
scores[line++] = (short) Integer.parseInt(t[3]);
}
// Compile CostMatrixBuilders
matrixBuilders[0].build();
matrixBuilders[1].build();
matrixBuilders[2].build();
int size1 = matrixBuilders[0].size();
int size2 = matrixBuilders[1].size();
int size3 = matrixBuilders[2].size();
int ruleSize = rule1.size();
// Write connection cost data: a 3-short header followed by the
// memory-mapped matrix body
MappedByteBuffer buffer = null;
ShortBuffer shortBuffer = null;
int matrixSizeBytes = (size1 * size2 * size3 * 2);
int headerSizeBytes = (3 * 2);
RandomAccessFile file = new RandomAccessFile(connectionCostDataFilename, "rw");
file.setLength(0);
file.writeShort(size1);
file.writeShort(size2);
file.writeShort(size3);
file.setLength(headerSizeBytes + matrixSizeBytes);
FileChannel indexChannel = file.getChannel();
buffer = indexChannel.map(FileChannel.MapMode.READ_WRITE, headerSizeBytes, matrixSizeBytes);
shortBuffer = buffer.asShortBuffer();
// Closing the channel (which also closes the RandomAccessFile) is safe:
// a MappedByteBuffer remains valid after its channel is closed
indexChannel.close();
// Fill the whole matrix with the default cost...
for (int i = 0; i < (size1 * size2 * size3); i++) {
shortBuffer.put(i, defaultCost);
}
// ...then overwrite every cell covered by a rule. Each rule expands to
// the cross product of the three columns' compatibility ID lists; later
// rules overwrite earlier ones
for (int i = 0; i < ruleSize; i++) {
Vector<Integer> r1 = matrixBuilders[0].getRuleIdList(rule1.get(i));
Vector<Integer> r2 = matrixBuilders[1].getRuleIdList(rule2.get(i));
Vector<Integer> r3 = matrixBuilders[2].getRuleIdList(rule3.get(i));
for (Iterator<Integer> i1 = r1.iterator(); i1.hasNext();) {
int ii1 = i1.next();
for (Iterator<Integer> i2 = r2.iterator(); i2.hasNext();) {
int ii2 = i2.next();
for (Iterator<Integer> i3 = r3.iterator(); i3.hasNext();) {
int ii3 = i3.next();
// Row-major position within the 3-dimensional matrix
int position = size3 * (size2 * ii1 + ii2) + ii3;
shortBuffer.put(position, scores[i]);
}
}
}
}
// Flush the mapped region to disk
buffer.force();
return matrixBuilders;
}
/**
 * Create the token data file, and build the Trie precursor data: runs of
 * tuples sharing the same (sorted) key are collapsed into one Trie entry
 * whose value packs the run start index and length as (start << 8) + length
 *
 * @param tokenDataFilename The filename for the token data file
 * @param standardCTokens The beginning-of-string, end-of-string, and unknown-morpheme CTokens
 * @param tupleList The (String,CToken) tuples, sorted by key.
 *                  NOTE(review): an empty list would make the final
 *                  tupleList.get(spos) below fail - assumes at least one
 *                  dictionary entry exists
 *
 * @return The Trie precursor data
 * @throws IOException
 */
private TrieData createTokenFile(String tokenDataFilename, CToken[] standardCTokens, VirtualTupleList tupleList)
throws IOException
{
TrieData trieData = new TrieData();
trieData.values = new int[tupleList.size()];
trieData.keys = new String[tupleList.size()];
trieData.size = 0;
// spos/bsize track the start index and length of the current run of
// identical keys
int spos = 0;
int bsize = 0;
String prev = "";
DataOutputStream out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(tokenDataFilename)));
// Write beginning-of-string, end-of-string, unknown-morpheme tokens
CToken.write(out, standardCTokens[0]);
CToken.write(out, standardCTokens[1]);
CToken.write(out, standardCTokens[2]);
// Write token data
for (int i = 0; i < trieData.keys.length; i++) {
StringCTokenTuple tuple = tupleList.get(i);
String k = tuple.key;
if (!prev.equals(k) && i != 0) {
// Key changed: emit the completed run and start a new one here
trieData.keys[trieData.size] = tupleList.get(spos).key;
trieData.values[trieData.size] = bsize + (spos << 8);
trieData.size++;
bsize = 1;
spos = i;
} else {
bsize++;
}
prev = tuple.key;
CToken.write(out, tuple.value);
}
out.flush();
out.close();
// Emit the final run (the loop above only emits on a key change)
trieData.keys[trieData.size] = tupleList.get(spos).key;
trieData.values[trieData.size] = bsize + (spos << 8);
trieData.size++;
return trieData;
}
/**
 * Create Trie file
 *
 * @param trieDataFilename The filename for the Trie file
 * @param trieData The Trie precursor data
 * @throws IOException
 */
private void createTrieFile(String trieDataFilename, TrieData trieData) throws IOException {
TrieBuilder builder = new TrieBuilder(trieData.keys, trieData.values, trieData.size);
builder.build(trieDataFilename);
}
/**
 * Compiles CSV source data into the data files used for analysis
 *
 * @param customDictionaryCSVFilenames The filenames of custom dictionaries, or <code>null</code>.
 *        NOTE(review): passing <code>null</code> actually throws a
 *        NullPointerException at Arrays.asList below - confirm whether
 *        callers rely on null being accepted
 * @throws IOException
 */
public DictionaryBuilder(String[] customDictionaryCSVFilenames) throws IOException {
List<String> dictionaryCSVFilenames = new ArrayList<String>();
dictionaryCSVFilenames.add(DICTIONARY_CSV_FILENAME);
dictionaryCSVFilenames.addAll(Arrays.asList(customDictionaryCSVFilenames));
String charset = "UTF-8";
// Create connection cost file (connectionCost.sen)
CostMatrixBuilder[] matrixBuilders = createConnectionCostFile(
CONNECTION_CSV_FILENAME,
CONNECTION_COST_DATA_FILENAME,
DEFAULT_CONNECTION_COST,
charset
);
// Create part-of-speech data file (partOfSpeech.sen)
VirtualTupleList dictionaryList = new VirtualTupleList();
CToken[] standardCTokens = new CToken[3];
createPartOfSpeechDataFile(
dictionaryCSVFilenames,
PART_OF_SPEECH_DATA_FILENAME,
matrixBuilders,
PART_OF_SPEECH_START,
PART_OF_SPEECH_SIZE,
charset,
BOS_PART_OF_SPEECH,
EOS_PART_OF_SPEECH,
UNKNOWN_PART_OF_SPEECH,
dictionaryList,
standardCTokens
);
// Free temporary object for GC
matrixBuilders = null;
// Create Token file (token.sen)
TrieData trieData = createTokenFile(
TOKEN_DATA_FILENAME,
standardCTokens,
dictionaryList
);
// Free temporary object for GC
dictionaryList = null;
// Create Trie file (trie.sen)
createTrieFile(TRIE_DATA_FILENAME, trieData);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment