Created
January 24, 2017 14:09
-
-
Save chimerast/226b0cb02aba40fb7392b844a8a663d8 to your computer and use it in GitHub Desktop.
sen.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Index: src/java/net/java/sen/util/DoubleArrayTrie.java | |
=================================================================== | |
--- src/java/net/java/sen/util/DoubleArrayTrie.java (revision 89) | |
+++ src/java/net/java/sen/util/DoubleArrayTrie.java (working copy) | |
@@ -1,3 +1,18 @@ | |
+ | |
+ | |
+ | |
+ | |
+ | |
+ | |
+ | |
+// even is base array for Double Array Trie | |
+ | |
+// odd is check array for Double Array Trie | |
+ | |
+ | |
+ | |
+ | |
+ | |
package net.java.sen.util; | |
import org.apache.commons.logging.Log; | |
@@ -9,32 +24,30 @@ | |
- | |
- | |
-public class DoubleArrayTrie { | |
+public class DoubleArrayTrie { | |
private final static int BUF_SIZE=500000; | |
private static Log log = LogFactory.getLog(DoubleArrayTrie.class); | |
- | |
- private int array[]; | |
- private int used[]; | |
- private int size; | |
- | |
- private int alloc_size; | |
- private char str[][]; | |
- private int str_size; | |
- private int len[]; | |
+ // base and check arrays for double array trie | |
+ private int array[]; | |
+ private int used[]; | |
+ private int size; | |
+ // buffe size for check and base arrays. | |
+ private int alloc_size; | |
+ private char str[][]; | |
+ private int str_size; | |
+ private int len[]; | |
private int val[]; | |
- private int next_check_pos; | |
- private int no_delete; | |
+ private int next_check_pos; | |
+ private int no_delete; | |
private class Node | |
{ | |
- int code; | |
- int depth; | |
- int left; | |
- int right; | |
+ int code; | |
+ int depth; | |
+ int left; | |
+ int right; | |
}; | |
@@ -45,31 +58,27 @@ | |
alloc_size = 0; | |
no_delete = 0; | |
} | |
+ | |
+ | |
+ | |
+ | |
public void load(String fileName) throws IOException { | |
log.info("loading double array trie dict = " + fileName); | |
long start = System.currentTimeMillis(); | |
File file= new File(fileName); | |
array = new int[(int)(file.length() / 4)]; | |
- DataInputStream is = null; | |
- try { | |
- is = new DataInputStream | |
- (new BufferedInputStream(new FileInputStream(file), BUF_SIZE)); | |
- for (int i = 0 ; i < array.length ; i++) { | |
- array[i] = is.readInt(); | |
- } | |
- } finally { | |
- if(is!=null) | |
- is.close(); | |
+ DataInputStream is = new DataInputStream | |
+ (new BufferedInputStream(new FileInputStream(file), BUF_SIZE)); | |
+ for (int i = 0 ; i < array.length ; i++) { | |
+ array[i] = is.readInt(); | |
} | |
log.info("loaded time = " + (((double)(System.currentTimeMillis()- start))/1000) + "[ms]"); | |
} | |
- | |
- | |
- | |
+ | |
int[] _resize (int ptr[], int n, int l, int v) | |
{ | |
- int tmp[] = new int [l]; | |
+ int tmp[] = new int [l]; // realloc | |
if (ptr != null) { | |
l = ptr.length; | |
} else { | |
@@ -76,19 +85,17 @@ | |
l = 0; | |
} | |
- for (int i = 0; i < l; i++) tmp[i] = ptr[i]; | |
+ for (int i = 0; i < l; i++) tmp[i] = ptr[i]; // copy | |
for (int i = l; i < l; i++) tmp[i] = v; | |
ptr = null; | |
return tmp; | |
} | |
- | |
- | |
- | |
+ | |
int resize (int new_size) | |
{ | |
array = _resize (array, alloc_size << 1, new_size << 1, (int)0); | |
- used = _resize (used, alloc_size, new_size, (int)0); | |
+ used = _resize (used, alloc_size, new_size, (int)0); | |
alloc_size = new_size; | |
@@ -100,12 +107,12 @@ | |
int prev = 0; | |
if(log.isTraceEnabled()) { | |
log.trace("parent.left=" + parent.left); | |
- log.trace("parent.right=" + parent.right); | |
- log.trace("parent.depth=" + parent.depth); | |
+ log.trace("parent.right=" + parent.right); | |
+ log.trace("parent.depth=" + parent.depth); | |
} | |
for (int i = parent.left; i < parent.right; i++) { | |
- if (((len != null) ? | |
+ if (((len != null) ? | |
len[i] : str[i].length) < parent.depth) continue; | |
char tmp[] = str[i]; | |
@@ -116,7 +123,7 @@ | |
log.trace("tmp["+parent.depth+"]="+tmp[parent.depth]); | |
cur = (int)tmp[parent.depth] + 1; | |
} | |
- | |
+ | |
if (prev > cur) { | |
log.error("given strings are not sorted.\n"); | |
throw new RuntimeException("Fatal: given strings are not sorted.\n"); | |
@@ -125,11 +132,11 @@ | |
if (cur != prev || siblings.size() == 0) { | |
Node tmp_node = new Node(); | |
tmp_node.depth = parent.depth + 1; | |
- tmp_node.code = cur; | |
- tmp_node.left = i; | |
- if (siblings.size() != 0) | |
+ tmp_node.code = cur; | |
+ tmp_node.left = i; | |
+ if (siblings.size() != 0) | |
((Node)siblings.get(siblings.size()-1)).right = i; | |
- | |
+ | |
siblings.add(tmp_node); | |
} | |
@@ -144,15 +151,14 @@ | |
int insert (Vector siblings) | |
{ | |
- int begin = 0; | |
- int pos = (((((Node)siblings.get(0)).code + 1)>((int)next_check_pos))?(((Node)siblings.get(0)).code + 1):((int)next_check_pos)) - 1; | |
- | |
+ int begin = 0; | |
+ int pos = (((((Node)siblings.get(0)).code + 1)>( (int)next_check_pos))?(((Node)siblings.get(0)).code + 1):( (int)next_check_pos)) - 1; | |
int nonzero_num = 0; | |
- int first = 0; | |
+ int first = 0; | |
while (true) { | |
pos++; | |
- { int t = (int)(pos); if(t>alloc_size) { resize((int)(t*1.05)); }}; | |
+ { int t = (int)(pos); if(t>alloc_size) { resize((int)(t*1.05)); }}; | |
if (array[(((int)pos) << 1) + 1]!=0) { | |
nonzero_num++; | |
@@ -164,7 +170,7 @@ | |
begin = pos - ((Node)siblings.get(0)).code; | |
- { int t = (int)(begin + ((Node)siblings.get(siblings.size()-1)).code); if(t>alloc_size) { resize((int)(t*1.05)); }}; | |
+ { int t = (int)(begin + ((Node)siblings.get(siblings.size()-1)).code); if(t>alloc_size) { resize((int)(t*1.05)); }}; | |
if (used[begin]!=0) continue; | |
@@ -179,15 +185,13 @@ | |
if(!flag)break; | |
} | |
- | |
- | |
- | |
- | |
+ // -- Simple heuristics -- | |
+ // if the percentage of non-empty contents in check between the index | |
+ // 'next_check_pos' and 'check' is grater than some constant value (e.g. 0.9), | |
+ // new 'next_check_pos' index is written by 'check'. | |
if (1.0 * nonzero_num/(pos - next_check_pos + 1) >= 0.95) next_check_pos = pos; | |
used[begin] = 1; | |
- size = (((size)>((int)begin + ((Node)siblings.get(siblings.size()-1)).code + 1))?(size):((int)begin + ((Node)siblings.get(siblings.size()-1)).code + 1)); | |
- | |
- | |
+ size = (((size)>( (int)begin + ((Node)siblings.get(siblings.size()-1)).code + 1))?(size):( (int)begin + ((Node)siblings.get(siblings.size()-1)).code + 1)); | |
for (int i = 0; i < siblings.size(); i++) { | |
array[(((int)begin + ((Node)siblings.get(i)).code) << 1) + 1] = begin; | |
} | |
@@ -194,14 +198,14 @@ | |
for (int i = 0; i < siblings.size(); i++) { | |
Vector new_siblings = new Vector(); | |
- | |
+ | |
if (fetch(((Node)siblings.get(i)), new_siblings)==0) { | |
- array[((int)begin + (int)((Node)siblings.get(i)).code) << 1] = | |
- (val!=null) ? | |
- (int)(-val[((Node)siblings.get(i)).left]-1) | |
+ array[((int)begin + (int)((Node)siblings.get(i)).code) << 1] = | |
+ (val!=null) ? | |
+ (int)(-val[((Node)siblings.get(i)).left]-1) | |
: (int)(-((Node)siblings.get(i)).left-1); | |
- | |
- if ((val != null) && | |
+ | |
+ if ((val != null) && | |
((int)(-val[((Node)siblings.get(i)).left]-1) >= 0)) { | |
log.error("negative value is assgined."); | |
throw new RuntimeException("Fatal: negative value is assgined."); | |
@@ -219,15 +223,15 @@ | |
void clear () | |
{ | |
- array = null; | |
- used = null; | |
+ array = null; | |
+ used = null; | |
alloc_size = 0; | |
- size = 0; | |
- no_delete = 0; | |
+ size = 0; | |
+ no_delete = 0; | |
} | |
- | |
+ | |
int get_unit_size() { return 8; }; | |
- | |
+ | |
int get_size() { return size; }; | |
int get_nonzero_size () | |
@@ -238,24 +242,20 @@ | |
return result; | |
} | |
+ | |
+ public int build (char _str[][], | |
+ int _len[], | |
+ int _val[]) | |
- | |
- | |
- public int build (char _str[][], | |
- int _len[], | |
- int _val[]) | |
- | |
{ | |
return build(_str, _len, _val, _str.length); | |
} | |
- | |
- | |
- | |
- public int build (char _str[][], | |
- int _len[], | |
- int _val[], | |
- int size) | |
+ | |
+ public int build (char _str[][], | |
+ int _len[], | |
+ int _val[], | |
+ int size) | |
{ | |
if (_str == null) return 0; | |
if (_str.length != _val.length) { | |
@@ -263,10 +263,10 @@ | |
return 0; | |
} | |
- str = _str; | |
- len = _len; | |
- str_size = size; | |
- val = _val; | |
+ str = _str; | |
+ len = _len; | |
+ str_size = size; | |
+ val = _val; | |
resize (1024 * 10); | |
@@ -274,7 +274,7 @@ | |
next_check_pos = 0; | |
Node root_node = new Node(); | |
- root_node.left = 0; | |
+ root_node.left = 0; | |
root_node.right = str_size; | |
root_node.depth = 0; | |
@@ -284,18 +284,18 @@ | |
log.trace("---insert---"); | |
insert (siblings); | |
- used = null; | |
+ used = null; | |
return size; | |
} | |
- public int search (char key[], | |
+ public int search (char key[], | |
int pos, | |
int len) | |
{ | |
if (len==0) len = key.length; | |
- int b = array[((int)0) << 1]; | |
+ int b = array[((int)0) << 1]; | |
int p; | |
for ( int i = pos; i < len; i++) { | |
p = b + (char)(key[i]) + 1; | |
@@ -317,24 +317,24 @@ | |
{ | |
if (len==0) len = key.length; | |
- int b = array[((int)0) << 1]; | |
- int num = 0; | |
- int n; | |
- int p; | |
+ int b = array[((int)0) << 1]; | |
+ int num = 0; | |
+ int n; | |
+ int p; | |
for ( int i = pos; i < len; i++) { | |
+ // ignore new line and line feed. | |
+ // this may causes error when use latain character. | |
+ // if(key[i]=='\n'||key[i]=='\r')continue; | |
- | |
- | |
- | |
- p = b; | |
+ p = b; // + 0; | |
n = array[((int)p) << 1]; | |
if ((int) b == array[(((int)p) << 1) + 1] && n < 0) { | |
if(log.isTraceEnabled()) | |
log.trace("result["+num+"]="+(-n-1)); | |
if (num < result.length) { | |
- result[num] = -n-1; | |
+ result[num] = -n-1; | |
} else { | |
log.warn("result array size may not enough"); | |
} | |
@@ -343,15 +343,15 @@ | |
p = b + (char)(key[i]) + 1; | |
- | |
- | |
+ // following lines are temporary code to resolve OutOfArrayException. | |
+ // TODO:fixme | |
if ( (p<<1) > array.length) { | |
log.warn("p range is over."); | |
log.warn("(p<<1,array.length)=("+(p<<1)+","+array.length+")"); | |
return num; | |
} | |
- | |
- | |
+ // end of temporary code. | |
+ | |
if ((int) b == array[(((int)p) << 1) + 1]) { | |
b = array[((int)p) << 1]; | |
} else { | |
@@ -377,26 +377,19 @@ | |
public void save(String file) throws IOException { | |
long start = System.currentTimeMillis(); | |
- DataOutputStream out = null; | |
- | |
- try { | |
- out = new DataOutputStream | |
- (new BufferedOutputStream(new FileOutputStream(file))); | |
- int dsize = alloc_size << 1; | |
- for (int i=0; i < dsize; i++) { | |
- out.writeInt(array[i]); | |
- } | |
- out.close(); | |
- } finally { | |
- if(out!=null) | |
- out.close(); | |
+ DataOutputStream out = new DataOutputStream | |
+ (new BufferedOutputStream(new FileOutputStream(file))); | |
+ int dsize = alloc_size << 1; | |
+ for (int i=0; i < dsize; i++) { | |
+ out.writeInt(array[i]); | |
} | |
- log.info("save time = " + | |
+ out.close(); | |
+ log.info("save time = " + | |
(((double)(System.currentTimeMillis()- start))/1000) + "[s]"); | |
} | |
- | |
+ // for debug | |
public static void dumpChar(char c[],String message) { | |
System.err.println("message="+message); | |
for (int i = 0;i<c.length;i++) { | |
@@ -406,5 +399,11 @@ | |
} | |
public static void main(String args[]) { | |
+ // DoubleArrayTrie da = new DoubleArrayTrie(); | |
+ | |
+ | |
} | |
} | |
+ | |
+ | |
+ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment