Skip to content

Instantly share code, notes, and snippets.

@chimerast
Created January 24, 2017 14:09
Show Gist options
  • Save chimerast/226b0cb02aba40fb7392b844a8a663d8 to your computer and use it in GitHub Desktop.
Save chimerast/226b0cb02aba40fb7392b844a8a663d8 to your computer and use it in GitHub Desktop.
sen.patch
Index: src/java/net/java/sen/util/DoubleArrayTrie.java
===================================================================
--- src/java/net/java/sen/util/DoubleArrayTrie.java (revision 89)
+++ src/java/net/java/sen/util/DoubleArrayTrie.java (working copy)
@@ -1,3 +1,18 @@
+
+
+
+
+
+
+
+// even is base array for Double Array Trie
+
+// odd is check array for Double Array Trie
+
+
+
+
+
package net.java.sen.util;
import org.apache.commons.logging.Log;
@@ -9,32 +24,30 @@
-
-
-public class DoubleArrayTrie {
+public class DoubleArrayTrie {
private final static int BUF_SIZE=500000;
private static Log log = LogFactory.getLog(DoubleArrayTrie.class);
-
- private int array[];
- private int used[];
- private int size;
-
- private int alloc_size;
- private char str[][];
- private int str_size;
- private int len[];
+ // base and check arrays for double array trie
+ private int array[];
+ private int used[];
+ private int size;
+ // buffe size for check and base arrays.
+ private int alloc_size;
+ private char str[][];
+ private int str_size;
+ private int len[];
private int val[];
- private int next_check_pos;
- private int no_delete;
+ private int next_check_pos;
+ private int no_delete;
private class Node
{
- int code;
- int depth;
- int left;
- int right;
+ int code;
+ int depth;
+ int left;
+ int right;
};
@@ -45,31 +58,27 @@
alloc_size = 0;
no_delete = 0;
}
+
+
+
+
public void load(String fileName) throws IOException {
log.info("loading double array trie dict = " + fileName);
long start = System.currentTimeMillis();
File file= new File(fileName);
array = new int[(int)(file.length() / 4)];
- DataInputStream is = null;
- try {
- is = new DataInputStream
- (new BufferedInputStream(new FileInputStream(file), BUF_SIZE));
- for (int i = 0 ; i < array.length ; i++) {
- array[i] = is.readInt();
- }
- } finally {
- if(is!=null)
- is.close();
+ DataInputStream is = new DataInputStream
+ (new BufferedInputStream(new FileInputStream(file), BUF_SIZE));
+ for (int i = 0 ; i < array.length ; i++) {
+ array[i] = is.readInt();
}
log.info("loaded time = " + (((double)(System.currentTimeMillis()- start))/1000) + "[ms]");
}
-
-
-
+
int[] _resize (int ptr[], int n, int l, int v)
{
- int tmp[] = new int [l];
+ int tmp[] = new int [l]; // realloc
if (ptr != null) {
l = ptr.length;
} else {
@@ -76,19 +85,17 @@
l = 0;
}
- for (int i = 0; i < l; i++) tmp[i] = ptr[i];
+ for (int i = 0; i < l; i++) tmp[i] = ptr[i]; // copy
for (int i = l; i < l; i++) tmp[i] = v;
ptr = null;
return tmp;
}
-
-
-
+
int resize (int new_size)
{
array = _resize (array, alloc_size << 1, new_size << 1, (int)0);
- used = _resize (used, alloc_size, new_size, (int)0);
+ used = _resize (used, alloc_size, new_size, (int)0);
alloc_size = new_size;
@@ -100,12 +107,12 @@
int prev = 0;
if(log.isTraceEnabled()) {
log.trace("parent.left=" + parent.left);
- log.trace("parent.right=" + parent.right);
- log.trace("parent.depth=" + parent.depth);
+ log.trace("parent.right=" + parent.right);
+ log.trace("parent.depth=" + parent.depth);
}
for (int i = parent.left; i < parent.right; i++) {
- if (((len != null) ?
+ if (((len != null) ?
len[i] : str[i].length) < parent.depth) continue;
char tmp[] = str[i];
@@ -116,7 +123,7 @@
log.trace("tmp["+parent.depth+"]="+tmp[parent.depth]);
cur = (int)tmp[parent.depth] + 1;
}
-
+
if (prev > cur) {
log.error("given strings are not sorted.\n");
throw new RuntimeException("Fatal: given strings are not sorted.\n");
@@ -125,11 +132,11 @@
if (cur != prev || siblings.size() == 0) {
Node tmp_node = new Node();
tmp_node.depth = parent.depth + 1;
- tmp_node.code = cur;
- tmp_node.left = i;
- if (siblings.size() != 0)
+ tmp_node.code = cur;
+ tmp_node.left = i;
+ if (siblings.size() != 0)
((Node)siblings.get(siblings.size()-1)).right = i;
-
+
siblings.add(tmp_node);
}
@@ -144,15 +151,14 @@
int insert (Vector siblings)
{
- int begin = 0;
- int pos = (((((Node)siblings.get(0)).code + 1)>((int)next_check_pos))?(((Node)siblings.get(0)).code + 1):((int)next_check_pos)) - 1;
-
+ int begin = 0;
+ int pos = (((((Node)siblings.get(0)).code + 1)>( (int)next_check_pos))?(((Node)siblings.get(0)).code + 1):( (int)next_check_pos)) - 1;
int nonzero_num = 0;
- int first = 0;
+ int first = 0;
while (true) {
pos++;
- { int t = (int)(pos); if(t>alloc_size) { resize((int)(t*1.05)); }};
+ { int t = (int)(pos); if(t>alloc_size) { resize((int)(t*1.05)); }};
if (array[(((int)pos) << 1) + 1]!=0) {
nonzero_num++;
@@ -164,7 +170,7 @@
begin = pos - ((Node)siblings.get(0)).code;
- { int t = (int)(begin + ((Node)siblings.get(siblings.size()-1)).code); if(t>alloc_size) { resize((int)(t*1.05)); }};
+ { int t = (int)(begin + ((Node)siblings.get(siblings.size()-1)).code); if(t>alloc_size) { resize((int)(t*1.05)); }};
if (used[begin]!=0) continue;
@@ -179,15 +185,13 @@
if(!flag)break;
}
-
-
-
-
+ // -- Simple heuristics --
+ // if the percentage of non-empty contents in check between the index
+ // 'next_check_pos' and 'check' is grater than some constant value (e.g. 0.9),
+ // new 'next_check_pos' index is written by 'check'.
if (1.0 * nonzero_num/(pos - next_check_pos + 1) >= 0.95) next_check_pos = pos;
used[begin] = 1;
- size = (((size)>((int)begin + ((Node)siblings.get(siblings.size()-1)).code + 1))?(size):((int)begin + ((Node)siblings.get(siblings.size()-1)).code + 1));
-
-
+ size = (((size)>( (int)begin + ((Node)siblings.get(siblings.size()-1)).code + 1))?(size):( (int)begin + ((Node)siblings.get(siblings.size()-1)).code + 1));
for (int i = 0; i < siblings.size(); i++) {
array[(((int)begin + ((Node)siblings.get(i)).code) << 1) + 1] = begin;
}
@@ -194,14 +198,14 @@
for (int i = 0; i < siblings.size(); i++) {
Vector new_siblings = new Vector();
-
+
if (fetch(((Node)siblings.get(i)), new_siblings)==0) {
- array[((int)begin + (int)((Node)siblings.get(i)).code) << 1] =
- (val!=null) ?
- (int)(-val[((Node)siblings.get(i)).left]-1)
+ array[((int)begin + (int)((Node)siblings.get(i)).code) << 1] =
+ (val!=null) ?
+ (int)(-val[((Node)siblings.get(i)).left]-1)
: (int)(-((Node)siblings.get(i)).left-1);
-
- if ((val != null) &&
+
+ if ((val != null) &&
((int)(-val[((Node)siblings.get(i)).left]-1) >= 0)) {
log.error("negative value is assgined.");
throw new RuntimeException("Fatal: negative value is assgined.");
@@ -219,15 +223,15 @@
void clear ()
{
- array = null;
- used = null;
+ array = null;
+ used = null;
alloc_size = 0;
- size = 0;
- no_delete = 0;
+ size = 0;
+ no_delete = 0;
}
-
+
int get_unit_size() { return 8; };
-
+
int get_size() { return size; };
int get_nonzero_size ()
@@ -238,24 +242,20 @@
return result;
}
+
+ public int build (char _str[][],
+ int _len[],
+ int _val[])
-
-
- public int build (char _str[][],
- int _len[],
- int _val[])
-
{
return build(_str, _len, _val, _str.length);
}
-
-
-
- public int build (char _str[][],
- int _len[],
- int _val[],
- int size)
+
+ public int build (char _str[][],
+ int _len[],
+ int _val[],
+ int size)
{
if (_str == null) return 0;
if (_str.length != _val.length) {
@@ -263,10 +263,10 @@
return 0;
}
- str = _str;
- len = _len;
- str_size = size;
- val = _val;
+ str = _str;
+ len = _len;
+ str_size = size;
+ val = _val;
resize (1024 * 10);
@@ -274,7 +274,7 @@
next_check_pos = 0;
Node root_node = new Node();
- root_node.left = 0;
+ root_node.left = 0;
root_node.right = str_size;
root_node.depth = 0;
@@ -284,18 +284,18 @@
log.trace("---insert---");
insert (siblings);
- used = null;
+ used = null;
return size;
}
- public int search (char key[],
+ public int search (char key[],
int pos,
int len)
{
if (len==0) len = key.length;
- int b = array[((int)0) << 1];
+ int b = array[((int)0) << 1];
int p;
for ( int i = pos; i < len; i++) {
p = b + (char)(key[i]) + 1;
@@ -317,24 +317,24 @@
{
if (len==0) len = key.length;
- int b = array[((int)0) << 1];
- int num = 0;
- int n;
- int p;
+ int b = array[((int)0) << 1];
+ int num = 0;
+ int n;
+ int p;
for ( int i = pos; i < len; i++) {
+ // ignore new line and line feed.
+ // this may causes error when use latain character.
+ // if(key[i]=='\n'||key[i]=='\r')continue;
-
-
-
- p = b;
+ p = b; // + 0;
n = array[((int)p) << 1];
if ((int) b == array[(((int)p) << 1) + 1] && n < 0) {
if(log.isTraceEnabled())
log.trace("result["+num+"]="+(-n-1));
if (num < result.length) {
- result[num] = -n-1;
+ result[num] = -n-1;
} else {
log.warn("result array size may not enough");
}
@@ -343,15 +343,15 @@
p = b + (char)(key[i]) + 1;
-
-
+ // following lines are temporary code to resolve OutOfArrayException.
+ // TODO:fixme
if ( (p<<1) > array.length) {
log.warn("p range is over.");
log.warn("(p<<1,array.length)=("+(p<<1)+","+array.length+")");
return num;
}
-
-
+ // end of temporary code.
+
if ((int) b == array[(((int)p) << 1) + 1]) {
b = array[((int)p) << 1];
} else {
@@ -377,26 +377,19 @@
public void save(String file) throws IOException {
long start = System.currentTimeMillis();
- DataOutputStream out = null;
-
- try {
- out = new DataOutputStream
- (new BufferedOutputStream(new FileOutputStream(file)));
- int dsize = alloc_size << 1;
- for (int i=0; i < dsize; i++) {
- out.writeInt(array[i]);
- }
- out.close();
- } finally {
- if(out!=null)
- out.close();
+ DataOutputStream out = new DataOutputStream
+ (new BufferedOutputStream(new FileOutputStream(file)));
+ int dsize = alloc_size << 1;
+ for (int i=0; i < dsize; i++) {
+ out.writeInt(array[i]);
}
- log.info("save time = " +
+ out.close();
+ log.info("save time = " +
(((double)(System.currentTimeMillis()- start))/1000) + "[s]");
}
-
+ // for debug
public static void dumpChar(char c[],String message) {
System.err.println("message="+message);
for (int i = 0;i<c.length;i++) {
@@ -406,5 +399,11 @@
}
public static void main(String args[]) {
+ // DoubleArrayTrie da = new DoubleArrayTrie();
+
+
}
}
+
+
+
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment