Skip to content

Instantly share code, notes, and snippets.

@takawitter
Created May 14, 2013 00:12
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save takawitter/5572607 to your computer and use it in GitHub Desktop.
Save takawitter/5572607 to your computer and use it in GitHub Desktop.
「高速文字列解析の世界」を参考に、LZ77実装してみた。非常に単純な実装で、compress1はマッチする文字列を元配列をなめて求める方式。compress2は文字出現位置をマップで持つ方式。 Wikipedia日本語タイトルをTrieに格納してできたTail配列に対してやってみると、513万文字が435万文字になった。 compress1が31.7秒、compress2が8.9秒。ウィンドウサイズは8192。
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
public class LZ77 {
public static void compress1(CharSequence src, Appendable out, int windowSize)
throws IOException{
int n = src.length();
for(int i = 0; i < n; i++){
char target = src.charAt(i);
// find longest match
boolean found = false;
int start = 0;
int matchLen = 0;
char nonMatchChar = 0xff;
for(int s = Math.max(0, i - windowSize); s < i; s++){
if(target == src.charAt(s)){
int len = getMatchedLen(src, s + 1, i + 1, n) + 1;
if(len > matchLen){
start = i - s;
matchLen = len;
nonMatchChar = (char)0xff;
if((i + matchLen) < n){
nonMatchChar = src.charAt(i + matchLen);
}
}
found = true;
}
}
if(found){
out.append((char)start)
.append((char)matchLen)
.append(nonMatchChar);
i += matchLen;
} else{
out.append((char)0x00).append((char)0x00).append(target);
}
}
}
public static void compress2(CharSequence src, Appendable out, int windowSize)
throws IOException{
Map<Character, List<Integer>> startPoss = new HashMap<Character, List<Integer>>();
int n = src.length();
for(int i = 0; i < n; i++){
char target = src.charAt(i);
// find longest match
boolean found = false;
int start = 0;
int matchLen = 0;
char nonMatchChar = 0xff;
List<Integer> poss = startPoss.get(target);
if(poss != null){
Iterator<Integer> it = poss.iterator();
while(it.hasNext()){
int s = it.next();
if((i - s) > windowSize){
it.remove();
continue;
}
int len = getMatchedLen(src, s + 1, i + 1, n) + 1;
if(len > matchLen){
start = i - s;
matchLen = len;
nonMatchChar = (char)0xff;
if((i + matchLen) < n){
nonMatchChar = src.charAt(i + matchLen);
}
}
found = true;
}
poss.add(i);
int jn = Math.min(i + matchLen + 1, n);
for(int j = i + 1; j < jn; j++){
List<Integer> p = startPoss.get(src.charAt(j));
if(p == null){
p = new LinkedList<Integer>();
startPoss.put(src.charAt(j), p);
}
p.add(j);
}
} else{
poss = new LinkedList<Integer>();
poss.add(i);
startPoss.put(target, poss);
}
if(found){
out.append((char)start)
.append((char)matchLen)
.append(nonMatchChar);
i += matchLen;
} else{
out.append((char)0x00).append((char)0x00).append(target);
}
}
}
private static int getMatchedLen(CharSequence src, int i1, int i2, int end){
int n = Math.min(i2 - i1, end - i2);
for(int i = 0; i < n; i++){
if(src.charAt(i1++) != src.charAt(i2++)) return i;
}
return 0;
}
public static void decompress(CharSequence src, StringBuilder out){
int n = src.length();
for(int i = 0; i < n; i += 3){
int start = src.charAt(i);
int matchedLen = src.charAt(i + 1);
char nonMatchChar = src.charAt(i + 2);
if(start != 0){
int s = out.length() - start;
int e = s + matchedLen;
for(; s < e; s++){
out.append(out.charAt(s));
}
}
if(nonMatchChar != 0xff){
out.append(nonMatchChar);
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment