Skip to content

Instantly share code, notes, and snippets.

@libliflin
Created April 3, 2016 07:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save libliflin/7bf5272ff5d1f2a2ef27580c84684d37 to your computer and use it in GitHub Desktop.
Save libliflin/7bf5272ff5d1f2a2ef27580c84684d37 to your computer and use it in GitHub Desktop.
Everything on stackoverflow didn't really factor in having to do it for alot of different strings in a file > 2MB. And writing this was faster than learning lucene.
import java.util.*;
import java.util.function.Function;
/**
* // todo: not threadsafe
* // todo: could be faster with regards to TLBs
* // todo: could multi-thread the indexer.
* // todo: index all the things; not just whole words.
* // todo: if you index all the things; a prefix tree would probably help; ESPECIALLY with space.
* // todo: charsequence: what's that?
*/
public class Searcher {
public static Function<Character, Boolean> TABLE_NAME = (c) -> !(Character.isLetter(c) || c == '_' || c == '$');
public static Function<Character, char[]> UPPERCASE = (c) -> new char[]{Character.toUpperCase(c)};
private final char[] rawText;
private final Function<Character, Boolean> isWhiteSpace;
private final Function<Character, char[]> translate;
// translated text -> index in raw text
private Map<String, List<Integer>> index = new HashMap<>();
public Searcher(String text){
this(text, TABLE_NAME, UPPERCASE);
}
public Searcher(String text, Function<Character, Boolean> isWhiteSpace){
this(text, isWhiteSpace, UPPERCASE);
}
public Searcher(String text, Function<Character, Boolean> isWhiteSpace, Function<Character, char[]> translate){
this.rawText = text.toCharArray();
this.isWhiteSpace = isWhiteSpace;
this.translate = translate;
index();
}
// null/"" -> -1
public int count(String test){
if(test == null || test.isEmpty()){
return -1;
}
String xlated = getTranslated(0, test.length(), test.toCharArray());
List<Integer> indecies = index.get(xlated);
if(indecies == null){
return 0;
} else {
return indecies.size();
}
}
// null/"" -> null
public List<String> contexts(String test, int before, int after){
if(test == null || test.isEmpty()){
return null;
}
String xlated = getTranslated(0, test.length(), test.toCharArray());
List<Integer> indecies = index.get(xlated);
List<String> contexts = new ArrayList<>();
if(indecies == null){
return contexts;
}
for(Integer index : indecies){
int start = index - before;
int realStart = Math.max(start, 0);
int end = index + test.length() + after;
int realEnd = Math.min(rawText.length, end);
int offset = realEnd - realStart;
StringBuilder sb = new StringBuilder();
sb.append(rawText, realStart, offset);
contexts.add(sb.toString());
}
return contexts;
}
private void index(){
int max = rawText.length;
// current non-whitespace stuffs;
int beginIndex = 0;
int endIndex = 0;
for (int i = 0; i < max; i++) {
/* eat the whitespace */
while (isWhiteSpace.apply(rawText[i]) && ++i < max);
if(i == max) break;
beginIndex = i;
while (++i < max && !isWhiteSpace.apply(rawText[i]));
endIndex = i;
// todo: add another loop to not mess with the cache by re-going over the stuffs.
// todo: or inline the stuffs.
addToIndex(beginIndex, endIndex);
}
}
private void addToIndex(int beginIndex, int endIndex){
// todo: make so you can parallelize this with CAS.
String translated = getTranslated(beginIndex, endIndex, rawText);
List<Integer> integers = index.get(translated);
if(integers == null){
integers = new ArrayList<>();
}
integers.add(beginIndex);
index.put(translated, integers);
}
private String getTranslated(int beginIndex, int endIndex, char[] xlate){
// todo: buffer me plz
StringBuilder builder = new StringBuilder();
for(int i = beginIndex; i < endIndex; i++){
char[] apply = translate.apply(xlate[i]);
builder.append(apply);
}
return builder.toString();
}
}
@libliflin
Copy link
Author

Tests:

import org.junit.Test;

import static org.junit.Assert.*;

/**
 * Created by blaffin on 4/3/2016.
 */
public class SearcherTest {
    @Test
    public void contexts() throws Exception {
        String testString = "This is a test for(for,for for_for_you you!";
        Searcher searcher = new Searcher(testString);
        assertEquals("This", searcher.contexts("This", 0, 0).get(0));
        assertEquals("This", searcher.contexts("This", 4, 0).get(0));
        assertEquals("This is", searcher.contexts("This", 4, 3).get(0));
        assertEquals("you you!", searcher.contexts("you", 4, 3).get(0));
    }

    @Test
    public void count0() throws Exception {
        String testString = "This is a test for(for,for for_for_you you!";
        Searcher searcher = new Searcher(testString);
        assertEquals(-1, searcher.count(""));
        assertEquals(1, searcher.count("you"));
        assertEquals(3, searcher.count("For"));
        assertEquals(0, searcher.count("yo"));
        assertEquals(0, searcher.count("Fo"));
        assertEquals(0, searcher.count("ou"));
        assertEquals(0, searcher.count("or"));
    }

    @org.junit.Test
    public void count() throws Exception {
        String testString = "This is a test for(for,for for_for_you you!";
        Searcher searcher = new Searcher(testString);
        assertEquals(-1, searcher.count(""));
        assertEquals(1, searcher.count("you"));
        assertEquals(3, searcher.count("for"));
        assertEquals(1, searcher.count("You"));
        assertEquals(3, searcher.count("For"));
        assertEquals(1, searcher.count("YOU"));
        assertEquals(3, searcher.count("FOR"));
        assertEquals(0, searcher.count("asdfasd"));
    }

    @org.junit.Test
    public void count2() throws Exception {
        String testString = "This is a test for(for,for for_for_you you";
        Searcher searcher = new Searcher(testString);
        assertEquals(-1, searcher.count(""));
        assertEquals(1, searcher.count("you"));
        assertEquals(3, searcher.count("for"));
        assertEquals(1, searcher.count("You"));
        assertEquals(3, searcher.count("For"));
        assertEquals(1, searcher.count("YOU"));
        assertEquals(3, searcher.count("FOR"));
        assertEquals(0, searcher.count("asdfasd"));
    }

}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment