Created
April 3, 2016 07:48
-
-
Save libliflin/7bf5272ff5d1f2a2ef27580c84684d37 to your computer and use it in GitHub Desktop.
Everything on stackoverflow didn't really factor in having to do it for alot of different strings in a file > 2MB. And writing this was faster than learning lucene.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.*; | |
import java.util.function.Function; | |
/** | |
* // todo: not threadsafe | |
* // todo: could be faster with regards to TLBs | |
* // todo: could multi-thread the indexer. | |
* // todo: index all the things; not just whole words. | |
* // todo: if you index all the things; a prefix tree would probably help; ESPECIALLY with space. | |
* // todo: charsequence: what's that? | |
*/ | |
public class Searcher { | |
public static Function<Character, Boolean> TABLE_NAME = (c) -> !(Character.isLetter(c) || c == '_' || c == '$'); | |
public static Function<Character, char[]> UPPERCASE = (c) -> new char[]{Character.toUpperCase(c)}; | |
private final char[] rawText; | |
private final Function<Character, Boolean> isWhiteSpace; | |
private final Function<Character, char[]> translate; | |
// translated text -> index in raw text | |
private Map<String, List<Integer>> index = new HashMap<>(); | |
public Searcher(String text){ | |
this(text, TABLE_NAME, UPPERCASE); | |
} | |
public Searcher(String text, Function<Character, Boolean> isWhiteSpace){ | |
this(text, isWhiteSpace, UPPERCASE); | |
} | |
public Searcher(String text, Function<Character, Boolean> isWhiteSpace, Function<Character, char[]> translate){ | |
this.rawText = text.toCharArray(); | |
this.isWhiteSpace = isWhiteSpace; | |
this.translate = translate; | |
index(); | |
} | |
// null/"" -> -1 | |
public int count(String test){ | |
if(test == null || test.isEmpty()){ | |
return -1; | |
} | |
String xlated = getTranslated(0, test.length(), test.toCharArray()); | |
List<Integer> indecies = index.get(xlated); | |
if(indecies == null){ | |
return 0; | |
} else { | |
return indecies.size(); | |
} | |
} | |
// null/"" -> null | |
public List<String> contexts(String test, int before, int after){ | |
if(test == null || test.isEmpty()){ | |
return null; | |
} | |
String xlated = getTranslated(0, test.length(), test.toCharArray()); | |
List<Integer> indecies = index.get(xlated); | |
List<String> contexts = new ArrayList<>(); | |
if(indecies == null){ | |
return contexts; | |
} | |
for(Integer index : indecies){ | |
int start = index - before; | |
int realStart = Math.max(start, 0); | |
int end = index + test.length() + after; | |
int realEnd = Math.min(rawText.length, end); | |
int offset = realEnd - realStart; | |
StringBuilder sb = new StringBuilder(); | |
sb.append(rawText, realStart, offset); | |
contexts.add(sb.toString()); | |
} | |
return contexts; | |
} | |
private void index(){ | |
int max = rawText.length; | |
// current non-whitespace stuffs; | |
int beginIndex = 0; | |
int endIndex = 0; | |
for (int i = 0; i < max; i++) { | |
/* eat the whitespace */ | |
while (isWhiteSpace.apply(rawText[i]) && ++i < max); | |
if(i == max) break; | |
beginIndex = i; | |
while (++i < max && !isWhiteSpace.apply(rawText[i])); | |
endIndex = i; | |
// todo: add another loop to not mess with the cache by re-going over the stuffs. | |
// todo: or inline the stuffs. | |
addToIndex(beginIndex, endIndex); | |
} | |
} | |
private void addToIndex(int beginIndex, int endIndex){ | |
// todo: make so you can parallelize this with CAS. | |
String translated = getTranslated(beginIndex, endIndex, rawText); | |
List<Integer> integers = index.get(translated); | |
if(integers == null){ | |
integers = new ArrayList<>(); | |
} | |
integers.add(beginIndex); | |
index.put(translated, integers); | |
} | |
private String getTranslated(int beginIndex, int endIndex, char[] xlate){ | |
// todo: buffer me plz | |
StringBuilder builder = new StringBuilder(); | |
for(int i = beginIndex; i < endIndex; i++){ | |
char[] apply = translate.apply(xlate[i]); | |
builder.append(apply); | |
} | |
return builder.toString(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Tests: