Created
May 20, 2019 15:22
-
-
Save nauful/2d031eb0bc9ff8c22f378170e76e411a to your computer and use it in GitHub Desktop.
Simple and fast document indexer with an inverted index (à la Apache Lucene/Elasticsearch) plus full-text search. Create metadata nodes with ids and associated arrays of prefixable text (e.g. document codes/tags).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.iva.nlp.document; | |
import java.util.Arrays;
import java.util.BitSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
/**
 * Simple in-memory inverted index over "nodes" (an integer id plus an array of
 * searchable strings), supporting prefix lookup and full-text (substring) search.
 *
 * <p>Text is normalised by {@link #cleanText(String)}: upper-cased
 * (locale-independent), trimmed, and every character outside {@code [0-9A-Z]}
 * replaced with {@code '_'}. Full-text candidates are pre-filtered with
 * per-character, 2-gram and 4-gram hash bitmaps (conservative: may admit false
 * positives, never drops a true match), then verified with an exact
 * case-insensitive substring check.
 *
 * <p>Not thread-safe during construction; queries are read-only afterwards.
 */
public class DocumentInvertedIndexer {

    /** Immutable pair of a caller-chosen node id and its searchable strings. */
    public static class Node {
        private final int index;
        private final String[] strs;

        /**
         * @param index caller-chosen id, reported back from {@link #matchAny(String)}
         * @param strs  searchable strings attached to this node (not copied)
         */
        public Node(int index, final String[] strs) {
            this.index = index;
            this.strs = strs;
        }

        public int getIndex() {
            return index;
        }

        public String[] getStrs() {
            return strs;
        }
    }

    /** djb2-style hash of the 2-gram starting at position {@code p} of {@code s}. */
    protected int hash2(final String s, int p) {
        return (5381 * 33 + s.charAt(p)) * 33 + s.charAt(p + 1);
    }

    /** djb2-style hash of the 4-gram starting at position {@code p} of {@code s}. */
    protected int hash4(final String s, int p) {
        return (((5381 * 33 + s.charAt(p)) * 33 + s.charAt(p + 1)) * 33 + s.charAt(p + 2)) * 33 + s.charAt(p + 3);
    }

    /**
     * Normalises text for indexing and search: upper-case, trim, and map every
     * character outside {@code [0-9A-Z]} to {@code '_'}.
     */
    protected String cleanText(final String s) {
        final StringBuilder sb = new StringBuilder(s.length());
        // Locale.ROOT keeps the upper-casing stable regardless of the default
        // locale (e.g. the Turkish dotless-i problem).
        for (char c : s.toUpperCase(Locale.ROOT).trim().toCharArray()) {
            if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z')) {
                sb.append(c);
            } else {
                sb.append('_');
            }
        }
        return sb.toString();
    }

    /** [0] = per-character, [1] = 2-gram, [2] = 4-gram candidate bitmaps. */
    protected final BitSet[][] hashes;
    protected final int h2Mask;
    protected final int h4Mask;
    /** Cleaned prefix string -> ids of the nodes that carry it. */
    protected final HashMap<String, HashSet<Integer>> prefixMap;
    /** Sorted keys of {@link #prefixMap}, for lower-bound binary search. */
    protected final String[] prefixKeys;
    protected final Node[] prefixes;
    protected final Node[] fullTexts;

    /** Builds the index with default table sizes (2^10 2-gram, 2^12 4-gram buckets). */
    public DocumentInvertedIndexer(final Node[] prefixes, final Node[] fullTexts) {
        this(prefixes, fullTexts, 10, 12);
    }

    /**
     * Builds the index.
     *
     * @param prefixes  nodes searchable by prefix (e.g. document codes/tags)
     * @param fullTexts nodes searchable by case-insensitive substring
     * @param h2Bits    log2 of the 2-gram bitmap table size
     * @param h4Bits    log2 of the 4-gram bitmap table size
     */
    public DocumentInvertedIndexer(final Node[] prefixes, final Node[] fullTexts, int h2Bits, int h4Bits) {
        this.h2Mask = (1 << h2Bits) - 1;
        this.h4Mask = (1 << h4Bits) - 1;
        this.prefixes = prefixes;
        this.fullTexts = fullTexts;

        // Strings often repeat across nodes; clean each distinct one only once.
        final HashMap<String, String> cleanTextCache = new HashMap<>();

        prefixMap = new HashMap<>();
        for (final Node n : prefixes) {
            for (final String xs : n.getStrs()) {
                final String s = cleanTextCache.computeIfAbsent(xs, this::cleanText);
                if (!s.isEmpty()) {
                    prefixMap.computeIfAbsent(s, (k) -> new HashSet<>()).add(n.getIndex());
                }
            }
        }
        prefixKeys = prefixMap.keySet().toArray(new String[0]);
        Arrays.sort(prefixKeys);

        hashes = new BitSet[3][];
        hashes[0] = new BitSet[256]; // cleanText output chars are always < 256
        hashes[1] = new BitSet[h2Mask + 1];
        hashes[2] = new BitSet[h4Mask + 1];
        for (int i = 0; i < hashes.length; i++) {
            for (int j = 0; j < hashes[i].length; j++) {
                hashes[i][j] = new BitSet();
            }
        }

        for (final Node n : fullTexts) {
            for (final String xs : n.getStrs()) {
                final String s = cleanTextCache.computeIfAbsent(xs, this::cleanText);
                if (s.isEmpty()) {
                    continue;
                }
                for (char c : s.toCharArray()) {
                    hashes[0][c].set(n.getIndex());
                }
                // BUGFIX: the original bounds (p < length-2 / p < length-4)
                // skipped the final 2-gram/4-gram of every string. The query
                // side in matchFullText is fixed identically, so the filter
                // remains conservative and never drops a true match.
                for (int p = 0; p + 2 <= s.length(); p++) {
                    hashes[1][hash2(s, p) & h2Mask].set(n.getIndex());
                }
                for (int p = 0; p + 4 <= s.length(); p++) {
                    hashes[2][hash4(s, p) & h4Mask].set(n.getIndex());
                }
            }
        }
    }

    /**
     * Returns the ids of all prefix nodes that have at least one string whose
     * cleaned form starts with the cleaned request.
     */
    protected BitSet matchPrefixes(final String req) {
        final String reqClean = cleanText(req);
        final BitSet cands = new BitSet();
        if (reqClean.isEmpty()) {
            return cands;
        }
        // Lower-bound binary search: first key >= reqClean. All keys sharing
        // the prefix compare >= reqClean and are contiguous in sorted order.
        // (Replaces the original ad-hoc search, which relied on a single-step
        // linear correction and could land off target.)
        int lo = 0, hi = prefixKeys.length;
        while (lo < hi) {
            final int mid = (lo + hi) >>> 1;
            if (prefixKeys[mid].compareTo(reqClean) < 0) {
                lo = mid + 1;
            } else {
                hi = mid;
            }
        }
        for (int i = lo; i < prefixKeys.length && prefixKeys[i].startsWith(reqClean); i++) {
            for (final int id : prefixMap.get(prefixKeys[i])) {
                cands.set(id);
            }
        }
        return cands;
    }

    /**
     * Returns the ids of all full-text nodes containing {@code req} as a
     * case-insensitive substring. The n-gram bitmaps prune candidates; an
     * exact contains() pass removes hash false positives.
     */
    protected BitSet matchFullText(final String req) {
        final String reqClean = cleanText(req);
        final BitSet cands = new BitSet();
        if (reqClean.isEmpty()) {
            return cands;
        }

        // Seed with the first character's bitmap, then intersect the rest.
        cands.or(hashes[0][reqClean.charAt(0)]);
        for (int p = 1; p < reqClean.length(); p++) {
            cands.and(hashes[0][reqClean.charAt(p)]);
        }
        // BUGFIX: bounds now use reqClean (the string actually hashed) rather
        // than req, and include the final 2-gram/4-gram to match the indexer.
        for (int p = 0; p + 2 <= reqClean.length(); p++) {
            cands.and(hashes[1][hash2(reqClean, p) & h2Mask]);
        }
        for (int p = 0; p + 4 <= reqClean.length(); p++) {
            cands.and(hashes[2][hash4(reqClean, p) & h4Mask]);
        }
        if (cands.isEmpty()) {
            return cands;
        }

        // Exact verification. BUGFIX: the original looked candidates up by
        // array position i, which is wrong whenever fullTexts[i].getIndex()
        // differs from i; keep only ids whose node really contains the text.
        final String reqUpper = req.toUpperCase(Locale.ROOT);
        final BitSet verified = new BitSet();
        for (final Node n : fullTexts) {
            if (!cands.get(n.getIndex())) {
                continue;
            }
            for (final String s : n.getStrs()) {
                if (s.toUpperCase(Locale.ROOT).contains(reqUpper)) {
                    verified.set(n.getIndex());
                    break;
                }
            }
        }
        return verified;
    }

    /**
     * Returns the sorted ids of all nodes matching {@code req}, either by
     * prefix or by full-text substring.
     */
    public int[] matchAny(final String req) {
        final BitSet matched = matchFullText(req);
        matched.or(matchPrefixes(req));
        // BUGFIX: iterate the set bits themselves. The original scanned only
        // indices < fullTexts.length, silently dropping prefix-node ids beyond
        // that range and leaving zero-filled slots in the result array.
        final int[] res = new int[matched.cardinality()];
        int w = 0;
        for (int i = matched.nextSetBit(0); i >= 0; i = matched.nextSetBit(i + 1)) {
            res[w++] = i;
        }
        return res;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment