Skip to content

Instantly share code, notes, and snippets.

@nauful
Last active May 20, 2019 15:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nauful/11098427da651be0e69c6beff4c81d29 to your computer and use it in GitHub Desktop.
Save nauful/11098427da651be0e69c6beff4c81d29 to your computer and use it in GitHub Desktop.
Efficient and low-memory document indexer with inverted index + full-text search (ala Apache Lucene/ElasticSearch) extended to find partial word matches. Create meta-data nodes with ids and related arrays of prefixable text (i.e. document codes/tags), and nodes with ids and related arrays of body text (i.e. entire document bodies).
package com.iva.nlp.document;
import java.util.Arrays;
import java.util.BitSet;
import java.util.HashMap;
import java.util.HashSet;
public class DocumentInvertedIndexer {
public static class Node {
private final int index;
private final String[] strs;
public Node(int index, final String[] strs) {
this.index = index;
this.strs = strs;
}
public int getIndex() {
return index;
}
public String[] getStrs() {
return strs;
}
}
protected int hash2(final String s, int p) {
return (5381 * 33 + s.charAt(p)) * 33 + s.charAt(p + 1);
}
protected int hash4(final String s, int p) {
return (((5381 * 33 + s.charAt(p)) * 33 + s.charAt(p + 1)) * 33 + s.charAt(p + 2)) * 33 + s.charAt(p + 3);
}
protected String cleanText(final String s) {
final StringBuilder sb = new StringBuilder(s.length());
for (char c : s.toUpperCase().trim().toCharArray()) {
if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z')) {
sb.append(c);
}
else {
sb.append('_');
}
}
return sb.toString();
}
protected final BitSet[][] hashes;
protected final int h2Mask;
protected final int h4Mask;
protected final HashMap<String, HashSet<Integer>> prefixMap;
protected final String[] prefixKeys;
protected final Node[] prefixes;
protected final Node[] fullTexts;
public DocumentInvertedIndexer(final Node[] prefixes, final Node[] fullTexts) {
this(prefixes, fullTexts, 10, 12);
}
public DocumentInvertedIndexer(final Node[] prefixes, final Node[] fullTexts, int h2Bits, int h4Bits) {
this.h2Mask = (1 << h2Bits) - 1;
this.h4Mask = (1 << h4Bits) - 1;
this.prefixes = prefixes;
this.fullTexts = fullTexts;
final HashMap<String, String> cleanTextCache = new HashMap<>();
prefixMap = new HashMap<>();
for (final Node n : prefixes) {
for (final String xs : n.getStrs()) {
final String s = cleanTextCache.computeIfAbsent(xs, (k) -> cleanText(xs));
if (!s.isEmpty()) {
prefixMap.computeIfAbsent(s, (k) -> new HashSet<>()).add(n.getIndex());
}
}
}
prefixKeys = prefixMap.keySet().toArray(new String[prefixMap.keySet().size()]);
Arrays.sort(prefixKeys);
hashes = new BitSet[3][];
hashes[0] = new BitSet[256];
hashes[1] = new BitSet[h2Mask + 1];
hashes[2] = new BitSet[h4Mask + 1];
for (int i = 0; i < hashes.length; i++) {
for (int j = 0; j < hashes[i].length; j++) {
hashes[i][j] = new BitSet();
}
}
for (final Node n : fullTexts) {
for (String xs : n.getStrs()) {
final String s = cleanTextCache.computeIfAbsent(xs, (k) -> cleanText(xs));
if (s.isEmpty()) {
continue;
}
for (char c : s.toCharArray()) {
hashes[0][c].set(n.getIndex());
}
if (s.length() >= 2) {
for (int p = 0; p < s.length() - 2; p++) {
hashes[1][hash2(s, p) & h2Mask].set(n.getIndex());
}
if (s.length() >= 4) {
for (int p = 0; p < s.length() - 4; p++) {
hashes[2][hash4(s, p) & h4Mask].set(n.getIndex());
}
}
}
}
}
}
protected BitSet matchPrefixes(final String req) {
final String reqClean = cleanText(req);
final BitSet cands = new BitSet();
if (reqClean.isEmpty()) {
return cands;
}
int off = 0;
int mid = prefixKeys.length, len = prefixKeys.length;
while (mid > 0) {
mid = len / 2;
int c = reqClean.compareTo(prefixKeys[off + mid]);
if (c > 0) {
off += mid;
}
len -= mid;
}
int runs = 0;
while (off < prefixKeys.length && runs < 1 && !prefixKeys[off].startsWith(reqClean)) {
++off;
++runs;
}
while (off < prefixKeys.length && prefixKeys[off].startsWith(reqClean)) {
for (int i : prefixMap.get(prefixKeys[off])) {
cands.set(i);
}
++off;
}
return cands;
}
protected BitSet matchFullText(final String req) {
final String reqClean = cleanText(req);
final BitSet cands = new BitSet();
for (int p = 0; p < reqClean.length(); p++) {
if (p == 0) {
cands.or(hashes[0][reqClean.charAt(p)]);
}
else {
cands.and(hashes[0][reqClean.charAt(p)]);
}
}
if (req.length() >= 2) {
for (int p = 0; p < reqClean.length() - 2; p++) {
cands.and(hashes[1][hash2(reqClean, p) & h2Mask]);
}
if (req.length() >= 4) {
for (int p = 0; p < reqClean.length() - 4; p++) {
cands.and(hashes[2][hash4(reqClean, p) & h4Mask]);
}
}
}
if (cands.isEmpty()) {
return cands;
}
final String reqUpper = req.toUpperCase();
for (int i = 0; i < fullTexts.length; i++) {
if (!cands.get(i)) {
continue;
}
boolean add = false;
for (final String s : fullTexts[i].strs) {
add = s.toUpperCase().contains(reqUpper);
if (add) {
break;
}
}
if (!add) {
cands.clear(i);
}
}
return cands;
}
public int[] matchAny(final String req) {
final BitSet r1 = matchFullText(req);
final BitSet r2 = matchPrefixes(req);
r1.or(r2);
final int n = r1.cardinality();
final int[] res = new int[n];
int w = 0;
for (int i = 0; i < fullTexts.length; i++) {
if (r1.get(i)) {
res[w++] = i;
}
}
return res;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment