Created
May 20, 2019 15:22
-
-
Save nauful/2d031eb0bc9ff8c22f378170e76e411a to your computer and use it in GitHub Desktop.
Simple and fast document indexer with an inverted index (à la Apache Lucene/Elasticsearch) plus full-text search. Create metadata nodes with ids and associated arrays of prefixable text (e.g. document codes/tags).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.iva.nlp.document; | |
import java.util.Arrays;
import java.util.BitSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
/**
 * Simple in-memory inverted index over "nodes" (an integer id plus an array of
 * searchable strings), supporting prefix lookup and full-text (substring) search.
 *
 * <p>Text is normalised by {@link #cleanText(String)}: upper-cased
 * (locale-independent), trimmed, and every character outside {@code [0-9A-Z]}
 * replaced with {@code '_'}. Full-text candidates are pre-filtered with
 * per-character, 2-gram and 4-gram hash bitmaps (conservative: may admit false
 * positives, never drops a true match), then verified with an exact
 * case-insensitive substring check.
 *
 * <p>Not thread-safe during construction; queries are read-only afterwards.
 */
public class DocumentInvertedIndexer {

    /** Immutable pair of a caller-chosen node id and its searchable strings. */
    public static class Node {
        private final int index;
        private final String[] strs;

        /**
         * @param index caller-chosen id, reported back from {@link #matchAny(String)}
         * @param strs  searchable strings attached to this node (not copied)
         */
        public Node(int index, final String[] strs) {
            this.index = index;
            this.strs = strs;
        }

        public int getIndex() {
            return index;
        }

        public String[] getStrs() {
            return strs;
        }
    }

    /** djb2-style hash of the 2-gram starting at position {@code p} of {@code s}. */
    protected int hash2(final String s, int p) {
        return (5381 * 33 + s.charAt(p)) * 33 + s.charAt(p + 1);
    }

    /** djb2-style hash of the 4-gram starting at position {@code p} of {@code s}. */
    protected int hash4(final String s, int p) {
        return (((5381 * 33 + s.charAt(p)) * 33 + s.charAt(p + 1)) * 33 + s.charAt(p + 2)) * 33 + s.charAt(p + 3);
    }

    /**
     * Normalises text for indexing and search: upper-case, trim, and map every
     * character outside {@code [0-9A-Z]} to {@code '_'}.
     */
    protected String cleanText(final String s) {
        final StringBuilder sb = new StringBuilder(s.length());
        // Locale.ROOT keeps the upper-casing stable regardless of the default
        // locale (e.g. the Turkish dotless-i problem).
        for (char c : s.toUpperCase(Locale.ROOT).trim().toCharArray()) {
            if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z')) {
                sb.append(c);
            } else {
                sb.append('_');
            }
        }
        return sb.toString();
    }

    /** [0] = per-character, [1] = 2-gram, [2] = 4-gram candidate bitmaps. */
    protected final BitSet[][] hashes;
    protected final int h2Mask;
    protected final int h4Mask;
    /** Cleaned prefix string -> ids of the nodes that carry it. */
    protected final HashMap<String, HashSet<Integer>> prefixMap;
    /** Sorted keys of {@link #prefixMap}, for lower-bound binary search. */
    protected final String[] prefixKeys;
    protected final Node[] prefixes;
    protected final Node[] fullTexts;

    /** Builds the index with default table sizes (2^10 2-gram, 2^12 4-gram buckets). */
    public DocumentInvertedIndexer(final Node[] prefixes, final Node[] fullTexts) {
        this(prefixes, fullTexts, 10, 12);
    }

    /**
     * Builds the index.
     *
     * @param prefixes  nodes searchable by prefix (e.g. document codes/tags)
     * @param fullTexts nodes searchable by case-insensitive substring
     * @param h2Bits    log2 of the 2-gram bitmap table size
     * @param h4Bits    log2 of the 4-gram bitmap table size
     */
    public DocumentInvertedIndexer(final Node[] prefixes, final Node[] fullTexts, int h2Bits, int h4Bits) {
        this.h2Mask = (1 << h2Bits) - 1;
        this.h4Mask = (1 << h4Bits) - 1;
        this.prefixes = prefixes;
        this.fullTexts = fullTexts;

        // Strings often repeat across nodes; clean each distinct one only once.
        final HashMap<String, String> cleanTextCache = new HashMap<>();

        prefixMap = new HashMap<>();
        for (final Node n : prefixes) {
            for (final String xs : n.getStrs()) {
                final String s = cleanTextCache.computeIfAbsent(xs, this::cleanText);
                if (!s.isEmpty()) {
                    prefixMap.computeIfAbsent(s, (k) -> new HashSet<>()).add(n.getIndex());
                }
            }
        }
        prefixKeys = prefixMap.keySet().toArray(new String[0]);
        Arrays.sort(prefixKeys);

        hashes = new BitSet[3][];
        hashes[0] = new BitSet[256]; // cleanText output chars are always < 256
        hashes[1] = new BitSet[h2Mask + 1];
        hashes[2] = new BitSet[h4Mask + 1];
        for (int i = 0; i < hashes.length; i++) {
            for (int j = 0; j < hashes[i].length; j++) {
                hashes[i][j] = new BitSet();
            }
        }

        for (final Node n : fullTexts) {
            for (final String xs : n.getStrs()) {
                final String s = cleanTextCache.computeIfAbsent(xs, this::cleanText);
                if (s.isEmpty()) {
                    continue;
                }
                for (char c : s.toCharArray()) {
                    hashes[0][c].set(n.getIndex());
                }
                // BUGFIX: the original bounds (p < length-2 / p < length-4)
                // skipped the final 2-gram/4-gram of every string. The query
                // side in matchFullText is fixed identically, so the filter
                // remains conservative and never drops a true match.
                for (int p = 0; p + 2 <= s.length(); p++) {
                    hashes[1][hash2(s, p) & h2Mask].set(n.getIndex());
                }
                for (int p = 0; p + 4 <= s.length(); p++) {
                    hashes[2][hash4(s, p) & h4Mask].set(n.getIndex());
                }
            }
        }
    }

    /**
     * Returns the ids of all prefix nodes that have at least one string whose
     * cleaned form starts with the cleaned request.
     */
    protected BitSet matchPrefixes(final String req) {
        final String reqClean = cleanText(req);
        final BitSet cands = new BitSet();
        if (reqClean.isEmpty()) {
            return cands;
        }
        // Lower-bound binary search: first key >= reqClean. All keys sharing
        // the prefix compare >= reqClean and are contiguous in sorted order.
        // (Replaces the original ad-hoc search, which relied on a single-step
        // linear correction and could land off target.)
        int lo = 0, hi = prefixKeys.length;
        while (lo < hi) {
            final int mid = (lo + hi) >>> 1;
            if (prefixKeys[mid].compareTo(reqClean) < 0) {
                lo = mid + 1;
            } else {
                hi = mid;
            }
        }
        for (int i = lo; i < prefixKeys.length && prefixKeys[i].startsWith(reqClean); i++) {
            for (final int id : prefixMap.get(prefixKeys[i])) {
                cands.set(id);
            }
        }
        return cands;
    }

    /**
     * Returns the ids of all full-text nodes containing {@code req} as a
     * case-insensitive substring. The n-gram bitmaps prune candidates; an
     * exact contains() pass removes hash false positives.
     */
    protected BitSet matchFullText(final String req) {
        final String reqClean = cleanText(req);
        final BitSet cands = new BitSet();
        if (reqClean.isEmpty()) {
            return cands;
        }

        // Seed with the first character's bitmap, then intersect the rest.
        cands.or(hashes[0][reqClean.charAt(0)]);
        for (int p = 1; p < reqClean.length(); p++) {
            cands.and(hashes[0][reqClean.charAt(p)]);
        }
        // BUGFIX: bounds now use reqClean (the string actually hashed) rather
        // than req, and include the final 2-gram/4-gram to match the indexer.
        for (int p = 0; p + 2 <= reqClean.length(); p++) {
            cands.and(hashes[1][hash2(reqClean, p) & h2Mask]);
        }
        for (int p = 0; p + 4 <= reqClean.length(); p++) {
            cands.and(hashes[2][hash4(reqClean, p) & h4Mask]);
        }
        if (cands.isEmpty()) {
            return cands;
        }

        // Exact verification. BUGFIX: the original looked candidates up by
        // array position i, which is wrong whenever fullTexts[i].getIndex()
        // differs from i; keep only ids whose node really contains the text.
        final String reqUpper = req.toUpperCase(Locale.ROOT);
        final BitSet verified = new BitSet();
        for (final Node n : fullTexts) {
            if (!cands.get(n.getIndex())) {
                continue;
            }
            for (final String s : n.getStrs()) {
                if (s.toUpperCase(Locale.ROOT).contains(reqUpper)) {
                    verified.set(n.getIndex());
                    break;
                }
            }
        }
        return verified;
    }

    /**
     * Returns the sorted ids of all nodes matching {@code req}, either by
     * prefix or by full-text substring.
     */
    public int[] matchAny(final String req) {
        final BitSet matched = matchFullText(req);
        matched.or(matchPrefixes(req));
        // BUGFIX: iterate the set bits themselves. The original scanned only
        // indices < fullTexts.length, silently dropping prefix-node ids beyond
        // that range and leaving zero-filled slots in the result array.
        final int[] res = new int[matched.cardinality()];
        int w = 0;
        for (int i = matched.nextSetBit(0); i >= 0; i = matched.nextSetBit(i + 1)) {
            res[w++] = i;
        }
        return res;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment