Skip to content

Instantly share code, notes, and snippets.

@abler98
Created November 23, 2020 19:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save abler98/7773268d6b1dcda392123b4009867d3b to your computer and use it in GitHub Desktop.
Save abler98/7773268d6b1dcda392123b4009867d3b to your computer and use it in GitHub Desktop.
package com.example;
import java.util.*;
import java.util.logging.Logger;
import java.util.regex.Pattern;
public class Main {
// Use control characters for minimize chance of matches with original text
private static final String UGLY_HACK_MARKER = "\u0080\u0081\u0082";
private static final String REPLACEMENT = "*";
private static final int BENCHMARK_NUM_OF_ITERS = 10_000_000;
private final Logger logger = Logger.getLogger("main");
public static void main(String[] args) {
new Main().run();
}
public void run() {
var dictionary = Arrays.asList(
new DictionaryItem("при", DictionaryItem.ReplaceType.START),
new DictionaryItem("вет", DictionaryItem.ReplaceType.ANYWHERE),
new DictionaryItem("дела", DictionaryItem.ReplaceType.FULL_WORD)
);
var inputText = "Всем привет! Как дела? ~вет**~ Сюрприз?";
var expectedText = "Всем *! Как *? ~***~ Сюрприз?";
var calculatedResult = replaceWithCalculatedPositions(inputText, dictionary);
if (calculatedResult.equals(expectedText)) {
logger.info("Calculated result: " + calculatedResult);
// Avg execution time: 20790 ms
runBenchmark("calculated", () -> replaceWithCalculatedPositions(inputText, dictionary));
} else {
logger.warning("Invalid calculated result: " + calculatedResult);
}
var uglyResult = replaceWithUglyHack(inputText, dictionary);
if (uglyResult.equals(expectedText)) {
logger.info("Ugly result: " + uglyResult);
// Avg execution time: 60793 ms
runBenchmark("ugly", () -> replaceWithUglyHack(inputText, dictionary));
} else {
logger.warning("Invalid ugly result: " + uglyResult);
}
}
private String replaceWithCalculatedPositions(String input, List<DictionaryItem> dictionary) {
var matchResults = new LinkedList<WordMatch>();
for (var item : dictionary) {
item
.getPattern()
.matcher(input)
.results()
.forEach(matchResult -> matchResults.add(new WordMatch(
matchResult.start(),
matchResult.end()
)));
}
matchResults.sort(Comparator.comparingInt(WordMatch::getStart));
var normalizedMatchResults = new LinkedList<WordMatch>();
for (var match : matchResults) {
var lastMatch = normalizedMatchResults.peekLast();
// Join adjacent matches
if (lastMatch != null && lastMatch.getEnd() >= match.getStart()) {
normalizedMatchResults.removeLast();
normalizedMatchResults.addLast(new WordMatch(lastMatch.getStart(), match.getEnd()));
} else {
normalizedMatchResults.addLast(match);
}
}
var result = new StringBuilder();
int offset = 0;
for (var match : normalizedMatchResults) {
result.append(input, offset, match.getStart());
result.append(REPLACEMENT);
offset = match.getEnd();
}
result.append(input.substring(offset));
return result.toString();
}
// Simple but slow
private String replaceWithUglyHack(String input, List<DictionaryItem> dictionary) {
var wordBoundary = "(?<!" + UGLY_HACK_MARKER + ")(?:\\b)";
for (var item : dictionary) {
input = item
.getPattern(wordBoundary)
.matcher(input)
.replaceAll(UGLY_HACK_MARKER + "$1" + UGLY_HACK_MARKER);
}
return input
.replaceAll("(" + UGLY_HACK_MARKER + "){2,}", "")
.replaceAll(UGLY_HACK_MARKER + "(.+?)" + UGLY_HACK_MARKER, REPLACEMENT);
}
private void runBenchmark(String label, Runnable runnable) {
var startTime = System.nanoTime();
for (int i = 0; i < BENCHMARK_NUM_OF_ITERS; i++) {
runnable.run();
}
var elapsedTimeMs = (System.nanoTime() - startTime) / 1e6;
logger.info("Benchmark [" + label + "]: " + elapsedTimeMs + " ms");
}
private static class WordMatch {
private final int start;
private final int end;
public WordMatch(int start, int end) {
this.start = start;
this.end = end;
}
public int getStart() {
return start;
}
public int getEnd() {
return end;
}
}
private static class DictionaryItem {
public enum ReplaceType {
START,
ANYWHERE,
FULL_WORD,
}
private final Map<String, Pattern> patternCache = new HashMap<>();
private final String value;
private final ReplaceType replaceType;
public DictionaryItem(String value, ReplaceType replaceType) {
this.value = value;
this.replaceType = replaceType;
}
public Pattern getPattern() {
return getPattern("\\b");
}
public Pattern getPattern(String wordBoundary) {
return patternCache.computeIfAbsent(wordBoundary, (key) -> {
var quotedValue = Pattern.quote(value);
var regex = switch (replaceType) {
case START -> wordBoundary + quotedValue;
case ANYWHERE -> quotedValue;
case FULL_WORD -> wordBoundary + quotedValue + wordBoundary;
};
return Pattern.compile(
"(" + regex + ")",
Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE
);
});
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment