Skip to content

Instantly share code, notes, and snippets.

@Jire
Last active December 12, 2023 23:18
Show Gist options
  • Save Jire/4aa72bd3554cdccdc369c216a230ee56 to your computer and use it in GitHub Desktop.
Save Jire/4aa72bd3554cdccdc369c216a230ee56 to your computer and use it in GitHub Desktop.
Based on PimDeWitte's filter, this improves performance by over an order of magnitude and eliminates all garbage (allocations).
import it.unimi.dsi.fastutil.longs.Long2ObjectMap;
import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import net.openhft.hashing.LongHashFunction;
/**
 * Substring-based profanity filter.
 *
 * <p>Flagged words are stored only as 64-bit xxHash values. An input is normalized
 * (lower-cased, leetspeak substituted, every other non-letter dropped) and every
 * substring up to {@link #largestWordLength} characters long is hashed and looked up.
 * Because only hashes are compared, a hash collision could in principle produce a
 * false positive.
 *
 * <p>Originally created by Pim De Witte.
 *
 * <p>Performance drastically improved by over an order of magnitude by Thomas G. P. Nappo (Jire);
 * the lookup path reuses a per-thread buffer instead of building substrings.
 */
public class BadWords {

    /** CSV export of the word list: column 1 is the word, optional column 2 is a '_'-separated ignore list. */
    private static final String WORD_LIST_URL =
            "https://docs.google.com/spreadsheets/d/1hIEi2YG3ydav1E06Bzf2mQbGZ12kh2fe4ISgLg_UBuM/export?format=csv";

    /** Shared value for words that have no ignore phrases. */
    private static final String[] NO_IGNORES = new String[0];

    /** Hash of each flagged word mapped to the phrases that, when present in the input, excuse that word. */
    static Long2ObjectMap<String[]> words = new Long2ObjectOpenHashMap<>();

    /** Length of the longest flagged word; upper bound (inclusive) on the substring length that is hashed. */
    static int largestWordLength = 0;

    /**
     * Leetspeak substitutions as {letter, look-alike} pairs. Order matters: the first
     * pair whose look-alike matches wins (so '0' becomes 'o', '1' and '!' become 'i').
     */
    private static final char[][] LEET_CONVERSIONS = {
            {'o', '0'},
            {'i', '1'},
            {'l', '1'},
            {'t', '+'},
            {'e', '3'},
            {'i', '!'},
            {'l', '!'},
            {'s', '$'},
            {'a', '&'},
            {'a', '@'},
            {'c', '('},
            {'d', ')'},
            {'d', '0'},
            {'g', '6'},
            {'t', '7'},
            {'g', '9'},
            {'s', '5'},
            {'a', '4'}
    };

    /** Returned by {@link #unleet(char)} when a character has no substitution. */
    private static final char NO_CONVERSION = '\0';

    // Per-thread scratch buffer for the normalized input; make this a plain field if you don't need thread safety.
    private static final ThreadLocal<StringBuilder> NORMALIZED_BUFFER = ThreadLocal.withInitial(StringBuilder::new);

    /**
     * Flags a single word with no ignore phrases.
     *
     * @param word the word to flag; spaces are removed and it is lower-cased before hashing
     * @throws NullPointerException if {@code word} is null
     */
    public static void flag(String word) {
        register(word, NO_IGNORES);
    }

    /**
     * Downloads the CSV word list and registers every entry. Each line is
     * {@code word[,ignore1_ignore2_...]}. An {@link IOException} is reported on
     * standard error; entries read before the failure stay registered.
     */
    public static void loadConfigs() {
        int counter = 0;
        // try-with-resources closes the reader (and the connection stream beneath it) on every path.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new URL(WORD_LIST_URL).openConnection().getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] content = line.split(",");
                if (content.length == 0) {
                    continue;
                }
                String[] ignores = content.length > 1 ? content[1].split("_") : NO_IGNORES;
                if (register(content[0], ignores)) {
                    counter++;
                }
            }
            System.out.println("Loaded " + counter + " words to filter out");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Iterates over a String input and checks whether a flagged word occurs in it, then checks
     * whether the hit should be ignored (e.g. "bass" contains the word *ss).
     *
     * @param input the text to scan; may be null
     * @return {@code true} if a flagged word is found that is not excused by one of its ignore
     *         phrases; {@code false} otherwise or when {@code input} is null
     */
    public static boolean badWordsFound(String input) {
        if (input == null) {
            return false;
        }
        StringBuilder normalized = NORMALIZED_BUFFER.get();
        normalized.setLength(0);
        appendNormalized(input, normalized);

        final int length = normalized.length();
        final LongHashFunction hasher = LongHashFunction.xx();
        for (int start = 0; start < length; start++) {
            // Inclusive bound: a candidate may be exactly as long as the longest flagged word.
            int maxLen = Math.min(length - start, largestWordLength);
            for (int len = 1; len <= maxLen; len++) {
                // Values are never null, so a null result means "not a flagged word".
                String[] ignoreCheck = words.get(hasher.hashChars(normalized, start, len));
                if (ignoreCheck != null && !containsAny(normalized, ignoreCheck)) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Normalizes and stores one word. Returns {@code false} (storing nothing) when the
     * word is empty after normalization, since an empty word can never match.
     */
    private static boolean register(String word, String[] ignoreInCombinationWithWords) {
        String key = normalizeEntry(word);
        if (key.isEmpty()) {
            return false;
        }
        if (key.length() > largestWordLength) {
            largestWordLength = key.length();
        }
        words.put(LongHashFunction.xx().hashChars(key), normalizeIgnores(ignoreInCombinationWithWords));
        return true;
    }

    /** Removes spaces and lower-cases per char, matching how {@link #badWordsFound} lower-cases its input. */
    private static String normalizeEntry(String raw) {
        StringBuilder out = new StringBuilder(raw.length());
        for (int i = 0; i < raw.length(); i++) {
            char c = raw.charAt(i);
            if (c != ' ') {
                out.append(Character.toLowerCase(c));
            }
        }
        return out.toString();
    }

    /** Normalizes ignore phrases and drops empty ones (an empty phrase would match every input). */
    private static String[] normalizeIgnores(String[] raw) {
        String[] kept = new String[raw.length];
        int count = 0;
        for (String phrase : raw) {
            String cleaned = normalizeEntry(phrase);
            if (!cleaned.isEmpty()) {
                kept[count++] = cleaned;
            }
        }
        if (count == kept.length) {
            return kept;
        }
        String[] trimmed = new String[count];
        System.arraycopy(kept, 0, trimmed, 0, count);
        return trimmed;
    }

    /** Appends the lower-cased letters of {@code input}, mapping leetspeak look-alikes and dropping everything else. */
    private static void appendNormalized(String input, StringBuilder out) {
        for (int i = 0; i < input.length(); i++) {
            char c = input.charAt(i);
            if (Character.isLetter(c)) {
                out.append(Character.toLowerCase(c));
            } else {
                char letter = unleet(c);
                if (letter != NO_CONVERSION) {
                    out.append(letter);
                }
            }
        }
    }

    /** Returns the letter that {@code c} stands in for, or {@link #NO_CONVERSION} if there is none. */
    private static char unleet(char c) {
        for (char[] conversion : LEET_CONVERSIONS) {
            if (c == conversion[1]) {
                return conversion[0];
            }
        }
        return NO_CONVERSION;
    }

    /** Returns {@code true} if any of {@code phrases} occurs in {@code text}. */
    private static boolean containsAny(StringBuilder text, String[] phrases) {
        for (String phrase : phrases) {
            if (text.indexOf(phrase) >= 0) {
                return true;
            }
        }
        return false;
    }
}
@ChosenQuill
Copy link

ChosenQuill commented Jan 9, 2019

I know you are focused on full memory optimization, but is there any way to return the swear word for reference in an automatic report? I get a ton of false positives in messages, and it's easier to figure out what the false positive is if I have the word available.

ie
if (!ignore) { return wordToCheck; }
and at the end if nothing is found return null

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment