Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vi3k6i5/00a2a7ddd36e1de28ccc019da6185aed to your computer and use it in GitHub Desktop.
Save vi3k6i5/00a2a7ddd36e1de28ccc019da6185aed to your computer and use it in GitHub Desktop.
Benchmarking the timing performance of keyword extraction using regex in Java
// compare the results with FlashText here https://gist.github.com/vi3k6i5/604eefd92866d081cfa19f862224e4a0
import java.util.regex.*;
import java.lang.StringBuilder;
import java.util.*;
/**
 * Micro-benchmark that times regex-based keyword search over a randomly
 * generated text.
 *
 * <p>A pool of random alphanumeric words is generated, a "story" is built from
 * a random sample of that pool, and then, for a growing number of keywords, an
 * alternation pattern of the form {@code \bw1\b|\bw2\b|...} is compiled and the
 * time needed to scan the whole story with {@link Matcher#find()} is printed.
 * Pattern compilation is deliberately excluded from the measured interval.
 */
public class RegexBenchmark {

    /** Characters that random words are drawn from: lowercase ASCII letters and digits. */
    private static final String WORD_ALPHABET = "abcdefghijklmnopqrstuvwxyz1234567890";

    /** Candidate lengths for the generated words; one is picked uniformly per word. */
    private static final int[] WORD_LENGTHS = {3, 4, 5, 6, 7, 8};

    /** Number of words in the generated pool. */
    private static final int WORD_COUNT = 100000;

    /** Number of pool words that make up the searched story. */
    private static final int STORY_WORD_COUNT = 5000;

    /** Single shared generator; avoids allocating a new {@link Random} for every word. */
    private static final Random RANDOM = new Random();

    /**
     * Builds a random word over the alphabet {@code [a-z0-9]}.
     *
     * @param length number of characters to generate; values {@code <= 0} yield an empty string
     * @return a random string of exactly {@code length} characters (empty if {@code length <= 0})
     */
    public static String getWordOfLength(int length) {
        StringBuilder word = new StringBuilder(Math.max(length, 0));
        while (word.length() < length) {
            // nextInt(bound) is uniform over [0, bound) and cannot round up to the bound
            // the way a float product can.
            word.append(WORD_ALPHABET.charAt(RANDOM.nextInt(WORD_ALPHABET.length())));
        }
        return word.toString();
    }

    /**
     * Generates the word pool used by the benchmark: 100000 random words, each
     * with a length chosen uniformly from 3 to 8 characters.
     *
     * <p>Java counterpart of the Python expression
     * {@code [get_word_of_length(random.choice([3, 4, 5, 6, 7, 8])) for i in range(100000)]}.
     * Words are not guaranteed to be distinct.
     *
     * @return a new mutable list containing the generated words
     */
    public static ArrayList<String> getWordsList() {
        ArrayList<String> words = new ArrayList<String>(WORD_COUNT);
        for (int i = 0; i < WORD_COUNT; i++) {
            int wordLength = WORD_LENGTHS[RANDOM.nextInt(WORD_LENGTHS.length)];
            words.add(getWordOfLength(wordLength));
        }
        return words;
    }

    /**
     * Picks {@code n} elements at random (by position, without replacement) from {@code list}.
     *
     * <p>Side effect: {@code list} is partially shuffled in place. Only the last
     * {@code n} positions are randomised (a partial Fisher-Yates shuffle), which
     * avoids shuffling the whole list.
     *
     * <p>The returned list is an independent copy, so later calls that reshuffle
     * {@code list} do not change a previously returned sample.
     *
     * @param list the list to sample from; reordered by this call
     * @param n    number of elements to pick
     * @return a new list holding the {@code n} picked elements, or {@code null}
     *         if {@code list} contains fewer than {@code n} elements
     */
    public static List<String> pickNRandomElements(ArrayList<String> list, int n) {
        int length = list.size();
        if (length < n) {
            return null;
        }
        int firstPicked = length - n;
        for (int i = length - 1; i >= firstPicked; --i) {
            Collections.swap(list, i, RANDOM.nextInt(i + 1));
        }
        // Copy instead of returning the subList view: the view would alias the
        // caller's list and be rewritten by the next shuffle.
        return new ArrayList<String>(list.subList(firstPicked, length));
    }

    /**
     * Concatenates the elements of {@code list}, placing {@code delim} between
     * consecutive elements.
     *
     * @param list  the strings to join
     * @param delim the separator inserted between elements
     * @return the joined string; empty if {@code list} is empty
     */
    public static String join(List<String> list, String delim) {
        StringBuilder joined = new StringBuilder();
        String separator = "";
        for (String s : list) {
            joined.append(separator).append(s);
            separator = delim; // every element after the first is preceded by the delimiter
        }
        return joined.toString();
    }

    /**
     * Builds the alternation {@code \bk1\b|\bk2\b|...} for the given keywords.
     * Keywords are inserted verbatim; the benchmark only feeds it words over
     * {@code [a-z0-9]}, which contain no regex metacharacters.
     */
    private static String buildKeywordPattern(List<String> keywords) {
        StringBuilder pattern = new StringBuilder();
        String separator = "";
        for (String keyword : keywords) {
            pattern.append(separator).append("\\b").append(keyword).append("\\b");
            separator = "|";
        }
        return pattern.toString();
    }

    /**
     * Runs the benchmark. Prints the pool size, the number of story words and
     * the story length, then one line per keyword count (1, 1001, ..., 20001)
     * of the form {@code <keywords> execution time: <milliseconds>}.
     *
     * @param args ignored
     */
    public static void main(String args[]) {
        ArrayList<String> wordList = getWordsList();
        System.out.println(wordList.size());

        List<String> chosenWords = pickNRandomElements(wordList, STORY_WORD_COUNT);
        System.out.println(chosenWords.size());

        String story = join(chosenWords, " ");
        System.out.println(story.length());

        for (int keywordsLength = 1; keywordsLength <= 20001; keywordsLength += 1000) {
            List<String> keywords = pickNRandomElements(wordList, keywordsLength);
            Pattern compiled = Pattern.compile(buildKeywordPattern(keywords));
            Matcher matcher = compiled.matcher(story);

            // nanoTime is monotonic and intended for measuring elapsed time;
            // only the scan itself is timed, not pattern compilation.
            long start = System.nanoTime();
            int matches = 0;
            while (matcher.find()) {
                matches += 1;
            }
            long elapsedMillis = (System.nanoTime() - start) / 1000000L;

            System.out.println(keywordsLength + " execution time: " + elapsedMillis);
        }
    }
}
// Results: number of keywords / execution time in milliseconds
// 1 execution time: 7
// 1001 execution time: 425
// 2001 execution time: 718
// 3001 execution time: 1084
// 4001 execution time: 1461
// 5001 execution time: 1791
// 6001 execution time: 2257
// 7001 execution time: 2655
// 8001 execution time: 3048
// 9001 execution time: 3417
// 10001 execution time: 3744
// 11001 execution time: 4092
// 12001 execution time: 4427
// 13001 execution time: 4724
// 14001 execution time: 5057
// 15001 execution time: 5204
// 16001 execution time: 5494
// 17001 execution time: 5777
// 18001 execution time: 6049
// 19001 execution time: 6419
// 20001 execution time: 6620
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment