Skip to content

Instantly share code, notes, and snippets.

@twillouer
Last active August 29, 2015 14:17
Show Gist options
  • Save twillouer/e4180390dc503fe886d0 to your computer and use it in GitHub Desktop.
Save twillouer/e4180390dc503fe886d0 to your computer and use it in GitHub Desktop.
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(3)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Benchmark)
public class ReplaceAllBench {
@Param({ "10", "100", "1000" })
private int size;
String string;
@Setup
public void setup() throws Throwable
{
string = randomAlphanumericString(size / 3) + "\n\r " + randomAlphanumericString(size / 3) + "\n "
+ randomAlphanumericString(size / 3);
}
static class Replacer {
private static final Pattern COMPILE = Pattern.compile("[\n\r]+ ");
static String unfold_original(String s)
{
s = s.replaceAll("\n\r ", "");
s = s.replaceAll("\r\n ", "");
s = s.replaceAll("\n ", "");
s = s.replaceAll("\r ", "");
return s;
}
static String unfold_regexp(String s)
{
s = s.replaceAll("\n\r |\r\n |\n |\r ", "");
return s;
}
static String unfold_regexpcompiled(String s)
{
s = COMPILE.matcher(s).replaceAll("");
return s;
}
private enum NLStatus {
NONE, RC, NL, RC_NL, NL_RC;
}
/**
* Remove all the '\n' and ’\r' followed by a ' ' from a LDIF String.
*
* @param s The String to unfold
* @return The resulting String
*/
protected static String unfold_optim_emmanuel_1(String s)
{
int pos = 0;
char[] unfold = new char[s.length()];
NLStatus newLine = NLStatus.NONE;
for (char c : s.toCharArray()) {
switch (c) {
case '\n':
switch (newLine) {
case NONE:
newLine = NLStatus.NL;
break;
case RC:
newLine = NLStatus.RC_NL;
break;
case NL:
unfold[pos++] = '\n';
break;
case RC_NL:
unfold[pos++] = '\r';
unfold[pos++] = '\n';
newLine = NLStatus.NL;
break;
case NL_RC:
unfold[pos++] = '\n';
unfold[pos++] = '\r';
newLine = NLStatus.NL;
break;
}
break;
case '\r':
switch (newLine) {
case NONE:
newLine = NLStatus.RC;
break;
case NL:
newLine = NLStatus.NL_RC;
break;
case RC:
unfold[pos++] = '\r';
break;
case RC_NL:
unfold[pos++] = '\r';
unfold[pos++] = '\n';
newLine = NLStatus.RC;
break;
case NL_RC:
unfold[pos++] = '\n';
unfold[pos++] = '\r';
newLine = NLStatus.RC;
break;
}
break;
case ' ':
if (newLine == NLStatus.NONE) {
unfold[pos++] = c;
} else {
newLine = NLStatus.NONE;
}
break;
default:
switch (newLine) {
case NONE:
break;
case NL:
unfold[pos++] = '\n';
newLine = NLStatus.NONE;
break;
case RC:
unfold[pos++] = '\r';
newLine = NLStatus.NONE;
break;
case NL_RC:
unfold[pos++] = '\n';
unfold[pos++] = '\r';
newLine = NLStatus.NONE;
break;
case RC_NL:
unfold[pos++] = '\r';
unfold[pos++] = '\n';
newLine = NLStatus.NONE;
break;
}
unfold[pos++] = c;
}
}
switch (newLine) {
case NONE:
break;
case NL:
unfold[pos++] = '\n';
break;
case RC:
unfold[pos++] = '\r';
break;
case NL_RC:
unfold[pos++] = '\n';
unfold[pos++] = '\r';
break;
case RC_NL:
unfold[pos++] = '\r';
unfold[pos++] = '\n';
break;
}
return new String(unfold, 0, pos);
}
private static final String[] TODO = { "\n\r ", "\r\n ", "\r ", "\n " };
private static final String[] TO = { "", "", "", "" };
protected static String unfold_with_stringutils_on_apache_common(final String string)
{
return StringUtils.replaceEach(string, TODO, TO);
}
public static String unfold_olivier(String test)
{
// Null -> null
if (test == null) {
return null;
}
// 0 or 1 char
if (test.length() < 2)
return test;
// 2 chars
if (test.length() == 2) {
if (test.charAt(1) == ' ') {
char c0 = test.charAt(0);
if (c0 == '\r' || c0 == '\n') {
return "";
}
}
return test;
}
// More than 2 chars
char[] chars = test.toCharArray();
char[] dest = new char[chars.length];
int p = chars.length - 1;
int d = p;
while (p >= 2) {
char c = chars[p];
// Not a space : keep as is
if (c != ' ') {
dest[d] = c;
p--;
d--;
}
// Space
else {
char c1 = chars[p - 1];
// Previous char is special : investigate deeper
if (c1 == '\r' || c1 == '\n') {
p--;
char c2 = chars[p - 1];
if ((c2 == '\r' || c2 == '\n') && c2 != c1) {
p--;
}
p--;
}
// It was just a space : keep it
else {
dest[d] = c;
p--;
d--;
}
}
}
// Keep the remaining chars as it (special cases already covered)
while (p >= 0) {
dest[d--] = chars[p--];
}
return new String(dest, d + 1, chars.length - d - 1);
}
public static String unfold_olivier_optimise_par_emmanuel(String test)
{
// Null -> null
if ( test == null )
{
return null;
}
char[] chars = test.toCharArray();
// 0 or 1 char
if ( chars.length < 2 )
{
return test;
}
// 2 chars
if ( chars.length == 2 )
{
if ( chars[1] == ' ' )
{
if ( chars[0] == '\r' || chars[0] == '\n' )
{
return "";
}
}
return test;
}
// More than 2 chars
int p = chars.length - 1;
int d = p;
while ( p >= 0 )
{
char c = chars[p];
// Not a space : keep as is
if ( c != ' ' )
{
chars[d] = c;
p--;
d--;
}
// Space
else
{
char c1 = chars[p - 1];
// Previous char is special : investigate deeper
if ( c1 == '\r' )
{
if ( ( chars[p - 2] == '\n' ) )
{
p -= 3;
}
else
{
p -= 2;
}
}
else if ( c1 == '\n' )
{
if ( ( chars[p - 2] == '\r' ) )
{
p -= 3;
}
else
{
p -= 2;
}
}
// It was just a space : keep it
else
{
chars[d] = c;
p--;
d--;
}
}
}
// Keep the remaining chars as it (special cases already covered)
while ( p >= 0 )
{
chars[d--] = chars[p--];
}
return new String( chars, d + 1, chars.length - d - 1 );
}
}
@Benchmark
public String unfold_original()
{
return Replacer.unfold_original(string);
}
@Benchmark
public String unfold_regexp()
{
return Replacer.unfold_regexp(string);
}
@Benchmark
public String unfold_regexpcompiled()
{
return Replacer.unfold_regexpcompiled(string);
}
@Benchmark
public String unfold_optim_emmanuel_1()
{
return Replacer.unfold_optim_emmanuel_1(string);
}
@Benchmark
public String unfold_with_stringutils_on_apache_common()
{
return Replacer.unfold_with_stringutils_on_apache_common(string);
}
@Benchmark
public String unfold_unfold_olivier()
{
return Replacer.unfold_olivier(string);
}
@Benchmark
public String unfold_olivier_optimise_par_emmanuel()
{
return Replacer.unfold_olivier_optimise_par_emmanuel(string);
}
public static void main(String[] args) throws RunnerException, IOException
{
Options opt = new OptionsBuilder().include(".*" + ReplaceAllBench.class.getSimpleName() + ".*")
.warmupIterations(20)
.warmupTime(TimeValue.seconds(1))
.measurementIterations(20)
.timeUnit(TimeUnit.NANOSECONDS)
.forks(1)
// .addProfiler(LinuxPerfAsmProfiler.class)
.build();
new Runner(opt).run();
}
}
@melix
Copy link

melix commented Mar 16, 2015

Bon, là on est pas trop mal...

        static String unfold_cedric_ultimate(String s) {
            if (s == null || s.length() < 2) {
                return s;
            }

            char p1 = 'x';
            char p2 = 'x';
            char[] chars = s.toCharArray();
            int wrtAt = 0;

            for (char c : chars) {
                chars[wrtAt++] = c;
                if (' ' == c) {
                    if ('\n' == p1) {
                        if ('\r' == p2) {
                            wrtAt--;
                        }
                        wrtAt--;
                        wrtAt--;
                    }
                    if ('\r' == p1) {
                        if ('\n' == p2) {
                            wrtAt--;
                        }
                        wrtAt--;
                        wrtAt--;
                    }
                }

                p2 = p1;
                p1 = c;

            }

            return new String(chars, 0, wrtAt);
        }

@melix
Copy link

melix commented Mar 16, 2015

After running the full benchmark:

The following methods are deemed to be correct:
Reflection.unfold_cedric_ultimate2
Reflection.unfold_cedric
Reflection.unfold_cedric_improved
Reflection.unfold_regexp
Reflection.unfold_cedric_ultimate
Reflection.unfold_common

The following methods are proved to be incorrect:
Reflection.unfold_regexpcompiled
Reflection.unfold
Reflection.unfold3
Reflection.unfold2
Reflection.unfold_olivier

# Run complete. Total time: 00:26:35

Benchmark                                            (size)   Mode  Samples      Score     Error   Units
r.ReplaceAllBenchmark.unfold_all_regexp                  10  thrpt       20   1571.746 ±  29.372  ops/ms
r.ReplaceAllBenchmark.unfold_all_regexp                 100  thrpt       20    345.773 ±   2.164  ops/ms
r.ReplaceAllBenchmark.unfold_all_regexp                1000  thrpt       20     39.093 ±   0.554  ops/ms
r.ReplaceAllBenchmark.unfold_cedric                      10  thrpt       20  25168.509 ± 525.919  ops/ms
r.ReplaceAllBenchmark.unfold_cedric                     100  thrpt       20   7124.064 ± 223.885  ops/ms
r.ReplaceAllBenchmark.unfold_cedric                    1000  thrpt       20    814.544 ±  16.060  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_improved             10  thrpt       20  33391.135 ± 395.195  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_improved            100  thrpt       20   9804.861 ± 104.523  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_improved           1000  thrpt       20    906.952 ±  12.452  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate             10  thrpt       20  35065.155 ± 696.067  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate            100  thrpt       20   9382.235 ± 138.609  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate           1000  thrpt       20    750.416 ±  15.975  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate2            10  thrpt       20  31528.472 ± 183.737  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate2           100  thrpt       20   9378.629 ± 143.134  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate2          1000  thrpt       20    862.118 ±  18.347  ops/ms
r.ReplaceAllBenchmark.unfold_compiled_regexp             10  thrpt       20   4522.387 ± 120.559  ops/ms
r.ReplaceAllBenchmark.unfold_compiled_regexp            100  thrpt       20    813.303 ±  14.269  ops/ms
r.ReplaceAllBenchmark.unfold_compiled_regexp           1000  thrpt       20    100.297 ±   2.482  ops/ms
r.ReplaceAllBenchmark.unfold_original                    10  thrpt       20   1137.678 ±  11.718  ops/ms
r.ReplaceAllBenchmark.unfold_original                   100  thrpt       20    420.988 ±  15.005  ops/ms
r.ReplaceAllBenchmark.unfold_original                  1000  thrpt       20     54.039 ±   1.534  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_common               10  thrpt       20   6582.421 ± 232.638  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_common              100  thrpt       20   1803.557 ±  61.737  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_common             1000  thrpt       20    113.271 ±   1.129  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_olivier              10  thrpt       20  21048.183 ± 509.316  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_olivier             100  thrpt       20   6366.979 ±  81.390  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_olivier            1000  thrpt       20    578.769 ±  10.877  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_optim_olivier        10  thrpt       20  34515.031 ± 344.551  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_optim_olivier       100  thrpt       20   7551.517 ±  53.829  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_optim_olivier      1000  thrpt       20    673.183 ±  11.679  ops/ms
r.ReplaceAllBenchmark.unfold_very_complicated            10  thrpt       20  13886.697 ± 366.905  ops/ms
r.ReplaceAllBenchmark.unfold_very_complicated           100  thrpt       20   3173.519 ± 153.616  ops/ms
r.ReplaceAllBenchmark.unfold_very_complicated          1000  thrpt       20    285.992 ±  15.024  ops/ms

@melix
Copy link

melix commented Mar 16, 2015

En ne gardant que les algos corrects:

# Run complete. Total time: 00:14:30

Benchmark                                        (size)   Mode  Samples      Score     Error   Units
r.ReplaceAllBenchmark.unfold_all_regexp              10  thrpt       20   1467.862 ±  28.884  ops/ms
r.ReplaceAllBenchmark.unfold_all_regexp             100  thrpt       20    342.911 ±   4.772  ops/ms
r.ReplaceAllBenchmark.unfold_all_regexp            1000  thrpt       20     39.269 ±   0.936  ops/ms
r.ReplaceAllBenchmark.unfold_cedric                  10  thrpt       20  24729.376 ± 935.766  ops/ms
r.ReplaceAllBenchmark.unfold_cedric                 100  thrpt       20   7074.242 ±  53.409  ops/ms
r.ReplaceAllBenchmark.unfold_cedric                1000  thrpt       20    809.072 ±  16.939  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_improved         10  thrpt       20  36381.822 ± 263.571  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_improved        100  thrpt       20   9391.023 ±  60.022  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_improved       1000  thrpt       20    720.513 ±   5.490  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate         10  thrpt       20  36553.386 ± 367.102  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate        100  thrpt       20   9565.954 ± 142.565  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate       1000  thrpt       20    877.977 ±  11.356  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate2        10  thrpt       20  32364.978 ± 706.792  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate2       100  thrpt       20   9668.940 ±  72.017  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate2      1000  thrpt       20    932.645 ±   5.214  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_common           10  thrpt       20   6869.541 ± 133.914  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_common          100  thrpt       20   1739.391 ±  18.654  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_common         1000  thrpt       20    133.172 ±   2.303  ops/ms

cf https://github.com/melix/lecharny-challenge

@OlivierCroisier
Copy link

Hey, mon algo était correct normalement ^^;
Quel est le problème ? Quels sont les cas qui ne passent pas ?

@melix
Copy link

melix commented Mar 16, 2015

C'est pas moi qui ait dit qu'il était faux, c'est la machine. Sur des cas concrets :) Tu peux regarder ce que fait le main pour trouver des exemples qui plantent. De mémoire le tien se plantait sur des séquences du genre \r\r (mais j'ai un doute, ils ont tous des pbs différents :)).

@OlivierCroisier
Copy link

Hmmm, effectivement, un cas n'était pas géré. J'en ai profité pour essayer d'améliorer un peu l'algo. Tu peux le tester ?

public static String unfold_olivier2(String test) {

    // Throws NPE if null, like the original method
    if (test.length() < 2) return test;

    char[] chars = test.toCharArray();
    int p = chars.length - 1;
    int d = p;
    char c, c1, c2;
    while (p > 0) {
        c = chars[p];
        c1 = chars[p - 1];
        if (c == ' ' && (c1 == '\n' || c1 == '\r')) {
            p--;
            if (p > 0) {
                c2 = chars[p - 1];
                if ((c2 == '\n' || c2 == '\r') && c2 != c1) {
                    p--;
                }
            }
        }
        else {
            chars[d] = c;
            d--;
        }
        p--;
    }
    while (p >= 0) {
        chars[d--] = chars[p--];
    }

    return new String(chars, d + 1, chars.length - d - 1);
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment