Skip to content

Instantly share code, notes, and snippets.

@twillouer
Last active August 29, 2015 14:17
Show Gist options
  • Save twillouer/e4180390dc503fe886d0 to your computer and use it in GitHub Desktop.
Save twillouer/e4180390dc503fe886d0 to your computer and use it in GitHub Desktop.
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(3)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Benchmark)
public class ReplaceAllBench {
@Param({ "10", "100", "1000" })
private int size;
String string;
@Setup
public void setup() throws Throwable
{
string = randomAlphanumericString(size / 3) + "\n\r " + randomAlphanumericString(size / 3) + "\n "
+ randomAlphanumericString(size / 3);
}
static class Replacer {
private static final Pattern COMPILE = Pattern.compile("[\n\r]+ ");
static String unfold_original(String s)
{
s = s.replaceAll("\n\r ", "");
s = s.replaceAll("\r\n ", "");
s = s.replaceAll("\n ", "");
s = s.replaceAll("\r ", "");
return s;
}
static String unfold_regexp(String s)
{
s = s.replaceAll("\n\r |\r\n |\n |\r ", "");
return s;
}
static String unfold_regexpcompiled(String s)
{
s = COMPILE.matcher(s).replaceAll("");
return s;
}
private enum NLStatus {
NONE, RC, NL, RC_NL, NL_RC;
}
/**
* Remove all the '\n' and ’\r' followed by a ' ' from a LDIF String.
*
* @param s The String to unfold
* @return The resulting String
*/
protected static String unfold_optim_emmanuel_1(String s)
{
int pos = 0;
char[] unfold = new char[s.length()];
NLStatus newLine = NLStatus.NONE;
for (char c : s.toCharArray()) {
switch (c) {
case '\n':
switch (newLine) {
case NONE:
newLine = NLStatus.NL;
break;
case RC:
newLine = NLStatus.RC_NL;
break;
case NL:
unfold[pos++] = '\n';
break;
case RC_NL:
unfold[pos++] = '\r';
unfold[pos++] = '\n';
newLine = NLStatus.NL;
break;
case NL_RC:
unfold[pos++] = '\n';
unfold[pos++] = '\r';
newLine = NLStatus.NL;
break;
}
break;
case '\r':
switch (newLine) {
case NONE:
newLine = NLStatus.RC;
break;
case NL:
newLine = NLStatus.NL_RC;
break;
case RC:
unfold[pos++] = '\r';
break;
case RC_NL:
unfold[pos++] = '\r';
unfold[pos++] = '\n';
newLine = NLStatus.RC;
break;
case NL_RC:
unfold[pos++] = '\n';
unfold[pos++] = '\r';
newLine = NLStatus.RC;
break;
}
break;
case ' ':
if (newLine == NLStatus.NONE) {
unfold[pos++] = c;
} else {
newLine = NLStatus.NONE;
}
break;
default:
switch (newLine) {
case NONE:
break;
case NL:
unfold[pos++] = '\n';
newLine = NLStatus.NONE;
break;
case RC:
unfold[pos++] = '\r';
newLine = NLStatus.NONE;
break;
case NL_RC:
unfold[pos++] = '\n';
unfold[pos++] = '\r';
newLine = NLStatus.NONE;
break;
case RC_NL:
unfold[pos++] = '\r';
unfold[pos++] = '\n';
newLine = NLStatus.NONE;
break;
}
unfold[pos++] = c;
}
}
switch (newLine) {
case NONE:
break;
case NL:
unfold[pos++] = '\n';
break;
case RC:
unfold[pos++] = '\r';
break;
case NL_RC:
unfold[pos++] = '\n';
unfold[pos++] = '\r';
break;
case RC_NL:
unfold[pos++] = '\r';
unfold[pos++] = '\n';
break;
}
return new String(unfold, 0, pos);
}
private static final String[] TODO = { "\n\r ", "\r\n ", "\r ", "\n " };
private static final String[] TO = { "", "", "", "" };
protected static String unfold_with_stringutils_on_apache_common(final String string)
{
return StringUtils.replaceEach(string, TODO, TO);
}
public static String unfold_olivier(String test)
{
// Null -> null
if (test == null) {
return null;
}
// 0 or 1 char
if (test.length() < 2)
return test;
// 2 chars
if (test.length() == 2) {
if (test.charAt(1) == ' ') {
char c0 = test.charAt(0);
if (c0 == '\r' || c0 == '\n') {
return "";
}
}
return test;
}
// More than 2 chars
char[] chars = test.toCharArray();
char[] dest = new char[chars.length];
int p = chars.length - 1;
int d = p;
while (p >= 2) {
char c = chars[p];
// Not a space : keep as is
if (c != ' ') {
dest[d] = c;
p--;
d--;
}
// Space
else {
char c1 = chars[p - 1];
// Previous char is special : investigate deeper
if (c1 == '\r' || c1 == '\n') {
p--;
char c2 = chars[p - 1];
if ((c2 == '\r' || c2 == '\n') && c2 != c1) {
p--;
}
p--;
}
// It was just a space : keep it
else {
dest[d] = c;
p--;
d--;
}
}
}
// Keep the remaining chars as it (special cases already covered)
while (p >= 0) {
dest[d--] = chars[p--];
}
return new String(dest, d + 1, chars.length - d - 1);
}
public static String unfold_olivier_optimise_par_emmanuel(String test)
{
// Null -> null
if ( test == null )
{
return null;
}
char[] chars = test.toCharArray();
// 0 or 1 char
if ( chars.length < 2 )
{
return test;
}
// 2 chars
if ( chars.length == 2 )
{
if ( chars[1] == ' ' )
{
if ( chars[0] == '\r' || chars[0] == '\n' )
{
return "";
}
}
return test;
}
// More than 2 chars
int p = chars.length - 1;
int d = p;
while ( p >= 0 )
{
char c = chars[p];
// Not a space : keep as is
if ( c != ' ' )
{
chars[d] = c;
p--;
d--;
}
// Space
else
{
char c1 = chars[p - 1];
// Previous char is special : investigate deeper
if ( c1 == '\r' )
{
if ( ( chars[p - 2] == '\n' ) )
{
p -= 3;
}
else
{
p -= 2;
}
}
else if ( c1 == '\n' )
{
if ( ( chars[p - 2] == '\r' ) )
{
p -= 3;
}
else
{
p -= 2;
}
}
// It was just a space : keep it
else
{
chars[d] = c;
p--;
d--;
}
}
}
// Keep the remaining chars as it (special cases already covered)
while ( p >= 0 )
{
chars[d--] = chars[p--];
}
return new String( chars, d + 1, chars.length - d - 1 );
}
}
@Benchmark
public String unfold_original()
{
return Replacer.unfold_original(string);
}
@Benchmark
public String unfold_regexp()
{
return Replacer.unfold_regexp(string);
}
@Benchmark
public String unfold_regexpcompiled()
{
return Replacer.unfold_regexpcompiled(string);
}
@Benchmark
public String unfold_optim_emmanuel_1()
{
return Replacer.unfold_optim_emmanuel_1(string);
}
@Benchmark
public String unfold_with_stringutils_on_apache_common()
{
return Replacer.unfold_with_stringutils_on_apache_common(string);
}
@Benchmark
public String unfold_unfold_olivier()
{
return Replacer.unfold_olivier(string);
}
@Benchmark
public String unfold_olivier_optimise_par_emmanuel()
{
return Replacer.unfold_olivier_optimise_par_emmanuel(string);
}
public static void main(String[] args) throws RunnerException, IOException
{
Options opt = new OptionsBuilder().include(".*" + ReplaceAllBench.class.getSimpleName() + ".*")
.warmupIterations(20)
.warmupTime(TimeValue.seconds(1))
.measurementIterations(20)
.timeUnit(TimeUnit.NANOSECONDS)
.forks(1)
// .addProfiler(LinuxPerfAsmProfiler.class)
.build();
new Runner(opt).run();
}
}
@twillouer
Copy link
Author

Results:

Benchmark (size) Mode Cnt Score Error Units
ReplaceAllBench.unfold_all_regexp 10 avgt 15 2300,867 ± 702,534 ns/op
ReplaceAllBench.unfold_all_regexp 100 avgt 15 10303,298 ± 202,785 ns/op
ReplaceAllBench.unfold_all_regexp 1000 avgt 15 94123,644 ± 1649,926 ns/op
ReplaceAllBench.unfold_compiled_regexp 10 avgt 15 656,458 ± 135,117 ns/op
ReplaceAllBench.unfold_compiled_regexp 100 avgt 15 3370,366 ± 708,509 ns/op
ReplaceAllBench.unfold_compiled_regexp 1000 avgt 15 38239,589 ± 11620,233 ns/op
ReplaceAllBench.unfold_original 10 avgt 15 1818,520 ± 117,464 ns/op
ReplaceAllBench.unfold_original 100 avgt 15 10065,356 ± 3338,643 ns/op
ReplaceAllBench.unfold_original 1000 avgt 15 82453,316 ± 12327,339 ns/op
ReplaceAllBench.unfold_unfold_common 10 avgt 15 380,190 ± 72,212 ns/op
ReplaceAllBench.unfold_unfold_common 100 avgt 15 1801,607 ± 164,418 ns/op
ReplaceAllBench.unfold_unfold_common 1000 avgt 15 16738,866 ± 1296,921 ns/op
ReplaceAllBench.unfold_very_complicated 10 avgt 15 213,194 ± 25,204 ns/op
ReplaceAllBench.unfold_very_complicated 100 avgt 15 1102,179 ± 114,323 ns/op
ReplaceAllBench.unfold_very_complicated 1000 avgt 15 9482,631 ± 687,077 ns/op
ReplaceAllBench.unfold_unfold_olivier 10 avgt 15 98,947 ± 7,024 ns/op
ReplaceAllBench.unfold_unfold_olivier 100 avgt 15 691,597 ± 33,185 ns/op
ReplaceAllBench.unfold_unfold_olivier 1000 avgt 15 6269,420 ± 407,698 ns/op

@elecharny
Copy link

J'ai une version modifiée de l'algo d'olivier, mais je suis un peu surpris : il est 20 % plus lent, sans que j'arrive à expliquer pourquoi. Les modifs que j'y ai apportées sont :

  • transformation de la String en charArray dès le début
  • non-création d'un char array de résultat, utilisation du charArray précédent sur place
  • simplification des tests (deux if au lieu d'un or)
  • utilisation d'opération entières sur p (p -= 2/3) au lieu de multiples décréments.

Voici le code modifié

public static String unfold3( String test )
{
    // Null -> null
    if ( test == null )
    {
        return null;
    }

    char[] chars = test.toCharArray();

    // 0 or 1 char
    if ( chars.length < 2 )
    {
        return test;
    }

    // 2 chars
    if ( chars.length == 2 )
    {
        if ( chars[1] == ' ' )
        {
            if ( chars[0] == '\r' || chars[0] == '\n' )
            {
                return "";
            }
        }

        return test;
    }

    // More than 2 chars
    int p = chars.length - 1;
    int d = p;

    while ( p >= 0 )
    {
        char c = chars[p];
        // Not a space : keep as is
        if ( c != ' ' )
        {
            chars[d] = c;
            p--;
            d--;
        }
        // Space
        else
        {
            char c1 = chars[p - 1];
            // Previous char is special : investigate deeper
            if ( c1 == '\r' )
            {
                if ( ( chars[p - 2] == '\n' ) )
                {
                    p -= 3;
                }
                else
                {
                    p -= 2;
                }
            }
            else if ( c1 == '\n' )
            {
                if ( ( chars[p - 2] == '\r' ) )
                {
                    p -= 3;
                }
                else
                {
                    p -= 2;
                }
            }
            // It was just a space : keep it
            else
            {
                chars[d] = c;
                p--;
                d--;
            }
        }
    }
    // Keep the remaining chars as it (special cases already covered)
    while ( p >= 0 )
    {
        chars[d--] = chars[p--];
    }

    return new String( chars, d + 1, chars.length - d - 1 );
}

Je n'ai aucune idée de la raison pour laquelle j'ai cette perte en performance, mon hypothèse est que la création d'un autre char[] pour le résultat permet à la JVM d'optimiser la création de la String au final.

@twillouer
Copy link
Author

Attention, les tailles ont changé, elles sont 3 fois plus petites que sur le précédent bench (d'où la différence). D'après ce que tu indiquais, il faut maintenant regarder les lignes "100" pour refléter mieux ton cas d'usage :

Benchmark                                    (size)  Mode  Cnt      Score      Error  Units
ReplaceAllBench.unfold_all_regexp                10  avgt   15   4169,150 ± 5484,852  ns/op
ReplaceAllBench.unfold_all_regexp               100  avgt   15   4409,728 ±  884,481  ns/op
ReplaceAllBench.unfold_all_regexp              1000  avgt   15  32718,512 ± 3087,518  ns/op
ReplaceAllBench.unfold_compiled_regexp           10  avgt   15    427,047 ±  174,845  ns/op
ReplaceAllBench.unfold_compiled_regexp          100  avgt   15   1390,680 ±  298,333  ns/op
ReplaceAllBench.unfold_compiled_regexp         1000  avgt   15   8400,555 ± 1978,106  ns/op
ReplaceAllBench.unfold_original                  10  avgt   15   1209,530 ±   67,974  ns/op
ReplaceAllBench.unfold_original                 100  avgt   15   4724,720 ± 1395,322  ns/op
ReplaceAllBench.unfold_original                1000  avgt   15  35329,550 ± 7713,905  ns/op
ReplaceAllBench.unfold_unfold_common             10  avgt   15    291,642 ±   73,267  ns/op
ReplaceAllBench.unfold_unfold_common            100  avgt   15    779,961 ±  179,683  ns/op
ReplaceAllBench.unfold_unfold_common           1000  avgt   15   6111,415 ± 1340,031  ns/op
ReplaceAllBench.unfold_unfold_olivier            10  avgt   15     65,822 ±    2,460  ns/op
ReplaceAllBench.unfold_unfold_olivier           100  avgt   15    230,292 ±   11,081  ns/op
ReplaceAllBench.unfold_unfold_olivier          1000  avgt   15   2191,172 ±   90,903  ns/op
ReplaceAllBench.unfold_unfold_optim_olivier      10  avgt   15     50,816 ±    2,996  ns/op
ReplaceAllBench.unfold_unfold_optim_olivier     100  avgt   15    190,641 ±   11,317  ns/op
ReplaceAllBench.unfold_unfold_optim_olivier    1000  avgt   15   1949,342 ±   76,311  ns/op
ReplaceAllBench.unfold_very_complicated          10  avgt   15    134,907 ±   27,827  ns/op
ReplaceAllBench.unfold_very_complicated         100  avgt   15   1550,192 ± 2567,458  ns/op
ReplaceAllBench.unfold_very_complicated        1000  avgt   15   3155,884 ±  144,358  ns/op

@elecharny
Copy link

Java 8 ?

@melix
Copy link

melix commented Mar 16, 2015

Cette version est un chouilla plus lente sur longueur = 10, mais plus rapide sur les chaînes plus grandes. Par ailleurs elle gère les \n\r consécutifs (unfold3 par ex est buggy):

        static String unfold_cedric(String s) {
            if (s == null || s.length() < 2) {
                return s;
            }
            int len = s.length();
            char p1 = 'x';
            char p2 = 'x';
            char[] sb = new char[len];
            int wrtAt = 0;
            for (int i = 0; i < len; i++) {
                char c = s.charAt(i);
                if (' '==c) {
                    if ('\n' == p1) {
                        if ('\r' == p2) {
                            wrtAt--;
                        }
                        wrtAt--;
                    } else if ('\r' == p1) {
                        if ('\n' == p2) {
                            wrtAt--;
                        }
                        wrtAt--;
                    } else {
                        sb[wrtAt++] = c;
                    }
                } else {
                    sb[wrtAt++] = c;
                }

                p2 = p1;
                p1 = c;
            }

            return new String(sb, 0, wrtAt);
        }

J'obtiens ça:

Benchmark                                            (size)   Mode  Samples      Score     Error   Units
r.ReplaceAllBenchmark.unfold_cedric                      10  thrpt       20  25715.261 ± 196.721  ops/ms
r.ReplaceAllBenchmark.unfold_cedric                     100  thrpt       20   8262.439 ± 143.142  ops/ms
r.ReplaceAllBenchmark.unfold_cedric                    1000  thrpt       20    792.683 ±   6.258  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_optim_olivier        10  thrpt       20  26383.311 ± 803.548  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_optim_olivier       100  thrpt       20   7203.692 ±  80.246  ops/ms

Pour ref, le bench complet:

package replace;

import org.apache.commons.lang3.StringUtils;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;

import java.util.Random;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(3)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Benchmark)
public class ReplaceAllBenchmark {

    @Param({"10", "100", "1000"})
    private int size;

    String string;

    private final static Random RANDOM = new Random();

    private static String randomAlphanumericString(int len) {
        StringBuilder sb = new StringBuilder(len);
        for (int i = 0; i < len; i++) {
            int r = RANDOM.nextInt(10);
            switch (r) {
                case 0:
                    sb.append('\r');
                    break;
                case 1:
                    sb.append('\n');
                    break;
                case 2:
                    sb.append(' ');
                    break;
                default:
                    sb.append((char) ('a' + RANDOM.nextInt(26)));
            }
        }
        return sb.toString();
    }

    @Setup
    public void setup() throws Throwable {
        string = randomAlphanumericString(size / 3) + "\n\r " + randomAlphanumericString(size / 3) + "\n "
                + randomAlphanumericString(size / 3);
    }

    static class Replacer {
        private static final Pattern COMPILE = Pattern.compile("[\n\r]+ ");

        static String unfold2(String s) {
            s = s.replaceAll("\n\r ", "");
            s = s.replaceAll("\r\n ", "");
            s = s.replaceAll("\n ", "");
            s = s.replaceAll("\r ", "");
            return s;
        }

        static String unfold_regexp(String s) {
            s = s.replaceAll("\n\r |\r\n |\n |\r ", "");
            return s;
        }

        static String unfold_regexpcompiled(String s) {
            s = COMPILE.matcher(s).replaceAll("");
            return s;
        }

        static String unfold_cedric(String s) {
            if (s == null || s.length() < 2) {
                return s;
            }
            int len = s.length();
            char p1 = 'x';
            char p2 = 'x';
            char[] sb = new char[len];
            int wrtAt = 0;
            for (int i = 0; i < len; i++) {
                char c = s.charAt(i);
                if (' '==c) {
                    if ('\n' == p1) {
                        if ('\r' == p2) {
                            wrtAt--;
                        }
                        wrtAt--;
                    } else if ('\r' == p1) {
                        if ('\n' == p2) {
                            wrtAt--;
                        }
                        wrtAt--;
                    } else {
                        sb[wrtAt++] = c;
                    }
                } else {
                    sb[wrtAt++] = c;
                }

                p2 = p1;
                p1 = c;
            }

            return new String(sb, 0, wrtAt);
        }

        private enum NLStatus {
            NONE, RC, NL, RC_NL, NL_RC;
        }

        /**
         * Remove all the '\n' and ’\r' followed by a ' ' from a LDIF String.
         *
         * @param s The String to unfold
         * @return The resulting String
         */
        protected static String unfold(String s) {
            int pos = 0;
            char[] unfold = new char[s.length()];
            NLStatus newLine = NLStatus.NONE;

            for (char c : s.toCharArray()) {
                switch (c) {
                    case '\n':
                        switch (newLine) {
                            case NONE:
                                newLine = NLStatus.NL;
                                break;

                            case RC:
                                newLine = NLStatus.RC_NL;
                                break;

                            case NL:
                                unfold[pos++] = '\n';
                                break;

                            case RC_NL:
                                unfold[pos++] = '\r';
                                unfold[pos++] = '\n';
                                newLine = NLStatus.NL;
                                break;

                            case NL_RC:
                                unfold[pos++] = '\n';
                                unfold[pos++] = '\r';
                                newLine = NLStatus.NL;
                                break;
                        }

                        break;

                    case '\r':
                        switch (newLine) {
                            case NONE:
                                newLine = NLStatus.RC;
                                break;

                            case NL:
                                newLine = NLStatus.NL_RC;
                                break;

                            case RC:
                                unfold[pos++] = '\r';
                                break;

                            case RC_NL:
                                unfold[pos++] = '\r';
                                unfold[pos++] = '\n';
                                newLine = NLStatus.RC;
                                break;

                            case NL_RC:
                                unfold[pos++] = '\n';
                                unfold[pos++] = '\r';
                                newLine = NLStatus.RC;
                                break;
                        }

                        break;

                    case ' ':
                        if (newLine == NLStatus.NONE) {
                            unfold[pos++] = c;
                        } else {
                            newLine = NLStatus.NONE;
                        }

                        break;

                    default:
                        switch (newLine) {
                            case NONE:
                                break;

                            case NL:
                                unfold[pos++] = '\n';
                                newLine = NLStatus.NONE;
                                break;

                            case RC:
                                unfold[pos++] = '\r';
                                newLine = NLStatus.NONE;
                                break;

                            case NL_RC:
                                unfold[pos++] = '\n';
                                unfold[pos++] = '\r';
                                newLine = NLStatus.NONE;
                                break;

                            case RC_NL:
                                unfold[pos++] = '\r';
                                unfold[pos++] = '\n';
                                newLine = NLStatus.NONE;
                                break;
                        }

                        unfold[pos++] = c;
                }
            }

            switch (newLine) {
                case NONE:
                    break;

                case NL:
                    unfold[pos++] = '\n';
                    break;

                case RC:
                    unfold[pos++] = '\r';
                    break;

                case NL_RC:
                    unfold[pos++] = '\n';
                    unfold[pos++] = '\r';
                    break;

                case RC_NL:
                    unfold[pos++] = '\r';
                    unfold[pos++] = '\n';
                    break;
            }

            return new String(unfold, 0, pos);
        }

        private static final String[] TODO = {"\n\r ", "\r\n ", "\r ", "\n "};

        private static final String[] TO = {"", "", "", ""};

        protected static String unfold_common(final String string) {
            return StringUtils.replaceEach(string, TODO, TO);
        }

        public static String unfold_olivier(String test) {

// Null -> null
            if (test == null) {
                return null;
            }

// 0 or 1 char
            if (test.length() < 2)
                return test;

// 2 chars
            if (test.length() == 2) {
                if (test.charAt(1) == ' ') {
                    char c0 = test.charAt(0);
                    if (c0 == '\r' || c0 == '\n') {
                        return "";
                    }
                }
                return test;
            }

// More than 2 chars
            char[] chars = test.toCharArray();
            char[] dest = new char[chars.length];
            int p = chars.length - 1;
            int d = p;
            while (p >= 2) {
                char c = chars[p];
// Not a space : keep as is
                if (c != ' ') {
                    dest[d] = c;
                    p--;
                    d--;
                }
// Space
                else {
                    char c1 = chars[p - 1];
// Previous char is special : investigate deeper
                    if (c1 == '\r' || c1 == '\n') {
                        p--;
                        char c2 = chars[p - 1];
                        if ((c2 == '\r' || c2 == '\n') && c2 != c1) {
                            p--;
                        }
                        p--;
                    }
// It was just a space : keep it
                    else {
                        dest[d] = c;
                        p--;
                        d--;
                    }
                }
            }
// Keep the remaining chars as it (special cases already covered)
            while (p >= 0) {
                dest[d--] = chars[p--];
            }

            return new String(dest, d + 1, chars.length - d - 1);
        }

        public static String unfold3(String test) {
// Null -> null
            if (test == null) {
                return null;
            }

            char[] chars = test.toCharArray();

// 0 or 1 char
            if (chars.length < 2) {
                return test;
            }

// 2 chars
            if (chars.length == 2) {
                if (chars[1] == ' ') {
                    if (chars[0] == '\r' || chars[0] == '\n') {
                        return "";
                    }
                }

                return test;
            }

// More than 2 chars
            int p = chars.length - 1;
            int d = p;

            while (p >= 0) {
                char c = chars[p];
// Not a space : keep as is
                if (c != ' ') {
                    chars[d] = c;
                    p--;
                    d--;
                }
// Space
                else {
                    char c1 = chars[p - 1];
// Previous char is special : investigate deeper
                    if (c1 == '\r') {
                        if ((chars[p - 2] == '\n')) {
                            p -= 3;
                        } else {
                            p -= 2;
                        }
                    } else if (c1 == '\n') {
                        if ((chars[p - 2] == '\r')) {
                            p -= 3;
                        } else {
                            p -= 2;
                        }
                    }
// It was just a space : keep it
                    else {
                        chars[d] = c;
                        p--;
                        d--;
                    }
                }
            }
// Keep the remaining chars as it (special cases already covered)
            while (p >= 0) {
                chars[d--] = chars[p--];
            }

            return new String(chars, d + 1, chars.length - d - 1);
        }
    }

    //@Benchmark
    public String unfold_original() {
        return Replacer.unfold2(string);
    }

    //@Benchmark
    public String unfold_all_regexp() {
        return Replacer.unfold_regexp(string);
    }

    //@Benchmark
    public String unfold_compiled_regexp() {
        return Replacer.unfold_regexpcompiled(string);
    }

    //@Benchmark
    public String unfold_very_complicated() {
        return Replacer.unfold(string);
    }

    //@Benchmark
    public String unfold_unfold_common() {
        return Replacer.unfold_common(string);
    }

    //@Benchmark
    public String unfold_unfold_olivier() {
        return Replacer.unfold_olivier(string);
    }

    @Benchmark
    public String unfold_unfold_optim_olivier() {
        return Replacer.unfold3(string);
    }

    @Benchmark
    public String unfold_cedric() {
        return Replacer.unfold_cedric(string);
    }

    public static void main(String[] args) {
        for (int i = 0; i < 1000; i++) {
            String str = randomAlphanumericString(20) + "\n\r " + randomAlphanumericString(30) + "\n "
                    + randomAlphanumericString(30);
            String orig = str.replace('\n', 'N').replace('\r', 'R').replace(' ', '_');
            String ref = Replacer.unfold_regexp(str).replace('\n', 'N').replace('\r', 'R').replace(' ', '_');
            String cmp = Replacer.unfold_cedric(str).replace('\n', 'N').replace('\r', 'R').replace(' ', '_');
            boolean equals = ref.equals(cmp);
            if (!equals) {
                System.err.println(orig);
                System.err.println(ref);
                System.err.println(cmp);
                System.err.println("---------------------------");
            }
        }
    }
} 

@elecharny
Copy link

Une version améliorée de la propale de Cédric, où on réutilise le char array au lieu d'en créer un :

static String unfold_cedric_improved( String s )
{
    if ( s == null || s.length() < 2 )
    {
        return s;
    }

    int len = s.length();
    char p1 = 'x';
    char p2 = 'x';
    char[] chars = s.toCharArray();
    int wrtAt = 0;

    for ( int i = 0; i < len; i++ )
    {
        char c = chars[i];

        if ( ' ' == c )
        {
            if ( '\n' == p1 )
            {
                if ( '\r' == p2 )
                {
                    wrtAt--;
                }
                wrtAt--;
            }
            else if ( '\r' == p1 )
            {
                if ( '\n' == p2 )
                {
                    wrtAt--;
                }
                wrtAt--;
            }
            else
            {
                chars[wrtAt++] = c;
            }
        }
        else
        {
            chars[wrtAt++] = c;
        }

        p2 = p1;
        p1 = c;
    }

    return new String( chars, 0, wrtAt );
}

@elecharny
Copy link

En fait, pas sûr que ce soit plus rapide. Le s.toCharArray() créé un nouveay char[] et copie l'original dedans. Le gain éventuel est sur la suppression du s.charAt(i), avec élimination des deux tests sur les frontières.

@melix
Copy link

melix commented Mar 16, 2015

Version un peu abusée niveau lisibilité, mais qui selon mes tests est la plus rapide:

       static String unfold_cedric_ultimate(String s) {
            if (s == null || s.length() < 2) {
                return s;
            }

            int len = s.length();
            char p1 = 'x';
            char p2 = 'x';
            char[] chars = s.toCharArray();
            int wrtAt = 0;

            for (int i = 0; i < len; i++) {
                char c = chars[i];

                try {
                    if (' ' == c) {
                        if ('\n' == p1) {
                            if ('\r' == p2) {
                                wrtAt--;
                            }
                            wrtAt--;
                            continue;
                        }
                        if ('\r' == p1) {
                            if ('\n' == p2) {
                                wrtAt--;
                            }
                            wrtAt--;
                            continue;
                        }
                    }
                    chars[wrtAt++] = c;
                } finally {
                    p2 = p1;
                    p1 = c;
                }
            }

            return new String(chars, 0, wrtAt);
        }

@melix
Copy link

melix commented Mar 16, 2015

Bon, là on est pas trop mal...

        static String unfold_cedric_ultimate(String s) {
            if (s == null || s.length() < 2) {
                return s;
            }

            char p1 = 'x';
            char p2 = 'x';
            char[] chars = s.toCharArray();
            int wrtAt = 0;

            for (char c : chars) {
                chars[wrtAt++] = c;
                if (' ' == c) {
                    if ('\n' == p1) {
                        if ('\r' == p2) {
                            wrtAt--;
                        }
                        wrtAt--;
                        wrtAt--;
                    }
                    if ('\r' == p1) {
                        if ('\n' == p2) {
                            wrtAt--;
                        }
                        wrtAt--;
                        wrtAt--;
                    }
                }

                p2 = p1;
                p1 = c;

            }

            return new String(chars, 0, wrtAt);
        }

@melix
Copy link

melix commented Mar 16, 2015

After running the full benchmark:

The following methods are deemed to be correct:
Reflection.unfold_cedric_ultimate2
Reflection.unfold_cedric
Reflection.unfold_cedric_improved
Reflection.unfold_regexp
Reflection.unfold_cedric_ultimate
Reflection.unfold_common

The following methods are proved to be incorrect:
Reflection.unfold_regexpcompiled
Reflection.unfold
Reflection.unfold3
Reflection.unfold2
Reflection.unfold_olivier

# Run complete. Total time: 00:26:35

Benchmark                                            (size)   Mode  Samples      Score     Error   Units
r.ReplaceAllBenchmark.unfold_all_regexp                  10  thrpt       20   1571.746 ±  29.372  ops/ms
r.ReplaceAllBenchmark.unfold_all_regexp                 100  thrpt       20    345.773 ±   2.164  ops/ms
r.ReplaceAllBenchmark.unfold_all_regexp                1000  thrpt       20     39.093 ±   0.554  ops/ms
r.ReplaceAllBenchmark.unfold_cedric                      10  thrpt       20  25168.509 ± 525.919  ops/ms
r.ReplaceAllBenchmark.unfold_cedric                     100  thrpt       20   7124.064 ± 223.885  ops/ms
r.ReplaceAllBenchmark.unfold_cedric                    1000  thrpt       20    814.544 ±  16.060  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_improved             10  thrpt       20  33391.135 ± 395.195  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_improved            100  thrpt       20   9804.861 ± 104.523  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_improved           1000  thrpt       20    906.952 ±  12.452  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate             10  thrpt       20  35065.155 ± 696.067  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate            100  thrpt       20   9382.235 ± 138.609  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate           1000  thrpt       20    750.416 ±  15.975  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate2            10  thrpt       20  31528.472 ± 183.737  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate2           100  thrpt       20   9378.629 ± 143.134  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate2          1000  thrpt       20    862.118 ±  18.347  ops/ms
r.ReplaceAllBenchmark.unfold_compiled_regexp             10  thrpt       20   4522.387 ± 120.559  ops/ms
r.ReplaceAllBenchmark.unfold_compiled_regexp            100  thrpt       20    813.303 ±  14.269  ops/ms
r.ReplaceAllBenchmark.unfold_compiled_regexp           1000  thrpt       20    100.297 ±   2.482  ops/ms
r.ReplaceAllBenchmark.unfold_original                    10  thrpt       20   1137.678 ±  11.718  ops/ms
r.ReplaceAllBenchmark.unfold_original                   100  thrpt       20    420.988 ±  15.005  ops/ms
r.ReplaceAllBenchmark.unfold_original                  1000  thrpt       20     54.039 ±   1.534  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_common               10  thrpt       20   6582.421 ± 232.638  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_common              100  thrpt       20   1803.557 ±  61.737  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_common             1000  thrpt       20    113.271 ±   1.129  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_olivier              10  thrpt       20  21048.183 ± 509.316  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_olivier             100  thrpt       20   6366.979 ±  81.390  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_olivier            1000  thrpt       20    578.769 ±  10.877  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_optim_olivier        10  thrpt       20  34515.031 ± 344.551  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_optim_olivier       100  thrpt       20   7551.517 ±  53.829  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_optim_olivier      1000  thrpt       20    673.183 ±  11.679  ops/ms
r.ReplaceAllBenchmark.unfold_very_complicated            10  thrpt       20  13886.697 ± 366.905  ops/ms
r.ReplaceAllBenchmark.unfold_very_complicated           100  thrpt       20   3173.519 ± 153.616  ops/ms
r.ReplaceAllBenchmark.unfold_very_complicated          1000  thrpt       20    285.992 ±  15.024  ops/ms

@melix
Copy link

melix commented Mar 16, 2015

En ne gardant que les algos corrects:

# Run complete. Total time: 00:14:30

Benchmark                                        (size)   Mode  Samples      Score     Error   Units
r.ReplaceAllBenchmark.unfold_all_regexp              10  thrpt       20   1467.862 ±  28.884  ops/ms
r.ReplaceAllBenchmark.unfold_all_regexp             100  thrpt       20    342.911 ±   4.772  ops/ms
r.ReplaceAllBenchmark.unfold_all_regexp            1000  thrpt       20     39.269 ±   0.936  ops/ms
r.ReplaceAllBenchmark.unfold_cedric                  10  thrpt       20  24729.376 ± 935.766  ops/ms
r.ReplaceAllBenchmark.unfold_cedric                 100  thrpt       20   7074.242 ±  53.409  ops/ms
r.ReplaceAllBenchmark.unfold_cedric                1000  thrpt       20    809.072 ±  16.939  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_improved         10  thrpt       20  36381.822 ± 263.571  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_improved        100  thrpt       20   9391.023 ±  60.022  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_improved       1000  thrpt       20    720.513 ±   5.490  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate         10  thrpt       20  36553.386 ± 367.102  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate        100  thrpt       20   9565.954 ± 142.565  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate       1000  thrpt       20    877.977 ±  11.356  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate2        10  thrpt       20  32364.978 ± 706.792  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate2       100  thrpt       20   9668.940 ±  72.017  ops/ms
r.ReplaceAllBenchmark.unfold_cedric_ultimate2      1000  thrpt       20    932.645 ±   5.214  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_common           10  thrpt       20   6869.541 ± 133.914  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_common          100  thrpt       20   1739.391 ±  18.654  ops/ms
r.ReplaceAllBenchmark.unfold_unfold_common         1000  thrpt       20    133.172 ±   2.303  ops/ms

cf https://github.com/melix/lecharny-challenge

@OlivierCroisier
Copy link

Hey, mon algo était correct normalement ^^;
Quel est le problème ? Quels sont les cas qui ne passent pas ?

@melix
Copy link

melix commented Mar 16, 2015

C'est pas moi qui ait dit qu'il était faux, c'est la machine. Sur des cas concrets :) Tu peux regarder ce que fait le main pour trouver des exemples qui plantent. De mémoire le tien se plantait sur des séquences du genre \r\r (mais j'ai un doute, ils ont tous des pbs différents :)).

@OlivierCroisier
Copy link

Hmmm, effectivement, un cas n'était pas géré. J'en ai profité pour essayer d'améliorer un peu l'algo. Tu peux le tester ?

public static String unfold_olivier2(String test) {

    // Throws NPE if null, like the original method
    if (test.length() < 2) return test;

    char[] chars = test.toCharArray();
    int p = chars.length - 1;
    int d = p;
    char c, c1, c2;
    while (p > 0) {
        c = chars[p];
        c1 = chars[p - 1];
        if (c == ' ' && (c1 == '\n' || c1 == '\r')) {
            p--;
            if (p > 0) {
                c2 = chars[p - 1];
                if ((c2 == '\n' || c2 == '\r') && c2 != c1) {
                    p--;
                }
            }
        }
        else {
            chars[d] = c;
            d--;
        }
        p--;
    }
    while (p >= 0) {
        chars[d--] = chars[p--];
    }

    return new String(chars, d + 1, chars.length - d - 1);
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment