Skip to content

Instantly share code, notes, and snippets.

@twillouer
Last active August 29, 2015 14:17
Show Gist options
  • Save twillouer/e4180390dc503fe886d0 to your computer and use it in GitHub Desktop.
Save twillouer/e4180390dc503fe886d0 to your computer and use it in GitHub Desktop.
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(3)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Benchmark)
public class ReplaceAllBench {
@Param({ "10", "100", "1000" })
private int size;
String string;
@Setup
public void setup() throws Throwable
{
string = randomAlphanumericString(size / 3) + "\n\r " + randomAlphanumericString(size / 3) + "\n "
+ randomAlphanumericString(size / 3);
}
static class Replacer {
private static final Pattern COMPILE = Pattern.compile("[\n\r]+ ");
static String unfold_original(String s)
{
s = s.replaceAll("\n\r ", "");
s = s.replaceAll("\r\n ", "");
s = s.replaceAll("\n ", "");
s = s.replaceAll("\r ", "");
return s;
}
static String unfold_regexp(String s)
{
s = s.replaceAll("\n\r |\r\n |\n |\r ", "");
return s;
}
static String unfold_regexpcompiled(String s)
{
s = COMPILE.matcher(s).replaceAll("");
return s;
}
private enum NLStatus {
NONE, RC, NL, RC_NL, NL_RC;
}
/**
* Remove all the '\n' and ’\r' followed by a ' ' from a LDIF String.
*
* @param s The String to unfold
* @return The resulting String
*/
protected static String unfold_optim_emmanuel_1(String s)
{
int pos = 0;
char[] unfold = new char[s.length()];
NLStatus newLine = NLStatus.NONE;
for (char c : s.toCharArray()) {
switch (c) {
case '\n':
switch (newLine) {
case NONE:
newLine = NLStatus.NL;
break;
case RC:
newLine = NLStatus.RC_NL;
break;
case NL:
unfold[pos++] = '\n';
break;
case RC_NL:
unfold[pos++] = '\r';
unfold[pos++] = '\n';
newLine = NLStatus.NL;
break;
case NL_RC:
unfold[pos++] = '\n';
unfold[pos++] = '\r';
newLine = NLStatus.NL;
break;
}
break;
case '\r':
switch (newLine) {
case NONE:
newLine = NLStatus.RC;
break;
case NL:
newLine = NLStatus.NL_RC;
break;
case RC:
unfold[pos++] = '\r';
break;
case RC_NL:
unfold[pos++] = '\r';
unfold[pos++] = '\n';
newLine = NLStatus.RC;
break;
case NL_RC:
unfold[pos++] = '\n';
unfold[pos++] = '\r';
newLine = NLStatus.RC;
break;
}
break;
case ' ':
if (newLine == NLStatus.NONE) {
unfold[pos++] = c;
} else {
newLine = NLStatus.NONE;
}
break;
default:
switch (newLine) {
case NONE:
break;
case NL:
unfold[pos++] = '\n';
newLine = NLStatus.NONE;
break;
case RC:
unfold[pos++] = '\r';
newLine = NLStatus.NONE;
break;
case NL_RC:
unfold[pos++] = '\n';
unfold[pos++] = '\r';
newLine = NLStatus.NONE;
break;
case RC_NL:
unfold[pos++] = '\r';
unfold[pos++] = '\n';
newLine = NLStatus.NONE;
break;
}
unfold[pos++] = c;
}
}
switch (newLine) {
case NONE:
break;
case NL:
unfold[pos++] = '\n';
break;
case RC:
unfold[pos++] = '\r';
break;
case NL_RC:
unfold[pos++] = '\n';
unfold[pos++] = '\r';
break;
case RC_NL:
unfold[pos++] = '\r';
unfold[pos++] = '\n';
break;
}
return new String(unfold, 0, pos);
}
private static final String[] TODO = { "\n\r ", "\r\n ", "\r ", "\n " };
private static final String[] TO = { "", "", "", "" };
protected static String unfold_with_stringutils_on_apache_common(final String string)
{
return StringUtils.replaceEach(string, TODO, TO);
}
public static String unfold_olivier(String test)
{
// Null -> null
if (test == null) {
return null;
}
// 0 or 1 char
if (test.length() < 2)
return test;
// 2 chars
if (test.length() == 2) {
if (test.charAt(1) == ' ') {
char c0 = test.charAt(0);
if (c0 == '\r' || c0 == '\n') {
return "";
}
}
return test;
}
// More than 2 chars
char[] chars = test.toCharArray();
char[] dest = new char[chars.length];
int p = chars.length - 1;
int d = p;
while (p >= 2) {
char c = chars[p];
// Not a space : keep as is
if (c != ' ') {
dest[d] = c;
p--;
d--;
}
// Space
else {
char c1 = chars[p - 1];
// Previous char is special : investigate deeper
if (c1 == '\r' || c1 == '\n') {
p--;
char c2 = chars[p - 1];
if ((c2 == '\r' || c2 == '\n') && c2 != c1) {
p--;
}
p--;
}
// It was just a space : keep it
else {
dest[d] = c;
p--;
d--;
}
}
}
// Keep the remaining chars as it (special cases already covered)
while (p >= 0) {
dest[d--] = chars[p--];
}
return new String(dest, d + 1, chars.length - d - 1);
}
public static String unfold_olivier_optimise_par_emmanuel(String test)
{
// Null -> null
if ( test == null )
{
return null;
}
char[] chars = test.toCharArray();
// 0 or 1 char
if ( chars.length < 2 )
{
return test;
}
// 2 chars
if ( chars.length == 2 )
{
if ( chars[1] == ' ' )
{
if ( chars[0] == '\r' || chars[0] == '\n' )
{
return "";
}
}
return test;
}
// More than 2 chars
int p = chars.length - 1;
int d = p;
while ( p >= 0 )
{
char c = chars[p];
// Not a space : keep as is
if ( c != ' ' )
{
chars[d] = c;
p--;
d--;
}
// Space
else
{
char c1 = chars[p - 1];
// Previous char is special : investigate deeper
if ( c1 == '\r' )
{
if ( ( chars[p - 2] == '\n' ) )
{
p -= 3;
}
else
{
p -= 2;
}
}
else if ( c1 == '\n' )
{
if ( ( chars[p - 2] == '\r' ) )
{
p -= 3;
}
else
{
p -= 2;
}
}
// It was just a space : keep it
else
{
chars[d] = c;
p--;
d--;
}
}
}
// Keep the remaining chars as it (special cases already covered)
while ( p >= 0 )
{
chars[d--] = chars[p--];
}
return new String( chars, d + 1, chars.length - d - 1 );
}
}
@Benchmark
public String unfold_original()
{
return Replacer.unfold_original(string);
}
@Benchmark
public String unfold_regexp()
{
return Replacer.unfold_regexp(string);
}
@Benchmark
public String unfold_regexpcompiled()
{
return Replacer.unfold_regexpcompiled(string);
}
@Benchmark
public String unfold_optim_emmanuel_1()
{
return Replacer.unfold_optim_emmanuel_1(string);
}
@Benchmark
public String unfold_with_stringutils_on_apache_common()
{
return Replacer.unfold_with_stringutils_on_apache_common(string);
}
@Benchmark
public String unfold_unfold_olivier()
{
return Replacer.unfold_olivier(string);
}
@Benchmark
public String unfold_olivier_optimise_par_emmanuel()
{
return Replacer.unfold_olivier_optimise_par_emmanuel(string);
}
public static void main(String[] args) throws RunnerException, IOException
{
Options opt = new OptionsBuilder().include(".*" + ReplaceAllBench.class.getSimpleName() + ".*")
.warmupIterations(20)
.warmupTime(TimeValue.seconds(1))
.measurementIterations(20)
.timeUnit(TimeUnit.NANOSECONDS)
.forks(1)
// .addProfiler(LinuxPerfAsmProfiler.class)
.build();
new Runner(opt).run();
}
}
@melix
Copy link

melix commented Mar 16, 2015

C'est pas moi qui ait dit qu'il était faux, c'est la machine. Sur des cas concrets :) Tu peux regarder ce que fait le main pour trouver des exemples qui plantent. De mémoire le tien se plantait sur des séquences du genre \r\r (mais j'ai un doute, ils ont tous des pbs différents :)).

@OlivierCroisier
Copy link

Hmmm, effectivement, un cas n'était pas géré. J'en ai profité pour essayer d'améliorer un peu l'algo. Tu peux le tester ?

public static String unfold_olivier2(String test) {

    // Throws NPE if null, like the original method
    if (test.length() < 2) return test;

    char[] chars = test.toCharArray();
    int p = chars.length - 1;
    int d = p;
    char c, c1, c2;
    while (p > 0) {
        c = chars[p];
        c1 = chars[p - 1];
        if (c == ' ' && (c1 == '\n' || c1 == '\r')) {
            p--;
            if (p > 0) {
                c2 = chars[p - 1];
                if ((c2 == '\n' || c2 == '\r') && c2 != c1) {
                    p--;
                }
            }
        }
        else {
            chars[d] = c;
            d--;
        }
        p--;
    }
    while (p >= 0) {
        chars[d--] = chars[p--];
    }

    return new String(chars, d + 1, chars.length - d - 1);
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment