certik/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Compile and run using:
$ g++ -Wall -O3 -march=native -funroll-loops bench.cpp move.cpp && ./a.out 
1
1705032704
4034ms
2
1705032704
3466ms

The raw pointer version is faster. To understand why, let us look at the assembly generated by:
g++ -S -Wall -O3 -march=native -funroll-loops move.cpp

This generates a file move.s. Looking into it, the assembly for the my_move2 function is:
	movq	(%rsi), %rax
	movq	%rax, (%rdi)
	ret
The first argument is in rdi, the second in rsi. It copies the memory at rsi (the second argument) into rax and then into the memory pointed to by rdi. So it is exactly equivalent to doing a=b, which generates the same assembly (as can be checked).
Now let us look at my_move1:
	movq	(%rsi), %rdx
	movq	$0, (%rsi)
	movq	(%rdi), %rax
	movq	%rdx, (%rdi)
	testq	%rax, %rax
	je	.L3
	movl	$4, %esi
	movq	%rax, %rdi
	jmp	_ZdlPvm@PLT
	.p2align 4,,10
	.p2align 3
.L3:
	ret
There, the pointer stored at (%rsi) gets copied to rdx, then the memory at (%rsi) gets zeroed (per the move semantics requirement to leave std::unique_ptr in a valid state), then rdx gets stored into the memory at (%rdi). Just before that store, the old pointer at (%rdi) is loaded into rax and tested: if it was non-null, the object it pointed to gets freed. It's roughly equivalent to the following code:
    if (a) delete a;
    a = b;
    b = nullptr;
The first check if (a) delete a is needed, as unique_ptr needs to free the object if any was allocated. But the b=nullptr is in principle not needed. But if we go into the assembly and remove the line movq	$0, (%rsi), then the code will fail with:
double free or corruption (fasttop)
Aborted (core dumped)

This happens because we didn't leave the "b" unique_ptr in a "valid" state: it still holds a non-null pointer, so the next time something is moved into it, it assumes that pointer refers to a live object and tries to free it a second time.
Conclusion

Using std::move on a std::unique_ptr is not a zero-cost abstraction compared to using std::move on raw pointers.
What is an example of a zero cost abstraction?

For example the following code:
// Minimal wrapper around a single float, used to show that a thin class
// abstraction can compile down to the same code as the built-in type.
class Float
{
public:
    float f;  // the wrapped value

    // Intentionally not `explicit`: expressions such as `x + 1` rely on the
    // implicit conversion from a numeric value to Float.
    Float(float f) : f{f} {}

    // Returns a new Float holding the sum of both wrapped values. Neither
    // operand is modified, so the operator is const-qualified and can be
    // called on const objects as well.
    Float operator+(const Float &o) const {
        return Float(f + o.f);
    }
};

// Baseline for the comparison: adds one to a built-in float.
float f1(float f) {
    const float sum = f + 1;
    return sum;
}

// Wrapper-type variant: the literal 1 is converted to Float and the sum is
// produced by Float::operator+.
Float f2(Float f) {
    Float sum = f + 1;
    return sum;
}
Generates exactly the same assembly code for both functions:
g++ -S -Wall -O3 -march=native -funroll-loops float.cpp

the assembly looks like this for both f1 and f2 functions:
    vaddss  .LC0(%rip), %xmm0, %xmm0
    ret
So there is no overhead in using Float instead of float.

  
## bench.cpp
#include <iostream>
#include <memory>
#include <chrono>

// Number of iterations each benchmark loop (f1 and f2) performs.
const int N = 1000000000;

// Functions under test. Only declared here; the definitions live in move.cpp
// (presumably so the calls cannot be inlined into the benchmark loops).
// Both are invoked as my_moveX(destination, source).
void my_move1(std::unique_ptr<int> &a, std::unique_ptr<int> &b);
void my_move2(int *&a, int *&b);

// Benchmark body for the std::unique_ptr variant.
//
// Allocates one int holding `i`, then hands it back and forth between two
// unique_ptrs via my_move1, N round trips in total, adding the pointee to a
// running sum after every move.
//
// Returns the sum reduced modulo 2^32 and converted to int. The sum is kept in
// an unsigned integer because 2 * N * i exceeds INT_MAX for the values used by
// main(): unsigned arithmetic wraps by definition, whereas overflowing a
// signed int is undefined behavior.
int f1(int i)
{
    unsigned int sum = 0;
    std::unique_ptr<int> p1 = std::make_unique<int>(i);
    std::unique_ptr<int> p2;
    // The loop index has its own name so it does not shadow parameter `i`.
    for (int iter = 0; iter < N; ++iter) {
        my_move1(p2, p1);
        sum += static_cast<unsigned int>(*p2);
        my_move1(p1, p2);
        sum += static_cast<unsigned int>(*p1);
    }
    return static_cast<int>(sum);
}

// Benchmark body for the raw-pointer variant.
//
// Allocates one int holding `i`, then passes the pointer back and forth
// between two raw pointers via my_move2, N round trips in total, adding the
// pointee to a running sum after every call. Raw new/delete are used on
// purpose: this function is the baseline the unique_ptr version is measured
// against.
//
// Returns the sum reduced modulo 2^32 and converted to int. The sum is kept in
// an unsigned integer because 2 * N * i exceeds INT_MAX for the values used by
// main(): unsigned arithmetic wraps by definition, whereas overflowing a
// signed int is undefined behavior.
int f2(int i)
{
    unsigned int sum = 0;
    int *p1 = new int(i);
    int *p2 = nullptr;  // assigned by the first my_move2 call before it is read
    // The loop index has its own name so it does not shadow parameter `i`.
    for (int iter = 0; iter < N; ++iter) {
        my_move2(p2, p1);
        sum += static_cast<unsigned int>(*p2);
        my_move2(p1, p2);
        sum += static_cast<unsigned int>(*p1);
    }
    // Only one allocation was ever made, so it is released exactly once.
    delete p1;
    return static_cast<int>(sum);
}

int main()
{
    int i;
    std::cout << "1" << std::endl;
    auto t1 = std::chrono::high_resolution_clock::now();
    i = f1(3);
    auto t2 = std::chrono::high_resolution_clock::now();
    std::cout << i << std::endl;
    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
                     .count()
              << "ms" << std::endl;
    std::cout << "2" << std::endl;
    t1 = std::chrono::high_resolution_clock::now();
    i = f2(3);
    t2 = std::chrono::high_resolution_clock::now();
    std::cout << i << std::endl;
    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
                     .count()
              << "ms" << std::endl;
}

## move.cpp
#include <memory>

// unique_ptr variant under test: move-assigns b into a. Afterwards a owns the
// object b held, b is null, and any object a owned before the call has been
// deleted.
void my_move1(std::unique_ptr<int> &a, std::unique_ptr<int> &b)
{
    a = std::move(b);
}

// Raw-pointer variant under test. std::move on a built-in pointer is only a
// cast, so this is a plain pointer copy: afterwards a holds the same address as
// b, b is unchanged, and nothing is freed.
void my_move2(int *&a, int *&b)
{
    a = std::move(b);
}
	#include <iostream>
	#include <memory>
	#include <chrono>

	// Number of iterations each benchmark loop (f1 and f2) performs.
	const int N = 1000000000;

	// Functions under test, declared only. Both are invoked as
	// my_moveX(destination, source). my_move2 takes references to raw
	// pointers: f2 passes pointers obtained from `new int` and dereferences
	// them after each call.
	void my_move1(std::unique_ptr<int> &a, std::unique_ptr<int> &b);
	void my_move2(int *&a, int *&b);

	// Benchmark body for the std::unique_ptr variant.
	//
	// Allocates one int holding `i`, then hands it back and forth between two
	// unique_ptrs via my_move1, N round trips in total, adding the pointee to
	// a running sum after every move.
	//
	// Returns the sum reduced modulo 2^32 and converted to int. The sum is
	// kept in an unsigned integer because 2 * N * i exceeds INT_MAX for the
	// values used by main(): unsigned arithmetic wraps by definition, whereas
	// overflowing a signed int is undefined behavior.
	int f1(int i)
	{
	    unsigned int sum = 0;
	    std::unique_ptr<int> p1 = std::make_unique<int>(i);
	    std::unique_ptr<int> p2;
	    // The loop index has its own name so it does not shadow parameter `i`.
	    for (int iter = 0; iter < N; ++iter) {
	        my_move1(p2, p1);
	        sum += static_cast<unsigned int>(*p2);
	        my_move1(p1, p2);
	        sum += static_cast<unsigned int>(*p1);
	    }
	    return static_cast<int>(sum);
	}

	// Benchmark body for the raw-pointer variant.
	//
	// Allocates one int holding `i`, then passes the pointer back and forth
	// between two raw pointers via my_move2, N round trips in total, adding
	// the pointee to a running sum after every call. Raw new/delete are used
	// on purpose: this function is the baseline the unique_ptr version is
	// measured against.
	//
	// Returns the sum reduced modulo 2^32 and converted to int (see f1 for why
	// the accumulator is unsigned).
	int f2(int i)
	{
	    unsigned int sum = 0;
	    int *p1 = new int(i);
	    int *p2 = nullptr;  // assigned by the first my_move2 call before it is read
	    // The loop index has its own name so it does not shadow parameter `i`.
	    for (int iter = 0; iter < N; ++iter) {
	        my_move2(p2, p1);
	        sum += static_cast<unsigned int>(*p2);
	        my_move2(p1, p2);
	        sum += static_cast<unsigned int>(*p1);
	    }
	    // Only one allocation was ever made, so it is released exactly once.
	    delete p1;
	    return static_cast<int>(sum);
	}

	int main()
	{
	int i;
	std::cout << "1" << std::endl;
	auto t1 = std::chrono::high_resolution_clock::now();
	i = f1(3);
	auto t2 = std::chrono::high_resolution_clock::now();
	std::cout << i << std::endl;
	std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
	.count()
	<< "ms" << std::endl;
	std::cout << "2" << std::endl;
	t1 = std::chrono::high_resolution_clock::now();
	i = f2(3);
	t2 = std::chrono::high_resolution_clock::now();
	std::cout << i << std::endl;
	std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
	.count()
	<< "ms" << std::endl;
	}
	#include <memory>

	// unique_ptr variant under test: move-assigns b into a. Afterwards a owns
	// the object b held, b is null, and any object a owned before the call has
	// been deleted.
	void my_move1(std::unique_ptr<int> &a, std::unique_ptr<int> &b)
	{
	a = std::move(b);
	}

	// Raw-pointer variant under test. Takes references to raw pointers (not to
	// ints) so that it mirrors my_move1. std::move on a built-in pointer is
	// only a cast, so this is a plain pointer copy: afterwards a holds the
	// same address as b, b is unchanged, and nothing is freed.
	void my_move2(int *&a, int *&b)
	{
	    a = std::move(b);
	}