Last active
October 12, 2023 15:06
-
-
Save azat/90c452b5165b762c0c5ee9450ef78878 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From cc882d3238775f951718ec29c781c9ecc7774e28 Mon Sep 17 00:00:00 2001 | |
From: Azat Khuzhin <a3at.mail@gmail.com> | |
Date: Sun, 1 Oct 2023 21:35:42 +0200 | |
Subject: [PATCH] Avoid reading the file multiple times while sending it to S3 | |
AWS S3 client can read file multiple times, this is required for: | |
- calculate checksums | |
- calculate signature | |
By default checksum calculated before signature [1], but it can be | |
calculated in a "streaming" fashion (although it is not a streaming at | |
all) and what this means is that it defers the calculation until it will | |
calculate the signature [2]. But not all algorithms is supported, and | |
default one - MD5 - does not. | |
[1]: https://github.com/aws/aws-sdk-cpp/blob/b0ee1c0d336dbb371c34358b68fba6c56aae2c92/src/aws-cpp-sdk-core/source/client/AWSClient.cpp#L783-L839 | |
[2]: https://github.com/aws/aws-sdk-cpp/blob/b0ee1c0d336dbb371c34358b68fba6c56aae2c92/src/aws-cpp-sdk-core/source/auth/signer/AWSAuthV4Signer.cpp#L231-L260 | |
But, calculating the signature will not be done for HTTPS requests | |
(since ClickHouse uses PayloadSigningPolicy::Never policy). | |
So to avoid this excessive reading we need to switch the checksums to | |
something that supports streaming, and it will not be even calculated! | |
I choose SHA1 in this patch, but it could be any (SHA256 and also | |
various combinations of CRC is supported, but this is not important). | |
Here is an example stacktrace of this excessive read: | |
<details> | |
<summary>stacktrace</summary> | |
(lldb) bt | |
* thread 383, name = 'BackupWorker', stop reason = breakpoint 1.1 | |
* frame 0: 0x00000000103c5fc0 clickhouse`DB::StdStreamBufFromReadBuffer::seekpos() + 32 at StdStreamBufFromReadBuffer.cpp:67 | |
frame 1: 0x000000001777f7f8 clickhouse`std::__1::basic_istream<char, std::__1::char_traits<char>>::tellg() [inlined] std::__1::basic_streambuf<char, std::__1::char_traits<char>>::pubseekoff[abi:v15000](this=<unavailable>, __off=0, __way=cur, __which=8) + 120 at streambuf:162 | |
frame 2: 0x000000001777f7e3 clickhouse`std::__1::basic_istream<char, std::__1::char_traits<char>>::tellg() + 99 at istream:1249 | |
frame 3: 0x00000000152e4979 clickhouse`Aws::Utils::Crypto::MD5OpenSSLImpl::Calculate() + 57 at CryptoImpl.cpp:223 | |
frame 4: 0x00000000152dedee clickhouse`Aws::Utils::Crypto::MD5::Calculate() + 14 at MD5.cpp:30 | |
frame 5: 0x00000000152db5ac clickhouse`Aws::Utils::HashingUtils::CalculateMD5() + 44 at HashingUtils.cpp:235 | |
frame 6: 0x000000001528b97b clickhouse`Aws::Client::AWSClient::AddChecksumToRequest() const + 507 at AWSClient.cpp:772 | |
frame 7: 0x000000001528ded2 clickhouse`Aws::Client::AWSClient::BuildHttpRequest() const + 1682 at AWSClient.cpp:930 | |
frame 8: 0x00000000100b864f clickhouse`DB::S3::Client::BuildHttpRequest() const + 15 at Client.cpp:622 | |
frame 9: 0x0000000015286a41 clickhouse`Aws::Client::AWSClient::AttemptOneRequest(this=0x00007ffde2f8f000, httpRequest=<unavailable>, request=<unavailable>, signerName=<unavailable>, signerRegionOverride=<unavailable>, signerServiceNameOverride="s3") const + 65 at AWSClient.cpp:491 | |
frame 10: 0x00000000152845b9 clickhouse`Aws::Client::AWSClient::AttemptExhaustively(this=0x00007ffde2f8f000, uri=0x00007ffdd4d44f38, request=0x00007ffdd4d45d10, method=HTTP_PUT, signerName="SignatureV4", signerRegionOverride="us-east-1", signerServiceNameOverride="s3") const + 1337 at AWSClient.cpp:272 | |
frame 11: 0x0000000015298d0d clickhouse`Aws::Client::AWSXMLClient::MakeRequest() const + 45 at AWSXmlClient.cpp:99 | |
frame 12: 0x0000000015298cb5 clickhouse`Aws::Client::AWSXMLClient::MakeRequest() const + 309 at AWSXmlClient.cpp:66 | |
frame 13: 0x0000000015354b23 clickhouse`Aws::S3::S3Client::PutObject(this=0x00007ffde2f8f000, request=0x00007ffdd4d45d10) const + 2659 at S3Client.cpp:1731 | |
frame 14: 0x00000000100b174f clickhouse`DB::S3::Client::PutObject(DB::S3::ExtendedRequest<Aws::S3::Model::PutObjectRequest> const&) const [inlined] | |
frame 15: 0x00000000100b173a clickhouse`DB::S3::Client::PutObject(DB::S3::ExtendedRequest<Aws::S3::Model::PutObjectRequest> const&) const + 41 at Client.cpp:578 | |
frame 16: 0x00000000100b1711 clickhouse`DB::S3::Client::PutObject(DB::S3::ExtendedRequest<Aws::S3::Model::PutObjectRequest> const&) const + 981 at Client.cpp:508 | |
frame 17: 0x00000000100b133c clickhouse`DB::S3::Client::PutObject(DB::S3::ExtendedRequest<Aws::S3::Model::PutObjectRequest> const&) const [inlined] | |
frame 18: 0x00000000100b133c clickhouse`DB::S3::Client::PutObject() const + 28 at Client.cpp:418 | |
frame 19: 0x00000000103b96d6 clickhouse`DB::copyDataToS3File() | |
</details> | |
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> | |
--- | |
src/IO/S3/Requests.h | 50 +++++++++++++++++++++++++++++++------------- | |
1 file changed, 35 insertions(+), 15 deletions(-) | |
diff --git a/src/IO/S3/Requests.h b/src/IO/S3/Requests.h | |
index 560ba9b2775..f6101235a80 100644 | |
--- a/src/IO/S3/Requests.h | |
+++ b/src/IO/S3/Requests.h | |
@@ -4,6 +4,7 @@ | |
#if USE_AWS_S3 | |
+#include <type_traits> | |
#include <IO/S3/URI.h> | |
#include <IO/S3/ProviderType.h> | |
@@ -27,10 +28,29 @@ namespace DB::S3 | |
namespace Model = Aws::S3::Model; | |
-template <typename BaseRequest> | |
+template <typename BaseRequest, bool adjust_checksum_algorithm> | |
class ExtendedRequest : public BaseRequest | |
{ | |
public: | |
+ ExtendedRequest() | |
+ { | |
+ if constexpr (adjust_checksum_algorithm) | |
+ { | |
+ /// Default, MD5, requires reading the input file one more time (in | |
+ /// AWSClient::AddChecksumToRequest()), | |
+ /// while SHA1 can calculate checksum with streaming during | |
+ /// calculating signature (in AWSAuthV4Signer::SignRequest()), but | |
+ /// only for HTTP requests, so by switching to anything except for | |
+ /// MD5 we simply tells the client to not calculate any checksum at | |
+ /// all, and this is good, since otherwise it will read input | |
+ /// stream second time. | |
+ /// | |
+ /// NOTE: that for HTTP queries it will read file 3x times. | |
+ this->SetChecksumAlgorithm(Model::ChecksumAlgorithm::SHA1); | |
+ /// FIXME: maybe it is better to override GetChecksumAlgorithmName() to return emtpy string? | |
+ } | |
+ } | |
+ | |
Aws::Endpoint::EndpointParameters GetEndpointContextParams() const override | |
{ | |
auto params = BaseRequest::GetEndpointContextParams(); | |
@@ -73,29 +93,29 @@ protected: | |
mutable ApiMode api_mode{ApiMode::AWS}; | |
}; | |
-class CopyObjectRequest : public ExtendedRequest<Model::CopyObjectRequest> | |
+class CopyObjectRequest : public ExtendedRequest<Model::CopyObjectRequest, false> | |
{ | |
public: | |
Aws::Http::HeaderValueCollection GetRequestSpecificHeaders() const override; | |
}; | |
-using HeadObjectRequest = ExtendedRequest<Model::HeadObjectRequest>; | |
-using ListObjectsV2Request = ExtendedRequest<Model::ListObjectsV2Request>; | |
-using ListObjectsRequest = ExtendedRequest<Model::ListObjectsRequest>; | |
-using GetObjectRequest = ExtendedRequest<Model::GetObjectRequest>; | |
+using HeadObjectRequest = ExtendedRequest<Model::HeadObjectRequest, false>; | |
+using ListObjectsV2Request = ExtendedRequest<Model::ListObjectsV2Request, false>; | |
+using ListObjectsRequest = ExtendedRequest<Model::ListObjectsRequest, false>; | |
+using GetObjectRequest = ExtendedRequest<Model::GetObjectRequest, false>; | |
-using CreateMultipartUploadRequest = ExtendedRequest<Model::CreateMultipartUploadRequest>; | |
-using CompleteMultipartUploadRequest = ExtendedRequest<Model::CompleteMultipartUploadRequest>; | |
-using AbortMultipartUploadRequest = ExtendedRequest<Model::AbortMultipartUploadRequest>; | |
-using UploadPartRequest = ExtendedRequest<Model::UploadPartRequest>; | |
-using UploadPartCopyRequest = ExtendedRequest<Model::UploadPartCopyRequest>; | |
+using CreateMultipartUploadRequest = ExtendedRequest<Model::CreateMultipartUploadRequest, false>; | |
+using CompleteMultipartUploadRequest = ExtendedRequest<Model::CompleteMultipartUploadRequest, false>; | |
+using AbortMultipartUploadRequest = ExtendedRequest<Model::AbortMultipartUploadRequest, false>; | |
+using UploadPartRequest = ExtendedRequest<Model::UploadPartRequest, true>; | |
+using UploadPartCopyRequest = ExtendedRequest<Model::UploadPartCopyRequest, false>; | |
-using PutObjectRequest = ExtendedRequest<Model::PutObjectRequest>; | |
-using DeleteObjectRequest = ExtendedRequest<Model::DeleteObjectRequest>; | |
-using DeleteObjectsRequest = ExtendedRequest<Model::DeleteObjectsRequest>; | |
+using PutObjectRequest = ExtendedRequest<Model::PutObjectRequest, true>; | |
+using DeleteObjectRequest = ExtendedRequest<Model::DeleteObjectRequest, false>; | |
+using DeleteObjectsRequest = ExtendedRequest<Model::DeleteObjectsRequest, false>; | |
-class ComposeObjectRequest : public ExtendedRequest<Aws::S3::S3Request> | |
+class ComposeObjectRequest : public ExtendedRequest<Aws::S3::S3Request, false> | |
{ | |
public: | |
inline const char * GetServiceRequestName() const override { return "ComposeObject"; } | |
-- | |
2.42.0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment