From 29052b3ec54ad520c8f466674044bbc3141dfc13 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Wed, 14 May 2025 14:51:24 +0200 Subject: [PATCH 01/78] include open ssl --- extension/httpfs/crypto.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/extension/httpfs/crypto.cpp b/extension/httpfs/crypto.cpp index 04bd795e..72b38354 100644 --- a/extension/httpfs/crypto.cpp +++ b/extension/httpfs/crypto.cpp @@ -5,7 +5,16 @@ #include #define CPPHTTPLIB_OPENSSL_SUPPORT -#include "httplib.hpp" + +#include +#include +#include +#include +#include + +#if defined(_WIN32) && defined(OPENSSL_USE_APPLINK) +#include +#endif namespace duckdb { From 180bcb69e6eae0c9d5ac8a5e08fa7372de693310 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 15 May 2025 14:29:27 +0200 Subject: [PATCH 02/78] attempt to overwrite get --- extension/httpfs/httpfs_client.cpp | 169 ++++++++++++++++++++++++++--- 1 file changed, 151 insertions(+), 18 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 7a779ef9..735e3910 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -2,10 +2,103 @@ #include "http_state.hpp" #define CPPHTTPLIB_OPENSSL_SUPPORT +#include +#include #include "httplib.hpp" namespace duckdb { +// we statically compile in libcurl, which means the cert file location of the build machine is the +// place curl will look. But not every distro has this file in the same location, so we search a +// number of common locations and use the first one we find. +static std::string certFileLocations[] = { + // Arch, Debian-based, Gentoo + "/etc/ssl/certs/ca-certificates.crt", + // RedHat 7 based + "/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem", + // Redhat 6 based + "/etc/pki/tls/certs/ca-bundle.crt", + // OpenSUSE + "/etc/ssl/ca-bundle.pem", + // Alpine + "/etc/ssl/cert.pem"}; + +//! Grab the first path that exists, from a list of well-known locations +static std::string SelectCURLCertPath() { + for (std::string &caFile : certFileLocations) { + struct stat buf; + if (stat(caFile.c_str(), &buf) == 0) { + return caFile; + } + } + return std::string(); +} + +static std::string cert_path = SelectCURLCertPath(); + +static size_t RequestWriteCallback(void *contents, size_t size, size_t nmemb, void *userp) { + ((std::string *)userp)->append((char *)contents, size * nmemb); + return size * nmemb; +} + +class CURLRequestHeaders { +public: + CURLRequestHeaders(const vector &input) { + for (auto &header : input) { + Add(header); + } + } + ~CURLRequestHeaders() { + if (headers) { + curl_slist_free_all(headers); + } + headers = NULL; + } + operator bool() const { + return headers != NULL; + } + +public: + void Add(const string &header) { + headers = curl_slist_append(headers, header.c_str()); + } + +public: + curl_slist *headers = NULL; +}; + + +class CURLHandle { +public: + CURLHandle(const string &token, const string &cert_path) { + curl = curl_easy_init(); + if (!curl) { + throw InternalException("Failed to initialize curl"); + } + if (!token.empty()) { + curl_easy_setopt(curl, CURLOPT_XOAUTH2_BEARER, token.c_str()); + curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BEARER); + } + if (!cert_path.empty()) { + curl_easy_setopt(curl, CURLOPT_CAINFO, cert_path.c_str()); + } + } + ~CURLHandle() { + curl_easy_cleanup(curl); + } + +public: + operator CURL *() { + return curl; + } + CURLcode Execute() { + return curl_easy_perform(curl); + } + +private: + CURL *curl = NULL; +}; + class HTTPFSClient : public HTTPClient { public: HTTPFSClient(HTTPFSParams &http_params, const string &proto_host_port) { @@ -32,30 +125,56 @@ class HTTPFSClient : public HTTPClient { } } state = http_params.state; + auto bearer_token = ""; + if (!http_params.bearer_token.empty()) { + bearer_token = http_params.bearer_token.c_str(); + } + curl = make_uniq(bearer_token, SelectCURLCertPath()); + state = http_params.state; + } + + void SetLogger(HTTPLogger &logger) { + client->set_logger(logger.GetLogger()); } unique_ptr Get(GetRequestInfo &info) override { - if (state) { - state->get_count++; + auto headers = TransformHeadersForCurl(info.headers, info.params); + CURLRequestHeaders curl_headers(headers); + + CURLcode res; + string result; + { + curl_easy_setopt(*curl_handle, CURLOPT_URL, info.url.c_str()); + curl_easy_setopt(*curl_handle, CURLOPT_WRITEFUNCTION, RequestWriteCallback); + curl_easy_setopt(*curl_handle, CURLOPT_WRITEDATA, &result); + + if (curl_headers) { + curl_easy_setopt(*curl_handle, CURLOPT_HTTPHEADER, curl_headers.headers); + } + res = curl_handle->Execute(); } - auto headers = TransformHeaders(info.headers, info.params); - if (!info.response_handler && !info.content_handler) { - return TransformResult(client->Get(info.path, headers)); - } else { - return TransformResult(client->Get( - info.path.c_str(), headers, - [&](const duckdb_httplib_openssl::Response &response) { - auto http_response = TransformResponse(response); - return info.response_handler(*http_response); - }, - [&](const char *data, size_t data_length) { - if (state) { - state->total_bytes_received += data_length; - } - return info.content_handler(const_data_ptr_cast(data), data_length); - })); + + // DUCKDB_LOG_DEBUG(context, "iceberg.Catalog.Curl.HTTPRequest", "GET %s (curl code '%s')", url, + // curl_easy_strerror(res)); + if (res != CURLcode::CURLE_OK) { + string error = curl_easy_strerror(res); + throw HTTPException(StringUtil::Format("Curl GET Request to '%s' failed with error: '%s'", url, error)); + } + uint16_t response_code = 0; + curl_easy_getinfo(*curl_handle, CURLINFO_RESPONSE_CODE, response_code); + + // TODO: replace this with better bytes received provided by curl. + if (state) { + state->total_bytes_received += sizeof(result); } + + // get the response code + auto status_code = HTTPStatusCode(response_code); + auto return_result = make_uniq(status_code); + return_result->body = result; + return return_result; } + unique_ptr Put(PutRequestInfo &info) override { if (state) { state->put_count++; @@ -117,6 +236,19 @@ class HTTPFSClient : public HTTPClient { return headers; } + duckdb_httplib_openssl::Headers TransformHeadersForCurl(const HTTPHeaders &header_map, const HTTPParams ¶ms) { + headers; + for (auto &entry : header_map) { + const std::string new_header = entry.first + "=" + entry.second; + headers.insert(new_header); + } + for (auto &entry : params.extra_headers) { + const std::string new_header = entry.first + "=" + entry.second; + headers.insert(new_header); + } + return headers; + } + unique_ptr TransformResponse(const duckdb_httplib_openssl::Response &response) { auto status_code = HTTPUtil::ToStatusCode(response.status); auto result = make_uniq(status_code); @@ -141,6 +273,7 @@ class HTTPFSClient : public HTTPClient { private: unique_ptr client; + unique_ptr curl; optional_ptr state; }; From 303c9a7b7b1cf4b842151ecea300abcea7d905d2 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 15 May 2025 15:50:26 +0200 Subject: [PATCH 03/78] compiles with curl now --- CMakeLists.txt | 10 ++++++++ extension/httpfs/httpfs_client.cpp | 37 +++++++++++++++++++----------- vcpkg.json | 3 ++- 3 files changed, 35 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 92d45479..878f6f61 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,8 @@ add_extension_definitions() include_directories(extension/httpfs/include ${DUCKDB_MODULE_BASE_DIR}/third_party/httplib) + + build_static_extension( httpfs extension/httpfs/hffs.cpp @@ -38,7 +40,9 @@ if(MINGW) endif() find_package(OpenSSL REQUIRED) +find_package(CURL REQUIRED) include_directories(${OPENSSL_INCLUDE_DIR}) +include_directories(${CURL_INCLUDE_DIRS}) if(EMSCRIPTEN) else() @@ -46,6 +50,11 @@ else() ${OPENSSL_LIBRARIES}) target_link_libraries(httpfs_extension duckdb_mbedtls ${OPENSSL_LIBRARIES}) + # Link dependencies into extension + target_link_libraries(httpfs_loadable_extension ${CURL_LIBRARIES}) + target_link_libraries(httpfs_extension ${CURL_LIBRARIES}) + + if(MINGW) find_package(ZLIB) target_link_libraries(httpfs_loadable_extension ZLIB::ZLIB -lcrypt32) @@ -53,6 +62,7 @@ else() endif() endif() + install( TARGETS httpfs_extension EXPORT "${DUCKDB_EXPORT_SET}" diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 735e3910..9580263a 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -2,9 +2,11 @@ #include "http_state.hpp" #define CPPHTTPLIB_OPENSSL_SUPPORT + #include -#include +// #include #include "httplib.hpp" +#include "duckdb/common/exception/http_exception.hpp" namespace duckdb { @@ -23,6 +25,7 @@ static std::string certFileLocations[] = { // Alpine "/etc/ssl/cert.pem"}; + //! Grab the first path that exists, from a list of well-known locations static std::string SelectCURLCertPath() { for (std::string &caFile : certFileLocations) { @@ -43,11 +46,12 @@ static size_t RequestWriteCallback(void *contents, size_t size, size_t nmemb, vo class CURLRequestHeaders { public: - CURLRequestHeaders(const vector &input) { + CURLRequestHeaders(vector &input) { for (auto &header : input) { Add(header); } } + CURLRequestHeaders() {} ~CURLRequestHeaders() { if (headers) { curl_slist_free_all(headers); @@ -144,24 +148,24 @@ class HTTPFSClient : public HTTPClient { CURLcode res; string result; { - curl_easy_setopt(*curl_handle, CURLOPT_URL, info.url.c_str()); - curl_easy_setopt(*curl_handle, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl_handle, CURLOPT_WRITEDATA, &result); + curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); if (curl_headers) { - curl_easy_setopt(*curl_handle, CURLOPT_HTTPHEADER, curl_headers.headers); + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); } - res = curl_handle->Execute(); + res = curl->Execute(); } // DUCKDB_LOG_DEBUG(context, "iceberg.Catalog.Curl.HTTPRequest", "GET %s (curl code '%s')", url, // curl_easy_strerror(res)); if (res != CURLcode::CURLE_OK) { string error = curl_easy_strerror(res); - throw HTTPException(StringUtil::Format("Curl GET Request to '%s' failed with error: '%s'", url, error)); + throw HTTPException(StringUtil::Format("Curl GET Request to '%s' failed with error: '%s'", info.url, error)); } uint16_t response_code = 0; - curl_easy_getinfo(*curl_handle, CURLINFO_RESPONSE_CODE, response_code); + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, response_code); // TODO: replace this with better bytes received provided by curl. if (state) { @@ -236,17 +240,22 @@ class HTTPFSClient : public HTTPClient { return headers; } - duckdb_httplib_openssl::Headers TransformHeadersForCurl(const HTTPHeaders &header_map, const HTTPParams ¶ms) { - headers; + CURLRequestHeaders TransformHeadersForCurl(const HTTPHeaders &header_map, const HTTPParams ¶ms) { + std::vector headers; for (auto &entry : header_map) { const std::string new_header = entry.first + "=" + entry.second; - headers.insert(new_header); + headers.push_back(new_header); } for (auto &entry : params.extra_headers) { const std::string new_header = entry.first + "=" + entry.second; - headers.insert(new_header); + headers.push_back(new_header); } - return headers; + CURLRequestHeaders curl_headers; + for (auto &header : headers) { + curl_headers.Add(header); + } + return curl_headers; + // return CURLRequestHeaders(headers); } unique_ptr TransformResponse(const duckdb_httplib_openssl::Response &response) { diff --git a/vcpkg.json b/vcpkg.json index 3ed9a36b..809e67b1 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -1,5 +1,6 @@ { "dependencies": [ - "openssl" + "openssl", + "curl" ] } From b8272b19ce7ecc044d6af1df0641b5eb8d008dae Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 12:58:43 +0200 Subject: [PATCH 04/78] put head delete post get replaced --- extension/httpfs/httpfs_client.cpp | 413 ++++++++++++++++----- extension/httpfs/include/httpfs_client.hpp | 51 +++ test/sql/copy/csv/test_csv_httpfs.test | 5 + 3 files changed, 383 insertions(+), 86 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 9580263a..6e6ca232 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -40,68 +40,67 @@ static std::string SelectCURLCertPath() { static std::string cert_path = SelectCURLCertPath(); static size_t RequestWriteCallback(void *contents, size_t size, size_t nmemb, void *userp) { - ((std::string *)userp)->append((char *)contents, size * nmemb); - return size * nmemb; + size_t totalSize = size * nmemb; + std::string* str = static_cast(userp); + str->append(static_cast(contents), totalSize); + return totalSize; } -class CURLRequestHeaders { -public: - CURLRequestHeaders(vector &input) { - for (auto &header : input) { - Add(header); - } - } - CURLRequestHeaders() {} - ~CURLRequestHeaders() { - if (headers) { - curl_slist_free_all(headers); +static size_t RequestHeaderCallback(void *contents, size_t size, size_t nmemb, void *userp) { + size_t totalSize = size * nmemb; + std::string header(static_cast(contents), totalSize); + HeaderCollector* header_collection = static_cast(userp); + + // Trim trailing \r\n + if (!header.empty() && header.back() == '\n') { + header.pop_back(); + if (!header.empty() && header.back() == '\r') { + header.pop_back(); } - headers = NULL; - } - operator bool() const { - return headers != NULL; } -public: - void Add(const string &header) { - headers = curl_slist_append(headers, header.c_str()); + // If header starts with HTTP/... curl has followed a redirect and we have a new Header, + // so we clear all of the current header_collection + if (header.rfind("HTTP/", 0) == 0) { + header_collection->header_collection.push_back(HTTPHeaders()); } -public: - curl_slist *headers = NULL; -}; + size_t colonPos = header.find(':'); - -class CURLHandle { -public: - CURLHandle(const string &token, const string &cert_path) { - curl = curl_easy_init(); - if (!curl) { - throw InternalException("Failed to initialize curl"); - } - if (!token.empty()) { - curl_easy_setopt(curl, CURLOPT_XOAUTH2_BEARER, token.c_str()); - curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BEARER); - } - if (!cert_path.empty()) { - curl_easy_setopt(curl, CURLOPT_CAINFO, cert_path.c_str()); + if (colonPos != std::string::npos) { + // Split the string into two parts + std::string part1 = header.substr(0, colonPos); + std::string part2 = header.substr(colonPos + 1); + if (part2.at(0) == ' ') { + part2.erase(0, 1); } + + header_collection->header_collection.back().Insert(part1, part2); } - ~CURLHandle() { - curl_easy_cleanup(curl); - } + // TODO: some headers may not follow standard response header formats. + // what to do in this case? Invalid does not mean we should abort. -public: - operator CURL *() { - return curl; + return totalSize; +} + + CURLHandle::CURLHandle(const string &token, const string &cert_path) { + curl = curl_easy_init(); + if (!curl) { + throw InternalException("Failed to initialize curl"); } - CURLcode Execute() { - return curl_easy_perform(curl); + if (!token.empty()) { + curl_easy_setopt(curl, CURLOPT_XOAUTH2_BEARER, token.c_str()); + curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BEARER); } + if (!cert_path.empty()) { + curl_easy_setopt(curl, CURLOPT_CAINFO, cert_path.c_str()); + } +} + +CURLHandle::~CURLHandle() { + curl_easy_cleanup(curl); +} -private: - CURL *curl = NULL; -}; class HTTPFSClient : public HTTPClient { public: @@ -129,12 +128,14 @@ class HTTPFSClient : public HTTPClient { } } state = http_params.state; + + // initializing curl auto bearer_token = ""; if (!http_params.bearer_token.empty()) { bearer_token = http_params.bearer_token.c_str(); } curl = make_uniq(bearer_token, SelectCURLCertPath()); - state = http_params.state; + } void SetLogger(HTTPLogger &logger) { @@ -142,15 +143,46 @@ class HTTPFSClient : public HTTPClient { } unique_ptr Get(GetRequestInfo &info) override { - auto headers = TransformHeadersForCurl(info.headers, info.params); - CURLRequestHeaders curl_headers(headers); + if (state) { + state->get_count++; + } + + // auto headers = TransformHeaders(info.headers, info.params); + // if (!info.response_handler && !info.content_handler) { + // return TransformResult(client->Get(info.path, headers)); + // } else { + // return TransformResult(client->Get( + // info.path.c_str(), headers, + // [&](const duckdb_httplib_openssl::Response &response) { + // auto http_response = TransformResponse(response); + // return info.response_handler(*http_response); + // }, + // [&](const char *data, size_t data_length) { + // if (state) { + // state->total_bytes_received += data_length; + // } + // return info.content_handler(const_data_ptr_cast(data), data_length); + // })); + // } + + auto curl_headers = TransformHeadersForCurl(info.headers, info.params); CURLcode res; - string result; + std::string result; + HeaderCollector response_header_collection; { + // If the same handle served a HEAD request, we must set NOBODY back to 0L to request content again + curl_easy_setopt(*curl, CURLOPT_NOBODY, 0L); + + // follow redirects + curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + // write response data curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); + // write response headers (different header collection for each redirect) + curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); if (curl_headers) { curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); @@ -165,17 +197,28 @@ class HTTPFSClient : public HTTPClient { throw HTTPException(StringUtil::Format("Curl GET Request to '%s' failed with error: '%s'", info.url, error)); } uint16_t response_code = 0; - curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, response_code); + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - // TODO: replace this with better bytes received provided by curl. + idx_t bytes_received = 0; + if (response_header_collection.header_collection.back().HasHeader("content-length")) { + bytes_received = std::stoi(response_header_collection.header_collection.back().GetHeaderValue("content-length")); + D_ASSERT(bytes_received == result.size()); + } else { + bytes_received = result.size(); + } if (state) { - state->total_bytes_received += sizeof(result); + state->total_bytes_received += bytes_received; } - // get the response code + const char* data = result.c_str(); + info.content_handler(const_data_ptr_cast(data), bytes_received); + + // Construct HTTPResponse auto status_code = HTTPStatusCode(response_code); auto return_result = make_uniq(status_code); return_result->body = result; + return_result->headers = response_header_collection.header_collection.back(); + return_result->url = info.url; return return_result; } @@ -184,25 +227,170 @@ class HTTPFSClient : public HTTPClient { state->put_count++; state->total_bytes_sent += info.buffer_in_len; } - auto headers = TransformHeaders(info.headers, info.params); - return TransformResult(client->Put(info.path, headers, const_char_ptr_cast(info.buffer_in), info.buffer_in_len, - info.content_type)); + // auto headers = TransformHeaders(info.headers, info.params); + // return TransformResult(client->Put(info.path, headers, const_char_ptr_cast(info.buffer_in), info.buffer_in_len, + // info.content_type)); + + auto curl_headers = TransformHeadersForCurl(info.headers, info.params); + // Optionally add headers directly + curl_headers.headers = curl_slist_append(curl_headers.headers, "Content-Type: application/json"); + + CURLcode res; + std::string result; + HeaderCollector response_header_collection; + + { + curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + + // Perform PUT + curl_easy_setopt(*curl, CURLOPT_CUSTOMREQUEST, "PUT"); + + // Include PUT body + curl_easy_setopt(*curl, CURLOPT_POSTFIELDS, const_char_ptr_cast(info.buffer_in)); + curl_easy_setopt(*curl, CURLOPT_POSTFIELDSIZE, info.buffer_in_len); + + // Follow redirects if needed + curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); + + // Capture response body + curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); + + // Capture response headers + curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); + + // Apply headers + if (curl_headers) { + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); + } + + // Execute the request + res = curl->Execute(); + } + + // Check response + if (res != CURLcode::CURLE_OK) { + std::string error = curl_easy_strerror(res); + throw HTTPException(StringUtil::Format("Curl PUT Request to '%s' failed with error: '%s'", info.url, error)); + } + + uint16_t response_code = 0; + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); + + // Construct HTTPResponse + auto status_code = HTTPStatusCode(response_code); + auto return_result = make_uniq(status_code); + return_result->body = ""; + return_result->headers = response_header_collection.header_collection.back(); + return_result->url = info.url; + return return_result; } unique_ptr Head(HeadRequestInfo &info) override { - if (state) { - state->head_count++; - } - auto headers = TransformHeaders(info.headers, info.params); - return TransformResult(client->Head(info.path, headers)); + + auto curl_headers = TransformHeadersForCurl(info.headers, info.params); + + CURLcode res; + std::string result; + HeaderCollector response_header_collection; + + { + // Set URL + curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + // curl_easy_setopt(*curl, CURLOPT_VERBOSE, 1L); + + // Perform HEAD request instead of GET + curl_easy_setopt(*curl, CURLOPT_NOBODY, 1L); + curl_easy_setopt(*curl, CURLOPT_HTTPGET, 0L); + + // Follow redirects + curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); + + // set write function to collect body — no body expected, so safe to omit + curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); + + // Collect response headers (multiple header blocks for redirects) + curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); + + // Add headers if any + if (curl_headers) { + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); + } + + // Execute HEAD request + res = curl->Execute(); + } + + // Handle result + if (res != CURLcode::CURLE_OK) { + string error = curl_easy_strerror(res); + throw HTTPException(StringUtil::Format("Curl HEAD Request to '%s' failed with error: '%s'", info.url, error)); + } + uint16_t response_code = 0; + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); + // Construct HTTPResponse + auto status_code = HTTPStatusCode(response_code); + auto return_result = make_uniq(status_code); + return_result->body = ""; + return_result->headers = response_header_collection.header_collection.back(); + return_result->url = info.url; + return return_result; } unique_ptr Delete(DeleteRequestInfo &info) override { - if (state) { - state->delete_count++; + auto curl_headers = TransformHeadersForCurl(info.headers, info.params); + + CURLcode res; + std::string result; + HeaderCollector response_header_collection; + + // TODO: some delete requests require a BODY + { + // Set URL + curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + + // Set DELETE request method + curl_easy_setopt(*curl, CURLOPT_CUSTOMREQUEST, "DELETE"); + + // Follow redirects + curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); + + // Set write function to collect response body + curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); + + // Collect response headers (multiple header blocks for redirects) + curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); + + // Add headers if any + if (curl_headers) { + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); + } + + // Execute DELETE request + res = curl->Execute(); } - auto headers = TransformHeaders(info.headers, info.params); - return TransformResult(client->Delete(info.path, headers)); + + // Handle result + if (res != CURLcode::CURLE_OK) { + std::string error = curl_easy_strerror(res); + throw HTTPException(StringUtil::Format("Curl DELETE Request to '%s' failed with error: '%s'", info.url, error)); + } + + // Get HTTP response status code + uint16_t response_code = 0; + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); + // Construct HTTPResponse + auto status_code = HTTPStatusCode(response_code); + auto return_result = make_uniq(status_code); + return_result->headers = response_header_collection.header_collection.back(); + return_result->url = info.url; + return return_result; + } unique_ptr Post(PostRequestInfo &info) override { @@ -210,22 +398,75 @@ class HTTPFSClient : public HTTPClient { state->post_count++; state->total_bytes_sent += info.buffer_in_len; } - // We use a custom Request method here, because there is no Post call with a contentreceiver in httplib - duckdb_httplib_openssl::Request req; - req.method = "POST"; - req.path = info.path; - req.headers = TransformHeaders(info.headers, info.params); - req.headers.emplace("Content-Type", "application/octet-stream"); - req.content_receiver = [&](const char *data, size_t data_length, uint64_t /*offset*/, - uint64_t /*total_length*/) { - if (state) { - state->total_bytes_received += data_length; + + // // We use a custom Request method here, because there is no Post call with a contentreceiver in httplib + // duckdb_httplib_openssl::Request req; + // req.method = "POST"; + // req.path = info.path; + // req.headers = TransformHeaders(info.headers, info.params); + // req.headers.emplace("Content-Type", "application/octet-stream"); + // req.content_receiver = [&](const char *data, size_t data_length, uint64_t /*offset*/, + // uint64_t /*total_length*/) { + // if (state) { + // state->total_bytes_received += data_length; + // } + // info.buffer_out += string(data, data_length); + // return true; + // }; + // req.body.assign(const_char_ptr_cast(info.buffer_in), info.buffer_in_len); + // return TransformResult(client->send(req)); + + auto curl_headers = TransformHeadersForCurl(info.headers, info.params); + curl_headers.Add("Content-Type: application/octet-stream"); + + CURLcode res; + std::string result; + HeaderCollector response_header_collection; + + { + curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + curl_easy_setopt(*curl, CURLOPT_POST, 1L); + + // Set POST body + curl_easy_setopt(*curl, CURLOPT_POSTFIELDS, const_char_ptr_cast(info.buffer_in)); + curl_easy_setopt(*curl, CURLOPT_POSTFIELDSIZE, info.buffer_in_len); + + // Follow redirects + // TODO: follow redirects for POST? + curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); + + // Set write function to collect response body + curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); + + // Collect response headers (multiple header blocks for redirects) + curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); + + // Add headers if any + if (curl_headers) { + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); } - info.buffer_out += string(data, data_length); - return true; - }; - req.body.assign(const_char_ptr_cast(info.buffer_in), info.buffer_in_len); - return TransformResult(client->send(req)); + + // Execute POST request + res = curl->Execute(); + } + + // Handle result + if (res != CURLcode::CURLE_OK) { + string error = curl_easy_strerror(res); + throw HTTPException(StringUtil::Format("Curl POST Request to '%s' failed with error: '%s'", info.url, error)); + } + uint16_t response_code = 0; + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); + + info.buffer_out = result; + // Construct HTTPResponse + auto status_code = HTTPStatusCode(response_code); + auto return_result = make_uniq(status_code); + return_result->headers = response_header_collection.header_collection.back(); + return_result->url = info.url; + return return_result; } private: @@ -243,11 +484,11 @@ class HTTPFSClient : public HTTPClient { CURLRequestHeaders TransformHeadersForCurl(const HTTPHeaders &header_map, const HTTPParams ¶ms) { std::vector headers; for (auto &entry : header_map) { - const std::string new_header = entry.first + "=" + entry.second; + const std::string new_header = entry.first + ": " + entry.second; headers.push_back(new_header); } for (auto &entry : params.extra_headers) { - const std::string new_header = entry.first + "=" + entry.second; + const std::string new_header = entry.first + ": " + entry.second; headers.push_back(new_header); } CURLRequestHeaders curl_headers; @@ -255,7 +496,6 @@ class HTTPFSClient : public HTTPClient { curl_headers.Add(header); } return curl_headers; - // return CURLRequestHeaders(headers); } unique_ptr TransformResponse(const duckdb_httplib_openssl::Response &response) { @@ -283,6 +523,7 @@ class HTTPFSClient : public HTTPClient { private: unique_ptr client; unique_ptr curl; + CURLRequestHeaders request_headers; optional_ptr state; }; diff --git a/extension/httpfs/include/httpfs_client.hpp b/extension/httpfs/include/httpfs_client.hpp index 1d7620cf..11a48a49 100644 --- a/extension/httpfs/include/httpfs_client.hpp +++ b/extension/httpfs/include/httpfs_client.hpp @@ -1,6 +1,7 @@ #pragma once #include "duckdb/common/http_util.hpp" +#include namespace duckdb { class HTTPLogger; @@ -36,4 +37,54 @@ class HTTPFSUtil : public HTTPUtil { string GetName() const override; }; +class CURLHandle { +public: + CURLHandle(const string &token, const string &cert_path); + ~CURLHandle(); + +public: + operator CURL *() { + return curl; + } + CURLcode Execute() { + return curl_easy_perform(curl); + } + +private: + CURL *curl = NULL; +}; + +class CURLRequestHeaders { +public: + CURLRequestHeaders(vector &input) { + for (auto &header : input) { + Add(header); + } + } + CURLRequestHeaders() {} + + ~CURLRequestHeaders() { + if (headers) { + curl_slist_free_all(headers); + } + headers = NULL; + } + operator bool() const { + return headers != NULL; + } + +public: + void Add(const string &header) { + headers = curl_slist_append(headers, header.c_str()); + } + +public: + curl_slist *headers = NULL; +}; + +struct HeaderCollector { + std::vector header_collection; +}; + + } // namespace duckdb diff --git a/test/sql/copy/csv/test_csv_httpfs.test b/test/sql/copy/csv/test_csv_httpfs.test index 869a0b67..cbeed963 100644 --- a/test/sql/copy/csv/test_csv_httpfs.test +++ b/test/sql/copy/csv/test_csv_httpfs.test @@ -4,9 +4,14 @@ require httpfs +require parquet + statement ok PRAGMA enable_verification +statement ok +select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_role_type.parquet'; + #FIXME this test fails: file is nonexistent mode skip From e6c27809be6bc258266e8231f2cb887706b43c14 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 13:10:16 +0200 Subject: [PATCH 05/78] bump duckdb --- duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/duckdb b/duckdb index 44d08560..4e5ff4e5 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit 44d0856021c69743661adaea0006725fdbf7db29 +Subproject commit 4e5ff4e5e8508791f2fedbd3c9d19e9e5a5e4d93 From 2afdc880ca24020869aa77ce858be465a85f81e4 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 13:10:25 +0200 Subject: [PATCH 06/78] remove extension stuff --- extension/httpfs/httpfs_client.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 6e6ca232..0aa00576 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -138,9 +138,6 @@ class HTTPFSClient : public HTTPClient { } - void SetLogger(HTTPLogger &logger) { - client->set_logger(logger.GetLogger()); - } unique_ptr Get(GetRequestInfo &info) override { if (state) { From be81a80c54f32a8bfaf5e5397625091f782f8987 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 13:18:23 +0200 Subject: [PATCH 07/78] install curl during build --- .github/workflows/MinioTests.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/MinioTests.yml b/.github/workflows/MinioTests.yml index dca4c52a..a88c84f1 100644 --- a/.github/workflows/MinioTests.yml +++ b/.github/workflows/MinioTests.yml @@ -33,7 +33,10 @@ jobs: - name: Install Ninja shell: bash - run: sudo apt-get update -y -qq && sudo apt-get install -y -qq ninja-build + run: | + sudo apt-get update -y -qq + sudo apt-get install -y -qq software-properties-common + sudo apt-get install -y -qq libcurl4-gnutls-dev libcurl4-openssl-dev ninja-build - name: Setup Ccache uses: hendrikmuhs/ccache-action@main From fdac2778b9dc93df3fb2704c885f5f844275d6ae Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 13:21:34 +0200 Subject: [PATCH 08/78] run on ubuntu-latest --- .github/workflows/MinioTests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/MinioTests.yml b/.github/workflows/MinioTests.yml index a88c84f1..4d721ba6 100644 --- a/.github/workflows/MinioTests.yml +++ b/.github/workflows/MinioTests.yml @@ -10,7 +10,7 @@ defaults: jobs: minio-tests: name: Minio Tests - runs-on: ubuntu-24.04 + runs-on: ubuntu-latest env: S3_TEST_SERVER_AVAILABLE: 1 AWS_DEFAULT_REGION: eu-west-1 From ef581dee06e1edf821fe32fc548cc4084388839c Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 13:26:29 +0200 Subject: [PATCH 09/78] init vcpackage toolchain path --- .github/workflows/MinioTests.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/MinioTests.yml b/.github/workflows/MinioTests.yml index 4d721ba6..b6d9dc06 100644 --- a/.github/workflows/MinioTests.yml +++ b/.github/workflows/MinioTests.yml @@ -19,6 +19,7 @@ jobs: DUCKDB_S3_ENDPOINT: duckdb-minio.com:9000 DUCKDB_S3_USE_SSL: false GEN: ninja + VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake VCPKG_TARGET_TRIPLET: x64-linux steps: @@ -31,12 +32,9 @@ jobs: with: python-version: '3.10' - - name: Install Ninja + - name: Install libraries shell: bash - run: | - sudo apt-get update -y -qq - sudo apt-get install -y -qq software-properties-common - sudo apt-get install -y -qq libcurl4-gnutls-dev libcurl4-openssl-dev ninja-build + run: sudo apt-get update -y -qq && sudo apt-get install -y -qq ninja-build - name: Setup Ccache uses: hendrikmuhs/ccache-action@main @@ -49,9 +47,11 @@ jobs: with: vcpkgGitCommitId: 5e5d0e1cd7785623065e77eff011afdeec1a3574 - - name: Build - shell: bash - run: make + - name: Build extension + env: + GEN: ninja + run: | + make release - name: Start S3/HTTP test server shell: bash From e7bc7e0790f1253e1c79817c845edabd3353ca90 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 13:53:23 +0200 Subject: [PATCH 10/78] actually add params to end of url for get requests --- extension/httpfs/httpfs_client.cpp | 39 +++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 0aa00576..0b28b910 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -162,7 +162,12 @@ class HTTPFSClient : public HTTPClient { // })); // } - auto curl_headers = TransformHeadersForCurl(info.headers, info.params); + auto curl_headers = TransformHeadersForCurl(info.headers); + auto url = info.url; + if (!info.params.extra_headers.empty()) { + auto curl_params = TransformParamsCurl(info.params); + url += "?" + curl_params; + } CURLcode res; std::string result; @@ -173,7 +178,7 @@ class HTTPFSClient : public HTTPClient { // follow redirects curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); - curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + curl_easy_setopt(*curl, CURLOPT_URL, url.c_str()); // write response data curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); @@ -228,7 +233,7 @@ class HTTPFSClient : public HTTPClient { // return TransformResult(client->Put(info.path, headers, const_char_ptr_cast(info.buffer_in), info.buffer_in_len, // info.content_type)); - auto curl_headers = TransformHeadersForCurl(info.headers, info.params); + auto curl_headers = TransformHeadersForCurl(info.headers); // Optionally add headers directly curl_headers.headers = curl_slist_append(curl_headers.headers, "Content-Type: application/json"); @@ -286,7 +291,7 @@ class HTTPFSClient : public HTTPClient { unique_ptr Head(HeadRequestInfo &info) override { - auto curl_headers = TransformHeadersForCurl(info.headers, info.params); + auto curl_headers = TransformHeadersForCurl(info.headers); CURLcode res; std::string result; @@ -338,7 +343,7 @@ class HTTPFSClient : public HTTPClient { } unique_ptr Delete(DeleteRequestInfo &info) override { - auto curl_headers = TransformHeadersForCurl(info.headers, info.params); + auto curl_headers = TransformHeadersForCurl(info.headers); CURLcode res; std::string result; @@ -413,7 +418,7 @@ class HTTPFSClient : public HTTPClient { // req.body.assign(const_char_ptr_cast(info.buffer_in), info.buffer_in_len); // return TransformResult(client->send(req)); - auto curl_headers = TransformHeadersForCurl(info.headers, info.params); + auto curl_headers = TransformHeadersForCurl(info.headers); curl_headers.Add("Content-Type: application/octet-stream"); CURLcode res; @@ -478,16 +483,12 @@ class HTTPFSClient : public HTTPClient { return headers; } - CURLRequestHeaders TransformHeadersForCurl(const HTTPHeaders &header_map, const HTTPParams ¶ms) { + CURLRequestHeaders TransformHeadersForCurl(const HTTPHeaders &header_map) { std::vector headers; for (auto &entry : header_map) { const std::string new_header = entry.first + ": " + entry.second; headers.push_back(new_header); } - for (auto &entry : params.extra_headers) { - const std::string new_header = entry.first + ": " + entry.second; - headers.push_back(new_header); - } CURLRequestHeaders curl_headers; for (auto &header : headers) { curl_headers.Add(header); @@ -495,6 +496,22 @@ class HTTPFSClient : public HTTPClient { return curl_headers; } + string TransformParamsCurl(const HTTPParams ¶ms) { + string result = ""; + unordered_map escaped_params; + bool first_param = true; + for (auto &entry : params.extra_headers) { + const string key = entry.first; + const string value = curl_easy_escape(*curl, entry.second.c_str(), 0); + if (!first_param) { + result += "&"; + } + result += key + "=" + value; + first_param = false; + } + return result; + } + unique_ptr TransformResponse(const duckdb_httplib_openssl::Response &response) { auto status_code = HTTPUtil::ToStatusCode(response.status); auto result = make_uniq(status_code); From b74abc2cd030ee77c60e97d3dc1ce597bead152c Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 14:58:23 +0200 Subject: [PATCH 11/78] get the response error message as well from the http response code header --- extension/httpfs/httpfs_client.cpp | 212 +++++++++++++--------------- test/sql/secret/secret_refresh.test | 4 +- 2 files changed, 99 insertions(+), 117 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 0b28b910..769d877c 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -63,6 +63,7 @@ static size_t RequestHeaderCallback(void *contents, size_t size, size_t nmemb, v // so we clear all of the current header_collection if (header.rfind("HTTP/", 0) == 0) { header_collection->header_collection.push_back(HTTPHeaders()); + header_collection->header_collection.back().Insert("__RESPONSE_STATUS__", header); } size_t colonPos = header.find(':'); @@ -135,7 +136,6 @@ class HTTPFSClient : public HTTPClient { bearer_token = http_params.bearer_token.c_str(); } curl = make_uniq(bearer_token, SelectCURLCertPath()); - } @@ -144,24 +144,6 @@ class HTTPFSClient : public HTTPClient { state->get_count++; } - // auto headers = TransformHeaders(info.headers, info.params); - // if (!info.response_handler && !info.content_handler) { - // return TransformResult(client->Get(info.path, headers)); - // } else { - // return TransformResult(client->Get( - // info.path.c_str(), headers, - // [&](const duckdb_httplib_openssl::Response &response) { - // auto http_response = TransformResponse(response); - // return info.response_handler(*http_response); - // }, - // [&](const char *data, size_t data_length) { - // if (state) { - // state->total_bytes_received += data_length; - // } - // return info.content_handler(const_data_ptr_cast(data), data_length); - // })); - // } - auto curl_headers = TransformHeadersForCurl(info.headers); auto url = info.url; if (!info.params.extra_headers.empty()) { @@ -214,14 +196,7 @@ class HTTPFSClient : public HTTPClient { const char* data = result.c_str(); info.content_handler(const_data_ptr_cast(data), bytes_received); - - // Construct HTTPResponse - auto status_code = HTTPStatusCode(response_code); - auto return_result = make_uniq(status_code); - return_result->body = result; - return_result->headers = response_header_collection.header_collection.back(); - return_result->url = info.url; - return return_result; + return TransformResponseCurl(response_code, response_header_collection, result, res, url); } unique_ptr Put(PutRequestInfo &info) override { @@ -229,13 +204,16 @@ class HTTPFSClient : public HTTPClient { state->put_count++; state->total_bytes_sent += info.buffer_in_len; } - // auto headers = TransformHeaders(info.headers, info.params); - // return TransformResult(client->Put(info.path, headers, const_char_ptr_cast(info.buffer_in), info.buffer_in_len, - // info.content_type)); auto curl_headers = TransformHeadersForCurl(info.headers); - // Optionally add headers directly - curl_headers.headers = curl_slist_append(curl_headers.headers, "Content-Type: application/json"); + // Add content type header from info + curl_headers.Add("Content-Type: " + info.content_type); + // transform parameters + auto url = info.url; + if (!info.params.extra_headers.empty()) { + auto curl_params = TransformParamsCurl(info.params); + url += "?" + curl_params; + } CURLcode res; std::string result; @@ -280,70 +258,78 @@ class HTTPFSClient : public HTTPClient { uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); + return TransformResponseCurl(response_code, response_header_collection, result, res, url); + // Construct HTTPResponse - auto status_code = HTTPStatusCode(response_code); - auto return_result = make_uniq(status_code); - return_result->body = ""; - return_result->headers = response_header_collection.header_collection.back(); - return_result->url = info.url; - return return_result; + // auto status_code = HTTPStatusCode(response_code); + // auto return_result = make_uniq(status_code); + // return_result->body = ""; + // return_result->headers = response_header_collection.header_collection.back(); + // return_result->url = info.url; + // return return_result; } unique_ptr Head(HeadRequestInfo &info) override { - auto curl_headers = TransformHeadersForCurl(info.headers); + auto curl_headers = TransformHeadersForCurl(info.headers); + // transform parameters + auto url = info.url; + if (!info.params.extra_headers.empty()) { + auto curl_params = TransformParamsCurl(info.params); + url += "?" + curl_params; + } - CURLcode res; - std::string result; - HeaderCollector response_header_collection; + CURLcode res; + std::string result; + HeaderCollector response_header_collection; - { - // Set URL - curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + { + // Set URL + curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); // curl_easy_setopt(*curl, CURLOPT_VERBOSE, 1L); - // Perform HEAD request instead of GET - curl_easy_setopt(*curl, CURLOPT_NOBODY, 1L); - curl_easy_setopt(*curl, CURLOPT_HTTPGET, 0L); - - // Follow redirects - curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); - - // set write function to collect body — no body expected, so safe to omit - curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); - - // Collect response headers (multiple header blocks for redirects) - curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); - - // Add headers if any - if (curl_headers) { - curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); - } - - // Execute HEAD request - res = curl->Execute(); - } - - // Handle result - if (res != CURLcode::CURLE_OK) { - string error = curl_easy_strerror(res); - throw HTTPException(StringUtil::Format("Curl HEAD Request to '%s' failed with error: '%s'", info.url, error)); - } - uint16_t response_code = 0; - curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - // Construct HTTPResponse - auto status_code = HTTPStatusCode(response_code); - auto return_result = make_uniq(status_code); - return_result->body = ""; - return_result->headers = response_header_collection.header_collection.back(); - return_result->url = info.url; - return return_result; + // Perform HEAD request instead of GET + curl_easy_setopt(*curl, CURLOPT_NOBODY, 1L); + curl_easy_setopt(*curl, CURLOPT_HTTPGET, 0L); + + // Follow redirects + curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); + + // set write function to collect body — no body expected, so safe to omit + curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); + + // Collect response headers (multiple header blocks for redirects) + curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); + + // Add headers if any + if (curl_headers) { + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); + } + + // Execute HEAD request + res = curl->Execute(); + } + + // Handle result + if (res != CURLcode::CURLE_OK) { + string error = curl_easy_strerror(res); + throw HTTPException(StringUtil::Format("Curl HEAD Request to '%s' failed with error: '%s'", info.url, error)); + } + uint16_t response_code = 0; + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); + return TransformResponseCurl(response_code, response_header_collection, result, res, url); } unique_ptr Delete(DeleteRequestInfo &info) override { auto curl_headers = TransformHeadersForCurl(info.headers); + // transform parameters + auto url = info.url; + if (!info.params.extra_headers.empty()) { + auto curl_params = TransformParamsCurl(info.params); + url += "?" + curl_params; + } CURLcode res; std::string result; @@ -386,13 +372,7 @@ class HTTPFSClient : public HTTPClient { // Get HTTP response status code uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - // Construct HTTPResponse - auto status_code = HTTPStatusCode(response_code); - auto return_result = make_uniq(status_code); - return_result->headers = response_header_collection.header_collection.back(); - return_result->url = info.url; - return return_result; - + return TransformResponseCurl(response_code, response_header_collection, result, res, url); } unique_ptr Post(PostRequestInfo &info) override { @@ -401,25 +381,15 @@ class HTTPFSClient : public HTTPClient { state->total_bytes_sent += info.buffer_in_len; } - // // We use a custom Request method here, because there is no Post call with a contentreceiver in httplib - // duckdb_httplib_openssl::Request req; - // req.method = "POST"; - // req.path = info.path; - // req.headers = TransformHeaders(info.headers, info.params); - // req.headers.emplace("Content-Type", "application/octet-stream"); - // req.content_receiver = [&](const char *data, size_t data_length, uint64_t /*offset*/, - // uint64_t /*total_length*/) { - // if (state) { - // state->total_bytes_received += data_length; - // } - // info.buffer_out += string(data, data_length); - // return true; - // }; - // req.body.assign(const_char_ptr_cast(info.buffer_in), info.buffer_in_len); - // return TransformResult(client->send(req)); - auto curl_headers = TransformHeadersForCurl(info.headers); - curl_headers.Add("Content-Type: application/octet-stream"); + const string content_type = "Content-Type: application/octet-stream"; + curl_headers.Add(content_type.c_str()); + // transform parameters + auto url = info.url; + if (!info.params.extra_headers.empty()) { + auto curl_params = TransformParamsCurl(info.params); + url += "?" + curl_params; + } CURLcode res; std::string result; @@ -434,7 +404,7 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_POSTFIELDSIZE, info.buffer_in_len); // Follow redirects - // TODO: follow redirects for POST? + // TODO: should we follow redirects for POST? curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); // Set write function to collect response body @@ -461,14 +431,9 @@ class HTTPFSClient : public HTTPClient { } uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - info.buffer_out = result; // Construct HTTPResponse - auto status_code = HTTPStatusCode(response_code); - auto return_result = make_uniq(status_code); - return_result->headers = response_header_collection.header_collection.back(); - return_result->url = info.url; - return return_result; + return TransformResponseCurl(response_code, response_header_collection, result, res, url); } private: @@ -523,6 +488,23 @@ class HTTPFSClient : public HTTPClient { return result; } + unique_ptr TransformResponseCurl(uint16_t response_code, HeaderCollector &header_collection, string &body, CURLcode res, string &url) { + auto status_code = HTTPStatusCode(response_code); + auto response = make_uniq(status_code); + if (response_code >= 400) { + if (header_collection.header_collection.back().HasHeader("__RESPONSE_STATUS__")) { + response->request_error =header_collection.header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); + } else { + response->request_error = curl_easy_strerror(res); + } + return response; + } + response->body = body; + response->url = url; + response->headers = header_collection.header_collection.back(); + return response; + } + unique_ptr TransformResult(duckdb_httplib_openssl::Result &&res) { if (res.error() == duckdb_httplib_openssl::Error::Success) { auto &response = res.value(); diff --git a/test/sql/secret/secret_refresh.test b/test/sql/secret/secret_refresh.test index 85c8738a..35e27356 100644 --- a/test/sql/secret/secret_refresh.test +++ b/test/sql/secret/secret_refresh.test @@ -81,7 +81,7 @@ CREATE SECRET s1 ( statement error FROM "s3://test-bucket/test-file.parquet" ---- -HTTP 403 +IO Error: HTTP/1.1 403 Forbidden query I SELECT message[0:46] FROM duckdb_logs WHERE message like '%Successfully refreshed secret%' @@ -121,7 +121,7 @@ set s3_access_key_id='bogus' statement error FROM "s3://test-bucket/test-file.parquet" ---- -HTTP 403 +IO Error: HTTP/1.1 403 Forbidden # -> log empty query II From 88f96fc9571e3a51b057e62c3d1b2a2793367345 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 15:18:08 +0200 Subject: [PATCH 12/78] remove all running httplib.hpp code --- extension/httpfs/httpfs_client.cpp | 85 +++++++----------------------- 1 file changed, 20 insertions(+), 65 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 769d877c..8ea99c2a 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -4,8 +4,7 @@ #define CPPHTTPLIB_OPENSSL_SUPPORT #include -// #include -#include "httplib.hpp" +#include #include "duckdb/common/exception/http_exception.hpp" namespace duckdb { @@ -106,30 +105,6 @@ CURLHandle::~CURLHandle() { class HTTPFSClient : public HTTPClient { public: HTTPFSClient(HTTPFSParams &http_params, const string &proto_host_port) { - client = make_uniq(proto_host_port); - client->set_follow_location(true); - client->set_keep_alive(http_params.keep_alive); - if (!http_params.ca_cert_file.empty()) { - client->set_ca_cert_path(http_params.ca_cert_file.c_str()); - } - client->enable_server_certificate_verification(http_params.enable_server_cert_verification); - client->set_write_timeout(http_params.timeout, http_params.timeout_usec); - client->set_read_timeout(http_params.timeout, http_params.timeout_usec); - client->set_connection_timeout(http_params.timeout, http_params.timeout_usec); - client->set_decompress(false); - if (!http_params.bearer_token.empty()) { - client->set_bearer_token_auth(http_params.bearer_token.c_str()); - } - - if (!http_params.http_proxy.empty()) { - client->set_proxy(http_params.http_proxy, http_params.http_proxy_port); - - if (!http_params.http_proxy_username.empty()) { - client->set_proxy_basic_auth(http_params.http_proxy_username, http_params.http_proxy_password); - } - } - state = http_params.state; - // initializing curl auto bearer_token = ""; if (!http_params.bearer_token.empty()) { @@ -437,16 +412,6 @@ class HTTPFSClient : public HTTPClient { } private: - duckdb_httplib_openssl::Headers TransformHeaders(const HTTPHeaders &header_map, const HTTPParams ¶ms) { - duckdb_httplib_openssl::Headers headers; - for (auto &entry : header_map) { - headers.insert(entry); - } - for (auto &entry : params.extra_headers) { - headers.insert(entry); - } - return headers; - } CURLRequestHeaders TransformHeadersForCurl(const HTTPHeaders &header_map) { std::vector headers; @@ -477,17 +442,6 @@ class HTTPFSClient : public HTTPClient { return result; } - unique_ptr TransformResponse(const duckdb_httplib_openssl::Response &response) { - auto status_code = HTTPUtil::ToStatusCode(response.status); - auto result = make_uniq(status_code); - result->body = response.body; - result->reason = response.reason; - for (auto &entry : response.headers) { - result->headers.Insert(entry.first, entry.second); - } - return result; - } - unique_ptr TransformResponseCurl(uint16_t response_code, HeaderCollector &header_collection, string &body, CURLcode res, string &url) { auto status_code = HTTPStatusCode(response_code); auto response = make_uniq(status_code); @@ -505,19 +459,7 @@ class HTTPFSClient : public HTTPClient { return response; } - unique_ptr TransformResult(duckdb_httplib_openssl::Result &&res) { - if (res.error() == duckdb_httplib_openssl::Error::Success) { - auto &response = res.value(); - return TransformResponse(response); - } else { - auto result = make_uniq(HTTPStatusCode::INVALID); - result->request_error = to_string(res.error()); - return result; - } - } - private: - unique_ptr client; unique_ptr curl; CURLRequestHeaders request_headers; optional_ptr state; @@ -529,14 +471,27 @@ unique_ptr HTTPFSUtil::InitializeClient(HTTPParams &http_params, con } unordered_map HTTPFSUtil::ParseGetParameters(const string &text) { - duckdb_httplib_openssl::Params query_params; - duckdb_httplib_openssl::detail::parse_query_text(text, query_params); + unordered_map params; + + auto pos = text.find('?'); + if (pos == std::string::npos) return params; + + std::string query = text.substr(pos + 1); + std::stringstream ss(query); + std::string item; - unordered_map result; - for (auto &entry : query_params) { - result.emplace(std::move(entry.first), std::move(entry.second)); + while (std::getline(ss, item, '&')) { + auto eq_pos = item.find('='); + if (eq_pos != std::string::npos) { + std::string key = item.substr(0, eq_pos); + std::string value = StringUtil::URLDecode(item.substr(eq_pos + 1)); + params[key] = value; + } else { + params[item] = ""; // key with no value + } } - return result; + + return params; } string HTTPFSUtil::GetName() const { From 04cb2a8fe5fda42627c4fff7b3f1547a82109c64 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 15:49:21 +0200 Subject: [PATCH 13/78] add back in state initialization --- extension/httpfs/httpfs_client.cpp | 16 ++++++++-------- test/sql/metadata_stats.test | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 8 deletions(-) create mode 100644 test/sql/metadata_stats.test diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 8ea99c2a..8126dcf7 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -110,6 +110,7 @@ class HTTPFSClient : public HTTPClient { if (!http_params.bearer_token.empty()) { bearer_token = http_params.bearer_token.c_str(); } + state = http_params.state; curl = make_uniq(bearer_token, SelectCURLCertPath()); } @@ -234,17 +235,12 @@ class HTTPFSClient : public HTTPClient { curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); return TransformResponseCurl(response_code, response_header_collection, result, res, url); - - // Construct HTTPResponse - // auto status_code = HTTPStatusCode(response_code); - // auto return_result = make_uniq(status_code); - // return_result->body = ""; - // return_result->headers = response_header_collection.header_collection.back(); - // return_result->url = info.url; - // return return_result; } unique_ptr Head(HeadRequestInfo &info) override { + if (state) { + state->head_count++; + } auto curl_headers = TransformHeadersForCurl(info.headers); // transform parameters @@ -298,6 +294,10 @@ class HTTPFSClient : public HTTPClient { } unique_ptr Delete(DeleteRequestInfo &info) override { + if (state) { + state->delete_count++; + } + auto curl_headers = TransformHeadersForCurl(info.headers); // transform parameters auto url = info.url; diff --git a/test/sql/metadata_stats.test b/test/sql/metadata_stats.test new file mode 100644 index 00000000..ba5ddfd2 --- /dev/null +++ b/test/sql/metadata_stats.test @@ -0,0 +1,17 @@ +# name: test/sql/metadata_stats.test +# description: Test getting metadata stats +# group: [] + +require parquet + +require httpfs + +statement ok +SET force_download=false; + +mode output_result + +query II +explain analyze SELECT id, first_name, last_name, email FROM PARQUET_SCAN('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/userdata1.parquet') +---- +analyzed_plan :.*GET: 2.* From 02e8ac1a14ecf16837fdb8801986db0d86d719f5 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 16:02:00 +0200 Subject: [PATCH 14/78] remove output result --- test/sql/metadata_stats.test | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/sql/metadata_stats.test b/test/sql/metadata_stats.test index ba5ddfd2..f15955be 100644 --- a/test/sql/metadata_stats.test +++ b/test/sql/metadata_stats.test @@ -9,8 +9,6 @@ require httpfs statement ok SET force_download=false; -mode output_result - query II explain analyze SELECT id, first_name, last_name, email FROM PARQUET_SCAN('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/userdata1.parquet') ---- From c92998cdd92425be70d79230d1c7911c07679f99 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 17:35:46 +0200 Subject: [PATCH 15/78] add json read test --- test/sql/metadata_stats.test | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/sql/metadata_stats.test b/test/sql/metadata_stats.test index f15955be..4bc6c074 100644 --- a/test/sql/metadata_stats.test +++ b/test/sql/metadata_stats.test @@ -6,6 +6,12 @@ require parquet require httpfs +require json + +# Test Force download with server that doesn't want to give us the head +statement ok +FROM read_json('https://api.spring.io/projects/spring-boot/generations') + statement ok SET force_download=false; From fa93e103159255f12ed8f4ff95ec2b753d6d751b Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 17:36:21 +0200 Subject: [PATCH 16/78] add core extensions var --- .github/workflows/MinioTests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/MinioTests.yml b/.github/workflows/MinioTests.yml index b6d9dc06..6f431e1d 100644 --- a/.github/workflows/MinioTests.yml +++ b/.github/workflows/MinioTests.yml @@ -18,6 +18,7 @@ jobs: AWS_SECRET_ACCESS_KEY: minio_duckdb_user_password DUCKDB_S3_ENDPOINT: duckdb-minio.com:9000 DUCKDB_S3_USE_SSL: false + CORE_EXTENSIONS: 'parquet;json' GEN: ninja VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake VCPKG_TARGET_TRIPLET: x64-linux From 499de9054e8cacad7e11c4596613462f510b93bf Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 17:36:51 +0200 Subject: [PATCH 17/78] always add headers, and set many more curl options on initialize --- extension/httpfs/httpfs_client.cpp | 107 ++++++++++++++--------------- 1 file changed, 50 insertions(+), 57 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 8126dcf7..b970e408 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -105,22 +105,56 @@ CURLHandle::~CURLHandle() { class HTTPFSClient : public HTTPClient { public: HTTPFSClient(HTTPFSParams &http_params, const string &proto_host_port) { - // initializing curl auto bearer_token = ""; if (!http_params.bearer_token.empty()) { bearer_token = http_params.bearer_token.c_str(); } state = http_params.state; curl = make_uniq(bearer_token, SelectCURLCertPath()); - } + // set curl options + // follow redirects + curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); + + // Curl re-uses connections by default + if (!http_params.keep_alive) { + curl_easy_setopt(*curl, CURLOPT_FORBID_REUSE, 1L); + } + + // client->enable_server_certificate_verification(http_params.enable_server_cert_verification); + if (http_params.enable_server_cert_verification) { + curl_easy_setopt(*curl, CURLOPT_SSL_VERIFYPEER, 1L); // Verify the cert + curl_easy_setopt(*curl, CURLOPT_SSL_VERIFYHOST, 2L); // Verify that the cert matches the hostname + } + + // TODO: no global write timeout option, but you could put customize a timeout in the write functions + // or handle use CURLOPT_XFERINFOFUNCTION (progress callback) with CURLOPT_TIMEOUT_MS + // we could also set CURLOPT_LOW_SPEED_LIMIT and timeout if the speed is too low for + // too long. + + // set read timeout + curl_easy_setopt(*curl, CURLOPT_TIMEOUT, http_params.timeout); + // set connection timeout + curl_easy_setopt(*curl, CURLOPT_CONNECTTIMEOUT, http_params.timeout); + // accept content as-is (i.e no decompressing) + curl_easy_setopt(*curl, CURLOPT_ACCEPT_ENCODING, "identity"); + + if (!http_params.http_proxy.empty()) { + curl_easy_setopt(*curl, CURLOPT_PROXY, StringUtil::Format("%s:%s", http_params.http_proxy, http_params.http_proxy_port).c_str()); + + if (!http_params.http_proxy_username.empty()) { + curl_easy_setopt(*curl, CURLOPT_PROXYUSERNAME, http_params.http_proxy_username.c_str()); + curl_easy_setopt(*curl, CURLOPT_PROXYPASSWORD, http_params.http_proxy_password.c_str()); + } + } + } unique_ptr Get(GetRequestInfo &info) override { if (state) { state->get_count++; } - auto curl_headers = TransformHeadersForCurl(info.headers); + auto curl_headers = TransformHeadersCurl(info.headers); auto url = info.url; if (!info.params.extra_headers.empty()) { auto curl_params = TransformParamsCurl(info.params); @@ -134,8 +168,7 @@ class HTTPFSClient : public HTTPClient { // If the same handle served a HEAD request, we must set NOBODY back to 0L to request content again curl_easy_setopt(*curl, CURLOPT_NOBODY, 0L); - // follow redirects - curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(*curl, CURLOPT_URL, url.c_str()); // write response data curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); @@ -144,18 +177,10 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); - if (curl_headers) { - curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); - } + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); res = curl->Execute(); } - // DUCKDB_LOG_DEBUG(context, "iceberg.Catalog.Curl.HTTPRequest", "GET %s (curl code '%s')", url, - // curl_easy_strerror(res)); - if (res != CURLcode::CURLE_OK) { - string error = curl_easy_strerror(res); - throw HTTPException(StringUtil::Format("Curl GET Request to '%s' failed with error: '%s'", info.url, error)); - } uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); @@ -181,7 +206,7 @@ class HTTPFSClient : public HTTPClient { state->total_bytes_sent += info.buffer_in_len; } - auto curl_headers = TransformHeadersForCurl(info.headers); + auto curl_headers = TransformHeadersCurl(info.headers); // Add content type header from info curl_headers.Add("Content-Type: " + info.content_type); // transform parameters @@ -217,20 +242,12 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); // Apply headers - if (curl_headers) { - curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); - } + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); // Execute the request res = curl->Execute(); } - // Check response - if (res != CURLcode::CURLE_OK) { - std::string error = curl_easy_strerror(res); - throw HTTPException(StringUtil::Format("Curl PUT Request to '%s' failed with error: '%s'", info.url, error)); - } - uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); @@ -242,7 +259,7 @@ class HTTPFSClient : public HTTPClient { state->head_count++; } - auto curl_headers = TransformHeadersForCurl(info.headers); + auto curl_headers = TransformHeadersCurl(info.headers); // transform parameters auto url = info.url; if (!info.params.extra_headers.empty()) { @@ -257,7 +274,6 @@ class HTTPFSClient : public HTTPClient { { // Set URL curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); - // curl_easy_setopt(*curl, CURLOPT_VERBOSE, 1L); // Perform HEAD request instead of GET curl_easy_setopt(*curl, CURLOPT_NOBODY, 1L); @@ -275,19 +291,13 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); // Add headers if any - if (curl_headers) { - curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); - } + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); // Execute HEAD request res = curl->Execute(); } - // Handle result - if (res != CURLcode::CURLE_OK) { - string error = curl_easy_strerror(res); - throw HTTPException(StringUtil::Format("Curl HEAD Request to '%s' failed with error: '%s'", info.url, error)); - } + uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); return TransformResponseCurl(response_code, response_header_collection, result, res, url); @@ -298,7 +308,7 @@ class HTTPFSClient : public HTTPClient { state->delete_count++; } - auto curl_headers = TransformHeadersForCurl(info.headers); + auto curl_headers = TransformHeadersCurl(info.headers); // transform parameters auto url = info.url; if (!info.params.extra_headers.empty()) { @@ -310,7 +320,6 @@ class HTTPFSClient : public HTTPClient { std::string result; HeaderCollector response_header_collection; - // TODO: some delete requests require a BODY { // Set URL curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); @@ -330,20 +339,12 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); // Add headers if any - if (curl_headers) { - curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); - } + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); // Execute DELETE request res = curl->Execute(); } - // Handle result - if (res != CURLcode::CURLE_OK) { - std::string error = curl_easy_strerror(res); - throw HTTPException(StringUtil::Format("Curl DELETE Request to '%s' failed with error: '%s'", info.url, error)); - } - // Get HTTP response status code uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); @@ -356,7 +357,7 @@ class HTTPFSClient : public HTTPClient { state->total_bytes_sent += info.buffer_in_len; } - auto curl_headers = TransformHeadersForCurl(info.headers); + auto curl_headers = TransformHeadersCurl(info.headers); const string content_type = "Content-Type: application/octet-stream"; curl_headers.Add(content_type.c_str()); // transform parameters @@ -391,19 +392,12 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); // Add headers if any - if (curl_headers) { - curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); - } + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); // Execute POST request res = curl->Execute(); } - // Handle result - if (res != CURLcode::CURLE_OK) { - string error = curl_easy_strerror(res); - throw HTTPException(StringUtil::Format("Curl POST Request to '%s' failed with error: '%s'", info.url, error)); - } uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); info.buffer_out = result; @@ -413,7 +407,7 @@ class HTTPFSClient : public HTTPClient { private: - CURLRequestHeaders TransformHeadersForCurl(const HTTPHeaders &header_map) { + CURLRequestHeaders TransformHeadersCurl(const HTTPHeaders &header_map) { std::vector headers; for (auto &entry : header_map) { const std::string new_header = entry.first + ": " + entry.second; @@ -445,7 +439,7 @@ class HTTPFSClient : public HTTPClient { unique_ptr TransformResponseCurl(uint16_t response_code, HeaderCollector &header_collection, string &body, CURLcode res, string &url) { auto status_code = HTTPStatusCode(response_code); auto response = make_uniq(status_code); - if (response_code >= 400) { + if (res != CURLcode::CURLE_OK) { if (header_collection.header_collection.back().HasHeader("__RESPONSE_STATUS__")) { response->request_error =header_collection.header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); } else { @@ -461,7 +455,6 @@ class HTTPFSClient : public HTTPClient { private: unique_ptr curl; - CURLRequestHeaders request_headers; optional_ptr state; }; From 9d8f71e241b60f4bf320781132be07b01da3379a Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 17:37:24 +0200 Subject: [PATCH 18/78] update extension ci tools --- extension-ci-tools | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension-ci-tools b/extension-ci-tools index cca140d4..36959273 160000 --- a/extension-ci-tools +++ b/extension-ci-tools @@ -1 +1 @@ -Subproject commit cca140d4cc47f3f3e40f29b49c305bd92845771f +Subproject commit 36959273c1f64ebfe01e644b779ca77fd52aaf0f From e0cce43741acfa0ff3d06ece6c9b416d2526b129 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Mon, 19 May 2025 10:09:56 +0200 Subject: [PATCH 19/78] fix test add comments --- test/sql/secret/secret_refresh.test | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/sql/secret/secret_refresh.test b/test/sql/secret/secret_refresh.test index 35e27356..696279e0 100644 --- a/test/sql/secret/secret_refresh.test +++ b/test/sql/secret/secret_refresh.test @@ -78,10 +78,11 @@ CREATE SECRET s1 ( REFRESH 1 ) +# TODO: add FORBIDDEN back in once enum util for http status codes is merged into httpfs statement error FROM "s3://test-bucket/test-file.parquet" ---- -IO Error: HTTP/1.1 403 Forbidden +HTTP Error: Unable to connect to URL "s3://test-bucket/test-file.parquet": 403 () query I SELECT message[0:46] FROM duckdb_logs WHERE message like '%Successfully refreshed secret%' @@ -118,10 +119,11 @@ statement ok set s3_access_key_id='bogus' # Without secret this query will fail, but since there are no suitable secrets, no refresh attempt will be made +# TODO: add FORBIDDEN in once enum util for http status codes is merged into httpfs statement error FROM "s3://test-bucket/test-file.parquet" ---- -IO Error: HTTP/1.1 403 Forbidden +HTTP Error: Unable to connect to URL "s3://test-bucket/test-file.parquet": 403 () # -> log empty query II From 9b9058789220c458cfca5115bc2e4edd40076dcc Mon Sep 17 00:00:00 2001 From: Tmonster Date: Tue, 20 May 2025 10:39:39 +0200 Subject: [PATCH 20/78] declare header object on heap not on stack --- extension/httpfs/httpfs_client.cpp | 57 ++++++++++++++++++------------ 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index b970e408..d0903097 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -110,6 +110,8 @@ class HTTPFSClient : public HTTPClient { bearer_token = http_params.bearer_token.c_str(); } state = http_params.state; + // init curl globally, + // curl_global_init(CURL_GLOBAL_DEFAULT); curl = make_uniq(bearer_token, SelectCURLCertPath()); // set curl options @@ -149,6 +151,11 @@ class HTTPFSClient : public HTTPClient { } } + ~HTTPFSClient() { + // init curl globally, + // curl_global_cleanup(); + } + unique_ptr Get(GetRequestInfo &info) override { if (state) { state->get_count++; @@ -163,7 +170,7 @@ class HTTPFSClient : public HTTPClient { CURLcode res; std::string result; - HeaderCollector response_header_collection; + auto response_header_collection = make_uniq(); { // If the same handle served a HEAD request, we must set NOBODY back to 0L to request content again curl_easy_setopt(*curl, CURLOPT_NOBODY, 0L); @@ -175,7 +182,7 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); // write response headers (different header collection for each redirect) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, response_header_collection.get()); curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); res = curl->Execute(); @@ -185,8 +192,8 @@ class HTTPFSClient : public HTTPClient { curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); idx_t bytes_received = 0; - if (response_header_collection.header_collection.back().HasHeader("content-length")) { - bytes_received = std::stoi(response_header_collection.header_collection.back().GetHeaderValue("content-length")); + if (response_header_collection && response_header_collection->header_collection.back().HasHeader("content-length")) { + bytes_received = std::stoi(response_header_collection->header_collection.back().GetHeaderValue("content-length")); D_ASSERT(bytes_received == result.size()); } else { bytes_received = result.size(); @@ -197,7 +204,7 @@ class HTTPFSClient : public HTTPClient { const char* data = result.c_str(); info.content_handler(const_data_ptr_cast(data), bytes_received); - return TransformResponseCurl(response_code, response_header_collection, result, res, url); + return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, url); } unique_ptr Put(PutRequestInfo &info) override { @@ -218,7 +225,7 @@ class HTTPFSClient : public HTTPClient { CURLcode res; std::string result; - HeaderCollector response_header_collection; + auto response_header_collection = make_uniq(); { curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); @@ -239,7 +246,7 @@ class HTTPFSClient : public HTTPClient { // Capture response headers curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, response_header_collection.get()); // Apply headers curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -251,7 +258,7 @@ class HTTPFSClient : public HTTPClient { uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - return TransformResponseCurl(response_code, response_header_collection, result, res, url); + return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, url); } unique_ptr Head(HeadRequestInfo &info) override { @@ -269,7 +276,7 @@ class HTTPFSClient : public HTTPClient { CURLcode res; std::string result; - HeaderCollector response_header_collection; + auto response_header_collection = make_uniq(); { // Set URL @@ -288,7 +295,7 @@ class HTTPFSClient : public HTTPClient { // Collect response headers (multiple header blocks for redirects) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, response_header_collection.get()); // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -300,7 +307,7 @@ class HTTPFSClient : public HTTPClient { uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - return TransformResponseCurl(response_code, response_header_collection, result, res, url); + return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, url); } unique_ptr Delete(DeleteRequestInfo &info) override { @@ -318,7 +325,7 @@ class HTTPFSClient : public HTTPClient { CURLcode res; std::string result; - HeaderCollector response_header_collection; + auto response_header_collection = make_uniq(); { // Set URL @@ -336,7 +343,7 @@ class HTTPFSClient : public HTTPClient { // Collect response headers (multiple header blocks for redirects) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, response_header_collection.get()); // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -348,7 +355,7 @@ class HTTPFSClient : public HTTPClient { // Get HTTP response status code uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - return TransformResponseCurl(response_code, response_header_collection, result, res, url); + return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, url); } unique_ptr Post(PostRequestInfo &info) override { @@ -369,7 +376,7 @@ class HTTPFSClient : public HTTPClient { CURLcode res; std::string result; - HeaderCollector response_header_collection; + auto response_header_collection = make_uniq(); { curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); @@ -389,7 +396,7 @@ class HTTPFSClient : public HTTPClient { // Collect response headers (multiple header blocks for redirects) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, response_header_collection.get()); // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -402,7 +409,7 @@ class HTTPFSClient : public HTTPClient { curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); info.buffer_out = result; // Construct HTTPResponse - return TransformResponseCurl(response_code, response_header_collection, result, res, url); + return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, url); } private: @@ -436,20 +443,24 @@ class HTTPFSClient : public HTTPClient { return result; } - unique_ptr TransformResponseCurl(uint16_t response_code, HeaderCollector &header_collection, string &body, CURLcode res, string &url) { + unique_ptr TransformResponseCurl(uint16_t response_code, unique_ptr header_collection, string &body, CURLcode res, string &url) { auto status_code = HTTPStatusCode(response_code); auto response = make_uniq(status_code); - if (res != CURLcode::CURLE_OK) { - if (header_collection.header_collection.back().HasHeader("__RESPONSE_STATUS__")) { - response->request_error =header_collection.header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); + if (header_collection && res != CURLcode::CURLE_OK) { + if (!header_collection->header_collection.empty() && header_collection->header_collection.back().HasHeader("__RESPONSE_STATUS__")) { + response->request_error =header_collection->header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); } else { response->request_error = curl_easy_strerror(res); } return response; } response->body = body; - response->url = url; - response->headers = header_collection.header_collection.back(); + response->url= url; + if (header_collection && !header_collection->header_collection.empty()) { + for (auto &header : header_collection->header_collection.back()) { + response->headers.Insert(header.first, header.second); + } + } return response; } From 5b3d9884a756ca2727179a32f5fd161794638bd2 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Tue, 20 May 2025 11:24:41 +0200 Subject: [PATCH 21/78] also uniq string --- extension/httpfs/httpfs_client.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index d0903097..804af0ba 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -268,10 +268,10 @@ class HTTPFSClient : public HTTPClient { auto curl_headers = TransformHeadersCurl(info.headers); // transform parameters - auto url = info.url; + auto url = make_uniq(info.url); if (!info.params.extra_headers.empty()) { auto curl_params = TransformParamsCurl(info.params); - url += "?" + curl_params; + *url += "?" + curl_params; } CURLcode res; @@ -280,7 +280,7 @@ class HTTPFSClient : public HTTPClient { { // Set URL - curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + curl_easy_setopt(*curl, CURLOPT_URL, url->c_str()); // Perform HEAD request instead of GET curl_easy_setopt(*curl, CURLOPT_NOBODY, 1L); @@ -303,11 +303,14 @@ class HTTPFSClient : public HTTPClient { // Execute HEAD request res = curl->Execute(); } - + Printer::Print("executed HEad"); + Printer::Print("url is " + *url); + Printer::Print("body is " + result); uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, url); + Printer::Print("start transforming"); + return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, *url); } unique_ptr Delete(DeleteRequestInfo &info) override { @@ -454,7 +457,9 @@ class HTTPFSClient : public HTTPClient { } return response; } + Printer::Print("assigning body"); response->body = body; + Printer::Print("assigning url"); response->url= url; if (header_collection && !header_collection->header_collection.empty()) { for (auto &header : header_collection->header_collection.back()) { From ccbf00a26aeae737fde04a09525c7e12fc7f4183 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Tue, 20 May 2025 11:34:11 +0200 Subject: [PATCH 22/78] some more debugging statements --- extension/httpfs/httpfs_client.cpp | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 804af0ba..d2ff8c45 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -102,6 +102,12 @@ CURLHandle::~CURLHandle() { } +struct RequestInfo { + string url; + string body; +}; + + class HTTPFSClient : public HTTPClient { public: HTTPFSClient(HTTPFSParams &http_params, const string &proto_host_port) { @@ -267,20 +273,20 @@ class HTTPFSClient : public HTTPClient { } auto curl_headers = TransformHeadersCurl(info.headers); + auto request_info = make_uniq(); + request_info->url = info.url; // transform parameters - auto url = make_uniq(info.url); if (!info.params.extra_headers.empty()) { auto curl_params = TransformParamsCurl(info.params); - *url += "?" + curl_params; + request_info->url += "?" + curl_params; } CURLcode res; - std::string result; auto response_header_collection = make_uniq(); { // Set URL - curl_easy_setopt(*curl, CURLOPT_URL, url->c_str()); + curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); // Perform HEAD request instead of GET curl_easy_setopt(*curl, CURLOPT_NOBODY, 1L); @@ -291,7 +297,7 @@ class HTTPFSClient : public HTTPClient { // set write function to collect body — no body expected, so safe to omit curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, request_info->body.c_str()); // Collect response headers (multiple header blocks for redirects) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); @@ -304,13 +310,13 @@ class HTTPFSClient : public HTTPClient { res = curl->Execute(); } Printer::Print("executed HEad"); - Printer::Print("url is " + *url); - Printer::Print("body is " + result); + Printer::Print("url is " + request_info->url); + Printer::Print("body is " + request_info->body); uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); Printer::Print("start transforming"); - return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, *url); + return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, request_info->body, res, request_info->url); } unique_ptr Delete(DeleteRequestInfo &info) override { From de6d6065bccaddb086d5cd9a09d67edc258e4da8 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Tue, 20 May 2025 11:44:17 +0200 Subject: [PATCH 23/78] move a lot of the request response info to the heap so it does not get overwritten on the stack --- extension/httpfs/httpfs_client.cpp | 80 +++++++++++++++--------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index d2ff8c45..eb8a3633 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -105,6 +105,7 @@ CURLHandle::~CURLHandle() { struct RequestInfo { string url; string body; + HeaderCollector header_collection; }; @@ -168,27 +169,27 @@ class HTTPFSClient : public HTTPClient { } auto curl_headers = TransformHeadersCurl(info.headers); - auto url = info.url; + auto request_info = make_uniq(); + request_info->url = info.url; if (!info.params.extra_headers.empty()) { auto curl_params = TransformParamsCurl(info.params); - url += "?" + curl_params; + request_info->url += "?" + curl_params; } CURLcode res; - std::string result; auto response_header_collection = make_uniq(); { // If the same handle served a HEAD request, we must set NOBODY back to 0L to request content again curl_easy_setopt(*curl, CURLOPT_NOBODY, 0L); - curl_easy_setopt(*curl, CURLOPT_URL, url.c_str()); + curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); // write response data curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, request_info->body.c_str()); // write response headers (different header collection for each redirect) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, response_header_collection.get()); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); res = curl->Execute(); @@ -202,15 +203,15 @@ class HTTPFSClient : public HTTPClient { bytes_received = std::stoi(response_header_collection->header_collection.back().GetHeaderValue("content-length")); D_ASSERT(bytes_received == result.size()); } else { - bytes_received = result.size(); + bytes_received = request_info->body.size(); } if (state) { state->total_bytes_received += bytes_received; } - const char* data = result.c_str(); + const char* data = request_info->body.c_str(); info.content_handler(const_data_ptr_cast(data), bytes_received); - return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, url); + return TransformResponseCurl(response_code, request_info->header_collection, request_info->body, res, request_info->url); } unique_ptr Put(PutRequestInfo &info) override { @@ -220,21 +221,21 @@ class HTTPFSClient : public HTTPClient { } auto curl_headers = TransformHeadersCurl(info.headers); + auto request_info = make_uniq(); // Add content type header from info curl_headers.Add("Content-Type: " + info.content_type); // transform parameters - auto url = info.url; + request_info->url = info.url; if (!info.params.extra_headers.empty()) { auto curl_params = TransformParamsCurl(info.params); - url += "?" + curl_params; + request_info->url += "?" + curl_params; } CURLcode res; - std::string result; auto response_header_collection = make_uniq(); { - curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); // Perform PUT curl_easy_setopt(*curl, CURLOPT_CUSTOMREQUEST, "PUT"); @@ -248,11 +249,11 @@ class HTTPFSClient : public HTTPClient { // Capture response body curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, request_info->body.c_str()); // Capture response headers curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, response_header_collection.get()); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); // Apply headers curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -264,7 +265,7 @@ class HTTPFSClient : public HTTPClient { uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, url); + return TransformResponseCurl(response_code, request_info->header_collection, request_info->body, res, request_info->url); } unique_ptr Head(HeadRequestInfo &info) override { @@ -282,7 +283,6 @@ class HTTPFSClient : public HTTPClient { } CURLcode res; - auto response_header_collection = make_uniq(); { // Set URL @@ -301,7 +301,7 @@ class HTTPFSClient : public HTTPClient { // Collect response headers (multiple header blocks for redirects) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, response_header_collection.get()); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -316,7 +316,7 @@ class HTTPFSClient : public HTTPClient { uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); Printer::Print("start transforming"); - return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, request_info->body, res, request_info->url); + return TransformResponseCurl(response_code, request_info->header_collection, request_info->body, res, request_info->url); } unique_ptr Delete(DeleteRequestInfo &info) override { @@ -325,20 +325,20 @@ class HTTPFSClient : public HTTPClient { } auto curl_headers = TransformHeadersCurl(info.headers); + auto request_info = make_uniq(); // transform parameters - auto url = info.url; + request_info->url = info.url; if (!info.params.extra_headers.empty()) { auto curl_params = TransformParamsCurl(info.params); - url += "?" + curl_params; + request_info->url += "?" + curl_params; } CURLcode res; - std::string result; auto response_header_collection = make_uniq(); { // Set URL - curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); // Set DELETE request method curl_easy_setopt(*curl, CURLOPT_CUSTOMREQUEST, "DELETE"); @@ -348,11 +348,11 @@ class HTTPFSClient : public HTTPClient { // Set write function to collect response body curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, request_info->body.c_str()); // Collect response headers (multiple header blocks for redirects) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, response_header_collection.get()); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -364,7 +364,7 @@ class HTTPFSClient : public HTTPClient { // Get HTTP response status code uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, url); + return TransformResponseCurl(response_code, request_info->header_collection, request_info->body, res, request_info->url); } unique_ptr Post(PostRequestInfo &info) override { @@ -374,21 +374,21 @@ class HTTPFSClient : public HTTPClient { } auto curl_headers = TransformHeadersCurl(info.headers); + auto request_info = make_uniq(); const string content_type = "Content-Type: application/octet-stream"; curl_headers.Add(content_type.c_str()); // transform parameters - auto url = info.url; + request_info->url = info.url; if (!info.params.extra_headers.empty()) { auto curl_params = TransformParamsCurl(info.params); - url += "?" + curl_params; + request_info->url += "?" + curl_params; } CURLcode res; - std::string result; auto response_header_collection = make_uniq(); { - curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); curl_easy_setopt(*curl, CURLOPT_POST, 1L); // Set POST body @@ -401,11 +401,11 @@ class HTTPFSClient : public HTTPClient { // Set write function to collect response body curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, request_info->body.c_str()); // Collect response headers (multiple header blocks for redirects) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, response_header_collection.get()); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -416,9 +416,9 @@ class HTTPFSClient : public HTTPClient { uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - info.buffer_out = result; + info.buffer_out = request_info->body; // Construct HTTPResponse - return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, url); + return TransformResponseCurl(response_code, request_info->header_collection, request_info->body, res, request_info->url); } private: @@ -452,12 +452,12 @@ class HTTPFSClient : public HTTPClient { return result; } - unique_ptr TransformResponseCurl(uint16_t response_code, unique_ptr header_collection, string &body, CURLcode res, string &url) { + unique_ptr TransformResponseCurl(uint16_t response_code, HeaderCollector header_collection, string &body, CURLcode res, string &url) { auto status_code = HTTPStatusCode(response_code); auto response = make_uniq(status_code); - if (header_collection && res != CURLcode::CURLE_OK) { - if (!header_collection->header_collection.empty() && header_collection->header_collection.back().HasHeader("__RESPONSE_STATUS__")) { - response->request_error =header_collection->header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); + if (res != CURLcode::CURLE_OK) { + if (!header_collection.header_collection.empty() && header_collection.header_collection.back().HasHeader("__RESPONSE_STATUS__")) { + response->request_error =header_collection.header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); } else { response->request_error = curl_easy_strerror(res); } @@ -467,8 +467,8 @@ class HTTPFSClient : public HTTPClient { response->body = body; Printer::Print("assigning url"); response->url= url; - if (header_collection && !header_collection->header_collection.empty()) { - for (auto &header : header_collection->header_collection.back()) { + if (!header_collection.header_collection.empty()) { + for (auto &header : header_collection.header_collection.back()) { response->headers.Insert(header.first, header.second); } } From 0c167fd27393a6fe788f703aa8888bcb38a83827 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Tue, 20 May 2025 13:53:29 +0200 Subject: [PATCH 24/78] this should work on the build now --- extension/httpfs/httpfs_client.cpp | 95 +++++++++++++----------------- 1 file changed, 40 insertions(+), 55 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index eb8a3633..8a93bd39 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -103,9 +103,9 @@ CURLHandle::~CURLHandle() { struct RequestInfo { - string url; - string body; - HeaderCollector header_collection; + string url = ""; + string body = ""; + uint16_t response_code = 0; }; @@ -176,8 +176,8 @@ class HTTPFSClient : public HTTPClient { request_info->url += "?" + curl_params; } + auto header_collector = make_uniq(); CURLcode res; - auto response_header_collection = make_uniq(); { // If the same handle served a HEAD request, we must set NOBODY back to 0L to request content again curl_easy_setopt(*curl, CURLOPT_NOBODY, 0L); @@ -186,22 +186,21 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); // write response data curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, request_info->body.c_str()); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); // write response headers (different header collection for each redirect) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, header_collector.get()); curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); res = curl->Execute(); } - uint16_t response_code = 0; - curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); idx_t bytes_received = 0; - if (response_header_collection && response_header_collection->header_collection.back().HasHeader("content-length")) { - bytes_received = std::stoi(response_header_collection->header_collection.back().GetHeaderValue("content-length")); - D_ASSERT(bytes_received == result.size()); + if (header_collector && header_collector->header_collection.empty() && header_collector->header_collection.back().HasHeader("content-length")) { + bytes_received = std::stoi(header_collector->header_collection.back().GetHeaderValue("content-length")); + D_ASSERT(bytes_received == request_info->body.size()); } else { bytes_received = request_info->body.size(); } @@ -211,7 +210,7 @@ class HTTPFSClient : public HTTPClient { const char* data = request_info->body.c_str(); info.content_handler(const_data_ptr_cast(data), bytes_received); - return TransformResponseCurl(response_code, request_info->header_collection, request_info->body, res, request_info->url); + return TransformResponseCurl(std::move(request_info), std::move(header_collector), res); } unique_ptr Put(PutRequestInfo &info) override { @@ -231,9 +230,8 @@ class HTTPFSClient : public HTTPClient { request_info->url += "?" + curl_params; } + auto header_collector = make_uniq(); CURLcode res; - auto response_header_collection = make_uniq(); - { curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); @@ -249,11 +247,11 @@ class HTTPFSClient : public HTTPClient { // Capture response body curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, request_info->body.c_str()); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); // Capture response headers curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, header_collector.get()); // Apply headers curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -262,10 +260,9 @@ class HTTPFSClient : public HTTPClient { res = curl->Execute(); } - uint16_t response_code = 0; - curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); - return TransformResponseCurl(response_code, request_info->header_collection, request_info->body, res, request_info->url); + return TransformResponseCurl(std::move(request_info), std::move(header_collector), res); } unique_ptr Head(HeadRequestInfo &info) override { @@ -283,7 +280,7 @@ class HTTPFSClient : public HTTPClient { } CURLcode res; - + auto header_collector = make_uniq(); { // Set URL curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); @@ -297,11 +294,11 @@ class HTTPFSClient : public HTTPClient { // set write function to collect body — no body expected, so safe to omit curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, request_info->body.c_str()); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); // Collect response headers (multiple header blocks for redirects) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, header_collector.get()); // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -309,14 +306,9 @@ class HTTPFSClient : public HTTPClient { // Execute HEAD request res = curl->Execute(); } - Printer::Print("executed HEad"); - Printer::Print("url is " + request_info->url); - Printer::Print("body is " + request_info->body); - - uint16_t response_code = 0; - curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - Printer::Print("start transforming"); - return TransformResponseCurl(response_code, request_info->header_collection, request_info->body, res, request_info->url); + + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); + return TransformResponseCurl(std::move(request_info), std::move(header_collector), res); } unique_ptr Delete(DeleteRequestInfo &info) override { @@ -334,8 +326,7 @@ class HTTPFSClient : public HTTPClient { } CURLcode res; - auto response_header_collection = make_uniq(); - + auto header_collector = make_uniq(); { // Set URL curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); @@ -348,11 +339,11 @@ class HTTPFSClient : public HTTPClient { // Set write function to collect response body curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, request_info->body.c_str()); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); // Collect response headers (multiple header blocks for redirects) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, header_collector.get()); // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -362,9 +353,8 @@ class HTTPFSClient : public HTTPClient { } // Get HTTP response status code - uint16_t response_code = 0; - curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - return TransformResponseCurl(response_code, request_info->header_collection, request_info->body, res, request_info->url); + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); + return TransformResponseCurl(std::move(request_info), std::move(header_collector), res); } unique_ptr Post(PostRequestInfo &info) override { @@ -385,8 +375,7 @@ class HTTPFSClient : public HTTPClient { } CURLcode res; - auto response_header_collection = make_uniq(); - + auto header_collector = make_uniq(); { curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); curl_easy_setopt(*curl, CURLOPT_POST, 1L); @@ -401,11 +390,11 @@ class HTTPFSClient : public HTTPClient { // Set write function to collect response body curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, request_info->body.c_str()); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); // Collect response headers (multiple header blocks for redirects) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, header_collector.get()); // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -414,15 +403,13 @@ class HTTPFSClient : public HTTPClient { res = curl->Execute(); } - uint16_t response_code = 0; - curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); info.buffer_out = request_info->body; // Construct HTTPResponse - return TransformResponseCurl(response_code, request_info->header_collection, request_info->body, res, request_info->url); + return TransformResponseCurl(std::move(request_info), std::move(header_collector), res); } private: - CURLRequestHeaders TransformHeadersCurl(const HTTPHeaders &header_map) { std::vector headers; for (auto &entry : header_map) { @@ -452,23 +439,21 @@ class HTTPFSClient : public HTTPClient { return result; } - unique_ptr TransformResponseCurl(uint16_t response_code, HeaderCollector header_collection, string &body, CURLcode res, string &url) { - auto status_code = HTTPStatusCode(response_code); + unique_ptr TransformResponseCurl(unique_ptr request_info, unique_ptr header_collection, CURLcode res) { + auto status_code = HTTPStatusCode(request_info->response_code); auto response = make_uniq(status_code); if (res != CURLcode::CURLE_OK) { - if (!header_collection.header_collection.empty() && header_collection.header_collection.back().HasHeader("__RESPONSE_STATUS__")) { - response->request_error =header_collection.header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); + if (header_collection && !header_collection->header_collection.empty() && header_collection->header_collection.back().HasHeader("__RESPONSE_STATUS__")) { + response->request_error = header_collection->header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); } else { response->request_error = curl_easy_strerror(res); } return response; } - Printer::Print("assigning body"); - response->body = body; - Printer::Print("assigning url"); - response->url= url; - if (!header_collection.header_collection.empty()) { - for (auto &header : header_collection.header_collection.back()) { + response->body = request_info->body; + response->url= request_info->url; + if (header_collection && !header_collection->header_collection.empty()) { + for (auto &header : header_collection->header_collection.back()) { response->headers.Insert(header.first, header.second); } } From 5d8d79fe1cad70530961300ee23508251d272737 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Tue, 20 May 2025 14:34:24 +0200 Subject: [PATCH 25/78] more clean up --- .github/workflows/MinioTests.yml | 10 +- CMakeLists.txt | 2 - extension/httpfs/httpfs_client.cpp | 5 +- test/sql/copy/csv/test_csv_httpfs.test | 207 ++++++++++++++----------- 4 files changed, 123 insertions(+), 101 deletions(-) diff --git a/.github/workflows/MinioTests.yml b/.github/workflows/MinioTests.yml index 6f431e1d..467b5b3a 100644 --- a/.github/workflows/MinioTests.yml +++ b/.github/workflows/MinioTests.yml @@ -33,7 +33,7 @@ jobs: with: python-version: '3.10' - - name: Install libraries + - name: Install Ninja shell: bash run: sudo apt-get update -y -qq && sudo apt-get install -y -qq ninja-build @@ -48,11 +48,9 @@ jobs: with: vcpkgGitCommitId: 5e5d0e1cd7785623065e77eff011afdeec1a3574 - - name: Build extension - env: - GEN: ninja - run: | - make release + - name: Build + shell: bash + run: make - name: Start S3/HTTP test server shell: bash diff --git a/CMakeLists.txt b/CMakeLists.txt index 878f6f61..865b9017 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,8 +9,6 @@ add_extension_definitions() include_directories(extension/httpfs/include ${DUCKDB_MODULE_BASE_DIR}/third_party/httplib) - - build_static_extension( httpfs extension/httpfs/hffs.cpp diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 8a93bd39..252ad2bf 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -118,7 +118,7 @@ class HTTPFSClient : public HTTPClient { } state = http_params.state; // init curl globally, - // curl_global_init(CURL_GLOBAL_DEFAULT); + curl_global_init(CURL_GLOBAL_DEFAULT); curl = make_uniq(bearer_token, SelectCURLCertPath()); // set curl options @@ -159,8 +159,7 @@ class HTTPFSClient : public HTTPClient { } ~HTTPFSClient() { - // init curl globally, - // curl_global_cleanup(); + curl_global_cleanup(); } unique_ptr Get(GetRequestInfo &info) override { diff --git a/test/sql/copy/csv/test_csv_httpfs.test b/test/sql/copy/csv/test_csv_httpfs.test index cbeed963..e1de167d 100644 --- a/test/sql/copy/csv/test_csv_httpfs.test +++ b/test/sql/copy/csv/test_csv_httpfs.test @@ -9,99 +9,35 @@ require parquet statement ok PRAGMA enable_verification -statement ok -select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_role_type.parquet'; - - -#FIXME this test fails: file is nonexistent -mode skip - -query IIIIII rowsort -SELECT * from read_csv_auto('https://www.data.gouv.fr/fr/datasets/r/6d186965-f41b-41f3-9b23-88241cc6890c'); +query II +select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_role_type.parquet' order by all; ---- -2020 Allemagne Germany 26.1 53196.069 200601.2 -2020 Autriche Austria 18.0 4723.5 26215.8 -2020 Belgique Belgium 28.999999999999996 9436.1 32553.0 -2020 Bulgarie Bulgaria 11.600000000000001 1124.1 9698.7 -2020 Chypre Cyprus 0.0 0.0 1627.6 -2020 Croatie Croatia 16.3 1094.8 6726.3 -2020 Danemark Denmark 11.600000000000001 1579.0 13601.4 -2020 Espagne Spain 17.4 14211.7 81512.9 -2020 Estonie Estonia 8.5 241.1 2827.3 -2020 Finlande Finland 2.8000000000000003 692.3 24674.4 -2020 France France 20.3 28278.9 139375.8 -2020 Grèce Greece 5.800000000000001 896.5 15401.9 -2020 Hongrie Hungary 30.5 5486.7 17872.4 -2020 Irlande Ireland 17.4 1968.477 11296.601 -2020 Italie Italy 29.2 33042.585 113119.475 -2020 Lettonie Latvia 8.200000000000001 323.605 3926.131 -2020 Lituanie Lithuania 10.7 584.104 5457.728 -2020 Luxembourg Luxembourg 16.5 623.165 3786.785 -2020 Malte Malta 0.0 0.0 547.5 -2020 Pays-Bas Netherlands 37.1 16588.314 44682.656 -2020 Pologne Poland 13.5 9323.205 69135.018 -2020 Portugal Portugal 11.1 1814.878 16354.725 -2020 Roumanie Romania 23.7 5626.161 23712.653 -2020 Royaume-Uni United Kingdom 32.4 39311.416 121414.483 -2020 République tchèque Czech Republic 21.4 5187.282 24263.896 -2020 Slovaquie Slovakia 25.0 2564.876 10248.401 -2020 Slovénie Slovenia 12.1 590.243 4861.315 -2020 Suède Sweden 1.5 475.195 31311.413 -2020 UE 28 Europe 28 22.5 238152.4 1056907.5 -2021 Allemagne Germany 26.760345686044435 51812.567 193616.957 -2021 Autriche Austria 18.720006775926056 4645.795 24817.272 -2021 Belgique Belgium 29.279402721103864 9088.083 31039.168 -2021 Bulgarie Bulgaria 12.368015142641884 1176.537 9512.739 -2021 Chypre Cyprus 0.0 0.0 1528.558 -2021 Croatie Croatia 17.10389029082304 1100.12 6431.987 -2021 Danemark Denmark 11.485631727184947 1508.152 13130.771 -2021 Espagne Spain 19.10173955663722 13815.0 72323.256 -2021 Estonie Estonia 8.988278645659518 245.094 2726.818 -2021 Finlande Finland 2.9937725178230212 694.288 23191.074 -2021 France France 20.649030024470434 26465.646 128168.955 -2021 Grèce Greece 7.580480506088059 1097.87 14482.855 -2021 Hongrie Hungary 32.344729318831554 5693.164 17601.52 -2021 Irlande Ireland 18.020604987495144 1953.468 10840.191 -2021 Italie Italy 30.86368769746751 31807.236 103057.147 -2021 Lettonie Latvia 8.502139837843602 322.927 3798.185 -2021 Lituanie Lithuania 11.029023816606903 582.797 5284.212 -2021 Luxembourg Luxembourg 17.282784281000467 564.365 3265.475 -2021 Malte Malta 0.0 0.0 499.875 -2021 Pays-Bas Netherlands 37.61392206122467 15896.316 42261.788 -2021 Pologne Poland 13.146720200313602 9235.656 70250.647 -2021 Portugal Portugal 11.437926753365227 1740.3 15215.17 -2021 Roumanie Romania 24.909638477223016 5846.885 23472.38 -2021 République tchèque Czech Republic 21.716683280446812 5158.445 23753.374 -2021 Slovaquie Slovakia 25.253930010417324 2427.134 9610.916 -2021 Slovénie Slovenia 13.141683407321874 582.024 4428.839 -2021 Suède Sweden 1.497679952802663 471.085 31454.317 -2021 UE 27 UE 27 21.894190365821018 193930.95399999994 885764.4460000001 - -query IIIIII rowsort res -SELECT * from read_csv('https://www.data.gouv.fr/fr/datasets/r/6d186965-f41b-41f3-9b23-88241cc6890c',DELIM=';',Columns={'annee_de_reference':'VARCHAR','pays':'VARCHAR','label_en':'VARCHAR','part_du_gaz_naturel_dans_la_consommation_finale_d_energie0':'VARCHAR','consommation_finale_de_gaz_naturel_mtep':'VARCHAR','consommation_finale_d_energie_totale_mtep':'VARCHAR'}); +1 actor +2 actress +3 producer +4 writer +5 cinematographer +6 composer +7 costume designer +8 director +9 editor +10 miscellaneous crew +11 production designer +12 guest - -query IIIIII rowsort res -SELECT * from read_csv('https://www.data.gouv.fr/fr/datasets/r/6d186965-f41b-41f3-9b23-88241cc6890c',DELIM=';',Columns={'annee_de_reference':'VARCHAR','pays':'VARCHAR','label_en':'VARCHAR','part_du_gaz_naturel_dans_la_consommation_finale_d_energie0':'VARCHAR','consommation_finale_de_gaz_naturel_mtep':'VARCHAR','consommation_finale_d_energie_totale_mtep':'VARCHAR'}); - - -# Give it a try to a request that returns length 0 -query I -SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') ----- -1265 - -# Give it a try to a request that returns length 0 -query I -SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') +query IIIIIIIIIIIIIIIIII +select * from 'https://github.com/duckdb/duckdb/raw/9cf66f950dde0173e1a863a7659b3ecf11bf3978/data/csv/customer.csv'; ---- -1265 - -# Give it a try to a request that returns length 0 -query I -SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') ----- -1265 +1 AAAAAAAABAAAAAAA 980124 7135 32946 2452238 2452208 Mr. Javier Lewis Y 9 12 1936 CHILE NULL Javier.Lewis@VFAxlnZEvOx.org 2452508 +2 AAAAAAAACAAAAAAA 819667 1461 31655 2452318 2452288 Dr. Amy Moses Y 9 4 1966 TOGO NULL Amy.Moses@Ovk9KjHH.com 2452318 +3 AAAAAAAADAAAAAAA 1473522 6247 48572 2449130 2449100 Miss Latisha Hamilton Y 18 9 1979 NIUE NULL Latisha.Hamilton@V.com 2452313 +4 AAAAAAAAEAAAAAAA 1703214 3986 39558 2450030 2450000 Dr. Michael White Y 7 6 1983 MEXICO NULL Michael.White@i.org 2452361 +5 AAAAAAAAFAAAAAAA 953372 4470 36368 2449438 2449408 Sir Robert Moran N 8 5 1956 FIJI NULL Robert.Moran@Hh.edu 2452469 +6 AAAAAAAAGAAAAAAA 213219 6374 27082 2451883 2451853 Ms. Brunilda Sharp Y 4 12 1925 SURINAME NULL Brunilda.Sharp@T3pylZEUQjm.org 2452430 +7 AAAAAAAAHAAAAAAA 68377 3219 44814 2451438 2451408 Ms. Fonda Wiles N 24 4 1985 GAMBIA NULL Fonda.Wiles@S9KnyEtz9hv.org 2452360 +8 AAAAAAAAIAAAAAAA 1215897 2471 16598 2449406 2449376 Sir Ollie Shipman N 26 12 1938 KOREA, REPUBLIC OF NULL Ollie.Shipman@be.org 2452334 +9 AAAAAAAAJAAAAAAA 1168667 1404 49388 2452275 2452245 Sir Karl Gilbert N 26 10 1966 MONTSERRAT NULL Karl.Gilbert@Crg5KyP2IxX9C4d6.edu 2452454 +10 AAAAAAAAKAAAAAAA 1207553 5143 19580 2451353 2451323 Ms. Albert Brunson N 15 10 1973 JORDAN NULL Albert.Brunson@62.com 2452641 #Add test for 5924 query IIIIII @@ -358,3 +294,94 @@ select * from read_csv_auto('https://csvbase.com/meripaterson/stock-exchanges'); 249 North America United States of America Members' Exchange NULL 2020-09-24 250 Africa Zimbabwe Victoria Falls Stock Exchange NULL 2020-11-01 251 Asia China Beijing Stock Exchange NULL 2021-12-27 + + +#FIXME this test fails: file is nonexistent +mode skip + +query IIIIII rowsort +SELECT * from read_csv_auto('https://github.com/duckdb/duckdb/raw/9cf66f950dde0173e1a863a7659b3ecf11bf3978/data/csv/customer.csv'); +---- +2020 Allemagne Germany 26.1 53196.069 200601.2 +2020 Autriche Austria 18.0 4723.5 26215.8 +2020 Belgique Belgium 28.999999999999996 9436.1 32553.0 +2020 Bulgarie Bulgaria 11.600000000000001 1124.1 9698.7 +2020 Chypre Cyprus 0.0 0.0 1627.6 +2020 Croatie Croatia 16.3 1094.8 6726.3 +2020 Danemark Denmark 11.600000000000001 1579.0 13601.4 +2020 Espagne Spain 17.4 14211.7 81512.9 +2020 Estonie Estonia 8.5 241.1 2827.3 +2020 Finlande Finland 2.8000000000000003 692.3 24674.4 +2020 France France 20.3 28278.9 139375.8 +2020 Grèce Greece 5.800000000000001 896.5 15401.9 +2020 Hongrie Hungary 30.5 5486.7 17872.4 +2020 Irlande Ireland 17.4 1968.477 11296.601 +2020 Italie Italy 29.2 33042.585 113119.475 +2020 Lettonie Latvia 8.200000000000001 323.605 3926.131 +2020 Lituanie Lithuania 10.7 584.104 5457.728 +2020 Luxembourg Luxembourg 16.5 623.165 3786.785 +2020 Malte Malta 0.0 0.0 547.5 +2020 Pays-Bas Netherlands 37.1 16588.314 44682.656 +2020 Pologne Poland 13.5 9323.205 69135.018 +2020 Portugal Portugal 11.1 1814.878 16354.725 +2020 Roumanie Romania 23.7 5626.161 23712.653 +2020 Royaume-Uni United Kingdom 32.4 39311.416 121414.483 +2020 République tchèque Czech Republic 21.4 5187.282 24263.896 +2020 Slovaquie Slovakia 25.0 2564.876 10248.401 +2020 Slovénie Slovenia 12.1 590.243 4861.315 +2020 Suède Sweden 1.5 475.195 31311.413 +2020 UE 28 Europe 28 22.5 238152.4 1056907.5 +2021 Allemagne Germany 26.760345686044435 51812.567 193616.957 +2021 Autriche Austria 18.720006775926056 4645.795 24817.272 +2021 Belgique Belgium 29.279402721103864 9088.083 31039.168 +2021 Bulgarie Bulgaria 12.368015142641884 1176.537 9512.739 +2021 Chypre Cyprus 0.0 0.0 1528.558 +2021 Croatie Croatia 17.10389029082304 1100.12 6431.987 +2021 Danemark Denmark 11.485631727184947 1508.152 13130.771 +2021 Espagne Spain 19.10173955663722 13815.0 72323.256 +2021 Estonie Estonia 8.988278645659518 245.094 2726.818 +2021 Finlande Finland 2.9937725178230212 694.288 23191.074 +2021 France France 20.649030024470434 26465.646 128168.955 +2021 Grèce Greece 7.580480506088059 1097.87 14482.855 +2021 Hongrie Hungary 32.344729318831554 5693.164 17601.52 +2021 Irlande Ireland 18.020604987495144 1953.468 10840.191 +2021 Italie Italy 30.86368769746751 31807.236 103057.147 +2021 Lettonie Latvia 8.502139837843602 322.927 3798.185 +2021 Lituanie Lithuania 11.029023816606903 582.797 5284.212 +2021 Luxembourg Luxembourg 17.282784281000467 564.365 3265.475 +2021 Malte Malta 0.0 0.0 499.875 +2021 Pays-Bas Netherlands 37.61392206122467 15896.316 42261.788 +2021 Pologne Poland 13.146720200313602 9235.656 70250.647 +2021 Portugal Portugal 11.437926753365227 1740.3 15215.17 +2021 Roumanie Romania 24.909638477223016 5846.885 23472.38 +2021 République tchèque Czech Republic 21.716683280446812 5158.445 23753.374 +2021 Slovaquie Slovakia 25.253930010417324 2427.134 9610.916 +2021 Slovénie Slovenia 13.141683407321874 582.024 4428.839 +2021 Suède Sweden 1.497679952802663 471.085 31454.317 +2021 UE 27 UE 27 21.894190365821018 193930.95399999994 885764.4460000001 + +query IIIIII rowsort res +SELECT * from read_csv('https://www.data.gouv.fr/fr/datasets/r/6d186965-f41b-41f3-9b23-88241cc6890c',DELIM=';',Columns={'annee_de_reference':'VARCHAR','pays':'VARCHAR','label_en':'VARCHAR','part_du_gaz_naturel_dans_la_consommation_finale_d_energie0':'VARCHAR','consommation_finale_de_gaz_naturel_mtep':'VARCHAR','consommation_finale_d_energie_totale_mtep':'VARCHAR'}); + + +query IIIIII rowsort res +SELECT * from read_csv('https://www.data.gouv.fr/fr/datasets/r/6d186965-f41b-41f3-9b23-88241cc6890c',DELIM=';',Columns={'annee_de_reference':'VARCHAR','pays':'VARCHAR','label_en':'VARCHAR','part_du_gaz_naturel_dans_la_consommation_finale_d_energie0':'VARCHAR','consommation_finale_de_gaz_naturel_mtep':'VARCHAR','consommation_finale_d_energie_totale_mtep':'VARCHAR'}); + + +# Give it a try to a request that returns length 0 +query I +SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') +---- +1265 + +# Give it a try to a request that returns length 0 +query I +SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') +---- +1265 + +# Give it a try to a request that returns length 0 +query I +SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') +---- +1265 \ No newline at end of file From 0c420152cdf1eb202ad1ab58c2f2cb7a024ab878 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Tue, 20 May 2025 15:21:54 +0200 Subject: [PATCH 26/78] call curl_global_init, unsure when to call cleanup --- extension/httpfs/httpfs_client.cpp | 40 +++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index e2e3762e..13ebd6d2 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -94,6 +94,8 @@ static size_t RequestHeaderCallback(void *contents, size_t size, size_t nmemb, v } if (!cert_path.empty()) { curl_easy_setopt(curl, CURLOPT_CAINFO, cert_path.c_str()); + } else { + throw InternalException("Could not set certificate authority"); } } @@ -109,6 +111,8 @@ struct RequestInfo { }; +static idx_t httpfs_client_count = 0; + class HTTPFSClient : public HTTPClient { public: HTTPFSClient(HTTPFSParams &http_params, const string &proto_host_port) { @@ -118,8 +122,9 @@ class HTTPFSClient : public HTTPClient { } state = http_params.state; - // init curl globally, - curl_global_init(CURL_GLOBAL_DEFAULT); + // call curl_global_init if not already done by another HTTPFS Client + InitCurlGlobal(); + curl = make_uniq(bearer_token, SelectCURLCertPath()); // set curl options @@ -160,7 +165,7 @@ class HTTPFSClient : public HTTPClient { } ~HTTPFSClient() { - curl_global_cleanup(); + DestroyCurlGlobal(); } unique_ptr Get(GetRequestInfo &info) override { @@ -289,6 +294,8 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_NOBODY, 1L); curl_easy_setopt(*curl, CURLOPT_HTTPGET, 0L); + curl_easy_setopt(*curl, CURLOPT_VERBOSE, 1L); + // Follow redirects curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); @@ -443,6 +450,7 @@ class HTTPFSClient : public HTTPClient { auto status_code = HTTPStatusCode(request_info->response_code); auto response = make_uniq(status_code); if (res != CURLcode::CURLE_OK) { + // TODO: request error can come from HTTPS Status code toString() value. if (header_collection && !header_collection->header_collection.empty() && header_collection->header_collection.back().HasHeader("__RESPONSE_STATUS__")) { response->request_error = header_collection->header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); } else { @@ -463,6 +471,32 @@ class HTTPFSClient : public HTTPClient { private: unique_ptr curl; optional_ptr state; + + static std::mutex &GetRefLock() { + static std::mutex mtx; + return mtx; + } + + static void InitCurlGlobal() { + GetRefLock(); + if (httpfs_client_count == 0) { + curl_global_init(CURL_GLOBAL_DEFAULT); + } + ++httpfs_client_count; + } + + static void DestroyCurlGlobal() { + // TODO: when to call curl_global_cleanup() + // calling it on client destruction causes SSL errors when verification is on (due to many requests). + // GetRefLock(); + // if (httpfs_client_count == 0) { + // throw InternalException("Destroying Httpfs client that did not initialize CURL"); + // } + // --httpfs_client_count; + // if (httpfs_client_count == 0) { + // curl_global_cleanup(); + // } + } }; unique_ptr HTTPFSUtil::InitializeClient(HTTPParams &http_params, const string &proto_host_port) { From bd44ff2a11246de2e7788c05659b909584dd81c5 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Tue, 20 May 2025 15:25:10 +0200 Subject: [PATCH 27/78] remove verbose --- extension/httpfs/httpfs_client.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 13ebd6d2..2d93c4d6 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -294,8 +294,6 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_NOBODY, 1L); curl_easy_setopt(*curl, CURLOPT_HTTPGET, 0L); - curl_easy_setopt(*curl, CURLOPT_VERBOSE, 1L); - // Follow redirects curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); From 3da7bfff534272dd4cb00d44722ab3615331fb7b Mon Sep 17 00:00:00 2001 From: Tmonster Date: Wed, 21 May 2025 10:11:40 +0200 Subject: [PATCH 28/78] reuse many components --- extension/httpfs/httpfs_client.cpp | 112 +++++++++-------------------- 1 file changed, 35 insertions(+), 77 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 2d93c4d6..1c4695ee 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -59,7 +59,7 @@ static size_t RequestHeaderCallback(void *contents, size_t size, size_t nmemb, v } // If header starts with HTTP/... curl has followed a redirect and we have a new Header, - // so we clear all of the current header_collection + // so we push back a new header_collection and store headers from the redirect there. if (header.rfind("HTTP/", 0) == 0) { header_collection->header_collection.push_back(HTTPHeaders()); header_collection->header_collection.back().Insert("__RESPONSE_STATUS__", header); @@ -108,6 +108,7 @@ struct RequestInfo { string url = ""; string body = ""; uint16_t response_code = 0; + std::vector header_collection; }; @@ -126,6 +127,7 @@ class HTTPFSClient : public HTTPClient { InitCurlGlobal(); curl = make_uniq(bearer_token, SelectCURLCertPath()); + request_info = make_uniq(); // set curl options // follow redirects @@ -153,6 +155,15 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_CONNECTTIMEOUT, http_params.timeout); // accept content as-is (i.e no decompressing) curl_easy_setopt(*curl, CURLOPT_ACCEPT_ENCODING, "identity"); + // follow redirects + curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); + + // define the header callback + curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); + // define the write data callback (for get requests) + curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); if (!http_params.http_proxy.empty()) { curl_easy_setopt(*curl, CURLOPT_PROXY, StringUtil::Format("%s:%s", http_params.http_proxy, http_params.http_proxy_port).c_str()); @@ -174,28 +185,17 @@ class HTTPFSClient : public HTTPClient { } auto curl_headers = TransformHeadersCurl(info.headers); - auto request_info = make_uniq(); request_info->url = info.url; if (!info.params.extra_headers.empty()) { auto curl_params = TransformParamsCurl(info.params); request_info->url += "?" + curl_params; } - auto header_collector = make_uniq(); CURLcode res; { // If the same handle served a HEAD request, we must set NOBODY back to 0L to request content again curl_easy_setopt(*curl, CURLOPT_NOBODY, 0L); - - curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); - // write response data - curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); - // write response headers (different header collection for each redirect) - curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, header_collector.get()); - curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); res = curl->Execute(); } @@ -203,8 +203,8 @@ class HTTPFSClient : public HTTPClient { curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); idx_t bytes_received = 0; - if (header_collector && header_collector->header_collection.empty() && header_collector->header_collection.back().HasHeader("content-length")) { - bytes_received = std::stoi(header_collector->header_collection.back().GetHeaderValue("content-length")); + if (!request_info->header_collection.empty() && request_info->header_collection.back().HasHeader("content-length")) { + bytes_received = std::stoi(request_info->header_collection.back().GetHeaderValue("content-length")); D_ASSERT(bytes_received == request_info->body.size()); } else { bytes_received = request_info->body.size(); @@ -215,7 +215,7 @@ class HTTPFSClient : public HTTPClient { const char* data = request_info->body.c_str(); info.content_handler(const_data_ptr_cast(data), bytes_received); - return TransformResponseCurl(std::move(request_info), std::move(header_collector), res); + return TransformResponseCurl(res); } unique_ptr Put(PutRequestInfo &info) override { @@ -225,7 +225,6 @@ class HTTPFSClient : public HTTPClient { } auto curl_headers = TransformHeadersCurl(info.headers); - auto request_info = make_uniq(); // Add content type header from info curl_headers.Add("Content-Type: " + info.content_type); // transform parameters @@ -235,39 +234,24 @@ class HTTPFSClient : public HTTPClient { request_info->url += "?" + curl_params; } - auto header_collector = make_uniq(); CURLcode res; { curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); - // Perform PUT curl_easy_setopt(*curl, CURLOPT_CUSTOMREQUEST, "PUT"); - // Include PUT body curl_easy_setopt(*curl, CURLOPT_POSTFIELDS, const_char_ptr_cast(info.buffer_in)); curl_easy_setopt(*curl, CURLOPT_POSTFIELDSIZE, info.buffer_in_len); - // Follow redirects if needed - curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); - - // Capture response body - curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); - - // Capture response headers - curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, header_collector.get()); - // Apply headers curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); - // Execute the request res = curl->Execute(); } curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); - return TransformResponseCurl(std::move(request_info), std::move(header_collector), res); + return TransformResponseCurl(res); } unique_ptr Head(HeadRequestInfo &info) override { @@ -276,7 +260,6 @@ class HTTPFSClient : public HTTPClient { } auto curl_headers = TransformHeadersCurl(info.headers); - auto request_info = make_uniq(); request_info->url = info.url; // transform parameters if (!info.params.extra_headers.empty()) { @@ -285,7 +268,6 @@ class HTTPFSClient : public HTTPClient { } CURLcode res; - auto header_collector = make_uniq(); { // Set URL curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); @@ -294,17 +276,6 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_NOBODY, 1L); curl_easy_setopt(*curl, CURLOPT_HTTPGET, 0L); - // Follow redirects - curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); - - // set write function to collect body — no body expected, so safe to omit - curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); - - // Collect response headers (multiple header blocks for redirects) - curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, header_collector.get()); - // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -313,7 +284,7 @@ class HTTPFSClient : public HTTPClient { } curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); - return TransformResponseCurl(std::move(request_info), std::move(header_collector), res); + return TransformResponseCurl(res); } unique_ptr Delete(DeleteRequestInfo &info) override { @@ -322,7 +293,6 @@ class HTTPFSClient : public HTTPClient { } auto curl_headers = TransformHeadersCurl(info.headers); - auto request_info = make_uniq(); // transform parameters request_info->url = info.url; if (!info.params.extra_headers.empty()) { @@ -331,7 +301,6 @@ class HTTPFSClient : public HTTPClient { } CURLcode res; - auto header_collector = make_uniq(); { // Set URL curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); @@ -342,14 +311,6 @@ class HTTPFSClient : public HTTPClient { // Follow redirects curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); - // Set write function to collect response body - curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); - - // Collect response headers (multiple header blocks for redirects) - curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, header_collector.get()); - // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -359,7 +320,7 @@ class HTTPFSClient : public HTTPClient { // Get HTTP response status code curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); - return TransformResponseCurl(std::move(request_info), std::move(header_collector), res); + return TransformResponseCurl( res); } unique_ptr Post(PostRequestInfo &info) override { @@ -369,7 +330,6 @@ class HTTPFSClient : public HTTPClient { } auto curl_headers = TransformHeadersCurl(info.headers); - auto request_info = make_uniq(); const string content_type = "Content-Type: application/octet-stream"; curl_headers.Add(content_type.c_str()); // transform parameters @@ -380,7 +340,6 @@ class HTTPFSClient : public HTTPClient { } CURLcode res; - auto header_collector = make_uniq(); { curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); curl_easy_setopt(*curl, CURLOPT_POST, 1L); @@ -389,18 +348,6 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_POSTFIELDS, const_char_ptr_cast(info.buffer_in)); curl_easy_setopt(*curl, CURLOPT_POSTFIELDSIZE, info.buffer_in_len); - // Follow redirects - // TODO: should we follow redirects for POST? - curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); - - // Set write function to collect response body - curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); - - // Collect response headers (multiple header blocks for redirects) - curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, header_collector.get()); - // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -411,7 +358,7 @@ class HTTPFSClient : public HTTPClient { curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); info.buffer_out = request_info->body; // Construct HTTPResponse - return TransformResponseCurl(std::move(request_info), std::move(header_collector), res); + return TransformResponseCurl( res); } private: @@ -444,13 +391,22 @@ class HTTPFSClient : public HTTPClient { return result; } - unique_ptr TransformResponseCurl(unique_ptr request_info, unique_ptr header_collection, CURLcode res) { + void ResetRequestInfo() { + // clear headers after transform + request_info->header_collection.clear(); + // reset request info. + request_info->body = ""; + request_info->url = ""; + request_info->response_code = 0; + } + + unique_ptr TransformResponseCurl(CURLcode res) { auto status_code = HTTPStatusCode(request_info->response_code); auto response = make_uniq(status_code); if (res != CURLcode::CURLE_OK) { // TODO: request error can come from HTTPS Status code toString() value. - if (header_collection && !header_collection->header_collection.empty() && header_collection->header_collection.back().HasHeader("__RESPONSE_STATUS__")) { - response->request_error = header_collection->header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); + if (!request_info->header_collection.empty() && request_info->header_collection.back().HasHeader("__RESPONSE_STATUS__")) { + response->request_error = request_info->header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); } else { response->request_error = curl_easy_strerror(res); } @@ -458,17 +414,19 @@ class HTTPFSClient : public HTTPClient { } response->body = request_info->body; response->url= request_info->url; - if (header_collection && !header_collection->header_collection.empty()) { - for (auto &header : header_collection->header_collection.back()) { + if (!request_info->header_collection.empty()) { + for (auto &header : request_info->header_collection.back()) { response->headers.Insert(header.first, header.second); } } + ResetRequestInfo(); return response; } private: unique_ptr curl; optional_ptr state; + unique_ptr request_info; static std::mutex &GetRefLock() { static std::mutex mtx; From 198288e62c8b7643ac012c6a2518760ddc6d0c23 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Wed, 21 May 2025 10:27:56 +0200 Subject: [PATCH 29/78] add test to check response headers --- test/sql/copy/csv/test_csv_httpfs.test | 5 ++++ test/sql/test_headers_parsed.test | 38 ++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 test/sql/test_headers_parsed.test diff --git a/test/sql/copy/csv/test_csv_httpfs.test b/test/sql/copy/csv/test_csv_httpfs.test index e1de167d..1641a0ee 100644 --- a/test/sql/copy/csv/test_csv_httpfs.test +++ b/test/sql/copy/csv/test_csv_httpfs.test @@ -9,6 +9,9 @@ require parquet statement ok PRAGMA enable_verification +statement ok +pragma enable_logging('HTTP'); + query II select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_role_type.parquet' order by all; ---- @@ -25,6 +28,8 @@ select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_ 11 production designer 12 guest +pragma enable_logging('HTTP'); + query IIIIIIIIIIIIIIIIII select * from 'https://github.com/duckdb/duckdb/raw/9cf66f950dde0173e1a863a7659b3ecf11bf3978/data/csv/customer.csv'; ---- diff --git a/test/sql/test_headers_parsed.test b/test/sql/test_headers_parsed.test new file mode 100644 index 00000000..3d331edb --- /dev/null +++ b/test/sql/test_headers_parsed.test @@ -0,0 +1,38 @@ +# name: test/sql/copy/csv/test_headers_parsed.test +# description: This test triggers the http prefetch mechanism. +# group: [csv] + +require httpfs + +require parquet + +statement ok +pragma enable_logging('HTTP'); + +query II +select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_role_type.parquet' order by all; +---- +1 actor +2 actress +3 producer +4 writer +5 cinematographer +6 composer +7 costume designer +8 director +9 editor +10 miscellaneous crew +11 production designer +12 guest + +query II +select response.status from duckdb_logs_parsed('HTTP') order by all; +---- +OK_200 +PartialContent_206 + +query II +select response.headers['__RESPONSE_STATUS__'] from duckdb_logs_parsed('HTTP') order by all; +---- +HTTP/2 200 +HTTP/2 206 \ No newline at end of file From 0f9f74928bcdef6fd02820bd5c30218169604eb3 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Wed, 21 May 2025 10:33:21 +0200 Subject: [PATCH 30/78] remove internal exception if certificate authority cannot be set --- extension/httpfs/httpfs_client.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 1c4695ee..9931b875 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -94,8 +94,6 @@ static size_t RequestHeaderCallback(void *contents, size_t size, size_t nmemb, v } if (!cert_path.empty()) { curl_easy_setopt(curl, CURLOPT_CAINFO, cert_path.c_str()); - } else { - throw InternalException("Could not set certificate authority"); } } From 73103b1a5bfb64ed18457b973fd1b6f06738c25c Mon Sep 17 00:00:00 2001 From: Tmonster Date: Wed, 21 May 2025 10:37:16 +0200 Subject: [PATCH 31/78] fix test --- test/sql/test_headers_parsed.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/sql/test_headers_parsed.test b/test/sql/test_headers_parsed.test index 3d331edb..1044d45b 100644 --- a/test/sql/test_headers_parsed.test +++ b/test/sql/test_headers_parsed.test @@ -25,7 +25,7 @@ select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_ 11 production designer 12 guest -query II +query I select response.status from duckdb_logs_parsed('HTTP') order by all; ---- OK_200 From afb0c2aec107174546789b508de82a45f441295f Mon Sep 17 00:00:00 2001 From: Tmonster Date: Wed, 21 May 2025 10:53:31 +0200 Subject: [PATCH 32/78] actually fix test --- test/sql/test_headers_parsed.test | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/test/sql/test_headers_parsed.test b/test/sql/test_headers_parsed.test index 1044d45b..44a5121e 100644 --- a/test/sql/test_headers_parsed.test +++ b/test/sql/test_headers_parsed.test @@ -31,8 +31,16 @@ select response.status from duckdb_logs_parsed('HTTP') order by all; OK_200 PartialContent_206 -query II -select response.headers['__RESPONSE_STATUS__'] from duckdb_logs_parsed('HTTP') order by all; + +# response status is either +# HTTP/2 200 +# HTTP/2 206 +# OR +# HTTP/1.1 200 OK +# HTTP/1.1 206 Partial Content +# depending on OS and CA (I think) +query I +select response.headers['__RESPONSE_STATUS__'] LIKE 'HTTP%20%' from duckdb_logs_parsed('HTTP') order by all; ---- -HTTP/2 200 -HTTP/2 206 \ No newline at end of file +true +true \ No newline at end of file From 4b32d40ed26e95d72d5f1e90439963dab5f6c505 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Wed, 21 May 2025 10:57:48 +0200 Subject: [PATCH 33/78] remove rogue pragma --- test/sql/copy/csv/test_csv_httpfs.test | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/sql/copy/csv/test_csv_httpfs.test b/test/sql/copy/csv/test_csv_httpfs.test index 1641a0ee..d62683ef 100644 --- a/test/sql/copy/csv/test_csv_httpfs.test +++ b/test/sql/copy/csv/test_csv_httpfs.test @@ -28,8 +28,6 @@ select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_ 11 production designer 12 guest -pragma enable_logging('HTTP'); - query IIIIIIIIIIIIIIIIII select * from 'https://github.com/duckdb/duckdb/raw/9cf66f950dde0173e1a863a7659b3ecf11bf3978/data/csv/customer.csv'; ---- From 07a7afa8c44067dea6ba7d719fa8b0a34aa2304e Mon Sep 17 00:00:00 2001 From: Tmonster Date: Mon, 26 May 2025 10:41:57 +0200 Subject: [PATCH 34/78] re-run CI for after release From 3a93b424dc53a69ec2e2e55665a195cd020e73d1 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Tue, 17 Jun 2025 11:10:59 +0200 Subject: [PATCH 35/78] update main distribution pipeline --- .github/workflows/MainDistributionPipeline.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index ec318563..d44e5294 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -17,7 +17,7 @@ jobs: uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main with: extension_name: httpfs - duckdb_version: v1.3.0 + duckdb_version: v1.3.1 ci_tools_version: main duckdb-stable-deploy: @@ -27,6 +27,6 @@ jobs: secrets: inherit with: extension_name: httpfs - duckdb_version: v1.3.0 + duckdb_version: v1.3.1 ci_tools_version: main deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }} From 95567d3235cba0ae3279c9e02063dbfa65711129 Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Thu, 3 Jul 2025 11:54:20 +0200 Subject: [PATCH 36/78] Have curl and httplib implementation convive side-by-side --- CMakeLists.txt | 6 +- ...tpfs_client.cpp => httpfs_curl_client.cpp} | 16 +- extension/httpfs/httpfs_extension.cpp | 16 ++ extension/httpfs/httpfs_httplib_client.cpp | 167 ++++++++++++++++++ extension/httpfs/include/httpfs_client.hpp | 9 + 5 files changed, 204 insertions(+), 10 deletions(-) rename extension/httpfs/{httpfs_client.cpp => httpfs_curl_client.cpp} (96%) create mode 100644 extension/httpfs/httpfs_httplib_client.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 865b9017..3aa0549d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,7 +14,8 @@ build_static_extension( extension/httpfs/hffs.cpp extension/httpfs/s3fs.cpp extension/httpfs/httpfs.cpp - extension/httpfs/httpfs_client.cpp + extension/httpfs/httpfs_httplib_client.cpp + extension/httpfs/httpfs_curl_client.cpp extension/httpfs/http_state.cpp extension/httpfs/crypto.cpp extension/httpfs/create_secret_functions.cpp @@ -27,7 +28,8 @@ build_loadable_extension( extension/httpfs/hffs.cpp extension/httpfs/s3fs.cpp extension/httpfs/httpfs.cpp - extension/httpfs/httpfs_client.cpp + extension/httpfs/httpfs_httplib_client.cpp + extension/httpfs/httpfs_curl_client.cpp extension/httpfs/http_state.cpp extension/httpfs/crypto.cpp extension/httpfs/create_secret_functions.cpp diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_curl_client.cpp similarity index 96% rename from extension/httpfs/httpfs_client.cpp rename to extension/httpfs/httpfs_curl_client.cpp index 9931b875..d6482691 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_curl_client.cpp @@ -112,9 +112,9 @@ struct RequestInfo { static idx_t httpfs_client_count = 0; -class HTTPFSClient : public HTTPClient { +class HTTPFSCurlClient : public HTTPClient { public: - HTTPFSClient(HTTPFSParams &http_params, const string &proto_host_port) { + HTTPFSCurlClient(HTTPFSParams &http_params, const string &proto_host_port) { auto bearer_token = ""; if (!http_params.bearer_token.empty()) { bearer_token = http_params.bearer_token.c_str(); @@ -173,7 +173,7 @@ class HTTPFSClient : public HTTPClient { } } - ~HTTPFSClient() { + ~HTTPFSCurlClient() { DestroyCurlGlobal(); } @@ -453,12 +453,12 @@ class HTTPFSClient : public HTTPClient { } }; -unique_ptr HTTPFSUtil::InitializeClient(HTTPParams &http_params, const string &proto_host_port) { - auto client = make_uniq(http_params.Cast(), proto_host_port); +unique_ptr HTTPFSCurlUtil::InitializeClient(HTTPParams &http_params, const string &proto_host_port) { + auto client = make_uniq(http_params.Cast(), proto_host_port); return std::move(client); } -unordered_map HTTPFSUtil::ParseGetParameters(const string &text) { +unordered_map HTTPFSCurlUtil::ParseGetParameters(const string &text) { unordered_map params; auto pos = text.find('?'); @@ -482,8 +482,8 @@ unordered_map HTTPFSUtil::ParseGetParameters(const string &text) return params; } -string HTTPFSUtil::GetName() const { - return "HTTPFS"; +string HTTPFSCurlUtil::GetName() const { + return "HTTPFS-Curl"; } } // namespace duckdb diff --git a/extension/httpfs/httpfs_extension.cpp b/extension/httpfs/httpfs_extension.cpp index c9bc9853..c8b9485b 100644 --- a/extension/httpfs/httpfs_extension.cpp +++ b/extension/httpfs/httpfs_extension.cpp @@ -2,6 +2,7 @@ #include "httpfs_extension.hpp" +#include "httpfs_client.hpp" #include "create_secret_functions.hpp" #include "duckdb.hpp" #include "s3fs.hpp" @@ -61,6 +62,21 @@ static void LoadInternal(DatabaseInstance &instance) { // HuggingFace options config.AddExtensionOption("hf_max_per_page", "Debug option to limit number of items returned in list requests", LogicalType::UBIGINT, Value::UBIGINT(0)); + + auto callback_httpfs_client_implementation = [](ClientContext& context, SetScope scope, Value& parameter) { + auto &config = DBConfig::GetConfig(context); + string value = StringValue::Get(parameter); + if (value == "curl" && (!config.http_util || config.http_util->GetName() != "HTTPFSUtil-Curl")) { + config.http_util = make_shared_ptr(); + } + if ((value == "httplib" || value == "default" )&& (!config.http_util || config.http_util->GetName() != "HTTPFSUtil")) { + config.http_util = make_shared_ptr(); + } + }; + config.AddExtensionOption("httpfs_client_implementation", "Select which is the HTTPUtil implementation to be used", + LogicalType::VARCHAR, "default", callback_httpfs_client_implementation); + + config.http_util = make_shared_ptr(); auto provider = make_uniq(config); diff --git a/extension/httpfs/httpfs_httplib_client.cpp b/extension/httpfs/httpfs_httplib_client.cpp new file mode 100644 index 00000000..84eb457b --- /dev/null +++ b/extension/httpfs/httpfs_httplib_client.cpp @@ -0,0 +1,167 @@ +#include "httpfs_client.hpp" +#include "http_state.hpp" + +#define CPPHTTPLIB_OPENSSL_SUPPORT +#include "httplib.hpp" + +namespace duckdb { + +class HTTPFSClient : public HTTPClient { +public: + HTTPFSClient(HTTPFSParams &http_params, const string &proto_host_port) { + client = make_uniq(proto_host_port); + client->set_follow_location(http_params.follow_location); + client->set_keep_alive(http_params.keep_alive); + if (!http_params.ca_cert_file.empty()) { + client->set_ca_cert_path(http_params.ca_cert_file.c_str()); + } + client->enable_server_certificate_verification(http_params.enable_server_cert_verification); + client->set_write_timeout(http_params.timeout, http_params.timeout_usec); + client->set_read_timeout(http_params.timeout, http_params.timeout_usec); + client->set_connection_timeout(http_params.timeout, http_params.timeout_usec); + client->set_decompress(false); + if (!http_params.bearer_token.empty()) { + client->set_bearer_token_auth(http_params.bearer_token.c_str()); + } + + if (!http_params.http_proxy.empty()) { + client->set_proxy(http_params.http_proxy, http_params.http_proxy_port); + + if (!http_params.http_proxy_username.empty()) { + client->set_proxy_basic_auth(http_params.http_proxy_username, http_params.http_proxy_password); + } + } + state = http_params.state; + } + + unique_ptr Get(GetRequestInfo &info) override { + if (state) { + state->get_count++; + } + auto headers = TransformHeaders(info.headers, info.params); + if (!info.response_handler && !info.content_handler) { + return TransformResult(client->Get(info.path, headers)); + } else { + return TransformResult(client->Get( + info.path.c_str(), headers, + [&](const duckdb_httplib_openssl::Response &response) { + auto http_response = TransformResponse(response); + return info.response_handler(*http_response); + }, + [&](const char *data, size_t data_length) { + if (state) { + state->total_bytes_received += data_length; + } + return info.content_handler(const_data_ptr_cast(data), data_length); + })); + } + } + unique_ptr Put(PutRequestInfo &info) override { + if (state) { + state->put_count++; + state->total_bytes_sent += info.buffer_in_len; + } + auto headers = TransformHeaders(info.headers, info.params); + return TransformResult(client->Put(info.path, headers, const_char_ptr_cast(info.buffer_in), info.buffer_in_len, + info.content_type)); + } + + unique_ptr Head(HeadRequestInfo &info) override { + if (state) { + state->head_count++; + } + auto headers = TransformHeaders(info.headers, info.params); + return TransformResult(client->Head(info.path, headers)); + } + + unique_ptr Delete(DeleteRequestInfo &info) override { + if (state) { + state->delete_count++; + } + auto headers = TransformHeaders(info.headers, info.params); + return TransformResult(client->Delete(info.path, headers)); + } + + unique_ptr Post(PostRequestInfo &info) override { + if (state) { + state->post_count++; + state->total_bytes_sent += info.buffer_in_len; + } + // We use a custom Request method here, because there is no Post call with a contentreceiver in httplib + duckdb_httplib_openssl::Request req; + req.method = "POST"; + req.path = info.path; + req.headers = TransformHeaders(info.headers, info.params); + req.headers.emplace("Content-Type", "application/octet-stream"); + req.content_receiver = [&](const char *data, size_t data_length, uint64_t /*offset*/, + uint64_t /*total_length*/) { + if (state) { + state->total_bytes_received += data_length; + } + info.buffer_out += string(data, data_length); + return true; + }; + req.body.assign(const_char_ptr_cast(info.buffer_in), info.buffer_in_len); + return TransformResult(client->send(req)); + } + +private: + duckdb_httplib_openssl::Headers TransformHeaders(const HTTPHeaders &header_map, const HTTPParams ¶ms) { + duckdb_httplib_openssl::Headers headers; + for (auto &entry : header_map) { + headers.insert(entry); + } + for (auto &entry : params.extra_headers) { + headers.insert(entry); + } + return headers; + } + + unique_ptr TransformResponse(const duckdb_httplib_openssl::Response &response) { + auto status_code = HTTPUtil::ToStatusCode(response.status); + auto result = make_uniq(status_code); + result->body = response.body; + result->reason = response.reason; + for (auto &entry : response.headers) { + result->headers.Insert(entry.first, entry.second); + } + return result; + } + + unique_ptr TransformResult(duckdb_httplib_openssl::Result &&res) { + if (res.error() == duckdb_httplib_openssl::Error::Success) { + auto &response = res.value(); + return TransformResponse(response); + } else { + auto result = make_uniq(HTTPStatusCode::INVALID); + result->request_error = to_string(res.error()); + return result; + } + } + +private: + unique_ptr client; + optional_ptr state; +}; + +unique_ptr HTTPFSUtil::InitializeClient(HTTPParams &http_params, const string &proto_host_port) { + auto client = make_uniq(http_params.Cast(), proto_host_port); + return std::move(client); +} + +unordered_map HTTPFSUtil::ParseGetParameters(const string &text) { + duckdb_httplib_openssl::Params query_params; + duckdb_httplib_openssl::detail::parse_query_text(text, query_params); + + unordered_map result; + for (auto &entry : query_params) { + result.emplace(std::move(entry.first), std::move(entry.second)); + } + return result; +} + +string HTTPFSUtil::GetName() const { + return "HTTPFS"; +} + +} // namespace duckdb diff --git a/extension/httpfs/include/httpfs_client.hpp b/extension/httpfs/include/httpfs_client.hpp index 11a48a49..d540ce8b 100644 --- a/extension/httpfs/include/httpfs_client.hpp +++ b/extension/httpfs/include/httpfs_client.hpp @@ -37,6 +37,15 @@ class HTTPFSUtil : public HTTPUtil { string GetName() const override; }; +class HTTPFSCurlUtil : public HTTPFSUtil { +public: + unique_ptr InitializeClient(HTTPParams &http_params, const string &proto_host_port) override; + + static unordered_map ParseGetParameters(const string &text); + + string GetName() const override; +}; + class CURLHandle { public: CURLHandle(const string &token, const string &cert_path); From 401ff6be061b855af1bc707cd3269c0134610cc5 Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Thu, 3 Jul 2025 12:21:34 +0200 Subject: [PATCH 37/78] Adapt test --- test/sql/test_headers_parsed.test | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/sql/test_headers_parsed.test b/test/sql/test_headers_parsed.test index 44a5121e..317ec820 100644 --- a/test/sql/test_headers_parsed.test +++ b/test/sql/test_headers_parsed.test @@ -6,6 +6,9 @@ require httpfs require parquet +statement ok +SET httpfs_client_implementation='curl'; + statement ok pragma enable_logging('HTTP'); @@ -31,7 +34,6 @@ select response.status from duckdb_logs_parsed('HTTP') order by all; OK_200 PartialContent_206 - # response status is either # HTTP/2 200 # HTTP/2 206 @@ -43,4 +45,4 @@ query I select response.headers['__RESPONSE_STATUS__'] LIKE 'HTTP%20%' from duckdb_logs_parsed('HTTP') order by all; ---- true -true \ No newline at end of file +true From 234845664f8ee1e5aa15c6cf0eee1f88927be4f4 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Tue, 15 Jul 2025 16:30:16 +0200 Subject: [PATCH 38/78] fix tests --- test/sql/secret/secret_refresh.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/sql/secret/secret_refresh.test b/test/sql/secret/secret_refresh.test index 696279e0..7601dbc5 100644 --- a/test/sql/secret/secret_refresh.test +++ b/test/sql/secret/secret_refresh.test @@ -82,7 +82,7 @@ CREATE SECRET s1 ( statement error FROM "s3://test-bucket/test-file.parquet" ---- -HTTP Error: Unable to connect to URL "s3://test-bucket/test-file.parquet": 403 () +HTTP Error: HTTP GET error on 'http://test-bucket.duckdb-minio.com:9000/test-file.parquet' (HTTP 403) query I SELECT message[0:46] FROM duckdb_logs WHERE message like '%Successfully refreshed secret%' @@ -123,7 +123,7 @@ set s3_access_key_id='bogus' statement error FROM "s3://test-bucket/test-file.parquet" ---- -HTTP Error: Unable to connect to URL "s3://test-bucket/test-file.parquet": 403 () +HTTP Error: HTTP GET error on 'http://test-bucket.duckdb-minio.com:9000/test-file.parquet' (HTTP 403) # -> log empty query II From dcfb5cc8e73388c8afa9a5b73e477fb0c59ceb8c Mon Sep 17 00:00:00 2001 From: Tmonster Date: Tue, 15 Jul 2025 17:06:11 +0200 Subject: [PATCH 39/78] add more tests, switch up the client implementation --- extension/httpfs/httpfs_curl_client.cpp | 8 +------- test/sql/copy/csv/test_csv_httpfs.test | 13 ++++++++++++- test/sql/secret/secret_aws.test | 11 +++++++++-- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/extension/httpfs/httpfs_curl_client.cpp b/extension/httpfs/httpfs_curl_client.cpp index d6482691..a1705a86 100644 --- a/extension/httpfs/httpfs_curl_client.cpp +++ b/extension/httpfs/httpfs_curl_client.cpp @@ -77,8 +77,7 @@ static size_t RequestHeaderCallback(void *contents, size_t size, size_t nmemb, v header_collection->header_collection.back().Insert(part1, part2); } - // TODO: some headers may not follow standard response header formats. - // what to do in this case? Invalid does not mean we should abort. + // TODO: log headers that don't follow the header format return totalSize; } @@ -142,11 +141,6 @@ class HTTPFSCurlClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_SSL_VERIFYHOST, 2L); // Verify that the cert matches the hostname } - // TODO: no global write timeout option, but you could put customize a timeout in the write functions - // or handle use CURLOPT_XFERINFOFUNCTION (progress callback) with CURLOPT_TIMEOUT_MS - // we could also set CURLOPT_LOW_SPEED_LIMIT and timeout if the speed is too low for - // too long. - // set read timeout curl_easy_setopt(*curl, CURLOPT_TIMEOUT, http_params.timeout); // set connection timeout diff --git a/test/sql/copy/csv/test_csv_httpfs.test b/test/sql/copy/csv/test_csv_httpfs.test index d62683ef..76b92bd6 100644 --- a/test/sql/copy/csv/test_csv_httpfs.test +++ b/test/sql/copy/csv/test_csv_httpfs.test @@ -12,6 +12,11 @@ PRAGMA enable_verification statement ok pragma enable_logging('HTTP'); +foreach httpfs_implementation curl httplib + +statement ok +SET httpfs_client_implementation='${httpfs_implementation}'; + query II select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_role_type.parquet' order by all; ---- @@ -298,6 +303,12 @@ select * from read_csv_auto('https://csvbase.com/meripaterson/stock-exchanges'); 250 Africa Zimbabwe Victoria Falls Stock Exchange NULL 2020-11-01 251 Asia China Beijing Stock Exchange NULL 2021-12-27 +endloop + +statement error +SET httpfs_client_implementation='unkown'; +---- +:.*Unsupported option for httpfs_client_implementation.* #FIXME this test fails: file is nonexistent mode skip @@ -387,4 +398,4 @@ SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/ query I SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') ---- -1265 \ No newline at end of file +1265 diff --git a/test/sql/secret/secret_aws.test b/test/sql/secret/secret_aws.test index 529c4bc8..eef95039 100644 --- a/test/sql/secret/secret_aws.test +++ b/test/sql/secret/secret_aws.test @@ -18,6 +18,11 @@ require httpfs require parquet +foreach httpfs_implementation curl httplib + +statement ok +SET httpfs_client_implementation='${httpfs_implementation}'; + statement ok SET enable_logging=true @@ -32,7 +37,7 @@ set s3_region='${AWS_DEFAULT_REGION}' # Create some test data statement ok -CREATE SECRET s1 ( +CREATE or replace SECRET s1 ( TYPE AWS, KEY_ID '${AWS_ACCESS_KEY_ID}', SECRET '${AWS_SECRET_ACCESS_KEY}' @@ -44,4 +49,6 @@ copy (select 1 as a) to 's3://test-bucket/test-file.parquet' query I FROM "s3://test-bucket/test-file.parquet" ---- -1 \ No newline at end of file +1 + +endloop \ No newline at end of file From c15f72d44527be8edad5186fa2d9ac66dde7ce59 Mon Sep 17 00:00:00 2001 From: taniabogatsch <44262898+taniabogatsch@users.noreply.github.com> Date: Thu, 24 Jul 2025 17:25:23 +0200 Subject: [PATCH 40/78] apply patch --- extension/httpfs/create_secret_functions.cpp | 26 +++++++-------- extension/httpfs/crypto.cpp | 28 ++++++++++++---- extension/httpfs/httpfs.cpp | 32 ++++--------------- extension/httpfs/httpfs_extension.cpp | 24 +++++--------- .../include/create_secret_functions.hpp | 7 ++-- extension/httpfs/include/crypto.hpp | 8 ++--- .../httpfs/include/http_metadata_cache.hpp | 2 +- extension/httpfs/include/httpfs.hpp | 6 ++-- extension/httpfs/include/httpfs_extension.hpp | 2 +- 9 files changed, 62 insertions(+), 73 deletions(-) diff --git a/extension/httpfs/create_secret_functions.cpp b/extension/httpfs/create_secret_functions.cpp index c9ae0a7e..cf9997a6 100644 --- a/extension/httpfs/create_secret_functions.cpp +++ b/extension/httpfs/create_secret_functions.cpp @@ -1,15 +1,15 @@ #include "create_secret_functions.hpp" #include "s3fs.hpp" -#include "duckdb/main/extension_util.hpp" +#include "duckdb/main/extension/extension_loader.hpp" #include "duckdb/common/local_file_system.hpp" namespace duckdb { -void CreateS3SecretFunctions::Register(DatabaseInstance &instance) { - RegisterCreateSecretFunction(instance, "s3"); - RegisterCreateSecretFunction(instance, "aws"); - RegisterCreateSecretFunction(instance, "r2"); - RegisterCreateSecretFunction(instance, "gcs"); +void CreateS3SecretFunctions::Register(ExtensionLoader &loader) { + RegisterCreateSecretFunction(loader, "s3"); + RegisterCreateSecretFunction(loader, "aws"); + RegisterCreateSecretFunction(loader, "r2"); + RegisterCreateSecretFunction(loader, "gcs"); } static Value MapToStruct(const Value &map) { @@ -212,7 +212,7 @@ void CreateS3SecretFunctions::SetBaseNamedParams(CreateSecretFunction &function, } } -void CreateS3SecretFunctions::RegisterCreateSecretFunction(DatabaseInstance &instance, string type) { +void CreateS3SecretFunctions::RegisterCreateSecretFunction(ExtensionLoader &loader, string type) { // Register the new type SecretType secret_type; secret_type.name = type; @@ -220,31 +220,31 @@ void CreateS3SecretFunctions::RegisterCreateSecretFunction(DatabaseInstance &ins secret_type.default_provider = "config"; secret_type.extension = "httpfs"; - ExtensionUtil::RegisterSecretType(instance, secret_type); + loader.RegisterSecretType(secret_type); CreateSecretFunction from_empty_config_fun2 = {type, "config", CreateS3SecretFromConfig}; SetBaseNamedParams(from_empty_config_fun2, type); - ExtensionUtil::RegisterFunction(instance, from_empty_config_fun2); + loader.RegisterFunction(from_empty_config_fun2); } -void CreateBearerTokenFunctions::Register(DatabaseInstance &instance) { +void CreateBearerTokenFunctions::Register(ExtensionLoader &loader) { // HuggingFace secret SecretType secret_type_hf; secret_type_hf.name = HUGGINGFACE_TYPE; secret_type_hf.deserializer = KeyValueSecret::Deserialize; secret_type_hf.default_provider = "config"; secret_type_hf.extension = "httpfs"; - ExtensionUtil::RegisterSecretType(instance, secret_type_hf); + loader.RegisterSecretType(secret_type_hf); // Huggingface config provider CreateSecretFunction hf_config_fun = {HUGGINGFACE_TYPE, "config", CreateBearerSecretFromConfig}; hf_config_fun.named_parameters["token"] = LogicalType::VARCHAR; - ExtensionUtil::RegisterFunction(instance, hf_config_fun); + loader.RegisterFunction(hf_config_fun); // Huggingface credential_chain provider CreateSecretFunction hf_cred_fun = {HUGGINGFACE_TYPE, "credential_chain", CreateHuggingFaceSecretFromCredentialChain}; - ExtensionUtil::RegisterFunction(instance, hf_cred_fun); + loader.RegisterFunction(hf_cred_fun); } unique_ptr CreateBearerTokenFunctions::CreateSecretFunctionInternal(ClientContext &context, diff --git a/extension/httpfs/crypto.cpp b/extension/httpfs/crypto.cpp index 3a89ca5f..3e61afa9 100644 --- a/extension/httpfs/crypto.cpp +++ b/extension/httpfs/crypto.cpp @@ -21,11 +21,11 @@ AESStateSSL::~AESStateSSL() { EVP_CIPHER_CTX_free(context); } -const EVP_CIPHER *AESStateSSL::GetCipher(const string &key) { +const EVP_CIPHER *AESStateSSL::GetCipher(idx_t key_len) { switch (cipher) { case GCM: - switch (key.size()) { + switch (key_len) { case 16: return EVP_aes_128_gcm(); case 24: @@ -36,7 +36,7 @@ const EVP_CIPHER *AESStateSSL::GetCipher(const string &key) { throw InternalException("Invalid AES key length"); } case CTR: - switch (key.size()) { + switch (key_len) { case 16: return EVP_aes_128_ctr(); case 24: @@ -57,20 +57,34 @@ void AESStateSSL::GenerateRandomData(data_ptr_t data, idx_t len) { RAND_bytes(data, len); } -void AESStateSSL::InitializeEncryption(const_data_ptr_t iv, idx_t iv_len, const string *key) { +void AESStateSSL::InitializeEncryption(const_data_ptr_t iv, idx_t iv_len, const_data_ptr_t key, idx_t key_len, const_data_ptr_t aad, idx_t aad_len) { mode = ENCRYPT; - if (1 != EVP_EncryptInit_ex(context, GetCipher(*key), NULL, const_data_ptr_cast(key->data()), iv)) { + if (1 != EVP_EncryptInit_ex(context, GetCipher(key_len), NULL, key, iv)) { throw InternalException("EncryptInit failed"); } + + int len; + if (aad_len > 0){ + if (!EVP_DecryptUpdate(context, NULL, &len, aad, aad_len)) { + throw InternalException("Setting Additional Authenticated Data failed"); + } + } } -void AESStateSSL::InitializeDecryption(const_data_ptr_t iv, idx_t iv_len, const string *key) { +void AESStateSSL::InitializeDecryption(const_data_ptr_t iv, idx_t iv_len, const_data_ptr_t key, idx_t key_len, const_data_ptr_t aad, idx_t aad_len) { mode = DECRYPT; - if (1 != EVP_DecryptInit_ex(context, GetCipher(*key), NULL, const_data_ptr_cast(key->data()), iv)) { + if (1 != EVP_DecryptInit_ex(context, GetCipher(key_len), NULL, key, iv)) { throw InternalException("DecryptInit failed"); } + + int len; + if (aad_len > 0){ + if (!EVP_DecryptUpdate(context, NULL, &len, aad, aad_len)) { + throw InternalException("Setting Additional Authenticated Data failed"); + } + } } size_t AESStateSSL::Process(const_data_ptr_t in, idx_t in_len, data_ptr_t out, idx_t out_len) { diff --git a/extension/httpfs/httpfs.cpp b/extension/httpfs/httpfs.cpp index 20f85a69..50b7ae9c 100644 --- a/extension/httpfs/httpfs.cpp +++ b/extension/httpfs/httpfs.cpp @@ -7,6 +7,7 @@ #include "duckdb/common/http_util.hpp" #include "duckdb/common/thread.hpp" #include "duckdb/common/types/hash.hpp" +#include "duckdb/common/types/time.hpp" #include "duckdb/function/scalar/strftime_format.hpp" #include "duckdb/logging/file_system_logger.hpp" #include "duckdb/main/client_context.hpp" @@ -259,19 +260,6 @@ unique_ptr HTTPFileSystem::GetRangeRequest(FileHandle &handle, str return response; } -void TimestampToTimeT(timestamp_t timestamp, time_t &result) { - auto components = Timestamp::GetComponents(timestamp); - struct tm tm {}; - tm.tm_year = components.year - 1900; - tm.tm_mon = components.month - 1; - tm.tm_mday = components.day; - tm.tm_hour = components.hour; - tm.tm_min = components.minute; - tm.tm_sec = components.second; - tm.tm_isdst = 0; - result = mktime(&tm); -} - HTTPFileHandle::HTTPFileHandle(FileSystem &fs, const OpenFileInfo &file, FileOpenFlags flags, unique_ptr params_p) : FileHandle(fs, file.path, flags), params(std::move(params_p)), http_params(params->Cast()), @@ -282,7 +270,7 @@ HTTPFileHandle::HTTPFileHandle(FileSystem &fs, const OpenFileInfo &file, FileOpe auto &info = file.extended_info->options; auto lm_entry = info.find("last_modified"); if (lm_entry != info.end()) { - TimestampToTimeT(lm_entry->second.GetValue(), last_modified); + last_modified = lm_entry->second.GetValue(); } auto etag_entry = info.find("etag"); if (etag_entry != info.end()) { @@ -462,7 +450,7 @@ int64_t HTTPFileSystem::GetFileSize(FileHandle &handle) { return sfh.length; } -time_t HTTPFileSystem::GetLastModifiedTime(FileHandle &handle) { +timestamp_t HTTPFileSystem::GetLastModifiedTime(FileHandle &handle) { auto &sfh = handle.Cast(); return sfh.last_modified; } @@ -545,20 +533,14 @@ void HTTPFileHandle::FullDownload(HTTPFileSystem &hfs, bool &should_write_cache) } } -bool HTTPFileSystem::TryParseLastModifiedTime(const string ×tamp, time_t &result) { +bool HTTPFileSystem::TryParseLastModifiedTime(const string ×tamp, timestamp_t &result) { StrpTimeFormat::ParseResult parse_result; if (!StrpTimeFormat::TryParse("%a, %d %h %Y %T %Z", timestamp, parse_result)) { return false; } - struct tm tm {}; - tm.tm_year = parse_result.data[0] - 1900; - tm.tm_mon = parse_result.data[1] - 1; - tm.tm_mday = parse_result.data[2]; - tm.tm_hour = parse_result.data[3]; - tm.tm_min = parse_result.data[4]; - tm.tm_sec = parse_result.data[5]; - tm.tm_isdst = 0; - result = mktime(&tm); + if (!parse_result.TryToTimestamp(result)) { + return false; + } return true; } diff --git a/extension/httpfs/httpfs_extension.cpp b/extension/httpfs/httpfs_extension.cpp index 392d249a..15a90ba9 100644 --- a/extension/httpfs/httpfs_extension.cpp +++ b/extension/httpfs/httpfs_extension.cpp @@ -1,5 +1,3 @@ -#define DUCKDB_EXTENSION_MAIN - #include "httpfs_extension.hpp" #include "create_secret_functions.hpp" @@ -31,7 +29,8 @@ static void SetHttpfsClientImplementation(DBConfig &config, const string &value) "`default` are currently supported"); } -static void LoadInternal(DatabaseInstance &instance) { +static void LoadInternal(ExtensionLoader &loader) { + auto &instance = loader.GetDatabaseInstance(); auto &fs = instance.GetFileSystem(); fs.RegisterSubSystem(make_uniq()); @@ -98,16 +97,16 @@ static void LoadInternal(DatabaseInstance &instance) { auto provider = make_uniq(config); provider->SetAll(); - CreateS3SecretFunctions::Register(instance); - CreateBearerTokenFunctions::Register(instance); + CreateS3SecretFunctions::Register(loader); + CreateBearerTokenFunctions::Register(loader); #ifdef OVERRIDE_ENCRYPTION_UTILS // set pointer to OpenSSL encryption state config.encryption_util = make_shared_ptr(); #endif // OVERRIDE_ENCRYPTION_UTILS } -void HttpfsExtension::Load(DuckDB &db) { - LoadInternal(*db.instance); +void HttpfsExtension::Load(ExtensionLoader &loader) { + LoadInternal(loader); } std::string HttpfsExtension::Name() { return "httpfs"; @@ -125,15 +124,8 @@ std::string HttpfsExtension::Version() const { extern "C" { -DUCKDB_EXTENSION_API void httpfs_init(duckdb::DatabaseInstance &db) { - LoadInternal(db); +DUCKDB_CPP_EXTENSION_ENTRY(httpfs, loader) { + duckdb::LoadInternal(loader); } -DUCKDB_EXTENSION_API const char *httpfs_version() { - return duckdb::DuckDB::LibraryVersion(); -} } - -#ifndef DUCKDB_EXTENSION_MAIN -#error DUCKDB_EXTENSION_MAIN not defined -#endif diff --git a/extension/httpfs/include/create_secret_functions.hpp b/extension/httpfs/include/create_secret_functions.hpp index 54b7566d..bd3bc4a3 100644 --- a/extension/httpfs/include/create_secret_functions.hpp +++ b/extension/httpfs/include/create_secret_functions.hpp @@ -8,11 +8,12 @@ struct S3AuthParams; class CreateSecretFunction; class BaseSecret; struct SecretEntry; +class ExtensionLoader; struct CreateS3SecretFunctions { public: //! Register all CreateSecretFunctions - static void Register(DatabaseInstance &instance); + static void Register(ExtensionLoader &loader); //! Secret refreshing mechanisms static CreateSecretInput GenerateRefreshSecretInfo(const SecretEntry &secret_entry, Value &refresh_info); @@ -30,7 +31,7 @@ struct CreateS3SecretFunctions { //! Helper function to set named params of secret function static void SetBaseNamedParams(CreateSecretFunction &function, string &type); //! Helper function to create secret types s3/r2/gcs - static void RegisterCreateSecretFunction(DatabaseInstance &instance, string type); + static void RegisterCreateSecretFunction(ExtensionLoader &loader, string type); }; struct CreateBearerTokenFunctions { @@ -38,7 +39,7 @@ struct CreateBearerTokenFunctions { static constexpr const char *HUGGINGFACE_TYPE = "huggingface"; //! Register all CreateSecretFunctions - static void Register(DatabaseInstance &instance); + static void Register(ExtensionLoader &loader); protected: //! Internal function to create bearer token diff --git a/extension/httpfs/include/crypto.hpp b/extension/httpfs/include/crypto.hpp index f819356f..1e142df3 100644 --- a/extension/httpfs/include/crypto.hpp +++ b/extension/httpfs/include/crypto.hpp @@ -29,13 +29,13 @@ class DUCKDB_EXTENSION_API AESStateSSL : public duckdb::EncryptionState { ~AESStateSSL() override; public: - void InitializeEncryption(const_data_ptr_t iv, idx_t iv_len, const std::string *key) override; - void InitializeDecryption(const_data_ptr_t iv, idx_t iv_len, const std::string *key) override; + void InitializeEncryption(const_data_ptr_t iv, idx_t iv_len, const_data_ptr_t key, idx_t key_len, const_data_ptr_t aad, idx_t aad_len) override; + void InitializeDecryption(const_data_ptr_t iv, idx_t iv_len, const_data_ptr_t key, idx_t key_len, const_data_ptr_t aad, idx_t aad_len) override; size_t Process(const_data_ptr_t in, idx_t in_len, data_ptr_t out, idx_t out_len) override; size_t Finalize(data_ptr_t out, idx_t out_len, data_ptr_t tag, idx_t tag_len) override; void GenerateRandomData(data_ptr_t data, idx_t len) override; - const EVP_CIPHER *GetCipher(const string &key); + const EVP_CIPHER *GetCipher(idx_t key_len); size_t FinalizeGCM(data_ptr_t out, idx_t out_len, data_ptr_t tag, idx_t tag_len); private: @@ -53,7 +53,7 @@ class DUCKDB_EXTENSION_API AESStateSSLFactory : public duckdb::EncryptionUtil { explicit AESStateSSLFactory() { } - duckdb::shared_ptr CreateEncryptionState(const std::string *key = nullptr) const override { + duckdb::shared_ptr CreateEncryptionState(duckdb::const_data_ptr_t key = nullptr, duckdb::idx_t key_len = 0) const override { return duckdb::make_shared_ptr(); } diff --git a/extension/httpfs/include/http_metadata_cache.hpp b/extension/httpfs/include/http_metadata_cache.hpp index 8fc7909c..83a05cce 100644 --- a/extension/httpfs/include/http_metadata_cache.hpp +++ b/extension/httpfs/include/http_metadata_cache.hpp @@ -17,7 +17,7 @@ namespace duckdb { struct HTTPMetadataCacheEntry { idx_t length; - time_t last_modified; + timestamp_t last_modified; string etag; }; diff --git a/extension/httpfs/include/httpfs.hpp b/extension/httpfs/include/httpfs.hpp index 62067d46..55c74cc4 100644 --- a/extension/httpfs/include/httpfs.hpp +++ b/extension/httpfs/include/httpfs.hpp @@ -46,7 +46,7 @@ class HTTPFileHandle : public FileHandle { // File handle info FileOpenFlags flags; idx_t length; - time_t last_modified; + timestamp_t last_modified; string etag; bool force_full_download; bool initialized = false; @@ -92,7 +92,7 @@ class HTTPFileHandle : public FileHandle { class HTTPFileSystem : public FileSystem { public: - static bool TryParseLastModifiedTime(const string ×tamp, time_t &result); + static bool TryParseLastModifiedTime(const string ×tamp, timestamp_t &result); vector Glob(const string &path, FileOpener *opener = nullptr) override { return {path}; // FIXME @@ -121,7 +121,7 @@ class HTTPFileSystem : public FileSystem { int64_t Write(FileHandle &handle, void *buffer, int64_t nr_bytes) override; void FileSync(FileHandle &handle) override; int64_t GetFileSize(FileHandle &handle) override; - time_t GetLastModifiedTime(FileHandle &handle) override; + timestamp_t GetLastModifiedTime(FileHandle &handle) override; string GetVersionTag(FileHandle &handle) override; bool FileExists(const string &filename, optional_ptr opener) override; void Seek(FileHandle &handle, idx_t location) override; diff --git a/extension/httpfs/include/httpfs_extension.hpp b/extension/httpfs/include/httpfs_extension.hpp index 3c4f3a11..eeca2c9c 100644 --- a/extension/httpfs/include/httpfs_extension.hpp +++ b/extension/httpfs/include/httpfs_extension.hpp @@ -6,7 +6,7 @@ namespace duckdb { class HttpfsExtension : public Extension { public: - void Load(DuckDB &db) override; + void Load(ExtensionLoader &loader) override; std::string Name() override; std::string Version() const override; }; From d5eaa2ad27b80687dac4d04b11bf787bdb6e347d Mon Sep 17 00:00:00 2001 From: taniabogatsch <44262898+taniabogatsch@users.noreply.github.com> Date: Fri, 25 Jul 2025 11:16:35 +0200 Subject: [PATCH 41/78] rebase against main --- .github/workflows/MainDistributionPipeline.yml | 4 ++-- duckdb | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index 7d0588bc..e3c397ba 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -17,7 +17,7 @@ jobs: uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main with: extension_name: httpfs - duckdb_version: v1.3.2 + duckdb_version: main ci_tools_version: main @@ -28,6 +28,6 @@ jobs: secrets: inherit with: extension_name: httpfs - duckdb_version: v1.3.2 + duckdb_version: main ci_tools_version: main deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }} diff --git a/duckdb b/duckdb index 0b83e5d2..0769417a 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit 0b83e5d2f68bc02dfefde74b846bd039f078affa +Subproject commit 0769417a29fc710aa2d9535fc281c1723def3f6e From f951aeab77dd395da51da68c88c58aee93f43488 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Tue, 12 Aug 2025 17:41:40 +0200 Subject: [PATCH 42/78] try to fix windows. may require duckdb main fix --- .github/workflows/MinioTests.yml | 2 +- extension/httpfs/httpfs_httplib_client.cpp | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/MinioTests.yml b/.github/workflows/MinioTests.yml index 467b5b3a..eb8ea6da 100644 --- a/.github/workflows/MinioTests.yml +++ b/.github/workflows/MinioTests.yml @@ -10,7 +10,7 @@ defaults: jobs: minio-tests: name: Minio Tests - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 env: S3_TEST_SERVER_AVAILABLE: 1 AWS_DEFAULT_REGION: eu-west-1 diff --git a/extension/httpfs/httpfs_httplib_client.cpp b/extension/httpfs/httpfs_httplib_client.cpp index 3bf5a64f..9c7457dc 100644 --- a/extension/httpfs/httpfs_httplib_client.cpp +++ b/extension/httpfs/httpfs_httplib_client.cpp @@ -1,8 +1,11 @@ #include "httpfs_client.hpp" #include "http_state.hpp" - -#define CPPHTTPLIB_OPENSSL_SUPPORT +#if defined(_WIN32) +#include "duckdb/common/windows.hpp" +#endif #include "httplib.hpp" +#define CPPHTTPLIB_OPENSSL_SUPPORT + namespace duckdb { From 9faf71a458fa867edbb634a11a61471810258b12 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Tue, 12 Aug 2025 18:16:36 +0200 Subject: [PATCH 43/78] fix build --- extension/httpfs/httpfs_httplib_client.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/extension/httpfs/httpfs_httplib_client.cpp b/extension/httpfs/httpfs_httplib_client.cpp index 53fb7150..3fe7a94a 100644 --- a/extension/httpfs/httpfs_httplib_client.cpp +++ b/extension/httpfs/httpfs_httplib_client.cpp @@ -1,10 +1,11 @@ #include "httpfs_client.hpp" #include "http_state.hpp" +#define CPPHTTPLIB_OPENSSL_SUPPORT #if defined(_WIN32) #include "duckdb/common/windows.hpp" #endif #include "httplib.hpp" -#define CPPHTTPLIB_OPENSSL_SUPPORT + namespace duckdb { From 6295fbfa8cbd2c7354ab78698721268a2fcc0518 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Wed, 13 Aug 2025 10:26:55 +0200 Subject: [PATCH 44/78] remove windows requirement --- extension/httpfs/httpfs_httplib_client.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/extension/httpfs/httpfs_httplib_client.cpp b/extension/httpfs/httpfs_httplib_client.cpp index 3fe7a94a..fc26cc45 100644 --- a/extension/httpfs/httpfs_httplib_client.cpp +++ b/extension/httpfs/httpfs_httplib_client.cpp @@ -1,9 +1,6 @@ #include "httpfs_client.hpp" #include "http_state.hpp" #define CPPHTTPLIB_OPENSSL_SUPPORT -#if defined(_WIN32) -#include "duckdb/common/windows.hpp" -#endif #include "httplib.hpp" From d3faccf0b5a61686a21f2e4eb04f4b3cf24266bf Mon Sep 17 00:00:00 2001 From: Tmonster Date: Wed, 13 Aug 2025 11:02:42 +0200 Subject: [PATCH 45/78] forcing CI again From f3a8e2e8c637519429d0ea1696919d42bea74e14 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 15 Aug 2025 13:50:04 +0200 Subject: [PATCH 46/78] add override to test with httlib with windows fixes --- .github/workflows/MainDistributionPipeline.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index e3c397ba..a4351c44 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -17,6 +17,8 @@ jobs: uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main with: extension_name: httpfs + override_repository: Tmonster/duckdb + override_ref: 8a495cef3c25f3d48223018f6b6f95b0bf83299e duckdb_version: main ci_tools_version: main From d930fc1b70c8fc0d9123d146dc9c47794e5b167e Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 15 Aug 2025 14:02:02 +0200 Subject: [PATCH 47/78] remove unnecesry things --- .github/workflows/MainDistributionPipeline.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index a4351c44..e3c397ba 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -17,8 +17,6 @@ jobs: uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main with: extension_name: httpfs - override_repository: Tmonster/duckdb - override_ref: 8a495cef3c25f3d48223018f6b6f95b0bf83299e duckdb_version: main ci_tools_version: main From 34143d17390b32bd4443debe4ed38838860bfbc1 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 15 Aug 2025 14:02:51 +0200 Subject: [PATCH 48/78] update submodule pointer to see if it passes tests --- duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/duckdb b/duckdb index 0769417a..25f7e6fd 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit 0769417a29fc710aa2d9535fc281c1723def3f6e +Subproject commit 25f7e6fd0e7243948202c5f850f388d88be49150 From 43fcbcdedf91016453082eb4a154f957ceb8befd Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 15 Aug 2025 15:12:47 +0200 Subject: [PATCH 49/78] update submodule again --- duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/duckdb b/duckdb index 25f7e6fd..d53321d9 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit 25f7e6fd0e7243948202c5f850f388d88be49150 +Subproject commit d53321d994af5a6c9e26908faf60e82cd6c5410a From 5ce32755fb44d46b5367cbe8cfa1701137405860 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 15 Aug 2025 15:31:58 +0200 Subject: [PATCH 50/78] some changes to the workflow --- .github/workflows/MainDistributionPipeline.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index e3c397ba..05d70ac6 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -17,7 +17,9 @@ jobs: uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main with: extension_name: httpfs - duckdb_version: main + override_duckdb_repository: tmonster/duckdb-httpfs + set_caller_as_duckdb: true + duckdb_version: d53321d994af5a6c9e26908faf60e82cd6c5410a ci_tools_version: main From 87f97976c2288b16b56577e92902947d269b4d8a Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 15 Aug 2025 15:32:42 +0200 Subject: [PATCH 51/78] update duckdb ref --- duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/duckdb b/duckdb index d53321d9..981f07e8 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit d53321d994af5a6c9e26908faf60e82cd6c5410a +Subproject commit 981f07e87f132eb3d46841e19d30a8b7e28c34b9 From 6a4403370517a8a54da08280c58e06f2d7a92f0f Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 15 Aug 2025 15:33:13 +0200 Subject: [PATCH 52/78] update ref --- .github/workflows/MainDistributionPipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index 05d70ac6..a428bcab 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -19,7 +19,7 @@ jobs: extension_name: httpfs override_duckdb_repository: tmonster/duckdb-httpfs set_caller_as_duckdb: true - duckdb_version: d53321d994af5a6c9e26908faf60e82cd6c5410a + duckdb_version: 981f07e87f132eb3d46841e19d30a8b7e28c34b9 ci_tools_version: main From 860855221966db36439ae41ddcd87adecd49ec98 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 15 Aug 2025 15:36:14 +0200 Subject: [PATCH 53/78] fix the workflow --- .github/workflows/MainDistributionPipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index a428bcab..36c45301 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -17,7 +17,7 @@ jobs: uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main with: extension_name: httpfs - override_duckdb_repository: tmonster/duckdb-httpfs + override_duckdb_repository: https://github.com/tmonster/duckdb-httpfs.git set_caller_as_duckdb: true duckdb_version: 981f07e87f132eb3d46841e19d30a8b7e28c34b9 ci_tools_version: main From 07dbd119f01b1e86ef1ef4a84b821937ce80bce4 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 15 Aug 2025 15:40:40 +0200 Subject: [PATCH 54/78] do not set duckdb to ref of calling worfklow --- .github/workflows/MainDistributionPipeline.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index 36c45301..dbfd0535 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -18,7 +18,6 @@ jobs: with: extension_name: httpfs override_duckdb_repository: https://github.com/tmonster/duckdb-httpfs.git - set_caller_as_duckdb: true duckdb_version: 981f07e87f132eb3d46841e19d30a8b7e28c34b9 ci_tools_version: main From f77d92c134e82796b10159ef8db539fb48326d9a Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 15 Aug 2025 15:45:20 +0200 Subject: [PATCH 55/78] needs to be my duckdb repo --- .github/workflows/MainDistributionPipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index dbfd0535..5adaea8f 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -17,7 +17,7 @@ jobs: uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main with: extension_name: httpfs - override_duckdb_repository: https://github.com/tmonster/duckdb-httpfs.git + override_duckdb_repository: https://github.com/tmonster/duckdb.git duckdb_version: 981f07e87f132eb3d46841e19d30a8b7e28c34b9 ci_tools_version: main From ab630dc5b392de8864d1840ec3ec20c02b3c6476 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 15 Aug 2025 17:09:30 +0200 Subject: [PATCH 56/78] fix up main distribution pipeline --- .github/workflows/MainDistributionPipeline.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index 5adaea8f..7d0588bc 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -17,8 +17,7 @@ jobs: uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main with: extension_name: httpfs - override_duckdb_repository: https://github.com/tmonster/duckdb.git - duckdb_version: 981f07e87f132eb3d46841e19d30a8b7e28c34b9 + duckdb_version: v1.3.2 ci_tools_version: main @@ -29,6 +28,6 @@ jobs: secrets: inherit with: extension_name: httpfs - duckdb_version: main + duckdb_version: v1.3.2 ci_tools_version: main deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }} From 661c861580d810d8f458afe83ecfc3d51395dc6d Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 15 Aug 2025 17:10:35 +0200 Subject: [PATCH 57/78] use duckdb v1.4 --- .github/workflows/MainDistributionPipeline.yml | 4 ++-- duckdb | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index 7d0588bc..e3c397ba 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -17,7 +17,7 @@ jobs: uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main with: extension_name: httpfs - duckdb_version: v1.3.2 + duckdb_version: main ci_tools_version: main @@ -28,6 +28,6 @@ jobs: secrets: inherit with: extension_name: httpfs - duckdb_version: v1.3.2 + duckdb_version: main ci_tools_version: main deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }} diff --git a/duckdb b/duckdb index 981f07e8..2ed9bf88 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit 981f07e87f132eb3d46841e19d30a8b7e28c34b9 +Subproject commit 2ed9bf887f61a0ac226ab8c8f1164601d985d607 From 4e5a9858cd8c22bcb032782f167d3301c855e7cb Mon Sep 17 00:00:00 2001 From: Tmonster Date: Mon, 18 Aug 2025 13:55:44 +0200 Subject: [PATCH 58/78] bump duckdb to include build fix --- duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/duckdb b/duckdb index 2ed9bf88..2e67bfb6 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit 2ed9bf887f61a0ac226ab8c8f1164601d985d607 +Subproject commit 2e67bfb6159b833e00c34e091f2ae02cefb87b75 From 13972ba8151adca2d3f9f62d765d54dcf7948881 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Wed, 20 Aug 2025 11:28:19 +0200 Subject: [PATCH 59/78] if there is no content handler, do not call it --- extension/httpfs/httpfs_curl_client.cpp | 5 ++++- .../curl_client/test_load_other_extensions.test | 17 +++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 test/sql/curl_client/test_load_other_extensions.test diff --git a/extension/httpfs/httpfs_curl_client.cpp b/extension/httpfs/httpfs_curl_client.cpp index a1705a86..3c45c80a 100644 --- a/extension/httpfs/httpfs_curl_client.cpp +++ b/extension/httpfs/httpfs_curl_client.cpp @@ -206,7 +206,10 @@ class HTTPFSCurlClient : public HTTPClient { } const char* data = request_info->body.c_str(); - info.content_handler(const_data_ptr_cast(data), bytes_received); + if (info.content_handler) { + info.content_handler(const_data_ptr_cast(data), bytes_received); + } + return TransformResponseCurl(res); } diff --git a/test/sql/curl_client/test_load_other_extensions.test b/test/sql/curl_client/test_load_other_extensions.test new file mode 100644 index 00000000..5ce322f5 --- /dev/null +++ b/test/sql/curl_client/test_load_other_extensions.test @@ -0,0 +1,17 @@ +# name: test/sql/curl_client/test_load_other_extensions.test +# description: when using the curl client, test loading other extensions +# group: [httpfs_client] + +require httpfs + +# Do not ignore 'HTTP' error messages! +set ignore_error_messages + +statement ok +SET httpfs_client_implementation='curl'; + +statement error +INSTALL non_existent_extension; +---- +:.*HTTP Error: Failed to download extension "non_existent_extension" at URL.* + \ No newline at end of file From 0bb4efa2d84fc077290338d51e008cc1f59f42b5 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Wed, 20 Aug 2025 12:26:55 +0200 Subject: [PATCH 60/78] change expected error message to pass tests --- test/sql/curl_client/test_load_other_extensions.test | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/sql/curl_client/test_load_other_extensions.test b/test/sql/curl_client/test_load_other_extensions.test index 5ce322f5..6a4fa517 100644 --- a/test/sql/curl_client/test_load_other_extensions.test +++ b/test/sql/curl_client/test_load_other_extensions.test @@ -13,5 +13,4 @@ SET httpfs_client_implementation='curl'; statement error INSTALL non_existent_extension; ---- -:.*HTTP Error: Failed to download extension "non_existent_extension" at URL.* - \ No newline at end of file +:.*HTTP Error: Failed to download extension.* From c1760075e74c91fd187990d32afa328e1f616bc0 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Wed, 3 Sep 2025 15:40:58 +0200 Subject: [PATCH 61/78] add CALL enable_profiling --- test/sql/copy/csv/test_csv_httpfs.test | 2 +- test/sql/test_headers_parsed.test | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/sql/copy/csv/test_csv_httpfs.test b/test/sql/copy/csv/test_csv_httpfs.test index 76b92bd6..d461416b 100644 --- a/test/sql/copy/csv/test_csv_httpfs.test +++ b/test/sql/copy/csv/test_csv_httpfs.test @@ -10,7 +10,7 @@ statement ok PRAGMA enable_verification statement ok -pragma enable_logging('HTTP'); +CALL enable_logging('HTTP'); foreach httpfs_implementation curl httplib diff --git a/test/sql/test_headers_parsed.test b/test/sql/test_headers_parsed.test index 317ec820..d0e76bd8 100644 --- a/test/sql/test_headers_parsed.test +++ b/test/sql/test_headers_parsed.test @@ -10,7 +10,7 @@ statement ok SET httpfs_client_implementation='curl'; statement ok -pragma enable_logging('HTTP'); +CALL enable_logging('HTTP'); query II select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_role_type.parquet' order by all; From c5ad14f13cd2c9bef5383b64aec084452dbae99d Mon Sep 17 00:00:00 2001 From: Tmonster Date: Wed, 3 Sep 2025 19:11:11 +0200 Subject: [PATCH 62/78] update duckdb submodule pointer as well so we can use 'call enable_logging(http)' --- duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/duckdb b/duckdb index 2e67bfb6..948db06d 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit 2e67bfb6159b833e00c34e091f2ae02cefb87b75 +Subproject commit 948db06dc94366a36b98eba26ecb7d8398a75c84 From 7e8f90b21475bb616e0f75a354061ce6a90c386a Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 4 Sep 2025 10:47:48 +0200 Subject: [PATCH 63/78] bump duckdb submodule --- duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/duckdb b/duckdb index 948db06d..605eaf76 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit 948db06dc94366a36b98eba26ecb7d8398a75c84 +Subproject commit 605eaf76be154d5c6d38353f96b23c031795572d From db5ce0f437cc77960fb2843e009228d2abc6bad9 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 4 Sep 2025 10:48:52 +0200 Subject: [PATCH 64/78] bump submodule and apply patches --- extension/httpfs/crypto.cpp | 36 ++- extension/httpfs/include/crypto.hpp | 10 +- test/sql/copy/csv/test_csv_httpfs.test | 401 ------------------------- 3 files changed, 25 insertions(+), 422 deletions(-) delete mode 100644 test/sql/copy/csv/test_csv_httpfs.test diff --git a/extension/httpfs/crypto.cpp b/extension/httpfs/crypto.cpp index 8578f27e..18878ed9 100644 --- a/extension/httpfs/crypto.cpp +++ b/extension/httpfs/crypto.cpp @@ -7,6 +7,10 @@ #define CPPHTTPLIB_OPENSSL_SUPPORT +#include "include/crypto.hpp" + +#include "re2/re2.h" + #include #include #include @@ -19,7 +23,7 @@ namespace duckdb { -AESStateSSL::AESStateSSL(const std::string *key) : context(EVP_CIPHER_CTX_new()) { +AESStateSSL::AESStateSSL(EncryptionTypes::CipherType cipher_p, const std::string *key) : EncryptionState(cipher_p), context(EVP_CIPHER_CTX_new()), cipher(cipher_p) { if (!(context)) { throw InternalException("AES GCM failed with initializing context"); } @@ -33,7 +37,7 @@ AESStateSSL::~AESStateSSL() { const EVP_CIPHER *AESStateSSL::GetCipher(idx_t key_len) { switch (cipher) { - case GCM: + case EncryptionTypes::GCM: switch (key_len) { case 16: return EVP_aes_128_gcm(); @@ -44,18 +48,18 @@ const EVP_CIPHER *AESStateSSL::GetCipher(idx_t key_len) { default: throw InternalException("Invalid AES key length"); } - case CTR: + case EncryptionTypes::CBC: { switch (key_len) { case 16: - return EVP_aes_128_ctr(); + return EVP_aes_128_cbc(); case 24: - return EVP_aes_192_ctr(); + return EVP_aes_192_cbc(); case 32: - return EVP_aes_256_ctr(); + return EVP_aes_256_cbc(); default: throw InternalException("Invalid AES key length"); } - + } default: throw duckdb::InternalException("Invalid Encryption/Decryption Cipher: %d", static_cast(cipher)); } @@ -67,7 +71,7 @@ void AESStateSSL::GenerateRandomData(data_ptr_t data, idx_t len) { } void AESStateSSL::InitializeEncryption(const_data_ptr_t iv, idx_t iv_len, const_data_ptr_t key, idx_t key_len, const_data_ptr_t aad, idx_t aad_len) { - mode = ENCRYPT; + mode = EncryptionTypes::ENCRYPT; if (1 != EVP_EncryptInit_ex(context, GetCipher(key_len), NULL, key, iv)) { throw InternalException("EncryptInit failed"); @@ -82,7 +86,7 @@ void AESStateSSL::InitializeEncryption(const_data_ptr_t iv, idx_t iv_len, const_ } void AESStateSSL::InitializeDecryption(const_data_ptr_t iv, idx_t iv_len, const_data_ptr_t key, idx_t key_len, const_data_ptr_t aad, idx_t aad_len) { - mode = DECRYPT; + mode = EncryptionTypes::DECRYPT; if (1 != EVP_DecryptInit_ex(context, GetCipher(key_len), NULL, key, iv)) { throw InternalException("DecryptInit failed"); @@ -99,14 +103,14 @@ void AESStateSSL::InitializeDecryption(const_data_ptr_t iv, idx_t iv_len, const_ size_t AESStateSSL::Process(const_data_ptr_t in, idx_t in_len, data_ptr_t out, idx_t out_len) { switch (mode) { - case ENCRYPT: + case EncryptionTypes::ENCRYPT: if (1 != EVP_EncryptUpdate(context, data_ptr_cast(out), reinterpret_cast(&out_len), const_data_ptr_cast(in), (int)in_len)) { throw InternalException("EncryptUpdate failed"); } break; - case DECRYPT: + case EncryptionTypes::DECRYPT: if (1 != EVP_DecryptUpdate(context, data_ptr_cast(out), reinterpret_cast(&out_len), const_data_ptr_cast(in), (int)in_len)) { @@ -126,7 +130,7 @@ size_t AESStateSSL::FinalizeGCM(data_ptr_t out, idx_t out_len, data_ptr_t tag, i auto text_len = out_len; switch (mode) { - case ENCRYPT: { + case EncryptionTypes::ENCRYPT: { if (1 != EVP_EncryptFinal_ex(context, data_ptr_cast(out) + out_len, reinterpret_cast(&out_len))) { throw InternalException("EncryptFinal failed"); } @@ -138,7 +142,7 @@ size_t AESStateSSL::FinalizeGCM(data_ptr_t out, idx_t out_len, data_ptr_t tag, i } return text_len; } - case DECRYPT: { + case EncryptionTypes::DECRYPT: { // Set expected tag value if (!EVP_CIPHER_CTX_ctrl(context, EVP_CTRL_GCM_SET_TAG, tag_len, tag)) { throw InternalException("Finalizing tag failed"); @@ -161,14 +165,14 @@ size_t AESStateSSL::FinalizeGCM(data_ptr_t out, idx_t out_len, data_ptr_t tag, i size_t AESStateSSL::Finalize(data_ptr_t out, idx_t out_len, data_ptr_t tag, idx_t tag_len) { - if (cipher == GCM) { + if (cipher == EncryptionTypes::GCM) { return FinalizeGCM(out, out_len, tag, tag_len); } auto text_len = out_len; switch (mode) { - case ENCRYPT: { + case EncryptionTypes::ENCRYPT: { if (1 != EVP_EncryptFinal_ex(context, data_ptr_cast(out) + out_len, reinterpret_cast(&out_len))) { throw InternalException("EncryptFinal failed"); } @@ -176,7 +180,7 @@ size_t AESStateSSL::Finalize(data_ptr_t out, idx_t out_len, data_ptr_t tag, idx_ return text_len += out_len; } - case DECRYPT: { + case EncryptionTypes::DECRYPT: { // EVP_DecryptFinal() will return an error code if final block is not correctly formatted. int ret = EVP_DecryptFinal_ex(context, data_ptr_cast(out) + out_len, reinterpret_cast(&out_len)); text_len += out_len; diff --git a/extension/httpfs/include/crypto.hpp b/extension/httpfs/include/crypto.hpp index 1e142df3..72100efe 100644 --- a/extension/httpfs/include/crypto.hpp +++ b/extension/httpfs/include/crypto.hpp @@ -25,7 +25,7 @@ void hex256(hash_bytes &in, hash_str &out); class DUCKDB_EXTENSION_API AESStateSSL : public duckdb::EncryptionState { public: - explicit AESStateSSL(const std::string *key = nullptr); + explicit AESStateSSL(duckdb::EncryptionTypes::CipherType cipher_p, const std::string *key = nullptr); ~AESStateSSL() override; public: @@ -40,8 +40,8 @@ class DUCKDB_EXTENSION_API AESStateSSL : public duckdb::EncryptionState { private: EVP_CIPHER_CTX *context; - Mode mode; - Cipher cipher = GCM; + duckdb::EncryptionTypes::Mode mode; + duckdb::EncryptionTypes::CipherType cipher; }; } // namespace duckdb @@ -53,8 +53,8 @@ class DUCKDB_EXTENSION_API AESStateSSLFactory : public duckdb::EncryptionUtil { explicit AESStateSSLFactory() { } - duckdb::shared_ptr CreateEncryptionState(duckdb::const_data_ptr_t key = nullptr, duckdb::idx_t key_len = 0) const override { - return duckdb::make_shared_ptr(); + duckdb::shared_ptr CreateEncryptionState(duckdb::EncryptionTypes::CipherType cipher_p, duckdb::const_data_ptr_t key = nullptr, duckdb::idx_t key_len = 0) const override { + return duckdb::make_shared_ptr(cipher_p); } ~AESStateSSLFactory() override { diff --git a/test/sql/copy/csv/test_csv_httpfs.test b/test/sql/copy/csv/test_csv_httpfs.test deleted file mode 100644 index d461416b..00000000 --- a/test/sql/copy/csv/test_csv_httpfs.test +++ /dev/null @@ -1,401 +0,0 @@ -# name: test/sql/copy/csv/test_csv_httpfs.test -# description: This test triggers the http prefetch mechanism. -# group: [csv] - -require httpfs - -require parquet - -statement ok -PRAGMA enable_verification - -statement ok -CALL enable_logging('HTTP'); - -foreach httpfs_implementation curl httplib - -statement ok -SET httpfs_client_implementation='${httpfs_implementation}'; - -query II -select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_role_type.parquet' order by all; ----- -1 actor -2 actress -3 producer -4 writer -5 cinematographer -6 composer -7 costume designer -8 director -9 editor -10 miscellaneous crew -11 production designer -12 guest - -query IIIIIIIIIIIIIIIIII -select * from 'https://github.com/duckdb/duckdb/raw/9cf66f950dde0173e1a863a7659b3ecf11bf3978/data/csv/customer.csv'; ----- -1 AAAAAAAABAAAAAAA 980124 7135 32946 2452238 2452208 Mr. Javier Lewis Y 9 12 1936 CHILE NULL Javier.Lewis@VFAxlnZEvOx.org 2452508 -2 AAAAAAAACAAAAAAA 819667 1461 31655 2452318 2452288 Dr. Amy Moses Y 9 4 1966 TOGO NULL Amy.Moses@Ovk9KjHH.com 2452318 -3 AAAAAAAADAAAAAAA 1473522 6247 48572 2449130 2449100 Miss Latisha Hamilton Y 18 9 1979 NIUE NULL Latisha.Hamilton@V.com 2452313 -4 AAAAAAAAEAAAAAAA 1703214 3986 39558 2450030 2450000 Dr. Michael White Y 7 6 1983 MEXICO NULL Michael.White@i.org 2452361 -5 AAAAAAAAFAAAAAAA 953372 4470 36368 2449438 2449408 Sir Robert Moran N 8 5 1956 FIJI NULL Robert.Moran@Hh.edu 2452469 -6 AAAAAAAAGAAAAAAA 213219 6374 27082 2451883 2451853 Ms. Brunilda Sharp Y 4 12 1925 SURINAME NULL Brunilda.Sharp@T3pylZEUQjm.org 2452430 -7 AAAAAAAAHAAAAAAA 68377 3219 44814 2451438 2451408 Ms. Fonda Wiles N 24 4 1985 GAMBIA NULL Fonda.Wiles@S9KnyEtz9hv.org 2452360 -8 AAAAAAAAIAAAAAAA 1215897 2471 16598 2449406 2449376 Sir Ollie Shipman N 26 12 1938 KOREA, REPUBLIC OF NULL Ollie.Shipman@be.org 2452334 -9 AAAAAAAAJAAAAAAA 1168667 1404 49388 2452275 2452245 Sir Karl Gilbert N 26 10 1966 MONTSERRAT NULL Karl.Gilbert@Crg5KyP2IxX9C4d6.edu 2452454 -10 AAAAAAAAKAAAAAAA 1207553 5143 19580 2451353 2451323 Ms. Albert Brunson N 15 10 1973 JORDAN NULL Albert.Brunson@62.com 2452641 - -#Add test for 5924 -query IIIIII -select * from read_csv_auto('https://csvbase.com/meripaterson/stock-exchanges'); ----- -1 Africa Lesotho HYBSE NULL 2019-03-25 -2 Asia Kazakhstan Astana International Financial Centre AIXK 2018-11-18 -3 Africa South Africa ZAR X ZARX 2018-11-18 -4 South America Argentina Bolsas y Mercados Argentinos NULL 2018-04-02 -5 North America United States of America Delaware Board of Trade NULL 2018-04-02 -6 Australia & Oceania Australia Chi-X Asia Pacific NULL 2018-04-02 -7 Australia & Oceania Australia Chi-X Australia CHIA 2018-04-02 -8 South America Mexico BIVA BIVA 2018-01-06 -9 Africa South Africa Equity Express Securities Exchange NULL 2017-12-11 -10 Africa South Africa Cape Town Stock Exchange NULL 2021-10-22 -11 North America Curacao Dutch Caribbean Securities Exchange DCSX 2017-09-12 -12 North America Canada NEO NEOE 2017-09-06 -13 North America Canada Canadian Securities Exchange CNSX 2017-09-06 -14 Western Europe Germany XETRA XETR 2017-08-21 -15 Western Europe France Euronext Paris XPAR 2017-08-19 -16 Western Europe United Kingdom Euronext London XLDN 2017-08-19 -17 Eastern Europe Albania Tirana Stock Exchange XTIR 2017-08-16 -18 Africa Algeria Bourse d'Alger XALG 2017-08-16 -19 Africa Angola BODIVA XBDV 2017-08-16 -20 South America Argentina Buenos Aires Stock Exchange XBUE 2017-08-16 -21 South America Argentina Mercado Abierto Electrónico XMAB 2017-08-16 -22 Eastern Europe Armenia Armenia Securities Exchange XARM 2020-07-29 -23 Australia & Oceania Australia Australian Securities Exchange XASX 2017-08-16 -24 Australia & Oceania Australia Block Event BLEV 2017-08-16 -25 Australia & Oceania Australia IR Plus Securities Exchange SIMV 2017-08-16 -26 Australia & Oceania Australia National Stock Exchange of Australia XNEC 2017-08-16 -27 Australia & Oceania Australia Sydney Stock Exchange APXL 2017-08-16 -28 Western Europe Austria Wiener Börse XWBO 2017-08-16 -29 Asia Azerbaijan Baku Stock Exchange BSEX 2017-08-16 -30 North America Bahamas Bahamas International Securities Exchange XBAA 2017-08-16 -31 Middle East Bahrain Bahrain Bourse XBAH 2017-08-16 -32 Asia Bangladesh Chittagong Stock Exchange XCHG 2017-08-16 -33 Asia Bangladesh Dhaka Stock Exchange XDHA 2017-08-16 -34 North America Barbados Barbados Stock Exchange XBAB 2017-08-16 -35 Eastern Europe Belarus Belarusian Currency and Stock Exchange BCSE 2017-08-16 -36 Western Europe Belgium Euronext Brussels XBRU 2017-08-16 -37 North America Bermuda Bermuda Stock Exchange XBDA 2017-08-16 -38 Asia Bhutan Royal Securities Exchange of Bhutan NULL 2017-08-16 -39 South America Bolivia Bolsa de Valores de Bolivia XBOL 2017-08-16 -40 Eastern Europe Bosnia and Herzegovina Banja Luka Stock Exchange XBLB 2017-08-16 -41 Eastern Europe Bosnia and Herzegovina Sarajevo Stock Exchange XSSE 2017-08-16 -42 Africa Botswana Botswana Stock Exchange XBOT 2017-08-16 -43 South America Brazil B3 - Brasil Bolsa Balcão BVMF 2017-08-16 -44 South America Brazil Bolsa de Valores Minab - Espírito Santo BOVM 2017-08-16 -45 Eastern Europe Bulgaria Bulgarian Stock Exchange XBUL 2017-08-16 -46 Asia Cambodia Cambodia Securities Exchange XCSX 2017-08-16 -47 North America Canada Montreal Exchange XMOD 2017-08-16 -48 North America Canada Nasdaq Canada XCSD 2017-08-16 -49 North America Canada TMX TMXS 2017-08-16 -50 North America Canada Toronto Stock Exchange XTSE 2017-08-16 -51 Africa Cape Verde Bolsa de Valores de Cabo Verde XBVC 2017-08-16 -52 North America Cayman Islands Cayman Islands Stock Exchange XCAY 2017-08-16 -53 Western Europe Channel Islands Channel Islands Stock Exchange NULL 2017-08-16 -54 South America Chile Santiago Electronic Stock Exchange XBCL 2017-08-16 -55 South America Chile Santiago Stock Exchange XSGO 2017-08-16 -56 South America Chile Valparaiso Stock Exchange BOVA 2017-08-16 -57 Asia China Shanghai Stock Exchange XSHG 2017-08-16 -58 Asia China Shenzhen Stock Exchange XSHE 2017-08-16 -59 South America Colombia Bolsa de Valores de Colombia XBOG 2017-08-16 -60 North America Costa Rica Bolsa Nacional de Valores de Costa Rica XBNV 2017-08-16 -61 Eastern Europe Croatia Zagreb Stock Exchange XZAG 2017-08-16 -62 Eastern Europe Cyprus Cyprus Stock Exchange XCYS 2017-08-16 -63 Eastern Europe Czech Republic Prague Stock Exchange XPRAG 2017-08-16 -64 Eastern Europe Czech Republic RM-System Czech Stock Exchange XRMZ 2017-08-16 -65 Western Europe Denmark Nasdaq Copenhagen XCSE 2017-08-16 -66 North America Dominican Republic Bolsa de Valores de la República Dominicana XBVR 2017-08-16 -67 South America Ecuador Bolsa de Valores de Guayaquil XGUA 2017-08-16 -68 South America Ecuador Bolsa de Valores de Quito XQUI 2017-08-16 -69 Africa Egypt Egyptian Exchange XCAI 2017-08-16 -70 Africa Egypt Nilex NILX 2017-08-16 -71 North America El Salvador Bolsa de Valores de El Salvador XSVA 2017-08-16 -72 Eastern Europe Estonia Tallinn Stock Exchange XTAL 2017-08-16 -73 Australia & Oceania Fiji South Pacific Stock Exchange XSPS 2017-08-16 -74 Western Europe Finland Nasdaq Helsinki XHEL 2017-08-16 -75 Africa Gabon Bourse Régionale des Valeurs Mobilières d'Afrique Centrale NULL 2017-08-16 -76 Asia Georgia Georgian Stock Exchange XGSE 2017-08-16 -77 Western Europe Germany Börse Berlin XBER 2017-08-16 -78 Western Europe Germany Börse Düsseldorf XDUS 2017-08-16 -79 Western Europe Germany Börse Hamburg & Hannover HAMB 2017-08-16 -80 Western Europe Germany Börse München XMUN 2017-08-16 -81 Western Europe Germany Börse Stuttgart XSTU 2017-08-16 -82 Western Europe Germany Deutsche Börse Group XFRA 2017-08-16 -83 Western Europe Germany Eurex XEUR 2017-08-16 -84 Western Europe Germany Tradegate Exchange TGAT 2017-08-16 -85 Africa Ghana Ghana Stock Exchange XGHA 2017-08-16 -86 Western Europe Gibraltar Gibraltar Stock Exchange GSXL 2017-08-16 -87 Western Europe Greece Athens Stock Exchange ASEX 2017-08-16 -88 North America Guatemala Bolsa Nacional de Valores XGTG 2017-08-16 -89 Western Europe Guernsey International Stock Exchange XCIE 2017-08-16 -90 South America Guyana Guyana Stock Exchange GSCI 2017-08-16 -91 North America Haiti Haitian Stock Exchange NULL 2017-08-16 -92 North America Honduras Bolsa Centroamericana de Valores XBCV 2017-08-16 -93 Asia Hong Kong Hong Kong Growth Enterprise Market XGEM 2017-08-16 -94 Asia Hong Kong Hong Kong Stock Exchange XHKG 2017-08-16 -95 Eastern Europe Hungary Budapest Stock Exchange XBUD 2017-08-16 -96 Western Europe Iceland Nasdaq Iceland XICE 2017-08-16 -97 Asia India Ahmedabad Stock Exchange NULL 2017-08-16 -98 Asia India Bangalore Stock Exchange XBAN 2017-08-16 -99 Asia India Bombay Stock Exchange XBOM 2017-08-16 -100 Asia India BSE SME BSME 2017-08-16 -101 Asia India Calcutta Stock Exchange XCAL 2017-08-16 -102 Asia India Cochin Stock Exchange NULL 2017-08-16 -103 Asia India Coimbatore Stock Exchange NULL 2017-08-16 -104 Asia India Delhi Stock Exchange XDES 2017-08-16 -105 Asia India Inter-Connected Stock Exchange of India ISEX 2017-08-16 -106 Asia India Ludhiana Stock and Capital NULL 2017-08-16 -107 Asia India Metropolitan Stock Exchange NULL 2017-08-16 -108 Asia India National Stock Exchange of India XNSE 2017-08-16 -109 Asia India OTC Exchange of India OTCX 2017-08-16 -110 Asia India Pune Stock Exchange NULL 2017-08-16 -111 Asia India Saurashtra Kutch Stock Exchange NULL 2017-08-16 -112 Asia India United Stock Exchange of India XUSE 2017-08-16 -113 Asia India Vadodara Stock Exchange NULL 2017-08-16 -114 Asia Indonesia Indonesia Stock Exchange XIDX 2017-08-16 -115 Asia Iran Iran Fara Bourse NULL 2017-08-16 -116 Middle East Iran Tehran Stock Exchange XTEH 2017-08-16 -117 Middle East Iraq Iraq Stock Exchange XIQS 2017-08-16 -118 Western Europe Ireland Irish Stock Exchange XDUB 2017-08-16 -119 Middle East Israel Tel Aviv Stock Exchange XTAE 2017-08-16 -120 Western Europe Italy Borsa Italiana XMIL 2017-08-16 -121 Africa Ivory Coast Bourse Regionale des Valeurs Mobilieres XBRV 2017-08-16 -122 North America Jamaica Jamaica Stock Exchange XJAM 2017-08-16 -123 Asia Japan Chi-X Japan CHIJ 2017-08-16 -124 Asia Japan Daiwa Securities DRCT 2017-08-16 -125 Asia Japan Fukuoka Stock Exchange XFKA 2017-08-16 -126 Asia Japan Japan Exchange Group XJPX 2017-08-16 -127 Asia Japan Nagoya Stock Exchange XNGO 2017-08-16 -128 Asia Japan Sapporo Securities Exchange XSAP 2017-08-16 -129 Asia Japan SBI Japannext SBIJ 2017-08-16 -130 Middle East Jordan Amman Stock Exchange XAMM 2017-08-16 -131 Asia Kazakhstan Kazakhstan Stock Exchange XKAZ 2017-08-16 -132 Africa Kenya Nairobi Stock Exchange XNAI 2017-08-16 -133 Middle East Kuwait Kuwait Stock Exchange XKUW 2017-08-16 -134 Asia Kyrgyzstan Kyrgyz Stock Exchange XKSE 2017-08-16 -135 Asia Laos Lao Securities Exchange XLAO 2017-08-16 -136 Eastern Europe Latvia Riga Stock Exchange XRIS 2017-08-16 -137 Middle East Lebanon Beirut Stock Exchange XBEY 2017-08-16 -138 Africa Lesotho Maseru Securities Exchange NULL 2017-08-16 -139 Eastern Europe Lithuania Vilnius Stock Exchange XLIT 2017-08-16 -140 Western Europe Luxembourg Luxembourg Stock Exchange XLUX 2017-08-16 -141 Eastern Europe Macedonia Macedonian Stock Exchange XMAE 2017-08-16 -142 Africa Malawi Malawi Stock Exchange XMSW 2017-08-16 -143 Asia Malaysia Bursa Malaysia XKLS 2017-08-16 -144 Asia Maldives Maldives Stock Exchange MALX 2017-08-16 -145 Western Europe Malta Malta Stock Exchange XMAL 2017-08-16 -146 Western Europe Malta Malta Stock Exchange Prospects PROS 2017-08-16 -147 Africa Mauritius Stock Exchange of Mauritius XMAU 2017-08-16 -148 North America Mexico Bolsa Mexicana de Valores XMEX 2017-08-16 -149 Western Europe Moldova Moldova Stock Exchange XMOL 2017-08-16 -150 Asia Mongolia Mongolian Stock Exchange XULA 2017-08-16 -151 Eastern Europe Montenegro Montenegro Stock Exchange XMNX 2017-08-16 -152 Africa Morocco Casablanca Stock Exchange XCAS 2017-08-16 -153 Africa Mozambique Bolsa de Valores de Mozambique XBVM 2017-08-16 -154 Asia Myanmar Myanmar Securities Exchange Centre NULL 2017-08-16 -155 Asia Myanmar Yangon Stock Exchange NULL 2017-08-16 -156 Africa Namibia Namibian Stock Exchange XNAM 2017-08-16 -157 Asia Nepal Nepal Stock Exchange XNEP 2017-08-16 -158 Western Europe Netherlands Euronext Amsterdam XAMS 2017-08-16 -159 Western Europe Netherlands Nxchange XNXC 2017-08-16 -160 Australia & Oceania New Zealand New Zealand Exchange XNZE 2017-08-16 -161 North America Nicaragua Bolsa de Valores de Nicaragua XMAN 2017-08-16 -162 Africa Nigeria Nigerian Stock Exchange XNSA 2017-08-16 -163 Western Europe Norway Oslo Stock Exchange XOSL 2017-08-16 -164 Middle East Oman Muscat Securities Market XMUS 2017-08-16 -165 Asia Pakistan Lahore Stock Exchange NULL 2017-08-16 -166 Asia Pakistan Pakistan Stock Exchange XKAR 2017-08-16 -167 Middle East Palestine Palestine Securities Exchange XPAE 2017-08-16 -168 North America Panama Bolsa de Valores de Panama XPTY 2017-08-16 -169 Australia & Oceania Papua New Guinea Port Moresby Stock Exchange XPOM 2017-08-16 -170 South America Paraguay Bolsa de Valores & Productos de Asuncíon XVPA 2017-08-16 -171 South America Peru Bolsa de Valores de Lima XLIM 2017-08-16 -172 Asia Philippines Philippine Stock Exchange XPHS 2017-08-16 -173 Eastern Europe Poland NewConnect XNCO 2017-08-16 -174 Eastern Europe Poland Warsaw Stock Exchange XWAR 2017-08-16 -175 Western Europe Portugal Euronext Lisbon XLIS 2017-08-16 -176 Western Europe Portugal OPEX OPEX 2017-08-16 -177 Middle East Qatar Qatar Stock Exchange DSMD 2017-08-16 -178 Eastern Europe Romania Bucharest Stock Exchange XRAS 2017-08-16 -179 Eastern Europe Russia Moscow Exchange MISX 2017-08-16 -180 Eastern Europe Russia Saint Petersburg Stock Exchange XPET 2017-08-16 -181 Eastern Europe Russia Siberian Exchange XSIB 2017-08-16 -182 Africa Rwanda Rwanda Stock Exchange RSEX 2017-08-16 -183 North America Saint Kitts and Nevis Eastern Caribbean Securities Exchange XECS 2017-08-16 -184 Middle East Saudi Arabia Saudi Stock Exchange XSAU 2017-08-16 -185 Eastern Europe Serbia Belgrade Stock Exchange XBEL 2017-08-16 -186 Africa Seychelles Seychelles Securities Exchange (Trop-X) TRPX 2017-08-16 -187 Asia Singapore Singapore Exchange XSES 2017-08-16 -188 Eastern Europe Slovakia Bratislava Stock Exchange XBRA 2017-08-16 -189 Eastern Europe Slovenia Ljubljana Stock Exchange XLJU 2017-08-16 -190 Africa Somalia Somali Stock Exchange NULL 2017-08-16 -191 Africa South Africa A2X Markets A2XX 2017-08-16 -192 Africa South Africa Johannesburg Stock Exchange XJSE 2017-08-16 -193 Asia South Korea Korea New Exchange XKON 2017-08-16 -194 Asia South Korea Korea Stock Exchange XKRX 2017-08-16 -195 Asia South Korea KOSDAQ Securities Exchange XKOS 2017-08-16 -196 Western Europe Spain Bolsa de Bilbao XBIL 2017-08-16 -197 Western Europe Spain Bolsa de Madrid XMAD 2017-08-16 -198 Western Europe Spain Bolsa de Valencia XVAL 2017-08-16 -199 Western Europe Spain Borsa de Barcelona XBAR 2017-08-16 -200 Western Europe Spain Latibex XLAT 2017-08-16 -201 Asia Sri Lanka Colombo Stock Exchange XCOL 2017-08-16 -202 Africa Sudan Khartoum Stock Exchange XKHA 2017-08-16 -203 Africa Swaziland Swaziland Stock Exchange XSWA 2017-08-16 -204 Western Europe Sweden Aktietorget XSAT 2017-08-16 -205 Western Europe Sweden Nasdaq Stockholm XSTO 2017-08-16 -206 Western Europe Sweden Nordic Growth Market XNGM 2017-08-16 -207 Western Europe Switzerland Berne eXchange XBRN 2017-08-16 -208 Western Europe Switzerland SIX Swiss Exchange XSWX 2017-08-16 -209 Middle East Syria Damascus Securities Exchange XDSE 2017-08-16 -210 Asia Taiwan Taipei Exchange ROCO 2017-08-16 -211 Asia Taiwan Taiwan Stock Exchange XTAI 2017-08-16 -212 Africa Tanzania Dar-es-Salaam Stock Exchange XDAR 2017-08-16 -213 Asia Thailand Stock Exchange of Thailand XBKK 2017-08-16 -214 North America Trinidad and Tobago Trinidad and Tobago Stock Exchange XTRN 2017-08-16 -215 Africa Tunisia Bourse de Tunis XTUN 2017-08-16 -216 Eastern Europe Turkey Borsa İstanbul XIST 2017-08-16 -217 Africa Uganda Uganda Securities Exchange XUGA 2017-08-16 -218 Eastern Europe Ukraine East European Stock Exchange EESE 2017-08-16 -219 Eastern Europe Ukraine PFTS Ukraine Stock Exchange PFTS 2017-08-16 -220 Eastern Europe Ukraine Stock Exchange Perspectiva SEPE 2017-08-16 -221 Eastern Europe Ukraine Ukrainian Exchange UKEX 2017-08-16 -222 Middle East United Arab Emirates Abu Dhabi Securities Market XADS 2017-08-16 -223 Middle East United Arab Emirates Dubai Financial Market XDFM 2017-08-16 -224 Middle East United Arab Emirates Nasdaq Dubai DIFX 2017-08-16 -225 Western Europe United Kingdom Aquis Exchange AQXE 2017-08-16 -226 Western Europe United Kingdom Asset Match AMPX 2017-08-16 -227 Western Europe United Kingdom London Stock Exchange XLON 2017-08-16 -228 Western Europe United Kingdom NEX NEXS 2017-08-16 -229 Western Europe United Kingdom Turquoise TRQX 2017-08-16 -230 North America United States of America Bats BYX Exchange BYXD 2017-08-16 -231 North America United States of America Bats EDGA Exchange EDGA 2017-08-16 -232 North America United States of America Bats US BATS 2017-08-16 -233 North America United States of America BatsEDGX Exchange EDGX 2017-08-16 -234 North America United States of America Chicago Stock Exchange XCHI 2017-08-16 -235 North America United States of America Investors Exchange IEXG 2017-08-16 -236 North America United States of America NASDAQ XNAS 2017-08-16 -237 North America United States of America New York Stock Exchange XNYS 2017-08-16 -238 North America United States of America North American Derivatives Exchange NADEX HEGX 2017-08-16 -239 South America Uruguay Bolsa de Valores de Montevideo XMNT 2017-08-16 -240 South America Uruguay Bolsa Electronica de Valores de Uruguay BVUR 2017-08-16 -241 Asia Uzbekistan Tashkent Stock Exchange XSTE 2017-08-16 -242 Asia Vietnam Hanoi Stock Exchange HSTC 2017-08-16 -243 Asia Vietnam Ho Chi Minh Stock Exchange XSTC 2017-08-16 -244 Africa Zambia Lusaka Stock Exchange XLUS 2017-08-16 -245 Africa Zimbabwe Zimbabwe Stock Exchange XZIM 2017-08-16 -246 Eastern Europe Albania Albanian Securities Exchange XALS 2019-11-17 -247 North America United States of America Long-Term Stock Exchange LTSE 2020-09-14 -248 North America United States of America Miami International Securities Exchange MIHI 2020-09-24 -249 North America United States of America Members' Exchange NULL 2020-09-24 -250 Africa Zimbabwe Victoria Falls Stock Exchange NULL 2020-11-01 -251 Asia China Beijing Stock Exchange NULL 2021-12-27 - -endloop - -statement error -SET httpfs_client_implementation='unkown'; ----- -:.*Unsupported option for httpfs_client_implementation.* - -#FIXME this test fails: file is nonexistent -mode skip - -query IIIIII rowsort -SELECT * from read_csv_auto('https://github.com/duckdb/duckdb/raw/9cf66f950dde0173e1a863a7659b3ecf11bf3978/data/csv/customer.csv'); ----- -2020 Allemagne Germany 26.1 53196.069 200601.2 -2020 Autriche Austria 18.0 4723.5 26215.8 -2020 Belgique Belgium 28.999999999999996 9436.1 32553.0 -2020 Bulgarie Bulgaria 11.600000000000001 1124.1 9698.7 -2020 Chypre Cyprus 0.0 0.0 1627.6 -2020 Croatie Croatia 16.3 1094.8 6726.3 -2020 Danemark Denmark 11.600000000000001 1579.0 13601.4 -2020 Espagne Spain 17.4 14211.7 81512.9 -2020 Estonie Estonia 8.5 241.1 2827.3 -2020 Finlande Finland 2.8000000000000003 692.3 24674.4 -2020 France France 20.3 28278.9 139375.8 -2020 Grèce Greece 5.800000000000001 896.5 15401.9 -2020 Hongrie Hungary 30.5 5486.7 17872.4 -2020 Irlande Ireland 17.4 1968.477 11296.601 -2020 Italie Italy 29.2 33042.585 113119.475 -2020 Lettonie Latvia 8.200000000000001 323.605 3926.131 -2020 Lituanie Lithuania 10.7 584.104 5457.728 -2020 Luxembourg Luxembourg 16.5 623.165 3786.785 -2020 Malte Malta 0.0 0.0 547.5 -2020 Pays-Bas Netherlands 37.1 16588.314 44682.656 -2020 Pologne Poland 13.5 9323.205 69135.018 -2020 Portugal Portugal 11.1 1814.878 16354.725 -2020 Roumanie Romania 23.7 5626.161 23712.653 -2020 Royaume-Uni United Kingdom 32.4 39311.416 121414.483 -2020 République tchèque Czech Republic 21.4 5187.282 24263.896 -2020 Slovaquie Slovakia 25.0 2564.876 10248.401 -2020 Slovénie Slovenia 12.1 590.243 4861.315 -2020 Suède Sweden 1.5 475.195 31311.413 -2020 UE 28 Europe 28 22.5 238152.4 1056907.5 -2021 Allemagne Germany 26.760345686044435 51812.567 193616.957 -2021 Autriche Austria 18.720006775926056 4645.795 24817.272 -2021 Belgique Belgium 29.279402721103864 9088.083 31039.168 -2021 Bulgarie Bulgaria 12.368015142641884 1176.537 9512.739 -2021 Chypre Cyprus 0.0 0.0 1528.558 -2021 Croatie Croatia 17.10389029082304 1100.12 6431.987 -2021 Danemark Denmark 11.485631727184947 1508.152 13130.771 -2021 Espagne Spain 19.10173955663722 13815.0 72323.256 -2021 Estonie Estonia 8.988278645659518 245.094 2726.818 -2021 Finlande Finland 2.9937725178230212 694.288 23191.074 -2021 France France 20.649030024470434 26465.646 128168.955 -2021 Grèce Greece 7.580480506088059 1097.87 14482.855 -2021 Hongrie Hungary 32.344729318831554 5693.164 17601.52 -2021 Irlande Ireland 18.020604987495144 1953.468 10840.191 -2021 Italie Italy 30.86368769746751 31807.236 103057.147 -2021 Lettonie Latvia 8.502139837843602 322.927 3798.185 -2021 Lituanie Lithuania 11.029023816606903 582.797 5284.212 -2021 Luxembourg Luxembourg 17.282784281000467 564.365 3265.475 -2021 Malte Malta 0.0 0.0 499.875 -2021 Pays-Bas Netherlands 37.61392206122467 15896.316 42261.788 -2021 Pologne Poland 13.146720200313602 9235.656 70250.647 -2021 Portugal Portugal 11.437926753365227 1740.3 15215.17 -2021 Roumanie Romania 24.909638477223016 5846.885 23472.38 -2021 République tchèque Czech Republic 21.716683280446812 5158.445 23753.374 -2021 Slovaquie Slovakia 25.253930010417324 2427.134 9610.916 -2021 Slovénie Slovenia 13.141683407321874 582.024 4428.839 -2021 Suède Sweden 1.497679952802663 471.085 31454.317 -2021 UE 27 UE 27 21.894190365821018 193930.95399999994 885764.4460000001 - -query IIIIII rowsort res -SELECT * from read_csv('https://www.data.gouv.fr/fr/datasets/r/6d186965-f41b-41f3-9b23-88241cc6890c',DELIM=';',Columns={'annee_de_reference':'VARCHAR','pays':'VARCHAR','label_en':'VARCHAR','part_du_gaz_naturel_dans_la_consommation_finale_d_energie0':'VARCHAR','consommation_finale_de_gaz_naturel_mtep':'VARCHAR','consommation_finale_d_energie_totale_mtep':'VARCHAR'}); - - -query IIIIII rowsort res -SELECT * from read_csv('https://www.data.gouv.fr/fr/datasets/r/6d186965-f41b-41f3-9b23-88241cc6890c',DELIM=';',Columns={'annee_de_reference':'VARCHAR','pays':'VARCHAR','label_en':'VARCHAR','part_du_gaz_naturel_dans_la_consommation_finale_d_energie0':'VARCHAR','consommation_finale_de_gaz_naturel_mtep':'VARCHAR','consommation_finale_d_energie_totale_mtep':'VARCHAR'}); - - -# Give it a try to a request that returns length 0 -query I -SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') ----- -1265 - -# Give it a try to a request that returns length 0 -query I -SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') ----- -1265 - -# Give it a try to a request that returns length 0 -query I -SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') ----- -1265 From b67d9f9d2934d1c9a2cc4e2d60078965aecfc2be Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 4 Sep 2025 10:54:17 +0200 Subject: [PATCH 65/78] fix patch errors --- extension/httpfs/crypto.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/extension/httpfs/crypto.cpp b/extension/httpfs/crypto.cpp index 18878ed9..1a7066d4 100644 --- a/extension/httpfs/crypto.cpp +++ b/extension/httpfs/crypto.cpp @@ -37,7 +37,7 @@ AESStateSSL::~AESStateSSL() { const EVP_CIPHER *AESStateSSL::GetCipher(idx_t key_len) { switch (cipher) { - case EncryptionTypes::GCM: + case EncryptionTypes::GCM: { switch (key_len) { case 16: return EVP_aes_128_gcm(); @@ -48,6 +48,19 @@ const EVP_CIPHER *AESStateSSL::GetCipher(idx_t key_len) { default: throw InternalException("Invalid AES key length"); } + } + case EncryptionTypes::CTR: { + switch (key_len) { + case 16: + return EVP_aes_128_cbc(); + case 24: + return EVP_aes_192_cbc(); + case 32: + return EVP_aes_256_cbc(); + default: + throw InternalException("Invalid AES key length"); + } + } case EncryptionTypes::CBC: { switch (key_len) { case 16: From 7244ef67569215265bee25fbacf9399a849cd8bd Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 4 Sep 2025 10:56:55 +0200 Subject: [PATCH 66/78] Add back in csv test --- extension/httpfs/crypto.cpp | 6 +- test/sql/copy/csv/test_csv_httpfs.test | 401 +++++++++++++++++++++++++ 2 files changed, 404 insertions(+), 3 deletions(-) create mode 100644 test/sql/copy/csv/test_csv_httpfs.test diff --git a/extension/httpfs/crypto.cpp b/extension/httpfs/crypto.cpp index 1a7066d4..3681c9fa 100644 --- a/extension/httpfs/crypto.cpp +++ b/extension/httpfs/crypto.cpp @@ -52,11 +52,11 @@ const EVP_CIPHER *AESStateSSL::GetCipher(idx_t key_len) { case EncryptionTypes::CTR: { switch (key_len) { case 16: - return EVP_aes_128_cbc(); + return EVP_aes_128_ctr()(); case 24: - return EVP_aes_192_cbc(); + return EVP_aes_192_ctr(); case 32: - return EVP_aes_256_cbc(); + return EVP_aes_256_ctr(); default: throw InternalException("Invalid AES key length"); } diff --git a/test/sql/copy/csv/test_csv_httpfs.test b/test/sql/copy/csv/test_csv_httpfs.test new file mode 100644 index 00000000..d461416b --- /dev/null +++ b/test/sql/copy/csv/test_csv_httpfs.test @@ -0,0 +1,401 @@ +# name: test/sql/copy/csv/test_csv_httpfs.test +# description: This test triggers the http prefetch mechanism. +# group: [csv] + +require httpfs + +require parquet + +statement ok +PRAGMA enable_verification + +statement ok +CALL enable_logging('HTTP'); + +foreach httpfs_implementation curl httplib + +statement ok +SET httpfs_client_implementation='${httpfs_implementation}'; + +query II +select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_role_type.parquet' order by all; +---- +1 actor +2 actress +3 producer +4 writer +5 cinematographer +6 composer +7 costume designer +8 director +9 editor +10 miscellaneous crew +11 production designer +12 guest + +query IIIIIIIIIIIIIIIIII +select * from 'https://github.com/duckdb/duckdb/raw/9cf66f950dde0173e1a863a7659b3ecf11bf3978/data/csv/customer.csv'; +---- +1 AAAAAAAABAAAAAAA 980124 7135 32946 2452238 2452208 Mr. Javier Lewis Y 9 12 1936 CHILE NULL Javier.Lewis@VFAxlnZEvOx.org 2452508 +2 AAAAAAAACAAAAAAA 819667 1461 31655 2452318 2452288 Dr. Amy Moses Y 9 4 1966 TOGO NULL Amy.Moses@Ovk9KjHH.com 2452318 +3 AAAAAAAADAAAAAAA 1473522 6247 48572 2449130 2449100 Miss Latisha Hamilton Y 18 9 1979 NIUE NULL Latisha.Hamilton@V.com 2452313 +4 AAAAAAAAEAAAAAAA 1703214 3986 39558 2450030 2450000 Dr. Michael White Y 7 6 1983 MEXICO NULL Michael.White@i.org 2452361 +5 AAAAAAAAFAAAAAAA 953372 4470 36368 2449438 2449408 Sir Robert Moran N 8 5 1956 FIJI NULL Robert.Moran@Hh.edu 2452469 +6 AAAAAAAAGAAAAAAA 213219 6374 27082 2451883 2451853 Ms. Brunilda Sharp Y 4 12 1925 SURINAME NULL Brunilda.Sharp@T3pylZEUQjm.org 2452430 +7 AAAAAAAAHAAAAAAA 68377 3219 44814 2451438 2451408 Ms. Fonda Wiles N 24 4 1985 GAMBIA NULL Fonda.Wiles@S9KnyEtz9hv.org 2452360 +8 AAAAAAAAIAAAAAAA 1215897 2471 16598 2449406 2449376 Sir Ollie Shipman N 26 12 1938 KOREA, REPUBLIC OF NULL Ollie.Shipman@be.org 2452334 +9 AAAAAAAAJAAAAAAA 1168667 1404 49388 2452275 2452245 Sir Karl Gilbert N 26 10 1966 MONTSERRAT NULL Karl.Gilbert@Crg5KyP2IxX9C4d6.edu 2452454 +10 AAAAAAAAKAAAAAAA 1207553 5143 19580 2451353 2451323 Ms. Albert Brunson N 15 10 1973 JORDAN NULL Albert.Brunson@62.com 2452641 + +#Add test for 5924 +query IIIIII +select * from read_csv_auto('https://csvbase.com/meripaterson/stock-exchanges'); +---- +1 Africa Lesotho HYBSE NULL 2019-03-25 +2 Asia Kazakhstan Astana International Financial Centre AIXK 2018-11-18 +3 Africa South Africa ZAR X ZARX 2018-11-18 +4 South America Argentina Bolsas y Mercados Argentinos NULL 2018-04-02 +5 North America United States of America Delaware Board of Trade NULL 2018-04-02 +6 Australia & Oceania Australia Chi-X Asia Pacific NULL 2018-04-02 +7 Australia & Oceania Australia Chi-X Australia CHIA 2018-04-02 +8 South America Mexico BIVA BIVA 2018-01-06 +9 Africa South Africa Equity Express Securities Exchange NULL 2017-12-11 +10 Africa South Africa Cape Town Stock Exchange NULL 2021-10-22 +11 North America Curacao Dutch Caribbean Securities Exchange DCSX 2017-09-12 +12 North America Canada NEO NEOE 2017-09-06 +13 North America Canada Canadian Securities Exchange CNSX 2017-09-06 +14 Western Europe Germany XETRA XETR 2017-08-21 +15 Western Europe France Euronext Paris XPAR 2017-08-19 +16 Western Europe United Kingdom Euronext London XLDN 2017-08-19 +17 Eastern Europe Albania Tirana Stock Exchange XTIR 2017-08-16 +18 Africa Algeria Bourse d'Alger XALG 2017-08-16 +19 Africa Angola BODIVA XBDV 2017-08-16 +20 South America Argentina Buenos Aires Stock Exchange XBUE 2017-08-16 +21 South America Argentina Mercado Abierto Electrónico XMAB 2017-08-16 +22 Eastern Europe Armenia Armenia Securities Exchange XARM 2020-07-29 +23 Australia & Oceania Australia Australian Securities Exchange XASX 2017-08-16 +24 Australia & Oceania Australia Block Event BLEV 2017-08-16 +25 Australia & Oceania Australia IR Plus Securities Exchange SIMV 2017-08-16 +26 Australia & Oceania Australia National Stock Exchange of Australia XNEC 2017-08-16 +27 Australia & Oceania Australia Sydney Stock Exchange APXL 2017-08-16 +28 Western Europe Austria Wiener Börse XWBO 2017-08-16 +29 Asia Azerbaijan Baku Stock Exchange BSEX 2017-08-16 +30 North America Bahamas Bahamas International Securities Exchange XBAA 2017-08-16 +31 Middle East Bahrain Bahrain Bourse XBAH 2017-08-16 +32 Asia Bangladesh Chittagong Stock Exchange XCHG 2017-08-16 +33 Asia Bangladesh Dhaka Stock Exchange XDHA 2017-08-16 +34 North America Barbados Barbados Stock Exchange XBAB 2017-08-16 +35 Eastern Europe Belarus Belarusian Currency and Stock Exchange BCSE 2017-08-16 +36 Western Europe Belgium Euronext Brussels XBRU 2017-08-16 +37 North America Bermuda Bermuda Stock Exchange XBDA 2017-08-16 +38 Asia Bhutan Royal Securities Exchange of Bhutan NULL 2017-08-16 +39 South America Bolivia Bolsa de Valores de Bolivia XBOL 2017-08-16 +40 Eastern Europe Bosnia and Herzegovina Banja Luka Stock Exchange XBLB 2017-08-16 +41 Eastern Europe Bosnia and Herzegovina Sarajevo Stock Exchange XSSE 2017-08-16 +42 Africa Botswana Botswana Stock Exchange XBOT 2017-08-16 +43 South America Brazil B3 - Brasil Bolsa Balcão BVMF 2017-08-16 +44 South America Brazil Bolsa de Valores Minab - Espírito Santo BOVM 2017-08-16 +45 Eastern Europe Bulgaria Bulgarian Stock Exchange XBUL 2017-08-16 +46 Asia Cambodia Cambodia Securities Exchange XCSX 2017-08-16 +47 North America Canada Montreal Exchange XMOD 2017-08-16 +48 North America Canada Nasdaq Canada XCSD 2017-08-16 +49 North America Canada TMX TMXS 2017-08-16 +50 North America Canada Toronto Stock Exchange XTSE 2017-08-16 +51 Africa Cape Verde Bolsa de Valores de Cabo Verde XBVC 2017-08-16 +52 North America Cayman Islands Cayman Islands Stock Exchange XCAY 2017-08-16 +53 Western Europe Channel Islands Channel Islands Stock Exchange NULL 2017-08-16 +54 South America Chile Santiago Electronic Stock Exchange XBCL 2017-08-16 +55 South America Chile Santiago Stock Exchange XSGO 2017-08-16 +56 South America Chile Valparaiso Stock Exchange BOVA 2017-08-16 +57 Asia China Shanghai Stock Exchange XSHG 2017-08-16 +58 Asia China Shenzhen Stock Exchange XSHE 2017-08-16 +59 South America Colombia Bolsa de Valores de Colombia XBOG 2017-08-16 +60 North America Costa Rica Bolsa Nacional de Valores de Costa Rica XBNV 2017-08-16 +61 Eastern Europe Croatia Zagreb Stock Exchange XZAG 2017-08-16 +62 Eastern Europe Cyprus Cyprus Stock Exchange XCYS 2017-08-16 +63 Eastern Europe Czech Republic Prague Stock Exchange XPRAG 2017-08-16 +64 Eastern Europe Czech Republic RM-System Czech Stock Exchange XRMZ 2017-08-16 +65 Western Europe Denmark Nasdaq Copenhagen XCSE 2017-08-16 +66 North America Dominican Republic Bolsa de Valores de la República Dominicana XBVR 2017-08-16 +67 South America Ecuador Bolsa de Valores de Guayaquil XGUA 2017-08-16 +68 South America Ecuador Bolsa de Valores de Quito XQUI 2017-08-16 +69 Africa Egypt Egyptian Exchange XCAI 2017-08-16 +70 Africa Egypt Nilex NILX 2017-08-16 +71 North America El Salvador Bolsa de Valores de El Salvador XSVA 2017-08-16 +72 Eastern Europe Estonia Tallinn Stock Exchange XTAL 2017-08-16 +73 Australia & Oceania Fiji South Pacific Stock Exchange XSPS 2017-08-16 +74 Western Europe Finland Nasdaq Helsinki XHEL 2017-08-16 +75 Africa Gabon Bourse Régionale des Valeurs Mobilières d'Afrique Centrale NULL 2017-08-16 +76 Asia Georgia Georgian Stock Exchange XGSE 2017-08-16 +77 Western Europe Germany Börse Berlin XBER 2017-08-16 +78 Western Europe Germany Börse Düsseldorf XDUS 2017-08-16 +79 Western Europe Germany Börse Hamburg & Hannover HAMB 2017-08-16 +80 Western Europe Germany Börse München XMUN 2017-08-16 +81 Western Europe Germany Börse Stuttgart XSTU 2017-08-16 +82 Western Europe Germany Deutsche Börse Group XFRA 2017-08-16 +83 Western Europe Germany Eurex XEUR 2017-08-16 +84 Western Europe Germany Tradegate Exchange TGAT 2017-08-16 +85 Africa Ghana Ghana Stock Exchange XGHA 2017-08-16 +86 Western Europe Gibraltar Gibraltar Stock Exchange GSXL 2017-08-16 +87 Western Europe Greece Athens Stock Exchange ASEX 2017-08-16 +88 North America Guatemala Bolsa Nacional de Valores XGTG 2017-08-16 +89 Western Europe Guernsey International Stock Exchange XCIE 2017-08-16 +90 South America Guyana Guyana Stock Exchange GSCI 2017-08-16 +91 North America Haiti Haitian Stock Exchange NULL 2017-08-16 +92 North America Honduras Bolsa Centroamericana de Valores XBCV 2017-08-16 +93 Asia Hong Kong Hong Kong Growth Enterprise Market XGEM 2017-08-16 +94 Asia Hong Kong Hong Kong Stock Exchange XHKG 2017-08-16 +95 Eastern Europe Hungary Budapest Stock Exchange XBUD 2017-08-16 +96 Western Europe Iceland Nasdaq Iceland XICE 2017-08-16 +97 Asia India Ahmedabad Stock Exchange NULL 2017-08-16 +98 Asia India Bangalore Stock Exchange XBAN 2017-08-16 +99 Asia India Bombay Stock Exchange XBOM 2017-08-16 +100 Asia India BSE SME BSME 2017-08-16 +101 Asia India Calcutta Stock Exchange XCAL 2017-08-16 +102 Asia India Cochin Stock Exchange NULL 2017-08-16 +103 Asia India Coimbatore Stock Exchange NULL 2017-08-16 +104 Asia India Delhi Stock Exchange XDES 2017-08-16 +105 Asia India Inter-Connected Stock Exchange of India ISEX 2017-08-16 +106 Asia India Ludhiana Stock and Capital NULL 2017-08-16 +107 Asia India Metropolitan Stock Exchange NULL 2017-08-16 +108 Asia India National Stock Exchange of India XNSE 2017-08-16 +109 Asia India OTC Exchange of India OTCX 2017-08-16 +110 Asia India Pune Stock Exchange NULL 2017-08-16 +111 Asia India Saurashtra Kutch Stock Exchange NULL 2017-08-16 +112 Asia India United Stock Exchange of India XUSE 2017-08-16 +113 Asia India Vadodara Stock Exchange NULL 2017-08-16 +114 Asia Indonesia Indonesia Stock Exchange XIDX 2017-08-16 +115 Asia Iran Iran Fara Bourse NULL 2017-08-16 +116 Middle East Iran Tehran Stock Exchange XTEH 2017-08-16 +117 Middle East Iraq Iraq Stock Exchange XIQS 2017-08-16 +118 Western Europe Ireland Irish Stock Exchange XDUB 2017-08-16 +119 Middle East Israel Tel Aviv Stock Exchange XTAE 2017-08-16 +120 Western Europe Italy Borsa Italiana XMIL 2017-08-16 +121 Africa Ivory Coast Bourse Regionale des Valeurs Mobilieres XBRV 2017-08-16 +122 North America Jamaica Jamaica Stock Exchange XJAM 2017-08-16 +123 Asia Japan Chi-X Japan CHIJ 2017-08-16 +124 Asia Japan Daiwa Securities DRCT 2017-08-16 +125 Asia Japan Fukuoka Stock Exchange XFKA 2017-08-16 +126 Asia Japan Japan Exchange Group XJPX 2017-08-16 +127 Asia Japan Nagoya Stock Exchange XNGO 2017-08-16 +128 Asia Japan Sapporo Securities Exchange XSAP 2017-08-16 +129 Asia Japan SBI Japannext SBIJ 2017-08-16 +130 Middle East Jordan Amman Stock Exchange XAMM 2017-08-16 +131 Asia Kazakhstan Kazakhstan Stock Exchange XKAZ 2017-08-16 +132 Africa Kenya Nairobi Stock Exchange XNAI 2017-08-16 +133 Middle East Kuwait Kuwait Stock Exchange XKUW 2017-08-16 +134 Asia Kyrgyzstan Kyrgyz Stock Exchange XKSE 2017-08-16 +135 Asia Laos Lao Securities Exchange XLAO 2017-08-16 +136 Eastern Europe Latvia Riga Stock Exchange XRIS 2017-08-16 +137 Middle East Lebanon Beirut Stock Exchange XBEY 2017-08-16 +138 Africa Lesotho Maseru Securities Exchange NULL 2017-08-16 +139 Eastern Europe Lithuania Vilnius Stock Exchange XLIT 2017-08-16 +140 Western Europe Luxembourg Luxembourg Stock Exchange XLUX 2017-08-16 +141 Eastern Europe Macedonia Macedonian Stock Exchange XMAE 2017-08-16 +142 Africa Malawi Malawi Stock Exchange XMSW 2017-08-16 +143 Asia Malaysia Bursa Malaysia XKLS 2017-08-16 +144 Asia Maldives Maldives Stock Exchange MALX 2017-08-16 +145 Western Europe Malta Malta Stock Exchange XMAL 2017-08-16 +146 Western Europe Malta Malta Stock Exchange Prospects PROS 2017-08-16 +147 Africa Mauritius Stock Exchange of Mauritius XMAU 2017-08-16 +148 North America Mexico Bolsa Mexicana de Valores XMEX 2017-08-16 +149 Western Europe Moldova Moldova Stock Exchange XMOL 2017-08-16 +150 Asia Mongolia Mongolian Stock Exchange XULA 2017-08-16 +151 Eastern Europe Montenegro Montenegro Stock Exchange XMNX 2017-08-16 +152 Africa Morocco Casablanca Stock Exchange XCAS 2017-08-16 +153 Africa Mozambique Bolsa de Valores de Mozambique XBVM 2017-08-16 +154 Asia Myanmar Myanmar Securities Exchange Centre NULL 2017-08-16 +155 Asia Myanmar Yangon Stock Exchange NULL 2017-08-16 +156 Africa Namibia Namibian Stock Exchange XNAM 2017-08-16 +157 Asia Nepal Nepal Stock Exchange XNEP 2017-08-16 +158 Western Europe Netherlands Euronext Amsterdam XAMS 2017-08-16 +159 Western Europe Netherlands Nxchange XNXC 2017-08-16 +160 Australia & Oceania New Zealand New Zealand Exchange XNZE 2017-08-16 +161 North America Nicaragua Bolsa de Valores de Nicaragua XMAN 2017-08-16 +162 Africa Nigeria Nigerian Stock Exchange XNSA 2017-08-16 +163 Western Europe Norway Oslo Stock Exchange XOSL 2017-08-16 +164 Middle East Oman Muscat Securities Market XMUS 2017-08-16 +165 Asia Pakistan Lahore Stock Exchange NULL 2017-08-16 +166 Asia Pakistan Pakistan Stock Exchange XKAR 2017-08-16 +167 Middle East Palestine Palestine Securities Exchange XPAE 2017-08-16 +168 North America Panama Bolsa de Valores de Panama XPTY 2017-08-16 +169 Australia & Oceania Papua New Guinea Port Moresby Stock Exchange XPOM 2017-08-16 +170 South America Paraguay Bolsa de Valores & Productos de Asuncíon XVPA 2017-08-16 +171 South America Peru Bolsa de Valores de Lima XLIM 2017-08-16 +172 Asia Philippines Philippine Stock Exchange XPHS 2017-08-16 +173 Eastern Europe Poland NewConnect XNCO 2017-08-16 +174 Eastern Europe Poland Warsaw Stock Exchange XWAR 2017-08-16 +175 Western Europe Portugal Euronext Lisbon XLIS 2017-08-16 +176 Western Europe Portugal OPEX OPEX 2017-08-16 +177 Middle East Qatar Qatar Stock Exchange DSMD 2017-08-16 +178 Eastern Europe Romania Bucharest Stock Exchange XRAS 2017-08-16 +179 Eastern Europe Russia Moscow Exchange MISX 2017-08-16 +180 Eastern Europe Russia Saint Petersburg Stock Exchange XPET 2017-08-16 +181 Eastern Europe Russia Siberian Exchange XSIB 2017-08-16 +182 Africa Rwanda Rwanda Stock Exchange RSEX 2017-08-16 +183 North America Saint Kitts and Nevis Eastern Caribbean Securities Exchange XECS 2017-08-16 +184 Middle East Saudi Arabia Saudi Stock Exchange XSAU 2017-08-16 +185 Eastern Europe Serbia Belgrade Stock Exchange XBEL 2017-08-16 +186 Africa Seychelles Seychelles Securities Exchange (Trop-X) TRPX 2017-08-16 +187 Asia Singapore Singapore Exchange XSES 2017-08-16 +188 Eastern Europe Slovakia Bratislava Stock Exchange XBRA 2017-08-16 +189 Eastern Europe Slovenia Ljubljana Stock Exchange XLJU 2017-08-16 +190 Africa Somalia Somali Stock Exchange NULL 2017-08-16 +191 Africa South Africa A2X Markets A2XX 2017-08-16 +192 Africa South Africa Johannesburg Stock Exchange XJSE 2017-08-16 +193 Asia South Korea Korea New Exchange XKON 2017-08-16 +194 Asia South Korea Korea Stock Exchange XKRX 2017-08-16 +195 Asia South Korea KOSDAQ Securities Exchange XKOS 2017-08-16 +196 Western Europe Spain Bolsa de Bilbao XBIL 2017-08-16 +197 Western Europe Spain Bolsa de Madrid XMAD 2017-08-16 +198 Western Europe Spain Bolsa de Valencia XVAL 2017-08-16 +199 Western Europe Spain Borsa de Barcelona XBAR 2017-08-16 +200 Western Europe Spain Latibex XLAT 2017-08-16 +201 Asia Sri Lanka Colombo Stock Exchange XCOL 2017-08-16 +202 Africa Sudan Khartoum Stock Exchange XKHA 2017-08-16 +203 Africa Swaziland Swaziland Stock Exchange XSWA 2017-08-16 +204 Western Europe Sweden Aktietorget XSAT 2017-08-16 +205 Western Europe Sweden Nasdaq Stockholm XSTO 2017-08-16 +206 Western Europe Sweden Nordic Growth Market XNGM 2017-08-16 +207 Western Europe Switzerland Berne eXchange XBRN 2017-08-16 +208 Western Europe Switzerland SIX Swiss Exchange XSWX 2017-08-16 +209 Middle East Syria Damascus Securities Exchange XDSE 2017-08-16 +210 Asia Taiwan Taipei Exchange ROCO 2017-08-16 +211 Asia Taiwan Taiwan Stock Exchange XTAI 2017-08-16 +212 Africa Tanzania Dar-es-Salaam Stock Exchange XDAR 2017-08-16 +213 Asia Thailand Stock Exchange of Thailand XBKK 2017-08-16 +214 North America Trinidad and Tobago Trinidad and Tobago Stock Exchange XTRN 2017-08-16 +215 Africa Tunisia Bourse de Tunis XTUN 2017-08-16 +216 Eastern Europe Turkey Borsa İstanbul XIST 2017-08-16 +217 Africa Uganda Uganda Securities Exchange XUGA 2017-08-16 +218 Eastern Europe Ukraine East European Stock Exchange EESE 2017-08-16 +219 Eastern Europe Ukraine PFTS Ukraine Stock Exchange PFTS 2017-08-16 +220 Eastern Europe Ukraine Stock Exchange Perspectiva SEPE 2017-08-16 +221 Eastern Europe Ukraine Ukrainian Exchange UKEX 2017-08-16 +222 Middle East United Arab Emirates Abu Dhabi Securities Market XADS 2017-08-16 +223 Middle East United Arab Emirates Dubai Financial Market XDFM 2017-08-16 +224 Middle East United Arab Emirates Nasdaq Dubai DIFX 2017-08-16 +225 Western Europe United Kingdom Aquis Exchange AQXE 2017-08-16 +226 Western Europe United Kingdom Asset Match AMPX 2017-08-16 +227 Western Europe United Kingdom London Stock Exchange XLON 2017-08-16 +228 Western Europe United Kingdom NEX NEXS 2017-08-16 +229 Western Europe United Kingdom Turquoise TRQX 2017-08-16 +230 North America United States of America Bats BYX Exchange BYXD 2017-08-16 +231 North America United States of America Bats EDGA Exchange EDGA 2017-08-16 +232 North America United States of America Bats US BATS 2017-08-16 +233 North America United States of America BatsEDGX Exchange EDGX 2017-08-16 +234 North America United States of America Chicago Stock Exchange XCHI 2017-08-16 +235 North America United States of America Investors Exchange IEXG 2017-08-16 +236 North America United States of America NASDAQ XNAS 2017-08-16 +237 North America United States of America New York Stock Exchange XNYS 2017-08-16 +238 North America United States of America North American Derivatives Exchange NADEX HEGX 2017-08-16 +239 South America Uruguay Bolsa de Valores de Montevideo XMNT 2017-08-16 +240 South America Uruguay Bolsa Electronica de Valores de Uruguay BVUR 2017-08-16 +241 Asia Uzbekistan Tashkent Stock Exchange XSTE 2017-08-16 +242 Asia Vietnam Hanoi Stock Exchange HSTC 2017-08-16 +243 Asia Vietnam Ho Chi Minh Stock Exchange XSTC 2017-08-16 +244 Africa Zambia Lusaka Stock Exchange XLUS 2017-08-16 +245 Africa Zimbabwe Zimbabwe Stock Exchange XZIM 2017-08-16 +246 Eastern Europe Albania Albanian Securities Exchange XALS 2019-11-17 +247 North America United States of America Long-Term Stock Exchange LTSE 2020-09-14 +248 North America United States of America Miami International Securities Exchange MIHI 2020-09-24 +249 North America United States of America Members' Exchange NULL 2020-09-24 +250 Africa Zimbabwe Victoria Falls Stock Exchange NULL 2020-11-01 +251 Asia China Beijing Stock Exchange NULL 2021-12-27 + +endloop + +statement error +SET httpfs_client_implementation='unkown'; +---- +:.*Unsupported option for httpfs_client_implementation.* + +#FIXME this test fails: file is nonexistent +mode skip + +query IIIIII rowsort +SELECT * from read_csv_auto('https://github.com/duckdb/duckdb/raw/9cf66f950dde0173e1a863a7659b3ecf11bf3978/data/csv/customer.csv'); +---- +2020 Allemagne Germany 26.1 53196.069 200601.2 +2020 Autriche Austria 18.0 4723.5 26215.8 +2020 Belgique Belgium 28.999999999999996 9436.1 32553.0 +2020 Bulgarie Bulgaria 11.600000000000001 1124.1 9698.7 +2020 Chypre Cyprus 0.0 0.0 1627.6 +2020 Croatie Croatia 16.3 1094.8 6726.3 +2020 Danemark Denmark 11.600000000000001 1579.0 13601.4 +2020 Espagne Spain 17.4 14211.7 81512.9 +2020 Estonie Estonia 8.5 241.1 2827.3 +2020 Finlande Finland 2.8000000000000003 692.3 24674.4 +2020 France France 20.3 28278.9 139375.8 +2020 Grèce Greece 5.800000000000001 896.5 15401.9 +2020 Hongrie Hungary 30.5 5486.7 17872.4 +2020 Irlande Ireland 17.4 1968.477 11296.601 +2020 Italie Italy 29.2 33042.585 113119.475 +2020 Lettonie Latvia 8.200000000000001 323.605 3926.131 +2020 Lituanie Lithuania 10.7 584.104 5457.728 +2020 Luxembourg Luxembourg 16.5 623.165 3786.785 +2020 Malte Malta 0.0 0.0 547.5 +2020 Pays-Bas Netherlands 37.1 16588.314 44682.656 +2020 Pologne Poland 13.5 9323.205 69135.018 +2020 Portugal Portugal 11.1 1814.878 16354.725 +2020 Roumanie Romania 23.7 5626.161 23712.653 +2020 Royaume-Uni United Kingdom 32.4 39311.416 121414.483 +2020 République tchèque Czech Republic 21.4 5187.282 24263.896 +2020 Slovaquie Slovakia 25.0 2564.876 10248.401 +2020 Slovénie Slovenia 12.1 590.243 4861.315 +2020 Suède Sweden 1.5 475.195 31311.413 +2020 UE 28 Europe 28 22.5 238152.4 1056907.5 +2021 Allemagne Germany 26.760345686044435 51812.567 193616.957 +2021 Autriche Austria 18.720006775926056 4645.795 24817.272 +2021 Belgique Belgium 29.279402721103864 9088.083 31039.168 +2021 Bulgarie Bulgaria 12.368015142641884 1176.537 9512.739 +2021 Chypre Cyprus 0.0 0.0 1528.558 +2021 Croatie Croatia 17.10389029082304 1100.12 6431.987 +2021 Danemark Denmark 11.485631727184947 1508.152 13130.771 +2021 Espagne Spain 19.10173955663722 13815.0 72323.256 +2021 Estonie Estonia 8.988278645659518 245.094 2726.818 +2021 Finlande Finland 2.9937725178230212 694.288 23191.074 +2021 France France 20.649030024470434 26465.646 128168.955 +2021 Grèce Greece 7.580480506088059 1097.87 14482.855 +2021 Hongrie Hungary 32.344729318831554 5693.164 17601.52 +2021 Irlande Ireland 18.020604987495144 1953.468 10840.191 +2021 Italie Italy 30.86368769746751 31807.236 103057.147 +2021 Lettonie Latvia 8.502139837843602 322.927 3798.185 +2021 Lituanie Lithuania 11.029023816606903 582.797 5284.212 +2021 Luxembourg Luxembourg 17.282784281000467 564.365 3265.475 +2021 Malte Malta 0.0 0.0 499.875 +2021 Pays-Bas Netherlands 37.61392206122467 15896.316 42261.788 +2021 Pologne Poland 13.146720200313602 9235.656 70250.647 +2021 Portugal Portugal 11.437926753365227 1740.3 15215.17 +2021 Roumanie Romania 24.909638477223016 5846.885 23472.38 +2021 République tchèque Czech Republic 21.716683280446812 5158.445 23753.374 +2021 Slovaquie Slovakia 25.253930010417324 2427.134 9610.916 +2021 Slovénie Slovenia 13.141683407321874 582.024 4428.839 +2021 Suède Sweden 1.497679952802663 471.085 31454.317 +2021 UE 27 UE 27 21.894190365821018 193930.95399999994 885764.4460000001 + +query IIIIII rowsort res +SELECT * from read_csv('https://www.data.gouv.fr/fr/datasets/r/6d186965-f41b-41f3-9b23-88241cc6890c',DELIM=';',Columns={'annee_de_reference':'VARCHAR','pays':'VARCHAR','label_en':'VARCHAR','part_du_gaz_naturel_dans_la_consommation_finale_d_energie0':'VARCHAR','consommation_finale_de_gaz_naturel_mtep':'VARCHAR','consommation_finale_d_energie_totale_mtep':'VARCHAR'}); + + +query IIIIII rowsort res +SELECT * from read_csv('https://www.data.gouv.fr/fr/datasets/r/6d186965-f41b-41f3-9b23-88241cc6890c',DELIM=';',Columns={'annee_de_reference':'VARCHAR','pays':'VARCHAR','label_en':'VARCHAR','part_du_gaz_naturel_dans_la_consommation_finale_d_energie0':'VARCHAR','consommation_finale_de_gaz_naturel_mtep':'VARCHAR','consommation_finale_d_energie_totale_mtep':'VARCHAR'}); + + +# Give it a try to a request that returns length 0 +query I +SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') +---- +1265 + +# Give it a try to a request that returns length 0 +query I +SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') +---- +1265 + +# Give it a try to a request that returns length 0 +query I +SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') +---- +1265 From 15cadd9d56cac0140a66bdd2a3e21c1355a8bac2 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 4 Sep 2025 10:58:13 +0200 Subject: [PATCH 67/78] remove extra paren --- extension/httpfs/crypto.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension/httpfs/crypto.cpp b/extension/httpfs/crypto.cpp index 3681c9fa..4e311ed8 100644 --- a/extension/httpfs/crypto.cpp +++ b/extension/httpfs/crypto.cpp @@ -52,7 +52,7 @@ const EVP_CIPHER *AESStateSSL::GetCipher(idx_t key_len) { case EncryptionTypes::CTR: { switch (key_len) { case 16: - return EVP_aes_128_ctr()(); + return EVP_aes_128_ctr(); case 24: return EVP_aes_192_ctr(); case 32: From 86f5fc83345e2fb773e7b0c78ef42c8babfa685c Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Tue, 29 Jul 2025 15:35:29 +0200 Subject: [PATCH 68/78] feat: automatic full file download fallback --- .github/workflows/MinioTests.yml | 13 ++- extension/httpfs/httpfs.cpp | 96 +++++++++++++++++++--- extension/httpfs/httpfs_extension.cpp | 1 + extension/httpfs/include/httpfs.hpp | 22 ++++- extension/httpfs/include/httpfs_client.hpp | 2 + test/sql/full_file_download_fallback.test | 46 +++++++++++ 6 files changed, 167 insertions(+), 13 deletions(-) create mode 100644 test/sql/full_file_download_fallback.test diff --git a/.github/workflows/MinioTests.yml b/.github/workflows/MinioTests.yml index eb8ea6da..e457f0c2 100644 --- a/.github/workflows/MinioTests.yml +++ b/.github/workflows/MinioTests.yml @@ -22,6 +22,8 @@ jobs: GEN: ninja VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake VCPKG_TARGET_TRIPLET: x64-linux + PYTHON_HTTP_SERVER_URL: http://localhost:8008 + PYTHON_HTTP_SERVER_DIR: /tmp/python_test_server steps: - uses: actions/checkout@v4 @@ -50,7 +52,9 @@ jobs: - name: Build shell: bash - run: make + run: | + echo -e "\nduckdb_extension_load(tpch)\n" >> extension_config.cmake + make - name: Start S3/HTTP test server shell: bash @@ -62,6 +66,13 @@ jobs: source ./scripts/run_s3_test_server.sh sleep 30 + - name: Run & Populate test server + shell: bash + run: | + mkdir -p $PYTHON_HTTP_SERVER_DIR + cd $PYTHON_HTTP_SERVER_DIR + python3 -m http.server 8008 & + - name: Test shell: bash run: | diff --git a/extension/httpfs/httpfs.cpp b/extension/httpfs/httpfs.cpp index 2b5df4dd..6a26f59f 100644 --- a/extension/httpfs/httpfs.cpp +++ b/extension/httpfs/httpfs.cpp @@ -44,6 +44,7 @@ unique_ptr HTTPFSUtil::InitializeParameters(optional_ptr // Setting lookups FileOpener::TryGetCurrentSetting(opener, "http_timeout", result->timeout, info); FileOpener::TryGetCurrentSetting(opener, "force_download", result->force_download, info); + FileOpener::TryGetCurrentSetting(opener, "auto_fallback_to_full_download", result->auto_fallback_to_full_download, info); FileOpener::TryGetCurrentSetting(opener, "http_retries", result->retries, info); FileOpener::TryGetCurrentSetting(opener, "http_retry_wait_ms", result->retry_wait_ms, info); FileOpener::TryGetCurrentSetting(opener, "http_retry_backoff", result->retry_backoff, info); @@ -230,8 +231,7 @@ unique_ptr HTTPFileSystem::GetRangeRequest(FileHandle &handle, str if (response.HasHeader("Content-Length")) { auto content_length = stoll(response.GetHeaderValue("Content-Length")); if ((idx_t)content_length != buffer_out_len) { - throw HTTPException("HTTP GET error: Content-Length from server mismatches requested " - "range, server may not support range requests."); + RangeRequestNotSupportedException::Throw(); } } } @@ -254,6 +254,8 @@ unique_ptr HTTPFileSystem::GetRangeRequest(FileHandle &handle, str return true; }); + get_request.try_request = true; + auto response = http_util.Request(get_request, http_client); hfh.StoreClient(std::move(http_client)); @@ -338,9 +340,36 @@ unique_ptr HTTPFileSystem::OpenFileExtended(const OpenFileInfo &file return std::move(handle); } -// Buffered read from http file. -// Note that buffering is disabled when FileFlags::FILE_FLAGS_DIRECT_IO is set -void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, idx_t location) { +bool HTTPFileSystem::TryRangeRequest(FileHandle &handle, string url, HTTPHeaders header_map, idx_t file_offset, char *buffer_out, idx_t buffer_out_len) { + auto res = GetRangeRequest(handle, url, header_map, file_offset, buffer_out, buffer_out_len); + + if (res) { + // Request succeeded + if (res->Success()) { + return true; + } + + // Request failed and we have a request error + if (res->HasRequestError()) { + ErrorData error (res->GetRequestError()); + + // Special case: we can do a retry with a full file download + if (error.Type() == RangeRequestNotSupportedException::TYPE && error.RawMessage() == RangeRequestNotSupportedException::MESSAGE) { + auto &hfh = handle.Cast(); + if (hfh.http_params.auto_fallback_to_full_download) { + return false; + } + + error.Throw(); + } + } + throw HTTPException(*res, "Request returned HTTP %d for HTTP %s to '%s'", + static_cast(res->status), EnumUtil::ToString(RequestType::GET_REQUEST), res->url); + } + throw IOException("Unknown error for HTTP %s to '%s'", EnumUtil::ToString(RequestType::GET_REQUEST), url); +} + +bool HTTPFileSystem::ReadInternal(FileHandle &handle, void *buffer, int64_t nr_bytes, idx_t location) { auto &hfh = handle.Cast(); D_ASSERT(hfh.http_params.state); @@ -351,7 +380,7 @@ void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, id memcpy(buffer, hfh.cached_file_handle->GetData() + location, nr_bytes); DUCKDB_LOG_FILE_SYSTEM_READ(handle, nr_bytes, location); hfh.file_offset = location + nr_bytes; - return; + return true; } idx_t to_read = nr_bytes; @@ -360,7 +389,9 @@ void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, id // Don't buffer when DirectIO is set or when we are doing parallel reads bool skip_buffer = hfh.flags.DirectIO() || hfh.flags.RequireParallelAccess(); if (skip_buffer && to_read > 0) { - GetRangeRequest(hfh, hfh.path, {}, location, (char *)buffer, to_read); + if (!TryRangeRequest(hfh, hfh.path, {}, location, (char *)buffer, to_read)) { + return false; + } DUCKDB_LOG_FILE_SYSTEM_READ(handle, nr_bytes, location); // Update handle status within critical section for parallel access. if (hfh.flags.RequireParallelAccess()) { @@ -368,13 +399,13 @@ void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, id hfh.buffer_available = 0; hfh.buffer_idx = 0; hfh.file_offset = location + nr_bytes; - return; + return true; } hfh.buffer_available = 0; hfh.buffer_idx = 0; hfh.file_offset = location + nr_bytes; - return; + return true; } if (location >= hfh.buffer_start && location < hfh.buffer_end) { @@ -406,13 +437,17 @@ void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, id // Bypass buffer if we read more than buffer size if (to_read > new_buffer_available) { - GetRangeRequest(hfh, hfh.path, {}, location + buffer_offset, (char *)buffer + buffer_offset, to_read); + if (!TryRangeRequest(hfh, hfh.path, {}, location + buffer_offset, (char *)buffer + buffer_offset, to_read)) { + return false; + } hfh.buffer_available = 0; hfh.buffer_idx = 0; start_offset += to_read; break; } else { - GetRangeRequest(hfh, hfh.path, {}, start_offset, (char *)hfh.read_buffer.get(), new_buffer_available); + if (!TryRangeRequest(hfh, hfh.path, {}, start_offset, (char *)hfh.read_buffer.get(), new_buffer_available)) { + return false; + } hfh.buffer_available = new_buffer_available; hfh.buffer_idx = 0; hfh.buffer_start = start_offset; @@ -422,6 +457,32 @@ void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, id } hfh.file_offset = location + nr_bytes; DUCKDB_LOG_FILE_SYSTEM_READ(handle, nr_bytes, location); + return true; +} + +// Buffered read from http file. +// Note that buffering is disabled when FileFlags::FILE_FLAGS_DIRECT_IO is set +void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, idx_t location) { + auto success = ReadInternal(handle, buffer, nr_bytes, location); + if (success) { + return; + } + + // ReadInternal returned false. This means the regular path of querying the file with range requests failed. We will + // attempt to download the full file and retry. + + if (handle.logger) { + DUCKDB_LOG_WARN(handle.logger, "Falling back to full file download for file '%s': the server does not support HTTP range requests. Performance and memory usage are potentially degraded.", handle.path); + } + + auto &hfh = handle.Cast(); + + bool should_write_cache = false; + hfh.FullDownload(*this, should_write_cache); + + if (!ReadInternal(handle, buffer, nr_bytes, location)) { + throw HTTPException("Failed to read from HTTP file after automatically retrying a full file download."); + } } int64_t HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes) { @@ -627,6 +688,19 @@ void HTTPFileHandle::LoadFileInfo() { initialized = true; } + +void HTTPFileHandle::TryAddLogger(FileOpener &opener) { + auto context = opener.TryGetClientContext(); + if (context) { + logger = context->logger; + return; + } + auto database = opener.TryGetDatabase(); + if (database) { + logger = database->GetLogManager().GlobalLoggerReference(); + } +} + void HTTPFileHandle::Initialize(optional_ptr opener) { auto &hfs = file_system.Cast(); http_params.state = HTTPState::TryGetState(opener); diff --git a/extension/httpfs/httpfs_extension.cpp b/extension/httpfs/httpfs_extension.cpp index 2e65c3d0..fb4a1739 100644 --- a/extension/httpfs/httpfs_extension.cpp +++ b/extension/httpfs/httpfs_extension.cpp @@ -47,6 +47,7 @@ static void LoadInternal(ExtensionLoader &loader) { config.AddExtensionOption("http_retries", "HTTP retries on I/O error", LogicalType::UBIGINT, Value(3)); config.AddExtensionOption("http_retry_wait_ms", "Time between retries", LogicalType::UBIGINT, Value(100)); config.AddExtensionOption("force_download", "Forces upfront download of file", LogicalType::BOOLEAN, Value(false)); + config.AddExtensionOption("auto_fallback_to_full_download", "Allows automatically falling back to full file downloads when possible.", LogicalType::BOOLEAN, Value(true)); // Reduces the number of requests made while waiting, for example retry_wait_ms of 50 and backoff factor of 2 will // result in wait times of 0 50 100 200 400...etc. config.AddExtensionOption("http_retry_backoff", "Backoff factor for exponentially increasing retry wait time", diff --git a/extension/httpfs/include/httpfs.hpp b/extension/httpfs/include/httpfs.hpp index 55c74cc4..ea267c6a 100644 --- a/extension/httpfs/include/httpfs.hpp +++ b/extension/httpfs/include/httpfs.hpp @@ -14,6 +14,19 @@ namespace duckdb { +class RangeRequestNotSupportedException { +public: + // Call static Throw instead: if thrown as exception DuckDB can't catch it. + explicit RangeRequestNotSupportedException() = delete; + + static constexpr ExceptionType TYPE = ExceptionType::HTTP; + static constexpr const char *MESSAGE = "Content-Length from server mismatches requested range, server may not support range requests. You can try to resolve this by enabling `SET force_download=true`"; + + static void Throw() { + throw HTTPException(MESSAGE); + } +}; + class HTTPClientCache { public: //! Get a client from the client cache @@ -50,6 +63,8 @@ class HTTPFileHandle : public FileHandle { string etag; bool force_full_download; bool initialized = false; + + bool auto_fallback_to_full_file_download = true; // When using full file download, the full file will be written to a cached file handle unique_ptr cached_file_handle; @@ -85,7 +100,10 @@ class HTTPFileHandle : public FileHandle { //! Perform a HEAD request to get the file info (if not yet loaded) void LoadFileInfo(); -private: + //! TODO: make base function virtual? + void TryAddLogger(FileOpener &opener); + +public: //! Fully downloads a file void FullDownload(HTTPFileSystem &hfs, bool &should_write_cache); }; @@ -154,6 +172,8 @@ class HTTPFileSystem : public FileSystem { } virtual HTTPException GetHTTPError(FileHandle &, const HTTPResponse &response, const string &url); + bool TryRangeRequest(FileHandle &handle, string url, HTTPHeaders header_map, idx_t file_offset, char *buffer_out, idx_t buffer_out_len); + bool ReadInternal(FileHandle &handle, void *buffer, int64_t nr_bytes, idx_t location); protected: virtual duckdb::unique_ptr CreateHandle(const OpenFileInfo &file, FileOpenFlags flags, diff --git a/extension/httpfs/include/httpfs_client.hpp b/extension/httpfs/include/httpfs_client.hpp index d540ce8b..9d51c85e 100644 --- a/extension/httpfs/include/httpfs_client.hpp +++ b/extension/httpfs/include/httpfs_client.hpp @@ -16,8 +16,10 @@ struct HTTPFSParams : public HTTPParams { static constexpr bool DEFAULT_ENABLE_SERVER_CERT_VERIFICATION = false; static constexpr uint64_t DEFAULT_HF_MAX_PER_PAGE = 0; static constexpr bool DEFAULT_FORCE_DOWNLOAD = false; + static constexpr bool AUTO_FALLBACK_TO_FULL_DOWNLOAD = true; bool force_download = DEFAULT_FORCE_DOWNLOAD; + bool auto_fallback_to_full_download = AUTO_FALLBACK_TO_FULL_DOWNLOAD; bool enable_server_cert_verification = DEFAULT_ENABLE_SERVER_CERT_VERIFICATION; idx_t hf_max_per_page = DEFAULT_HF_MAX_PER_PAGE; string ca_cert_file; diff --git a/test/sql/full_file_download_fallback.test b/test/sql/full_file_download_fallback.test new file mode 100644 index 00000000..456e6bcf --- /dev/null +++ b/test/sql/full_file_download_fallback.test @@ -0,0 +1,46 @@ +# name: test/sql/full_file_download_fallback.test +# group: [full_file_download] + +require parquet + +require httpfs + +require tpch + +require-env PYTHON_HTTP_SERVER_URL + +require-env PYTHON_HTTP_SERVER_DIR + +statement ok +pragma enable_logging; + +statement ok +call dbgen(sf=1); + +statement ok +copy lineitem to '${PYTHON_HTTP_SERVER_DIR}/lineitem.csv' + +statement ok +drop table lineitem; + +statement ok +CREATE view lineitem AS FROM '${PYTHON_HTTP_SERVER_URL}/lineitem.csv'; + +query I +pragma tpch(6); +---- +123141078.22829981 + +query I +select count(*) from duckdb_logs where log_level='WARN' and message like '%Falling back to full%' +---- +2 + +statement ok +set auto_fallback_to_full_download=false + +statement error +pragma tpch(6); +---- +HTTP Error: Content-Length from server mismatches requested range, server may not support range requests. You can try to resolve this by enabling `SET force_download=true` + From c964766b9792f1abc2a14bdca4a857633d1a6e38 Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Tue, 29 Jul 2025 16:03:08 +0200 Subject: [PATCH 69/78] fix: http 206 not considered success --- extension/httpfs/httpfs.cpp | 4 ++-- test/sql/secret/secret_aws.test | 2 ++ test/sql/secret/secret_refresh.test | 2 ++ test/sql/secret/secret_refresh_attach.test | 2 ++ 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/extension/httpfs/httpfs.cpp b/extension/httpfs/httpfs.cpp index 6a26f59f..e915f5b1 100644 --- a/extension/httpfs/httpfs.cpp +++ b/extension/httpfs/httpfs.cpp @@ -344,8 +344,8 @@ bool HTTPFileSystem::TryRangeRequest(FileHandle &handle, string url, HTTPHeaders auto res = GetRangeRequest(handle, url, header_map, file_offset, buffer_out, buffer_out_len); if (res) { - // Request succeeded - if (res->Success()) { + // Request succeeded TODO: fix upstream that 206 is not considered success + if (res->Success() || res->status == HTTPStatusCode::PartialContent_206 || res->status == HTTPStatusCode::Accepted_202) { return true; } diff --git a/test/sql/secret/secret_aws.test b/test/sql/secret/secret_aws.test index 0c1cbf86..b1eba63b 100644 --- a/test/sql/secret/secret_aws.test +++ b/test/sql/secret/secret_aws.test @@ -14,6 +14,8 @@ require-env DUCKDB_S3_ENDPOINT require-env DUCKDB_S3_USE_SSL +set ignore_error_messages + require httpfs require parquet diff --git a/test/sql/secret/secret_refresh.test b/test/sql/secret/secret_refresh.test index 987050c7..f2c55665 100644 --- a/test/sql/secret/secret_refresh.test +++ b/test/sql/secret/secret_refresh.test @@ -14,6 +14,8 @@ require-env DUCKDB_S3_ENDPOINT require-env DUCKDB_S3_USE_SSL +set ignore_error_messages + require httpfs require parquet diff --git a/test/sql/secret/secret_refresh_attach.test b/test/sql/secret/secret_refresh_attach.test index efe3b439..7cff03e9 100644 --- a/test/sql/secret/secret_refresh_attach.test +++ b/test/sql/secret/secret_refresh_attach.test @@ -16,6 +16,8 @@ require-env DUCKDB_S3_USE_SSL require-env S3_ATTACH_DB +set ignore_error_messages + require httpfs require parquet From 9788dabc5367928f41e5554b6c6312adaea64917 Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Fri, 5 Sep 2025 13:02:18 +0200 Subject: [PATCH 70/78] fix test --- test/sql/secret/secret_refresh.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/sql/secret/secret_refresh.test b/test/sql/secret/secret_refresh.test index f2c55665..ab3f9f72 100644 --- a/test/sql/secret/secret_refresh.test +++ b/test/sql/secret/secret_refresh.test @@ -84,7 +84,7 @@ CREATE SECRET s1 ( statement error FROM "s3://test-bucket/test-file.parquet" ---- -HTTP Error: HTTP GET error on 'http://test-bucket.duckdb-minio.com:9000/test-file.parquet' (HTTP 403) +403 query I SELECT message[0:46] FROM duckdb_logs WHERE message like '%Successfully refreshed secret%' @@ -125,7 +125,7 @@ set s3_access_key_id='bogus' statement error FROM "s3://test-bucket/test-file.parquet" ---- -HTTP Error: HTTP GET error on 'http://test-bucket.duckdb-minio.com:9000/test-file.parquet' (HTTP 403) +403 # -> log empty query II From 9d24bd1f1072a0cd324d6c778b04a56d6a50f1fc Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Fri, 5 Sep 2025 14:10:20 +0200 Subject: [PATCH 71/78] dont send head on writes --- extension/httpfs/httpfs.cpp | 13 ++++++ extension/httpfs/include/httpfs.hpp | 3 ++ test/sql/copy/no_head_on_write.test | 66 +++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+) create mode 100644 test/sql/copy/no_head_on_write.test diff --git a/extension/httpfs/httpfs.cpp b/extension/httpfs/httpfs.cpp index 2b5df4dd..02dc6111 100644 --- a/extension/httpfs/httpfs.cpp +++ b/extension/httpfs/httpfs.cpp @@ -331,6 +331,11 @@ unique_ptr HTTPFileSystem::OpenFileExtended(const OpenFileInfo &file } auto handle = CreateHandle(file, flags, opener); + + if (flags.OpenForWriting() && !flags.OpenForAppending() && !flags.OpenForReading()) { + handle->write_overwrite_mode = true; + } + handle->Initialize(opener); DUCKDB_LOG_FILE_SYSTEM_OPEN((*handle)); @@ -582,6 +587,14 @@ void HTTPFileHandle::LoadFileInfo() { // already initialized or we specifically do not want to perform a head request and just run a direct download return; } + + // In write_overwrite_mode we dgaf about the size, so no head request is needed + if (write_overwrite_mode) { + length = 0; + initialized = true; + return; + } + auto &hfs = file_system.Cast(); auto res = hfs.HeadRequest(*this, path, {}); if (res->status != HTTPStatusCode::OK_200) { diff --git a/extension/httpfs/include/httpfs.hpp b/extension/httpfs/include/httpfs.hpp index 55c74cc4..24440b0a 100644 --- a/extension/httpfs/include/httpfs.hpp +++ b/extension/httpfs/include/httpfs.hpp @@ -51,6 +51,9 @@ class HTTPFileHandle : public FileHandle { bool force_full_download; bool initialized = false; + // In write overwrite mode, we are not interested in the current state of the file: we're overwriting it. + bool write_overwrite_mode = false; + // When using full file download, the full file will be written to a cached file handle unique_ptr cached_file_handle; diff --git a/test/sql/copy/no_head_on_write.test b/test/sql/copy/no_head_on_write.test new file mode 100644 index 00000000..54a0ec50 --- /dev/null +++ b/test/sql/copy/no_head_on_write.test @@ -0,0 +1,66 @@ +# name: test/sql/copy/no_head_on_write.test +# description: Confirm that we don't send head requests for writes +# group: [secret] + +require-env S3_TEST_SERVER_AVAILABLE 1 + +require-env AWS_DEFAULT_REGION + +require-env AWS_ACCESS_KEY_ID + +require-env AWS_SECRET_ACCESS_KEY + +require-env DUCKDB_S3_ENDPOINT + +require-env DUCKDB_S3_USE_SSL + +require httpfs + +require parquet + +statement ok +SET enable_logging=true + +statement ok +set s3_use_ssl='${DUCKDB_S3_USE_SSL}' + +statement ok +set s3_endpoint='${DUCKDB_S3_ENDPOINT}' + +statement ok +set s3_region='${AWS_DEFAULT_REGION}' + +# Create some test data +statement ok +CREATE SECRET s1 ( + TYPE S3, + KEY_ID '${AWS_ACCESS_KEY_ID}', + SECRET '${AWS_SECRET_ACCESS_KEY}', + REQUESTER_PAYS true +) + +statement ok +CALL enable_logging('HTTP'); + +statement ok +copy (select 1 as a) to 's3://test-bucket/test-file.parquet' + +query I +select request.type FROM duckdb_logs_parsed('HTTP') +---- +POST +PUT +POST + +statement ok +CALL truncate_duckdb_logs(); + +statement ok +copy (select 1 as a) to 's3://test-bucket/test-file.csv' + +query I +select request.type FROM duckdb_logs_parsed('HTTP') +---- +POST +PUT +POST From cb960005b1bf4c64f2b8e54ebf004817a69a4519 Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Fri, 5 Sep 2025 14:57:43 +0200 Subject: [PATCH 72/78] fix another test --- test/sql/full_file_download_fallback.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/sql/full_file_download_fallback.test b/test/sql/full_file_download_fallback.test index 456e6bcf..a222a248 100644 --- a/test/sql/full_file_download_fallback.test +++ b/test/sql/full_file_download_fallback.test @@ -12,7 +12,7 @@ require-env PYTHON_HTTP_SERVER_URL require-env PYTHON_HTTP_SERVER_DIR statement ok -pragma enable_logging; +CALL enable_logging(); statement ok call dbgen(sf=1); From e12faac624551442fca6da58e4f47624df3d187d Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Fri, 5 Sep 2025 16:24:16 +0200 Subject: [PATCH 73/78] Add enable_curl_server_cert_verification option --- extension/httpfs/httpfs.cpp | 2 ++ extension/httpfs/httpfs_curl_client.cpp | 6 ++++-- extension/httpfs/httpfs_extension.cpp | 1 + extension/httpfs/include/httpfs_client.hpp | 1 + 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/extension/httpfs/httpfs.cpp b/extension/httpfs/httpfs.cpp index 2b5df4dd..d0d4e9a8 100644 --- a/extension/httpfs/httpfs.cpp +++ b/extension/httpfs/httpfs.cpp @@ -48,6 +48,8 @@ unique_ptr HTTPFSUtil::InitializeParameters(optional_ptr FileOpener::TryGetCurrentSetting(opener, "http_retry_wait_ms", result->retry_wait_ms, info); FileOpener::TryGetCurrentSetting(opener, "http_retry_backoff", result->retry_backoff, info); FileOpener::TryGetCurrentSetting(opener, "http_keep_alive", result->keep_alive, info); + FileOpener::TryGetCurrentSetting(opener, "enable_curl_server_cert_verification", result->enable_curl_server_cert_verification, + info); FileOpener::TryGetCurrentSetting(opener, "enable_server_cert_verification", result->enable_server_cert_verification, info); FileOpener::TryGetCurrentSetting(opener, "ca_cert_file", result->ca_cert_file, info); diff --git a/extension/httpfs/httpfs_curl_client.cpp b/extension/httpfs/httpfs_curl_client.cpp index 3c45c80a..e926367f 100644 --- a/extension/httpfs/httpfs_curl_client.cpp +++ b/extension/httpfs/httpfs_curl_client.cpp @@ -135,10 +135,12 @@ class HTTPFSCurlClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_FORBID_REUSE, 1L); } - // client->enable_server_certificate_verification(http_params.enable_server_cert_verification); - if (http_params.enable_server_cert_verification) { + if (http_params.enable_curl_server_cert_verification) { curl_easy_setopt(*curl, CURLOPT_SSL_VERIFYPEER, 1L); // Verify the cert curl_easy_setopt(*curl, CURLOPT_SSL_VERIFYHOST, 2L); // Verify that the cert matches the hostname + } else { + curl_easy_setopt(*curl, CURLOPT_SSL_VERIFYPEER, 0L); // Override default, don't verify the cert + curl_easy_setopt(*curl, CURLOPT_SSL_VERIFYHOST, 0L); // Override default, don't verify that the cert matches the hostname } // set read timeout diff --git a/extension/httpfs/httpfs_extension.cpp b/extension/httpfs/httpfs_extension.cpp index 2e65c3d0..ddd1a0e9 100644 --- a/extension/httpfs/httpfs_extension.cpp +++ b/extension/httpfs/httpfs_extension.cpp @@ -55,6 +55,7 @@ static void LoadInternal(ExtensionLoader &loader) { "http_keep_alive", "Keep alive connections. Setting this to false can help when running into connection failures", LogicalType::BOOLEAN, Value(true)); + config.AddExtensionOption("enable_curl_server_cert_verification", "Enable server side certificate verification for CURL backend.", LogicalType::BOOLEAN, Value(true)); config.AddExtensionOption("enable_server_cert_verification", "Enable server side certificate verification.", LogicalType::BOOLEAN, Value(false)); config.AddExtensionOption("ca_cert_file", "Path to a custom certificate file for self-signed certificates.", diff --git a/extension/httpfs/include/httpfs_client.hpp b/extension/httpfs/include/httpfs_client.hpp index d540ce8b..db8742b6 100644 --- a/extension/httpfs/include/httpfs_client.hpp +++ b/extension/httpfs/include/httpfs_client.hpp @@ -19,6 +19,7 @@ struct HTTPFSParams : public HTTPParams { bool force_download = DEFAULT_FORCE_DOWNLOAD; bool enable_server_cert_verification = DEFAULT_ENABLE_SERVER_CERT_VERIFICATION; + bool enable_curl_server_cert_verification = true; idx_t hf_max_per_page = DEFAULT_HF_MAX_PER_PAGE; string ca_cert_file; string bearer_token; From 58f15dcd947219efdffce6d3ed7dbb9160e30198 Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Mon, 8 Sep 2025 10:55:28 +0200 Subject: [PATCH 74/78] avoid fallback for s3 urls --- extension/httpfs/httpfs.cpp | 4 ++-- extension/httpfs/include/s3fs.hpp | 1 + test/sql/secret/secret_refresh.test | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/extension/httpfs/httpfs.cpp b/extension/httpfs/httpfs.cpp index e915f5b1..d9ce9a8c 100644 --- a/extension/httpfs/httpfs.cpp +++ b/extension/httpfs/httpfs.cpp @@ -254,7 +254,7 @@ unique_ptr HTTPFileSystem::GetRangeRequest(FileHandle &handle, str return true; }); - get_request.try_request = true; + get_request.try_request = hfh.auto_fallback_to_full_file_download; auto response = http_util.Request(get_request, http_client); @@ -360,8 +360,8 @@ bool HTTPFileSystem::TryRangeRequest(FileHandle &handle, string url, HTTPHeaders return false; } - error.Throw(); } + error.Throw(); } throw HTTPException(*res, "Request returned HTTP %d for HTTP %s to '%s'", static_cast(res->status), EnumUtil::ToString(RequestType::GET_REQUEST), res->url); diff --git a/extension/httpfs/include/s3fs.hpp b/extension/httpfs/include/s3fs.hpp index 7b9ad8df..b848d2c8 100644 --- a/extension/httpfs/include/s3fs.hpp +++ b/extension/httpfs/include/s3fs.hpp @@ -116,6 +116,7 @@ class S3FileHandle : public HTTPFileHandle { : HTTPFileHandle(fs, file, flags, std::move(http_params_p)), auth_params(auth_params_p), config_params(config_params_p), uploads_in_progress(0), parts_uploaded(0), upload_finalized(false), uploader_has_error(false), upload_exception(nullptr) { + auto_fallback_to_full_file_download = false; if (flags.OpenForReading() && flags.OpenForWriting()) { throw NotImplementedException("Cannot open an HTTP file for both reading and writing"); } else if (flags.OpenForAppending()) { diff --git a/test/sql/secret/secret_refresh.test b/test/sql/secret/secret_refresh.test index ab3f9f72..f2c55665 100644 --- a/test/sql/secret/secret_refresh.test +++ b/test/sql/secret/secret_refresh.test @@ -84,7 +84,7 @@ CREATE SECRET s1 ( statement error FROM "s3://test-bucket/test-file.parquet" ---- -403 +HTTP Error: HTTP GET error on 'http://test-bucket.duckdb-minio.com:9000/test-file.parquet' (HTTP 403) query I SELECT message[0:46] FROM duckdb_logs WHERE message like '%Successfully refreshed secret%' @@ -125,7 +125,7 @@ set s3_access_key_id='bogus' statement error FROM "s3://test-bucket/test-file.parquet" ---- -403 +HTTP Error: HTTP GET error on 'http://test-bucket.duckdb-minio.com:9000/test-file.parquet' (HTTP 403) # -> log empty query II From 7bcaf1faab448c120ff8db7f73ac5f6ac6b626ee Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Mon, 8 Sep 2025 16:34:52 +0200 Subject: [PATCH 75/78] fix: etag checking bug --- extension/httpfs/httpfs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension/httpfs/httpfs.cpp b/extension/httpfs/httpfs.cpp index 5a126824..bb7267aa 100644 --- a/extension/httpfs/httpfs.cpp +++ b/extension/httpfs/httpfs.cpp @@ -232,7 +232,7 @@ unique_ptr HTTPFileSystem::GetRangeRequest(FileHandle &handle, str if (static_cast(response.status) < 300) { // done redirecting out_offset = 0; - if (!hfh.http_params.unsafe_disable_etag_checks && hfh.etag.empty() && response.HasHeader("ETag")) { + if (!hfh.http_params.unsafe_disable_etag_checks && !hfh.etag.empty() && response.HasHeader("ETag")) { string responseEtag = response.GetHeaderValue("ETag"); if (!responseEtag.empty() && responseEtag != hfh.etag) { From 4045a7358cd8532d2d294fce3b4e1a008d253533 Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Thu, 11 Sep 2025 13:11:54 +0200 Subject: [PATCH 76/78] Bump duckdb and apply patches --- duckdb | 2 +- extension/httpfs/crypto.cpp | 52 +++++++++++++++++++---------- extension/httpfs/include/crypto.hpp | 11 +++--- 3 files changed, 40 insertions(+), 25 deletions(-) diff --git a/duckdb b/duckdb index 605eaf76..9cb1e3d3 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit 605eaf76be154d5c6d38353f96b23c031795572d +Subproject commit 9cb1e3d3ca556d6828703a3dbf06b968570125cc diff --git a/extension/httpfs/crypto.cpp b/extension/httpfs/crypto.cpp index 4e311ed8..61270e26 100644 --- a/extension/httpfs/crypto.cpp +++ b/extension/httpfs/crypto.cpp @@ -23,9 +23,9 @@ namespace duckdb { -AESStateSSL::AESStateSSL(EncryptionTypes::CipherType cipher_p, const std::string *key) : EncryptionState(cipher_p), context(EVP_CIPHER_CTX_new()), cipher(cipher_p) { +AESStateSSL::AESStateSSL(EncryptionTypes::CipherType cipher_p, idx_t key_len) : EncryptionState(cipher_p, key_len), context(EVP_CIPHER_CTX_new()) { if (!(context)) { - throw InternalException("AES GCM failed with initializing context"); + throw InternalException("OpenSSL AES failed with initializing context"); } } @@ -46,7 +46,7 @@ const EVP_CIPHER *AESStateSSL::GetCipher(idx_t key_len) { case 32: return EVP_aes_256_gcm(); default: - throw InternalException("Invalid AES key length"); + throw InternalException("Invalid AES key length for GCM"); } } case EncryptionTypes::CTR: { @@ -58,7 +58,7 @@ const EVP_CIPHER *AESStateSSL::GetCipher(idx_t key_len) { case 32: return EVP_aes_256_ctr(); default: - throw InternalException("Invalid AES key length"); + throw InternalException("Invalid AES key length for CTR"); } } case EncryptionTypes::CBC: { @@ -70,11 +70,11 @@ const EVP_CIPHER *AESStateSSL::GetCipher(idx_t key_len) { case 32: return EVP_aes_256_cbc(); default: - throw InternalException("Invalid AES key length"); + throw InternalException("Invalid AES key length for CBC"); } } default: - throw duckdb::InternalException("Invalid Encryption/Decryption Cipher: %d", static_cast(cipher)); + throw InternalException("Invalid Encryption/Decryption Cipher: %d", static_cast(cipher)); } } @@ -83,11 +83,21 @@ void AESStateSSL::GenerateRandomData(data_ptr_t data, idx_t len) { RAND_bytes(data, len); } -void AESStateSSL::InitializeEncryption(const_data_ptr_t iv, idx_t iv_len, const_data_ptr_t key, idx_t key_len, const_data_ptr_t aad, idx_t aad_len) { +void AESStateSSL::InitializeEncryption(const_data_ptr_t iv, idx_t iv_len, const_data_ptr_t key, idx_t key_len_p, const_data_ptr_t aad, idx_t aad_len) { mode = EncryptionTypes::ENCRYPT; - if (1 != EVP_EncryptInit_ex(context, GetCipher(key_len), NULL, key, iv)) { - throw InternalException("EncryptInit failed"); + if (key_len_p != key_len) { + throw InternalException("Invalid encryption key length, expected %llu, got %llu", key_len, key_len_p); + } + if (1 != EVP_EncryptInit_ex(context, GetCipher(key_len), NULL, NULL, NULL)) { + throw InternalException("EncryptInit failed (attempt 1)"); + } + if (1 != EVP_CIPHER_CTX_ctrl(context, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL)) { + throw InternalException("EVP_CIPHER_CTX_ctrl failed (EVP_CTRL_GCM_SET_IVLEN)"); + } + + if (1 != EVP_EncryptInit_ex(context, NULL, NULL, key, iv)) { + throw InternalException("EncryptInit failed (attempt 2)"); } int len; @@ -98,13 +108,23 @@ void AESStateSSL::InitializeEncryption(const_data_ptr_t iv, idx_t iv_len, const_ } } -void AESStateSSL::InitializeDecryption(const_data_ptr_t iv, idx_t iv_len, const_data_ptr_t key, idx_t key_len, const_data_ptr_t aad, idx_t aad_len) { +void AESStateSSL::InitializeDecryption(const_data_ptr_t iv, idx_t iv_len, const_data_ptr_t key, idx_t key_len_p, const_data_ptr_t aad, idx_t aad_len) { mode = EncryptionTypes::DECRYPT; - - if (1 != EVP_DecryptInit_ex(context, GetCipher(key_len), NULL, key, iv)) { - throw InternalException("DecryptInit failed"); + if (key_len_p != key_len) { + throw InternalException("Invalid encryption key length, expected %llu, got %llu", key_len, key_len_p); + } + if (1 != EVP_DecryptInit_ex(context, GetCipher(key_len), NULL, NULL, NULL)) { + throw InternalException("EVP_DecryptInit_ex failed to set cipher"); + } + // we use a bigger IV for GCM + if (cipher == EncryptionTypes::GCM) { + if (1 != EVP_CIPHER_CTX_ctrl(context, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL)) { + throw InternalException("EVP_CIPHER_CTX_ctrl failed to set GCM iv len"); + } + } + if (1 != EVP_DecryptInit_ex(context, NULL, NULL, key, iv)) { + throw InternalException("EVP_DecryptInit_ex failed to set iv/key"); } - int len; if (aad_len > 0){ if (!EVP_DecryptUpdate(context, NULL, &len, aad, aad_len)) { @@ -114,7 +134,6 @@ void AESStateSSL::InitializeDecryption(const_data_ptr_t iv, idx_t iv_len, const_ } size_t AESStateSSL::Process(const_data_ptr_t in, idx_t in_len, data_ptr_t out, idx_t out_len) { - switch (mode) { case EncryptionTypes::ENCRYPT: if (1 != EVP_EncryptUpdate(context, data_ptr_cast(out), reinterpret_cast(&out_len), @@ -135,7 +154,6 @@ size_t AESStateSSL::Process(const_data_ptr_t in, idx_t in_len, data_ptr_t out, i if (out_len != in_len) { throw InternalException("AES GCM failed, in- and output lengths differ"); } - return out_len; } @@ -189,7 +207,6 @@ size_t AESStateSSL::Finalize(data_ptr_t out, idx_t out_len, data_ptr_t tag, idx_ if (1 != EVP_EncryptFinal_ex(context, data_ptr_cast(out) + out_len, reinterpret_cast(&out_len))) { throw InternalException("EncryptFinal failed"); } - return text_len += out_len; } @@ -197,7 +214,6 @@ size_t AESStateSSL::Finalize(data_ptr_t out, idx_t out_len, data_ptr_t tag, idx_ // EVP_DecryptFinal() will return an error code if final block is not correctly formatted. int ret = EVP_DecryptFinal_ex(context, data_ptr_cast(out) + out_len, reinterpret_cast(&out_len)); text_len += out_len; - if (ret > 0) { // success return text_len; diff --git a/extension/httpfs/include/crypto.hpp b/extension/httpfs/include/crypto.hpp index 72100efe..f18fe0f3 100644 --- a/extension/httpfs/include/crypto.hpp +++ b/extension/httpfs/include/crypto.hpp @@ -22,10 +22,10 @@ void hmac256(std::string message, hash_bytes secret, hash_bytes &out); void hex256(hash_bytes &in, hash_str &out); -class DUCKDB_EXTENSION_API AESStateSSL : public duckdb::EncryptionState { +class DUCKDB_EXTENSION_API AESStateSSL : public EncryptionState { public: - explicit AESStateSSL(duckdb::EncryptionTypes::CipherType cipher_p, const std::string *key = nullptr); + explicit AESStateSSL(EncryptionTypes::CipherType cipher_p, idx_t key_len_p); ~AESStateSSL() override; public: @@ -40,8 +40,7 @@ class DUCKDB_EXTENSION_API AESStateSSL : public duckdb::EncryptionState { private: EVP_CIPHER_CTX *context; - duckdb::EncryptionTypes::Mode mode; - duckdb::EncryptionTypes::CipherType cipher; + EncryptionTypes::Mode mode; }; } // namespace duckdb @@ -53,8 +52,8 @@ class DUCKDB_EXTENSION_API AESStateSSLFactory : public duckdb::EncryptionUtil { explicit AESStateSSLFactory() { } - duckdb::shared_ptr CreateEncryptionState(duckdb::EncryptionTypes::CipherType cipher_p, duckdb::const_data_ptr_t key = nullptr, duckdb::idx_t key_len = 0) const override { - return duckdb::make_shared_ptr(cipher_p); + duckdb::shared_ptr CreateEncryptionState(duckdb::EncryptionTypes::CipherType cipher_p, duckdb::idx_t key_len_p) const override { + return duckdb::make_shared_ptr(cipher_p, key_len_p); } ~AESStateSSLFactory() override { From b0400ecb13c351f2cb37301c704191c1f8b60ae1 Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Thu, 11 Sep 2025 13:36:34 +0200 Subject: [PATCH 77/78] Bump also MainDistributionPipeline.yml --- .github/workflows/MainDistributionPipeline.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index e3c397ba..fdf0d42e 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -17,7 +17,7 @@ jobs: uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main with: extension_name: httpfs - duckdb_version: main + duckdb_version: v1.4-andium ci_tools_version: main @@ -28,6 +28,6 @@ jobs: secrets: inherit with: extension_name: httpfs - duckdb_version: main + duckdb_version: v1.4-andium ci_tools_version: main deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }} From ed9ba981bdfd0f1e419555eda173d36222b979fc Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Thu, 11 Sep 2025 13:38:05 +0200 Subject: [PATCH 78/78] Add also configuration to run duckdb tests with httpfs linked in --- Makefile | 14 ++++++++++++++ test/configs/duckdb-tests.json | 16 ++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 test/configs/duckdb-tests.json diff --git a/Makefile b/Makefile index c7db0379..48de9ee9 100644 --- a/Makefile +++ b/Makefile @@ -8,3 +8,17 @@ CORE_EXTENSIONS='' # Include the Makefile from extension-ci-tools include extension-ci-tools/makefiles/duckdb_extension.Makefile + + +## Add some more extra tests +test_release_internal: + ./build/release/$(TEST_PATH) "$(PROJ_DIR)test/*" + ./build/release/$(TEST_PATH) --test-dir duckdb --test-config test/configs/duckdb-tests.json + +test_debug_internal: + ./build/debug/$(TEST_PATH) "$(PROJ_DIR)test/*" + ./build/debug/$(TEST_PATH) --test-dir duckdb --test-config test/configs/duckdb-tests.json + +test_reldebug_internal: + ./build/reldebug/$(TEST_PATH) "$(PROJ_DIR)test/*" + ./build/reldebug/$(TEST_PATH) --test-dir duckdb --test-config test/configs/duckdb-tests.json diff --git a/test/configs/duckdb-tests.json b/test/configs/duckdb-tests.json new file mode 100644 index 00000000..763755b0 --- /dev/null +++ b/test/configs/duckdb-tests.json @@ -0,0 +1,16 @@ +{ + "description": "Run tests with httpfs loaded", + "statically_loaded_extensions": ["core_functions","parquet","httpfs"], + "skip_compiled": "true", + "skip_tests": [ + { + "reason": "Secrets & zstd are supported with httpfs loaded", + "paths": [ + "test/sql/secrets/create_secret_hffs_autoload.test", + "test/sql/secrets/secret_types_function.test", + "test/sql/secrets/secret_autoloading_errors.test", + "test/sql/copy/csv/zstd_crash.test" + ] + } + ] +}