diff --git a/.github/ai-opt-out b/.github/ai-opt-out new file mode 100644 index 00000000000..f2bf078d222 --- /dev/null +++ b/.github/ai-opt-out @@ -0,0 +1 @@ +opt-out: true diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 1cef39e1945..7abc185173f 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -39,6 +39,32 @@ jobs: mode: release enables: --enable-dpdk options: --cook dpdk --dpdk-machine corei7-avx + # disable dpdk build as we don't use it an it is + # long and breaks now and then: it still runs + # upstream + if: false + + build_with_dual_tls: + name: "Test with both TLS backends" + uses: ./.github/workflows/test.yaml + strategy: + fail-fast: false + with: + compiler: clang++ + standard: 23 + mode: debug + options: --tls-mode=both + + build_with_openssl_tls: + name: "Test with OpenSSL TLS backend only" + uses: ./.github/workflows/test.yaml + strategy: + fail-fast: false + with: + compiler: clang++ + standard: 23 + mode: debug + options: --tls-mode=openssl build_with_cxx_modules: name: "Test with C++20 modules enabled" @@ -51,6 +77,9 @@ jobs: mode: debug enables: --enable-cxx-modules enable-ccache: false + # disable modules build as we aren't using module and it is quite + # broken at the moment + if: false fuzz_test: name: "Fuzz Tests" diff --git a/CMakeLists.txt b/CMakeLists.txt index bacd7a7fb4a..da32c419ab1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -525,11 +525,13 @@ seastar_generate_protobuf ( IN_FILE ${CMAKE_CURRENT_SOURCE_DIR}/src/proto/metrics2.proto OUT_DIR ${Seastar_GEN_BINARY_DIR}/src/proto) -set_option_if_package_is_found (Seastar_GNUTLS GnuTLS) -set_option_if_package_is_found (Seastar_OPENSSL OpenSSL) +option (Seastar_GNUTLS "Enable the GnuTLS-based TLS backend" ON) +option (Seastar_OPENSSL "Enable the OpenSSL-based TLS backend" OFF) if (NOT Seastar_GNUTLS AND NOT Seastar_OPENSSL) - message (FATAL_ERROR "At least one TLS/crypto backend is required. Install GnuTLS or OpenSSL development packages.") + message (FATAL_ERROR "At least one TLS backend must be enabled. " + "Pass -DSeastar_GNUTLS=ON and/or -DSeastar_OPENSSL=ON, " + "or use configure.py --tls-mode=gnutls|openssl|both.") endif () add_library (seastar @@ -722,6 +724,7 @@ add_library (seastar src/core/reactor_backend.cc src/core/thread_pool.cc src/core/app-template.cc + src/core/cpu_profiler.cc src/core/disk_params.cc src/core/dpdk_rte.cc src/core/exception_hacks.cc @@ -755,6 +758,7 @@ add_library (seastar src/core/semaphore.cc src/core/condition-variable.cc src/core/crypto.cc + src/core/signal_mutex.cc src/http/api_docs.cc src/http/common.cc src/http/file_handler.cc @@ -1132,6 +1136,15 @@ if (Seastar_OPENSSL) PRIVATE OpenSSL::SSL OpenSSL::Crypto) endif () +if (Seastar_GNUTLS AND Seastar_OPENSSL) + # Public marker: both TLS backends are compiled in, so the active backend is + # selected at reactor startup. Code that needs to handle the no-reactor case + # (e.g. static initializers, unit tests without a reactor) can use this to + # distinguish from the single-backend builds where the backend is fixed at + # compile time and available unconditionally. + target_compile_definitions (seastar PUBLIC SEASTAR_TLS_DUAL_BACKEND) +endif () + set_option_if_package_is_found (Seastar_IO_URING LibUring) if (Seastar_IO_URING) list (APPEND Seastar_PRIVATE_COMPILE_DEFINITIONS SEASTAR_HAVE_URING) diff --git a/apps/memcached/memcache.cc b/apps/memcached/memcache.cc index 6933e9e3975..6746e0fc07b 100644 --- a/apps/memcached/memcache.cc +++ b/apps/memcached/memcache.cc @@ -893,7 +893,7 @@ class ascii_protocol { private: static void append(std::vector>& bufs, const char* buf, size_t size) { if (size) { - bufs.emplace_back(const_cast(buf), size, deleter()); + bufs.push_back(temporary_buffer::maybe_unsafe_from_deleter(const_cast(buf), size, deleter())); } } @@ -917,7 +917,7 @@ class ascii_protocol { append(bufs, msg_crlf); append(bufs, item->value()); - bufs.emplace_back(const_cast(msg_crlf), strlen(msg_crlf), make_deleter([item = std::move(item)]{})); + bufs.push_back(temporary_buffer::maybe_unsafe_from_deleter(const_cast(msg_crlf), strlen(msg_crlf), make_deleter([item = std::move(item)]{}))); } template diff --git a/configure.py b/configure.py index 1cd299e5fa4..ca81cf1781b 100755 --- a/configure.py +++ b/configure.py @@ -173,6 +173,9 @@ def resolve_compilers_for_compiler_cache(args, compiler_cache): arg_parser.add_argument('--verbose', dest='verbose', action='store_true', help='Make configure output more verbose.') arg_parser.add_argument('--scheduling-groups-count', action='store', dest='scheduling_groups_count', default='16', help='Number of available scheduling groups in the reactor') +arg_parser.add_argument('--tls-mode', action='store', dest='tls_mode', + choices=['gnutls', 'openssl', 'both'], default='gnutls', + help='TLS backend(s) to enable: gnutls (default), openssl, or both') add_tristate( arg_parser, @@ -289,6 +292,8 @@ def configure_mode(mode): '-DBUILD_SHARED_LIBS={}'.format('yes' if mode in ('debug', 'dev') else 'no'), '-DSeastar_API_LEVEL={}'.format(args.api_level), '-DSeastar_SCHEDULING_GROUPS_COUNT={}'.format(args.scheduling_groups_count), + '-DSeastar_GNUTLS={}'.format('ON' if args.tls_mode in ('gnutls', 'both') else 'OFF'), + '-DSeastar_OPENSSL={}'.format('ON' if args.tls_mode in ('openssl', 'both') else 'OFF'), tr(args.exclude_tests, 'EXCLUDE_TESTS_FROM_ALL'), tr(args.exclude_apps, 'EXCLUDE_APPS_FROM_ALL'), tr(args.exclude_demos, 'EXCLUDE_DEMOS_FROM_ALL'), diff --git a/demos/tls_echo_server.hh b/demos/tls_echo_server.hh index e7185dac95e..4c633b1c520 100644 --- a/demos/tls_echo_server.hh +++ b/demos/tls_echo_server.hh @@ -46,70 +46,86 @@ class echoserver { seastar::gate _gate; bool _stopped = false; bool _verbose = false; + + future run_once() { + if (_stopped) { + return make_ready_future(stop_iteration::yes); + } + return with_gate(_gate, [this] { + return _socket.accept().then([this](accept_result ar) { + ::connected_socket s = std::move(ar.connection); + socket_address a = std::move(ar.remote_address); + if (_verbose) { + std::cout << "Got connection from "<< a << std::endl; + } + auto strms = make_lw_shared(std::move(s)); + return repeat([strms, this]() { + return strms->in.read().then([this, strms](temporary_buffer buf) { + if (buf.empty()) { + if (_verbose) { + std::cout << "EOM" << std::endl; + } + return make_ready_future(stop_iteration::yes); + } + sstring tmp(buf.begin(), buf.end()); + if (_verbose) { + std::cout << "Read " << tmp.size() << "B" << std::endl; + } + return strms->out.write(tmp).then([strms]() { + return strms->out.flush(); + }).then([] { + return make_ready_future(stop_iteration::no); + }); + }); + }).then([strms]{ + return strms->out.close(); + }).handle_exception([](auto ep) { + std::cout << "Exception: " << ep << std::endl; + }).finally([this, strms]{ + if (_verbose) { + std::cout << "Ending session" << std::endl; + } + return strms->in.close(); + }); + }).handle_exception([this](auto ep) { + if (!_stopped) { + std::cerr << "Error: " << ep << std::endl; + } + }).then([this] { + return make_ready_future(_stopped ? stop_iteration::yes : stop_iteration::no); + }); + }); + } public: echoserver(bool verbose = false) : _certs(make_shared(make_shared())) , _verbose(verbose) {} - future<> listen(socket_address addr, sstring crtfile, sstring keyfile, tls::client_auth ca = tls::client_auth::NONE) { - _certs->set_client_auth(ca); - return _certs->set_x509_key_file(crtfile, keyfile, tls::x509_crt_format::PEM).then([this, addr] { - ::listen_options opts; - opts.reuse_address = true; + future<> listen(socket_address addr, sstring crtfile, sstring keyfile, sstring cafile) { + _certs->set_dn_verification_callback([](seastar::tls::session_type, sstring subject, sstring issuer){ + std::cout << "DN Verification callback, subject: " << subject << " issuer: " << issuer << std::endl; + }); + auto f = make_ready_future(); + auto cauth = tls::client_auth::NONE; + if (cafile != "") { + cauth = tls::client_auth::REQUIRE; + f = _certs->set_x509_trust_file(cafile, tls::x509_crt_format::PEM); + } + _certs->set_client_auth(cauth); + return f.then([this, addr, crtfile, keyfile] { + return _certs->set_x509_key_file(crtfile, keyfile, tls::x509_crt_format::PEM).then([this, addr] { + ::listen_options opts; + opts.reuse_address = true; - _socket = tls::listen(_certs, addr, opts); + _socket = tls::listen(_certs, addr, opts); - // Listen in background. - (void)repeat([this] { - if (_stopped) { - return make_ready_future(stop_iteration::yes); - } - return with_gate(_gate, [this] { - return _socket.accept().then([this](accept_result ar) { - ::connected_socket s = std::move(ar.connection); - socket_address a = std::move(ar.remote_address); - if (_verbose) { - std::cout << "Got connection from "<< a << std::endl; - } - auto strms = make_lw_shared(std::move(s)); - return repeat([strms, this]() { - return strms->in.read().then([this, strms](temporary_buffer buf) { - if (buf.empty()) { - if (_verbose) { - std::cout << "EOM" << std::endl; - } - return make_ready_future(stop_iteration::yes); - } - sstring tmp(buf.begin(), buf.end()); - if (_verbose) { - std::cout << "Read " << tmp.size() << "B" << std::endl; - } - return strms->out.write(tmp).then([strms]() { - return strms->out.flush(); - }).then([] { - return make_ready_future(stop_iteration::no); - }); - }); - }).then([strms]{ - return strms->out.close(); - }).handle_exception([](auto ep) { - }).finally([this, strms]{ - if (_verbose) { - std::cout << "Ending session" << std::endl; - } - return strms->in.close(); - }); - }).handle_exception([this](auto ep) { - if (!_stopped) { - std::cerr << "Error: " << ep << std::endl; - } - }).then([this] { - return make_ready_future(_stopped ? stop_iteration::yes : stop_iteration::no); + // Listen in background. + (void)repeat([this] { + return run_once(); }); - }); + return make_ready_future(); }); - return make_ready_future(); }); } diff --git a/demos/tls_echo_server_demo.cc b/demos/tls_echo_server_demo.cc index 4c445fea973..9611f5e8b06 100644 --- a/demos/tls_echo_server_demo.cc +++ b/demos/tls_echo_server_demo.cc @@ -37,6 +37,7 @@ int main(int ac, char** av) { app.add_options() ("port", bpo::value()->default_value(10000), "Server port") ("address", bpo::value()->default_value("127.0.0.1"), "Server address") + ("ca,a", bpo::value()->default_value(""), "Server CA chain file") ("cert,c", bpo::value()->required(), "Server certificate file") ("key,k", bpo::value()->required(), "Certificate key") ("verbose,v", bpo::value()->default_value(false)->implicit_value(true), "Verbose") @@ -46,6 +47,7 @@ int main(int ac, char** av) { seastar_apps_lib::stop_signal stop_signal; auto&& config = app.configuration(); uint16_t port = config["port"].as(); + auto ca = config["ca"].as(); auto crt = config["cert"].as(); auto key = config["key"].as(); auto addr = config["address"].as(); @@ -61,7 +63,7 @@ int main(int ac, char** av) { auto stop_server = deferred_stop(server); try { - server.invoke_on_all(&echoserver::listen, socket_address(ia), sstring(crt), sstring(key), tls::client_auth::NONE).get(); + server.invoke_on_all(&echoserver::listen, socket_address(ia), sstring(crt), sstring(key),sstring(ca)).get(); } catch (...) { std::cerr << "Error: " << std::current_exception() << std::endl; return 1; diff --git a/demos/tls_simple_client_demo.cc b/demos/tls_simple_client_demo.cc index d35b451d229..a83ed798f8b 100644 --- a/demos/tls_simple_client_demo.cc +++ b/demos/tls_simple_client_demo.cc @@ -39,6 +39,8 @@ int main(int ac, char** av) { ("port", bpo::value()->default_value(10000), "Remote port") ("address", bpo::value()->default_value("127.0.0.1"), "Remote address") ("trust,t", bpo::value(), "Trust store") + ("certificate", bpo::value(), "Certficiate") + ("key,k", bpo::value(), "Private Keyfile") ("msg,m", bpo::value(), "Message to send") ("bytes,b", bpo::value()->default_value(512), "Use random bytes of length as message") ("iterations,i", bpo::value()->default_value(1), "Repeat X times") @@ -68,6 +70,15 @@ int main(int ac, char** av) { f = certs->set_x509_trust_file(config["trust"].as(), tls::x509_crt_format::PEM); } + if (config.count("certificate") && config.count("key")) { + f = f.then([certs, + cert = config["certificate"].as(), + key = config["key"].as()]{ + return certs->set_x509_key_file(cert, key, tls::x509_crt_format::PEM); + }); + } + + seastar::shared_ptr msg; if (config.count("msg")) { diff --git a/demos/udp_zero_copy_demo.cc b/demos/udp_zero_copy_demo.cc index 79dae99444f..c59b238138a 100644 --- a/demos/udp_zero_copy_demo.cc +++ b/demos/udp_zero_copy_demo.cc @@ -107,7 +107,7 @@ class server { if (_copy) { bufs.emplace_back(temporary_buffer(chunk, _chunk_size)); } else { - bufs.emplace_back(temporary_buffer(chunk, _chunk_size, deleter())); + bufs.emplace_back(temporary_buffer::maybe_unsafe_from_deleter(chunk, _chunk_size, deleter())); } chunk += _chunk_size; } diff --git a/include/seastar/core/deleter.hh b/include/seastar/core/deleter.hh index 371459e242f..9db9e0c61ee 100644 --- a/include/seastar/core/deleter.hh +++ b/include/seastar/core/deleter.hh @@ -21,13 +21,26 @@ #pragma once +#include #include #include #include #include #include +// The forward declarations of classes below are used for +// friending by the deleter. +struct test_deleter_append_does_not_free_shared_object; +struct test_deleter_append_same_shared_object_twice; + namespace seastar { +namespace net { + class packet; +}; +namespace internal { + struct wrapped_iovecs; +} +class pipe_data_sink_impl; /// \addtogroup memory-module /// @{ @@ -82,11 +95,21 @@ public: this->~deleter(); new (this) deleter(i); } +private: /// \endcond /// Appends another deleter to this deleter. When this deleter is /// destroyed, both encapsulated actions will be carried out. + /// + /// This operation is not thread-safe and therefore not made public + /// except for a few manually verified uses that are marked as freinds + /// below. void append(deleter d); -private: + friend class ::seastar::net::packet; + friend struct ::test_deleter_append_does_not_free_shared_object; + friend struct ::test_deleter_append_same_shared_object_twice; + friend struct ::seastar::internal::wrapped_iovecs; + friend class ::seastar::pipe_data_sink_impl; + static bool is_raw_object(impl* i) noexcept { auto x = reinterpret_cast(i); return x & 1; @@ -109,7 +132,9 @@ private: /// \cond internal struct deleter::impl { - unsigned refs = 1; + // The memory ordering on operations to this counter is similar to + // std::shared_ptr. + std::atomic refs = 1; deleter next; impl(deleter next) : next(std::move(next)) {} virtual ~impl() {} @@ -122,7 +147,7 @@ deleter::~deleter() { std::free(to_raw_object()); return; } - if (_impl && --_impl->refs == 0) { + if (_impl && _impl->refs.fetch_sub(1, std::memory_order_acq_rel) == 1) { delete _impl; } } @@ -203,7 +228,7 @@ deleter::share() { if (is_raw_object()) { _impl = new free_deleter_impl(to_raw_object()); } - ++_impl->refs; + _impl->refs.fetch_add(1, std::memory_order_relaxed); return deleter(_impl); } diff --git a/include/seastar/core/disk_params.hh b/include/seastar/core/disk_params.hh index 9bbd51e909f..3a8846add75 100644 --- a/include/seastar/core/disk_params.hh +++ b/include/seastar/core/disk_params.hh @@ -45,6 +45,7 @@ struct disk_params { std::optional physical_block_size; // Override for disks that lie about their physical block size bool duplex = false; float rate_factor = 1.0; + bool max_cost_function = true; }; class disk_config_params { diff --git a/include/seastar/core/file.hh b/include/seastar/core/file.hh index 6dc41b67985..19c69ea3250 100644 --- a/include/seastar/core/file.hh +++ b/include/seastar/core/file.hh @@ -643,7 +643,7 @@ public: future> dma_read_bulk(uint64_t offset, size_t range_size, io_intent* intent = nullptr) noexcept { return dma_read_bulk_impl(offset, range_size, intent).then([] (temporary_buffer t) { - return temporary_buffer(reinterpret_cast(t.get_write()), t.size(), t.release()); + return temporary_buffer::maybe_unsafe_from_deleter(reinterpret_cast(t.get_write()), t.size(), t.release()); }); } diff --git a/include/seastar/core/internal/cpu_profiler.hh b/include/seastar/core/internal/cpu_profiler.hh new file mode 100644 index 00000000000..7fb369cb1f6 --- /dev/null +++ b/include/seastar/core/internal/cpu_profiler.hh @@ -0,0 +1,182 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2023 ScyllaDB + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace seastar { + +class reactor; + +struct cpu_profiler_trace { + using kernel_trace_vec = boost::container::static_vector; + simple_backtrace user_backtrace; + kernel_trace_vec kernel_backtrace; + // The scheduling group active at the time the same was taken. Note that + // non-task reactor work (such as polling) ends up the in the default + // scheduling group (with name "main"). + scheduling_group sg; +}; + +constexpr size_t max_number_of_traces = 128; + +namespace internal { + +// Temporarily enable/disable the CPU profiler from taking stacktraces on this thread, +// but don't disable the profiler completely. This can be used disable the profiler +// for cases when taking a backtrace isn't valid (IE JIT generated code). +void profiler_drop_stacktraces(bool) noexcept; + +// A small RAII object to disable profiling temporarily +// +// This is not reentrant. +class scoped_disable_profile_temporarily { +public: + scoped_disable_profile_temporarily() noexcept { + profiler_drop_stacktraces(true); + } + ~scoped_disable_profile_temporarily() noexcept { + profiler_drop_stacktraces(false); + } +}; + +struct cpu_profiler_config { + bool enabled; + std::chrono::nanoseconds period; +}; + +struct cpu_profiler_stats { + unsigned dropped_samples_from_manual_disablement{0}; + unsigned dropped_samples_from_exceptions{0}; + unsigned dropped_samples_from_buffer_full{0}; + unsigned dropped_samples_from_mutex_contention{0}; + + void clear_dropped() { + dropped_samples_from_manual_disablement = 0; + dropped_samples_from_exceptions = 0; + dropped_samples_from_buffer_full = 0; + dropped_samples_from_mutex_contention = 0; + } + + unsigned sum_dropped() const { + return dropped_samples_from_manual_disablement + + dropped_samples_from_buffer_full + + dropped_samples_from_exceptions + + dropped_samples_from_mutex_contention; + } +}; + +class cpu_profiler { +private: + circular_buffer_fixed_capacity _traces; + // The operations in `_traces` are not reentrant. Therefore mutex is used to ensure + // that an interrupt cannot access `_traces` if the interrupted thread was already + // accessing it. + signal_mutex _traces_mutex; + cpu_profiler_config _cfg; + std::chrono::nanoseconds _last_set_timeout; + cpu_profiler_stats _stats; + bool _is_stopped{true}; + + + bool is_enabled() const; + std::chrono::nanoseconds period() const; + std::chrono::nanoseconds get_next_timeout(); + +protected: + friend reactor; + +public: + static int signal_number() { return SIGRTMIN + 2; } + + cpu_profiler(cpu_profiler_config cfg) : _cfg(cfg) {} + + // Allows for the sampling period of the profiler to be adjusted + // and the profiler to be enabled and disabled. + void update_config(cpu_profiler_config cfg); + // Stops the profiler if running and prevents it from starting until + // `start()` is explicitly called. + void stop(); + // Allows to profiler to run when it's enabled via the `cpu_profiler_config`. + void start(); + void on_signal(); + size_t results(std::vector& results_buffer); + + virtual ~cpu_profiler() = default; + virtual void arm_timer(std::chrono::nanoseconds) = 0; + virtual void disarm_timer() = 0; + virtual bool is_spurious_signal() { return false; } + virtual std::optional + try_get_kernel_backtrace() { return std::nullopt; } +}; + +class cpu_profiler_posix_timer : public cpu_profiler { + posix_timer _timer; +public: + cpu_profiler_posix_timer(cpu_profiler_config cfg) + : cpu_profiler(cfg) + // CLOCK_MONOTONIC is used here in place of CLOCK_THREAD_CPUTIME_ID. + // This is since for intervals of ~5ms or less CLOCK_THREAD_CPUTIME_ID + // fires 200-600% after it's configured time. Therefore it is not granular + // enough for cases where the reactor is configured to sleep when idle and + // is only active for short intervals. CLOCK_MONOTONIC doesn't suffer from + // this issue. + , _timer({signal_number()}, CLOCK_MONOTONIC) {} + + virtual ~cpu_profiler_posix_timer() override = default; + virtual void arm_timer(std::chrono::nanoseconds) override; + virtual void disarm_timer() override; +}; + +class cpu_profiler_linux_perf_event : public cpu_profiler { + linux_perf_event _perf_event; +public: + static std::unique_ptr try_make(cpu_profiler_config); + cpu_profiler_linux_perf_event(linux_perf_event perf_event, cpu_profiler_config cfg) + : cpu_profiler(cfg) + , _perf_event(std::move(perf_event)) {} + + virtual ~cpu_profiler_linux_perf_event() override = default; + virtual void arm_timer(std::chrono::nanoseconds) override; + virtual void disarm_timer() override; + virtual bool is_spurious_signal() override; + virtual std::optional + try_get_kernel_backtrace() override; +}; + +std::unique_ptr make_cpu_profiler(cpu_profiler_config cfg = {false, std::chrono::milliseconds(100)}); + +} +} diff --git a/include/seastar/core/internal/signal_mutex.hh b/include/seastar/core/internal/signal_mutex.hh new file mode 100644 index 00000000000..e48841cc772 --- /dev/null +++ b/include/seastar/core/internal/signal_mutex.hh @@ -0,0 +1,52 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2025 ScyllaDB + */ + +#pragma once + +#include +#include + +namespace seastar::internal { + +/// A lightweight mutex designed to work with interrupts +/// utilizing only compiler barriers. +class signal_mutex { +public: + class guard { + private: + signal_mutex* _mutex; + guard(signal_mutex* m) : _mutex(m) {} + friend class signal_mutex; + public: + guard(guard&& o) : _mutex(o._mutex) { o._mutex = nullptr; } + ~guard(); + }; + + // Returns a `guard` if the lock was acquired. + // Otherwise returns a nullopt. + std::optional try_lock(); + +private: + friend class guard; + std::atomic_bool _mutex; +}; + +} // namespace seastar::internal diff --git a/include/seastar/core/internal/stall_detector.hh b/include/seastar/core/internal/stall_detector.hh index cdee5b825fa..f65ab9eb96e 100644 --- a/include/seastar/core/internal/stall_detector.hh +++ b/include/seastar/core/internal/stall_detector.hh @@ -32,6 +32,7 @@ #include #include #include +#include namespace seastar { @@ -94,85 +95,25 @@ public: }; class cpu_stall_detector_posix_timer : public cpu_stall_detector { - timer_t _timer; + posix_timer _timer; public: explicit cpu_stall_detector_posix_timer(cpu_stall_detector_config cfg = {}); - virtual ~cpu_stall_detector_posix_timer() override; + virtual ~cpu_stall_detector_posix_timer() override = default; private: virtual void arm_timer() override; virtual void start_sleep() override; }; class cpu_stall_detector_linux_perf_event : public cpu_stall_detector { - file_desc _fd; - bool _enabled = false; - uint64_t _current_period = 0; - struct ::perf_event_mmap_page* _mmap; - char* _data_area; - size_t _data_area_mask; - // after the detector has been armed (i.e., _enabled is true), this - // is the moment at or after which the next signal is expected to occur - // and can be used for detecting spurious signals - sched_clock::time_point _next_signal_time{}; -private: - class data_area_reader { - cpu_stall_detector_linux_perf_event& _p; - const char* _data_area; - size_t _data_area_mask; - uint64_t _head; - uint64_t _tail; - public: - explicit data_area_reader(cpu_stall_detector_linux_perf_event& p) - : _p(p) - , _data_area(p._data_area) - , _data_area_mask(p._data_area_mask) { - _head = _p._mmap->data_head; - _tail = _p._mmap->data_tail; - std::atomic_thread_fence(std::memory_order_acquire); // required after reading data_head - } - ~data_area_reader() { - std::atomic_thread_fence(std::memory_order_release); // not documented, but probably required before writing data_tail - _p._mmap->data_tail = _tail; - } - uint64_t read_u64() { - uint64_t ret; - // We cannot wrap around if the 8-byte unit is aligned - std::copy_n(_data_area + (_tail & _data_area_mask), 8, reinterpret_cast(&ret)); - _tail += 8; - return ret; - } - template - S read_struct() { - static_assert(sizeof(S) % 8 == 0); - S ret; - char* p = reinterpret_cast(&ret); - for (size_t i = 0; i != sizeof(S); i += 8) { - uint64_t w = read_u64(); - std::copy_n(reinterpret_cast(&w), 8, p + i); - } - return ret; - } - void skip(uint64_t bytes_to_skip) { - _tail += bytes_to_skip; - } - // skip all the remaining data in the buffer, as-if calling read until - // have_data returns false (but much faster) - void skip_all() { - _tail = _head; - } - bool have_data() const { - return _head != _tail; - } - }; - - virtual void maybe_report_kernel_trace(backtrace_buffer& buf) override; + linux_perf_event _perf_event; public: static std::unique_ptr try_make(cpu_stall_detector_config cfg = {}); - explicit cpu_stall_detector_linux_perf_event(file_desc fd, cpu_stall_detector_config cfg = {}); - ~cpu_stall_detector_linux_perf_event(); + explicit cpu_stall_detector_linux_perf_event(linux_perf_event perf_event, cpu_stall_detector_config cfg = {}); + virtual ~cpu_stall_detector_linux_perf_event() override = default; virtual void arm_timer() override; virtual void start_sleep() override; virtual bool is_spurious_signal() override; + virtual void maybe_report_kernel_trace(backtrace_buffer& buf) override; }; std::unique_ptr make_cpu_stall_detector(cpu_stall_detector_config cfg = {}); diff --git a/include/seastar/core/internal/timers.hh b/include/seastar/core/internal/timers.hh new file mode 100644 index 00000000000..ec4995bfe34 --- /dev/null +++ b/include/seastar/core/internal/timers.hh @@ -0,0 +1,146 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2023 ScyllaDB + */ + +#pragma once + +#ifndef SEASTAR_MODULE +#include +#include +#include +#include +#include + +#include +#endif + +#include +#include + +namespace seastar { +namespace internal { + +struct timer_cfg { + int signal_number; +}; + +class posix_timer { + timer_t _timer; +public: + explicit posix_timer(timer_cfg cfg, clockid_t clock_id = CLOCK_THREAD_CPUTIME_ID); + virtual ~posix_timer(); + void arm_timer(std::chrono::nanoseconds); + void disarm_timer(); +}; + +class linux_perf_event { + file_desc _fd; + bool _enabled = false; + uint64_t _current_period = 0; + struct ::perf_event_mmap_page* _mmap; + char* _data_area; + size_t _data_area_mask; + // after the detector has been armed (i.e., _enabled is true), this + // is the moment at or after which the next signal is expected to occur + // and can be used for detecting spurious signals + sched_clock::time_point _next_signal_time{}; +private: + class data_area_reader { + std::reference_wrapper _p; + const char* _data_area; + size_t _data_area_mask; + uint64_t _head; + uint64_t _tail; + public: + explicit data_area_reader(linux_perf_event& p) + : _p(p) + , _data_area(p._data_area) + , _data_area_mask(p._data_area_mask) { + _head = _p.get()._mmap->data_head; + _tail = _p.get()._mmap->data_tail; + std::atomic_thread_fence(std::memory_order_acquire); // required after reading data_head + } + data_area_reader(data_area_reader&& o) + : _p(o._p) + , _data_area(o._data_area) + , _data_area_mask(o._data_area_mask) + , _head(o._head) + , _tail(o._tail) { + o._data_area = nullptr; + } + ~data_area_reader() { + if(_data_area != nullptr) { + std::atomic_thread_fence(std::memory_order_release); // not documented, but probably required before writing data_tail + _p.get()._mmap->data_tail = _tail; + } + } + uint64_t read_u64() { + + uint64_t ret; + // We cannot wrap around if the 8-byte unit is aligned + std::copy_n(_data_area + (_tail & _data_area_mask), 8, reinterpret_cast(&ret)); + _tail += 8; + return ret; + } + template + S read_struct() { + static_assert(sizeof(S) % 8 == 0); + S ret; + char* p = reinterpret_cast(&ret); + for (size_t i = 0; i != sizeof(S); i += 8) { + uint64_t w = read_u64(); + std::copy_n(reinterpret_cast(&w), 8, p + i); + } + return ret; + } + void skip(uint64_t bytes_to_skip) { + _tail += bytes_to_skip; + } + // skip all the remaining data in the buffer, as-if calling read until + // have_data returns false (but much faster) + void skip_all() { + _tail = _head; + } + bool have_data() const { + return _head != _tail; + } + }; + + explicit linux_perf_event(file_desc fd); +public: + + class kernel_backtrace { + data_area_reader _reader; + public: + kernel_backtrace(data_area_reader reader) : _reader(std::move(reader)) {} + void read_backtrace(std::function); + }; + + linux_perf_event(linux_perf_event&&); + static linux_perf_event try_make(timer_cfg cfg); + ~linux_perf_event(); + void arm_timer(std::chrono::nanoseconds); + void disarm_timer(); + bool is_spurious_signal(); + std::optional try_get_kernel_backtrace(); +}; + +} +} diff --git a/include/seastar/core/io_queue.hh b/include/seastar/core/io_queue.hh index 69411e901f4..d07db79feb4 100644 --- a/include/seastar/core/io_queue.hh +++ b/include/seastar/core/io_queue.hh @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -192,6 +193,13 @@ public: std::chrono::milliseconds stall_threshold = std::chrono::milliseconds(100); std::chrono::microseconds tau = std::chrono::milliseconds(5); std::optional physical_block_size; // Override for disks that lie about their physical block size + + // Original values of io-properties (if available) + size_t read_bytes_rate = std::numeric_limits::max(); + size_t write_bytes_rate = std::numeric_limits::max(); + size_t read_req_rate = std::numeric_limits::max(); + size_t write_req_rate = std::numeric_limits::max(); + bool max_cost_function = true; }; io_queue(io_group_ptr group, internal::io_sink& sink); diff --git a/include/seastar/core/memory.hh b/include/seastar/core/memory.hh index 05c66bcce7e..e2762464a0a 100644 --- a/include/seastar/core/memory.hh +++ b/include/seastar/core/memory.hh @@ -184,6 +184,17 @@ public: } }; +// Within the scope of this object, the allocator will not abort (when it +// would normally do so, i.e., when abort_on_allocation_failure is true), +// but rather fall back to the system allocator. +struct scoped_system_alloc_fallback { + scoped_system_alloc_fallback() noexcept; + ~scoped_system_alloc_fallback() noexcept; + + scoped_system_alloc_fallback(const scoped_system_alloc_fallback&) = delete; + scoped_system_alloc_fallback& operator=(const scoped_system_alloc_fallback&) = delete; +}; + // Disables heap profiling as long as this object is alive. // Can be nested, in which case the profiling is re-enabled when all // the objects go out of scope. @@ -300,6 +311,7 @@ class statistics { uint64_t _reclaims; uint64_t _large_allocs; uint64_t _failed_allocs; + uint64_t _fallback_allocs; uint64_t _foreign_mallocs; uint64_t _foreign_frees; @@ -307,11 +319,11 @@ class statistics { private: statistics(uint64_t mallocs, uint64_t frees, uint64_t cross_cpu_frees, uint64_t total_memory, uint64_t free_memory, uint64_t total_bytes_allocated, uint64_t reclaims, - uint64_t large_allocs, uint64_t failed_allocs, + uint64_t large_allocs, uint64_t failed_allocs, uint64_t fallback_allocs, uint64_t foreign_mallocs, uint64_t foreign_frees, uint64_t foreign_cross_frees) : _mallocs(mallocs), _frees(frees), _cross_cpu_frees(cross_cpu_frees) , _total_memory(total_memory), _free_memory(free_memory), _total_bytes_allocated(total_bytes_allocated), _reclaims(reclaims) - , _large_allocs(large_allocs), _failed_allocs(failed_allocs) + , _large_allocs(large_allocs), _failed_allocs(failed_allocs), _fallback_allocs(fallback_allocs) , _foreign_mallocs(foreign_mallocs), _foreign_frees(foreign_frees) , _foreign_cross_frees(foreign_cross_frees) {} public: @@ -337,13 +349,18 @@ public: /// Number of allocations which violated the large allocation threshold uint64_t large_allocations() const { return _large_allocs; } /// Number of allocations which failed, i.e., where the required memory could not be obtained - /// even after reclaim was attempted + /// even after reclaim was attempted and which did not fallback (see fallback_allocations()) uint64_t failed_allocations() const { return _failed_allocs; } - /// Number of foreign allocations + /// Number of allocations which fell back to the system allocator, i.e., because they were + /// in a fallback allocation scope. These are not counted in failed_allocations. + uint64_t fallback_allocations() const { return _fallback_allocs; } + /// Number of foreign allocations, which are all allocations which use the system allocator. + /// These include allocations on alien threads, allocations (even on reactor threads) before + /// the allocator is initialized and allocations in a fallback allocation scope. uint64_t foreign_mallocs() const { return _foreign_mallocs; } - /// Number of foreign frees + /// Number of foreign frees (frees of non-seastar-heap pointers) on alien threads uint64_t foreign_frees() const { return _foreign_frees; } - /// Number of foreign frees on reactor threads + /// Number of foreign frees (frees of non-seastar-heap pointers) on reactor threads uint64_t foreign_cross_frees() const { return _foreign_cross_frees; } friend statistics stats(); }; diff --git a/include/seastar/core/metrics.hh b/include/seastar/core/metrics.hh index cd3c5749c5c..ba25db36a52 100644 --- a/include/seastar/core/metrics.hh +++ b/include/seastar/core/metrics.hh @@ -435,6 +435,7 @@ public: virtual metric_groups_def& add_metric(group_name_type name, const metric_definition& md) = 0; virtual metric_groups_def& add_group(group_name_type name, const std::initializer_list& l) = 0; virtual metric_groups_def& add_group(group_name_type name, const std::vector& l) = 0; + virtual int get_handle() const = 0; }; escaped_string shard(); @@ -649,6 +650,13 @@ impl::metric_definition_impl make_total_operations(metric_name_type name, return make_counter(name, std::forward(val), d, labels).set_type("total_operations"); } +/*! + * \brief Update the aggregation labels of a metric family + */ +void update_aggregate_labels(const group_name_type& group_name, + const metric_name_type& metric_name, + const std::vector