diff --git a/projects/hotswap/CMakeLists.txt b/projects/hotswap/CMakeLists.txt index d71ae9143be..dbc24c97de4 100644 --- a/projects/hotswap/CMakeLists.txt +++ b/projects/hotswap/CMakeLists.txt @@ -30,6 +30,7 @@ add_library(hsa-hotswap SHARED hotswap_tool.cpp hotswap.cpp hotswap_gfx_query.cpp + hotswap_loader_policy.cpp ${HOTSWAP_PLATFORM_IO_SRC} ) @@ -68,14 +69,14 @@ file(WRITE "${_hotswap_co_hdr}" "static const unsigned char kGfx1250MinCo[] = {${_hotswap_co_arr}};\n") target_include_directories(hotswap_test PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) -# Unit tests for the gfx-target / ASIC-revision query logic. The test compiles -# the portable hotswap_gfx_query.cpp unit alongside the test translation unit -# and supplies its own stubs for the HSA entry points, so this target needs -# the relevant headers but must NOT link the real HSA library (doing so would -# clash with the in-file stubs). +# Unit tests for gfx-target / ASIC-revision query logic and rewrite-policy +# decisions. The test supplies its own HSA stubs, so this target needs the +# relevant headers but must NOT link the real HSA library (doing so would clash +# with the in-file stubs). add_executable(hotswap_tool_test tests/hotswap_tool_test.cpp hotswap_gfx_query.cpp + hotswap_loader_policy.cpp ) target_include_directories(hotswap_tool_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} @@ -83,6 +84,22 @@ target_include_directories(hotswap_tool_test PRIVATE ${HSA_RUNTIME_INC}/.. ) +# Loader-path tests for libhsa-hotswap.so. The test includes hotswap_tool.cpp +# directly and supplies stub HSA/COMGR entry points, so it does not link the +# real HSA runtime or COMGR. +add_executable(hotswap_loader_test + tests/hotswap_loader_test.cpp + hotswap_gfx_query.cpp + hotswap_loader_policy.cpp + ${HOTSWAP_PLATFORM_IO_SRC} +) +target_include_directories(hotswap_loader_test PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${HSA_RUNTIME_INC} + ${HSA_RUNTIME_INC}/.. 
+) + enable_testing() add_test(NAME hotswap_test COMMAND hotswap_test) add_test(NAME hotswap_tool_test COMMAND hotswap_tool_test) +add_test(NAME hotswap_loader_test COMMAND hotswap_loader_test) diff --git a/projects/hotswap/hotswap.hpp b/projects/hotswap/hotswap.hpp index 4034723b25b..3a495566a30 100644 --- a/projects/hotswap/hotswap.hpp +++ b/projects/hotswap/hotswap.hpp @@ -22,11 +22,9 @@ std::string GetCodeObjectIsaName(const void *elf_data, size_t elf_size); /// /// Both ISA names are supplied by the caller: source_isa typically comes from /// the code object (see GetCodeObjectIsaName) and target_isa from the running -/// GPU (e.g. the HSA agent), but either may be overridden. COMGR's -/// amd_comgr_hotswap_rewrite (linked directly) applies whatever transformation -/// the source/target pair calls for -- same-ISA stepping patches (e.g. gfx1250 -/// B0 to A0) or cross-family transpilation -- and returns the rewritten code -/// object. If no transformation is needed, the output is a copy of the input. +/// GPU (e.g. the HSA agent), but either may be overridden. This wrapper passes +/// the request to COMGR's amd_comgr_hotswap_rewrite and returns COMGR's output +/// bytes to the caller. /// /// On success, *out_data and *out_size describe the rewritten code object. 
/// If *out_data differs from elf_data, it was allocated by this function diff --git a/projects/hotswap/hotswap_gfx_query.cpp b/projects/hotswap/hotswap_gfx_query.cpp index 6361b4ebe96..456c9458ed9 100644 --- a/projects/hotswap/hotswap_gfx_query.cpp +++ b/projects/hotswap/hotswap_gfx_query.cpp @@ -48,7 +48,9 @@ std::string extract_gfx_target(const std::string &isa_name) { if (pos == std::string::npos) return {}; auto end = std::find_if_not(isa_name.begin() + pos, isa_name.end(), - [](unsigned char c) { return std::isalnum(c); }); + [](unsigned char c) { + return std::isalnum(c) || c == '-'; + }); return isa_name.substr(pos, end - isa_name.begin() - pos); } @@ -94,9 +96,4 @@ void reset_gfx_revision_cache() { g_cache.clear(); } -bool gate_allows_hotswap(const AgentGfxRevision &gfx) { - return gfx.revision_valid && gfx.gfx_target == "gfx1250" && - gfx.asic_revision == 0; // A0 -} - } // namespace rocr::hotswap diff --git a/projects/hotswap/hotswap_gfx_query.hpp b/projects/hotswap/hotswap_gfx_query.hpp index 3fe08813ffa..db95997e481 100644 --- a/projects/hotswap/hotswap_gfx_query.hpp +++ b/projects/hotswap/hotswap_gfx_query.hpp @@ -33,25 +33,21 @@ struct AgentGfxRevision { // "amdgcn-amd-amdhsa--gfx1250:sramecc+:xnack-"), or an empty string on failure. std::string get_agent_isa_name(hsa_agent_t agent); -// Extracts the gfx target (e.g. "gfx1250") from a full HSA ISA name. Returns an -// empty string when no gfx target is present. The returned token stops at the -// first non-alphanumeric character so feature suffixes (":sramecc+", etc.) are -// dropped. +// Extracts the gfx target (e.g. "gfx1250" or "gfx12-5-generic") from a full +// HSA ISA name. Returns an empty string when no gfx target is present. The +// returned token preserves hyphenated processor names and stops before feature +// suffixes (":sramecc+", etc.). std::string extract_gfx_target(const std::string &isa_name); // Queries the agent's gfx target and ASIC revision via the HSA runtime. 
The // result is cached per agent handle, since code-object loads can be frequent. -// This function intentionally encodes no gating policy; callers apply -// gate_allows_hotswap() (below) to decide whether to act. +// This function intentionally encodes no rewrite policy; callers apply the +// policy in hotswap_loader_policy.{hpp,cpp}. AgentGfxRevision query_agent_gfx_revision(hsa_agent_t agent); // Clears the per-agent-handle cache used by query_agent_gfx_revision(). void reset_gfx_revision_cache(); -// HotSwap's activation policy: rewriting is performed only for gfx1250 silicon -// at ASIC revision A0 (and only when the revision was successfully queried). -bool gate_allows_hotswap(const AgentGfxRevision &gfx); - } // namespace rocr::hotswap #endif // ROCR_HOTSWAP_GFX_QUERY_HPP
diff --git a/projects/hotswap/hotswap_loader_policy.cpp b/projects/hotswap/hotswap_loader_policy.cpp
new file mode 100644
index 00000000000..6546a2ea09b
--- /dev/null
+++ b/projects/hotswap/hotswap_loader_policy.cpp
@@ -0,0 +1,117 @@
+//===- hotswap_loader_policy.cpp - HotSwap loader decision policy ---------===//
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hotswap_loader_policy.hpp"
+
+#include <algorithm>
+#include <cctype>
+#include <string>
+
+namespace rocr::hotswap {
+
+namespace {
+
+constexpr char Gfx1250[] = "gfx1250";
+constexpr char Gfx12_5Generic[] = "gfx12-5-generic";
+constexpr char Gfx125Prefix[] = "gfx125";
+// ISA-name feature suffixes used to tag a gfx1250 ISA with a stepping.
+constexpr char Gfx1250B0Feature[] = ":gfx1250-b0-specific+";
+constexpr char Gfx1250A0Feature[] = ":gfx1250-b0-specific-";
+
+enum class Gfx1250Stepping {
+  B0,
+  A0,
+};
+
+// Maps a stepping to the feature suffix that tags it in an ISA name.
+const char *gfx1250_stepping_feature(Gfx1250Stepping stepping) {
+  return stepping == Gfx1250Stepping::B0 ? Gfx1250B0Feature : Gfx1250A0Feature;
+}
+
+// Returns true for "gfx12-5-generic" and for "gfx125" followed by one or more
+// decimal digits (e.g. "gfx1250", "gfx1251"). The bare prefix "gfx125" and
+// names with non-digit tails (e.g. "gfx125foo") are rejected.
+bool is_gfx12_5_target(const std::string &gfx_target) {
+  constexpr size_t Gfx125PrefixLen = sizeof(Gfx125Prefix) - 1;
+  if (gfx_target == Gfx12_5Generic) {
+    return true;
+  }
+  if (gfx_target.size() <= Gfx125PrefixLen ||
+      gfx_target.compare(0, Gfx125PrefixLen, Gfx125Prefix) != 0) {
+    return false;
+  }
+  return std::all_of(gfx_target.begin() + Gfx125PrefixLen, gfx_target.end(),
+                     [](unsigned char c) { return std::isdigit(c); });
+}
+
+// Appends the requested stepping feature to isa_name. The name is returned
+// unchanged when its gfx target is not exactly gfx1250 or when it already
+// carries either stepping feature, so an explicit tag is never overridden.
+std::string with_gfx1250_stepping_feature(const std::string &isa_name,
+                                          Gfx1250Stepping stepping) {
+  if (extract_gfx_target(isa_name) != Gfx1250 ||
+      isa_name.find(Gfx1250B0Feature) != std::string::npos ||
+      isa_name.find(Gfx1250A0Feature) != std::string::npos) {
+    return isa_name;
+  }
+  return isa_name + gfx1250_stepping_feature(stepping);
+}
+
+} // namespace
+
+bool gate_allows_hotswap(const AgentGfxRevision &gfx) {
+  return gfx.revision_valid && gfx.gfx_target == Gfx1250 &&
+         gfx.asic_revision == 0; // A0
+}
+
+bool has_candidate_hotswap_rewrite(const AgentGfxRevision &gfx,
+                                   const RewriteOptions &options) {
+  return gate_allows_hotswap(gfx) ||
+         (options.gfx12_5_rewrite_requested &&
+          is_gfx12_5_target(gfx.gfx_target));
+}
+
+std::optional<RewriteDecision>
+decide_hotswap_rewrite(const AgentGfxRevision &gfx,
+                       const std::string &source_isa,
+                       const std::string &target_isa,
+                       const RewriteOptions &options) {
+  if (source_isa.empty() || target_isa.empty()) {
+    return std::nullopt;
+  }
+
+  const std::string source_gfx = extract_gfx_target(source_isa);
+
+  // Baseline route: gfx1250 A0 agent with gfx1250 on both sides. The code
+  // object is tagged B0 and the agent A0 unless a tag is already present.
+  if (gate_allows_hotswap(gfx) && source_gfx == Gfx1250 &&
+      extract_gfx_target(target_isa) == Gfx1250) {
+    return RewriteDecision{
+        with_gfx1250_stepping_feature(source_isa, Gfx1250Stepping::B0),
+        with_gfx1250_stepping_feature(target_isa, Gfx1250Stepping::A0)};
+  }
+
+  // Opt-in route: requires the flag plus a gfx12.5 agent and code object.
+  if (!options.gfx12_5_rewrite_requested ||
+      !is_gfx12_5_target(gfx.gfx_target) || !is_gfx12_5_target(source_gfx)) {
+    return std::nullopt;
+  }
+
+  // ROCm/rocm-systems#7581 established the loader-side invariant that this
+  // opt-in path uses the code object's processor, not a source->agent retarget.
+  RewriteDecision decision{source_isa, source_isa};
+
+  if (source_gfx == Gfx1250) {
+    // Both sides carry the same B0-tagged name, so compute it only once.
+    decision.source_isa =
+        with_gfx1250_stepping_feature(source_isa, Gfx1250Stepping::B0);
+    decision.target_isa = decision.source_isa;
+  }
+
+  return decision;
+}
+
+} // namespace rocr::hotswap
diff --git a/projects/hotswap/hotswap_loader_policy.hpp b/projects/hotswap/hotswap_loader_policy.hpp
new file mode 100644
index 00000000000..c0c5b8833ab
--- /dev/null
+++ b/projects/hotswap/hotswap_loader_policy.hpp
@@ -0,0 +1,57 @@
+//===- hotswap_loader_policy.hpp - HotSwap loader decision policy ---------===//
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Centralized loader policy for selecting whether HotSwap should call COMGR and
+// which source/target ISA pair it should pass. COMGR owns validation and all
+// code-object transformations after the call crosses this boundary.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ROCR_HOTSWAP_LOADER_POLICY_HPP
+#define ROCR_HOTSWAP_LOADER_POLICY_HPP
+
+#include "hotswap_gfx_query.hpp"
+
+#include <optional>
+#include <string>
+
+namespace rocr::hotswap {
+
+// Caller-supplied switches consulted by the policy functions below.
+struct RewriteOptions {
+  // Opt-in that extends routing beyond the baseline gfx1250 A0 gate to
+  // gfx12.5 targets ("gfx125<digits>" and "gfx12-5-generic").
+  bool gfx12_5_rewrite_requested = false;
+};
+
+// Source/target ISA names selected for a COMGR rewrite request.
+struct RewriteDecision {
+  std::string source_isa;
+  std::string target_isa;
+};
+
+// HotSwap's baseline gfx1250 route is active only for gfx1250 silicon at ASIC
+// revision A0 (and only when the revision was successfully queried).
+bool gate_allows_hotswap(const AgentGfxRevision &gfx);
+
+// Agent-level precheck used by the loader to avoid source-ISA parsing when no
+// local routing condition can possibly apply.
+bool has_candidate_hotswap_rewrite(const AgentGfxRevision &gfx,
+                                   const RewriteOptions &options);
+
+// Returns the COMGR ISA pair for this load, or std::nullopt when the original
+// code object should be loaded unchanged. The decision is limited to loader
+// routing and ISA-pair construction; COMGR decides which rewrite work, if any,
+// is enabled for the request.
+std::optional<RewriteDecision>
+decide_hotswap_rewrite(const AgentGfxRevision &gfx,
+                       const std::string &source_isa,
+                       const std::string &target_isa,
+                       const RewriteOptions &options);
+
+} // namespace rocr::hotswap
+
+#endif // ROCR_HOTSWAP_LOADER_POLICY_HPP
diff --git a/projects/hotswap/hotswap_tool.cpp b/projects/hotswap/hotswap_tool.cpp index ab709d5b57b..7199bfde991 100644 --- a/projects/hotswap/hotswap_tool.cpp +++ b/projects/hotswap/hotswap_tool.cpp @@ -16,11 +16,13 @@ #include "hotswap.hpp" #include "hotswap_gfx_query.hpp" #include "hotswap_platform_io.hpp" +#include "hotswap_loader_policy.hpp" #include #include #include #include #include +#include #include #include #include @@ -41,9 +43,12 @@ namespace { namespace hotswap_io = rocr::hotswap::platform_io; using rocr::hotswap::AgentGfxRevision; -using rocr::hotswap::gate_allows_hotswap; +using rocr::hotswap::decide_hotswap_rewrite; using rocr::hotswap::get_agent_isa_name; +using rocr::hotswap::has_candidate_hotswap_rewrite; using rocr::hotswap::query_agent_gfx_revision; +using rocr::hotswap::RewriteDecision; +using rocr::hotswap::RewriteOptions; using ByteVec = std::shared_ptr>; using OwnedElf = std::unique_ptr; @@ -107,6 +112,11 @@ void stash_bytes(uint64_t handle, const uint8_t *data, size_t size) { g_reader_map[handle] = ReaderEntry{std::move(vec), false, false}; } +bool gfx12_5_rewrite_requested() { + const char *value = std::getenv("AMD_COMGR_HOTSWAP_ENTRY_TRAMPOLINES"); + return value && value[0] != '\0' && std::strcmp(value, "0") != 0; +} + bool try_get_reader_entry(uint64_t handle, ByteVec *bytes, bool *from_file) { std::scoped_lock
lock(g_reader_map_mutex); const auto it = g_reader_map.find(handle); @@ -254,29 +264,24 @@ hsa_status_t load_rewritten_reader(hsa_executable_t executable, hsa_agent_t agen } hsa_status_t try_retarget_and_load(hsa_executable_t executable, hsa_agent_t agent, - hsa_code_object_reader_t code_object_reader, const char *options, hsa_loaded_code_object_t *loaded_code_object, - const ByteVec &local_bytes) { - // Source ISA from the code object, target ISA from the running GPU. - const std::string source_isa = rocr::hotswap::GetCodeObjectIsaName( - local_bytes->data(), local_bytes->size()); - const std::string target_isa = get_agent_isa_name(agent); - if (source_isa.empty() || target_isa.empty()) { - HOTSWAP_LOG("hotswap: rewrite SKIP empty isa (src='%s' tgt='%s' size=%zu)\n", - source_isa.c_str(), target_isa.c_str(), local_bytes->size()); - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } - + const ByteVec &local_bytes, + const RewriteDecision &decision, + const RewriteOptions &rewrite_options) { + // Route through RetargetCodeObject once policy has selected a request. A + // same-processor source/target pair can still be meaningful to COMGR. 
void *out_elf = nullptr; size_t out_elf_size = 0; const int rc = rocr::hotswap::RetargetCodeObject( - local_bytes->data(), local_bytes->size(), source_isa.c_str(), - target_isa.c_str(), &out_elf, &out_elf_size); + local_bytes->data(), local_bytes->size(), decision.source_isa.c_str(), + decision.target_isa.c_str(), &out_elf, &out_elf_size); - HOTSWAP_LOG("hotswap: rewrite src=%s tgt=%s in=%zu rc=%d out=%zu changed=%d\n", - source_isa.c_str(), target_isa.c_str(), local_bytes->size(), rc, - out_elf_size, out_elf != local_bytes->data()); + HOTSWAP_LOG("hotswap: rewrite src=%s tgt=%s gfx12_5_opt_in=%d in=%zu " + "rc=%d out=%zu changed=%d\n", + decision.source_isa.c_str(), decision.target_isa.c_str(), + rewrite_options.gfx12_5_rewrite_requested, local_bytes->size(), + rc, out_elf_size, out_elf != local_bytes->data()); if (rc != 0 || out_elf == local_bytes->data()) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; @@ -311,10 +316,9 @@ hsa_status_t HSA_API hotswap_load_agent_code_object( reader_from_file); } - // Gate HotSwap to gfx1250 A0 silicon. On any other GPU or stepping, load - // the original code object unchanged instead of routing through COMGR. 
const AgentGfxRevision gfx = query_agent_gfx_revision(agent); - if (!gate_allows_hotswap(gfx)) { + const RewriteOptions rewrite_options{gfx12_5_rewrite_requested()}; + if (!has_candidate_hotswap_rewrite(gfx, rewrite_options)) { HOTSWAP_LOG("hotswap: gate BLOCKED (gfx=%s rev=%u valid=%d)\n", gfx.gfx_target.c_str(), gfx.asic_revision, gfx.revision_valid); return load_original_reader(executable, agent, code_object_reader, @@ -322,9 +326,22 @@ hsa_status_t HSA_API hotswap_load_agent_code_object( reader_from_file); } + const std::string source_isa = rocr::hotswap::GetCodeObjectIsaName( + local_bytes->data(), local_bytes->size()); + const std::string target_isa = get_agent_isa_name(agent); + const auto decision = + decide_hotswap_rewrite(gfx, source_isa, target_isa, rewrite_options); + if (!decision) { + HOTSWAP_LOG("hotswap: decision NONE (gfx=%s src='%s' tgt='%s')\n", + gfx.gfx_target.c_str(), source_isa.c_str(), target_isa.c_str()); + return load_original_reader(executable, agent, code_object_reader, + options, loaded_code_object, + reader_from_file); + } + const hsa_status_t status = try_retarget_and_load( - executable, agent, code_object_reader, options, loaded_code_object, - local_bytes); + executable, agent, options, loaded_code_object, local_bytes, *decision, + rewrite_options); if (status == HSA_STATUS_SUCCESS) { return status; }
diff --git a/projects/hotswap/tests/hotswap_loader_test.cpp b/projects/hotswap/tests/hotswap_loader_test.cpp
new file mode 100644
index 00000000000..6dca51cd7f4
--- /dev/null
+++ b/projects/hotswap/tests/hotswap_loader_test.cpp
@@ -0,0 +1,165 @@
+//===- hotswap_loader_test.cpp - Tests for HSA tools loader path ----------===//
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The test support header includes hotswap_tool.cpp directly so these tests can
+// drive the wrapped HSA API-table entry points without a GPU, real HSA runtime,
+// or real COMGR.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstdio>
+#include <cstdlib>
+
+#include "hotswap_loader_test_utils.hpp"
+
+namespace {
+
+void test_OptInDisabledLoadsOriginal() {
+  begin_test("OptInDisabledLoadsOriginal",
+             "Unset, empty, and 0 opt-in values must leave non-A0 gfx1250 "
+             "on the original reader.");
+  struct FlagCase {
+    const char *flag_value;
+    const char *expectation;
+  };
+  const FlagCase cases[] = {
+      {nullptr, "unset opt-in skips non-A0 gfx1250 rewrite"},
+      {"", "empty opt-in skips non-A0 gfx1250 rewrite"},
+      {"0", "opt-in value 0 skips non-A0 gfx1250 rewrite"},
+  };
+  for (const FlagCase &c : cases) {
+    const LoadResult result =
+        load_once(kGfx1250Isa, kGfx1250Isa, c.flag_value);
+    check_original_load(result, c.expectation);
+  }
+}
+
+void test_OptInRoutesGfx1250SameProcessor() {
+  begin_test("OptInRoutesGfx1250SameProcessor",
+             "The opt-in must route non-A0 gfx1250 through COMGR with a "
+             "same-processor request.");
+  const LoadResult result = load_once(kGfx1250Isa, kGfx1250Isa, "1");
+  check(result.status == HSA_STATUS_SUCCESS, "load succeeds");
+  check(result.retarget_calls == 1,
+        "opt-in routes non-A0 gfx1250 through COMGR");
+  check(result.source_isa == kGfx1250B0Isa,
+        "source ISA is tagged as B0");
+  check(result.target_isa == kGfx1250B0Isa,
+        "non-A0 target ISA is tagged as B0");
+  check(result.loaded_reader != result.original_reader,
+        "rewritten reader is loaded instead of original reader");
+  check(result.retained_elfs == 1,
+        "rewritten ELF is retained after successful load");
+}
+
+void test_OptInRoutesGfx12_5Family() {
+  begin_test("OptInRoutesGfx12_5Family",
+             "The opt-in must route gfx125* and gfx12-5-generic "
+             "without adding gfx1250 stepping features.");
+  const char *cases[] = {kGfx1251Isa, kGfx12_5GenericIsa};
+  for (const char *isa : cases) {
+    const LoadResult result = load_once(isa, isa, "1");
+    check(result.status == HSA_STATUS_SUCCESS, "load succeeds");
+    check(result.retarget_calls == 1,
+          "opt-in routes gfx12.5 target through COMGR");
+    check(result.source_isa == isa, "source ISA is preserved");
+    check(result.target_isa == isa, "target ISA is preserved");
+  }
+}
+
+void test_GenericSourceUsesGenericTarget() {
+  begin_test("GenericSourceUsesGenericTarget",
+             "A gfx12-5-generic source loaded on a concrete gfx125* agent "
+             "must stay generic to avoid processor retargeting.");
+  const LoadResult result = load_once(kGfx12_5GenericIsa, kGfx1251Isa, "1");
+  check(result.status == HSA_STATUS_SUCCESS, "load succeeds");
+  check(result.retarget_calls == 1,
+        "generic source on concrete gfx125 agent routes through COMGR");
+  check(result.source_isa == kGfx12_5GenericIsa,
+        "generic source ISA is preserved");
+  check(result.target_isa == kGfx12_5GenericIsa,
+        "generic target ISA is preserved to avoid processor retargeting");
+}
+
+void test_ConcreteSourceUsesSourceTarget() {
+  begin_test("ConcreteSourceUsesSourceTarget",
+             "A concrete gfx125* source loaded on a different gfx125* agent "
+             "must stay on the source processor to avoid retargeting.");
+  const LoadResult result = load_once(kGfx1250Isa, kGfx1251Isa, "1");
+  check(result.status == HSA_STATUS_SUCCESS, "load succeeds");
+  check(result.retarget_calls == 1,
+        "concrete source on different gfx125 agent routes through COMGR");
+  check(result.source_isa == kGfx1250B0Isa,
+        "source ISA is tagged as B0");
+  check(result.target_isa == kGfx1250B0Isa,
+        "target ISA stays on the source processor");
+}
+
+void test_A0UsesBaselineRouteWithoutOptIn() {
+  begin_test("A0UsesBaselineRouteWithoutOptIn",
+             "The existing gfx1250 A0 route must still call COMGR without "
+             "the opt-in.");
+  const LoadResult result = load_once(kGfx1250Isa, kGfx1250Isa, nullptr, 0);
+  check(result.status == HSA_STATUS_SUCCESS, "load succeeds");
+  check(result.retarget_calls == 1, "A0 gfx1250 keeps baseline route");
+  check(result.source_isa == kGfx1250B0Isa,
+        "source code object ISA is tagged as B0");
+  check(result.target_isa == kGfx1250A0Isa,
+        "A0 agent ISA is tagged as A0");
+}
+
+void test_A0WithOptInKeepsBaselinePair() {
+  begin_test("A0WithOptInKeepsBaselinePair",
+             "The opt-in on gfx1250 A0 must preserve the baseline ISA pair "
+             "while routing through COMGR.");
+  const LoadResult result = load_once(kGfx1250Isa, kGfx1250Isa, "1", 0);
+  check(result.status == HSA_STATUS_SUCCESS, "load succeeds");
+  check(result.retarget_calls == 1, "A0 gfx1250 routes through COMGR");
+  check(result.source_isa == kGfx1250B0Isa,
+        "source code object ISA is tagged as B0");
+  check(result.target_isa == kGfx1250A0Isa,
+        "A0 agent ISA remains tagged as A0");
+}
+
+void test_OptInBlocksNonGfx12_5() {
+  begin_test("OptInBlocksNonGfx12_5",
+             "The opt-in must not become a global rewrite enable for "
+             "unsupported agents or source code objects.");
+  LoadResult result = load_once(kGfx942Isa, kGfx942Isa, "1", 0);
+  check_original_load(result, "non-gfx12.5 agent does not route");
+
+  result = load_once(kGfx942Isa, kGfx1251Isa, "1");
+  check_original_load(result, "non-gfx12.5 source does not route");
+}
+
+void test_RetargetFailureFallsBackToOriginalReader() {
+  begin_test("RetargetFailureFallsBackToOriginalReader",
+             "If COMGR rejects a gated rewrite, the loader must still load "
+             "the original reader.");
+  const LoadResult result = load_once(kGfx1250Isa, kGfx1250Isa, "1", 1, -1);
+  check(result.status == HSA_STATUS_SUCCESS, "fallback load succeeds");
+  check(result.retarget_calls == 1, "COMGR retarget was attempted");
+  check(result.loaded_reader == result.original_reader,
+        "retarget failure falls back to original reader");
+}
+
+} // namespace
+
+int main() {
+  test_OptInDisabledLoadsOriginal();
+  test_OptInRoutesGfx1250SameProcessor();
+  test_OptInRoutesGfx12_5Family();
+  test_GenericSourceUsesGenericTarget();
+  test_ConcreteSourceUsesSourceTarget();
+  test_A0UsesBaselineRouteWithoutOptIn();
+  test_A0WithOptInKeepsBaselinePair();
+  test_OptInBlocksNonGfx12_5();
+  test_RetargetFailureFallsBackToOriginalReader();
+  reset_state();
+
+  std::printf("\n%d passed, %d failed\n", tests_passed, tests_failed);
+  return tests_failed ? EXIT_FAILURE : EXIT_SUCCESS;
+}
diff --git a/projects/hotswap/tests/hotswap_loader_test_utils.hpp b/projects/hotswap/tests/hotswap_loader_test_utils.hpp
new file mode 100644
index 00000000000..6fc202e5bff
--- /dev/null
+++ b/projects/hotswap/tests/hotswap_loader_test_utils.hpp
@@ -0,0 +1,317 @@
+//===- hotswap_loader_test_utils.hpp - Loader test support ----------------===//
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ROCR_HOTSWAP_LOADER_TEST_UTILS_HPP
+#define ROCR_HOTSWAP_LOADER_TEST_UTILS_HPP
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+namespace {
+
+struct FakeEnv {
+  std::string agent_isa = "amdgcn-amd-amdhsa--gfx1250";
+  uint32_t asic_revision = 0;
+  int retarget_calls = 0;
+  int retarget_status = 0;
+  uint64_t next_reader_handle = 100;
+  uint64_t last_loaded_reader = 0;
+  std::string retarget_source_isa;
+  std::string retarget_target_isa;
+};
+
+FakeEnv g_env;
+CoreApiTable g_test_core;
+
+} // namespace
+
+#include "../hotswap_tool.cpp"
+
+namespace rocr::hotswap {
+
+// The production helper uses COMGR to read a real code object's ISA metadata.
+// This test double only supplies a source ISA so the test can stay focused on
+// loader policy without linking COMGR.
+std::string GetCodeObjectIsaName(const void *elf_data, size_t elf_size) {
+  if (!elf_data || elf_size == 0) {
+    return {};
+  }
+
+  constexpr char IsaPrefix[] = "amdgcn-amd-amdhsa--";
+  const char *begin = static_cast<const char *>(elf_data);
+  const char *end = begin + elf_size;
+  const char *match = std::search(begin, end, IsaPrefix,
+                                  IsaPrefix + sizeof(IsaPrefix) - 1);
+  if (match == end) {
+    return {};
+  }
+
+  const char *limit = match;
+  while (limit != end) {
+    const char c = *limit;
+    if (c == '\0' || c == '\'' || c == '"' || c == '\n' || c == '\r' ||
+        c == ' ' || c == '\t') {
+      break;
+    }
+    ++limit;
+  }
+  return std::string(match, limit);
+}
+
+int RetargetCodeObject(const void *elf_data, size_t elf_size,
+                       const char *source_isa, const char *target_isa,
+                       void **out_data, size_t *out_size) {
+  ++g_env.retarget_calls;
+  g_env.retarget_source_isa = source_isa ? source_isa : "";
+  g_env.retarget_target_isa = target_isa ? target_isa : "";
+
+  if (!out_data || !out_size) {
+    return -1;
+  }
+  *out_data = const_cast<void *>(elf_data);
+  *out_size = elf_size;
+
+  if (g_env.retarget_status != 0) {
+    return g_env.retarget_status;
+  }
+
+  void *copy = std::malloc(elf_size);
+  if (!copy) {
+    return -1;
+  }
+  std::memcpy(copy, elf_data, elf_size);
+  *out_data = copy;
+  *out_size = elf_size;
+  return 0;
+}
+
+} // namespace rocr::hotswap
+
+extern "C" {
+
+hsa_status_t hsa_agent_iterate_isas(hsa_agent_t /*agent*/,
+                                    hsa_status_t (*callback)(hsa_isa_t isa,
+                                                             void *data),
+                                    void *data) {
+  hsa_isa_t isa{};
+  isa.handle = 1;
+  const hsa_status_t status = callback(isa, data);
+  return status == HSA_STATUS_INFO_BREAK ? HSA_STATUS_SUCCESS : status;
+}
+
+hsa_status_t hsa_isa_get_info_alt(hsa_isa_t /*isa*/, hsa_isa_info_t attribute,
+                                  void *value) {
+  if (attribute == HSA_ISA_INFO_NAME_LENGTH) {
+    *static_cast<uint32_t *>(value) =
+        static_cast<uint32_t>(g_env.agent_isa.size() + 1);
+    return HSA_STATUS_SUCCESS;
+  }
+  if (attribute == HSA_ISA_INFO_NAME) {
+    std::memcpy(value, g_env.agent_isa.c_str(), g_env.agent_isa.size() + 1);
+    return HSA_STATUS_SUCCESS;
+  }
+  return HSA_STATUS_ERROR;
+}
+
+hsa_status_t hsa_agent_get_info(hsa_agent_t /*agent*/,
+                                hsa_agent_info_t attribute, void *value) {
+  if (attribute ==
+      static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_ASIC_REVISION)) {
+    *static_cast<uint32_t *>(value) = g_env.asic_revision;
+    return HSA_STATUS_SUCCESS;
+  }
+  return HSA_STATUS_ERROR;
+}
+
+} // extern "C"
+
+namespace {
+
+constexpr const char *kGfx1250Isa = "amdgcn-amd-amdhsa--gfx1250";
+constexpr const char *kGfx942Isa = "amdgcn-amd-amdhsa--gfx942";
+constexpr const char *kGfx1251Isa = "amdgcn-amd-amdhsa--gfx1251";
+constexpr const char *kGfx12_5GenericIsa =
+    "amdgcn-amd-amdhsa--gfx12-5-generic";
+constexpr const char *kGfx1250B0Isa =
+    "amdgcn-amd-amdhsa--gfx1250:gfx1250-b0-specific+";
+constexpr const char *kGfx1250A0Isa =
+    "amdgcn-amd-amdhsa--gfx1250:gfx1250-b0-specific-";
+
+int tests_passed = 0;
+int tests_failed = 0;
+
+void check(bool cond, const char *name) {
+  if (cond) {
+    ++tests_passed;
+    std::printf(" PASS: %s\n", name);
+  } else {
+    ++tests_failed;
+    std::fprintf(stderr, " FAIL: %s\n", name);
+  }
+}
+
+void begin_test(const char *name, const char *description) {
+  std::printf("TEST %s...\n %s\n", name, description);
+}
+
+void set_gfx12_5_rewrite_env(const char *value) {
+#ifdef _WIN32
+  _putenv_s("AMD_COMGR_HOTSWAP_ENTRY_TRAMPOLINES", value ? value : "");
+#else
+  if (value) {
+    setenv("AMD_COMGR_HOTSWAP_ENTRY_TRAMPOLINES", value, 1);
+  } else {
+    unsetenv("AMD_COMGR_HOTSWAP_ENTRY_TRAMPOLINES");
+  }
+#endif
+}
+
+void reset_state() {
+  {
+    std::scoped_lock lock(g_reader_map_mutex);
+    g_reader_map.clear();
+  }
+  {
+    std::scoped_lock lock(g_rewritten_elfs_mutex);
+    g_rewritten_elfs.clear();
+  }
+  g_core_table = nullptr;
+  g_orig_reader_create_from_memory = nullptr;
+  g_orig_reader_create_from_file = nullptr;
+  g_orig_reader_destroy = nullptr;
+  g_orig_load_agent_code_object = nullptr;
+  rocr::hotswap::reset_gfx_revision_cache();
+  set_gfx12_5_rewrite_env(nullptr);
+  g_env = FakeEnv{};
+}
+
+std::vector<uint8_t> make_code_object(const std::string &isa) {
+  const std::string metadata = "---\namdhsa.target: '" + isa + "'\n";
+  return std::vector<uint8_t>(metadata.begin(), metadata.end());
+}
+
+hsa_status_t HSA_API fake_reader_create_from_memory(
+    const void * /*code_object*/, size_t /*size*/,
+    hsa_code_object_reader_t *code_object_reader) {
+  code_object_reader->handle = g_env.next_reader_handle++;
+  return HSA_STATUS_SUCCESS;
+}
+
+hsa_status_t HSA_API fake_reader_create_from_file(
+    hsa_file_t /*file*/, hsa_code_object_reader_t *code_object_reader) {
+  code_object_reader->handle = g_env.next_reader_handle++;
+  return HSA_STATUS_SUCCESS;
+}
+
+hsa_status_t HSA_API
+fake_reader_destroy(hsa_code_object_reader_t code_object_reader) {
+  (void)code_object_reader;
+  return HSA_STATUS_SUCCESS;
+}
+
+hsa_status_t HSA_API fake_load_agent_code_object(
+    hsa_executable_t /*executable*/, hsa_agent_t /*agent*/,
+    hsa_code_object_reader_t code_object_reader, const char * /*options*/,
+    hsa_loaded_code_object_t *loaded_code_object) {
+  g_env.last_loaded_reader = code_object_reader.handle;
+  if (loaded_code_object) {
+    loaded_code_object->handle = 0xC0DE;
+  }
+  return HSA_STATUS_SUCCESS;
+}
+
+CoreApiTable &install_tool() {
+  g_test_core = CoreApiTable{};
+  g_test_core.hsa_code_object_reader_create_from_memory_fn =
+      fake_reader_create_from_memory;
+  g_test_core.hsa_code_object_reader_create_from_file_fn =
+      fake_reader_create_from_file;
+  g_test_core.hsa_code_object_reader_destroy_fn = fake_reader_destroy;
+  g_test_core.hsa_executable_load_agent_code_object_fn =
+      fake_load_agent_code_object;
+
+  HsaApiTable table{};
+  table.core_ = &g_test_core;
+  check(OnLoad(&table, 0, 0, nullptr), "OnLoad installs with complete table");
+  return g_test_core;
+}
+
+hsa_agent_t fake_agent() {
+  hsa_agent_t agent{};
+  agent.handle = 42;
+  return agent;
+}
+
+hsa_code_object_reader_t create_memory_reader(CoreApiTable &core,
+                                              const std::vector<uint8_t> &elf) {
+  hsa_code_object_reader_t reader{};
+  const hsa_status_t status = core.hsa_code_object_reader_create_from_memory_fn(
+      elf.data(), elf.size(), &reader);
+  check(status == HSA_STATUS_SUCCESS, "memory reader creation succeeds");
+  return reader;
+}
+
+hsa_status_t load_reader(CoreApiTable &core, hsa_code_object_reader_t reader) {
+  hsa_loaded_code_object_t loaded{};
+  return core.hsa_executable_load_agent_code_object_fn(
+      hsa_executable_t{}, fake_agent(), reader, nullptr, &loaded);
+}
+
+struct LoadResult {
+  hsa_status_t status = HSA_STATUS_SUCCESS;
+  uint64_t original_reader = 0;
+  uint64_t loaded_reader = 0;
+  int retarget_calls = 0;
+  std::string source_isa;
+  std::string target_isa;
+  size_t retained_elfs = 0;
+};
+
+LoadResult load_once(const char *code_object_isa, const char *agent_isa,
+                     const char *gfx12_5_rewrite_flag,
+                     uint32_t asic_revision = 1,
+                     int retarget_status = 0) {
+  reset_state();
+  set_gfx12_5_rewrite_env(gfx12_5_rewrite_flag);
+  g_env.agent_isa = agent_isa;
+  g_env.asic_revision = asic_revision;
+  g_env.retarget_status = retarget_status;
+
+  CoreApiTable &core = install_tool();
+  std::vector<uint8_t> elf = make_code_object(code_object_isa);
+  const hsa_code_object_reader_t reader = create_memory_reader(core, elf);
+
+  LoadResult result;
+  result.original_reader = reader.handle;
+  result.status = load_reader(core, reader);
+  result.loaded_reader = g_env.last_loaded_reader;
+  result.retarget_calls = g_env.retarget_calls;
+  result.source_isa = g_env.retarget_source_isa;
+  result.target_isa = g_env.retarget_target_isa;
+  result.retained_elfs = g_rewritten_elfs.size();
+  return result;
+}
+
+void check_original_load(const LoadResult &result, const char *name) {
+  check(result.status == HSA_STATUS_SUCCESS, "load succeeds");
+  check(result.retarget_calls == 0, name);
+  check(result.loaded_reader == result.original_reader,
+        "original reader is loaded");
+}
+
+} // namespace
+
+#endif // ROCR_HOTSWAP_LOADER_TEST_UTILS_HPP
diff --git a/projects/hotswap/tests/hotswap_tool_test.cpp b/projects/hotswap/tests/hotswap_tool_test.cpp index e5dc33e91e9..7c9e95bd167 100644 --- a/projects/hotswap/tests/hotswap_tool_test.cpp +++ b/projects/hotswap/tests/hotswap_tool_test.cpp @@ -4,16 +4,12 @@ // //===----------------------------------------------------------------------===// // -// Unit tests for query_agent_gfx_revision() in hotswap_gfx_query.cpp, plus the -// gfx1250-A0 gate that hotswap_load_agent_code_object() applies on top of it. +// Unit tests for query_agent_gfx_revision() in hotswap_gfx_query.cpp and the +// COMGR requests selected by hotswap_loader_policy.cpp. // -// The query helpers — query_agent_gfx_revision(), extract_gfx_target() and the -// AgentGfxRevision type — live in their own translation unit (compiled in -// alongside this test), so the test includes only the small -// hotswap_gfx_query.hpp header.
The HSA entry -// points the query calls are replaced with in-file stubs (linked in place of -// the real libraries) so the query can be driven entirely from the test without -// GPU hardware: +// The HSA entry points used by the query helper are replaced with in-file stubs +// (linked in place of the real libraries) so query and policy behavior can be +// driven entirely from the test without GPU hardware: // // * ISA name <- hsa_agent_iterate_isas / hsa_isa_get_info_alt // * ASIC revision <- hsa_agent_get_info(HSA_AMD_AGENT_INFO_ASIC_REVISION) @@ -27,6 +23,7 @@ #include #include #include +#include #include // --------------------------------------------------------------------------- @@ -45,13 +42,18 @@ struct FakeEnv { FakeEnv g_env; } // namespace -// The unit under test (brings in hsa.h and the query helper declarations). +// The units under test (bring in HSA query helpers and rewrite policy). #include "hotswap_gfx_query.hpp" +#include "hotswap_loader_policy.hpp" using rocr::hotswap::AgentGfxRevision; +using rocr::hotswap::decide_hotswap_rewrite; using rocr::hotswap::gate_allows_hotswap; +using rocr::hotswap::has_candidate_hotswap_rewrite; using rocr::hotswap::query_agent_gfx_revision; using rocr::hotswap::reset_gfx_revision_cache; +using rocr::hotswap::RewriteDecision; +using rocr::hotswap::RewriteOptions; // --------------------------------------------------------------------------- // Stubs replacing the real HSA symbols referenced by the tool. 
@@ -138,10 +140,31 @@ const char *kGfx1250IsaWithFeatures = "amdgcn-amd-amdhsa--gfx1250:sramecc+:xnack-"; const char *kGfx942Isa = "amdgcn-amd-amdhsa--gfx942"; const char *kGfx1251Isa = "amdgcn-amd-amdhsa--gfx1251"; +const char *kGfx125MalformedIsa = "amdgcn-amd-amdhsa--gfx125foo"; +const char *kGfx12_5GenericIsa = "amdgcn-amd-amdhsa--gfx12-5-generic"; +const char *kGfx12_5GenericIsaWithFeatures = + "amdgcn-amd-amdhsa--gfx12-5-generic:sramecc+"; -// The gate applied in hotswap_load_agent_code_object() is the shared -// rocr::hotswap::gate_allows_hotswap(), exercised directly below so the tests -// and the tool can never drift apart. +AgentGfxRevision make_gfx_revision(const char *gfx_target, + uint32_t asic_revision, + bool revision_valid = true) { + AgentGfxRevision gfx; + gfx.gfx_target = gfx_target; + gfx.revision_valid = revision_valid; + gfx.asic_revision = asic_revision; + return gfx; +} + +void run_decision_pair(const char *name, + const std::optional &decision, + const std::string &source_isa, + const std::string &target_isa) { + run(name, decision && decision->source_isa == source_isa && + decision->target_isa == target_isa); +} + +// The rewrite-policy helpers are exercised directly below so the tests and the +// loader can never drift apart. // gfx1250 silicon at ASIC revision A0 -> parsed target + revision, gate passes. void test_Gfx1250A0Passes() { @@ -189,6 +212,20 @@ void test_NearMissTargetBlocks() { run("gate blocks gfx1251 (exact match)", gate_allows_hotswap(g) == false); } +// Hyphenated generic processor names must be preserved while stripping feature +// suffixes; otherwise gfx12.5 opt-in routing would see only "gfx12". 
+void test_Gfx12_5GenericFeatureSuffixParsed() { + printf("TEST Gfx12_5GenericFeatureSuffixParsed...\n"); + reset_env(); + g_env.isa_name = kGfx12_5GenericIsaWithFeatures; + g_env.asic_revision = 0; + const AgentGfxRevision g = query_agent_gfx_revision(fresh_agent()); + run("feature suffix stripped -> gfx12-5-generic", + g.gfx_target == "gfx12-5-generic"); + run("baseline route blocks gfx12-5-generic", + gate_allows_hotswap(g) == false); +} + // gfx1250 but a non-A0 stepping -> gate blocks. void test_Gfx1250NonA0Blocks() { printf("TEST Gfx1250NonA0Blocks...\n"); @@ -200,6 +237,163 @@ void test_Gfx1250NonA0Blocks() { run("gate blocks gfx1250 A1", gate_allows_hotswap(g) == false); } +// The explicit opt-in opens a gfx1250 route independent of the baseline A0 +// agent route. +void test_OptInAllowsGfx1250NonA0() { + printf("TEST OptInAllowsGfx1250NonA0...\n"); + reset_env(); + g_env.isa_name = kGfx1250Isa; + g_env.asic_revision = 1; // A1/B0-side path, not A0. + const AgentGfxRevision g = query_agent_gfx_revision(fresh_agent()); + run("baseline route blocks non-A0", + has_candidate_hotswap_rewrite(g, RewriteOptions{false}) == false); + run("opt-in route allows gfx1250 non-A0", + has_candidate_hotswap_rewrite(g, RewriteOptions{true}) == true); +} + +// The explicit opt-in covers the broader gfx125* family, while the baseline +// route remains exact gfx1250 A0 only. 
+void test_OptInAllowsGfx125Family() { + printf("TEST OptInAllowsGfx125Family...\n"); + reset_env(); + g_env.isa_name = kGfx1251Isa; + g_env.asic_revision = 1; + const AgentGfxRevision g = query_agent_gfx_revision(fresh_agent()); + run("baseline route blocks gfx1251", + has_candidate_hotswap_rewrite(g, RewriteOptions{false}) == false); + run("opt-in route allows gfx1251", + has_candidate_hotswap_rewrite(g, RewriteOptions{true}) == true); +} + +void test_OptInRejectsMalformedGfx125Prefix() { + printf("TEST OptInRejectsMalformedGfx125Prefix...\n"); + reset_env(); + g_env.isa_name = kGfx125MalformedIsa; + g_env.asic_revision = 1; + const AgentGfxRevision g = query_agent_gfx_revision(fresh_agent()); + run("malformed gfx125 prefix is parsed", + g.gfx_target == "gfx125foo"); + run("opt-in route rejects malformed gfx125 prefix", + has_candidate_hotswap_rewrite(g, RewriteOptions{true}) == false); +} + +// The HSA tool routes gfx12-5-generic when explicitly requested. +void test_OptInAllowsGfx12_5Generic() { + printf("TEST OptInAllowsGfx12_5Generic...\n"); + reset_env(); + g_env.isa_name = kGfx12_5GenericIsa; + g_env.asic_revision = 1; + const AgentGfxRevision g = query_agent_gfx_revision(fresh_agent()); + run("baseline route blocks gfx12-5-generic", + has_candidate_hotswap_rewrite(g, RewriteOptions{false}) == false); + run("opt-in route allows gfx12-5-generic", + has_candidate_hotswap_rewrite(g, RewriteOptions{true}) == true); +} + +// If ASIC revision cannot be queried, the explicit opt-in can still route +// gfx12.5 targets through COMGR. The baseline A0 route remains disabled. 
+void test_OptInAllowsGfx1250UnknownRevision() { + printf("TEST OptInAllowsGfx1250UnknownRevision...\n"); + reset_env(); + g_env.isa_name = kGfx1250Isa; + g_env.asic_rev_ok = false; + const AgentGfxRevision g = query_agent_gfx_revision(fresh_agent()); + run("baseline route blocks unknown revision", + has_candidate_hotswap_rewrite(g, RewriteOptions{false}) == false); + run("opt-in route allows gfx1250 unknown revision", + has_candidate_hotswap_rewrite(g, RewriteOptions{true}) == true); +} + +// The opt-in is not a global rewrite enable; non-gfx12.5 targets still +// load unchanged. +void test_OptInBlocksOtherTargets() { + printf("TEST OptInBlocksOtherTargets...\n"); + reset_env(); + g_env.isa_name = kGfx942Isa; + g_env.asic_revision = 0; + const AgentGfxRevision g = query_agent_gfx_revision(fresh_agent()); + run("opt-in route blocks gfx942", + has_candidate_hotswap_rewrite(g, RewriteOptions{true}) == false); +} + +// A gfx1250 A0 agent uses the baseline COMGR request without the opt-in. +void test_RewriteDecisionSelectsBaselineA0Request() { + printf("TEST RewriteDecisionSelectsBaselineA0Request...\n"); + const AgentGfxRevision gfx1250_a0 = make_gfx_revision("gfx1250", 0); + const auto d = + decide_hotswap_rewrite(gfx1250_a0, kGfx1250Isa, kGfx1250Isa, + RewriteOptions{false}); + run_decision_pair("baseline request uses expected source and target ISA", d, + std::string(kGfx1250Isa) + ":gfx1250-b0-specific+", + std::string(kGfx1250Isa) + ":gfx1250-b0-specific-"); +} + +// The explicit opt-in does not change the baseline gfx1250 A0 request. 
+void test_RewriteDecisionA0WithOptInKeepsBaselineRequest() { + printf("TEST RewriteDecisionA0WithOptInKeepsBaselineRequest...\n"); + const AgentGfxRevision gfx1250_a0 = make_gfx_revision("gfx1250", 0); + const auto d = + decide_hotswap_rewrite(gfx1250_a0, kGfx1250Isa, kGfx1250Isa, + RewriteOptions{true}); + run_decision_pair("opt-in keeps baseline source and target ISA", d, + std::string(kGfx1250Isa) + ":gfx1250-b0-specific+", + std::string(kGfx1250Isa) + ":gfx1250-b0-specific-"); +} + +// A non-A0 gfx1250 opt-in uses a same-processor COMGR request. +void test_RewriteDecisionOptInSelectsGfx1250SameProcessorRequest() { + printf("TEST RewriteDecisionOptInSelectsGfx1250SameProcessorRequest...\n"); + const AgentGfxRevision gfx1250_b0 = make_gfx_revision("gfx1250", 1); + const auto d = + decide_hotswap_rewrite(gfx1250_b0, kGfx1250Isa, kGfx1250Isa, + RewriteOptions{true}); + const std::string b0_isa = + std::string(kGfx1250Isa) + ":gfx1250-b0-specific+"; + run_decision_pair("gfx1250 request keeps source and target on source ISA", d, + b0_isa, b0_isa); +} + +// Concrete gfx125* targets other than gfx1250 use the original ISA pair. +void test_RewriteDecisionOptInSelectsGfx125FamilyRequest() { + printf("TEST RewriteDecisionOptInSelectsGfx125FamilyRequest...\n"); + const AgentGfxRevision gfx1251 = make_gfx_revision("gfx1251", 1); + const auto d = + decide_hotswap_rewrite(gfx1251, kGfx1251Isa, kGfx1251Isa, + RewriteOptions{true}); + run_decision_pair("gfx1251 ISA pair is preserved", d, kGfx1251Isa, + kGfx1251Isa); +} + +// The opt-in path keeps COMGR on the source processor when the agent reports a +// different concrete or generic gfx12.5 ISA. 
+void test_RewriteDecisionOptInKeepsSourceProcessor() { + printf("TEST RewriteDecisionOptInKeepsSourceProcessor...\n"); + const AgentGfxRevision gfx1251 = make_gfx_revision("gfx1251", 1); + auto d = + decide_hotswap_rewrite(gfx1251, kGfx12_5GenericIsa, kGfx1251Isa, + RewriteOptions{true}); + run_decision_pair("generic source keeps generic target", d, + kGfx12_5GenericIsa, kGfx12_5GenericIsa); + + d = decide_hotswap_rewrite(gfx1251, kGfx1250Isa, kGfx1251Isa, + RewriteOptions{true}); + const std::string b0_isa = + std::string(kGfx1250Isa) + ":gfx1250-b0-specific+"; + run_decision_pair("concrete mismatch keeps source processor", d, b0_isa, + b0_isa); +} + +// The opt-in never routes non-gfx12.5 source code objects through COMGR. +void test_RewriteDecisionRejectsNonGfx12_5Source() { + printf("TEST RewriteDecisionRejectsNonGfx12_5Source...\n"); + const AgentGfxRevision gfx1251 = make_gfx_revision("gfx1251", 1); + const auto d = + decide_hotswap_rewrite(gfx1251, kGfx942Isa, kGfx1251Isa, + RewriteOptions{true}); + run("non-gfx12.5 source on gfx125 agent is not rewritten", + !d.has_value()); +} + // ASIC revision query failure -> revision_valid false and gate blocks, even for // gfx1250. The query is still attempted exactly once. 
void test_AsicRevisionQueryFailure() { @@ -257,7 +451,20 @@ int main() { test_Gfx1250FeatureSuffixParsed(); test_NonGfx1250Blocks(); test_NearMissTargetBlocks(); + test_Gfx12_5GenericFeatureSuffixParsed(); test_Gfx1250NonA0Blocks(); + test_OptInAllowsGfx1250NonA0(); + test_OptInAllowsGfx125Family(); + test_OptInRejectsMalformedGfx125Prefix(); + test_OptInAllowsGfx12_5Generic(); + test_OptInAllowsGfx1250UnknownRevision(); + test_OptInBlocksOtherTargets(); + test_RewriteDecisionSelectsBaselineA0Request(); + test_RewriteDecisionA0WithOptInKeepsBaselineRequest(); + test_RewriteDecisionOptInSelectsGfx1250SameProcessorRequest(); + test_RewriteDecisionOptInSelectsGfx125FamilyRequest(); + test_RewriteDecisionOptInKeepsSourceProcessor(); + test_RewriteDecisionRejectsNonGfx12_5Source(); test_AsicRevisionQueryFailure(); test_ResultIsCachedPerHandle(); test_DistinctHandlesIndependent(); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.cpp b/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.cpp index 6e1a8efac1e..6b37c3a6b2f 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.cpp @@ -175,67 +175,6 @@ void LoaderOptions::PrintHelp(std::ostream& out) const static const char *LOADER_DUMP_PREFIX = "amdcode"; -// Kernel-entry trampoline (gfx125x / RDNA4). -// -// We cannot reserve space immediately in front of each kernel entry: that would -// require a non-uniform relayout of the loaded code segment, which breaks every -// intra-segment PC-relative reference the compiler baked in. Instead we allocate -// a separate *executable* region (AMDGPU_HSA_SEGMENT_CODE_AGENT, which carries -// AllocateExecutable in the loader context) and, for each kernel, emit a stub -// that jumps to the real entry; the kernel descriptor's entry offset is then -// rewritten so dispatch lands in the stub first. 
-// -// The jump is absolute (the pool is not within S_BRANCH range of the code), so -// the stub does a global cache writeback (SCOPE_CU) and a v_nop, then loads the -// 64-bit entry address into a scratch SGPR pair and sets PC. -// s[100:101] is a safe fixed scratch: RDNA gives every wave 128 physical SGPRs and -// these indices are well above the preloaded user+system SGPRs (<= ~20), so they -// are never a live kernel input -- the kernel writes them before it reads them. -// -// gfx1250 encodings verified with: llvm-mc --arch=amdgcn --mcpu=gfx1250 --show-encoding -// global_wb -> 0xEE0B007C, 0x00000000, 0x00000000 -// v_nop (padding) -> 0x7E000000 -// s_mov_b32 s100, + literal -> 0xBEE400FF -// s_mov_b32 s101, + literal -> 0xBEE500FF -// s_set_pc_i64 s[100:101] -> 0xBE804864 -// s_code_end (padding) -> 0xBF9F0000 -static constexpr size_t kTrampolineStubStride = AMD_ISA_ALIGN_BYTES; // 256: one stub, entry-aligned - -// The CP (CPC) instruction-prefetches forward from a kernel's entry PC when it -// dispatches. Because dispatch now lands on a stub inside our pool, that prefetch -// reads ahead from the stub and would run off the end of the pool into the next, -// unmapped page -- a CPC read page/permission fault (observed on gfx1250). The -// prefetch length is per-kernel: COMPUTE_PGM_RSRC3.INST_PREF_SIZE (6 bits, GFX11+) -// counts 128-byte instruction-cache lines to prefetch ahead of the entry. We size -// a trailing guard from the largest INST_PREF_SIZE in the pool so the prefetch from -// any stub always lands in mapped, readable memory inside this same allocation. The -// guard is never executed (the stub sets PC away first); it only needs to be present -// and readable, which the allocation's zero-fill already guarantees. 
-static constexpr size_t kInstPrefUnitBytes = 128; // GFX11+ CP I$ prefetch line size - -static void BuildTrampolineGfx1250(uint8_t* buf, uint64_t target) { - auto* w = reinterpret_cast(buf); - - w[0] = 0xEE0B007C; // global_wb - w[1] = 0x00000000; // : - w[2] = 0x00000000; // : - w[3] = 0x7E000000; // v_nop (padding) - w[4] = 0xBEE400FF; // s_mov_b32 s100, target_lo - w[5] = static_cast(target); - w[6] = 0xBEE500FF; // s_mov_b32 s101, target_hi - w[7] = static_cast(target >> 32); - w[8] = 0xBE804864; // s_set_pc_i64 s[100:101] - for (size_t i = 9; i < kTrampolineStubStride / sizeof(uint32_t); ++i) - w[i] = 0xBF9F0000; // s_code_end (prefetch-safe padding) -} - -// gfx12.5 family: CO v3+ reports either a generic mach name (gfx12-5-generic) or -// discrete targets (gfx1250, gfx1251, …) in the amdgcn-amd-amdhsa-- ISA string. -static bool CodeObjectIsaIsGfx125Family(const std::string& codeIsa) { - if (codeIsa.find("gfx12-5-generic") != std::string::npos) return true; - return codeIsa.find("gfx125") != std::string::npos; -} - Loader* Loader::Create(Context* context) { return new AmdHsaCodeLoader(context); @@ -1316,11 +1255,6 @@ hsa_status_t ExecutableImpl::LoadCodeObject( return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } - // Kernel-entry trampolines (gfx125x). Gate on this code object's ISA and reset - // the per-object fixup list collected by LoadDefinitionSymbol. - trampoline_enabled_gfx125x_ = CodeObjectIsaIsGfx125Family(codeIsa); - kd_fixups_.clear(); - uint32_t majorVersion, minorVersion; if (!code->GetCodeObjectVersion(&majorVersion, &minorVersion)) { logger_ << "LoaderError: failed to determine code object's version\n"; @@ -1381,14 +1315,6 @@ hsa_status_t ExecutableImpl::LoadCodeObject( status = ApplyRelocations(agent, code.get()); if (status != HSA_STATUS_SUCCESS) { return status; } - // Emit kernel-entry trampolines into the host shadow now that the image is - // final (post-relocation) and still unfrozen. 
The single Freeze DMA carries - // them to device along with the rewritten descriptors. - if (trampoline_enabled_gfx125x_ && !kd_fixups_.empty()) { - status = InstallTrampolinesGfx125x(agent); - if (status != HSA_STATUS_SUCCESS) { return status; } - } - code.reset(); if (loaderOptions.DumpAll()->is_set() || loaderOptions.DumpExec()->is_set()) { @@ -1499,59 +1425,6 @@ hsa_status_t ExecutableImpl::LoadSegmentV2(const code::Segment *data_segment, return HSA_STATUS_SUCCESS; } -hsa_status_t ExecutableImpl::InstallTrampolinesGfx125x(hsa_agent_t agent) { - const size_t n = kd_fixups_.size(); - - // Size the trailing prefetch guard from the largest CP instruction-prefetch - // window among this pool's kernels (INST_PREF_SIZE lines * 128 B). The forward - // prefetch from the last stub reaches its_entry + INST_PREF_SIZE*128; since that - // stub's own slot (one stub stride) already lies inside the pool, only the - // remainder, (INST_PREF_SIZE*128 - stub_size), can spill past the pool and needs - // a guard. (Clamp to 0 when the window fits within a stub slot.) - uint32_t max_pref_lines = 0; - for (const auto& f : kd_fixups_) - max_pref_lines = std::max(max_pref_lines, f.inst_pref); - const size_t pref_bytes = static_cast(max_pref_lines) * kInstPrefUnitBytes; - const size_t guard = - pref_bytes > kTrampolineStubStride ? pref_bytes - kTrampolineStubStride : 0; - const size_t pool = n * kTrampolineStubStride + guard; - - // AMDGPU_HSA_SEGMENT_CODE_AGENT yields *executable* device memory: the loader - // context backs it with RegionMemory(..., is_code=true), which sets - // core::MemoryRegion::AllocateExecutable (see amd_loader_context.cpp). - void* ptr = context_->SegmentAlloc(AMDGPU_HSA_SEGMENT_CODE_AGENT, agent, pool, - AMD_ISA_ALIGN_BYTES, /*zero=*/true); - if (!ptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - - // vaddr == 0: Address()/Copy() index by raw byte offset into the pool. 
- auto tramp = std::make_shared(this, agent, AMDGPU_HSA_SEGMENT_CODE_AGENT, - ptr, pool, /*vaddr=*/0, /*storage_offset=*/0); - objects.push_back(tramp); // freed via Destroy() in ~ExecutableImpl - trampoline_segments_.push_back(tramp); // frozen in ExecutableImpl::Freeze - - for (size_t i = 0; i < n; ++i) { - const KdFixup& f = kd_fixups_[i]; - const uint64_t stub_off = i * kTrampolineStubStride; - // Device addresses are valid pre-Freeze (RegionMemory::ptr_ is set at alloc). - const uint64_t kd_dev = reinterpret_cast(f.code_seg->Address(f.kd_vaddr)); - const uint64_t entry_dev = reinterpret_cast(f.code_seg->Address(f.kd_vaddr + f.entry_off)); - const uint64_t stub_dev = reinterpret_cast(tramp->Address(stub_off)); - - uint8_t blob[kTrampolineStubStride]; - BuildTrampolineGfx1250(blob, entry_dev); // stub jumps to the real entry - tramp->Copy(stub_off, blob, sizeof(blob)); // -> trampoline host shadow - - // Redirect dispatch onto the stub: kernel_object(kd_dev) + new_off == stub. - int64_t new_off = static_cast(stub_dev) - static_cast(kd_dev); - f.code_seg->Copy(f.kd_vaddr + llvm::amdhsa::KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET, - &new_off, sizeof(new_off)); // -> code host shadow - } - - // The prefetch guard is left as the allocation's zero-fill (zero=true): it is - // committed and readable -- all the CP prefetch needs -- and is never executed. - return HSA_STATUS_SUCCESS; -} - hsa_status_t ExecutableImpl::LoadSymbol(hsa_agent_t agent, code::Symbol* sym, uint32_t majorVersion) @@ -1600,17 +1473,6 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent, llvm::amdhsa::kernel_descriptor_t kd; sym->GetSection()->getData(sym->SectionOffset(), &kd, sizeof(kd)); - if (trampoline_enabled_gfx125x_) { - // Record this descriptor; the trampoline is installed after relocations. - // sym->VAddr() is the descriptor's ELF vaddr (matches SymbolAddress below). 
- // INST_PREF_SIZE (GFX11+) = number of 128B I$ lines the CP prefetches ahead - // of the entry; captured here to size the trampoline's prefetch guard. - uint32_t inst_pref = AMDHSA_BITS_GET(kd.compute_pgm_rsrc3, - rocr::llvm::amdhsa::COMPUTE_PGM_RSRC3_GFX10_PLUS_INST_PREF_SIZE); - kd_fixups_.push_back({ SymbolSegment(agent, sym), sym->VAddr(), - kd.kernel_code_entry_byte_offset, inst_pref }); - } - uint32_t kernarg_segment_size = kd.kernarg_size; // FIXME: If 0 then the compiler is not specifying the size. uint32_t kernarg_segment_alignment = 16; // FIXME: Use the minumum HSA required alignment. uint32_t group_segment_size = kd.group_segment_fixed_size; @@ -2092,13 +1954,6 @@ hsa_status_t ExecutableImpl::Freeze(const char *options) { } } - // Trampoline pools are not part of any LoadedCodeObject's segment list - // (that must stay size==1 for v2+); freeze them explicitly so their host->device - // DMA and code-cache invalidation happen alongside the code segments. - for (auto &ts : trampoline_segments_) { - ts->Freeze(); - } - state_ = HSA_EXECUTABLE_STATE_FROZEN; return HSA_STATUS_SUCCESS; } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.hpp b/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.hpp index b17c2da5410..9d8a238fb1f 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.hpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/loader/executable.hpp @@ -623,10 +623,6 @@ friend class AmdHsaCodeLoader; Segment* SymbolSegment(hsa_agent_t agent, amd::hsa::code::Symbol* sym); Segment* SectionSegment(hsa_agent_t agent, amd::hsa::code::Section* sec); - // gfx125x: allocate a separate executable region and emit, per kernel, a stub - // that jumps to the real entry, then redirect the kernel descriptor to it. 
- hsa_status_t InstallTrampolinesGfx125x(hsa_agent_t agent); - amd::hsa::common::ReaderWriterLock rw_lock_; hsa_profile_t profile_; Context *context_; @@ -641,14 +637,6 @@ friend class AmdHsaCodeLoader; std::vector> objects; std::shared_ptr program_allocation_segment; std::vector> loaded_code_objects; - - // Kernel-entry trampolines (gfx125x). - // kd_fixups_ is collected per-LoadCodeObject; trampoline_segments_ persists for - // the lifetime of the executable so it can be frozen and destroyed normally. - struct KdFixup { Segment* code_seg; uint64_t kd_vaddr; int64_t entry_off; uint32_t inst_pref; }; - bool trampoline_enabled_gfx125x_ = false; - std::vector kd_fixups_; - std::vector> trampoline_segments_; }; class AmdHsaCodeLoader : public Loader {