Skip to content
Open
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 26 additions & 18 deletions test/performance/matrix_multiplication.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ int main(int argc, char **argv) {

const int matrix_size = 992;

ImageParam A(type_of<float>(), 2);
ImageParam B(type_of<float>(), 2);
ImageParam A(type_of<float>(), 2, "A");
ImageParam B(type_of<float>(), 2, "B");

Var x("x"), y("y");
RDom k(0, matrix_size);
Expand All @@ -46,17 +46,34 @@ int main(int argc, char **argv) {
//
// Using 16 threads (and no hyperthreading), hits 2080 GFlops (67% of peak)
// and 1310 GFlops (85% of peak) respectively.
//
// On Apple M3 Max, single-threaded hits ~114 GFlops (89% of peak), and
// ~1270 GFlops using 16 cores.

const int vec = target.natural_vector_size<float>();

// Size the inner loop tiles to fit into the number of registers available
// on the target, using either 12 accumulator registers or 24.
const int inner_tile_x = 3 * vec;
const int inner_tile_y = (target.has_feature(Target::AVX512) || target.arch != Target::X86) ? 8 : 4;
// On 64-bit ARM, there are 32 NEON registers. Using inner_tile_x=4*vec
// with inner_tile_y=4 leaves 10 spare NEON registers, which lets LLVM
// assign an independent GP base address to each A row. This avoids the
// ld1r post-increment serial dependency chain that occurs with 8 rows
// (where only 2 temp registers cycle between rows), and produces balanced
// load/compute throughput (4 cycles each at 4 FP units and 2 load ports).
const bool is_aarch64 = target.arch == Target::ARM && target.bits == 64;
const bool is_avx512 = target.has_feature(Target::AVX512);

// The shape of the outer tiling
const int tile_y = matrix_size / 4;
const int tile_k = matrix_size / 16;
// Size the inner loop tiles to fit into the number of registers available
// on the target.
// ARM64 NEON: 4×4=16 accumulators (22/32 NEON regs).
// AVX-512: 3×8=24 accumulators (27/32 ZMM regs).
// AVX2 (default): 3×4=12 accumulators.
const int inner_tile_x = is_aarch64 ? 4 * vec : 3 * vec;
const int inner_tile_y = is_avx512 ? 8 : 4;

// The shape of the outer tiling. On ARM64, use a narrower y-tile so the
Comment thread
alexreinking marked this conversation as resolved.
Outdated
// B panel (inner_tile_x × matrix_size × 4 bytes = ~62KB) fits in L1
// alongside the C accumulator buffer.
const int tile_y = matrix_size / (is_aarch64 ? 8 : 4);
const int tile_k = matrix_size / (is_aarch64 ? 4 : 16);

Var xy("xy"), xi("xi"), yi("yi"), yii("yii");

Expand Down Expand Up @@ -144,16 +161,7 @@ int main(int argc, char **argv) {
return 1;
}

// Uncomment to see the generated assembly.
Comment thread
shoaibkamil marked this conversation as resolved.
/*
{
Target t("host-no_asserts-no_runtime-no_bounds_query");
out.compile_to_assembly("/dev/stdout", matrix_mul.infer_arguments(), t);
}
*/

float gflops = 2.0f * matrix_size * matrix_size * matrix_size / 1e9f;

printf("Halide: %fms, %f GFLOP/s\n\n", t * 1e3, (gflops / t));

printf("Success!\n");
Expand Down
Loading