halide · alexreinking · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
diff --git a/test/performance/matrix_multiplication.cpp b/test/performance/matrix_multiplication.cpp
@@ -27,8 +27,8 @@ int main(int argc, char **argv) {
 
     const int matrix_size = 992;
 
-    ImageParam A(type_of<float>(), 2);
-    ImageParam B(type_of<float>(), 2);
+    ImageParam A(type_of<float>(), 2, "A");
+    ImageParam B(type_of<float>(), 2, "B");
 
     Var x("x"), y("y");
     RDom k(0, matrix_size);
@@ -46,17 +46,34 @@ int main(int argc, char **argv) {
     //
     // Using 16 threads (and no hyperthreading), hits 2080 GFlops (67% of peak)
     // and 1310 GFLops (85% of peak) respectively.
+    //
+    // On Apple M3 Max, single-threaded hits ~114 GFlops (89% of peak), and
+    // ~1270 GFlops using 16 cores.
 
     const int vec = target.natural_vector_size<float>();
 
-    // Size the inner loop tiles to fit into the number of registers available
-    // on the target, using either 12 accumulator registers or 24.
-    const int inner_tile_x = 3 * vec;
-    const int inner_tile_y = (target.has_feature(Target::AVX512) || target.arch != Target::X86) ? 8 : 4;
+    // On 64-bit ARM, there are 32 NEON registers. Using inner_tile_x=4*vec
+    // with inner_tile_y=4 leaves 10 spare NEON registers, which lets LLVM
+    // assign an independent GP base address to each A row. This avoids the
+    // ld1r post-increment serial dependency chain that occurs with 8 rows
+    // (where only 2 temp registers cycle between rows), and produces balanced
+    // load/compute throughput (4 cycles each at 4 FP units and 2 load ports).
+    const bool is_aarch64 = target.arch == Target::ARM && target.bits == 64;
+    const bool is_avx512 = target.has_feature(Target::AVX512);
 
-    // The shape of the outer tiling
-    const int tile_y = matrix_size / 4;
-    const int tile_k = matrix_size / 16;
+    // Size the inner loop tiles to fit into the number of registers available
+    // on the target.
+    // ARM64 NEON:      4×4=16 accumulators (22/32 NEON regs).
+    // AVX-512:         3×8=24 accumulators (27/32 ZMM regs).
+    // All others:      3×4=12 accumulators (e.g. AVX2).
+    const int inner_tile_x = is_aarch64 ? 4 * vec : 3 * vec;
+    const int inner_tile_y = is_avx512 ? 8 : 4;
+
+    // The shape of the outer tiling. On ARM64, use a narrower y-tile so the
+    // B panel (inner_tile_x × matrix_size × 4 bytes = ~62KB) fits in L1
+    // alongside the C accumulator buffer.
+    const int tile_y = matrix_size / (is_aarch64 ? 8 : 4);
+    const int tile_k = matrix_size / (is_aarch64 ? 4 : 16);
 
     Var xy("xy"), xi("xi"), yi("yi"), yii("yii");
 
@@ -144,16 +161,7 @@ int main(int argc, char **argv) {
         return 1;
     }
 
-    // Uncomment to see the generated assembly.
-    /*
-    {
-        Target t("host-no_asserts-no_runtime-no_bounds_query");
-        out.compile_to_assembly("/dev/stdout", matrix_mul.infer_arguments(), t);
-    }
-    */
-
     float gflops = 2.0f * matrix_size * matrix_size * matrix_size / 1e9f;
-
     printf("Halide: %fms, %f GFLOP/s\n\n", t * 1e3, (gflops / t));
 
     printf("Success!\n");