Arm-Examples · codp594 · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026
diff --git a/vecops_fp32/Makefile b/vecops_fp32/Makefile
@@ -0,0 +1,18 @@
+# Build configuration for the vecops_fp32 SSVE benchmark.
+TARGET := vecops_fp32
+SRC    := main.c
+HDR    := ssve.h
+
+# Compiler options; the -march string enables SME2 together with SVE/SVE2.
+CFLAGS := -O3 -Wall -march=armv9.2-a+sme2+sve+sve2
+
+# Refuse to fall back on make's built-in CC: the caller has to choose the
+# compiler explicitly on the command line or through the environment.
+ifeq "$(origin CC)" "default"
+$(error Please set CC explicitly, for example: make CC=clang)
+endif
+
+.PHONY: all clean
+
+# Default goal: build the benchmark binary.
+all: $(TARGET)
+
+# Single compile-and-link step; rebuilt when the source or the header changes.
+$(TARGET): $(SRC) $(HDR)
+	$(CC) $(CFLAGS) $(SRC) -o $@
+
+# Delete the generated binary.
+clean:
+	rm -f $(TARGET)
diff --git a/vecops_fp32/README.md b/vecops_fp32/README.md
@@ -0,0 +1,99 @@
+# Arm Streaming SVE (SSVE) Vector Operations
+
+## Vector operations
+
+The `ssve.h` header contains a set of basic vector operations implemented with Arm SSVE intrinsics. The kernels support two data formats, `float32` and `complex float32`. In the function names, `f32` means `float32` while `cf32` means `complex float32`. The table below summarizes the available operations.
+
+| SSVE routine | Notes |
+| --- | --- |
+| `mul_cf32` | Complex element-wise multiply. |
+| `power_cf32` | Outputs L2 norm for each complex element. |
+| `conj_scale_cf32` | Applies complex conjugation and then scales by a real scalar. |
+| `dot_cf32` | Complex inner product without conjugation. |
+| `conj_mul_cf32` | Multiplies `conj(a)` by `b` element-wise. |
+| `conj_dot_cf32` | Conjugate complex dot product. |
+| `mul_f32` | Real element-wise multiply. |
+| `scale_f32` | Scalar multiply operation. |
+| `dot_f32` | Real dot product reduced into `c[0]`. |
+| `add_f32` | Element-wise addition kernel. |
+
+Important: the code in this repository is written under a fixed **512-bit** SSVE assumption. The blocking factors, tuple sizes, and tail-handling logic in `ssve.h`, as well as the way `main.c` is used for measurement, are documented with that assumption in mind. This is not presented as a vector-length-agnostic implementation.
+
+The code assumes a **512-bit** SSVE vector length, which means:
+
+- One SSVE `float32` vector contains 16 lanes.
+- `svld1_f32_x4` / `svst1_f32_x4` cover 64 `float` values per tuple.
+
+The f32 kernels operate on `n` `float32` elements, while the cf32 kernels operate on `n` `complex float32` elements (that is, `2n` `float` values). Complex data is stored as interleaved `(real, imag)` pairs:
+
+```text
+[re0, im0, re1, im1, re2, im2, ...]
+```
+
+## Benchmark
+
+`main.c` is a minimal benchmark entry point for the SSVE kernels.
+
+- It accepts three command-line arguments: `choice`, `n`, and `iter`.
+- `choice` selects one of the ten routines implemented in `ssve.h`.
+- `n` is passed directly to the selected routine, so its meaning depends on whether the routine is operating on real or complex data.
+- `iter` controls how many times the selected routine is executed for timing.
+- The program currently prints only the total elapsed time in nanoseconds.
+
+The benchmark maps `choice` to kernels as follows:
+
+| `choice` | Kernel |
+| --- | --- |
+| `1` | `mul_cf32` |
+| `2` | `power_cf32` |
+| `3` | `conj_scale_cf32` |
+| `4` | `dot_cf32` |
+| `5` | `conj_mul_cf32` |
+| `6` | `conj_dot_cf32` |
+| `7` | `mul_f32` |
+| `8` | `scale_f32` |
+| `9` | `dot_f32` |
+| `10` | `add_f32` |
+
+The program runs the specified kernel `iter` times and then prints the total duration in `ns`.
+
+## Build instructions
+
+Build from the `vecops_fp32` directory and pass the compiler explicitly via `CC`.
+
+The generated binary is intended for AArch64 targets that support the SME2 feature enabled by the Makefile flags.
+
+```sh
+make CC=clang
+```
+
+The Makefile uses the following flags:
+
+```text
+-O3 -Wall -march=armv9.2-a+sme2+sve+sve2
+```
+
+This produces the benchmark binary `vecops_fp32`.
+
+To run the benchmark:
+
+```sh
+./vecops_fp32 <choice> <n> <iter>
+```
+
+Example:
+
+```sh
+./vecops_fp32 1 512 1000
+```
+
+To remove the generated binary:
+
+```sh
+make clean CC=clang
+```
+
+## Numerical Behavior
+
+- The complex SSVE routines use `FCMLA`-based sequences, so their results can differ in the last bit from those of scalar (non-SIMD) implementations.
+- Reduction routines can also differ slightly, because vectorized code accumulates in a different order than a scalar (non-SIMD) implementation.
diff --git a/vecops_fp32/main.c b/vecops_fp32/main.c
@@ -0,0 +1,145 @@
+#include "ssve.h"
+#include <assert.h>
+#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+/*
+ * Read the counter frequency register (CNTFRQ_EL0).
+ * The caller uses the returned value as the number of counter ticks per
+ * second when converting tick intervals to nanoseconds.
+ */
+static uint64_t get_time_unit()
+{
+    uint64_t ticks_per_sec;
+
+    __asm__ volatile("mrs %0, cntfrq_el0" : "=r"(ticks_per_sec));
+
+    return ticks_per_sec;
+}
+
+/*
+ * Return the current value of the virtual counter (CNTVCT_EL0).
+ *
+ * The ISB keeps the counter read from being executed speculatively ahead of
+ * earlier instructions, and the "memory" clobber keeps the compiler from
+ * moving loads/stores of the timed region across the timestamp. Without
+ * both, part of the measured work can land outside the [t1, t2] window.
+ */
+static uint64_t get_time_count()
+{
+    uint64_t stamp;
+    __asm__ volatile("isb\n\tmrs %0, cntvct_el0" : "=r"(stamp) : : "memory");
+    return stamp;
+}
+
+/*
+ * Convert the tick interval between two counter readings into nanoseconds.
+ *
+ * t1, t2: counter readings, with t2 taken at or after t1.
+ * unit:   counter frequency in ticks per second; must be non-zero.
+ *
+ * Whole seconds and the sub-second remainder are converted separately.
+ * Multiplying the full tick delta by 1e9 first wraps around 64 bits once the
+ * delta exceeds roughly 1.8e10 ticks (about 18 s at 1 GHz). The split form
+ * gives the same result for every input the single multiplication handled
+ * correctly, without that intermediate overflow.
+ */
+static uint64_t get_time_interval_ns(const uint64_t t1, const uint64_t t2, const uint64_t unit)
+{
+    const uint64_t ns_per_sec = 1000000000ULL;
+
+    assert(t2 >= t1);
+    assert(unit != 0); /* a zero frequency would divide by zero below */
+
+    const uint64_t delta = t2 - t1;
+    const uint64_t whole_sec = delta / unit;
+    /* Always < unit, so the product below fits for any frequency < ~1.8e10 Hz. */
+    const uint64_t rem_ticks = delta % unit;
+
+    return whole_sec * ns_per_sec + rem_ticks * ns_per_sec / unit;
+}
+
+/*
+ * Fill vec[0..count-1] with a linear ramp: element i receives
+ * weight * (i + 1), so a weight of 0 clears the buffer.
+ */
+static void fill_vec(float *vec, size_t count, float weight)
+{
+    for (size_t idx = 0; idx != count; ++idx) {
+        const float ordinal = (float)(idx + 1);
+        vec[idx] = weight * ordinal;
+    }
+}
+
+/*
+ * Write the command-line synopsis, followed by the choice -> kernel table
+ * and a note about `n`, to stderr.
+ *
+ * prog: program name shown in the synopsis line.
+ */
+static void print_choice_usage(const char *prog)
+{
+    static const char *const help_lines[] = {
+        "choice=1  -> mul_cf32\n",
+        "choice=2  -> power_cf32\n",
+        "choice=3  -> conj_scale_cf32\n",
+        "choice=4  -> dot_cf32\n",
+        "choice=5  -> conj_mul_cf32\n",
+        "choice=6  -> conj_dot_cf32\n",
+        "choice=7  -> mul_f32\n",
+        "choice=8  -> scale_f32\n",
+        "choice=9  -> dot_f32\n",
+        "choice=10 -> add_f32\n",
+        "Note: n is passed directly to the selected ssve kernel.\n",
+    };
+    const size_t line_count = sizeof help_lines / sizeof help_lines[0];
+
+    fprintf(stderr, "Usage: %s <choice> <n> <iter>\n", prog);
+    for (size_t k = 0; k < line_count; k++)
+        fputs(help_lines[k], stderr);
+}
+
+/*
+ * Run the kernel selected by `choice` exactly once.
+ *
+ * a, b:  input buffers (b and scale are only forwarded to the kernels whose
+ *        call below takes them).
+ * c:     output buffer.
+ * n:     element count, forwarded unchanged to the kernel.
+ *
+ * Returns the name of the kernel that ran, or NULL when `choice` is not in
+ * 1..10, in which case no kernel is called.
+ */
+static const char *run_ssve_choice(size_t choice, const float *a, const float *b, float *c,
+                                   uint64_t n, float scale)
+{
+    const char *kernel_name = NULL;
+
+    switch (choice) {
+    case 1:
+        mul_cf32(a, b, c, n);
+        kernel_name = "mul_cf32";
+        break;
+    case 2:
+        power_cf32(a, c, n);
+        kernel_name = "power_cf32";
+        break;
+    case 3:
+        conj_scale_cf32(a, scale, c, n);
+        kernel_name = "conj_scale_cf32";
+        break;
+    case 4:
+        dot_cf32(a, b, c, n);
+        kernel_name = "dot_cf32";
+        break;
+    case 5:
+        conj_mul_cf32(a, b, c, n);
+        kernel_name = "conj_mul_cf32";
+        break;
+    case 6:
+        conj_dot_cf32(a, b, c, n);
+        kernel_name = "conj_dot_cf32";
+        break;
+    case 7:
+        mul_f32(a, b, c, n);
+        kernel_name = "mul_f32";
+        break;
+    case 8:
+        scale_f32(a, scale, c, n);
+        kernel_name = "scale_f32";
+        break;
+    case 9:
+        dot_f32(a, b, c, n);
+        kernel_name = "dot_f32";
+        break;
+    case 10:
+        add_f32(a, b, c, n);
+        kernel_name = "add_f32";
+        break;
+    default:
+        /* Unknown selector: leave kernel_name as NULL. */
+        break;
+    }
+
+    return kernel_name;
+}
+
+/*
+ * Parse `text` as an unsigned decimal integer into *out.
+ *
+ * Returns 0 on success. Returns -1 when the text is empty, does not start
+ * with a digit (so signs and leading whitespace are rejected), has trailing
+ * characters, or does not fit in size_t. *out is untouched on failure.
+ */
+static int parse_count(const char *text, size_t *out)
+{
+    if (text[0] < '0' || text[0] > '9')
+        return -1;
+
+    char *end = NULL;
+    errno = 0;
+    const unsigned long long value = strtoull(text, &end, 10);
+    if (errno == ERANGE || *end != '\0' || value > SIZE_MAX)
+        return -1;
+
+    *out = (size_t)value;
+    return 0;
+}
+
+/*
+ * Benchmark entry point: ./prog <choice> <n> <iter>
+ *
+ * Allocates three buffers of 2*n floats (enough for n complex elements),
+ * runs the selected kernel once untimed (which also validates `choice`),
+ * then times `iter` further runs and prints the elapsed nanoseconds.
+ *
+ * Returns 0 on success and 1 on any argument or allocation error.
+ */
+int main(int argc, char *argv[])
+{
+    if (argc != 4) {
+        fprintf(stderr, "Error argc!\n");
+        print_choice_usage(argv[0]);
+        return 1;
+    }
+
+    size_t choice = 0, n = 0, iter = 0;
+    /* Strict parsing: a negative or malformed value must not turn into a
+     * huge size_t, and 2 * n * sizeof(float) must not wrap around. */
+    if (parse_count(argv[2], &n) != 0 || parse_count(argv[3], &iter) != 0 ||
+        n == 0 || iter == 0 || n > SIZE_MAX / (2 * sizeof(float))) {
+        fprintf(stderr, "Error n or iter!\n");
+        return 1;
+    }
+    if (parse_count(argv[1], &choice) != 0) {
+        fprintf(stderr, "Unsupported choice: %s\n", argv[1]);
+        print_choice_usage(argv[0]);
+        return 1;
+    }
+
+    const size_t len = n * 2;
+    const float scale = 0.75f;
+    int rc = 1;
+
+    float *a = malloc(len * sizeof *a);
+    float *b = malloc(len * sizeof *b);
+    float *c = malloc(len * sizeof *c);
+    if (a == NULL || b == NULL || c == NULL) {
+        fprintf(stderr, "malloc failed!\n");
+        goto out;
+    }
+    fill_vec(a, len, 0.1f);
+    fill_vec(b, len, 0.2f);
+    fill_vec(c, len, 0.0f);
+
+    /* Untimed first run: rejects an out-of-range choice before timing. */
+    if (run_ssve_choice(choice, a, b, c, (uint64_t)n, scale) == NULL) {
+        fprintf(stderr, "Unsupported choice: %zu\n", choice);
+        print_choice_usage(argv[0]);
+        goto out;
+    }
+
+    /* Reset the output so the timed runs start from a cleared buffer. */
+    fill_vec(c, len, 0.0f);
+
+    const uint64_t unit = get_time_unit();
+    const uint64_t t1 = get_time_count();
+    for (size_t i = 0; i < iter; i++) {
+        (void)run_ssve_choice(choice, a, b, c, (uint64_t)n, scale);
+    }
+    const uint64_t t2 = get_time_count();
+
+    printf("%llu\n", (unsigned long long)get_time_interval_ns(t1, t2, unit));
+    rc = 0;
+
+out:
+    /* free(NULL) is a no-op, so this is safe after a partial allocation. */
+    free(a);
+    free(b);
+    free(c);
+    return rc;
+}