Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions vecops_fp32/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Name of the benchmark binary, plus the source file and header it is built from.
TARGET := vecops_fp32
SRC := main.c
HDR := ssve.h

# Stop immediately unless the user supplied CC: $(origin CC) is "default"
# only while CC still holds make's built-in value.
ifeq ($(origin CC),default)
$(error Please set CC explicitly, for example: make CC=clang)
endif

# Optimised build with warnings, targeting Armv9.2-A with SME2, SVE and SVE2.
CFLAGS := -O3 -Wall -march=armv9.2-a+sme2+sve+sve2
.PHONY: all clean

# Default goal: build the benchmark binary.
all: $(TARGET)

# Rebuild whenever the source file or the kernel header changes.
$(TARGET): $(SRC) $(HDR)
	$(CC) $(CFLAGS) $(SRC) -o $(TARGET)

# Remove the generated binary.
clean:
	rm -f $(TARGET)
99 changes: 99 additions & 0 deletions vecops_fp32/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Arm Streaming SVE (SSVE) Vector Operations

## Vector operations

The `ssve.h` header contains a set of basic vector operations implemented with Arm SSVE intrinsics. The kernels support two data formats, `float32` and `complex float32`. In the function names, `f32` means `float32` while `cf32` means `complex float32`. The table below summarizes the available operations.

| SSVE routine | Notes |
| --- | --- |
| `mul_cf32` | Complex element-wise multiply. |
| `power_cf32` | Outputs L2 norm for each complex element. |
| `conj_scale_cf32` | Applies complex conjugation and then scales by a real scalar. |
| `dot_cf32` | Complex inner product without conjugation. |
| `conj_mul_cf32` | Multiplies `conj(a)` by `b` element-wise. |
| `conj_dot_cf32` | Conjugate complex dot product. |
| `mul_f32` | Real element-wise multiply. |
| `scale_f32` | Scalar multiply operation. |
| `dot_f32` | Real dot product reduced into `c[0]`. |
| `add_f32` | Element-wise addition kernel. |

Important: the code in this repository is written under a fixed **512-bit** SSVE assumption. The blocking factors, tuple sizes, and tail-handling logic in `ssve.h`, as well as the way `main.c` is used for measurement, are documented with that assumption in mind. This is not presented as a vector-length-agnostic implementation.

Concretely, a **512-bit** SSVE vector length means:

- One SSVE `float32` vector contains 16 lanes.
- `svld1_f32_x4` / `svst1_f32_x4` cover 64 `float` values per tuple.

The f32 kernels operate on `n` scalar `float32` elements, while the cf32 kernels operate on `n` scalar `complex float32` elements. The complex data is stored as interleaved `(real, imag)` pairs.

```text
[re0, im0, re1, im1, re2, im2, ...]
```

## Benchmark

`main.c` is a minimal benchmark entry point for the SSVE kernels.

- It accepts three command-line arguments: `choice`, `n`, and `iter`.
- `choice` selects one of the ten routines implemented in `ssve.h`.
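- `n` is passed directly to the selected routine: for the f32 kernels it is the number of `float32` elements, and for the cf32 kernels it is the number of complex elements (that is, `2 * n` floats). The benchmark always allocates `2 * n` floats per buffer so that either interpretation fits.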
- `iter` controls how many times the selected routine is executed for timing.
- The program currently prints only the total elapsed time in nanoseconds.

The benchmark maps `choice` to kernels as follows:

| `choice` | Kernel |
| --- | --- |
| `1` | `mul_cf32` |
| `2` | `power_cf32` |
| `3` | `conj_scale_cf32` |
| `4` | `dot_cf32` |
| `5` | `conj_mul_cf32` |
| `6` | `conj_dot_cf32` |
| `7` | `mul_f32` |
| `8` | `scale_f32` |
| `9` | `dot_f32` |
| `10` | `add_f32` |

The program runs the specified kernel `iter` times and then prints the total duration in `ns`.

## Build instructions

Build from the `vecops_fp32` directory and pass the compiler explicitly via `CC`.

The generated binary is intended for AArch64 targets that support the SME2 feature enabled by the Makefile flags.

```sh
make CC=clang
```

The Makefile uses the following flags:

```text
-O3 -Wall -march=armv9.2-a+sme2+sve+sve2
```

This produces the benchmark binary `vecops_fp32`.

To run the benchmark:

```sh
./vecops_fp32 <choice> <n> <iter>
```

Example:

```sh
./vecops_fp32 1 512 1000
```

To remove the generated binary:

```sh
make clean CC=clang
```

## Numerical Behavior

- The complex SSVE routines use `FCMLA`-based sequences, so their results can differ in the last bit from those of a scalar (non-SIMD) implementation.
- Reduction routines can also differ slightly, because the vector code accumulates in a different order than a scalar (non-SIMD) implementation does.
145 changes: 145 additions & 0 deletions vecops_fp32/main.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#include "ssve.h"
#include <assert.h>
#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * Read the counter frequency register CNTFRQ_EL0.
 *
 * Returns the raw 64-bit register value. main() hands it to
 * get_time_interval_ns() as the number of counter ticks per second.
 */
static uint64_t get_time_unit()
{
    uint64_t ticks_per_sec;

    __asm__ volatile("mrs %0, cntfrq_el0" : "=r"(ticks_per_sec));

    return ticks_per_sec;
}

/*
 * Read the current value of the virtual counter register CNTVCT_EL0.
 *
 * Returns the raw 64-bit tick count. Two readings taken around a region of
 * code are turned into a duration by get_time_interval_ns().
 */
static uint64_t get_time_count()
{
    uint64_t ticks_now;

    __asm__ volatile("mrs %0, cntvct_el0" : "=r"(ticks_now));

    return ticks_now;
}

/*
 * Convert two counter readings into an elapsed time in nanoseconds.
 *
 * t1   - earlier counter reading
 * t2   - later counter reading; must be >= t1
 * unit - counter frequency in ticks per second; must be non-zero
 *
 * Returns floor((t2 - t1) * 1e9 / unit).
 *
 * The whole-second and sub-second parts of the tick delta are scaled
 * separately. Multiplying the full delta by 1e9 first would wrap around
 * 64 bits once the delta exceeds about 1.8e10 ticks (roughly 18 s at a
 * 1 GHz counter) and silently report a wrong duration. With the split, the
 * only product is remainder * 1e9, where remainder < unit, so it stays in
 * range for any counter frequency below about 18 GHz.
 */
static uint64_t get_time_interval_ns(const uint64_t t1, const uint64_t t2, const uint64_t unit)
{
    const uint64_t ns_per_sec = 1000000000;

    assert(t2 >= t1);
    assert(unit != 0); /* a zero frequency would otherwise divide by zero */

    const uint64_t ticks = t2 - t1;
    const uint64_t whole_sec = ticks / unit;
    const uint64_t frac_ticks = ticks % unit;

    return whole_sec * ns_per_sec + frac_ticks * ns_per_sec / unit;
}

/*
 * Fill a float buffer with a linear ramp.
 *
 * vec    - destination buffer with room for at least `count` floats
 * count  - number of elements to write
 * weight - ramp step; element k (0-based) receives weight * (k + 1)
 *
 * A count of zero leaves the buffer untouched.
 */
static void fill_vec(float *vec, size_t count, float weight)
{
    size_t pos = 0;

    while (pos < count) {
        const size_t ordinal = pos + 1; /* 1-based position in the ramp */

        vec[pos] = weight * ordinal;
        pos++;
    }
}

/*
 * Print the command-line synopsis and the choice-to-kernel table to stderr.
 *
 * prog - program name shown in the "Usage:" line (main() passes argv[0])
 */
static void print_choice_usage(const char *prog)
{
    /* One entry per output line that follows the synopsis, in print order. */
    static const char *const help_lines[] = {
        "choice=1 -> mul_cf32\n",
        "choice=2 -> power_cf32\n",
        "choice=3 -> conj_scale_cf32\n",
        "choice=4 -> dot_cf32\n",
        "choice=5 -> conj_mul_cf32\n",
        "choice=6 -> conj_dot_cf32\n",
        "choice=7 -> mul_f32\n",
        "choice=8 -> scale_f32\n",
        "choice=9 -> dot_f32\n",
        "choice=10 -> add_f32\n",
        "Note: n is passed directly to the selected ssve kernel.\n",
    };
    const size_t line_count = sizeof help_lines / sizeof help_lines[0];

    fprintf(stderr, "Usage: %s <choice> <n> <iter>\n", prog);
    for (size_t k = 0; k < line_count; k++) {
        fputs(help_lines[k], stderr);
    }
}

/*
 * Run one invocation of the kernel selected by `choice`.
 *
 * choice - kernel selector, 1..10 (same numbering as print_choice_usage())
 * a, b   - input buffers; `b` is not passed to the kernels for choices 2, 3 and 8
 * c      - output buffer
 * n      - element count, forwarded unchanged to the kernel
 * scale  - scalar factor, passed only to conj_scale_cf32 and scale_f32
 *
 * Returns the name of the kernel that was executed, or NULL when `choice`
 * is outside 1..10, in which case no kernel is called.
 */
static const char *run_ssve_choice(size_t choice, const float *a, const float *b, float *c,
                                   uint64_t n, float scale)
{
    const char *kernel_name = NULL;

    switch (choice) {
    case 1:
        mul_cf32(a, b, c, n);
        kernel_name = "mul_cf32";
        break;
    case 2:
        power_cf32(a, c, n);
        kernel_name = "power_cf32";
        break;
    case 3:
        conj_scale_cf32(a, scale, c, n);
        kernel_name = "conj_scale_cf32";
        break;
    case 4:
        dot_cf32(a, b, c, n);
        kernel_name = "dot_cf32";
        break;
    case 5:
        conj_mul_cf32(a, b, c, n);
        kernel_name = "conj_mul_cf32";
        break;
    case 6:
        conj_dot_cf32(a, b, c, n);
        kernel_name = "conj_dot_cf32";
        break;
    case 7:
        mul_f32(a, b, c, n);
        kernel_name = "mul_f32";
        break;
    case 8:
        scale_f32(a, scale, c, n);
        kernel_name = "scale_f32";
        break;
    case 9:
        dot_f32(a, b, c, n);
        kernel_name = "dot_f32";
        break;
    case 10:
        add_f32(a, b, c, n);
        kernel_name = "add_f32";
        break;
    default:
        /* Unknown selector: leave kernel_name as NULL. */
        break;
    }

    return kernel_name;
}

/*
 * Parse a command-line argument as an unsigned decimal count.
 *
 * Only a plain string of digits is accepted. A sign, leading whitespace,
 * trailing characters or a value that does not fit in size_t are all
 * rejected, so "-1" can no longer turn into a huge unsigned count.
 *
 * Returns 0 and stores the value in *out on success, -1 otherwise.
 */
static int parse_count_arg(const char *text, size_t *out)
{
    /* strtoull() itself would accept "-1" and " 5"; insist on a digit first. */
    if (text[0] < '0' || text[0] > '9') {
        return -1;
    }

    errno = 0;
    char *end = NULL;
    const unsigned long long value = strtoull(text, &end, 10);
    if (errno == ERANGE || *end != '\0' || value > SIZE_MAX) {
        return -1;
    }

    *out = (size_t)value;
    return 0;
}

/*
 * Benchmark entry point: ./vecops_fp32 <choice> <n> <iter>
 *
 * Allocates three buffers of 2 * n floats each, runs the selected kernel
 * once untimed (which also validates `choice`), then runs it `iter` times
 * between two counter readings and prints the elapsed nanoseconds on stdout.
 *
 * Returns 0 on success and 1 on any usage, argument or allocation error.
 */
int main(int argc, char *argv[])
{
    if (argc != 4) {
        fprintf(stderr, "Error argc!\n");
        print_choice_usage(argv[0]);
        return 1;
    }

    size_t choice = 0;
    size_t n = 0;
    size_t iter = 0;

    /*
     * n and iter must be positive, and 2 * n floats must be representable
     * as a byte count so the allocation size below cannot wrap around.
     */
    if (parse_count_arg(argv[2], &n) != 0 || parse_count_arg(argv[3], &iter) != 0 ||
        n == 0 || iter == 0 || n > SIZE_MAX / (2 * sizeof(float))) {
        fprintf(stderr, "Error n or iter!\n");
        return 1;
    }
    if (parse_count_arg(argv[1], &choice) != 0) {
        fprintf(stderr, "Unsupported choice: %s\n", argv[1]);
        print_choice_usage(argv[0]);
        return 1;
    }

    /* Every buffer holds 2 * n floats, whichever kernel is selected. */
    const size_t len = n * 2;
    const float scale = 0.75f;
    const uint64_t unit = get_time_unit();
    uint64_t t1 = 0;
    uint64_t t2 = 0;
    int rc = 1;

    float *a = malloc(len * sizeof *a);
    float *b = malloc(len * sizeof *b);
    float *c = malloc(len * sizeof *c);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "malloc failed!\n");
        goto cleanup;
    }

    fill_vec(a, len, 0.1f);
    fill_vec(b, len, 0.2f);
    fill_vec(c, len, 0.0f);

    /* Untimed first run: rejects an unknown choice before measuring. */
    if (run_ssve_choice(choice, a, b, c, (uint64_t)n, scale) == NULL) {
        fprintf(stderr, "Unsupported choice: %zu\n", choice);
        print_choice_usage(argv[0]);
        goto cleanup;
    }

    /* Reset the output buffer so the timed loop starts from zeros. */
    fill_vec(c, len, 0.0f);

    t1 = get_time_count();
    for (size_t i = 0; i < iter; i++) {
        (void)run_ssve_choice(choice, a, b, c, (uint64_t)n, scale);
    }
    t2 = get_time_count();

    printf("%llu\n", (unsigned long long)get_time_interval_ns(t1, t2, unit));
    rc = 0;

cleanup:
    free(a);
    free(b);
    free(c);
    return rc;
}
Loading