Skip to content

File jm_simd.h

FileList > inc > jm_simd.h

Go to the source code of this file

Width-portable SIMD operation macros. More...

  • #include <stddef.h>

Public Types

Type Name
typedef float JM_VEC_F32
typedef double JM_VEC_F64

Macros

Type Name
define JM_ADD_F32 (a, b) ((a) + (b))
define JM_ADD_F64 (a, b) ((a) + (b))
define JM_FMA_F32 (acc, a, b) ((acc) += (a) * (b))
define JM_FMA_F64 (acc, a, b) ((acc) += (a) * (b))
define JM_HSUM_F32 (v) ((float)(v))
define JM_HSUM_F64 (v) ((double)(v))
define JM_LOAD_F32 (p) (*(const float *)(p))
define JM_LOAD_F64 (p) (*(const double *)(p))
define JM_MAC_F32 (acc, ptr, s) ((acc) += (*(const float *)(ptr)) * (float)(s))
define JM_MAC_F64 (acc, ptr, s) ((acc) += (*(const double *)(ptr)) * (double)(s))
define JM_MUL_F32 (a, b) ((a) * (b))
define JM_MUL_F64 (a, b) ((a) * (b))
define JM_RESTRICT restrict
define JM_SIMD_WIDTH 1
define JM_SIMD_WIDTH_F32 1
define JM_SIMD_WIDTH_F64 1
define JM_SPLAT_F32 (x) ((float)(x))
define JM_SPLAT_F64 (x) ((double)(x))
define JM_STORE_F32 (p, v) (*(float *)(p) = (v))
define JM_STORE_F64 (p, v) (*(double *)(p) = (v))
define JM_SUMSQ_F32 (dst, ptr, n) /* multi line expression */
Sum of squares: dst = Σ ptr[i]² for i in [0, n).
define JM_ZERO_F32 () (0.0f)
define JM_ZERO_F64 () (0.0)

Detailed Description

Selects the widest available instruction set at compile time:

  • AVX-512F -> 16 float / 8 double lanes (JM_SIMD_WIDTH_F32 = 16)
  • AVX2+FMA -> 8 float / 4 double lanes (JM_SIMD_WIDTH_F32 = 8)
  • Scalar -> 1 lane (auto-vectorisation still applies)

Typical usage (FIR inner loop, processes JM_SIMD_WIDTH_F32 taps):

JM_VEC_F32 acc = JM_ZERO_F32();
for (int k = 0; k < N_TAPS; k++)
    JM_MAC_F32(acc, window + k, coeffs[k]);
*out += JM_HSUM_F32(acc);

For algorithms that require ISA-specific operations not covered here (gather loads, prefix scans, permutes), use #ifdef __AVX512F__ guards around the raw intrinsics. JM_SIMD_WIDTH_F32 is still useful in that context as the canonical loop-stride constant.

Can be included standalone; if JM_RESTRICT is not already defined (from jm_perf.h) a local fallback is provided.

Public Types Documentation

typedef JM_VEC_F32

typedef float JM_VEC_F32;

typedef JM_VEC_F64

typedef double JM_VEC_F64;

Macro Definition Documentation

define JM_ADD_F32

#define JM_ADD_F32 (
    a,
    b
) `((a) + (b))`

define JM_ADD_F64

#define JM_ADD_F64 (
    a,
    b
) `((a) + (b))`

define JM_FMA_F32

#define JM_FMA_F32 (
    acc,
    a,
    b
) `((acc) += (a) * (b))`

define JM_FMA_F64

#define JM_FMA_F64 (
    acc,
    a,
    b
) `((acc) += (a) * (b))`

define JM_HSUM_F32

#define JM_HSUM_F32 (
    v
) `((float)(v))`

define JM_HSUM_F64

#define JM_HSUM_F64 (
    v
) `((double)(v))`

define JM_LOAD_F32

#define JM_LOAD_F32 (
    p
) `(*(const float *)(p))`

define JM_LOAD_F64

#define JM_LOAD_F64 (
    p
) `(*(const double *)(p))`

define JM_MAC_F32

#define JM_MAC_F32 (
    acc,
    ptr,
    s
) `((acc) += (*(const float *)(ptr)) * (float)(s))`

define JM_MAC_F64

#define JM_MAC_F64 (
    acc,
    ptr,
    s
) `((acc) += (*(const double *)(ptr)) * (double)(s))`

define JM_MUL_F32

#define JM_MUL_F32 (
    a,
    b
) `((a) * (b))`

define JM_MUL_F64

#define JM_MUL_F64 (
    a,
    b
) `((a) * (b))`

define JM_RESTRICT

#define JM_RESTRICT `restrict`

define JM_SIMD_WIDTH

#define JM_SIMD_WIDTH `1`

define JM_SIMD_WIDTH_F32

#define JM_SIMD_WIDTH_F32 `1`

define JM_SIMD_WIDTH_F64

#define JM_SIMD_WIDTH_F64 `1`

define JM_SPLAT_F32

#define JM_SPLAT_F32 (
    x
) `((float)(x))`

define JM_SPLAT_F64

#define JM_SPLAT_F64 (
    x
) `((double)(x))`

define JM_STORE_F32

#define JM_STORE_F32 (
    p,
    v
) `(*(float *)(p) = (v))`

define JM_STORE_F64

#define JM_STORE_F64 (
    p,
    v
) `(*(double *)(p) = (v))`

define JM_SUMSQ_F32

Sum of squares: dst = Σ ptr[i]² for i in [0, n).

#define JM_SUMSQ_F32 (
    dst,
    ptr,
    n
) `/* multi line expression */`

The bulk runs JM_SIMD_WIDTH_F32-wide via FMA accumulation; the trailing n % JM_SIMD_WIDTH_F32 elements are summed in a scalar remainder loop. When n is a multiple of the SIMD width (e.g. a power-of-two block whose length is >= the width), the remainder loop has zero trips and folds away, leaving a pure vector reduction.

Parameters:

  • dst lvalue of type float — receives the sum.
  • ptr const float * — base of the contiguous input.
  • n element count (size_t-convertible).

Example:

float e;
JM_SUMSQ_F32 (e, buf, 256);   // e = energy of buf[0..255]

define JM_ZERO_F32

#define JM_ZERO_F32 (

) `(0.0f)`

define JM_ZERO_F64

#define JM_ZERO_F64 (

) `(0.0)`


The documentation for this file was generated from the following file: native/inc/jm_simd.h