File jm_simd.h¶

Width-portable SIMD operation macros. More...

#include <stddef.h>

Public Types¶

Type	Name
typedef float	JM_VEC_F32
typedef double	JM_VEC_F64

Macros¶

Type	Name
define	JM_ADD_F32 (a, b) `((a) + (b))`
define	JM_ADD_F64 (a, b) `((a) + (b))`
define	JM_FMA_F32 (acc, a, b) `((acc) += (a) * (b))`
define	JM_FMA_F64 (acc, a, b) `((acc) += (a) * (b))`
define	JM_HSUM_F32 (v) `((float)(v))`
define	JM_HSUM_F64 (v) `((double)(v))`
define	JM_LOAD_F32 (p) `(*(const float *)(p))`
define	JM_LOAD_F64 (p) `(*(const double *)(p))`
define	JM_MAC_F32 (acc, ptr, s) `((acc) += (*(const float *)(ptr)) * (float)(s))`
define	JM_MAC_F64 (acc, ptr, s) `((acc) += (*(const double *)(ptr)) * (double)(s))`
define	JM_MUL_F32 (a, b) `((a) * (b))`
define	JM_MUL_F64 (a, b) `((a) * (b))`
define	JM_RESTRICT `restrict`
define	JM_SIMD_WIDTH `1`
define	JM_SIMD_WIDTH_F32 `1`
define	JM_SIMD_WIDTH_F64 `1`
define	JM_SPLAT_F32 (x) `((float)(x))`
define	JM_SPLAT_F64 (x) `((double)(x))`
define	JM_STORE_F32 (p, v) `(*(float *)(p) = (v))`
define	JM_STORE_F64 (p, v) `(*(double *)(p) = (v))`
define	JM_SUMSQ_F32 (dst, ptr, n) `/* multi line expression */` Sum of squares: dst = Σ ptr[i]² for i in [0, n).
define	JM_ZERO_F32 () `(0.0f)`
define	JM_ZERO_F64 () `(0.0)`

Detailed Description¶

Selects the widest available instruction set at compile time:

- AVX-512F -> 16 float / 8 double lanes (JM_SIMD_WIDTH_F32 = 16)
- AVX2+FMA -> 8 float / 4 double lanes (JM_SIMD_WIDTH_F32 = 8)
- Scalar -> 1 lane (auto-vectorisation still applies)

Typical usage (FIR inner loop, processes JM_SIMD_WIDTH_F32 taps):

JM_VEC_F32 acc = JM_ZERO_F32();
for (int k = 0; k < N_TAPS; k++)
    JM_MAC_F32(acc, window + k, coeffs[k]);
*out += JM_HSUM_F32(acc);

For algorithms that require ISA-specific operations not covered here (gather loads, prefix scans, permutes), use `#ifdef __AVX512F__` guards around the raw intrinsics. JM_SIMD_WIDTH_F32 is still useful in that context as the canonical loop-stride constant.

Can be included standalone; if JM_RESTRICT is not already defined (from jm_perf.h) a local fallback is provided.

Public Types Documentation¶

typedef JM_VEC_F32¶

typedef float JM_VEC_F32;

typedef JM_VEC_F64¶

typedef double JM_VEC_F64;

Macro Definition Documentation¶

define JM_ADD_F32¶

#define JM_ADD_F32 (
    a,
    b
) `((a) + (b))`

define JM_ADD_F64¶

#define JM_ADD_F64 (
    a,
    b
) `((a) + (b))`

define JM_FMA_F32¶

#define JM_FMA_F32 (
    acc,
    a,
    b
) `((acc) += (a) * (b))`

define JM_FMA_F64¶

#define JM_FMA_F64 (
    acc,
    a,
    b
) `((acc) += (a) * (b))`

define JM_HSUM_F32¶

#define JM_HSUM_F32 (
    v
) `((float)(v))`

define JM_HSUM_F64¶

#define JM_HSUM_F64 (
    v
) `((double)(v))`

define JM_LOAD_F32¶

#define JM_LOAD_F32 (
    p
) `(*(const float *)(p))`

define JM_LOAD_F64¶

#define JM_LOAD_F64 (
    p
) `(*(const double *)(p))`

define JM_MAC_F32¶

#define JM_MAC_F32 (
    acc,
    ptr,
    s
) `((acc) += (*(const float *)(ptr)) * (float)(s))`

define JM_MAC_F64¶

#define JM_MAC_F64 (
    acc,
    ptr,
    s
) `((acc) += (*(const double *)(ptr)) * (double)(s))`

define JM_MUL_F32¶

#define JM_MUL_F32 (
    a,
    b
) `((a) * (b))`

define JM_MUL_F64¶

#define JM_MUL_F64 (
    a,
    b
) `((a) * (b))`

define JM_RESTRICT¶

#define JM_RESTRICT `restrict`

define JM_SIMD_WIDTH¶

#define JM_SIMD_WIDTH `1`

define JM_SIMD_WIDTH_F32¶

#define JM_SIMD_WIDTH_F32 `1`

define JM_SIMD_WIDTH_F64¶

#define JM_SIMD_WIDTH_F64 `1`

define JM_SPLAT_F32¶

#define JM_SPLAT_F32 (
    x
) `((float)(x))`

define JM_SPLAT_F64¶

#define JM_SPLAT_F64 (
    x
) `((double)(x))`

define JM_STORE_F32¶

#define JM_STORE_F32 (
    p,
    v
) `(*(float *)(p) = (v))`

define JM_STORE_F64¶

#define JM_STORE_F64 (
    p,
    v
) `(*(double *)(p) = (v))`

define JM_SUMSQ_F32¶

Sum of squares: dst = Σ ptr[i]² for i in [0, n).

#define JM_SUMSQ_F32 (
    dst,
    ptr,
    n
) `/* multi line expression */`

The bulk runs JM_SIMD_WIDTH_F32-wide via FMA accumulation; the trailing n % JM_SIMD_WIDTH_F32 elements are summed in scalar code. When n is a multiple of the SIMD width (e.g. a power-of-two block whose length is >= the width), the remainder loop has zero trips and folds away, leaving a pure vector reduction.

Parameters:

dst lvalue of type float — receives the sum.
ptr const float * — base of the contiguous input.
n element count (size_t-convertible).

float e;
JM_SUMSQ_F32 (e, buf, 256);   // e = energy of buf[0..255]

define JM_ZERO_F32¶

#define JM_ZERO_F32 (

) `(0.0f)`

define JM_ZERO_F64¶

#define JM_ZERO_F64 (

) `(0.0)`

The documentation for this file was generated from the following file: `native/inc/jm_simd.h`