riscv: Use C intrinsics

Signed-off-by: Cosmin Truta <ctruta@gmail.com>
Authored by Filip Wasil on 2025-06-26 11:54:29 +02:00; committed by Cosmin Truta
parent 21895b05ab
commit 3391bb98e3
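
Background for readers of this diff (not part of the commit message): the patch replaces hand-written "asm volatile" RVV sequences, which pin values to fixed vector registers such as v0/v2/v8, with the standard <riscv_vector.h> C intrinsics, so the compiler allocates registers and the active vector length travels through an explicit vl value. Below is a minimal sketch of the vsetvl / load / op / store pattern the new code uses, assuming a toolchain with RVV intrinsics support (for example -march=rv64gcv); the helper name and loop are illustrative only, not from libpng.

#include <riscv_vector.h>
#include <stddef.h>

/* Illustrative only: add the bytes of src into dst, n bytes at a time,
 * using the same vsetvl / load / op / store shape the patch adopts. */
static void
add_bytes_rvv(unsigned char *dst, const unsigned char *src, size_t n)
{
   while (n > 0)
   {
      size_t vl = __riscv_vsetvl_e8m1(n);           /* lanes this pass */
      vuint8m1_t a = __riscv_vle8_v_u8m1(dst, vl);  /* a = *dst */
      vuint8m1_t b = __riscv_vle8_v_u8m1(src, vl);  /* b = *src */
      a = __riscv_vadd_vv_u8m1(a, b, vl);           /* a += b */
      __riscv_vse8_v_u8m1(dst, a, vl);              /* *dst = a */
      dst += vl;
      src += vl;
      n -= vl;
   }
}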


@@ -51,21 +51,22 @@ png_read_filter_row_sub_rvv(size_t len, size_t bpp, unsigned char* row)
     * x .. [v8](e8)
     */
-   asm volatile ("vsetvli zero, %0, e8, m1" : : "r" (bpp));
+   size_t vl = __riscv_vsetvl_e8m1(bpp);
    /* a = *row */
-   asm volatile ("vle8.v v0, (%0)" : : "r" (row));
+   vuint8m1_t a = __riscv_vle8_v_u8m1(row, vl);
    row += bpp;
    while (row < rp_end)
    {
       /* x = *row */
-      asm volatile ("vle8.v v8, (%0)" : : "r" (row));
+      vuint8m1_t x = __riscv_vle8_v_u8m1(row, vl);
       /* a = a + x */
-      asm volatile ("vadd.vv v0, v0, v8");
+      a = __riscv_vadd_vv_u8m1(a, x, vl);
       /* *row = a */
-      asm volatile ("vse8.v v0, (%0)" : : "r" (row));
+      __riscv_vse8_v_u8m1(row, a, vl);
       row += bpp;
    }
 }
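
For reference (not from the patch), this hunk vectorizes the PNG Sub filter: every byte has the reconstructed byte one pixel (bpp bytes) earlier added to it, bpp lanes at a time with a fixed vl = bpp. A scalar sketch of the same computation:

#include <stddef.h>

/* Scalar reference for the Sub filter: each byte becomes the sum (mod 256)
 * of the filtered byte and the reconstructed byte bpp positions back.
 * The first bpp bytes have no left neighbour and stay unchanged. */
static void
sub_filter_scalar(size_t len, size_t bpp, unsigned char *row)
{
   size_t i;
   for (i = bpp; i < len; ++i)
      row[i] = (unsigned char)(row[i] + row[i - bpp]);
}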
@@ -110,44 +111,46 @@ png_read_filter_row_avg_rvv(size_t len, size_t bpp, unsigned char* row,
    /* first pixel */
-   asm volatile ("vsetvli zero, %0, e8, m1" : : "r" (bpp));
+   size_t vl = __riscv_vsetvl_e8m1(bpp);
    /* b = *prev_row */
-   asm volatile ("vle8.v v4, (%0)" : : "r" (prev_row));
+   vuint8m1_t b = __riscv_vle8_v_u8m1(prev_row, vl);
    prev_row += bpp;
    /* x = *row */
-   asm volatile ("vle8.v v8, (%0)" : : "r" (row));
+   vuint8m1_t x = __riscv_vle8_v_u8m1(row, vl);
    /* b = b / 2 */
-   asm volatile ("vsrl.vi v4, v4, 1");
+   b = __riscv_vsrl_vx_u8m1(b, 1, vl);
    /* a = x + b */
-   asm volatile ("vadd.vv v2, v4, v8");
+   vuint8m1_t a = __riscv_vadd_vv_u8m1(b, x, vl);
    /* *row = a */
-   asm volatile ("vse8.v v2, (%0)" : : "r" (row));
+   __riscv_vse8_v_u8m1(row, a, vl);
    row += bpp;
    /* remaining pixels */
    while (row < rp_end)
    {
       /* b = *prev_row */
-      asm volatile ("vle8.v v4, (%0)" : : "r" (prev_row));
+      b = __riscv_vle8_v_u8m1(prev_row, vl);
       prev_row += bpp;
       /* x = *row */
-      asm volatile ("vle8.v v8, (%0)" : : "r" (row));
+      x = __riscv_vle8_v_u8m1(row, vl);
       /* tmp = a + b */
-      asm volatile ("vwaddu.vv v12, v2, v4"); /* add with widening */
+      vuint16m2_t tmp = __riscv_vwaddu_vv_u16m2(a, b, vl);
       /* a = tmp/2 */
-      asm volatile ("vnsrl.wi v2, v12, 1"); /* divide/shift with narrowing */
+      a = __riscv_vnsrl_wx_u8m1(tmp, 1, vl);
       /* a += x */
-      asm volatile ("vadd.vv v2, v2, v8");
+      a = __riscv_vadd_vv_u8m1(a, x, vl);
       /* *row = a */
-      asm volatile ("vse8.v v2, (%0)" : : "r" (row));
+      __riscv_vse8_v_u8m1(row, a, vl);
       row += bpp;
    }
 }
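
A note on the widening pair in this hunk (not from the patch): a + b can exceed 255, so the average is formed with a widening add into 16 bits (vwaddu) and brought back with a narrowing shift by one (vnsrl). The scalar equivalent of the Average filter being vectorized looks like this:

#include <stddef.h>

/* Scalar reference for the Average filter: the predictor is
 * (left + up) / 2 computed without 8-bit overflow; for the first
 * pixel there is no left neighbour, so the predictor is up / 2. */
static void
avg_filter_scalar(size_t len, size_t bpp, unsigned char *row,
   const unsigned char *prev_row)
{
   size_t i;
   for (i = 0; i < len; ++i)
   {
      unsigned int a = (i >= bpp) ? row[i - bpp] : 0; /* left (reconstructed) */
      unsigned int b = prev_row[i];                   /* up */
      row[i] = (unsigned char)(row[i] + (a + b) / 2);
   }
}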
@@ -205,18 +208,14 @@ static inline vint16m1_t
 abs_diff(vuint16m1_t a, vuint16m1_t b, size_t vl)
 {
    vint16m1_t diff = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vv_u16m1(a, b, vl));
-   vint16m1_t neg = __riscv_vneg_v_i16m1(diff, vl);
-   return __riscv_vmax_vv_i16m1(diff, neg, vl);
+   vbool16_t mask = __riscv_vmslt_vx_i16m1_b16(diff, 0, vl);
+   return __riscv_vrsub_vx_i16m1_m(mask, diff, 0, vl);
 }
 static inline vint16m1_t
 abs_sum(vint16m1_t a, vint16m1_t b, size_t vl)
 {
-   vint16m1_t sum = __riscv_vadd_vv_i16m1(a, b, vl);
-   vint16m1_t neg = __riscv_vneg_v_i16m1(sum, vl);
-   return __riscv_vmax_vv_i16m1(sum, neg, vl);
+   return __riscv_vadd_vv_i16m1(a, b, vl);
 }
 static inline void
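
The rewritten abs_diff above computes |a - b| without vneg/vmax: it flags the lanes where the signed difference is negative and applies a masked reverse-subtract from zero (0 - x) only there. A scalar sketch of that idea (not from the patch):

#include <stdint.h>

/* Scalar equivalent of the masked-vrsub absolute value: take the 16-bit
 * difference (modular, matching the vector reinterpret) and flip the sign
 * only where it is negative. */
static int16_t
abs_diff_scalar(uint16_t a, uint16_t b)
{
   int16_t diff = (int16_t)(a - b);
   return (int16_t)((diff < 0) ? (0 - diff) : diff);
}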
@@ -226,122 +225,109 @@ png_read_filter_row_paeth_rvv(size_t len, size_t bpp, unsigned char* row,
    png_bytep rp_end = row + len;
    /*
     * row:  | a | x |
     * prev: | c | b |
     *
-    * mask .. [v0]
-    * a .. [v2](e8)
-    * b .. [v4](e8)
-    * c .. [v6](e8)
-    * x .. [v8](e8)
-    * p .. [v12-v13](e16)
-    * pa .. [v16-v17](e16)
-    * pb .. [v20-v21](e16)
-    * pc .. [v24-v25](e16)
-    * tmpmask ..[v31]
+    * a .. [v2](e8)
+    * b .. [v4](e8)
+    * c .. [v6](e8)
+    * x .. [v8](e8)
+    * p .. [v12-v13](e16)
+    * pa, pb, pc .. [v16-v17, v20-v21, v24-v25](e16)
     */
    /* first pixel */
-   asm volatile ("vsetvli zero, %0, e8, m1" : : "r" (bpp));
-   /* a = *row + *prev_row */
-   asm volatile ("vle8.v v2, (%0)" : : "r" (row));
-   asm volatile ("vle8.v v6, (%0)" : : "r" (prev));
-   prev += bpp;
-   asm volatile ("vadd.vv v2, v2, v6");
+   size_t vl = __riscv_vsetvl_e8m1(bpp);
+   /* a = *row */
+   vuint8m1_t a = __riscv_vle8_v_u8m1(row, vl);
+   /* c = *prev */
+   vuint8m1_t c = __riscv_vle8_v_u8m1(prev, vl);
+   /* a += c */
+   a = __riscv_vadd_vv_u8m1(a, c, vl);
    /* *row = a */
-   asm volatile ("vse8.v v2, (%0)" : : "r" (row));
+   __riscv_vse8_v_u8m1(row, a, vl);
    row += bpp;
+   prev += bpp;
    /* remaining pixels */
    while (row < rp_end)
    {
-      /* b = *prev_row */
-      asm volatile ("vle8.v v4, (%0)" : : "r" (prev));
+      /* b = *prev */
+      vuint8m1_t b = __riscv_vle8_v_u8m1(prev, vl);
       prev += bpp;
       /* x = *row */
-      asm volatile ("vle8.v v8, (%0)" : : "r" (row));
+      vuint8m1_t x = __riscv_vle8_v_u8m1(row, vl);
-      /* sub (widening to 16bit) */
-      /* p = b - c */
-      asm volatile ("vwsubu.vv v12, v4, v6");
-      /* pc = a - c */
-      asm volatile ("vwsubu.vv v24, v2, v6");
+      /* Calculate p = b - c and pc = a - c using widening subtraction */
+      vuint16m2_t p_wide = __riscv_vwsubu_vv_u16m2(b, c, vl);
+      vuint16m2_t pc_wide = __riscv_vwsubu_vv_u16m2(a, c, vl);
-      /* switch to widened */
-      asm volatile ("vsetvli zero, %0, e16, m2" : : "r" (bpp));
+      /* Convert to signed for easier manipulation */
+      size_t vl16 = __riscv_vsetvl_e16m2(bpp);
+      vint16m2_t p = __riscv_vreinterpret_v_u16m2_i16m2(p_wide);
+      vint16m2_t pc = __riscv_vreinterpret_v_u16m2_i16m2(pc_wide);
-      /* pa = abs(p) -> pa = p < 0 ? -p : p */
-      asm volatile ("vmv.v.v v16, v12"); /* pa = p */
-      asm volatile ("vmslt.vx v0, v16, zero"); /* set mask[i] if pa[i] < 0 */
-      asm volatile ("vrsub.vx v16, v16, zero, v0.t"); /* invert negative values in pa; vd[i] = 0 - vs2[i] (if mask[i])
-                                                       * could be replaced by vneg in rvv >= 1.0
-                                                       */
+      /* pa = |p| */
+      vbool8_t p_neg_mask = __riscv_vmslt_vx_i16m2_b8(p, 0, vl16);
+      vint16m2_t pa = __riscv_vrsub_vx_i16m2_m(p_neg_mask, p, 0, vl16);
-      /* pb = abs(p) -> pb = pc < 0 ? -pc : pc */
-      asm volatile ("vmv.v.v v20, v24"); /* pb = pc */
-      asm volatile ("vmslt.vx v0, v20, zero"); /* set mask[i] if pc[i] < 0 */
-      asm volatile ("vrsub.vx v20, v20, zero, v0.t"); /* invert negative values in pb; vd[i] = 0 - vs2[i] (if mask[i])
-                                                       * could be replaced by vneg in rvv >= 1.0
-                                                       */
+      /* pb = |pc| */
+      vbool8_t pc_neg_mask = __riscv_vmslt_vx_i16m2_b8(pc, 0, vl16);
+      vint16m2_t pb = __riscv_vrsub_vx_i16m2_m(pc_neg_mask, pc, 0, vl16);
-      /* pc = abs(p + pc) -> pc = (p + pc) < 0 ? -(p + pc) : p + pc */
-      asm volatile ("vadd.vv v24, v24, v12"); /* pc = p + pc */
-      asm volatile ("vmslt.vx v0, v24, zero"); /* set mask[i] if pc[i] < 0 */
-      asm volatile ("vrsub.vx v24, v24, zero, v0.t"); /* invert negative values in pc; vd[i] = 0 - vs2[i] (if mask[i])
-                                                       * could be replaced by vneg in rvv >= 1.0
-                                                       */
+      /* pc = |p + pc| */
+      vint16m2_t p_plus_pc = __riscv_vadd_vv_i16m2(p, pc, vl16);
+      vbool8_t p_plus_pc_neg_mask = __riscv_vmslt_vx_i16m2_b8(p_plus_pc, 0, vl16);
+      pc = __riscv_vrsub_vx_i16m2_m(p_plus_pc_neg_mask, p_plus_pc, 0, vl16);
       /*
-       * if (pb < pa)
-       * {
-       *    pa = pb;
-       *    a = b;
-       *    // see (*1)
-       * }
+       * The key insight is that we want the minimum of pa, pb, pc.
+       * - If pa <= pb and pa <= pc, use a
+       * - Else if pb <= pc, use b
+       * - Else use c
        */
-      asm volatile ("vmslt.vv v0, v20, v16"); /* set mask[i] if pb[i] < pa[i] */
-      asm volatile ("vmerge.vvm v16, v16, v20, v0"); /* pa[i] = pb[i] (if mask[i]) */
-      /*
-       * if (pc < pa)
-       * {
-       *    a = c;
-       *    // see (*2)
-       * }
-       */
-      asm volatile ("vmslt.vv v31, v24, v16"); /* set tmpmask[i] if pc[i] < pa[i] */
+      /* Find which predictor to use based on minimum absolute difference */
+      vbool8_t pa_le_pb = __riscv_vmsle_vv_i16m2_b8(pa, pb, vl16);
+      vbool8_t pa_le_pc = __riscv_vmsle_vv_i16m2_b8(pa, pc, vl16);
+      vbool8_t pb_le_pc = __riscv_vmsle_vv_i16m2_b8(pb, pc, vl16);
-      /* switch to narrow */
-      asm volatile ("vsetvli zero, %0, e8, m1" : : "r" (bpp));
-      /* (*1) */
-      asm volatile ("vmerge.vvm v2, v2, v4, v0"); /* a = b (if mask[i]) */
+      /* use_a = pa <= pb && pa <= pc */
+      vbool8_t use_a = __riscv_vmand_mm_b8(pa_le_pb, pa_le_pc, vl16);
+      /* use_b = !use_a && pb <= pc */
+      vbool8_t not_use_a = __riscv_vmnot_m_b8(use_a, vl16);
+      vbool8_t use_b = __riscv_vmand_mm_b8(not_use_a, pb_le_pc, vl16);
-      /* (*2) */
-      asm volatile ("vmand.mm v0, v31, v31"); /* mask = tmpmask
-                                               * vmand works for rvv 0.7 up to 1.0
-                                               * could be replaced by vmcpy in 0.7.1/0.8.1
-                                               * or vmmv.m in 1.0
-                                               */
-      asm volatile ("vmerge.vvm v2, v2, v6, v0"); /* a = c (if mask[i]) */
+      /* Switch back to e8m1 for final operations */
+      vl = __riscv_vsetvl_e8m1(bpp);
-      /* a += x */
-      asm volatile ("vadd.vv v2, v2, v8");
+      /* Start with a, then conditionally replace with b or c */
+      vuint8m1_t result = a;
+      result = __riscv_vmerge_vvm_u8m1(result, b, use_b, vl);
+      /* use_c = !use_a && !use_b */
+      vbool8_t use_c = __riscv_vmnand_mm_b8(__riscv_vmor_mm_b8(use_a, use_b, vl), __riscv_vmor_mm_b8(use_a, use_b, vl), vl);
+      result = __riscv_vmerge_vvm_u8m1(result, c, use_c, vl);
+      /* a = result + x */
+      a = __riscv_vadd_vv_u8m1(result, x, vl);
       /* *row = a */
-      asm volatile ("vse8.v v2, (%0)" : : "r" (row));
+      __riscv_vse8_v_u8m1(row, a, vl);
       row += bpp;
-      /* prepare next iteration (prev is already in a) */
-      /* c = b */
-      asm volatile ("vmv.v.v v6, v4");
+      /* c = b for next iteration */
+      c = b;
    }
 }
 void
 png_read_filter_row_paeth3_rvv(png_row_infop row_info, png_bytep row,
    png_const_bytep prev_row)
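
For reference, the use_a/use_b/use_c masks in the hunk above reproduce the standard Paeth tie-breaking order: prefer the left pixel a, then the up pixel b, then the up-left pixel c, whichever is closest to the estimate p = a + b - c. A scalar sketch of that selection (not from the patch), written with the same p = b - c and pc = a - c decomposition:

#include <stdlib.h>

/* Scalar reference for the Paeth predictor choice: pa, pb, pc are the
 * distances from the estimate p = a + b - c to a, b and c respectively. */
static unsigned char
paeth_predictor_scalar(unsigned char a, unsigned char b, unsigned char c)
{
   int pa = abs(b - c);             /* |p - a| */
   int pb = abs(a - c);             /* |p - b| */
   int pc = abs((b - c) + (a - c)); /* |p - c| */

   if (pa <= pb && pa <= pc)
      return a;
   if (pb <= pc)
      return b;
   return c;
}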
@@ -349,8 +335,6 @@ png_read_filter_row_paeth3_rvv(png_row_infop row_info, png_bytep row,
    size_t len = row_info->rowbytes;
    png_read_filter_row_paeth_rvv(len, 3, row, prev_row);
-
-   PNG_UNUSED(prev_row)
 }
 void
@@ -360,9 +344,7 @@ png_read_filter_row_paeth4_rvv(png_row_infop row_info, png_bytep row,
    size_t len = row_info->rowbytes;
    png_read_filter_row_paeth_rvv(len, 4, row, prev_row);
-
-   PNG_UNUSED(prev_row)
 }
-#endif /* PNG_RISCV_RVV_IMPLEMENTATION */
-#endif /* READ */
+#endif /* PNG_RISCV_RVV_IMPLEMENTATION == 1 */
+#endif /* PNG_READ_SUPPORTED */