diff --git a/riscv/filter_rvv_intrinsics.c b/riscv/filter_rvv_intrinsics.c
index 789996d27..a71e561bb 100644
--- a/riscv/filter_rvv_intrinsics.c
+++ b/riscv/filter_rvv_intrinsics.c
@@ -51,21 +51,22 @@ png_read_filter_row_sub_rvv(size_t len, size_t bpp, unsigned char* row)
     * x .. [v8](e8)
     */
 
-   asm volatile ("vsetvli zero, %0, e8, m1" : : "r" (bpp));
+   size_t vl = __riscv_vsetvl_e8m1(bpp);
 
    /* a = *row */
-   asm volatile ("vle8.v v0, (%0)" : : "r" (row));
+   vuint8m1_t a = __riscv_vle8_v_u8m1(row, vl);
    row += bpp;
 
    while (row < rp_end)
    {
       /* x = *row */
-      asm volatile ("vle8.v v8, (%0)" : : "r" (row));
+      vuint8m1_t x = __riscv_vle8_v_u8m1(row, vl);
+
      /* a = a + x */
-      asm volatile ("vadd.vv v0, v0, v8");
+      a = __riscv_vadd_vv_u8m1(a, x, vl);
 
       /* *row = a */
-      asm volatile ("vse8.v v0, (%0)" : : "r" (row));
+      __riscv_vse8_v_u8m1(row, a, vl);
       row += bpp;
    }
 }
@@ -110,44 +111,46 @@ png_read_filter_row_avg_rvv(size_t len, size_t bpp, unsigned char* row,
 
    /* first pixel */
-   asm volatile ("vsetvli zero, %0, e8, m1" : : "r" (bpp));
+   size_t vl = __riscv_vsetvl_e8m1(bpp);
 
    /* b = *prev_row */
-   asm volatile ("vle8.v v4, (%0)" : : "r" (prev_row));
+   vuint8m1_t b = __riscv_vle8_v_u8m1(prev_row, vl);
    prev_row += bpp;
 
    /* x = *row */
-   asm volatile ("vle8.v v8, (%0)" : : "r" (row));
+   vuint8m1_t x = __riscv_vle8_v_u8m1(row, vl);
 
    /* b = b / 2 */
-   asm volatile ("vsrl.vi v4, v4, 1");
+   b = __riscv_vsrl_vx_u8m1(b, 1, vl);
+
    /* a = x + b */
-   asm volatile ("vadd.vv v2, v4, v8");
+   vuint8m1_t a = __riscv_vadd_vv_u8m1(b, x, vl);
 
    /* *row = a */
-   asm volatile ("vse8.v v2, (%0)" : : "r" (row));
+   __riscv_vse8_v_u8m1(row, a, vl);
    row += bpp;
 
    /* remaining pixels */
-
    while (row < rp_end)
    {
      /* b = *prev_row */
-      asm volatile ("vle8.v v4, (%0)" : : "r" (prev_row));
+      b = __riscv_vle8_v_u8m1(prev_row, vl);
      prev_row += bpp;
 
      /* x = *row */
-      asm volatile ("vle8.v v8, (%0)" : : "r" (row));
+      x = __riscv_vle8_v_u8m1(row, vl);
 
      /* tmp = a + b */
-      asm volatile ("vwaddu.vv v12, v2, v4"); /* add with widening */
+      vuint16m2_t tmp = __riscv_vwaddu_vv_u16m2(a, b, vl);
+
      /* a = tmp/2 */
-      asm volatile ("vnsrl.wi v2, v12, 1"); /* divide/shift with narrowing */
+      a = __riscv_vnsrl_wx_u8m1(tmp, 1, vl);
+
      /* a += x */
-      asm volatile ("vadd.vv v2, v2, v8");
+      a = __riscv_vadd_vv_u8m1(a, x, vl);
 
      /* *row = a */
-      asm volatile ("vse8.v v2, (%0)" : : "r" (row));
+      __riscv_vse8_v_u8m1(row, a, vl);
      row += bpp;
    }
 }
@@ -205,18 +208,14 @@ static inline vint16m1_t
 abs_diff(vuint16m1_t a, vuint16m1_t b, size_t vl)
 {
    vint16m1_t diff = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vv_u16m1(a, b, vl));
-   vint16m1_t neg = __riscv_vneg_v_i16m1(diff, vl);
-
-   return __riscv_vmax_vv_i16m1(diff, neg, vl);
+   vbool16_t mask = __riscv_vmslt_vx_i16m1_b16(diff, 0, vl);
+   return __riscv_vrsub_vx_i16m1_mu(mask, diff, diff, 0, vl); /* negate only lanes where diff < 0 */
 }
 
 static inline vint16m1_t
 abs_sum(vint16m1_t a, vint16m1_t b, size_t vl)
 {
-   vint16m1_t sum = __riscv_vadd_vv_i16m1(a, b, vl);
-   vint16m1_t neg = __riscv_vneg_v_i16m1(sum, vl);
-
-   return __riscv_vmax_vv_i16m1(sum, neg, vl);
+   return __riscv_vadd_vv_i16m1(a, b, vl);
 }
 
 static inline void
@@ -226,122 +225,109 @@ png_read_filter_row_paeth_rvv(size_t len, size_t bpp, unsigned char* row,
    png_bytep rp_end = row + len;
 
    /*
-    * row:  | a | x |
-    * prev: | c | b |
+    * row:  |  a  |  x  |
+    * prev: |  c  |  b  |
     *
-    * mask .. [v0]
-    * a .. [v2](e8)
-    * b .. [v4](e8)
-    * c .. [v6](e8)
-    * x .. [v8](e8)
-    * p .. [v12-v13](e16)
-    * pa .. [v16-v17](e16)
-    * pb .. [v20-v21](e16)
-    * pc .. [v24-v25](e16)
-    * tmpmask ..[v31]
+    * a .. [v2](e8)
+    * b .. [v4](e8)
+    * c .. [v6](e8)
+    * x .. [v8](e8)
+    * p .. [v12-v13](e16)
+    * pa, pb, pc .. [v16-v17, v20-v21, v24-v25](e16)
     */
 
    /* first pixel */
-   asm volatile ("vsetvli zero, %0, e8, m1" : : "r" (bpp));
+   size_t vl = __riscv_vsetvl_e8m1(bpp);
 
-   /* a = *row + *prev_row */
-   asm volatile ("vle8.v v2, (%0)" : : "r" (row));
-   asm volatile ("vle8.v v6, (%0)" : : "r" (prev));
-   prev += bpp;
-   asm volatile ("vadd.vv v2, v2, v6");
+   /* a = *row */
+   vuint8m1_t a = __riscv_vle8_v_u8m1(row, vl);
+
+   /* c = *prev */
+   vuint8m1_t c = __riscv_vle8_v_u8m1(prev, vl);
+
+   /* a += c */
+   a = __riscv_vadd_vv_u8m1(a, c, vl);
 
    /* *row = a */
-   asm volatile ("vse8.v v2, (%0)" : : "r" (row));
+   __riscv_vse8_v_u8m1(row, a, vl);
    row += bpp;
+   prev += bpp;
 
    /* remaining pixels */
+
    while (row < rp_end)
    {
-      /* b = *prev_row */
-      asm volatile ("vle8.v v4, (%0)" : : "r" (prev));
+      /* b = *prev */
+      vuint8m1_t b = __riscv_vle8_v_u8m1(prev, vl);
      prev += bpp;
 
      /* x = *row */
-      asm volatile ("vle8.v v8, (%0)" : : "r" (row));
+      vuint8m1_t x = __riscv_vle8_v_u8m1(row, vl);
 
-      /* sub (widening to 16bit) */
-      /* p = b - c */
-      asm volatile ("vwsubu.vv v12, v4, v6");
-      /* pc = a - c */
-      asm volatile ("vwsubu.vv v24, v2, v6");
+      /* Calculate p = b - c and pc = a - c using widening subtraction */
+      vuint16m2_t p_wide = __riscv_vwsubu_vv_u16m2(b, c, vl);
+      vuint16m2_t pc_wide = __riscv_vwsubu_vv_u16m2(a, c, vl);
 
-      /* switch to widened */
-      asm volatile ("vsetvli zero, %0, e16, m2" : : "r" (bpp));
+      /* Convert to signed for easier manipulation */
+      size_t vl16 = __riscv_vsetvl_e16m2(bpp);
+      vint16m2_t p = __riscv_vreinterpret_v_u16m2_i16m2(p_wide);
+      vint16m2_t pc = __riscv_vreinterpret_v_u16m2_i16m2(pc_wide);
 
-      /* pa = abs(p) -> pa = p < 0 ? -p : p */
-      asm volatile ("vmv.v.v v16, v12");              /* pa = p */
-      asm volatile ("vmslt.vx v0, v16, zero");        /* set mask[i] if pa[i] < 0 */
-      asm volatile ("vrsub.vx v16, v16, zero, v0.t"); /* invert negative values in pa; vd[i] = 0 - vs2[i] (if mask[i])
-                                                       * could be replaced by vneg in rvv >= 1.0
-                                                       */
+      /* pa = |p|; mask-undisturbed (_mu) keeps p in the lanes that are not negated */
+      vbool8_t p_neg_mask = __riscv_vmslt_vx_i16m2_b8(p, 0, vl16);
+      vint16m2_t pa = __riscv_vrsub_vx_i16m2_mu(p_neg_mask, p, p, 0, vl16);
 
-      /* pb = abs(p) -> pb = pc < 0 ? -pc : pc */
-      asm volatile ("vmv.v.v v20, v24");              /* pb = pc */
-      asm volatile ("vmslt.vx v0, v20, zero");        /* set mask[i] if pc[i] < 0 */
-      asm volatile ("vrsub.vx v20, v20, zero, v0.t"); /* invert negative values in pb; vd[i] = 0 - vs2[i] (if mask[i])
-                                                       * could be replaced by vneg in rvv >= 1.0
-                                                       */
+      /* pb = |pc| */
+      vbool8_t pc_neg_mask = __riscv_vmslt_vx_i16m2_b8(pc, 0, vl16);
+      vint16m2_t pb = __riscv_vrsub_vx_i16m2_mu(pc_neg_mask, pc, pc, 0, vl16);
 
-      /* pc = abs(p + pc) -> pc = (p + pc) < 0 ? -(p + pc) : p + pc */
-      asm volatile ("vadd.vv v24, v24, v12");         /* pc = p + pc */
-      asm volatile ("vmslt.vx v0, v24, zero");        /* set mask[i] if pc[i] < 0 */
-      asm volatile ("vrsub.vx v24, v24, zero, v0.t"); /* invert negative values in pc; vd[i] = 0 - vs2[i] (if mask[i])
-                                                       * could be replaced by vneg in rvv >= 1.0
-                                                       */
+      /* pc = |p + pc| */
+      vint16m2_t p_plus_pc = __riscv_vadd_vv_i16m2(p, pc, vl16);
+      vbool8_t p_plus_pc_neg_mask = __riscv_vmslt_vx_i16m2_b8(p_plus_pc, 0, vl16);
+      pc = __riscv_vrsub_vx_i16m2_mu(p_plus_pc_neg_mask, p_plus_pc, p_plus_pc, 0, vl16);
 
      /*
-       * if (pb < pa)
-       * {
-       *    pa = pb;
-       *    a = b;
-       *    // see (*1)
-       * }
+       * The key insight is that we want the minimum of pa, pb, pc.
+       *  - If pa <= pb and pa <= pc, use a
+       *  - Else if pb <= pc, use b
+       *  - Else use c
        */
-      asm volatile ("vmslt.vv v0, v20, v16");        /* set mask[i] if pb[i] < pa[i] */
-      asm volatile ("vmerge.vvm v16, v16, v20, v0"); /* pa[i] = pb[i] (if mask[i]) */
 
-      /*
-       * if (pc < pa)
-       * {
-       *    a = c;
-       *    // see (*2)
-       * }
-       */
-      asm volatile ("vmslt.vv v31, v24, v16");       /* set tmpmask[i] if pc[i] < pa[i] */
+      /* Find which predictor to use based on minimum absolute difference */
+      vbool8_t pa_le_pb = __riscv_vmsle_vv_i16m2_b8(pa, pb, vl16);
+      vbool8_t pa_le_pc = __riscv_vmsle_vv_i16m2_b8(pa, pc, vl16);
+      vbool8_t pb_le_pc = __riscv_vmsle_vv_i16m2_b8(pb, pc, vl16);
 
-      /* switch to narrow */
-      asm volatile ("vsetvli zero, %0, e8, m1" : : "r" (bpp));
+      /* use_a = pa <= pb && pa <= pc */
+      vbool8_t use_a = __riscv_vmand_mm_b8(pa_le_pb, pa_le_pc, vl16);
 
-      /* (*1) */
-      asm volatile ("vmerge.vvm v2, v2, v4, v0");    /* a = b (if mask[i]) */
+      /* use_b = !use_a && pb <= pc */
+      vbool8_t not_use_a = __riscv_vmnot_m_b8(use_a, vl16);
+      vbool8_t use_b = __riscv_vmand_mm_b8(not_use_a, pb_le_pc, vl16);
 
-      /* (*2) */
-      asm volatile ("vmand.mm v0, v31, v31");        /* mask = tmpmask
-                                                      * vmand works for rvv 0.7 up to 1.0
-                                                      * could be replaced by vmcpy in 0.7.1/0.8.1
-                                                      * or vmmv.m in 1.0
-                                                      */
-      asm volatile ("vmerge.vvm v2, v2, v6, v0");    /* a = c (if mask[i]) */
+      /* Switch back to e8m1 for final operations */
+      vl = __riscv_vsetvl_e8m1(bpp);
 
-      /* a += x */
-      asm volatile ("vadd.vv v2, v2, v8");
+      /* Start with a, then conditionally replace with b or c */
+      vuint8m1_t result = a;
+      result = __riscv_vmerge_vvm_u8m1(result, b, use_b, vl);
+
+      /* use_c = !use_a && !use_b */
+      vbool8_t use_c = __riscv_vmnor_mm_b8(use_a, use_b, vl);
+      result = __riscv_vmerge_vvm_u8m1(result, c, use_c, vl);
+
+      /* a = result + x */
+      a = __riscv_vadd_vv_u8m1(result, x, vl);
 
      /* *row = a */
-      asm volatile ("vse8.v v2, (%0)" : : "r" (row));
+      __riscv_vse8_v_u8m1(row, a, vl);
      row += bpp;
 
-      /* prepare next iteration (prev is already in a) */
-      /* c = b */
-      asm volatile ("vmv.v.v v6, v4");
+      /* c = b for next iteration */
+      c = b;
    }
 }
-
 void
 png_read_filter_row_paeth3_rvv(png_row_infop row_info, png_bytep row,
    png_const_bytep prev_row)
@@ -349,8 +335,6 @@ png_read_filter_row_paeth3_rvv(png_row_infop row_info, png_bytep row,
    size_t len = row_info->rowbytes;
 
    png_read_filter_row_paeth_rvv(len, 3, row, prev_row);
-
-   PNG_UNUSED(prev_row)
 }
 
 void
@@ -360,9 +344,7 @@ png_read_filter_row_paeth4_rvv(png_row_infop row_info, png_bytep row,
    size_t len = row_info->rowbytes;
 
    png_read_filter_row_paeth_rvv(len, 4, row, prev_row);
-
-   PNG_UNUSED(prev_row)
 }
 
-#endif /* PNG_RISCV_RVV_IMPLEMENTATION */
-#endif /* READ */
+#endif /* PNG_RISCV_RVV_IMPLEMENTATION == 1 */
+#endif /* PNG_READ_SUPPORTED */
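
For reference (not part of the patch): the use_a/use_b/use_c selection in the Paeth hunk is
meant to match the scalar Paeth predictor from the PNG specification, with pa = |b - c|,
pb = |a - c| and pc = |a + b - 2c|. A minimal scalar sketch for cross-checking the vector
path; the function name paeth_predictor is illustrative, not something defined in the patch:

   #include <stdlib.h>

   static unsigned char
   paeth_predictor(unsigned char a, unsigned char b, unsigned char c)
   {
      /* p is the initial estimate; pa, pb, pc are its distances to a, b, c. */
      int p = (int)a + (int)b - (int)c;
      int pa = abs(p - (int)a);   /* == |b - c| */
      int pb = abs(p - (int)b);   /* == |a - c| */
      int pc = abs(p - (int)c);   /* == |a + b - 2c| */

      if (pa <= pb && pa <= pc)
         return a;

      if (pb <= pc)
         return b;

      return c;
   }

Feeding the same (a, b, c) triples through this and through the vectorized selection is a
quick way to validate the masked comparison and merge logic on small inputs.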