libpng/riscv/palette_vector_intrinsics.c
Dragoș Tiselice cc5ee6b213 Add optimized RISC-V Vector functions
Largely based off of the ARM NEON implementation.

Signed-off-by: Cosmin Truta <ctruta@gmail.com>
2025-05-01 17:44:00 +03:00

105 lines
2.7 KiB
C

/* palette_vector_intrinsics.c - RISC-V Vector optimized palette expansion
* functions
*
* Copyright (c) 2023 Google LLC
* Written by Dragoș Tiselice <dtiselice@google.com>, May 2023.
*
* This code is released under the libpng license.
* For conditions of distribution and use, see the disclaimer
* and license in png.h
*/
#include "../pngpriv.h"
/* NOTE(review): this guard tests the ARM NEON implementation macro although
 * everything below uses RISC-V Vector intrinsics; it looks carried over from
 * the NEON source this file was modelled on.  Confirm which macro the build
 * system defines for the RISC-V Vector target before changing it.
 */
#if PNG_ARM_NEON_IMPLEMENTATION == 1
#include <riscv_vector.h>
/* Build the interleaved ("riffled") RGBA8 lookup table.
 *
 * Reads 256 RGB triplets from png_ptr->palette and writes 256 RGBA
 * quadruplets to png_ptr->riffled_palette.  Every entry is first written as
 * fully opaque (alpha 0xff); the alpha of the first num_trans entries is then
 * replaced with the matching byte of png_ptr->trans_alpha.
 *
 * Alpha must not be loaded for all 256 entries straight from trans_alpha:
 * only the first num_trans bytes of that buffer carry tRNS data, and palette
 * entries without a tRNS value are opaque.
 *
 * NOTE(review): relies on png_ptr->num_trans being the number of valid
 * trans_alpha entries, as in the NEON counterpart - confirm against
 * pngstruct.h.
 */
void
png_riffle_palette_vector(png_structrp png_ptr)
{
   png_const_bytep palette = (png_const_bytep)png_ptr->palette;
   png_bytep riffled_palette = png_ptr->riffled_palette;
   png_const_bytep trans_alpha = png_ptr->trans_alpha;
   size_t num_trans = (size_t)png_ptr->num_trans;
   size_t len = 256;
   size_t vl;
   size_t i;
   vuint8m1_t r;
   vuint8m1_t g;
   vuint8m1_t b;

   /* Never touch more alpha entries than the table holds, and none at all
    * when there is no transparency buffer.
    */
   if (trans_alpha == NULL)
      num_trans = 0;
   else if (num_trans > 256)
      num_trans = 256;

   /* First pass: de-interleave RGB, re-interleave as RGBA with opaque alpha. */
   for (; len > 0; len -= vl, palette += vl * 3, riffled_palette += vl * 4)
   {
      vuint8m1_t a;

      vl = __riscv_vsetvl_e8m1(len);
      __riscv_vlseg3e8_v_u8m1(&r, &g, &b, palette, vl);
      a = __riscv_vmv_v_x_u8m1(0xff, vl);
      __riscv_vsseg4e8_v_u8m1(riffled_palette, r, g, b, a, vl);
   }

   /* Second pass: patch in the transparency values that actually exist. */
   riffled_palette = png_ptr->riffled_palette;
   for (i = 0; i < num_trans; i++)
      riffled_palette[i * 4 + 3] = trans_alpha[i];
}
/* Expand a row of 8-bit palette indices to RGBA8 using the riffled palette.
 *
 * png_ptr   supplies riffled_palette, read as 32-bit RGBA entries.
 * row_info  supplies the row width in pixels.
 * row       unused.
 * ssp       in/out cursor over the index bytes.
 * ddp       in/out cursor over the output bytes.
 *
 * The row is walked from its end towards its start.  On return both cursors
 * have been moved back by the whole row (width bytes and width * 4 bytes
 * respectively).
 *
 * Returns the number of pixels expanded, which is always the full row width.
 *
 * NOTE(review): assumes *ssp and *ddp address the LAST byte of the index row
 * and of the output row, as the NEON variant this file is based on does -
 * confirm against the caller.
 * NOTE(review): the 32-bit store presumably needs the output row to be
 * 4-byte aligned - confirm against the row buffer allocation.
 */
int
png_do_expand_palette_rgba8_vector(png_structrp png_ptr, png_row_infop row_info,
    png_const_bytep row, png_bytepp ssp, png_bytepp ddp)
{
   const size_t row_width = (size_t)row_info->width;
   const png_uint_32 *palette = (const png_uint_32 *)png_ptr->riffled_palette;
   /* Start one past the last byte; each iteration steps back by its own vl
    * so the cursors always sit on the first byte of the chunk being handled.
    */
   png_bytep sp = *ssp + 1;
   png_bytep dp = *ddp + 1;
   size_t remaining = row_width;

   (void)row;

   while (remaining > 0)
   {
      /* vl may differ between iterations (the final chunk is usually
       * shorter), so the cursors must be moved only after it is known.
       */
      const size_t vl = __riscv_vsetvl_e8m1(remaining);
      vuint8m1_t indices;
      vuint16m2_t offsets;
      vuint32m4_t pixels;

      sp -= vl;
      dp -= vl * 4;

      indices = __riscv_vle8_v_u8m1(sp, vl);
      /* Indexed loads take BYTE offsets: scale each index by the 4-byte
       * entry size.  The result (up to 1020) needs 16 bits.
       */
      offsets = __riscv_vwmulu_vx_u16m2(indices, 4, vl);
      pixels = __riscv_vluxei16_v_u32m4(palette, offsets, vl);
      __riscv_vse32_v_u32m4((uint32_t *)dp, pixels, vl);

      remaining -= vl;
   }

   /* Hand the cursors back moved past the whole row. */
   *ssp = *ssp - row_width;
   *ddp = *ddp - row_width * 4;
   return (int)row_width;
}
/* Expand a row of 8-bit palette indices to RGB8 using png_ptr->palette.
 *
 * png_ptr   supplies the palette, read as packed 3-byte RGB entries.
 * row_info  supplies the row width in pixels.
 * row       unused.
 * ssp       in/out cursor over the index bytes.
 * ddp       in/out cursor over the output bytes.
 *
 * The row is walked from its end towards its start.  On return both cursors
 * have been moved back by the whole row (width bytes and width * 3 bytes
 * respectively).
 *
 * Returns the number of pixels expanded, which is always the full row width.
 *
 * NOTE(review): assumes *ssp and *ddp address the LAST byte of the index row
 * and of the output row, as the NEON variant this file is based on does -
 * confirm against the caller.
 */
int
png_do_expand_palette_rgb8_vector(png_structrp png_ptr, png_row_infop row_info,
    png_const_bytep row, png_bytepp ssp, png_bytepp ddp)
{
   const size_t row_width = (size_t)row_info->width;
   png_const_bytep palette = (png_const_bytep)png_ptr->palette;
   /* Start one past the last byte; each iteration steps back by its own vl
    * so the cursors always sit on the first byte of the chunk being handled.
    */
   png_bytep sp = *ssp + 1;
   png_bytep dp = *ddp + 1;
   size_t remaining = row_width;
   vuint8m1_t r;
   vuint8m1_t g;
   vuint8m1_t b;

   (void)row;

   while (remaining > 0)
   {
      /* vl may differ between iterations (the final chunk is usually
       * shorter), so the cursors must be moved only after it is known.
       */
      const size_t vl = __riscv_vsetvl_e8m1(remaining);
      vuint16m2_t offsets;

      sp -= vl;
      dp -= vl * 3;

      /* Indexed loads take BYTE offsets: scale each index by the 3-byte
       * entry size, widening to 16 bits so the product (up to 765) fits.
       */
      offsets = __riscv_vwmulu_vx_u16m2(__riscv_vle8_v_u8m1(sp, vl), 3, vl);
      __riscv_vluxseg3ei16_v_u8m1(&r, &g, &b, palette, offsets, vl);
      __riscv_vsseg3e8_v_u8m1(dp, r, g, b, vl);

      remaining -= vl;
   }

   /* Hand the cursors back moved past the whole row. */
   *ssp = *ssp - row_width;
   *ddp = *ddp - row_width * 3;
   return (int)row_width;
}
#endif /* PNG_ARM_NEON_IMPLEMENTATION */