From 2b6e59d96819e18a1852a7ac7ab08b19163dfe75 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Sat, 14 Jan 2017 16:05:33 +0300 Subject: [PATCH 01/37] Added initial code for PowerPC VSX optimisation --- Makefile.am | 5 + configure.ac | 46 +++++++ png.h | 5 +- pngpriv.h | 30 ++++ pngrutil.c | 2 +- powerpc/filter_vsx_intrinsics.c | 233 ++++++++++++++++++++++++++++++++ powerpc/powerpc_init.c | 126 +++++++++++++++++ 7 files changed, 445 insertions(+), 2 deletions(-) create mode 100644 powerpc/filter_vsx_intrinsics.c create mode 100644 powerpc/powerpc_init.c diff --git a/Makefile.am b/Makefile.am index fb209edd2..fa8281695 100644 --- a/Makefile.am +++ b/Makefile.am @@ -102,6 +102,11 @@ libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES += mips/mips_init.c\ mips/filter_msa_intrinsics.c endif +if PNG_POWERPC_VSX +libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES += powerpc/powerpc_init.c\ + powerpc/filter_vsx_intrinsics.c +endif + nodist_libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES = pnglibconf.h libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_LDFLAGS = -no-undefined -export-dynamic \ diff --git a/configure.ac b/configure.ac index c060c4603..4168eb268 100644 --- a/configure.ac +++ b/configure.ac @@ -391,6 +391,52 @@ AM_CONDITIONAL([PNG_MIPS_MSA], mipsel*|mips64el*) :;; esac]) +# PowerPC +# === +# +# PowerPC VSX (SIMD) support. + +AC_ARG_ENABLE([powerpc-vsx], +AS_HELP_STRING([[[--enable-powerpc-vsx]]], + [Enable POWERPC VSX optimizations: =no/off, check, api, yes/on:] + [no/off: disable the optimizations; check: use internal checking code] + [(deprecated and poorly supported); api: disable by default, enable by] + [a call to png_set_option; yes/on: turn on unconditionally.] 
+ [If not specified: determined by the compiler.]), + [case "$enableval" in + no|off) + # disable the default enabling on __ppc64__ systems: + AC_DEFINE([PNG_POWERPC_VSX_OPT], [0], + [Disable POWERPC VSX optimizations]) + # Prevent inclusion of the platform specific files below: + enable_powerpc_vsx=no;; + check) + AC_DEFINE([PNG_POWERPC_VSX_CHECK_SUPPORTED], [], + [Check for POWERPC VSX support at run-time]);; + api) + AC_DEFINE([PNG_POWERPC_VSX_API_SUPPORTED], [], + [Turn on POWERPC VSX optimizations at run-time]);; + yes|on) + AC_DEFINE([PNG_POWERPC_VSX_OPT], [2], + [Enable POWERPC VSX optimizations]) + AC_MSG_WARN([--enable-powerpc-vsx: please specify 'check' or 'api', if] + [you want the optimizations unconditionally pass '-maltivec and -mabi=altivec'] + [to the compiler.]);; + *) + AC_MSG_ERROR([--enable-powerpc-vsx=${enable_powerpc_vsx}: invalid value]) + esac]) + +# Add PowerPC specific files to all builds where the host_cpu is powerpc('powerpc*') or +# where POWERPC optimizations were explicitly requested (this allows a fallback if a +# future host CPU does not match 'powerpc*') + +AM_CONDITIONAL([PNG_POWERPC_VSX], + [test "$enable_powerpc_vsx" != 'no' && + case "$host_cpu" in + powerpc*) :;; + esac]) + + AC_MSG_NOTICE([[Extra options for compiler: $PNG_COPTS]]) # Config files, substituting as above diff --git a/png.h b/png.h index cad9a825e..f62f44504 100644 --- a/png.h +++ b/png.h @@ -3225,7 +3225,10 @@ PNG_EXPORT(245, int, png_image_write_to_memory, (png_imagep image, void *memory, # define PNG_MIPS_MSA 6 /* HARDWARE: MIPS Msa SIMD instructions supported */ #endif #define PNG_IGNORE_ADLER32 8 -#define PNG_OPTION_NEXT 10 /* Next option - numbers must be even */ +#ifdef PNG_POWERPC_VSX_API_SUPPORTED +# define PNG_POWERPC_VSX 10 /* HARDWARE: PowerPC VSX SIMD instructions supported */ +#endif +#define PNG_OPTION_NEXT 12 /* Next option - numbers must be even */ /* Return values: NOTE: there are four values and 'off' is *not* zero */ #define PNG_OPTION_UNSET 
0 /* Unset - defaults to off */ diff --git a/pngpriv.h b/pngpriv.h index 50ff68b1c..00981abd3 100644 --- a/pngpriv.h +++ b/pngpriv.h @@ -190,6 +190,14 @@ # endif #endif +#ifndef PNG_POWERPC_VSX_OPT +# if defined(__ppc64__) && defined(__ALTIVEC__) && defined(PNG_ALIGNED_MEMORY_SUPPORTED) +# define PNG_POWERPC_VSX_OPT 2 +# else +# define PNG_POWERPC_VSX_OPT 0 +# endif +#endif + #ifndef PNG_INTEL_SSE_OPT # ifdef PNG_INTEL_SSE /* Only check for SSE if the build configuration has been modified to @@ -246,6 +254,11 @@ # endif #endif /* PNG_MIPS_MSA_OPT > 0 */ +#if PNG_POWERPC_VSX_OPT > 0 +# define PNG_FILTER_OPTIMIZATIONS png_init_filter_functions_vsx +# define PNG_POWERPC_VSX_IMPLEMENTATION 1 +#endif + /* Is this a build of a DLL where compilation of the object modules requires * different preprocessor settings to those required for a simple library? If @@ -1292,6 +1305,23 @@ PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_msa,(png_row_infop row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); #endif +#if PNG_POWERPC_VSX_OPT > 0 +PNG_INTERNAL_FUNCTION(void,png_read_filter_row_up_vsx,(png_row_infop row_info, + png_bytep row, png_const_bytep prev_row),PNG_EMPTY); +PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub3_vsx,(png_row_infop + row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); +PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub4_vsx,(png_row_infop + row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); +PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg3_vsx,(png_row_infop + row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); +PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg4_vsx,(png_row_infop + row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); +PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_vsx,(png_row_infop + row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); +PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_vsx,(png_row_infop + row_info, png_bytep row, png_const_bytep 
prev_row),PNG_EMPTY); +#endif + #if PNG_INTEL_SSE_IMPLEMENTATION > 0 PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub3_sse2,(png_row_infop row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); diff --git a/pngrutil.c b/pngrutil.c index c34da5d4c..97f50b09f 100644 --- a/pngrutil.c +++ b/pngrutil.c @@ -3797,7 +3797,7 @@ png_read_filter_row_sub(png_row_infop row_info, png_bytep row, png_size_t i; png_size_t istop = row_info->rowbytes; unsigned int bpp = (row_info->pixel_depth + 7) >> 3; - png_bytep rp = row + bpp; + png_bytep rp = row + bpp; PNG_UNUSED(prev_row) diff --git a/powerpc/filter_vsx_intrinsics.c b/powerpc/filter_vsx_intrinsics.c new file mode 100644 index 000000000..8e4ef2930 --- /dev/null +++ b/powerpc/filter_vsx_intrinsics.c @@ -0,0 +1,233 @@ + +/* filter_vsx_intrinsics.c - PowerPC optimised filter functions + * + * Copyright (c) 2016 Glenn Randers-Pehrson + * Written by Vadim Barkov, 2017. + * Last changed in libpng 1.6.25 [September 1, 2016] + * + * This code is released under the libpng license. + * For conditions of distribution and use, see the disclaimer + * and license in png.h + */ +#include +#include +#include "../pngpriv.h" + +#ifdef PNG_READ_SUPPORTED + +/* This code requires -maltivec and -mabi=altivec on the command line: */ +#if PNG_POWERPC_VSX_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */ + +/* libpng row pointers are not necessarily aligned to any particular boundary, + * however this code will only work with appropriate alignment. arm/arm_init.c + * checks for this (and will not compile unless it is done). This code uses + * variants of png_aligncast to avoid compiler warnings. 
+ */ +#define png_ptr(type,pointer) png_aligncast(type *,pointer) +#define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer) + +/*#include */ + +#if PNG_POWERPC_VSX_OPT > 0 + +void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_size_t i; + png_size_t istop = row_info->rowbytes; + png_bytep rp = row; + png_const_bytep pp = prev_row; + + for (i = 0; i < istop; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); + rp++; + } + +} + +void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_size_t i; + png_size_t istop = row_info->rowbytes; + unsigned int bpp = (row_info->pixel_depth + 7) >> 3; + png_bytep rp = row + bpp; + + PNG_UNUSED(prev_row) + + for (i = bpp; i < istop; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); + rp++; + } +} + +void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_size_t i; + png_size_t istop = row_info->rowbytes; + unsigned int bpp = (row_info->pixel_depth + 7) >> 3; + png_bytep rp = row + bpp; + + PNG_UNUSED(prev_row) + + for (i = bpp; i < istop; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); + rp++; + } +} + +void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_size_t i; + png_bytep rp = row; + png_const_bytep pp = prev_row; + unsigned int bpp = (row_info->pixel_depth + 7) >> 3; + png_size_t istop = row_info->rowbytes - bpp; + + for (i = 0; i < bpp; i++) + { + *rp = (png_byte)(((int)(*rp) + + ((int)(*pp++) / 2 )) & 0xff); + + rp++; + } + + for (i = 0; i < istop; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } +} + +void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_size_t i; + png_bytep rp = row; + png_const_bytep pp = prev_row; + unsigned int 
bpp = (row_info->pixel_depth + 7) >> 3; + png_size_t istop = row_info->rowbytes - bpp; + + for (i = 0; i < bpp; i++) + { + *rp = (png_byte)(((int)(*rp) + + ((int)(*pp++) / 2 )) & 0xff); + + rp++; + } + + for (i = 0; i < istop; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } +} + +void png_read_filter_row_paeth4_vsx(png_row_infop row_info, + png_bytep row, + png_const_bytep prev_row) +{ + unsigned int bpp = (row_info->pixel_depth + 7) >> 3; + png_bytep rp_end = row + bpp; + + /* Process the first pixel in the row completely (this is the same as 'up' + * because there is only one candidate predictor for the first row). + */ + while (row < rp_end) + { + int a = *row + *prev_row++; + *row++ = (png_byte)a; + } + + /* Remainder */ + rp_end = rp_end + (row_info->rowbytes - bpp); + + while (row < rp_end) + { + int a, b, c, pa, pb, pc, p; + + c = *(prev_row - bpp); + a = *(row - bpp); + b = *prev_row++; + + p = b - c; + pc = a - c; + + #ifdef PNG_USE_ABS + pa = abs(p); + pb = abs(pc); + pc = abs(p + pc); + #else + pa = p < 0 ? -p : p; + pb = pc < 0 ? -pc : pc; + pc = (p + pc) < 0 ? -(p + pc) : p + pc; + #endif + + if (pb < pa) pa = pb, a = b; + if (pc < pa) a = c; + + a += *row; + *row++ = (png_byte)a; + } +} + +void png_read_filter_row_paeth3_vsx(png_row_infop row_info, + png_bytep row, + png_const_bytep prev_row) +{ + unsigned int bpp = (row_info->pixel_depth + 7) >> 3; + png_bytep rp_end = row + bpp; + + /* Process the first pixel in the row completely (this is the same as 'up' + * because there is only one candidate predictor for the first row). 
+ */ + while (row < rp_end) + { + int a = *row + *prev_row++; + *row++ = (png_byte)a; + } + + /* Remainder */ + rp_end = rp_end + (row_info->rowbytes - bpp); + + while (row < rp_end) + { + int a, b, c, pa, pb, pc, p; + + c = *(prev_row - bpp); + a = *(row - bpp); + b = *prev_row++; + + p = b - c; + pc = a - c; + + #ifdef PNG_USE_ABS + pa = abs(p); + pb = abs(pc); + pc = abs(p + pc); + #else + pa = p < 0 ? -p : p; + pb = pc < 0 ? -pc : pc; + pc = (p + pc) < 0 ? -(p + pc) : p + pc; + #endif + + if (pb < pa) pa = pb, a = b; + if (pc < pa) a = c; + + a += *row; + *row++ = (png_byte)a; + } +} + +#endif /* PNG_POWERPC_VSX_OPT > 0 */ +#endif /* PNG_POWERPC_VSX_IMPLEMENTATION == 1 (intrinsics) */ +#endif /* READ */ diff --git a/powerpc/powerpc_init.c b/powerpc/powerpc_init.c new file mode 100644 index 000000000..d3aeb28db --- /dev/null +++ b/powerpc/powerpc_init.c @@ -0,0 +1,126 @@ + +/* powerpc_init.c - POWERPC optimised filter functions + * + * + * This code is released under the libpng license. + * For conditions of distribution and use, see the disclaimer + * and license in png.h + */ +/* Below, after checking __linux__, various non-C90 POSIX 1003.1 functions are + * called. + */ +#define _POSIX_SOURCE 1 + +#include +#include "../pngpriv.h" + +#ifdef PNG_READ_SUPPORTED + +#if PNG_POWERPC_VSX_OPT > 0 +#ifdef PNG_POWERPC_VSX_CHECK_SUPPORTED /* Do run-time checks */ +/* WARNING: it is strongly recommended that you do not build libpng with + * run-time checks for CPU features if at all possible. In the case of the PowerPC + * VSX instructions there is no processor-specific way of detecting the + * presence of the required support, therefore run-time detection is extremely + * OS specific. + * + * You may set the macro PNG_POWERPC_VSX_FILE to the file name of file containing + * a fragment of C source code which defines the png_have_vsx function. 
There + * are a number of implementations in contrib/powerpc-vsx, but the only one that + * has partial support is contrib/powerpc-vsx/linux.c - a generic Linux + * implementation which reads /proc/cpuinfo. + */ +#ifndef PNG_POWERPC_VSX_FILE +# ifdef __linux__ +# define PNG_POWERPC_VSX_FILE "contrib/powerpc-vsx/linux.c" +# endif +#endif + +#ifdef PNG_POWERPC_VSX_FILE + +#include <signal.h> /* for sig_atomic_t */ +static int png_have_vsx(png_structp png_ptr); +#include PNG_POWERPC_VSX_FILE + +#else /* PNG_POWERPC_VSX_FILE */ +# error "PNG_POWERPC_VSX_FILE undefined: no support for run-time POWERPC VSX checks" +#endif /* PNG_POWERPC_VSX_FILE */ +#endif /* PNG_POWERPC_VSX_CHECK_SUPPORTED */ + +#ifndef PNG_ALIGNED_MEMORY_SUPPORTED +# error "ALIGNED_MEMORY is required; set: -DPNG_ALIGNED_MEMORY_SUPPORTED" +#endif + +void +png_init_filter_functions_vsx(png_structp pp, unsigned int bpp) +{ + /* The switch statement is compiled in for POWERPC_VSX_API, the call to + * png_have_vsx is compiled in for POWERPC_VSX_CHECK. If both are defined + * the check is only performed if the API has not set the PowerPC option on + * or off explicitly. In this case the check controls what happens. + */ + +#ifdef PNG_POWERPC_VSX_API_SUPPORTED + switch ((pp->options >> PNG_POWERPC_VSX) & 3) + { + case PNG_OPTION_UNSET: + /* Allow the run-time check to execute if it has been enabled - + * thus both API and CHECK can be turned on. If it isn't supported + * this case will fall through to the 'default' below, which just + * returns. 
+ */ +#endif /* PNG_POWERPC_VSX_API_SUPPORTED */ +#ifdef PNG_POWERPC_VSX_CHECK_SUPPORTED + { + static volatile sig_atomic_t no_vsx = -1; /* not checked */ + + if (no_vsx < 0) + no_vsx = !png_have_vsx(pp); + + if (no_vsx) + return; + } +#ifdef PNG_POWERPC_VSX_API_SUPPORTED + break; +#endif +#endif /* PNG_POWERPC_VSX_CHECK_SUPPORTED */ + +#ifdef PNG_POWERPC_VSX_API_SUPPORTED + default: /* OFF or INVALID */ + return; + + case PNG_OPTION_ON: + /* Option turned on */ + break; + } +#endif + + /* IMPORTANT: any new internal functions used here must be declared using + * PNG_INTERNAL_FUNCTION in ../pngpriv.h. This is required so that the + * 'prefix' option to configure works: + * + * ./configure --with-libpng-prefix=foobar_ + * + * Verify you have got this right by running the above command, doing a build + * and examining pngprefix.h; it must contain a #define for every external + * function you add. (Notice that this happens automatically for the + * initialization function.) + */ + pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_vsx; + + if (bpp == 3) + { + pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_vsx; + pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_vsx; + pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth3_vsx; + } + + else if (bpp == 4) + { + pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_vsx; + pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_vsx; + pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth4_vsx; + } +} +#endif /* PNG_POWERPC_VSX_OPT > 0 */ +#endif /* READ */ From ee5804fa14d128e07f71abedc8f008a92f95c3a7 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Sat, 14 Jan 2017 16:47:10 +0300 Subject: [PATCH 02/37] Added PowerPC detection code for linux --- contrib/powerpc-vsx/linux.c | 57 +++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 contrib/powerpc-vsx/linux.c diff --git 
a/contrib/powerpc-vsx/linux.c b/contrib/powerpc-vsx/linux.c new file mode 100644 index 000000000..c522f0ddb --- /dev/null +++ b/contrib/powerpc-vsx/linux.c @@ -0,0 +1,57 @@ +/* contrib/powerpc-vsx/linux.c + * + * Copyright (c) 2016 Glenn Randers-Pehrson + * Written by Vadim Barkov, 2017. + * + * This code is released under the libpng license. + * For conditions of distribution and use, see the disclaimer + * and license in png.h + * + * SEE contrib/powerpc-vsx/README before reporting bugs + * + * STATUS: COMPILED + * BUG REPORTS: png-mng-implement@sourceforge.net + * + * png_have_vsx implemented for Linux by reading the widely available + * pseudo-file /proc/cpuinfo. Result is cached so if function will be called + * multiple times only one reading is performed. + * + * This code is strict ANSI-C and is probably moderately portable; it does + * however use and it assumes that /proc/cpuinfo is never localized. + */ + +#include +#include +#include +#include "png.h" + +#ifndef MAXLINE +# define MAXLINE 1024 +#endif + +static int +png_have_vsx(png_structp png_ptr) +{ + FILE *f; + + const char *string = "altivec supported"; + char input[MAXLINE]; + char *token = NULL; + + f = fopen("/proc/cpuinfo", "r"); + if (f != NULL) + { + memset(input,0,MAXLINE); + while(fgets(input,MAXLINE,f) != NULL) + { + token = strstr(input,string); + if(token != NULL) + return cachedResult; + } + } +#ifdef PNG_WARNINGS_SUPPORTED + else + png_warning(png_ptr, "/proc/cpuinfo open failed"); +#endif + return cachedResult; +} From 6ff408d423b0fa13e735a93ea9876e39ec8176c8 Mon Sep 17 00:00:00 2001 From: root Date: Sat, 14 Jan 2017 18:35:36 +0300 Subject: [PATCH 03/37] Modified CFLAGS recommendations for PowerPC VSX --- configure.ac | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index 4168eb268..d378792e6 100644 --- a/configure.ac +++ b/configure.ac @@ -420,8 +420,8 @@ AS_HELP_STRING([[[--enable-powerpc-vsx]]], AC_DEFINE([PNG_POWERPC_VSX_OPT], [2], 
[Enable POWERPC VSX optimizations]) AC_MSG_WARN([--enable-powerpc-vsx: please specify 'check' or 'api', if] - [you want the optimizations unconditionally pass '-maltivec and -mabi=altivec'] - [to the compiler.]);; + [you want the optimizations unconditionally pass '-maltivec -mabi=altivec'] + [or '-mcpu=power8' to the compiler.]);; *) AC_MSG_ERROR([--enable-powerpc-vsx=${enable_powerpc_vsx}: invalid value]) esac]) From 91acd4baf8041cf2df2ab790e98a8e8025a727cc Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Sun, 15 Jan 2017 20:20:29 +0300 Subject: [PATCH 04/37] Implemented png_read_filter_up_vsx --- powerpc/filter_vsx_intrinsics.c | 57 ++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/powerpc/filter_vsx_intrinsics.c b/powerpc/filter_vsx_intrinsics.c index 8e4ef2930..fe1483a13 100644 --- a/powerpc/filter_vsx_intrinsics.c +++ b/powerpc/filter_vsx_intrinsics.c @@ -18,18 +18,20 @@ /* This code requires -maltivec and -mabi=altivec on the command line: */ #if PNG_POWERPC_VSX_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */ -/* libpng row pointers are not necessarily aligned to any particular boundary, - * however this code will only work with appropriate alignment. arm/arm_init.c - * checks for this (and will not compile unless it is done). This code uses - * variants of png_aligncast to avoid compiler warnings. - */ -#define png_ptr(type,pointer) png_aligncast(type *,pointer) -#define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer) - -/*#include */ +#include #if PNG_POWERPC_VSX_OPT > 0 +/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d). + * They're positioned like this: + * prev: c b + * row: a d + * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be + * whichever of a, b, or c is closest to p=a+b-c. 
+ * ( this is taken from ../intel/filter_sse2_intrinsics.c ) + */ + + void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { @@ -38,27 +40,50 @@ void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, png_bytep rp = row; png_const_bytep pp = prev_row; - for (i = 0; i < istop; i++) + vector unsigned char rp_vec; + vector unsigned char pp_vec; + + /* Using SIMD while we can */ + while( istop >= 16 ) { - *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); - rp++; + rp_vec = vec_ld(0,rp); + pp_vec = vec_ld(0,pp); + + rp_vec = vec_add(rp_vec,pp_vec); + + vec_st(rp_vec,0,rp); + + pp += 16; + rp += 16; + istop -= 16; } + if(istop % 16 > 0) + { + /* If byte count of row is not divisible by 16 + * we will process remaining part as usual + */ + for (i = 0; i < istop; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); + rp++; + } + } } void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { + const unsigned int bpp = 4; png_size_t i; png_size_t istop = row_info->rowbytes; - unsigned int bpp = (row_info->pixel_depth + 7) >> 3; png_bytep rp = row + bpp; PNG_UNUSED(prev_row) for (i = bpp; i < istop; i++) { - *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); + *rp = (png_byte)(((int)(*rp) + (int)(*(rp-4))) & 0xff); rp++; } } @@ -66,16 +91,16 @@ void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row, void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { + const unsigned int bpp = 4; png_size_t i; png_size_t istop = row_info->rowbytes; - unsigned int bpp = (row_info->pixel_depth + 7) >> 3; png_bytep rp = row + bpp; PNG_UNUSED(prev_row) for (i = bpp; i < istop; i++) { - *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); + *rp = (png_byte)(((int)(*rp) + (int)(*(rp-3))) & 0xff); rp++; } } From ccef5edbfe128415bdc6cbe73da868707d4ccc16 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Mon, 
16 Jan 2017 19:45:17 +0300 Subject: [PATCH 05/37] Fixed bug with unaligned input on VSX filter_up --- powerpc/filter_vsx_intrinsics.c | 53 +++++++++++++++------------------ 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/powerpc/filter_vsx_intrinsics.c b/powerpc/filter_vsx_intrinsics.c index fe1483a13..833f9cc44 100644 --- a/powerpc/filter_vsx_intrinsics.c +++ b/powerpc/filter_vsx_intrinsics.c @@ -36,26 +36,37 @@ void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { png_size_t i; - png_size_t istop = row_info->rowbytes; + png_size_t unaligned_top = (png_size_t)row % 16; + png_size_t istop = row_info->rowbytes - unaligned_top; png_bytep rp = row; png_const_bytep pp = prev_row; vector unsigned char rp_vec; vector unsigned char pp_vec; + /* Altivec operations require 16-byte aligned data + * but input can be unaligned. So we calculate + * unaligned part as usual. + */ + for (i = 0; i < unaligned_top; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); + rp++; + } + /* Using SIMD while we can */ while( istop >= 16 ) { - rp_vec = vec_ld(0,rp); - pp_vec = vec_ld(0,pp); - - rp_vec = vec_add(rp_vec,pp_vec); - - vec_st(rp_vec,0,rp); + rp_vec = vec_ld(0,rp); + pp_vec = vec_ld(0,pp); + + rp_vec = vec_add(rp_vec,pp_vec); - pp += 16; - rp += 16; - istop -= 16; + vec_st(rp_vec,0,rp); + + pp += 16; + rp += 16; + istop -= 16; } if(istop % 16 > 0) @@ -68,10 +79,11 @@ void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); rp++; } - } } -void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row, +} + +void png_read_filter_row_sub_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { const unsigned int bpp = 4; @@ -88,23 +100,6 @@ void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row, } } -void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row, - png_const_bytep prev_row) -{ - 
const unsigned int bpp = 4; - png_size_t i; - png_size_t istop = row_info->rowbytes; - png_bytep rp = row + bpp; - - PNG_UNUSED(prev_row) - - for (i = bpp; i < istop; i++) - { - *rp = (png_byte)(((int)(*rp) + (int)(*(rp-3))) & 0xff); - rp++; - } -} - void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { From b455622aa84e4eb27e3887db962f59687ed0b5db Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Thu, 19 Jan 2017 01:29:24 +0300 Subject: [PATCH 06/37] Fixed potential bug on align calculation for VSX filter_up --- powerpc/filter_vsx_intrinsics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/powerpc/filter_vsx_intrinsics.c b/powerpc/filter_vsx_intrinsics.c index 833f9cc44..7ebabfae2 100644 --- a/powerpc/filter_vsx_intrinsics.c +++ b/powerpc/filter_vsx_intrinsics.c @@ -36,7 +36,7 @@ void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { png_size_t i; - png_size_t unaligned_top = (png_size_t)row % 16; + png_size_t unaligned_top = 16 - ((png_size_t)row % 16); png_size_t istop = row_info->rowbytes - unaligned_top; png_bytep rp = row; png_const_bytep pp = prev_row; From 9b0311a4d8b3fd5473375aa760ebe9f601b7bfe5 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Thu, 19 Jan 2017 02:11:34 +0300 Subject: [PATCH 07/37] Implemented filter_sub optimisation for PowerPC VSX --- powerpc/filter_vsx_intrinsics.c | 116 ++++++++++++++++++++++++++++++-- 1 file changed, 110 insertions(+), 6 deletions(-) diff --git a/powerpc/filter_vsx_intrinsics.c b/powerpc/filter_vsx_intrinsics.c index 7ebabfae2..d8dadf340 100644 --- a/powerpc/filter_vsx_intrinsics.c +++ b/powerpc/filter_vsx_intrinsics.c @@ -83,21 +83,125 @@ void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, } -void png_read_filter_row_sub_vsx(png_row_infop row_info, png_bytep row, +void png_read_filter_row_sub_vsx4(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { + png_size_t i; + 
png_size_t unaligned_top = 16 - ((png_size_t)row % 16); + png_size_t istop = row_info->rowbytes - unaligned_top; + const unsigned int bpp = 4; - png_size_t i; - png_size_t istop = row_info->rowbytes; + png_bytep rp = row + bpp; - + vector unsigned char rp_vec; + vector unsigned char part_vec; + vector unsigned char zero_vec = {0}; + PNG_UNUSED(prev_row) - for (i = bpp; i < istop; i++) + /* Altivec operations require 16-byte aligned data + * but input can be unaligned. So we calculate + * unaligned part as usual. + */ + + for (i = bpp; i < unaligned_top; i++) { - *rp = (png_byte)(((int)(*rp) + (int)(*(rp-4))) & 0xff); + *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); rp++; } + + /* Using SIMD while we can */ + while( istop >= 16 ) + { + for(i=0;i < bpp ; i++) + *(rp+i) += *(rp+i - bpp); + + rp_vec = vec_ld(0,rp); + part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT1_4); + rp_vec = vec_add(rp_vec,part_vec); + + part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT2_4); + rp_vec = vec_add(rp_vec,part_vec); + + part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT3_4); + rp_vec = vec_add(rp_vec,part_vec); + + vec_st(rp_vec,0,rp); + + rp += 16; + istop -= 16; + } + + if(istop % 16 > 0) + for (i = 0; i < istop % 16; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*(rp - bpp))) & 0xff); + rp++; + } +} + +void png_read_filter_row_sub_vsx3(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_size_t i; + png_size_t unaligned_top = 16 - ((png_size_t)row % 16); + png_size_t istop = row_info->rowbytes - unaligned_top; + + const unsigned int bpp = 3; + + png_bytep rp = row + bpp; + vector unsigned char rp_vec; + vector unsigned char part_vec; + vector unsigned char zero_vec = {0}; + + PNG_UNUSED(prev_row) + + /* Altivec operations require 16-byte aligned data + * but input can be unaligned. So we calculate + * unaligned part as usual. 
+ */ + + for (i = bpp; i < unaligned_top; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); + rp++; + } + + /* Using SIMD while we can */ + while( istop >= 16 ) + { + for(i=0;i < bpp ; i++) + *(rp+i) += *(rp+i - bpp); + + rp_vec = vec_ld(0,rp); + part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT1_3); + rp_vec = vec_add(rp_vec,part_vec); + + part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT2_3); + rp_vec = vec_add(rp_vec,part_vec); + + part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT3_3); + rp_vec = vec_add(rp_vec,part_vec); + + part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT4_3); + rp_vec = vec_add(rp_vec,part_vec); + + vec_st(rp_vec,0,rp); + rp += 16; + istop -= 16; + + /* Since 16 % bpp = 16 % 3 = 1, last element of array must + * be proceeded manually + */ + *(rp - 1) += *(rp - 1 - 3); + } + + if(istop % 16 > 0) + for (i = 0; i < istop % 16; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); + rp++; + } } void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row, From adbf1d6d1cb9503fee590eeef8b53f2f51ad0776 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Thu, 19 Jan 2017 17:49:04 +0300 Subject: [PATCH 08/37] Fixed missing defines for VSX filter_sub --- powerpc/filter_vsx_intrinsics.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/powerpc/filter_vsx_intrinsics.c b/powerpc/filter_vsx_intrinsics.c index d8dadf340..955e19ec9 100644 --- a/powerpc/filter_vsx_intrinsics.c +++ b/powerpc/filter_vsx_intrinsics.c @@ -83,7 +83,17 @@ void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, } -void png_read_filter_row_sub_vsx4(png_row_infop row_info, png_bytep row, +#define VEC_SELECT1_4 (vector unsigned char){16,16,16,16,0,1,2,3,16,16,16,16,16,16,16,16} +#define VEC_SELECT2_4 (vector unsigned char){16,16,16,16,16,16,16,16,4,5,6,7,16,16,16,16} +#define VEC_SELECT3_4 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16,8,9,10,11} + +#define VEC_SELECT1_3 (vector unsigned 
char){16,16,16,0,1,2,16,16,16,16,16,16,16,16,16,16} +#define VEC_SELECT2_3 (vector unsigned char){16,16,16,16,16,16,3,4,5,16,16,16,16,16,16,16} +#define VEC_SELECT3_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,6,7,8,16,16,16,16} +#define VEC_SELECT4_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16,9,10,11,16} + + +void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { png_size_t i; @@ -140,7 +150,7 @@ void png_read_filter_row_sub_vsx4(png_row_infop row_info, png_bytep row, } } -void png_read_filter_row_sub_vsx3(png_row_infop row_info, png_bytep row, +void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { png_size_t i; From 6ddcd33daab9124389fc7db311d1acc1828cc044 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Sun, 29 Jan 2017 18:37:53 +0300 Subject: [PATCH 09/37] Fixed potential align errors for PowerPC VSX filter functions The problem is that row and prev_row may not be aligned to the same byte count. This situation leaded to undefined behaviour. --- powerpc/filter_vsx_intrinsics.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/powerpc/filter_vsx_intrinsics.c b/powerpc/filter_vsx_intrinsics.c index 955e19ec9..e92a70899 100644 --- a/powerpc/filter_vsx_intrinsics.c +++ b/powerpc/filter_vsx_intrinsics.c @@ -15,13 +15,21 @@ #ifdef PNG_READ_SUPPORTED -/* This code requires -maltivec and -mabi=altivec on the command line: */ +/* This code requires -maltivec and -mvsx on the command line: */ #if PNG_POWERPC_VSX_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */ #include #if PNG_POWERPC_VSX_OPT > 0 +#ifndef __VSX__ +# error "This code requires VSX support (POWER8 and later). Please provide -mvsx compiler flag." +#endif + +#define vec_ld_unaligned(vec,data) vec = vec_vsx_ld(0,pp) +#define vec_st_unaligned(vec,data) vec_vsx_st(vec,0,data) + + /* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d). 
* They're positioned like this: * prev: c b @@ -31,7 +39,6 @@ * ( this is taken from ../intel/filter_sse2_intrinsics.c ) */ - void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { @@ -58,7 +65,7 @@ void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, while( istop >= 16 ) { rp_vec = vec_ld(0,rp); - pp_vec = vec_ld(0,pp); + vec_ld_unaligned(pp_vec,pp); rp_vec = vec_add(rp_vec,pp_vec); From c43aaa8b20e5c222ff1275ca77e45876a2135641 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Sun, 29 Jan 2017 18:44:54 +0300 Subject: [PATCH 10/37] Implemented filter_avg for PowerPC VXS --- powerpc/filter_vsx_intrinsics.c | 213 ++++++++++++++++++++++++++------ 1 file changed, 178 insertions(+), 35 deletions(-) diff --git a/powerpc/filter_vsx_intrinsics.c b/powerpc/filter_vsx_intrinsics.c index e92a70899..40bd81a56 100644 --- a/powerpc/filter_vsx_intrinsics.c +++ b/powerpc/filter_vsx_intrinsics.c @@ -98,8 +98,17 @@ void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, #define VEC_SELECT2_3 (vector unsigned char){16,16,16,16,16,16,3,4,5,16,16,16,16,16,16,16} #define VEC_SELECT3_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,6,7,8,16,16,16,16} #define VEC_SELECT4_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16,9,10,11,16} - + +#define VEC_AVG_SELECT1_4 (vector unsigned char){16,16,16,16, 4, 5, 6, 7,16,16,16,16,16,16,16,16} +#define VEC_AVG_SELECT2_4 (vector unsigned char){16,16,16,16,16,16,16,16, 8, 9,10,11,16,16,16,16} +#define VEC_AVG_SELECT3_4 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,15} + +#define VEC_AVG_SELECT1_3 (vector unsigned char){16,16,16, 3, 4, 5,16,16,16,16,16,16,16,16,16,16} +#define VEC_AVG_SELECT2_3 (vector unsigned char){16,16,16,16,16,16, 6, 7, 8,16,16,16,16,16,16,16} +#define VEC_AVG_SELECT3_3 (vector unsigned char){16,16,16,16,16,16,16,16,16, 9,10,11,16,16,16,16} +#define VEC_AVG_SELECT4_3 (vector unsigned 
char){16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,16} + void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { @@ -224,53 +233,187 @@ void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row, void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { - png_size_t i; - png_bytep rp = row; - png_const_bytep pp = prev_row; - unsigned int bpp = (row_info->pixel_depth + 7) >> 3; - png_size_t istop = row_info->rowbytes - bpp; + const unsigned int bpp = 4; + png_size_t i; - for (i = 0; i < bpp; i++) - { - *rp = (png_byte)(((int)(*rp) + - ((int)(*pp++) / 2 )) & 0xff); + png_size_t unaligned_top = 16 - ((png_size_t)row % 16); + png_size_t istop = row_info->rowbytes - unaligned_top; + + png_bytep rp = row; + png_const_bytep pp = prev_row; - rp++; - } + vector unsigned char rp_vec; + vector unsigned char pp_vec; + vector unsigned char pp_part_vec; + vector unsigned char rp_part_vec; + vector unsigned char avg_vec; + vector unsigned char zero_vec = {0}; - for (i = 0; i < istop; i++) - { - *rp = (png_byte)(((int)(*rp) + - (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + for (i = 0; i < bpp; i++) + { + *rp = (png_byte)(((int)(*rp) + + ((int)(*pp++) / 2 )) & 0xff); - rp++; - } + rp++; + } + + /* Altivec operations require 16-byte aligned data + * but input can be unaligned. So we calculate + * unaligned part as usual. 
+ */ + for (i = bpp; i < unaligned_top; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } + + /* Using SIMD while we can */ + while( istop >= 16 ) + { + for(i=0;i < bpp ; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } + rp -= bpp; + pp -= bpp; + + vec_ld_unaligned(pp_vec,pp); + rp_vec = vec_ld(0,rp); + + rp_part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT1_4); + pp_part_vec = vec_perm(pp_vec,zero_vec,VEC_AVG_SELECT1_4); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + rp_part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT2_4); + pp_part_vec = vec_perm(pp_vec,zero_vec,VEC_AVG_SELECT2_4); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + rp_part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT3_4); + pp_part_vec = vec_perm(pp_vec,zero_vec,VEC_AVG_SELECT3_4); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + vec_st(rp_vec,0,rp); + + rp += 16; + pp += 16; + istop -= 16; + } + + if(istop % 16 > 0) + for (i = 0; i < istop % 16; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } } void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { - png_size_t i; - png_bytep rp = row; - png_const_bytep pp = prev_row; - unsigned int bpp = (row_info->pixel_depth + 7) >> 3; - png_size_t istop = row_info->rowbytes - bpp; + const unsigned int bpp = 3; + png_size_t i; - for (i = 0; i < bpp; i++) - { - *rp = (png_byte)(((int)(*rp) + - ((int)(*pp++) / 2 )) & 0xff); + png_size_t unaligned_top = 16 - ((png_size_t)row % 16); + png_size_t istop = 
row_info->rowbytes - unaligned_top; + + png_bytep rp = row; + png_const_bytep pp = prev_row; - rp++; - } + vector unsigned char rp_vec; + vector unsigned char pp_vec; + vector unsigned char pp_part_vec; + vector unsigned char rp_part_vec; + vector unsigned char avg_vec; + vector unsigned char zero_vec = {0}; - for (i = 0; i < istop; i++) - { - *rp = (png_byte)(((int)(*rp) + - (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + for (i = 0; i < bpp; i++) + { + *rp = (png_byte)(((int)(*rp) + + ((int)(*pp++) / 2 )) & 0xff); - rp++; - } + rp++; + } + + /* Altivec operations require 16-byte aligned data + * but input can be unaligned. So we calculate + * unaligned part as usual. + */ + for (i = bpp; i < unaligned_top; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } + + /* Using SIMD while we can */ + while( istop >= 16 ) + { + for(i=0;i < bpp ; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } + rp -= bpp; + pp -= bpp; + + vec_ld_unaligned(pp_vec,pp); + rp_vec = vec_ld(0,rp); + + rp_part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT1_3); + pp_part_vec = vec_perm(pp_vec,zero_vec,VEC_AVG_SELECT1_3); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + rp_part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT2_3); + pp_part_vec = vec_perm(pp_vec,zero_vec,VEC_AVG_SELECT2_3); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + rp_part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT3_3); + pp_part_vec = vec_perm(pp_vec,zero_vec,VEC_AVG_SELECT3_3); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + vec_st(rp_vec,0,rp); + + rp += 16; + pp += 16; + 
istop -= 16; + /* Since 16 % bpp = 16 % 3 = 1, last element of array must + * be proceeded manually + */ + *(rp - 1) += ((int)(*(pp-1) + *(rp-1-bpp)) / 2 ) & 0xff; + + } + + if(istop % 16 > 0) + for (i = 0; i < istop % 16; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } + } void png_read_filter_row_paeth4_vsx(png_row_infop row_info, From 565d4beea2b4136ea259b8e89962395205fa65cd Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Sun, 29 Jan 2017 19:07:12 +0300 Subject: [PATCH 11/37] Updated CFLAGS recomendations --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index d378792e6..2aeb3b6f3 100644 --- a/configure.ac +++ b/configure.ac @@ -420,7 +420,7 @@ AS_HELP_STRING([[[--enable-powerpc-vsx]]], AC_DEFINE([PNG_POWERPC_VSX_OPT], [2], [Enable POWERPC VSX optimizations]) AC_MSG_WARN([--enable-powerpc-vsx: please specify 'check' or 'api', if] - [you want the optimizations unconditionally pass '-maltivec -mabi=altivec'] + [you want the optimizations unconditionally pass '-maltivec -mvsx'] [or '-mcpu=power8'to the compiler.]);; *) AC_MSG_ERROR([--enable-powerpc-vsx=${enable_powerpc_vsx}: invalid value]) From acb155d8939e11198ca788310bed7a5619d9c4f3 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Tue, 31 Jan 2017 04:05:19 +0300 Subject: [PATCH 12/37] Implemented filter_paeth PowerPC VSX variant --- powerpc/filter_vsx_intrinsics.c | 350 ++++++++++++++++++++++++-------- 1 file changed, 265 insertions(+), 85 deletions(-) diff --git a/powerpc/filter_vsx_intrinsics.c b/powerpc/filter_vsx_intrinsics.c index 40bd81a56..2f2ca9a98 100644 --- a/powerpc/filter_vsx_intrinsics.c +++ b/powerpc/filter_vsx_intrinsics.c @@ -108,7 +108,15 @@ void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, #define VEC_AVG_SELECT2_3 (vector unsigned char){16,16,16,16,16,16, 6, 7, 8,16,16,16,16,16,16,16} #define VEC_AVG_SELECT3_3 (vector unsigned char){16,16,16,16,16,16,16,16,16, 
9,10,11,16,16,16,16} #define VEC_AVG_SELECT4_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,16} - + + +#ifdef PNG_USE_ABS +# define vsx_abs(number) abs(number) +#else +# define vsx_abs(number) (number > 0) ? (number) : -(number) +#endif + + void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { @@ -416,101 +424,273 @@ void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row, } -void png_read_filter_row_paeth4_vsx(png_row_infop row_info, - png_bytep row, - png_const_bytep prev_row) +/* Bytewise c ? t : e. */ +#define if_then_else(c,t,e) vec_sel(e,t,c) + +#define vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) {\ + c = *(pp - bpp);\ + a = *(rp - bpp);\ + b = *pp++;\ + p = b - c;\ + pc = a - c;\ + pa = vsx_abs(p);\ + pb = vsx_abs(pc);\ + pc = vsx_abs(p + pc);\ + if (pb < pa) pa = pb, a = b;\ + if (pc < pa) a = c;\ + a += *rp;\ + *rp++ = (png_byte)a;\ + } + +void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) { - unsigned int bpp = (row_info->pixel_depth + 7) >> 3; - png_bytep rp_end = row + bpp; + const unsigned int bpp = 4; + png_size_t i; - /* Process the first pixel in the row completely (this is the same as 'up' - * because there is only one candidate predictor for the first row). 
- */ - while (row < rp_end) - { - int a = *row + *prev_row++; - *row++ = (png_byte)a; - } + png_size_t unaligned_top = 16 - ((png_size_t)row % 16); + png_size_t istop = row_info->rowbytes - unaligned_top; + + png_bytep rp = row; + png_const_bytep pp = prev_row; - /* Remainder */ - rp_end = rp_end + (row_info->rowbytes - bpp); + int a, b, c, pa, pb, pc, p; + vector unsigned char rp_vec; + vector unsigned char pp_vec; + vector unsigned char a_vec,b_vec,c_vec,nearest_vec; + vector signed char pa_vec,pb_vec,pc_vec,smallest_vec; + vector unsigned char zero_vec = {0}; - while (row < rp_end) - { - int a, b, c, pa, pb, pc, p; + /* Process the first pixel in the row completely (this is the same as 'up' + * because there is only one candidate predictor for the first row). + */ + for(i = 0; i < bpp ; i++) + { + *rp = (png_byte)( *rp + *pp); + rp++; + pp++; + } - c = *(prev_row - bpp); - a = *(row - bpp); - b = *prev_row++; + for(i = bpp; i < unaligned_top ; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } - p = b - c; - pc = a - c; + while( istop > 16) + { + for(i = 0; i < bpp ; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } - #ifdef PNG_USE_ABS - pa = abs(p); - pb = abs(pc); - pc = abs(p + pc); - #else - pa = p < 0 ? -p : p; - pb = pc < 0 ? -pc : pc; - pc = (p + pc) < 0 ? 
-(p + pc) : p + pc; - #endif + rp -= bpp; + pp -= bpp; + rp_vec = vec_ld(0,rp); + vec_ld_unaligned(pp_vec,pp); - if (pb < pa) pa = pb, a = b; - if (pc < pa) a = c; + a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT1_4); + b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT1_4); + c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT1_4); + pa_vec = (vector signed char) vec_sub(b_vec,c_vec); + pb_vec = (vector signed char) vec_sub(a_vec , c_vec); + pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else(vec_cmpeq(pb_vec,smallest_vec),b_vec,c_vec) + ); + rp_vec = vec_add(rp_vec, nearest_vec); - a += *row; - *row++ = (png_byte)a; - } + a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT2_4); + b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT2_4); + c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT2_4); + pa_vec = (vector signed char) vec_sub(b_vec,c_vec); + pb_vec = (vector signed char) vec_sub(a_vec , c_vec); + pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else(vec_cmpeq(pb_vec,smallest_vec),b_vec,c_vec) + ); + + rp_vec = vec_add(rp_vec, nearest_vec); + + a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT3_4); + b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT3_4); + c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT3_4); + pa_vec = (vector signed char) vec_sub(b_vec,c_vec); + pb_vec = (vector signed char) vec_sub(a_vec , c_vec); + pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + 
nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else(vec_cmpeq(pb_vec,smallest_vec),b_vec,c_vec) + ); + + rp_vec = vec_add(rp_vec, nearest_vec); + + vec_st(rp_vec,0,rp); + + rp += 16; + pp += 16; + istop -= 16; + } + + if(istop > 0) + for (i = 0; i < istop % 16; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } +} + +void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + const unsigned int bpp = 3; + png_size_t i; + + png_size_t unaligned_top = 16 - ((png_size_t)row % 16); + png_size_t istop = row_info->rowbytes - unaligned_top; + + png_bytep rp = row; + png_const_bytep pp = prev_row; + + int a, b, c, pa, pb, pc, p; + vector unsigned char rp_vec; + vector unsigned char pp_vec; + vector unsigned char a_vec,b_vec,c_vec,nearest_vec; + vector signed char pa_vec,pb_vec,pc_vec,smallest_vec; + vector unsigned char zero_vec = {0}; + + /* Process the first pixel in the row completely (this is the same as 'up' + * because there is only one candidate predictor for the first row). 
+ */ + for(i = 0; i < bpp ; i++) + { + *rp = (png_byte)( *rp + *pp); + rp++; + pp++; + } + + for(i = bpp; i < unaligned_top ; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } + + while( istop > 16) + { + for(i = 0; i < bpp ; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } + + rp -= bpp; + pp -= bpp; + rp_vec = vec_ld(0,rp); + vec_ld_unaligned(pp_vec,pp); + + a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT1_3); + b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT1_3); + c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT1_3); + pa_vec = (vector signed char) vec_sub(b_vec,c_vec); + pb_vec = (vector signed char) vec_sub(a_vec , c_vec); + pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else(vec_cmpeq(pb_vec,smallest_vec),b_vec,c_vec) + ); + rp_vec = vec_add(rp_vec, nearest_vec); + + a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT2_3); + b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT2_3); + c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT2_3); + pa_vec = (vector signed char) vec_sub(b_vec,c_vec); + pb_vec = (vector signed char) vec_sub(a_vec , c_vec); + pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else(vec_cmpeq(pb_vec,smallest_vec),b_vec,c_vec) + ); + + rp_vec = vec_add(rp_vec, nearest_vec); + + a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT3_3); + b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT3_3); + c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT3_3); + pa_vec = (vector signed char) vec_sub(b_vec,c_vec); + pb_vec = (vector signed char) vec_sub(a_vec , c_vec); + pc_vec = (vector 
signed char) vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else(vec_cmpeq(pb_vec,smallest_vec),b_vec,c_vec) + ); + + rp_vec = vec_add(rp_vec, nearest_vec); + + a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT4_3); + b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT4_3); + c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT4_3); + pa_vec = (vector signed char) vec_sub(b_vec,c_vec); + pb_vec = (vector signed char) vec_sub(a_vec , c_vec); + pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else(vec_cmpeq(pb_vec,smallest_vec),b_vec,c_vec) + ); + + rp_vec = vec_add(rp_vec, nearest_vec); + + + vec_st(rp_vec,0,rp); + + rp += 16-1; + pp += 16-1; + istop -= 16; + /* Since 16 % bpp = 16 % 3 = 1, last element of array must + * be proceeded manually + */ + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } + + if(istop > 0) + for (i = 0; i < istop % 16; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } } -void png_read_filter_row_paeth3_vsx(png_row_infop row_info, - png_bytep row, - png_const_bytep prev_row) -{ - unsigned int bpp = (row_info->pixel_depth + 7) >> 3; - png_bytep rp_end = row + bpp; - - /* Process the first pixel in the row completely (this is the same as 'up' - * because there is only one candidate predictor for the first row). 
- */ - while (row < rp_end) - { - int a = *row + *prev_row++; - *row++ = (png_byte)a; - } - - /* Remainder */ - rp_end = rp_end + (row_info->rowbytes - bpp); - - while (row < rp_end) - { - int a, b, c, pa, pb, pc, p; - - c = *(prev_row - bpp); - a = *(row - bpp); - b = *prev_row++; - - p = b - c; - pc = a - c; - - #ifdef PNG_USE_ABS - pa = abs(p); - pb = abs(pc); - pc = abs(p + pc); - #else - pa = p < 0 ? -p : p; - pb = pc < 0 ? -pc : pc; - pc = (p + pc) < 0 ? -(p + pc) : p + pc; - #endif - - if (pb < pa) pa = pb, a = b; - if (pc < pa) a = c; - - a += *row; - *row++ = (png_byte)a; - } -} #endif /* PNG_POWERPC_VSX_OPT > 0 */ #endif /* PNG_POWERPC_VSX_IMPLEMENTATION == 1 (intrinsics) */ From 3907feb3068e7b3c52489ac021384ae8240e2395 Mon Sep 17 00:00:00 2001 From: debian Date: Tue, 31 Jan 2017 03:50:45 +0000 Subject: [PATCH 13/37] Added ppc64le to arch's in configure.ac script --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 61fbafc34..2ee4f1d63 100644 --- a/configure.ac +++ b/configure.ac @@ -469,7 +469,7 @@ AS_HELP_STRING([[[--enable-powerpc-vsx]]], AM_CONDITIONAL([PNG_POWERPC_VSX], [test "$enable_powerpc_vsx" != 'no' && case "$host_cpu" in - powerpc*) :;; + powerpc*|ppc64*) :;; esac]) AC_MSG_NOTICE([[Extra options for compiler: $PNG_COPTS]]) From d0d310f1cb7bd5ddb62850a84da14e8b1e65d9c8 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Tue, 31 Jan 2017 03:56:23 +0000 Subject: [PATCH 14/37] Added cmake support for PowerPC VSX optimizations --- CMakeLists.txt | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7bbfe58e2..4a15c7167 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -109,6 +109,36 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^arm" OR endif() endif() +# set definitions and sources for powerpc +if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^powerpc*" OR + ${CMAKE_SYSTEM_PROCESSOR} MATCHES "^ppc64*" ) + 
set(PNG_POWERPC_VSX_POSSIBLE_VALUES check on off) + set(PNG_POWERPC_VSX "check" CACHE STRING "Enable POWERPC VSX optimizations: + check: (default) use internal checking code; + off: disable the optimizations; + on: turn on unconditionally.") + set_property(CACHE PNG_POWERPC_VSX PROPERTY STRINGS + ${PNG_POWERPC_VSX_POSSIBLE_VALUES}) + list(FIND PNG_POWERPC_VSX_POSSIBLE_VALUES ${PNG_POWERPC_VSX} index) + if(index EQUAL -1) + message(FATAL_ERROR + " PNG_POWERPC_VSX must be one of [${PNG_POWERPC_VSX_POSSIBLE_VALUES}]") + elseif(NOT ${PNG_POWERPC_VSX} STREQUAL "no") + set(libpng_powerpc_sources + powerpc/powerpc_init.c + powerpc/filter_vsx_intrinsics.c) + + if(${PNG_POWERPC_VSX} STREQUAL "on") + add_definitions(-DPNG_POWERPC_VSX_OPT=2) + elseif(${PNG_POWERPC_VSX} STREQUAL "check") + add_definitions(-DPNG_POWERPC_VSX_CHECK_SUPPORTED) + endif() + else() + add_definitions(-DPNG_POWERPC_VSX_OPT=0) + endif() +endif() + + # SET LIBNAME set(PNG_LIB_NAME png${PNGLIB_MAJOR}${PNGLIB_MINOR}) @@ -400,6 +430,7 @@ set(libpng_sources pngwtran.c pngwutil.c ${libpng_arm_sources} + ${libpng_powerpc_sources} ) set(pngtest_sources pngtest.c From bea573d370e78e9b81f7098b7abb0b65460f7edc Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Tue, 31 Jan 2017 11:59:11 +0000 Subject: [PATCH 15/37] Added auxv-based VSX detection method for PowerPC and enabled it by default --- contrib/powerpc-vsx/linux_aux.c | 34 +++++++++++++++++++++++++++++++++ powerpc/powerpc_init.c | 2 +- 2 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 contrib/powerpc-vsx/linux_aux.c diff --git a/contrib/powerpc-vsx/linux_aux.c b/contrib/powerpc-vsx/linux_aux.c new file mode 100644 index 000000000..ceb2ab0c3 --- /dev/null +++ b/contrib/powerpc-vsx/linux_aux.c @@ -0,0 +1,34 @@ +/* contrib/powerpc-vsx/linux.c + * + * Copyright (c) 2016 Glenn Randers-Pehrson + * Written by Vadim Barkov, 2017. + * + * This code is released under the libpng license. 
+ * For conditions of distribution and use, see the disclaimer + * and license in png.h + * + * SEE contrib/powerpc-vsx/README before reporting bugs + * + * STATUS: COMPILED + * BUG REPORTS: png-mng-implement@sourceforge.net + * + * png_have_vsx implemented for Linux by reading the widely available + * pseudo-file /proc/cpuinfo. Result is cached so if function will be called + * multiple times only one reading is perfomed. + * + * This code is strict ANSI-C and is probably moderately portable; it does + * however use and it assumes that /proc/cpuinfo is never localized. + */ + +#include "sys/auxv.h" +#include "png.h" + +static int +png_have_vsx(png_structp png_ptr) +{ + const unsigned long auxv = getauxval( AT_HWCAP ); + if(auxv & (PPC_FEATURE_HAS_ALTIVEC|PPC_FEATURE_HAS_VSX )) + return 1; + else + return 0; +} diff --git a/powerpc/powerpc_init.c b/powerpc/powerpc_init.c index d3aeb28db..d406893ce 100644 --- a/powerpc/powerpc_init.c +++ b/powerpc/powerpc_init.c @@ -32,7 +32,7 @@ */ #ifndef PNG_POWERPC_VSX_FILE # ifdef __linux__ -# define PNG_POWERPC_VSX_FILE "contrib/powerpc-vsx/linux.c" +# define PNG_POWERPC_VSX_FILE "contrib/powerpc-vsx/linux_aux.c" # endif #endif From 2cc569eab73ee61457f004d35b83c55197c2f234 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Tue, 31 Jan 2017 12:00:02 +0000 Subject: [PATCH 16/37] Removed remark about poor supporting VSX check code --- configure.ac | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index 2ee4f1d63..73f7183e5 100644 --- a/configure.ac +++ b/configure.ac @@ -436,8 +436,8 @@ AC_ARG_ENABLE([powerpc-vsx], AS_HELP_STRING([[[--enable-powerpc-vsx]]], [Enable POWERPC VSX optimizations: =no/off, check, api, yes/on:] [no/off: disable the optimizations; check: use internal checking code] - [(deprecated and poorly supported); api: disable by default, enable by] - [a call to png_set_option; yes/on: turn on unconditionally.] 
+ [api: disable by default, enable by a call to png_set_option] + [yes/on: turn on unconditionally.] [If not specified: determined by the compiler.]), [case "$enableval" in no|off) From 80041d15b2c27d42a9e15748330f3e2cb7193401 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Tue, 31 Jan 2017 15:07:14 +0300 Subject: [PATCH 17/37] Update contrib/powerpc/linux_aux.c header --- contrib/powerpc-vsx/linux_aux.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/powerpc-vsx/linux_aux.c b/contrib/powerpc-vsx/linux_aux.c index ceb2ab0c3..8987daf30 100644 --- a/contrib/powerpc-vsx/linux_aux.c +++ b/contrib/powerpc-vsx/linux_aux.c @@ -1,4 +1,4 @@ -/* contrib/powerpc-vsx/linux.c +/* contrib/powerpc-vsx/linux_aux.c * * Copyright (c) 2016 Glenn Randers-Pehrson * Written by Vadim Barkov, 2017. From 0aa1b967f083628c031888ba87d98e92bd1d4f6f Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Tue, 31 Jan 2017 15:09:48 +0300 Subject: [PATCH 18/37] Removed PNG_ALIGN dependency from VSX since it is not needed --- pngpriv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pngpriv.h b/pngpriv.h index 60080ad24..3c2387e00 100644 --- a/pngpriv.h +++ b/pngpriv.h @@ -191,7 +191,7 @@ #endif #ifndef PNG_POWERPC_VSX_OPT -# if defined(__ppc64__) && defined(__ALTIVEC__) && defined(PNG_ALIGNED_MEMORY_SUPPORTED) +# if defined(__ppc64__) && defined(__ALTIVEC__) && defined(__VSX__) # define PNG_POWERPC_VSX_OPT 2 # else # define PNG_POWERPC_VSX_OPT 0 From ab2a639a2a5588b9fc452626c663ed4cee9d2427 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Tue, 31 Jan 2017 15:12:00 +0300 Subject: [PATCH 19/37] Removed trailing spaces --- pngrutil.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pngrutil.c b/pngrutil.c index ae9478133..36820cf2f 100644 --- a/pngrutil.c +++ b/pngrutil.c @@ -3797,7 +3797,7 @@ png_read_filter_row_sub(png_row_infop row_info, png_bytep row, png_size_t i; png_size_t istop = row_info->rowbytes; unsigned int bpp = 
(row_info->pixel_depth + 7) >> 3; - png_bytep rp = row + bpp; + png_bytep rp = row + bpp; PNG_UNUSED(prev_row) From f2b829166d0973840c256d8987eb41093c0c1b60 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Tue, 31 Jan 2017 12:13:39 +0000 Subject: [PATCH 20/37] Removed commentary about contrib/powerpc/READM --- contrib/powerpc-vsx/linux.c | 2 -- contrib/powerpc-vsx/linux_aux.c | 2 -- 2 files changed, 4 deletions(-) diff --git a/contrib/powerpc-vsx/linux.c b/contrib/powerpc-vsx/linux.c index c522f0ddb..162d86f5c 100644 --- a/contrib/powerpc-vsx/linux.c +++ b/contrib/powerpc-vsx/linux.c @@ -7,8 +7,6 @@ * For conditions of distribution and use, see the disclaimer * and license in png.h * - * SEE contrib/powerpc-vsx/README before reporting bugs - * * STATUS: COMPILED * BUG REPORTS: png-mng-implement@sourceforge.net * diff --git a/contrib/powerpc-vsx/linux_aux.c b/contrib/powerpc-vsx/linux_aux.c index 8987daf30..e06916234 100644 --- a/contrib/powerpc-vsx/linux_aux.c +++ b/contrib/powerpc-vsx/linux_aux.c @@ -7,8 +7,6 @@ * For conditions of distribution and use, see the disclaimer * and license in png.h * - * SEE contrib/powerpc-vsx/README before reporting bugs - * * STATUS: COMPILED * BUG REPORTS: png-mng-implement@sourceforge.net * From 9bd58f93873549db54702e63857501123dc57987 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Tue, 31 Jan 2017 12:15:45 +0000 Subject: [PATCH 21/37] Removed PNG_ALIGN check in VSX code since it is not used --- powerpc/powerpc_init.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/powerpc/powerpc_init.c b/powerpc/powerpc_init.c index d406893ce..9bbae8b5b 100644 --- a/powerpc/powerpc_init.c +++ b/powerpc/powerpc_init.c @@ -47,10 +47,6 @@ static int png_have_vsx(png_structp png_ptr); #endif /* PNG_POWERPC_VSX_FILE */ #endif /* PNG_POWERPC_VSX_CHECK_SUPPORTED */ -#ifndef PNG_ALIGNED_MEMORY_SUPPORTED -# error "ALIGNED_MEMORY is required; set: -DPNG_ALIGNED_MEMORY_SUPPORTED" -#endif - void png_init_filter_functions_vsx(png_structp pp, 
unsigned int bpp) { From 19425d32f30c202f8ef28ab4020131de382907b3 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Tue, 31 Jan 2017 16:48:17 +0300 Subject: [PATCH 22/37] Update README --- README | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README b/README index 34e7ae1f7..5d3a4c77c 100644 --- a/README +++ b/README @@ -179,8 +179,10 @@ Files in this distribution: pngwtran.c => Write data transformations pngwutil.c => Write utility functions arm => Contains optimized code for the ARM platform + powerpc => Contains optimized code for the PowerPC platform contrib => Contributions arm-neon => Optimized code for ARM-NEON platform + powerpc => Optimized code for POWERPC-VSX platform examples => Example programs gregbook => source code for PNG reading and writing, from Greg Roelofs' "PNG: The Definitive Guide", From d57bed783845bc1c92f3ac7b95a8b4022091231c Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Wed, 1 Feb 2017 13:57:50 +0000 Subject: [PATCH 23/37] Fixed mixed tabs and spacing and comments in contrib/powerpc/linux.c --- contrib/powerpc-vsx/linux.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/contrib/powerpc-vsx/linux.c b/contrib/powerpc-vsx/linux.c index 162d86f5c..c0a4eca2d 100644 --- a/contrib/powerpc-vsx/linux.c +++ b/contrib/powerpc-vsx/linux.c @@ -11,8 +11,7 @@ * BUG REPORTS: png-mng-implement@sourceforge.net * * png_have_vsx implemented for Linux by reading the widely available - * pseudo-file /proc/cpuinfo. Result is cached so if function will be called - * multiple times only one reading is perfomed. + * pseudo-file /proc/cpuinfo. * * This code is strict ANSI-C and is probably moderately portable; it does * however use and it assumes that /proc/cpuinfo is never localized. 
@@ -43,13 +42,13 @@ png_have_vsx(png_structp png_ptr) while(fgets(input,MAXLINE,f) != NULL) { token = strstr(input,string); - if(token != NULL) - return cachedResult; + if(token != NULL) + return 1; } } #ifdef PNG_WARNINGS_SUPPORTED else png_warning(png_ptr, "/proc/cpuinfo open failed"); #endif - return cachedResult; + return 0; } From b1be78460a93caf6badee87fc6031e2084dbb112 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Wed, 1 Feb 2017 13:58:54 +0000 Subject: [PATCH 24/37] Fixed mixed tabs and spaces in contrib/powerpc/linux_aux.c --- contrib/powerpc-vsx/linux_aux.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/contrib/powerpc-vsx/linux_aux.c b/contrib/powerpc-vsx/linux_aux.c index e06916234..c21120cf3 100644 --- a/contrib/powerpc-vsx/linux_aux.c +++ b/contrib/powerpc-vsx/linux_aux.c @@ -10,9 +10,7 @@ * STATUS: COMPILED * BUG REPORTS: png-mng-implement@sourceforge.net * - * png_have_vsx implemented for Linux by reading the widely available - * pseudo-file /proc/cpuinfo. Result is cached so if function will be called - * multiple times only one reading is perfomed. + * png_have_vsx implemented for Linux by using the auxiliary vector mechanism. * * This code is strict ANSI-C and is probably moderately portable; it does * however use and it assumes that /proc/cpuinfo is never localized. 
@@ -26,7 +24,8 @@ png_have_vsx(png_structp png_ptr) { const unsigned long auxv = getauxval( AT_HWCAP ); if(auxv & (PPC_FEATURE_HAS_ALTIVEC|PPC_FEATURE_HAS_VSX )) - return 1; + return 1; else - return 0; + return 0; } + From b42e8bce3a1d4af3e648dc3675fbb565b579e42c Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Wed, 1 Feb 2017 14:03:57 +0000 Subject: [PATCH 25/37] Added PNG_UNUSED macro in contrib/powerpc/linux{_aux}.c --- contrib/powerpc-vsx/linux.c | 2 ++ contrib/powerpc-vsx/linux_aux.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/contrib/powerpc-vsx/linux.c b/contrib/powerpc-vsx/linux.c index c0a4eca2d..f54f65f59 100644 --- a/contrib/powerpc-vsx/linux.c +++ b/contrib/powerpc-vsx/linux.c @@ -35,6 +35,8 @@ png_have_vsx(png_structp png_ptr) char input[MAXLINE]; char *token = NULL; + PNG_UNUSED(png_ptr) + f = fopen("/proc/cpuinfo", "r"); if (f != NULL) { diff --git a/contrib/powerpc-vsx/linux_aux.c b/contrib/powerpc-vsx/linux_aux.c index c21120cf3..058bb02b3 100644 --- a/contrib/powerpc-vsx/linux_aux.c +++ b/contrib/powerpc-vsx/linux_aux.c @@ -22,7 +22,11 @@ static int png_have_vsx(png_structp png_ptr) { + const unsigned long auxv = getauxval( AT_HWCAP ); + + PNG_UNUSED(png_ptr) + if(auxv & (PPC_FEATURE_HAS_ALTIVEC|PPC_FEATURE_HAS_VSX )) return 1; else From 8a242668fda04f5bb162b1a09566c4df99b2b6d8 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Wed, 1 Feb 2017 14:26:02 +0000 Subject: [PATCH 26/37] Added warning with supported OSes list for VSX check --- CMakeLists.txt | 2 + configure.ac | 4 +- contrib/powerpc-vsx/README | 81 +++++++++++++++++++++++++++++++++ contrib/powerpc-vsx/linux.c | 2 +- contrib/powerpc-vsx/linux_aux.c | 2 +- 5 files changed, 88 insertions(+), 3 deletions(-) create mode 100644 contrib/powerpc-vsx/README diff --git a/CMakeLists.txt b/CMakeLists.txt index 4a15c7167..5d2a364d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -132,6 +132,8 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^powerpc*" OR 
add_definitions(-DPNG_POWERPC_VSX_OPT=2) elseif(${PNG_POWERPC_VSX} STREQUAL "check") add_definitions(-DPNG_POWERPC_VSX_CHECK_SUPPORTED) + message(WARNING + " Please check contrib/powerpc-vsx/README file for the list of supported OSes.") endif() else() add_definitions(-DPNG_POWERPC_VSX_OPT=0) endif() diff --git a/configure.ac b/configure.ac index 73f7183e5..e19474e21 100644 --- a/configure.ac +++ b/configure.ac @@ -448,7 +448,9 @@ AS_HELP_STRING([[[--enable-powerpc-vsx]]], enable_powerpc_vsx=no;; check) AC_DEFINE([PNG_POWERPC_VSX_CHECK_SUPPORTED], [], - [Check for POWERPC VSX support at run-time]);; + [Check for POWERPC VSX support at run-time]) + AC_MSG_WARN([--enable-powerpc-vsx Please check contrib/powerpc-vsx/README file] + [for the list of supported OSes.]);; api) AC_DEFINE([PNG_POWERPC_VSX_API_SUPPORTED], [], [Turn on POWERPC VSX optimizations at run-time]);; diff --git a/contrib/powerpc-vsx/README b/contrib/powerpc-vsx/README new file mode 100644 index 000000000..e566147ea --- /dev/null +++ b/contrib/powerpc-vsx/README @@ -0,0 +1,81 @@ +OPERATING SYSTEM SPECIFIC POWERPC DETECTION +-------------------------------------------- + +Detection of the ability to execute POWERPC VSX instructions on the processor requires +operating system support. (The information is not available in user mode.) + +Currently this feature is supported only on the Linux platform. + +HOW TO USE THIS +--------------- + +This directory contains C code fragments that can be included in powerpc/powerpc_init.c +by setting the macro PNG_POWERPC_VSX_FILE to the file name in "" or <> at build +time. This setting is not recorded in pnglibconf.h and can be changed simply by +rebuilding powerpc/powerpc_init.o with the required macro definition. + +For any of this code to be used the POWERPC code must be enabled and run time +checks must be supported.
I.e.: + +#if PNG_POWERPC_VSX_OPT > 0 +#ifdef PNG_POWERPC_VSX_CHECK_SUPPORTED + +This is done in a 'configure' build by passing configure the argument: + + --enable-powerpc-vsx=check + +FILE FORMAT +----------- + +Each file documents its testing status as of the last time it was tested (which +may have been a long time ago): + +STATUS: one of: + SUPPORTED: This indicates that the file is included in the regularly + performed test builds and bugs are fixed when discovered. + COMPILED: This indicates that the code did compile at least once. See the + more detailed description for the extent to which the result was + successful. + TESTED: This means the code was fully compiled into the libpng test programs + and these were run at least once. + +BUG REPORTS: an email address to which to send reports of problems + +The file is a fragment of C code. It should not define any 'extern' symbols; +everything should be static. It must define the function: + +static int png_have_vsx(png_structp png_ptr); + +That function must return 1 if POWERPC VSX instructions are supported, 0 if not. +It must not execute png_error unless it detects a bug. A png_error will prevent +the reading of the PNG and in the future, writing too. + +BUG REPORTS +----------- + +If you mail a bug report for any file that is not SUPPORTED there may only be +limited response. Consider fixing it and sending a patch to fix the problem - +this is more likely to result in action. + +CONTRIBUTIONS +------------- + +You may send contributions of new implementations to +png-mng-implement@sourceforge.net. Please write code in strict C90 C where +possible. Obviously OS dependencies are to be expected. If you submit code you +must have the author's permission and it must have a license that is acceptable +to the current maintainer; in particular that license must permit modification +and redistribution.
+ +Please try to make the contribution a single file and give the file a clear and +unambiguous name that identifies the target OS. If multiple files really are +required put them all in a sub-directory. + +You must also be prepared to handle bug reports from users of the code, either +by joining the png-mng-implement mailing list or by providing an email for the +"BUG REPORTS" entry or both. Please make sure that the header of the file +contains the STATUS and BUG REPORTS fields as above. + +Please list the OS requirements as precisely as possible. Ideally you should +also list the environment in which the code has been tested and certainly list +any environments where you suspect it might not work. diff --git a/contrib/powerpc-vsx/linux.c b/contrib/powerpc-vsx/linux.c index f54f65f59..8fc76e34c 100644 --- a/contrib/powerpc-vsx/linux.c +++ b/contrib/powerpc-vsx/linux.c @@ -7,7 +7,7 @@ * For conditions of distribution and use, see the disclaimer * and license in png.h * - * STATUS: COMPILED + * STATUS: TESTED * BUG REPORTS: png-mng-implement@sourceforge.net * * png_have_vsx implemented for Linux by reading the widely available diff --git a/contrib/powerpc-vsx/linux_aux.c b/contrib/powerpc-vsx/linux_aux.c index 058bb02b3..d946d6cad 100644 --- a/contrib/powerpc-vsx/linux_aux.c +++ b/contrib/powerpc-vsx/linux_aux.c @@ -7,7 +7,7 @@ * For conditions of distribution and use, see the disclaimer * and license in png.h * - * STATUS: COMPILED + * STATUS: TESTED * BUG REPORTS: png-mng-implement@sourceforge.net * * png_have_vsx implemented for Linux by using the auxiliary vector mechanism. 
From ae15e839d9aafd26d0db305fe6ba7fa0bd55c973 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Thu, 2 Feb 2017 04:51:56 +0000 Subject: [PATCH 27/37] Small bugfix --- powerpc/filter_vsx_intrinsics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/powerpc/filter_vsx_intrinsics.c b/powerpc/filter_vsx_intrinsics.c index 2f2ca9a98..c6426e8d9 100644 --- a/powerpc/filter_vsx_intrinsics.c +++ b/powerpc/filter_vsx_intrinsics.c @@ -26,7 +26,7 @@ # error "This code requires VSX support (POWER8 and later). Please provide -mvsx compiler flag." #endif -#define vec_ld_unaligned(vec,data) vec = vec_vsx_ld(0,pp) +#define vec_ld_unaligned(vec,data) vec = vec_vsx_ld(0,data) #define vec_st_unaligned(vec,data) vec_vsx_st(vec,0,data) From 58e9d5d59d5defad892a8dc21c3401c075720989 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Thu, 2 Feb 2017 22:37:36 +0000 Subject: [PATCH 28/37] Refactoring --- powerpc/filter_vsx_intrinsics.c | 80 +++++++++++++-------------------- 1 file changed, 31 insertions(+), 49 deletions(-) diff --git a/powerpc/filter_vsx_intrinsics.c b/powerpc/filter_vsx_intrinsics.c index c6426e8d9..8e649a192 100644 --- a/powerpc/filter_vsx_intrinsics.c +++ b/powerpc/filter_vsx_intrinsics.c @@ -39,18 +39,25 @@ * ( this is taken from ../intel/filter_sse2_intrinsics.c ) */ +#define declare_common_vars(row_info,row,prev_row) \ + png_size_t i;\ + png_bytep rp = row;\ + png_const_bytep pp = prev_row;\ + png_size_t unaligned_top = 16 - (((png_size_t)row % 16));\ + png_size_t istop;\ + if(unaligned_top == 16)\ + unaligned_top = 0;\ + istop = row_info->rowbytes - unaligned_top; + + void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { - png_size_t i; - png_size_t unaligned_top = 16 - ((png_size_t)row % 16); - png_size_t istop = row_info->rowbytes - unaligned_top; - png_bytep rp = row; - png_const_bytep pp = prev_row; - vector unsigned char rp_vec; vector unsigned char pp_vec; + declare_common_vars(row_info,row,prev_row) + 
/* Altivec operations require 16-byte aligned data * but input can be unaligned. So we calculate * unaligned part as usual. @@ -120,18 +127,16 @@ void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { - png_size_t i; - png_size_t unaligned_top = 16 - ((png_size_t)row % 16); - png_size_t istop = row_info->rowbytes - unaligned_top; - const unsigned int bpp = 4; - png_bytep rp = row + bpp; vector unsigned char rp_vec; vector unsigned char part_vec; vector unsigned char zero_vec = {0}; - PNG_UNUSED(prev_row) + declare_common_vars(row_info,row,prev_row) + rp += bpp; + + PNG_UNUSED(pp) /* Altivec operations require 16-byte aligned data * but input can be unaligned. So we calculate @@ -177,18 +182,15 @@ void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row, void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { - png_size_t i; - png_size_t unaligned_top = 16 - ((png_size_t)row % 16); - png_size_t istop = row_info->rowbytes - unaligned_top; - const unsigned int bpp = 3; - png_bytep rp = row + bpp; vector unsigned char rp_vec; vector unsigned char part_vec; vector unsigned char zero_vec = {0}; - - PNG_UNUSED(prev_row) + + declare_common_vars(row_info,row,prev_row) + rp += bpp; + PNG_UNUSED(pp) /* Altivec operations require 16-byte aligned data * but input can be unaligned. 
So we calculate @@ -242,14 +244,7 @@ void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { const unsigned int bpp = 4; - png_size_t i; - - png_size_t unaligned_top = 16 - ((png_size_t)row % 16); - png_size_t istop = row_info->rowbytes - unaligned_top; - - png_bytep rp = row; - png_const_bytep pp = prev_row; - + vector unsigned char rp_vec; vector unsigned char pp_vec; vector unsigned char pp_part_vec; @@ -257,6 +252,8 @@ void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row, vector unsigned char avg_vec; vector unsigned char zero_vec = {0}; + declare_common_vars(row_info,row,prev_row) + for (i = 0; i < bpp; i++) { *rp = (png_byte)(((int)(*rp) + @@ -332,14 +329,7 @@ void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { const unsigned int bpp = 3; - png_size_t i; - - png_size_t unaligned_top = 16 - ((png_size_t)row % 16); - png_size_t istop = row_info->rowbytes - unaligned_top; - - png_bytep rp = row; - png_const_bytep pp = prev_row; - + vector unsigned char rp_vec; vector unsigned char pp_vec; vector unsigned char pp_part_vec; @@ -347,6 +337,8 @@ void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row, vector unsigned char avg_vec; vector unsigned char zero_vec = {0}; + declare_common_vars(row_info,row,prev_row) + for (i = 0; i < bpp; i++) { *rp = (png_byte)(((int)(*rp) + @@ -446,13 +438,6 @@ void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { const unsigned int bpp = 4; - png_size_t i; - - png_size_t unaligned_top = 16 - ((png_size_t)row % 16); - png_size_t istop = row_info->rowbytes - unaligned_top; - - png_bytep rp = row; - png_const_bytep pp = prev_row; int a, b, c, pa, pb, pc, p; vector unsigned char rp_vec; @@ -461,6 +446,8 @@ void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row, vector signed char pa_vec,pb_vec,pc_vec,smallest_vec; vector unsigned char zero_vec = 
{0}; + declare_common_vars(row_info,row,prev_row) + /* Process the first pixel in the row completely (this is the same as 'up' * because there is only one candidate predictor for the first row). */ @@ -559,13 +546,6 @@ void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { const unsigned int bpp = 3; - png_size_t i; - - png_size_t unaligned_top = 16 - ((png_size_t)row % 16); - png_size_t istop = row_info->rowbytes - unaligned_top; - - png_bytep rp = row; - png_const_bytep pp = prev_row; int a, b, c, pa, pb, pc, p; vector unsigned char rp_vec; @@ -574,6 +554,8 @@ void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row, vector signed char pa_vec,pb_vec,pc_vec,smallest_vec; vector unsigned char zero_vec = {0}; + declare_common_vars(row_info,row,prev_row) + /* Process the first pixel in the row completely (this is the same as 'up' * because there is only one candidate predictor for the first row). */ From d4bdca45b37662a0c25c316829a9594b10d0fc1f Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Wed, 8 Feb 2017 13:58:23 +0000 Subject: [PATCH 29/37] Fixed multiple bugs in VSX filter functions Now up,sub and avg filter VSX functions pass tests --- powerpc/filter_vsx_intrinsics.c | 559 ++++++++++++++++---------------- 1 file changed, 288 insertions(+), 271 deletions(-) diff --git a/powerpc/filter_vsx_intrinsics.c b/powerpc/filter_vsx_intrinsics.c index 8e649a192..9376bc885 100644 --- a/powerpc/filter_vsx_intrinsics.c +++ b/powerpc/filter_vsx_intrinsics.c @@ -39,27 +39,31 @@ * ( this is taken from ../intel/filter_sse2_intrinsics.c ) */ -#define declare_common_vars(row_info,row,prev_row) \ +#define declare_common_vars(row_info,row,prev_row,offset) \ png_size_t i;\ - png_bytep rp = row;\ + png_bytep rp = row + offset;\ png_const_bytep pp = prev_row;\ - png_size_t unaligned_top = 16 - (((png_size_t)row % 16));\ + png_size_t unaligned_top = 16 - (((png_size_t)rp % 16));\ png_size_t istop;\ if(unaligned_top == 
16)\ unaligned_top = 0;\ - istop = row_info->rowbytes - unaligned_top; - + istop = row_info->rowbytes;\ + if((unaligned_top < istop))\ + istop -= unaligned_top;\ + else{\ + unaligned_top = istop;\ + istop = 0;\ + } void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { vector unsigned char rp_vec; vector unsigned char pp_vec; + declare_common_vars(row_info,row,prev_row,0) - declare_common_vars(row_info,row,prev_row) - - /* Altivec operations require 16-byte aligned data - * but input can be unaligned. So we calculate + /* Altivec operations require 16-byte aligned data + * but input can be unaligned. So we calculate * unaligned part as usual. */ for (i = 0; i < unaligned_top; i++) @@ -73,7 +77,7 @@ void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, { rp_vec = vec_ld(0,rp); vec_ld_unaligned(pp_vec,pp); - + rp_vec = vec_add(rp_vec,pp_vec); vec_st(rp_vec,0,rp); @@ -83,7 +87,7 @@ void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, istop -= 16; } - if(istop % 16 > 0) + if(istop > 0) { /* If byte count of row is not divisible by 16 * we will process remaining part as usual @@ -97,14 +101,14 @@ void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, } -#define VEC_SELECT1_4 (vector unsigned char){16,16,16,16,0,1,2,3,16,16,16,16,16,16,16,16} -#define VEC_SELECT2_4 (vector unsigned char){16,16,16,16,16,16,16,16,4,5,6,7,16,16,16,16} -#define VEC_SELECT3_4 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16,8,9,10,11} +#define VEC_SELECT1_4 (vector unsigned char){16,16,16,16, 0, 1, 2, 3,16,16,16,16,16,16,16,16} +#define VEC_SELECT2_4 (vector unsigned char){16,16,16,16,16,16,16,16, 4, 5, 6, 7,16,16,16,16} +#define VEC_SELECT3_4 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 8, 9,10,11} -#define VEC_SELECT1_3 (vector unsigned char){16,16,16,0,1,2,16,16,16,16,16,16,16,16,16,16} -#define VEC_SELECT2_3 (vector unsigned 
char){16,16,16,16,16,16,3,4,5,16,16,16,16,16,16,16} -#define VEC_SELECT3_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,6,7,8,16,16,16,16} -#define VEC_SELECT4_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16,9,10,11,16} +#define VEC_SELECT1_3 (vector unsigned char){16,16,16, 0, 1, 2,16,16,16,16,16,16,16,16,16,16} +#define VEC_SELECT2_3 (vector unsigned char){16,16,16,16,16,16, 3, 4, 5,16,16,16,16,16,16,16} +#define VEC_SELECT3_3 (vector unsigned char){16,16,16,16,16,16,16,16,16, 6, 7, 8,16,16,16,16} +#define VEC_SELECT4_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 9,10,11,16} #define VEC_AVG_SELECT1_4 (vector unsigned char){16,16,16,16, 4, 5, 6, 7,16,16,16,16,16,16,16,16} @@ -116,34 +120,30 @@ void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, #define VEC_AVG_SELECT3_3 (vector unsigned char){16,16,16,16,16,16,16,16,16, 9,10,11,16,16,16,16} #define VEC_AVG_SELECT4_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,16} - #ifdef PNG_USE_ABS # define vsx_abs(number) abs(number) #else # define vsx_abs(number) (number > 0) ? (number) : -(number) #endif - void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { const unsigned int bpp = 4; - + vector unsigned char rp_vec; vector unsigned char part_vec; vector unsigned char zero_vec = {0}; - - declare_common_vars(row_info,row,prev_row) - rp += bpp; + + declare_common_vars(row_info,row,prev_row,bpp) PNG_UNUSED(pp) - /* Altivec operations require 16-byte aligned data - * but input can be unaligned. So we calculate + /* Altivec operations require 16-byte aligned data + * but input can be unaligned. So we calculate * unaligned part as usual. 
*/ - - for (i = bpp; i < unaligned_top; i++) + for (i = 0; i < unaligned_top; i++) { *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); rp++; @@ -151,7 +151,7 @@ void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row, /* Using SIMD while we can */ while( istop >= 16 ) - { + { for(i=0;i < bpp ; i++) *(rp+i) += *(rp+i - bpp); @@ -171,14 +171,16 @@ void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row, istop -= 16; } - if(istop % 16 > 0) + if(istop > 0) for (i = 0; i < istop % 16; i++) { *rp = (png_byte)(((int)(*rp) + (int)(*(rp - bpp))) & 0xff); rp++; } + } + void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { @@ -188,16 +190,15 @@ void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row, vector unsigned char part_vec; vector unsigned char zero_vec = {0}; - declare_common_vars(row_info,row,prev_row) - rp += bpp; + declare_common_vars(row_info,row,prev_row,bpp) + PNG_UNUSED(pp) - /* Altivec operations require 16-byte aligned data - * but input can be unaligned. So we calculate + /* Altivec operations require 16-byte aligned data + * but input can be unaligned. So we calculate * unaligned part as usual. 
*/ - - for (i = bpp; i < unaligned_top; i++) + for (i = 0; i < unaligned_top; i++) { *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); rp++; @@ -205,7 +206,7 @@ void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row, /* Using SIMD while we can */ while( istop >= 16 ) - { + { for(i=0;i < bpp ; i++) *(rp+i) += *(rp+i - bpp); @@ -224,15 +225,15 @@ void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row, vec_st(rp_vec,0,rp); rp += 16; - istop -= 16; - + istop -= 16; + /* Since 16 % bpp = 16 % 3 = 1, last element of array must - * be proceeded manually + * be proceeded manually */ *(rp - 1) += *(rp - 1 - 3); } - if(istop % 16 > 0) + if(istop > 0) for (i = 0; i < istop % 16; i++) { *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); @@ -244,7 +245,7 @@ void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { const unsigned int bpp = 4; - + vector unsigned char rp_vec; vector unsigned char pp_vec; vector unsigned char pp_part_vec; @@ -252,8 +253,11 @@ void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row, vector unsigned char avg_vec; vector unsigned char zero_vec = {0}; - declare_common_vars(row_info,row,prev_row) - + declare_common_vars(row_info,row,prev_row,bpp) + rp -= bpp; + if(istop >= bpp) + istop -= bpp; + for (i = 0; i < bpp; i++) { *rp = (png_byte)(((int)(*rp) + @@ -262,21 +266,21 @@ void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row, rp++; } - /* Altivec operations require 16-byte aligned data - * but input can be unaligned. So we calculate + /* Altivec operations require 16-byte aligned data + * but input can be unaligned. So we calculate * unaligned part as usual. 
*/ - for (i = bpp; i < unaligned_top; i++) + for (i = 0; i < unaligned_top; i++) { *rp = (png_byte)(((int)(*rp) + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); rp++; } - + /* Using SIMD while we can */ while( istop >= 16 ) - { + { for(i=0;i < bpp ; i++) { *rp = (png_byte)(((int)(*rp) + @@ -315,7 +319,7 @@ void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row, istop -= 16; } - if(istop % 16 > 0) + if(istop > 0) for (i = 0; i < istop % 16; i++) { *rp = (png_byte)(((int)(*rp) + @@ -328,92 +332,102 @@ void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row, void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { - const unsigned int bpp = 3; - - vector unsigned char rp_vec; - vector unsigned char pp_vec; - vector unsigned char pp_part_vec; - vector unsigned char rp_part_vec; - vector unsigned char avg_vec; - vector unsigned char zero_vec = {0}; + const unsigned int bpp = 3; - declare_common_vars(row_info,row,prev_row) + vector unsigned char rp_vec; + vector unsigned char pp_vec; + vector unsigned char pp_part_vec; + vector unsigned char rp_part_vec; + vector unsigned char avg_vec; + vector unsigned char zero_vec = {0}; - for (i = 0; i < bpp; i++) - { - *rp = (png_byte)(((int)(*rp) + - ((int)(*pp++) / 2 )) & 0xff); + declare_common_vars(row_info,row,prev_row,bpp) + rp -= bpp; + if(istop >= bpp) + istop -= bpp; - rp++; - } + for (i = 0; i < bpp; i++) + { + *rp = (png_byte)(((int)(*rp) + + ((int)(*pp++) / 2 )) & 0xff); - /* Altivec operations require 16-byte aligned data - * but input can be unaligned. So we calculate - * unaligned part as usual. 
- */ - for (i = bpp; i < unaligned_top; i++) - { - *rp = (png_byte)(((int)(*rp) + - (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + rp++; + } - rp++; - } - - /* Using SIMD while we can */ - while( istop >= 16 ) - { - for(i=0;i < bpp ; i++) - { - *rp = (png_byte)(((int)(*rp) + - (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + /* Altivec operations require 16-byte aligned data + * but input can be unaligned. So we calculate + * unaligned part as usual. + */ + for (i = 0; i < unaligned_top; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); - rp++; - } - rp -= bpp; - pp -= bpp; + rp++; + } - vec_ld_unaligned(pp_vec,pp); - rp_vec = vec_ld(0,rp); + /* Using SIMD while we can */ + while( istop >= 16 ) + { + for(i=0;i < bpp ; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); - rp_part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT1_3); - pp_part_vec = vec_perm(pp_vec,zero_vec,VEC_AVG_SELECT1_3); - avg_vec = vec_avg(rp_part_vec,pp_part_vec); - avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); - rp_vec = vec_add(rp_vec,avg_vec); + rp++; + } + rp -= bpp; + pp -= bpp; - rp_part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT2_3); - pp_part_vec = vec_perm(pp_vec,zero_vec,VEC_AVG_SELECT2_3); - avg_vec = vec_avg(rp_part_vec,pp_part_vec); - avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); - rp_vec = vec_add(rp_vec,avg_vec); + vec_ld_unaligned(pp_vec,pp); + rp_vec = vec_ld(0,rp); - rp_part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT3_3); - pp_part_vec = vec_perm(pp_vec,zero_vec,VEC_AVG_SELECT3_3); - avg_vec = vec_avg(rp_part_vec,pp_part_vec); - avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); - rp_vec = vec_add(rp_vec,avg_vec); + rp_part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT1_3); + pp_part_vec = vec_perm(pp_vec,zero_vec,VEC_AVG_SELECT1_3); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, 
vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); - vec_st(rp_vec,0,rp); + rp_part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT2_3); + pp_part_vec = vec_perm(pp_vec,zero_vec,VEC_AVG_SELECT2_3); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); - rp += 16; - pp += 16; - istop -= 16; - /* Since 16 % bpp = 16 % 3 = 1, last element of array must - * be proceeded manually - */ - *(rp - 1) += ((int)(*(pp-1) + *(rp-1-bpp)) / 2 ) & 0xff; + rp_part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT3_3); + pp_part_vec = vec_perm(pp_vec,zero_vec,VEC_AVG_SELECT3_3); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); - } + rp_part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT4_3); + pp_part_vec = vec_perm(pp_vec,zero_vec,VEC_AVG_SELECT4_3); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); - if(istop % 16 > 0) - for (i = 0; i < istop % 16; i++) - { - *rp = (png_byte)(((int)(*rp) + - (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + vec_st(rp_vec,0,rp); - rp++; - } - + rp += 15; + pp += 15; + istop -= 16; + + /* Since 16 % bpp = 16 % 3 = 1, last element of array must + * be proceeded manually + */ + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + rp++; + } + + if(istop > 0) + for (i = 0; i < istop % 16; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } } /* Bytewise c ? t : e. 
*/ @@ -443,10 +457,14 @@ void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row, vector unsigned char rp_vec; vector unsigned char pp_vec; vector unsigned char a_vec,b_vec,c_vec,nearest_vec; - vector signed char pa_vec,pb_vec,pc_vec,smallest_vec; + vector signed char pa_vec,pb_vec,pc_vec; + vector unsigned char pa_vec_abs,pb_vec_abs,pc_vec_abs,smallest_vec; vector unsigned char zero_vec = {0}; - declare_common_vars(row_info,row,prev_row) + declare_common_vars(row_info,row,prev_row,bpp) + rp -= bpp; + if(istop >= bpp) + istop -= bpp; /* Process the first pixel in the row completely (this is the same as 'up' * because there is only one candidate predictor for the first row). @@ -458,12 +476,12 @@ void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row, pp++; } - for(i = bpp; i < unaligned_top ; i++) + for(i = 0; i < unaligned_top ; i++) { vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) } - while( istop > 16) + while( istop >= 16) { for(i = 0; i < bpp ; i++) { @@ -481,14 +499,14 @@ void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row, pa_vec = (vector signed char) vec_sub(b_vec,c_vec); pb_vec = (vector signed char) vec_sub(a_vec , c_vec); pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); - pa_vec = vec_abs(pa_vec); - pb_vec = vec_abs(pb_vec); - pc_vec = vec_abs(pc_vec); - smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + pa_vec_abs = (vector unsigned char)vec_abs(pa_vec); + pb_vec_abs = (vector unsigned char)vec_abs(pb_vec); + pc_vec_abs = (vector unsigned char)vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec_abs, vec_min(pa_vec_abs,pb_vec_abs)); nearest_vec = if_then_else( - vec_cmpeq(pa_vec,smallest_vec), + vec_cmpeq(pa_vec_abs,smallest_vec), a_vec, - if_then_else(vec_cmpeq(pb_vec,smallest_vec),b_vec,c_vec) + if_then_else(vec_cmpeq(pb_vec_abs,smallest_vec),b_vec,c_vec) ); rp_vec = vec_add(rp_vec, nearest_vec); @@ -498,34 +516,32 @@ void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep 
row, pa_vec = (vector signed char) vec_sub(b_vec,c_vec); pb_vec = (vector signed char) vec_sub(a_vec , c_vec); pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); - pa_vec = vec_abs(pa_vec); - pb_vec = vec_abs(pb_vec); - pc_vec = vec_abs(pc_vec); - smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + pa_vec_abs = (vector unsigned char)vec_abs(pa_vec); + pb_vec_abs = (vector unsigned char)vec_abs(pb_vec); + pc_vec_abs = (vector unsigned char)vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec_abs, vec_min(pa_vec_abs,pb_vec_abs)); nearest_vec = if_then_else( - vec_cmpeq(pa_vec,smallest_vec), + vec_cmpeq(pa_vec_abs,smallest_vec), a_vec, - if_then_else(vec_cmpeq(pb_vec,smallest_vec),b_vec,c_vec) + if_then_else(vec_cmpeq(pb_vec_abs,smallest_vec),b_vec,c_vec) ); - rp_vec = vec_add(rp_vec, nearest_vec); - + a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT3_4); b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT3_4); c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT3_4); pa_vec = (vector signed char) vec_sub(b_vec,c_vec); pb_vec = (vector signed char) vec_sub(a_vec , c_vec); pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); - pa_vec = vec_abs(pa_vec); - pb_vec = vec_abs(pb_vec); - pc_vec = vec_abs(pc_vec); - smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + pa_vec_abs = (vector unsigned char)vec_abs(pa_vec); + pb_vec_abs = (vector unsigned char)vec_abs(pb_vec); + pc_vec_abs = (vector unsigned char)vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec_abs, vec_min(pa_vec_abs,pb_vec_abs)); nearest_vec = if_then_else( - vec_cmpeq(pa_vec,smallest_vec), + vec_cmpeq(pa_vec_abs,smallest_vec), a_vec, - if_then_else(vec_cmpeq(pb_vec,smallest_vec),b_vec,c_vec) + if_then_else(vec_cmpeq(pb_vec_abs,smallest_vec),b_vec,c_vec) ); - rp_vec = vec_add(rp_vec, nearest_vec); vec_st(rp_vec,0,rp); @@ -535,142 +551,143 @@ void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row, istop -= 16; } - if(istop > 0) + if(istop > 0) for (i = 0; i < istop % 16; i++) - { - 
vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) - } -} - -void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row, - png_const_bytep prev_row) -{ - const unsigned int bpp = 3; - - int a, b, c, pa, pb, pc, p; - vector unsigned char rp_vec; - vector unsigned char pp_vec; - vector unsigned char a_vec,b_vec,c_vec,nearest_vec; - vector signed char pa_vec,pb_vec,pc_vec,smallest_vec; - vector unsigned char zero_vec = {0}; - - declare_common_vars(row_info,row,prev_row) - - /* Process the first pixel in the row completely (this is the same as 'up' - * because there is only one candidate predictor for the first row). - */ - for(i = 0; i < bpp ; i++) - { - *rp = (png_byte)( *rp + *pp); - rp++; - pp++; - } - - for(i = bpp; i < unaligned_top ; i++) - { - vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) - } - - while( istop > 16) - { - for(i = 0; i < bpp ; i++) { vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) } +} - rp -= bpp; - pp -= bpp; - rp_vec = vec_ld(0,rp); - vec_ld_unaligned(pp_vec,pp); +void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + const unsigned int bpp = 3; - a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT1_3); - b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT1_3); - c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT1_3); - pa_vec = (vector signed char) vec_sub(b_vec,c_vec); - pb_vec = (vector signed char) vec_sub(a_vec , c_vec); - pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); - pa_vec = vec_abs(pa_vec); - pb_vec = vec_abs(pb_vec); - pc_vec = vec_abs(pc_vec); - smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); - nearest_vec = if_then_else( - vec_cmpeq(pa_vec,smallest_vec), - a_vec, - if_then_else(vec_cmpeq(pb_vec,smallest_vec),b_vec,c_vec) - ); - rp_vec = vec_add(rp_vec, nearest_vec); + int a, b, c, pa, pb, pc, p; + vector unsigned char rp_vec; + vector unsigned char pp_vec; + vector unsigned char a_vec,b_vec,c_vec,nearest_vec; + vector signed char pa_vec,pb_vec,pc_vec; + vector 
unsigned char pa_vec_abs,pb_vec_abs,pc_vec_abs,smallest_vec; + vector unsigned char zero_vec = {0}; - a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT2_3); - b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT2_3); - c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT2_3); - pa_vec = (vector signed char) vec_sub(b_vec,c_vec); - pb_vec = (vector signed char) vec_sub(a_vec , c_vec); - pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); - pa_vec = vec_abs(pa_vec); - pb_vec = vec_abs(pb_vec); - pc_vec = vec_abs(pc_vec); - smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); - nearest_vec = if_then_else( - vec_cmpeq(pa_vec,smallest_vec), - a_vec, - if_then_else(vec_cmpeq(pb_vec,smallest_vec),b_vec,c_vec) - ); - - rp_vec = vec_add(rp_vec, nearest_vec); - - a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT3_3); - b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT3_3); - c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT3_3); - pa_vec = (vector signed char) vec_sub(b_vec,c_vec); - pb_vec = (vector signed char) vec_sub(a_vec , c_vec); - pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); - pa_vec = vec_abs(pa_vec); - pb_vec = vec_abs(pb_vec); - pc_vec = vec_abs(pc_vec); - smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); - nearest_vec = if_then_else( - vec_cmpeq(pa_vec,smallest_vec), - a_vec, - if_then_else(vec_cmpeq(pb_vec,smallest_vec),b_vec,c_vec) - ); - - rp_vec = vec_add(rp_vec, nearest_vec); - - a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT4_3); - b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT4_3); - c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT4_3); - pa_vec = (vector signed char) vec_sub(b_vec,c_vec); - pb_vec = (vector signed char) vec_sub(a_vec , c_vec); - pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); - pa_vec = vec_abs(pa_vec); - pb_vec = vec_abs(pb_vec); - pc_vec = vec_abs(pc_vec); - smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); - nearest_vec = if_then_else( - vec_cmpeq(pa_vec,smallest_vec), - a_vec, - 
if_then_else(vec_cmpeq(pb_vec,smallest_vec),b_vec,c_vec) - ); - - rp_vec = vec_add(rp_vec, nearest_vec); + declare_common_vars(row_info,row,prev_row,bpp) + rp -= bpp; + if(istop >= bpp) + istop -= bpp; + /* Process the first pixel in the row completely (this is the same as 'up' + * because there is only one candidate predictor for the first row). + */ + for(i = 0; i < bpp ; i++) + { + *rp = (png_byte)( *rp + *pp); + rp++; + pp++; + } - vec_st(rp_vec,0,rp); + for(i = 0; i < unaligned_top ; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } - rp += 16-1; - pp += 16-1; - istop -= 16; - /* Since 16 % bpp = 16 % 3 = 1, last element of array must - * be proceeded manually - */ - vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) - } + while( istop >= 16) + { + for(i = 0; i < bpp ; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } - if(istop > 0) - for (i = 0; i < istop % 16; i++) - { - vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) - } + rp -= bpp; + pp -= bpp; + rp_vec = vec_ld(0,rp); + vec_ld_unaligned(pp_vec,pp); + + a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT1_3); + b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT1_3); + c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT1_3); + pa_vec = (vector signed char) vec_sub(b_vec,c_vec); + pb_vec = (vector signed char) vec_sub(a_vec , c_vec); + pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); + pa_vec_abs = (vector unsigned char)vec_abs(pa_vec); + pb_vec_abs = (vector unsigned char)vec_abs(pb_vec); + pc_vec_abs = (vector unsigned char)vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec_abs, vec_min(pa_vec_abs,pb_vec_abs)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec_abs,smallest_vec), + a_vec, + if_then_else(vec_cmpeq(pb_vec_abs,smallest_vec),b_vec,c_vec) + ); + rp_vec = vec_add(rp_vec, nearest_vec); + + a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT2_3); + b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT2_3); + c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT2_3); + pa_vec = (vector signed char) 
vec_sub(b_vec,c_vec); + pb_vec = (vector signed char) vec_sub(a_vec , c_vec); + pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); + pa_vec_abs = (vector unsigned char)vec_abs(pa_vec); + pb_vec_abs = (vector unsigned char)vec_abs(pb_vec); + pc_vec_abs = (vector unsigned char)vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec_abs, vec_min(pa_vec_abs,pb_vec_abs)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec_abs,smallest_vec), + a_vec, + if_then_else(vec_cmpeq(pb_vec_abs,smallest_vec),b_vec,c_vec) + ); + rp_vec = vec_add(rp_vec, nearest_vec); + + a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT3_3); + b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT3_3); + c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT3_3); + pa_vec = (vector signed char) vec_sub(b_vec,c_vec); + pb_vec = (vector signed char) vec_sub(a_vec , c_vec); + pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); + pa_vec_abs = (vector unsigned char)vec_abs(pa_vec); + pb_vec_abs = (vector unsigned char)vec_abs(pb_vec); + pc_vec_abs = (vector unsigned char)vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec_abs, vec_min(pa_vec_abs,pb_vec_abs)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec_abs,smallest_vec), + a_vec, + if_then_else(vec_cmpeq(pb_vec_abs,smallest_vec),b_vec,c_vec) + ); + rp_vec = vec_add(rp_vec, nearest_vec); + + a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT4_3); + b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT4_3); + c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT4_3); + pa_vec = (vector signed char) vec_sub(b_vec,c_vec); + pb_vec = (vector signed char) vec_sub(a_vec , c_vec); + pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); + pa_vec_abs = (vector unsigned char)vec_abs(pa_vec); + pb_vec_abs = (vector unsigned char)vec_abs(pb_vec); + pc_vec_abs = (vector unsigned char)vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec_abs, vec_min(pa_vec_abs,pb_vec_abs)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec_abs,smallest_vec), + a_vec, + 
if_then_else(vec_cmpeq(pb_vec_abs,smallest_vec),b_vec,c_vec) + ); + rp_vec = vec_add(rp_vec, nearest_vec); + + vec_st(rp_vec,0,rp); + + rp += 15; + pp += 15; + istop -= 16; + + /* Since 16 % bpp = 16 % 3 = 1, last element of array must + * be proceeded manually + */ + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } + + if(istop > 0) + for (i = 0; i < istop % 16; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } } From 99d7285f206685aa80eea874cc17d0a6304761c2 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Thu, 9 Feb 2017 18:57:53 +0000 Subject: [PATCH 30/37] Fixed filter_paeth for PowerPC VSX Now all tests are fine for ppc64le --- powerpc/filter_vsx_intrinsics.c | 260 +++++++++++++++++++------------- 1 file changed, 156 insertions(+), 104 deletions(-) diff --git a/powerpc/filter_vsx_intrinsics.c b/powerpc/filter_vsx_intrinsics.c index 9376bc885..e1ba064a6 100644 --- a/powerpc/filter_vsx_intrinsics.c +++ b/powerpc/filter_vsx_intrinsics.c @@ -448,6 +448,32 @@ void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row, *rp++ = (png_byte)a;\ } +#define VEC_CHAR_ZERO (vector unsigned char){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} +#ifdef __LITTLE_ENDIAN__ + +# define VEC_CHAR_TO_SHORT1_4 (vector unsigned char){ 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16,16} +# define VEC_CHAR_TO_SHORT2_4 (vector unsigned char){ 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16} +# define VEC_CHAR_TO_SHORT3_4 (vector unsigned char){12,16,13,16,14,16,15,16,16,16,16,16,16,16,16,16} + +# define VEC_SHORT_TO_CHAR1_4 (vector unsigned char){16,16,16,16, 0, 2, 4, 6,16,16,16,16,16,16,16,16} +# define VEC_SHORT_TO_CHAR2_4 (vector unsigned char){16,16,16,16,16,16,16,16, 0, 2, 4, 6,16,16,16,16} +# define VEC_SHORT_TO_CHAR3_4 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4, 6} + +# define VEC_CHAR_TO_SHORT1_3 (vector unsigned char){ 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16,16} +# define VEC_CHAR_TO_SHORT2_3 (vector unsigned char){ 6,16, 7,16, 
8,16,16,16,16,16,16,16,16,16,16,16} +# define VEC_CHAR_TO_SHORT3_3 (vector unsigned char){ 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16,16} +# define VEC_CHAR_TO_SHORT4_3 (vector unsigned char){12,16,13,16,14,16,16,16,16,16,16,16,16,16,16,16} + +# define VEC_SHORT_TO_CHAR1_3 (vector unsigned char){16,16,16, 0, 2, 4,16,16,16,16,16,16,16,16,16,16} +# define VEC_SHORT_TO_CHAR2_3 (vector unsigned char){16,16,16,16,16,16, 0, 2, 4,16,16,16,16,16,16,16} +# define VEC_SHORT_TO_CHAR3_3 (vector unsigned char){16,16,16,16,16,16,16,16,16, 0, 2, 4,16,16,16,16} +# define VEC_SHORT_TO_CHAR4_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4,16} + +#endif + +#define vsx_char_to_short(vec,offset,bpp) (vector unsigned short)vec_perm((vec),VEC_CHAR_ZERO,VEC_CHAR_TO_SHORT##offset##_##bpp) +#define vsx_short_to_char(vec,offset,bpp) vec_perm((vector unsigned char)(vec),VEC_CHAR_ZERO,VEC_SHORT_TO_CHAR##offset##_##bpp) + void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { @@ -456,10 +482,9 @@ void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row, int a, b, c, pa, pb, pc, p; vector unsigned char rp_vec; vector unsigned char pp_vec; - vector unsigned char a_vec,b_vec,c_vec,nearest_vec; - vector signed char pa_vec,pb_vec,pc_vec; - vector unsigned char pa_vec_abs,pb_vec_abs,pc_vec_abs,smallest_vec; vector unsigned char zero_vec = {0}; + vector unsigned short a_vec,b_vec,c_vec,nearest_vec; + vector signed short pa_vec,pb_vec,pc_vec,smallest_vec; declare_common_vars(row_info,row,prev_row,bpp) rp -= bpp; @@ -493,56 +518,68 @@ void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row, rp_vec = vec_ld(0,rp); vec_ld_unaligned(pp_vec,pp); - a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT1_4); - b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT1_4); - c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT1_4); - pa_vec = (vector signed char) vec_sub(b_vec,c_vec); - pb_vec = (vector signed char) 
vec_sub(a_vec , c_vec); - pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); - pa_vec_abs = (vector unsigned char)vec_abs(pa_vec); - pb_vec_abs = (vector unsigned char)vec_abs(pb_vec); - pc_vec_abs = (vector unsigned char)vec_abs(pc_vec); - smallest_vec = vec_min(pc_vec_abs, vec_min(pa_vec_abs,pb_vec_abs)); + a_vec = vsx_char_to_short(vec_perm(rp_vec , zero_vec , VEC_SELECT1_4),1,4); + b_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT1_4),1,4); + c_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_SELECT1_4),1,4); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); nearest_vec = if_then_else( - vec_cmpeq(pa_vec_abs,smallest_vec), + vec_cmpeq(pa_vec,smallest_vec), a_vec, - if_then_else(vec_cmpeq(pb_vec_abs,smallest_vec),b_vec,c_vec) - ); - rp_vec = vec_add(rp_vec, nearest_vec); + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,4))); - a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT2_4); - b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT2_4); - c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT2_4); - pa_vec = (vector signed char) vec_sub(b_vec,c_vec); - pb_vec = (vector signed char) vec_sub(a_vec , c_vec); - pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); - pa_vec_abs = (vector unsigned char)vec_abs(pa_vec); - pb_vec_abs = (vector unsigned char)vec_abs(pb_vec); - pc_vec_abs = (vector unsigned char)vec_abs(pc_vec); - smallest_vec = vec_min(pc_vec_abs, vec_min(pa_vec_abs,pb_vec_abs)); + a_vec = vsx_char_to_short(vec_perm(rp_vec , zero_vec , VEC_SELECT2_4),2,4); + b_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT2_4),2,4); + c_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , 
VEC_SELECT2_4),2,4); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); nearest_vec = if_then_else( - vec_cmpeq(pa_vec_abs,smallest_vec), + vec_cmpeq(pa_vec,smallest_vec), a_vec, - if_then_else(vec_cmpeq(pb_vec_abs,smallest_vec),b_vec,c_vec) - ); - rp_vec = vec_add(rp_vec, nearest_vec); + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,4))); - a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT3_4); - b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT3_4); - c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT3_4); - pa_vec = (vector signed char) vec_sub(b_vec,c_vec); - pb_vec = (vector signed char) vec_sub(a_vec , c_vec); - pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); - pa_vec_abs = (vector unsigned char)vec_abs(pa_vec); - pb_vec_abs = (vector unsigned char)vec_abs(pb_vec); - pc_vec_abs = (vector unsigned char)vec_abs(pc_vec); - smallest_vec = vec_min(pc_vec_abs, vec_min(pa_vec_abs,pb_vec_abs)); + a_vec = vsx_char_to_short(vec_perm(rp_vec , zero_vec , VEC_SELECT3_4),3,4); + b_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT3_4),3,4); + c_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_SELECT3_4),3,4); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); nearest_vec = if_then_else( - vec_cmpeq(pa_vec_abs,smallest_vec), + vec_cmpeq(pa_vec,smallest_vec), a_vec, - if_then_else(vec_cmpeq(pb_vec_abs,smallest_vec),b_vec,c_vec) - ); - rp_vec = vec_add(rp_vec, nearest_vec); + if_then_else( + 
vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,4))); vec_st(rp_vec,0,rp); @@ -566,10 +603,9 @@ void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row, int a, b, c, pa, pb, pc, p; vector unsigned char rp_vec; vector unsigned char pp_vec; - vector unsigned char a_vec,b_vec,c_vec,nearest_vec; - vector signed char pa_vec,pb_vec,pc_vec; - vector unsigned char pa_vec_abs,pb_vec_abs,pc_vec_abs,smallest_vec; vector unsigned char zero_vec = {0}; + vector unsigned short a_vec,b_vec,c_vec,nearest_vec; + vector signed short pa_vec,pb_vec,pc_vec,smallest_vec; declare_common_vars(row_info,row,prev_row,bpp) rp -= bpp; @@ -603,73 +639,89 @@ void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row, rp_vec = vec_ld(0,rp); vec_ld_unaligned(pp_vec,pp); - a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT1_3); - b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT1_3); - c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT1_3); - pa_vec = (vector signed char) vec_sub(b_vec,c_vec); - pb_vec = (vector signed char) vec_sub(a_vec , c_vec); - pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); - pa_vec_abs = (vector unsigned char)vec_abs(pa_vec); - pb_vec_abs = (vector unsigned char)vec_abs(pb_vec); - pc_vec_abs = (vector unsigned char)vec_abs(pc_vec); - smallest_vec = vec_min(pc_vec_abs, vec_min(pa_vec_abs,pb_vec_abs)); + a_vec = vsx_char_to_short(vec_perm(rp_vec , zero_vec , VEC_SELECT1_3),1,3); + b_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT1_3),1,3); + c_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_SELECT1_3),1,3); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); nearest_vec = if_then_else( - 
vec_cmpeq(pa_vec_abs,smallest_vec), + vec_cmpeq(pa_vec,smallest_vec), a_vec, - if_then_else(vec_cmpeq(pb_vec_abs,smallest_vec),b_vec,c_vec) - ); - rp_vec = vec_add(rp_vec, nearest_vec); + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,3))); - a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT2_3); - b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT2_3); - c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT2_3); - pa_vec = (vector signed char) vec_sub(b_vec,c_vec); - pb_vec = (vector signed char) vec_sub(a_vec , c_vec); - pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); - pa_vec_abs = (vector unsigned char)vec_abs(pa_vec); - pb_vec_abs = (vector unsigned char)vec_abs(pb_vec); - pc_vec_abs = (vector unsigned char)vec_abs(pc_vec); - smallest_vec = vec_min(pc_vec_abs, vec_min(pa_vec_abs,pb_vec_abs)); + a_vec = vsx_char_to_short(vec_perm(rp_vec , zero_vec , VEC_SELECT2_3),2,3); + b_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT2_3),2,3); + c_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_SELECT2_3),2,3); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); nearest_vec = if_then_else( - vec_cmpeq(pa_vec_abs,smallest_vec), + vec_cmpeq(pa_vec,smallest_vec), a_vec, - if_then_else(vec_cmpeq(pb_vec_abs,smallest_vec),b_vec,c_vec) - ); - rp_vec = vec_add(rp_vec, nearest_vec); + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,3))); - a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT3_3); - b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT3_3); - c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT3_3); - pa_vec = (vector signed char) 
vec_sub(b_vec,c_vec); - pb_vec = (vector signed char) vec_sub(a_vec , c_vec); - pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); - pa_vec_abs = (vector unsigned char)vec_abs(pa_vec); - pb_vec_abs = (vector unsigned char)vec_abs(pb_vec); - pc_vec_abs = (vector unsigned char)vec_abs(pc_vec); - smallest_vec = vec_min(pc_vec_abs, vec_min(pa_vec_abs,pb_vec_abs)); + a_vec = vsx_char_to_short(vec_perm(rp_vec , zero_vec , VEC_SELECT3_3),3,3); + b_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT3_3),3,3); + c_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_SELECT3_3),3,3); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); nearest_vec = if_then_else( - vec_cmpeq(pa_vec_abs,smallest_vec), + vec_cmpeq(pa_vec,smallest_vec), a_vec, - if_then_else(vec_cmpeq(pb_vec_abs,smallest_vec),b_vec,c_vec) - ); - rp_vec = vec_add(rp_vec, nearest_vec); + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,3))); - a_vec = vec_perm(rp_vec , zero_vec , VEC_SELECT4_3); - b_vec = vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT4_3); - c_vec = vec_perm(pp_vec , zero_vec , VEC_SELECT4_3); - pa_vec = (vector signed char) vec_sub(b_vec,c_vec); - pb_vec = (vector signed char) vec_sub(a_vec , c_vec); - pc_vec = (vector signed char) vec_add(pa_vec,pb_vec); - pa_vec_abs = (vector unsigned char)vec_abs(pa_vec); - pb_vec_abs = (vector unsigned char)vec_abs(pb_vec); - pc_vec_abs = (vector unsigned char)vec_abs(pc_vec); - smallest_vec = vec_min(pc_vec_abs, vec_min(pa_vec_abs,pb_vec_abs)); + a_vec = vsx_char_to_short(vec_perm(rp_vec , zero_vec , VEC_SELECT4_3),4,3); + b_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT4_3),4,3); + c_vec = 
vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_SELECT4_3),4,3); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); nearest_vec = if_then_else( - vec_cmpeq(pa_vec_abs,smallest_vec), + vec_cmpeq(pa_vec,smallest_vec), a_vec, - if_then_else(vec_cmpeq(pb_vec_abs,smallest_vec),b_vec,c_vec) - ); - rp_vec = vec_add(rp_vec, nearest_vec); + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,4,3))); vec_st(rp_vec,0,rp); From d1c12e85c1c91e4cc5f70a1b4a543a5a934ca149 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Thu, 9 Feb 2017 19:11:31 +0000 Subject: [PATCH 31/37] Added support for ppc64 big endian for filter_paeth VSX --- powerpc/filter_vsx_intrinsics.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/powerpc/filter_vsx_intrinsics.c b/powerpc/filter_vsx_intrinsics.c index e1ba064a6..828bd25c6 100644 --- a/powerpc/filter_vsx_intrinsics.c +++ b/powerpc/filter_vsx_intrinsics.c @@ -1,4 +1,3 @@ - /* filter_vsx_intrinsics.c - PowerPC optimised filter functions * * Copyright (c) 2016 Glenn Randers-Pehrson @@ -180,7 +179,6 @@ void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row, } - void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { @@ -469,6 +467,26 @@ void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row, # define VEC_SHORT_TO_CHAR3_3 (vector unsigned char){16,16,16,16,16,16,16,16,16, 0, 2, 4,16,16,16,16} # define VEC_SHORT_TO_CHAR4_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4,16} +#elif defined(__BIG_ENDIAN__) + +# define VEC_CHAR_TO_SHORT1_4 (vector unsigned char){16, 4,16, 5,16, 6,16, 
7,16,16,16,16,16,16,16,16} +# define VEC_CHAR_TO_SHORT2_4 (vector unsigned char){16, 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16} +# define VEC_CHAR_TO_SHORT3_4 (vector unsigned char){16,12,16,13,16,14,16,15,16,16,16,16,16,16,16,16} + +# define VEC_SHORT_TO_CHAR1_4 (vector unsigned char){16,16,16,16, 1, 3, 5, 7,16,16,16,16,16,16,16,16} +# define VEC_SHORT_TO_CHAR2_4 (vector unsigned char){16,16,16,16,16,16,16,16, 1, 3, 5, 7,16,16,16,16} +# define VEC_SHORT_TO_CHAR3_4 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5, 7} + +# define VEC_CHAR_TO_SHORT1_3 (vector unsigned char){16, 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16} +# define VEC_CHAR_TO_SHORT2_3 (vector unsigned char){16, 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16} +# define VEC_CHAR_TO_SHORT3_3 (vector unsigned char){16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16} +# define VEC_CHAR_TO_SHORT4_3 (vector unsigned char){16,12,16,13,16,14,16,16,16,16,16,16,16,16,16,16} + +# define VEC_SHORT_TO_CHAR1_3 (vector unsigned char){16,16,16, 1, 3, 5,16,16,16,16,16,16,16,16,16,16} +# define VEC_SHORT_TO_CHAR2_3 (vector unsigned char){16,16,16,16,16,16, 1, 3, 5,16,16,16,16,16,16,16} +# define VEC_SHORT_TO_CHAR3_3 (vector unsigned char){16,16,16,16,16,16,16,16,16, 1, 3, 5,16,16,16,16} +# define VEC_SHORT_TO_CHAR4_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5,16} + #endif #define vsx_char_to_short(vec,offset,bpp) (vector unsigned short)vec_perm((vec),VEC_CHAR_ZERO,VEC_CHAR_TO_SHORT##offset##_##bpp) @@ -742,7 +760,6 @@ void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row, } } - #endif /* PNG_POWERPC_VSX_OPT > 0 */ #endif /* PNG_POWERPC_VSX_IMPLEMENTATION == 1 (intrinsics) */ #endif /* READ */ From 310dee21f72c765771dffed4f131da3fe93d1c48 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Fri, 10 Feb 2017 07:20:59 +0000 Subject: [PATCH 32/37] Refactoring Now all defines in filter VSX file are prefixed with VSX and have more clear names --- 
powerpc/filter_vsx_intrinsics.c | 225 ++++++++++++++++---------------- 1 file changed, 109 insertions(+), 116 deletions(-) diff --git a/powerpc/filter_vsx_intrinsics.c b/powerpc/filter_vsx_intrinsics.c index 828bd25c6..fdb50d84f 100644 --- a/powerpc/filter_vsx_intrinsics.c +++ b/powerpc/filter_vsx_intrinsics.c @@ -38,7 +38,7 @@ * ( this is taken from ../intel/filter_sse2_intrinsics.c ) */ -#define declare_common_vars(row_info,row,prev_row,offset) \ +#define vsx_declare_common_vars(row_info,row,prev_row,offset) \ png_size_t i;\ png_bytep rp = row + offset;\ png_const_bytep pp = prev_row;\ @@ -59,7 +59,7 @@ void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, { vector unsigned char rp_vec; vector unsigned char pp_vec; - declare_common_vars(row_info,row,prev_row,0) + vsx_declare_common_vars(row_info,row,prev_row,0) /* Altivec operations require 16-byte aligned data * but input can be unaligned. So we calculate @@ -100,24 +100,69 @@ void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, } -#define VEC_SELECT1_4 (vector unsigned char){16,16,16,16, 0, 1, 2, 3,16,16,16,16,16,16,16,16} -#define VEC_SELECT2_4 (vector unsigned char){16,16,16,16,16,16,16,16, 4, 5, 6, 7,16,16,16,16} -#define VEC_SELECT3_4 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 8, 9,10,11} +#define VSX_LEFTSHIFTED1_4 (vector unsigned char){16,16,16,16, 0, 1, 2, 3,16,16,16,16,16,16,16,16} +#define VSX_LEFTSHIFTED2_4 (vector unsigned char){16,16,16,16,16,16,16,16, 4, 5, 6, 7,16,16,16,16} +#define VSX_LEFTSHIFTED3_4 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 8, 9,10,11} -#define VEC_SELECT1_3 (vector unsigned char){16,16,16, 0, 1, 2,16,16,16,16,16,16,16,16,16,16} -#define VEC_SELECT2_3 (vector unsigned char){16,16,16,16,16,16, 3, 4, 5,16,16,16,16,16,16,16} -#define VEC_SELECT3_3 (vector unsigned char){16,16,16,16,16,16,16,16,16, 6, 7, 8,16,16,16,16} -#define VEC_SELECT4_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 9,10,11,16} 
+#define VSX_LEFTSHIFTED1_3 (vector unsigned char){16,16,16, 0, 1, 2,16,16,16,16,16,16,16,16,16,16} +#define VSX_LEFTSHIFTED2_3 (vector unsigned char){16,16,16,16,16,16, 3, 4, 5,16,16,16,16,16,16,16} +#define VSX_LEFTSHIFTED3_3 (vector unsigned char){16,16,16,16,16,16,16,16,16, 6, 7, 8,16,16,16,16} +#define VSX_LEFTSHIFTED4_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 9,10,11,16} +#define VSX_NOT_SHIFTED1_4 (vector unsigned char){16,16,16,16, 4, 5, 6, 7,16,16,16,16,16,16,16,16} +#define VSX_NOT_SHIFTED2_4 (vector unsigned char){16,16,16,16,16,16,16,16, 8, 9,10,11,16,16,16,16} +#define VSX_NOT_SHIFTED3_4 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,15} -#define VEC_AVG_SELECT1_4 (vector unsigned char){16,16,16,16, 4, 5, 6, 7,16,16,16,16,16,16,16,16} -#define VEC_AVG_SELECT2_4 (vector unsigned char){16,16,16,16,16,16,16,16, 8, 9,10,11,16,16,16,16} -#define VEC_AVG_SELECT3_4 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,15} +#define VSX_NOT_SHIFTED1_3 (vector unsigned char){16,16,16, 3, 4, 5,16,16,16,16,16,16,16,16,16,16} +#define VSX_NOT_SHIFTED2_3 (vector unsigned char){16,16,16,16,16,16, 6, 7, 8,16,16,16,16,16,16,16} +#define VSX_NOT_SHIFTED3_3 (vector unsigned char){16,16,16,16,16,16,16,16,16, 9,10,11,16,16,16,16} +#define VSX_NOT_SHIFTED4_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,16} -#define VEC_AVG_SELECT1_3 (vector unsigned char){16,16,16, 3, 4, 5,16,16,16,16,16,16,16,16,16,16} -#define VEC_AVG_SELECT2_3 (vector unsigned char){16,16,16,16,16,16, 6, 7, 8,16,16,16,16,16,16,16} -#define VEC_AVG_SELECT3_3 (vector unsigned char){16,16,16,16,16,16,16,16,16, 9,10,11,16,16,16,16} -#define VEC_AVG_SELECT4_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,16} +#define VSX_CHAR_ZERO (vector unsigned char){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} +#ifdef __LITTLE_ENDIAN__ + +# define VSX_CHAR_TO_SHORT1_4 (vector unsigned char){ 4,16, 5,16, 6,16, 
7,16,16,16,16,16,16,16,16,16} +# define VSX_CHAR_TO_SHORT2_4 (vector unsigned char){ 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16} +# define VSX_CHAR_TO_SHORT3_4 (vector unsigned char){12,16,13,16,14,16,15,16,16,16,16,16,16,16,16,16} + +# define VSX_SHORT_TO_CHAR1_4 (vector unsigned char){16,16,16,16, 0, 2, 4, 6,16,16,16,16,16,16,16,16} +# define VSX_SHORT_TO_CHAR2_4 (vector unsigned char){16,16,16,16,16,16,16,16, 0, 2, 4, 6,16,16,16,16} +# define VSX_SHORT_TO_CHAR3_4 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4, 6} + +# define VSX_CHAR_TO_SHORT1_3 (vector unsigned char){ 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16,16} +# define VSX_CHAR_TO_SHORT2_3 (vector unsigned char){ 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16,16} +# define VSX_CHAR_TO_SHORT3_3 (vector unsigned char){ 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16,16} +# define VSX_CHAR_TO_SHORT4_3 (vector unsigned char){12,16,13,16,14,16,16,16,16,16,16,16,16,16,16,16} + +# define VSX_SHORT_TO_CHAR1_3 (vector unsigned char){16,16,16, 0, 2, 4,16,16,16,16,16,16,16,16,16,16} +# define VSX_SHORT_TO_CHAR2_3 (vector unsigned char){16,16,16,16,16,16, 0, 2, 4,16,16,16,16,16,16,16} +# define VSX_SHORT_TO_CHAR3_3 (vector unsigned char){16,16,16,16,16,16,16,16,16, 0, 2, 4,16,16,16,16} +# define VSX_SHORT_TO_CHAR4_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4,16} + +#elif defined(__BIG_ENDIAN__) + +# define VSX_CHAR_TO_SHORT1_4 (vector unsigned char){16, 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16} +# define VSX_CHAR_TO_SHORT2_4 (vector unsigned char){16, 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16} +# define VSX_CHAR_TO_SHORT3_4 (vector unsigned char){16,12,16,13,16,14,16,15,16,16,16,16,16,16,16,16} + +# define VSX_SHORT_TO_CHAR1_4 (vector unsigned char){16,16,16,16, 1, 3, 5, 7,16,16,16,16,16,16,16,16} +# define VSX_SHORT_TO_CHAR2_4 (vector unsigned char){16,16,16,16,16,16,16,16, 1, 3, 5, 7,16,16,16,16} +# define VSX_SHORT_TO_CHAR3_4 (vector unsigned 
char){16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5, 7} + +# define VSX_CHAR_TO_SHORT1_3 (vector unsigned char){16, 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16} +# define VSX_CHAR_TO_SHORT2_3 (vector unsigned char){16, 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16} +# define VSX_CHAR_TO_SHORT3_3 (vector unsigned char){16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16} +# define VSX_CHAR_TO_SHORT4_3 (vector unsigned char){16,12,16,13,16,14,16,16,16,16,16,16,16,16,16,16} + +# define VSX_SHORT_TO_CHAR1_3 (vector unsigned char){16,16,16, 1, 3, 5,16,16,16,16,16,16,16,16,16,16} +# define VSX_SHORT_TO_CHAR2_3 (vector unsigned char){16,16,16,16,16,16, 1, 3, 5,16,16,16,16,16,16,16} +# define VSX_SHORT_TO_CHAR3_3 (vector unsigned char){16,16,16,16,16,16,16,16,16, 1, 3, 5,16,16,16,16} +# define VSX_SHORT_TO_CHAR4_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5,16} + +#endif + +#define vsx_char_to_short(vec,offset,bpp) (vector unsigned short)vec_perm((vec),VSX_CHAR_ZERO,VSX_CHAR_TO_SHORT##offset##_##bpp) +#define vsx_short_to_char(vec,offset,bpp) vec_perm((vector unsigned char)(vec),VSX_CHAR_ZERO,VSX_SHORT_TO_CHAR##offset##_##bpp) #ifdef PNG_USE_ABS # define vsx_abs(number) abs(number) @@ -132,9 +177,8 @@ void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row, vector unsigned char rp_vec; vector unsigned char part_vec; - vector unsigned char zero_vec = {0}; - declare_common_vars(row_info,row,prev_row,bpp) + vsx_declare_common_vars(row_info,row,prev_row,bpp) PNG_UNUSED(pp) @@ -155,13 +199,13 @@ void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row, *(rp+i) += *(rp+i - bpp); rp_vec = vec_ld(0,rp); - part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT1_4); + part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4); rp_vec = vec_add(rp_vec,part_vec); - part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT2_4); + part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4); rp_vec = vec_add(rp_vec,part_vec); - part_vec = 
vec_perm(rp_vec,zero_vec,VEC_SELECT3_4); + part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4); rp_vec = vec_add(rp_vec,part_vec); vec_st(rp_vec,0,rp); @@ -186,9 +230,8 @@ void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row, vector unsigned char rp_vec; vector unsigned char part_vec; - vector unsigned char zero_vec = {0}; - declare_common_vars(row_info,row,prev_row,bpp) + vsx_declare_common_vars(row_info,row,prev_row,bpp) PNG_UNUSED(pp) @@ -209,16 +252,16 @@ void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row, *(rp+i) += *(rp+i - bpp); rp_vec = vec_ld(0,rp); - part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT1_3); + part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3); rp_vec = vec_add(rp_vec,part_vec); - part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT2_3); + part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3); rp_vec = vec_add(rp_vec,part_vec); - part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT3_3); + part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3); rp_vec = vec_add(rp_vec,part_vec); - part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT4_3); + part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3); rp_vec = vec_add(rp_vec,part_vec); vec_st(rp_vec,0,rp); @@ -249,9 +292,8 @@ void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row, vector unsigned char pp_part_vec; vector unsigned char rp_part_vec; vector unsigned char avg_vec; - vector unsigned char zero_vec = {0}; - declare_common_vars(row_info,row,prev_row,bpp) + vsx_declare_common_vars(row_info,row,prev_row,bpp) rp -= bpp; if(istop >= bpp) istop -= bpp; @@ -292,20 +334,20 @@ void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row, vec_ld_unaligned(pp_vec,pp); rp_vec = vec_ld(0,rp); - rp_part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT1_4); - pp_part_vec = vec_perm(pp_vec,zero_vec,VEC_AVG_SELECT1_4); + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4); + pp_part_vec = 
vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_4); avg_vec = vec_avg(rp_part_vec,pp_part_vec); avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); rp_vec = vec_add(rp_vec,avg_vec); - rp_part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT2_4); - pp_part_vec = vec_perm(pp_vec,zero_vec,VEC_AVG_SELECT2_4); + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_4); avg_vec = vec_avg(rp_part_vec,pp_part_vec); avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); rp_vec = vec_add(rp_vec,avg_vec); - rp_part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT3_4); - pp_part_vec = vec_perm(pp_vec,zero_vec,VEC_AVG_SELECT3_4); + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_4); avg_vec = vec_avg(rp_part_vec,pp_part_vec); avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); rp_vec = vec_add(rp_vec,avg_vec); @@ -337,9 +379,8 @@ void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row, vector unsigned char pp_part_vec; vector unsigned char rp_part_vec; vector unsigned char avg_vec; - vector unsigned char zero_vec = {0}; - declare_common_vars(row_info,row,prev_row,bpp) + vsx_declare_common_vars(row_info,row,prev_row,bpp) rp -= bpp; if(istop >= bpp) istop -= bpp; @@ -380,26 +421,26 @@ void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row, vec_ld_unaligned(pp_vec,pp); rp_vec = vec_ld(0,rp); - rp_part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT1_3); - pp_part_vec = vec_perm(pp_vec,zero_vec,VEC_AVG_SELECT1_3); + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_3); avg_vec = vec_avg(rp_part_vec,pp_part_vec); avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); rp_vec = vec_add(rp_vec,avg_vec); - 
rp_part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT2_3); - pp_part_vec = vec_perm(pp_vec,zero_vec,VEC_AVG_SELECT2_3); + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_3); avg_vec = vec_avg(rp_part_vec,pp_part_vec); avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); rp_vec = vec_add(rp_vec,avg_vec); - rp_part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT3_3); - pp_part_vec = vec_perm(pp_vec,zero_vec,VEC_AVG_SELECT3_3); + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_3); avg_vec = vec_avg(rp_part_vec,pp_part_vec); avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); rp_vec = vec_add(rp_vec,avg_vec); - rp_part_vec = vec_perm(rp_vec,zero_vec,VEC_SELECT4_3); - pp_part_vec = vec_perm(pp_vec,zero_vec,VEC_AVG_SELECT4_3); + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED4_3); avg_vec = vec_avg(rp_part_vec,pp_part_vec); avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); rp_vec = vec_add(rp_vec,avg_vec); @@ -446,52 +487,6 @@ void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row, *rp++ = (png_byte)a;\ } -#define VEC_CHAR_ZERO (vector unsigned char){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} -#ifdef __LITTLE_ENDIAN__ - -# define VEC_CHAR_TO_SHORT1_4 (vector unsigned char){ 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16,16} -# define VEC_CHAR_TO_SHORT2_4 (vector unsigned char){ 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16} -# define VEC_CHAR_TO_SHORT3_4 (vector unsigned char){12,16,13,16,14,16,15,16,16,16,16,16,16,16,16,16} - -# define VEC_SHORT_TO_CHAR1_4 (vector unsigned char){16,16,16,16, 0, 2, 4, 6,16,16,16,16,16,16,16,16} -# define VEC_SHORT_TO_CHAR2_4 (vector unsigned char){16,16,16,16,16,16,16,16, 0, 2, 4, 6,16,16,16,16} -# 
define VEC_SHORT_TO_CHAR3_4 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4, 6} - -# define VEC_CHAR_TO_SHORT1_3 (vector unsigned char){ 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16,16} -# define VEC_CHAR_TO_SHORT2_3 (vector unsigned char){ 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16,16} -# define VEC_CHAR_TO_SHORT3_3 (vector unsigned char){ 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16,16} -# define VEC_CHAR_TO_SHORT4_3 (vector unsigned char){12,16,13,16,14,16,16,16,16,16,16,16,16,16,16,16} - -# define VEC_SHORT_TO_CHAR1_3 (vector unsigned char){16,16,16, 0, 2, 4,16,16,16,16,16,16,16,16,16,16} -# define VEC_SHORT_TO_CHAR2_3 (vector unsigned char){16,16,16,16,16,16, 0, 2, 4,16,16,16,16,16,16,16} -# define VEC_SHORT_TO_CHAR3_3 (vector unsigned char){16,16,16,16,16,16,16,16,16, 0, 2, 4,16,16,16,16} -# define VEC_SHORT_TO_CHAR4_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4,16} - -#elif defined(__BIG_ENDIAN__) - -# define VEC_CHAR_TO_SHORT1_4 (vector unsigned char){16, 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16} -# define VEC_CHAR_TO_SHORT2_4 (vector unsigned char){16, 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16} -# define VEC_CHAR_TO_SHORT3_4 (vector unsigned char){16,12,16,13,16,14,16,15,16,16,16,16,16,16,16,16} - -# define VEC_SHORT_TO_CHAR1_4 (vector unsigned char){16,16,16,16, 1, 3, 5, 7,16,16,16,16,16,16,16,16} -# define VEC_SHORT_TO_CHAR2_4 (vector unsigned char){16,16,16,16,16,16,16,16, 1, 3, 5, 7,16,16,16,16} -# define VEC_SHORT_TO_CHAR3_4 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5, 7} - -# define VEC_CHAR_TO_SHORT1_3 (vector unsigned char){16, 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16} -# define VEC_CHAR_TO_SHORT2_3 (vector unsigned char){16, 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16} -# define VEC_CHAR_TO_SHORT3_3 (vector unsigned char){16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16} -# define VEC_CHAR_TO_SHORT4_3 (vector unsigned char){16,12,16,13,16,14,16,16,16,16,16,16,16,16,16,16} - 
-# define VEC_SHORT_TO_CHAR1_3 (vector unsigned char){16,16,16, 1, 3, 5,16,16,16,16,16,16,16,16,16,16} -# define VEC_SHORT_TO_CHAR2_3 (vector unsigned char){16,16,16,16,16,16, 1, 3, 5,16,16,16,16,16,16,16} -# define VEC_SHORT_TO_CHAR3_3 (vector unsigned char){16,16,16,16,16,16,16,16,16, 1, 3, 5,16,16,16,16} -# define VEC_SHORT_TO_CHAR4_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5,16} - -#endif - -#define vsx_char_to_short(vec,offset,bpp) (vector unsigned short)vec_perm((vec),VEC_CHAR_ZERO,VEC_CHAR_TO_SHORT##offset##_##bpp) -#define vsx_short_to_char(vec,offset,bpp) vec_perm((vector unsigned char)(vec),VEC_CHAR_ZERO,VEC_SHORT_TO_CHAR##offset##_##bpp) - void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { @@ -500,11 +495,10 @@ void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row, int a, b, c, pa, pb, pc, p; vector unsigned char rp_vec; vector unsigned char pp_vec; - vector unsigned char zero_vec = {0}; vector unsigned short a_vec,b_vec,c_vec,nearest_vec; vector signed short pa_vec,pb_vec,pc_vec,smallest_vec; - declare_common_vars(row_info,row,prev_row,bpp) + vsx_declare_common_vars(row_info,row,prev_row,bpp) rp -= bpp; if(istop >= bpp) istop -= bpp; @@ -536,9 +530,9 @@ void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row, rp_vec = vec_ld(0,rp); vec_ld_unaligned(pp_vec,pp); - a_vec = vsx_char_to_short(vec_perm(rp_vec , zero_vec , VEC_SELECT1_4),1,4); - b_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT1_4),1,4); - c_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_SELECT1_4),1,4); + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_4),1,4); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED1_4),1,4); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_4),1,4); pa_vec = (vector signed short) vec_sub(b_vec,c_vec); pb_vec = (vector signed short) 
vec_sub(a_vec , c_vec); pc_vec = vec_add(pa_vec,pb_vec); @@ -557,9 +551,9 @@ void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row, ); rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,4))); - a_vec = vsx_char_to_short(vec_perm(rp_vec , zero_vec , VEC_SELECT2_4),2,4); - b_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT2_4),2,4); - c_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_SELECT2_4),2,4); + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_4),2,4); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED2_4),2,4); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_4),2,4); pa_vec = (vector signed short) vec_sub(b_vec,c_vec); pb_vec = (vector signed short) vec_sub(a_vec , c_vec); pc_vec = vec_add(pa_vec,pb_vec); @@ -578,9 +572,9 @@ void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row, ); rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,4))); - a_vec = vsx_char_to_short(vec_perm(rp_vec , zero_vec , VEC_SELECT3_4),3,4); - b_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT3_4),3,4); - c_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_SELECT3_4),3,4); + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_4),3,4); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED3_4),3,4); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_4),3,4); pa_vec = (vector signed short) vec_sub(b_vec,c_vec); pb_vec = (vector signed short) vec_sub(a_vec , c_vec); pc_vec = vec_add(pa_vec,pb_vec); @@ -621,11 +615,10 @@ void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row, int a, b, c, pa, pb, pc, p; vector unsigned char rp_vec; vector unsigned char pp_vec; - vector unsigned char zero_vec = {0}; vector unsigned short a_vec,b_vec,c_vec,nearest_vec; vector signed short pa_vec,pb_vec,pc_vec,smallest_vec; - 
declare_common_vars(row_info,row,prev_row,bpp) + vsx_declare_common_vars(row_info,row,prev_row,bpp) rp -= bpp; if(istop >= bpp) istop -= bpp; @@ -657,9 +650,9 @@ void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row, rp_vec = vec_ld(0,rp); vec_ld_unaligned(pp_vec,pp); - a_vec = vsx_char_to_short(vec_perm(rp_vec , zero_vec , VEC_SELECT1_3),1,3); - b_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT1_3),1,3); - c_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_SELECT1_3),1,3); + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_3),1,3); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED1_3),1,3); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_3),1,3); pa_vec = (vector signed short) vec_sub(b_vec,c_vec); pb_vec = (vector signed short) vec_sub(a_vec , c_vec); pc_vec = vec_add(pa_vec,pb_vec); @@ -678,9 +671,9 @@ void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row, ); rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,3))); - a_vec = vsx_char_to_short(vec_perm(rp_vec , zero_vec , VEC_SELECT2_3),2,3); - b_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT2_3),2,3); - c_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_SELECT2_3),2,3); + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_3),2,3); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED2_3),2,3); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_3),2,3); pa_vec = (vector signed short) vec_sub(b_vec,c_vec); pb_vec = (vector signed short) vec_sub(a_vec , c_vec); pc_vec = vec_add(pa_vec,pb_vec); @@ -699,9 +692,9 @@ void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row, ); rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,3))); - a_vec = vsx_char_to_short(vec_perm(rp_vec , zero_vec , VEC_SELECT3_3),3,3); - b_vec = 
vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT3_3),3,3); - c_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_SELECT3_3),3,3); + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_3),3,3); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED3_3),3,3); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_3),3,3); pa_vec = (vector signed short) vec_sub(b_vec,c_vec); pb_vec = (vector signed short) vec_sub(a_vec , c_vec); pc_vec = vec_add(pa_vec,pb_vec); @@ -720,9 +713,9 @@ void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row, ); rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,3))); - a_vec = vsx_char_to_short(vec_perm(rp_vec , zero_vec , VEC_SELECT4_3),4,3); - b_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_AVG_SELECT4_3),4,3); - c_vec = vsx_char_to_short(vec_perm(pp_vec , zero_vec , VEC_SELECT4_3),4,3); + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED4_3),4,3); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED4_3),4,3); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED4_3),4,3); pa_vec = (vector signed short) vec_sub(b_vec,c_vec); pb_vec = (vector signed short) vec_sub(a_vec , c_vec); pc_vec = vec_add(pa_vec,pb_vec); From 483bcad21d1bc9298c76ffe674a620e7e1716578 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Fri, 10 Feb 2017 07:33:06 +0000 Subject: [PATCH 33/37] Changed minimum supported PowerPC CPU comment Changed it from POWER8 to POWER7 due to the last one supports VSX too. 
--- powerpc/filter_vsx_intrinsics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/powerpc/filter_vsx_intrinsics.c b/powerpc/filter_vsx_intrinsics.c index fdb50d84f..688466c7b 100644 --- a/powerpc/filter_vsx_intrinsics.c +++ b/powerpc/filter_vsx_intrinsics.c @@ -22,7 +22,7 @@ #if PNG_POWERPC_VSX_OPT > 0 #ifndef __VSX__ -# error "This code requires VSX support (POWER8 and later). Please provide -mvsx compiler flag." +# error "This code requires VSX support (POWER7 and later). Please provide -mvsx compiler flag." #endif #define vec_ld_unaligned(vec,data) vec = vec_vsx_ld(0,data) From 6f6c396604f012678a750cc48c929c821fc26ea8 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Fri, 10 Feb 2017 07:38:28 +0000 Subject: [PATCH 34/37] Added prefix to cmake VSX check warning --- CMakeLists.txt | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5d2a364d1..8dee004b0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,7 +44,7 @@ include(GNUInstallDirs) # needed packages -#Allow users to specify location of Zlib, +#Allow users to specify location of Zlib, # Useful if zlib is being built alongside this as a sub-project option(PNG_BUILD_ZLIB "Custom zlib Location, else find_package is used" OFF) @@ -98,7 +98,7 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^arm" OR arm/arm_init.c arm/filter_neon.S arm/filter_neon_intrinsics.c) - + if(${PNG_ARM_NEON} STREQUAL "on") add_definitions(-DPNG_ARM_NEON_OPT=2) elseif(${PNG_ARM_NEON} STREQUAL "check") @@ -127,13 +127,13 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^powerpc*" OR set(libpng_powerpc_sources powerpc/powerpc_init.c powerpc/filter_vsx_intrinsics.c) - + if(${PNG_POWERPC_VSX} STREQUAL "on") add_definitions(-DPNG_POWERPC_VSX_OPT=2) elseif(${PNG_POWERPC_VSX} STREQUAL "check") add_definitions(-DPNG_POWERPC_VSX_CHECK_SUPPORTED) message(WARNING - " Please check contrib/powerpc/README file for the list of supported OSes.") + "[PNG_POWERPC_VSX==check] Please check 
contrib/powerpc/README file for the list of supported OSes.") endif() else() add_definitions(-DPNG_POWERPC_VSX_OPT=0) @@ -875,4 +875,3 @@ endif() # to create msvc import lib for mingw compiled shared lib # pexports libpng.dll > libpng.def # lib /def:libpng.def /machine:x86 - From a343882c74ecde75b119072cae766ace88c6beac Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Sat, 11 Feb 2017 21:33:32 +0000 Subject: [PATCH 35/37] Made VSX code pedantic strict C90 compliant Fixed signed/unsigned comparations, png_byte and summations. Also fixed combound literals which are permited by C90. --- powerpc/filter_vsx_intrinsics.c | 119 +++++++++++++++++--------------- 1 file changed, 64 insertions(+), 55 deletions(-) diff --git a/powerpc/filter_vsx_intrinsics.c b/powerpc/filter_vsx_intrinsics.c index 688466c7b..3a2de79da 100644 --- a/powerpc/filter_vsx_intrinsics.c +++ b/powerpc/filter_vsx_intrinsics.c @@ -39,7 +39,7 @@ */ #define vsx_declare_common_vars(row_info,row,prev_row,offset) \ - png_size_t i;\ + png_byte i;\ png_bytep rp = row + offset;\ png_const_bytep pp = prev_row;\ png_size_t unaligned_top = 16 - (((png_size_t)rp % 16));\ @@ -100,69 +100,69 @@ void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, } -#define VSX_LEFTSHIFTED1_4 (vector unsigned char){16,16,16,16, 0, 1, 2, 3,16,16,16,16,16,16,16,16} -#define VSX_LEFTSHIFTED2_4 (vector unsigned char){16,16,16,16,16,16,16,16, 4, 5, 6, 7,16,16,16,16} -#define VSX_LEFTSHIFTED3_4 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 8, 9,10,11} +static const vector unsigned char VSX_LEFTSHIFTED1_4 = {16,16,16,16, 0, 1, 2, 3,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_LEFTSHIFTED2_4 = {16,16,16,16,16,16,16,16, 4, 5, 6, 7,16,16,16,16}; +static const vector unsigned char VSX_LEFTSHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 8, 9,10,11}; -#define VSX_LEFTSHIFTED1_3 (vector unsigned char){16,16,16, 0, 1, 2,16,16,16,16,16,16,16,16,16,16} -#define VSX_LEFTSHIFTED2_3 (vector unsigned 
char){16,16,16,16,16,16, 3, 4, 5,16,16,16,16,16,16,16} -#define VSX_LEFTSHIFTED3_3 (vector unsigned char){16,16,16,16,16,16,16,16,16, 6, 7, 8,16,16,16,16} -#define VSX_LEFTSHIFTED4_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 9,10,11,16} +static const vector unsigned char VSX_LEFTSHIFTED1_3 = {16,16,16, 0, 1, 2,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_LEFTSHIFTED2_3 = {16,16,16,16,16,16, 3, 4, 5,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_LEFTSHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 6, 7, 8,16,16,16,16}; +static const vector unsigned char VSX_LEFTSHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 9,10,11,16}; -#define VSX_NOT_SHIFTED1_4 (vector unsigned char){16,16,16,16, 4, 5, 6, 7,16,16,16,16,16,16,16,16} -#define VSX_NOT_SHIFTED2_4 (vector unsigned char){16,16,16,16,16,16,16,16, 8, 9,10,11,16,16,16,16} -#define VSX_NOT_SHIFTED3_4 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,15} +static const vector unsigned char VSX_NOT_SHIFTED1_4 = {16,16,16,16, 4, 5, 6, 7,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_NOT_SHIFTED2_4 = {16,16,16,16,16,16,16,16, 8, 9,10,11,16,16,16,16}; +static const vector unsigned char VSX_NOT_SHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,15}; -#define VSX_NOT_SHIFTED1_3 (vector unsigned char){16,16,16, 3, 4, 5,16,16,16,16,16,16,16,16,16,16} -#define VSX_NOT_SHIFTED2_3 (vector unsigned char){16,16,16,16,16,16, 6, 7, 8,16,16,16,16,16,16,16} -#define VSX_NOT_SHIFTED3_3 (vector unsigned char){16,16,16,16,16,16,16,16,16, 9,10,11,16,16,16,16} -#define VSX_NOT_SHIFTED4_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,16} +static const vector unsigned char VSX_NOT_SHIFTED1_3 = {16,16,16, 3, 4, 5,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_NOT_SHIFTED2_3 = {16,16,16,16,16,16, 6, 7, 8,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_NOT_SHIFTED3_3 = 
{16,16,16,16,16,16,16,16,16, 9,10,11,16,16,16,16}; +static const vector unsigned char VSX_NOT_SHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,16}; -#define VSX_CHAR_ZERO (vector unsigned char){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} +static const vector unsigned char VSX_CHAR_ZERO = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; #ifdef __LITTLE_ENDIAN__ -# define VSX_CHAR_TO_SHORT1_4 (vector unsigned char){ 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16,16} -# define VSX_CHAR_TO_SHORT2_4 (vector unsigned char){ 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16} -# define VSX_CHAR_TO_SHORT3_4 (vector unsigned char){12,16,13,16,14,16,15,16,16,16,16,16,16,16,16,16} +static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = { 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = { 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {12,16,13,16,14,16,15,16,16,16,16,16,16,16,16,16}; -# define VSX_SHORT_TO_CHAR1_4 (vector unsigned char){16,16,16,16, 0, 2, 4, 6,16,16,16,16,16,16,16,16} -# define VSX_SHORT_TO_CHAR2_4 (vector unsigned char){16,16,16,16,16,16,16,16, 0, 2, 4, 6,16,16,16,16} -# define VSX_SHORT_TO_CHAR3_4 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4, 6} +static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 0, 2, 4, 6,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 0, 2, 4, 6,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4, 6}; -# define VSX_CHAR_TO_SHORT1_3 (vector unsigned char){ 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16,16} -# define VSX_CHAR_TO_SHORT2_3 (vector unsigned char){ 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16,16} -# define VSX_CHAR_TO_SHORT3_3 (vector unsigned char){ 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16,16} -# define VSX_CHAR_TO_SHORT4_3 (vector unsigned 
char){12,16,13,16,14,16,16,16,16,16,16,16,16,16,16,16} +static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = { 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = { 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = { 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {12,16,13,16,14,16,16,16,16,16,16,16,16,16,16,16}; -# define VSX_SHORT_TO_CHAR1_3 (vector unsigned char){16,16,16, 0, 2, 4,16,16,16,16,16,16,16,16,16,16} -# define VSX_SHORT_TO_CHAR2_3 (vector unsigned char){16,16,16,16,16,16, 0, 2, 4,16,16,16,16,16,16,16} -# define VSX_SHORT_TO_CHAR3_3 (vector unsigned char){16,16,16,16,16,16,16,16,16, 0, 2, 4,16,16,16,16} -# define VSX_SHORT_TO_CHAR4_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4,16} +static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 0, 2, 4,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 0, 2, 4,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 0, 2, 4,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4,16}; #elif defined(__BIG_ENDIAN__) -# define VSX_CHAR_TO_SHORT1_4 (vector unsigned char){16, 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16} -# define VSX_CHAR_TO_SHORT2_4 (vector unsigned char){16, 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16} -# define VSX_CHAR_TO_SHORT3_4 (vector unsigned char){16,12,16,13,16,14,16,15,16,16,16,16,16,16,16,16} +static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = {16, 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = {16, 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = 
{16,12,16,13,16,14,16,15,16,16,16,16,16,16,16,16}; -# define VSX_SHORT_TO_CHAR1_4 (vector unsigned char){16,16,16,16, 1, 3, 5, 7,16,16,16,16,16,16,16,16} -# define VSX_SHORT_TO_CHAR2_4 (vector unsigned char){16,16,16,16,16,16,16,16, 1, 3, 5, 7,16,16,16,16} -# define VSX_SHORT_TO_CHAR3_4 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5, 7} +static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 1, 3, 5, 7,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 1, 3, 5, 7,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5, 7}; -# define VSX_CHAR_TO_SHORT1_3 (vector unsigned char){16, 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16} -# define VSX_CHAR_TO_SHORT2_3 (vector unsigned char){16, 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16} -# define VSX_CHAR_TO_SHORT3_3 (vector unsigned char){16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16} -# define VSX_CHAR_TO_SHORT4_3 (vector unsigned char){16,12,16,13,16,14,16,16,16,16,16,16,16,16,16,16} +static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = {16, 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = {16, 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = {16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {16,12,16,13,16,14,16,16,16,16,16,16,16,16,16,16}; -# define VSX_SHORT_TO_CHAR1_3 (vector unsigned char){16,16,16, 1, 3, 5,16,16,16,16,16,16,16,16,16,16} -# define VSX_SHORT_TO_CHAR2_3 (vector unsigned char){16,16,16,16,16,16, 1, 3, 5,16,16,16,16,16,16,16} -# define VSX_SHORT_TO_CHAR3_3 (vector unsigned char){16,16,16,16,16,16,16,16,16, 1, 3, 5,16,16,16,16} -# define VSX_SHORT_TO_CHAR4_3 (vector unsigned char){16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5,16} +static const vector unsigned char 
VSX_SHORT_TO_CHAR1_3 = {16,16,16, 1, 3, 5,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 1, 3, 5,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 1, 3, 5,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5,16}; #endif #define vsx_char_to_short(vec,offset,bpp) (vector unsigned short)vec_perm((vec),VSX_CHAR_ZERO,VSX_CHAR_TO_SHORT##offset##_##bpp) -#define vsx_short_to_char(vec,offset,bpp) vec_perm((vector unsigned char)(vec),VSX_CHAR_ZERO,VSX_SHORT_TO_CHAR##offset##_##bpp) +#define vsx_short_to_char(vec,offset,bpp) vec_perm(((vector unsigned char)(vec)),VSX_CHAR_ZERO,VSX_SHORT_TO_CHAR##offset##_##bpp) #ifdef PNG_USE_ABS # define vsx_abs(number) abs(number) @@ -173,7 +173,7 @@ void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { - const unsigned int bpp = 4; + const png_byte bpp = 4; vector unsigned char rp_vec; vector unsigned char part_vec; @@ -196,7 +196,11 @@ void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row, while( istop >= 16 ) { for(i=0;i < bpp ; i++) - *(rp+i) += *(rp+i - bpp); + { + *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); + rp++; + } + rp -= bpp; rp_vec = vec_ld(0,rp); part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4); @@ -226,7 +230,7 @@ void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row, void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { - const unsigned int bpp = 3; + const png_byte bpp = 3; vector unsigned char rp_vec; vector unsigned char part_vec; @@ -249,7 +253,11 @@ void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row, while( istop >= 16 ) { for(i=0;i < bpp ; i++) - *(rp+i) += *(rp+i - bpp); + { + *rp = 
(png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); + rp++; + } + rp -= bpp; rp_vec = vec_ld(0,rp); part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3); @@ -265,13 +273,14 @@ void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row, rp_vec = vec_add(rp_vec,part_vec); vec_st(rp_vec,0,rp); - rp += 16; + rp += 15; istop -= 16; /* Since 16 % bpp = 16 % 3 = 1, last element of array must * be proceeded manually */ - *(rp - 1) += *(rp - 1 - 3); + *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); + rp++; } if(istop > 0) @@ -285,7 +294,7 @@ void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row, void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { - const unsigned int bpp = 4; + const png_byte bpp = 4; vector unsigned char rp_vec; vector unsigned char pp_vec; @@ -372,7 +381,7 @@ void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row, void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { - const unsigned int bpp = 3; + const png_byte bpp = 3; vector unsigned char rp_vec; vector unsigned char pp_vec; @@ -490,7 +499,7 @@ void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row, void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { - const unsigned int bpp = 4; + const png_byte bpp = 4; int a, b, c, pa, pb, pc, p; vector unsigned char rp_vec; @@ -610,7 +619,7 @@ void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row, void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { - const unsigned int bpp = 3; + const png_byte bpp = 3; int a, b, c, pa, pb, pc, p; vector unsigned char rp_vec; From 29775cef5ec8b6434bcb6e62d216e73a5e8c20dc Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Sun, 12 Feb 2017 09:57:16 +0000 Subject: [PATCH 36/37] Fixed VSX compilation time checks __ppc64__ -> __PPC64__ --- 
pngpriv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pngpriv.h b/pngpriv.h index 3c2387e00..5aecda7a9 100644 --- a/pngpriv.h +++ b/pngpriv.h @@ -191,7 +191,7 @@ #endif #ifndef PNG_POWERPC_VSX_OPT -# if defined(__ppc64__) && defined(__ALTIVEC__) && defined(__VSX__) +# if defined(__PPC64__) && defined(__ALTIVEC__) && defined(__VSX__) # define PNG_POWERPC_VSX_OPT 2 # else # define PNG_POWERPC_VSX_OPT 0 From 3644db298b3eb02cf23c6b6578fd2b38dd2871c3 Mon Sep 17 00:00:00 2001 From: Vadim Barkov Date: Sun, 12 Feb 2017 12:14:03 +0000 Subject: [PATCH 37/37] Added VSX code to libconfig scripts --- scripts/pnglibconf.dfa | 27 +++++++++++++++++++++++++++ scripts/pnglibconf.h.prebuilt | 2 ++ 2 files changed, 29 insertions(+) diff --git a/scripts/pnglibconf.dfa b/scripts/pnglibconf.dfa index 019c06d47..9df2a4311 100644 --- a/scripts/pnglibconf.dfa +++ b/scripts/pnglibconf.dfa @@ -229,6 +229,33 @@ option ARM_NEON_API disabled requires ALIGNED_MEMORY enables SET_OPTION, option ARM_NEON_CHECK disabled requires ALIGNED_MEMORY, sets ARM_NEON_OPT 1 +# These options are specific to the PowerPC VSX hardware optimizations. +# +# POWERPC_VSX_OPT: unset: check at compile time (__PPC64__,__ALTIVEC__,__VSX__ +# must be defined by the compiler, typically as a result +# of specifying +# "-mvsx -maltivec" compiler flags) +# 0: disable (even if the CPU supports VSX.) +# 1: check at run time (via POWERPC_VSX_{API,CHECK}) +# 2: switch on unconditionally (inadvisable - instead pass +# -mvsx -maltivec to compiler options) +# When building libpng avoid using any setting other than '0'; '1' is +# set automatically when either 'API' or 'CHECK' are configured in, +# '2' should not be necessary as "-mvsx -maltivec" will achieve the same +# effect as well as applying VSX optimizations to the rest of the +# libpng code. 
+# POWERPC_VSX_API: (PNG_POWERPC_VSX == 1) allow the optimization to be switched on +# with png_set_option +# POWERPC_VSX_CHECK: (PNG_POWERPC_VSX == 1) compile a run-time check to see if VSX +# extensions are supported. This is supported not for all OSes +# (see contrib/powerpc/README) +setting POWERPC_VSX_OPT +option POWERPC_VSX_API disabled enables SET_OPTION, + sets POWERPC_VSX_OPT 1 +option POWERPC_VSX_CHECK disabled, + sets POWERPC_VSX_OPT 1 + + # These settings configure the default compression level (0-9) and 'strategy'; # strategy is as defined by the implementors of zlib. It describes the input # data and modifies the zlib parameters in an attempt to optimize the balance diff --git a/scripts/pnglibconf.h.prebuilt b/scripts/pnglibconf.h.prebuilt index 20c6873af..1cc5ed01f 100644 --- a/scripts/pnglibconf.h.prebuilt +++ b/scripts/pnglibconf.h.prebuilt @@ -20,6 +20,8 @@ #define PNG_ALIGNED_MEMORY_SUPPORTED /*#undef PNG_ARM_NEON_API_SUPPORTED*/ /*#undef PNG_ARM_NEON_CHECK_SUPPORTED*/ +/*#undef PNG_POWERPC_VSX_API_SUPPORTED*/ +/*#undef PNG_POWERPC_VSX_CHECK_SUPPORTED*/ #define PNG_BENIGN_ERRORS_SUPPORTED #define PNG_BENIGN_READ_ERRORS_SUPPORTED /*#undef PNG_BENIGN_WRITE_ERRORS_SUPPORTED*/