diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7bbfe58e2..8dee004b0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,7 +44,7 @@ include(GNUInstallDirs)
 
 # needed packages
 
-#Allow users to specify location of Zlib, 
+#Allow users to specify location of Zlib,
 # Useful if zlib is being built alongside this as a sub-project
 option(PNG_BUILD_ZLIB "Custom zlib Location, else find_package is used" OFF)
 
@@ -98,7 +98,7 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^arm" OR
     arm/arm_init.c
     arm/filter_neon.S
     arm/filter_neon_intrinsics.c)
- 
+
   if(${PNG_ARM_NEON} STREQUAL "on")
     add_definitions(-DPNG_ARM_NEON_OPT=2)
   elseif(${PNG_ARM_NEON} STREQUAL "check")
@@ -109,6 +109,38 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^arm" OR
   endif()
 endif()
 
+# set definitions and sources for powerpc
+if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^powerpc*" OR
+   ${CMAKE_SYSTEM_PROCESSOR} MATCHES "^ppc64*" )
+  set(PNG_POWERPC_VSX_POSSIBLE_VALUES check on off)
+  set(PNG_POWERPC_VSX "check" CACHE STRING "Enable POWERPC VSX optimizations:
+     check: (default) use internal checking code;
+     off: disable the optimizations;
+     on: turn on unconditionally.")
+  set_property(CACHE PNG_POWERPC_VSX PROPERTY STRINGS
+     ${PNG_POWERPC_VSX_POSSIBLE_VALUES})
+  list(FIND PNG_POWERPC_VSX_POSSIBLE_VALUES ${PNG_POWERPC_VSX} index)
+  if(index EQUAL -1)
+    message(FATAL_ERROR
+      "PNG_POWERPC_VSX must be one of [${PNG_POWERPC_VSX_POSSIBLE_VALUES}]")
+  elseif(NOT ${PNG_POWERPC_VSX} STREQUAL "off")
+    set(libpng_powerpc_sources
+      powerpc/powerpc_init.c
+      powerpc/filter_vsx_intrinsics.c)
+
+    if(${PNG_POWERPC_VSX} STREQUAL "on")
+      add_definitions(-DPNG_POWERPC_VSX_OPT=2)
+    elseif(${PNG_POWERPC_VSX} STREQUAL "check")
+      add_definitions(-DPNG_POWERPC_VSX_CHECK_SUPPORTED)
+      message(WARNING
+        "[PNG_POWERPC_VSX==check] Please check the contrib/powerpc-vsx/README file for the list of supported OSes.")
+    endif()
+  else()
+    add_definitions(-DPNG_POWERPC_VSX_OPT=0)
+  endif()
+endif()
+
+
 # SET LIBNAME
 set(PNG_LIB_NAME png${PNGLIB_MAJOR}${PNGLIB_MINOR})
 
@@ -400,6 +432,7 @@ set(libpng_sources
   pngwtran.c
   pngwutil.c
   ${libpng_arm_sources}
+  ${libpng_powerpc_sources}
   )
 set(pngtest_sources
   pngtest.c
@@ -842,4 +875,3 @@ endif()
 # to create msvc import lib for mingw compiled shared lib
 # pexports libpng.dll > libpng.def
 # lib /def:libpng.def /machine:x86
-
diff --git a/Makefile.am b/Makefile.am
index 2ec0e1b39..18ac74866 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -107,6 +107,11 @@ libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES += intel/intel_init.c\
 	intel/filter_sse2_intrinsics.c
 endif
 
+if PNG_POWERPC_VSX
+libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES += powerpc/powerpc_init.c\
+	powerpc/filter_vsx_intrinsics.c
+endif
+
 nodist_libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES = pnglibconf.h
 
 libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_LDFLAGS = -no-undefined -export-dynamic \
diff --git a/README b/README
index 28b02a43b..b21846707 100644
--- a/README
+++ b/README
@@ -179,8 +179,10 @@ Files in this distribution:
       pngwtran.c    =>  Write data transformations
       pngwutil.c    =>  Write utility functions
       arm           =>  Contains optimized code for the ARM platform
+      powerpc       =>  Contains optimized code for the PowerPC platform
       contrib       =>  Contributions
        arm-neon     =>  Optimized code for ARM-NEON platform
+       powerpc-vsx  =>  Optimized code for POWERPC-VSX platform
        examples     =>  Example programs
        gregbook     =>  source code for PNG reading and writing, from
                         Greg Roelofs' "PNG: The Definitive Guide",
diff --git a/configure.ac b/configure.ac
index c4b882ff8..e19474e21 100644
--- a/configure.ac
+++ b/configure.ac
@@ -426,6 +426,54 @@ AM_CONDITIONAL([PNG_INTEL_SSE],
    *)    test "$enable_intel_sse" != '';;
    esac])
 
+# PowerPC
+# ===
+#
+# PowerPC VSX (SIMD) support.
+
+AC_ARG_ENABLE([powerpc-vsx],
+AS_HELP_STRING([[[--enable-powerpc-vsx]]],
+      [Enable POWERPC VSX optimizations: =no/off, check, api, yes/on:]
+      [no/off: disable the optimizations; check: use internal checking code]
+      [api: disable by default, enable by a call to png_set_option]
+      [yes/on: turn on unconditionally.]
+      [If not specified: determined by the compiler.]),
+   [case "$enableval" in
+      no|off)
+         # disable the default enabling on __ppc64__ systems:
+         AC_DEFINE([PNG_POWERPC_VSX_OPT], [0],
+                   [Disable POWERPC VSX optimizations])
+         # Prevent inclusion of the platform specific files below:
+         enable_powerpc_vsx=no;;
+      check)
+         AC_DEFINE([PNG_POWERPC_VSX_CHECK_SUPPORTED], [],
+                   [Check for POWERPC VSX support at run-time])
+         AC_MSG_WARN([--enable-powerpc-vsx: please check the]
+                     [contrib/powerpc-vsx/README file for the list of]
+                     [supported OSes.]);;
+      api)
+         AC_DEFINE([PNG_POWERPC_VSX_API_SUPPORTED], [],
+                   [Turn on POWERPC VSX optimizations at run-time]);;
+      yes|on)
+         AC_DEFINE([PNG_POWERPC_VSX_OPT], [2],
+                   [Enable POWERPC VSX optimizations])
+         AC_MSG_WARN([--enable-powerpc-vsx: please specify 'check' or 'api'; if]
+                     [you want the optimizations unconditionally pass]
+                     ['-maltivec -mvsx' or '-mcpu=power8' to the compiler.]);;
+      *)
+         AC_MSG_ERROR([--enable-powerpc-vsx=${enable_powerpc_vsx}: invalid value])
+   esac])
+
+# Add PowerPC-specific files to all builds where the host_cpu is powerpc
+# ('powerpc*') or where PowerPC optimizations were explicitly requested (this
+# allows a fallback if a future host CPU does not match 'powerpc*')
+
+AM_CONDITIONAL([PNG_POWERPC_VSX],
+   [test "$enable_powerpc_vsx" != 'no' &&
+    case "$host_cpu" in
+      powerpc*|ppc64*) :;;
+      *)    test "$enable_powerpc_vsx" != '';;
+    esac])
+
 AC_MSG_NOTICE([[Extra options for compiler: $PNG_COPTS]])
 
 # Config files, substituting as above
diff --git a/contrib/powerpc-vsx/README b/contrib/powerpc-vsx/README
new file mode 100644
index 000000000..e566147ea
--- /dev/null
+++ b/contrib/powerpc-vsx/README
@@ -0,0 +1,81 @@
+OPERATING SYSTEM SPECIFIC POWERPC DETECTION
+-------------------------------------------
+
+Detection of the ability to execute POWERPC VSX instructions on the processor
+requires operating system support.  (The information is not available in user
+mode.)
+
+Currently this feature is supported only on the Linux platform.
+
+HOW TO USE THIS
+---------------
+
+This directory contains C code fragments that can be included in
+powerpc/powerpc_init.c by setting the macro PNG_POWERPC_VSX_FILE to the file
+name in "" or <> at build time.  This setting is not recorded in pnglibconf.h
+and can be changed simply by rebuilding powerpc/powerpc_init.o with the
+required macro definition.
+
+For any of this code to be used the POWERPC code must be enabled and run time
+checks must be supported.  I.e.:
+
+#if PNG_POWERPC_VSX_OPT > 0
+#ifdef PNG_POWERPC_VSX_CHECK_SUPPORTED
+
+This is done in a 'configure' build by passing configure the argument:
+
+   --enable-powerpc-vsx=check
+
+FILE FORMAT
+-----------
+
+Each file documents its testing status as of the last time it was tested (which
+may have been a long time ago):
+
+STATUS: one of:
+   SUPPORTED: This indicates that the file is included in the regularly
+         performed test builds and bugs are fixed when discovered.
+   COMPILED: This indicates that the code did compile at least once.  See the
+         more detailed description for the extent to which the result was
+         successful.
+   TESTED: This means the code was fully compiled into the libpng test
+         programs and these were run at least once.
+
+BUG REPORTS: an email address to which to send reports of problems
+
+The file is a fragment of C code.  It should not define any 'extern' symbols;
+everything should be static.  It must define the function:
+
+static int png_have_vsx(png_structp png_ptr);
+
+That function must return 1 if PowerPC VSX instructions are supported, 0 if
+not.  It must not execute png_error unless it detects a bug.  A png_error will
+prevent the reading of the PNG and in the future, writing too.
+
+BUG REPORTS
+-----------
+
+If you mail a bug report for any file that is not SUPPORTED there may only be
+limited response.  Consider fixing it and sending a patch to fix the problem -
+this is more likely to result in action.
+
+CONTRIBUTIONS
+-------------
+
+You may send contributions of new implementations to
+png-mng-implement@sourceforge.net.  Please write code in strict C90 C where
+possible.  Obviously OS dependencies are to be expected.  If you submit code
+you must have the author's permission and it must have a license that is
+acceptable to the current maintainer; in particular that license must permit
+modification and redistribution.
+
+Please try to make the contribution a single file and give the file a clear and
+unambiguous name that identifies the target OS.  If multiple files really are
+required put them all in a sub-directory.
+
+You must also be prepared to handle bug reports from users of the code, either
+by joining the png-mng-implement mailing list or by providing an email for the
+"BUG REPORTS" entry or both.  Please make sure that the header of the file
+contains the STATUS and BUG REPORTS fields as above.
+
+Please list the OS requirements as precisely as possible.  Ideally you should
+also list the environment in which the code has been tested and certainly list
+any environments where you suspect it might not work.
diff --git a/contrib/powerpc-vsx/linux.c b/contrib/powerpc-vsx/linux.c
new file mode 100644
index 000000000..8fc76e34c
--- /dev/null
+++ b/contrib/powerpc-vsx/linux.c
@@ -0,0 +1,56 @@
+/* contrib/powerpc-vsx/linux.c
+ *
+ * Copyright (c) 2016 Glenn Randers-Pehrson
+ * Written by Vadim Barkov, 2017.
+ *
+ * This code is released under the libpng license.
+ * For conditions of distribution and use, see the disclaimer
+ * and license in png.h
+ *
+ * STATUS: TESTED
+ * BUG REPORTS: png-mng-implement@sourceforge.net
+ *
+ * png_have_vsx implemented for Linux by reading the widely available
+ * pseudo-file /proc/cpuinfo.
+ *
+ * This code is strict ANSI-C and is probably moderately portable; it does
+ * however use <stdio.h> and it assumes that /proc/cpuinfo is never localized.
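+ *
+ * The check below scans each line of /proc/cpuinfo for the substring
+ * "altivec supported", which the Linux kernel reports for PowerPC CPUs
+ * that implement AltiVec/VSX.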
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "png.h"
+
+#ifndef MAXLINE
+#  define MAXLINE 1024
+#endif
+
+static int
+png_have_vsx(png_structp png_ptr)
+{
+   FILE *f;
+
+   const char *string = "altivec supported";
+   char input[MAXLINE];
+   char *token = NULL;
+
+   PNG_UNUSED(png_ptr)
+
+   f = fopen("/proc/cpuinfo", "r");
+   if (f != NULL)
+   {
+      memset(input,0,MAXLINE);
+      while(fgets(input,MAXLINE,f) != NULL)
+      {
+         token = strstr(input,string);
+         if(token != NULL)
+         {
+            fclose(f);
+            return 1;
+         }
+      }
+      fclose(f);
+   }
+#ifdef PNG_WARNINGS_SUPPORTED
+   else
+      png_warning(png_ptr, "/proc/cpuinfo open failed");
+#endif
+   return 0;
+}
diff --git a/contrib/powerpc-vsx/linux_aux.c b/contrib/powerpc-vsx/linux_aux.c
new file mode 100644
index 000000000..d946d6cad
--- /dev/null
+++ b/contrib/powerpc-vsx/linux_aux.c
@@ -0,0 +1,35 @@
+/* contrib/powerpc-vsx/linux_aux.c
+ *
+ * Copyright (c) 2016 Glenn Randers-Pehrson
+ * Written by Vadim Barkov, 2017.
+ *
+ * This code is released under the libpng license.
+ * For conditions of distribution and use, see the disclaimer
+ * and license in png.h
+ *
+ * STATUS: TESTED
+ * BUG REPORTS: png-mng-implement@sourceforge.net
+ *
+ * png_have_vsx implemented for Linux by using the auxiliary vector mechanism.
+ *
+ * This code is strict ANSI-C and is probably moderately portable; it does
+ * however use <sys/auxv.h> and the Linux-specific getauxval mechanism.
+ */
+
+#include <sys/auxv.h>
+#include "png.h"
+
+static int
+png_have_vsx(png_structp png_ptr)
+{
+   const unsigned long auxv = getauxval(AT_HWCAP);
+
+   PNG_UNUSED(png_ptr)
+
+   if (auxv & (PPC_FEATURE_HAS_ALTIVEC|PPC_FEATURE_HAS_VSX))
+      return 1;
+   else
+      return 0;
+}
+
diff --git a/png.h b/png.h
index f200d7a64..22620098b 100644
--- a/png.h
+++ b/png.h
@@ -3225,7 +3225,10 @@ PNG_EXPORT(245, int, png_image_write_to_memory, (png_imagep image, void *memory,
 #  define PNG_MIPS_MSA 6 /* HARDWARE: MIPS Msa SIMD instructions supported */
 #endif
 #define PNG_IGNORE_ADLER32 8
-#define PNG_OPTION_NEXT 10 /* Next option - numbers must be even */
+#ifdef PNG_POWERPC_VSX_API_SUPPORTED
+#  define PNG_POWERPC_VSX 10 /* HARDWARE: PowerPC VSX SIMD instructions supported */
+#endif
+#define PNG_OPTION_NEXT 12 /* Next option - numbers must be even */
 
 /* Return values: NOTE: there are four values and 'off' is *not* zero */
 #define PNG_OPTION_UNSET 0 /* Unset - defaults to off */
diff --git a/pngpriv.h b/pngpriv.h
index 083cda643..5aecda7a9 100644
--- a/pngpriv.h
+++ b/pngpriv.h
@@ -190,6 +190,14 @@
 #  endif
 #endif
 
+#ifndef PNG_POWERPC_VSX_OPT
+#  if defined(__PPC64__) && defined(__ALTIVEC__) && defined(__VSX__)
+#     define PNG_POWERPC_VSX_OPT 2
+#  else
+#     define PNG_POWERPC_VSX_OPT 0
+#  endif
+#endif
+
 #ifndef PNG_INTEL_SSE_OPT
 #   ifdef PNG_INTEL_SSE
       /* Only check for SSE if the build configuration has been modified to
@@ -246,6 +254,11 @@
 #  endif
 #endif /* PNG_MIPS_MSA_OPT > 0 */
 
+#if PNG_POWERPC_VSX_OPT > 0
+#  define PNG_FILTER_OPTIMIZATIONS png_init_filter_functions_vsx
+#  define PNG_POWERPC_VSX_IMPLEMENTATION 1
+#endif
+
 /* Is this a build of a DLL where compilation of the object modules requires
  * different preprocessor settings to those required for a simple library?  If
@@ -1292,6 +1305,23 @@ PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_msa,(png_row_infop
    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
 #endif
 
+#if PNG_POWERPC_VSX_OPT > 0
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_up_vsx,(png_row_infop row_info,
+    png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub3_vsx,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub4_vsx,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg3_vsx,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg4_vsx,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_vsx,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_vsx,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+#endif
+
 #if PNG_INTEL_SSE_IMPLEMENTATION > 0
 PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub3_sse2,(png_row_infop
    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
diff --git a/powerpc/filter_vsx_intrinsics.c b/powerpc/filter_vsx_intrinsics.c
new file mode 100644
index 000000000..3a2de79da
--- /dev/null
+++ b/powerpc/filter_vsx_intrinsics.c
@@ -0,0 +1,767 @@
+/* filter_vsx_intrinsics.c - PowerPC optimised filter functions
+ *
+ * Copyright (c) 2016 Glenn Randers-Pehrson
+ * Written by Vadim Barkov, 2017.
+ * Last changed in libpng 1.6.25 [September 1, 2016]
+ *
+ * This code is released under the libpng license.
+ * For conditions of distribution and use, see the disclaimer
+ * and license in png.h
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include "../pngpriv.h"
+
+#ifdef PNG_READ_SUPPORTED
+
+/* This code requires -maltivec and -mvsx on the command line: */
+#if PNG_POWERPC_VSX_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */
+
+#include <altivec.h>
+
+#if PNG_POWERPC_VSX_OPT > 0
+
+#ifndef __VSX__
+#  error "This code requires VSX support (POWER7 and later). Please provide the -mvsx compiler flag."
+#endif
+
+#define vec_ld_unaligned(vec,data) vec = vec_vsx_ld(0,data)
+#define vec_st_unaligned(vec,data) vec_vsx_st(vec,0,data)
+
+
+/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
+ * They're positioned like this:
+ *    prev:  c b
+ *    row:   a d
+ * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
+ * whichever of a, b, or c is closest to p=a+b-c.
+ * (this is taken from ../intel/filter_sse2_intrinsics.c)
+ */
+
+#define vsx_declare_common_vars(row_info,row,prev_row,offset) \
+   png_byte i;\
+   png_bytep rp = row + offset;\
+   png_const_bytep pp = prev_row;\
+   png_size_t unaligned_top = 16 - (((png_size_t)rp % 16));\
+   png_size_t istop;\
+   if(unaligned_top == 16)\
+      unaligned_top = 0;\
+   istop = row_info->rowbytes;\
+   if((unaligned_top < istop))\
+      istop -= unaligned_top;\
+   else{\
+      unaligned_top = istop;\
+      istop = 0;\
+   }
+
+void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row,
+                                png_const_bytep prev_row)
+{
+   vector unsigned char rp_vec;
+   vector unsigned char pp_vec;
+   vsx_declare_common_vars(row_info,row,prev_row,0)
+
+   /* Altivec operations require 16-byte aligned data
+    * but input can be unaligned. So we calculate
+    * unaligned part as usual.
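+    *
+    * The scalar loop below consumes bytes until rp reaches a 16-byte
+    * boundary, the vector loop then processes 16 bytes per iteration,
+    * and any remaining tail bytes are handled by scalar code again.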
+ */ + for (i = 0; i < unaligned_top; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); + rp++; + } + + /* Using SIMD while we can */ + while( istop >= 16 ) + { + rp_vec = vec_ld(0,rp); + vec_ld_unaligned(pp_vec,pp); + + rp_vec = vec_add(rp_vec,pp_vec); + + vec_st(rp_vec,0,rp); + + pp += 16; + rp += 16; + istop -= 16; + } + + if(istop > 0) + { + /* If byte count of row is not divisible by 16 + * we will process remaining part as usual + */ + for (i = 0; i < istop; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); + rp++; + } +} + +} + +static const vector unsigned char VSX_LEFTSHIFTED1_4 = {16,16,16,16, 0, 1, 2, 3,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_LEFTSHIFTED2_4 = {16,16,16,16,16,16,16,16, 4, 5, 6, 7,16,16,16,16}; +static const vector unsigned char VSX_LEFTSHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 8, 9,10,11}; + +static const vector unsigned char VSX_LEFTSHIFTED1_3 = {16,16,16, 0, 1, 2,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_LEFTSHIFTED2_3 = {16,16,16,16,16,16, 3, 4, 5,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_LEFTSHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 6, 7, 8,16,16,16,16}; +static const vector unsigned char VSX_LEFTSHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 9,10,11,16}; + +static const vector unsigned char VSX_NOT_SHIFTED1_4 = {16,16,16,16, 4, 5, 6, 7,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_NOT_SHIFTED2_4 = {16,16,16,16,16,16,16,16, 8, 9,10,11,16,16,16,16}; +static const vector unsigned char VSX_NOT_SHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,15}; + +static const vector unsigned char VSX_NOT_SHIFTED1_3 = {16,16,16, 3, 4, 5,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_NOT_SHIFTED2_3 = {16,16,16,16,16,16, 6, 7, 8,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_NOT_SHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 9,10,11,16,16,16,16}; +static const vector unsigned char VSX_NOT_SHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,16}; + +static const vector unsigned char VSX_CHAR_ZERO = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; +#ifdef __LITTLE_ENDIAN__ + +static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = { 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = { 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {12,16,13,16,14,16,15,16,16,16,16,16,16,16,16,16}; + +static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 0, 2, 4, 6,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 0, 2, 4, 6,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4, 6}; + +static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = { 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = { 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = { 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {12,16,13,16,14,16,16,16,16,16,16,16,16,16,16,16}; + +static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 0, 2, 4,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 0, 2, 4,16,16,16,16,16,16,16}; +static const vector unsigned char 
VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 0, 2, 4,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4,16}; + +#elif defined(__BIG_ENDIAN__) + +static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = {16, 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = {16, 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {16,12,16,13,16,14,16,15,16,16,16,16,16,16,16,16}; + +static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 1, 3, 5, 7,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 1, 3, 5, 7,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5, 7}; + +static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = {16, 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = {16, 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = {16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {16,12,16,13,16,14,16,16,16,16,16,16,16,16,16,16}; + +static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 1, 3, 5,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 1, 3, 5,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 1, 3, 5,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5,16}; + +#endif + +#define vsx_char_to_short(vec,offset,bpp) (vector unsigned short)vec_perm((vec),VSX_CHAR_ZERO,VSX_CHAR_TO_SHORT##offset##_##bpp) +#define vsx_short_to_char(vec,offset,bpp) vec_perm(((vector unsigned char)(vec)),VSX_CHAR_ZERO,VSX_SHORT_TO_CHAR##offset##_##bpp) + +#ifdef PNG_USE_ABS +# define vsx_abs(number) abs(number) +#else +# define vsx_abs(number) (number > 0) ? (number) : -(number) +#endif + +void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + const png_byte bpp = 4; + + vector unsigned char rp_vec; + vector unsigned char part_vec; + + vsx_declare_common_vars(row_info,row,prev_row,bpp) + + PNG_UNUSED(pp) + + /* Altivec operations require 16-byte aligned data + * but input can be unaligned. So we calculate + * unaligned part as usual. 
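+    *
+    * In the SIMD loop the first bpp bytes of each 16-byte block are
+    * computed with scalar code, then three shifted vector adds
+    * (VSX_LEFTSHIFTED*_4) propagate the running Sub sums across the
+    * remaining pixels of the register.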
+    */
+   for (i = 0; i < unaligned_top; i++)
+   {
+      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
+      rp++;
+   }
+
+   /* Using SIMD while we can */
+   while( istop >= 16 )
+   {
+      for(i=0;i < bpp ; i++)
+      {
+         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
+         rp++;
+      }
+      rp -= bpp;
+
+      rp_vec = vec_ld(0,rp);
+      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4);
+      rp_vec = vec_add(rp_vec,part_vec);
+
+      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4);
+      rp_vec = vec_add(rp_vec,part_vec);
+
+      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4);
+      rp_vec = vec_add(rp_vec,part_vec);
+
+      vec_st(rp_vec,0,rp);
+
+      rp += 16;
+      istop -= 16;
+   }
+
+   if(istop > 0)
+      for (i = 0; i < istop % 16; i++)
+      {
+         *rp = (png_byte)(((int)(*rp) + (int)(*(rp - bpp))) & 0xff);
+         rp++;
+      }
+
+}
+
+void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row,
+                                  png_const_bytep prev_row)
+{
+   const png_byte bpp = 3;
+
+   vector unsigned char rp_vec;
+   vector unsigned char part_vec;
+
+   vsx_declare_common_vars(row_info,row,prev_row,bpp)
+
+   PNG_UNUSED(pp)
+
+   /* Altivec operations require 16-byte aligned data
+    * but input can be unaligned. So we calculate
+    * unaligned part as usual.
+    */
+   for (i = 0; i < unaligned_top; i++)
+   {
+      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
+      rp++;
+   }
+
+   /* Using SIMD while we can */
+   while( istop >= 16 )
+   {
+      for(i=0;i < bpp ; i++)
+      {
+         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
+         rp++;
+      }
+      rp -= bpp;
+
+      rp_vec = vec_ld(0,rp);
+      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3);
+      rp_vec = vec_add(rp_vec,part_vec);
+
+      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3);
+      rp_vec = vec_add(rp_vec,part_vec);
+
+      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3);
+      rp_vec = vec_add(rp_vec,part_vec);
+
+      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3);
+      rp_vec = vec_add(rp_vec,part_vec);
+
+      vec_st(rp_vec,0,rp);
+      rp += 15;
+      istop -= 16;
+
+      /* Since 16 % bpp = 16 % 3 = 1, the last element of the row must
+       * be processed manually
+       */
+      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
+      rp++;
+   }
+
+   if(istop > 0)
+      for (i = 0; i < istop % 16; i++)
+      {
+         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
+         rp++;
+      }
+}
+
+void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row,
+                                  png_const_bytep prev_row)
+{
+   const png_byte bpp = 4;
+
+   vector unsigned char rp_vec;
+   vector unsigned char pp_vec;
+   vector unsigned char pp_part_vec;
+   vector unsigned char rp_part_vec;
+   vector unsigned char avg_vec;
+
+   vsx_declare_common_vars(row_info,row,prev_row,bpp)
+   rp -= bpp;
+   if(istop >= bpp)
+      istop -= bpp;
+
+   for (i = 0; i < bpp; i++)
+   {
+      *rp = (png_byte)(((int)(*rp) +
+         ((int)(*pp++) / 2 )) & 0xff);
+
+      rp++;
+   }
+
+   /* Altivec operations require 16-byte aligned data
+    * but input can be unaligned. So we calculate
+    * unaligned part as usual.
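+    *
+    * Note that vec_avg() rounds up ((a+b+1)>>1) while the Avg filter
+    * needs the truncated average, so the SIMD loop below subtracts
+    * (a^b)&1 from each rounded result to obtain floor((a+b)/2).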
+ */ + for (i = 0; i < unaligned_top; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } + + /* Using SIMD while we can */ + while( istop >= 16 ) + { + for(i=0;i < bpp ; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } + rp -= bpp; + pp -= bpp; + + vec_ld_unaligned(pp_vec,pp); + rp_vec = vec_ld(0,rp); + + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_4); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_4); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_4); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + vec_st(rp_vec,0,rp); + + rp += 16; + pp += 16; + istop -= 16; + } + + if(istop > 0) + for (i = 0; i < istop % 16; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } +} + +void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + const png_byte bpp = 3; + + vector unsigned char rp_vec; + vector unsigned char pp_vec; + vector unsigned char pp_part_vec; + vector unsigned char rp_part_vec; + vector unsigned char avg_vec; + + vsx_declare_common_vars(row_info,row,prev_row,bpp) + rp -= bpp; + if(istop >= bpp) + istop -= bpp; + + for (i = 0; i < bpp; i++) + { + *rp = (png_byte)(((int)(*rp) + + ((int)(*pp++) / 2 )) & 0xff); + + rp++; + } + + /* Altivec operations require 16-byte aligned data + * but input can be unaligned. So we calculate + * unaligned part as usual. 
+    */
+   for (i = 0; i < unaligned_top; i++)
+   {
+      *rp = (png_byte)(((int)(*rp) +
+         (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
+
+      rp++;
+   }
+
+   /* Using SIMD while we can */
+   while( istop >= 16 )
+   {
+      for(i=0;i < bpp ; i++)
+      {
+         *rp = (png_byte)(((int)(*rp) +
+            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
+
+         rp++;
+      }
+      rp -= bpp;
+      pp -= bpp;
+
+      vec_ld_unaligned(pp_vec,pp);
+      rp_vec = vec_ld(0,rp);
+
+      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3);
+      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_3);
+      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
+      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
+      rp_vec = vec_add(rp_vec,avg_vec);
+
+      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3);
+      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_3);
+      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
+      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
+      rp_vec = vec_add(rp_vec,avg_vec);
+
+      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3);
+      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_3);
+      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
+      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
+      rp_vec = vec_add(rp_vec,avg_vec);
+
+      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3);
+      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED4_3);
+      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
+      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
+      rp_vec = vec_add(rp_vec,avg_vec);
+
+      vec_st(rp_vec,0,rp);
+
+      rp += 15;
+      pp += 15;
+      istop -= 16;
+
+      /* Since 16 % bpp = 16 % 3 = 1, the last element of the row must
+       * be processed manually
+       */
+      *rp = (png_byte)(((int)(*rp) +
+         (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
+      rp++;
+   }
+
+   if(istop > 0)
+      for (i = 0; i < istop % 16; i++)
+      {
+         *rp = (png_byte)(((int)(*rp) +
+            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
+
+         rp++;
+      }
+}
+
+/* Bytewise c ? t : e. */
+#define if_then_else(c,t,e) vec_sel(e,t,c)
+
+#define vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) {\
+      c = *(pp - bpp);\
+      a = *(rp - bpp);\
+      b = *pp++;\
+      p = b - c;\
+      pc = a - c;\
+      pa = vsx_abs(p);\
+      pb = vsx_abs(pc);\
+      pc = vsx_abs(p + pc);\
+      if (pb < pa) pa = pb, a = b;\
+      if (pc < pa) a = c;\
+      a += *rp;\
+      *rp++ = (png_byte)a;\
+   }
+
+void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row,
+                                    png_const_bytep prev_row)
+{
+   const png_byte bpp = 4;
+
+   int a, b, c, pa, pb, pc, p;
+   vector unsigned char rp_vec;
+   vector unsigned char pp_vec;
+   vector unsigned short a_vec,b_vec,c_vec,nearest_vec;
+   vector signed short pa_vec,pb_vec,pc_vec,smallest_vec;
+
+   vsx_declare_common_vars(row_info,row,prev_row,bpp)
+   rp -= bpp;
+   if(istop >= bpp)
+      istop -= bpp;
+
+   /* Process the first pixel in the row completely (this is the same as 'up'
+    * because there is only one candidate predictor for the first row).
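+    *
+    * In the vector loop below each group of pixels is widened to 16-bit
+    * lanes (vsx_char_to_short) so that pa, pb and pc can be formed
+    * without byte overflow; the nearest predictor is then selected with
+    * vec_cmpeq()/vec_sel() and narrowed back to bytes.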
+ */ + for(i = 0; i < bpp ; i++) + { + *rp = (png_byte)( *rp + *pp); + rp++; + pp++; + } + + for(i = 0; i < unaligned_top ; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } + + while( istop >= 16) + { + for(i = 0; i < bpp ; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } + + rp -= bpp; + pp -= bpp; + rp_vec = vec_ld(0,rp); + vec_ld_unaligned(pp_vec,pp); + + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_4),1,4); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED1_4),1,4); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_4),1,4); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,4))); + + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_4),2,4); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED2_4),2,4); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_4),2,4); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,4))); + + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_4),3,4); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED3_4),3,4); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_4),3,4); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,4))); + + vec_st(rp_vec,0,rp); + + rp += 16; + pp += 16; + istop -= 16; + } + + if(istop > 0) + for (i = 0; i < istop % 16; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } +} + +void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + const png_byte bpp = 3; + + int a, b, c, pa, pb, pc, p; + vector unsigned char rp_vec; + vector unsigned char pp_vec; + vector unsigned short a_vec,b_vec,c_vec,nearest_vec; + vector signed short pa_vec,pb_vec,pc_vec,smallest_vec; + + vsx_declare_common_vars(row_info,row,prev_row,bpp) + rp -= bpp; + if(istop >= bpp) + istop -= bpp; + + /* Process the first pixel in the row completely (this is the same as 'up' + * because there is only one candidate predictor for the first row). 
+ */ + for(i = 0; i < bpp ; i++) + { + *rp = (png_byte)( *rp + *pp); + rp++; + pp++; + } + + for(i = 0; i < unaligned_top ; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } + + while( istop >= 16) + { + for(i = 0; i < bpp ; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } + + rp -= bpp; + pp -= bpp; + rp_vec = vec_ld(0,rp); + vec_ld_unaligned(pp_vec,pp); + + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_3),1,3); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED1_3),1,3); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_3),1,3); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,3))); + + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_3),2,3); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED2_3),2,3); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_3),2,3); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,3))); + + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_3),3,3); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED3_3),3,3); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_3),3,3); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,3))); + + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED4_3),4,3); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED4_3),4,3); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED4_3),4,3); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,4,3))); + + vec_st(rp_vec,0,rp); + + rp += 15; + pp += 15; + istop -= 16; + + /* Since 16 % bpp = 16 % 3 = 1, 
the last element of the row must
+       * be processed manually
+       */
+      vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
+   }
+
+   if(istop > 0)
+      for (i = 0; i < istop % 16; i++)
+      {
+         vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
+      }
+}
+
+#endif /* PNG_POWERPC_VSX_OPT > 0 */
+#endif /* PNG_POWERPC_VSX_IMPLEMENTATION == 1 (intrinsics) */
+#endif /* READ */
diff --git a/powerpc/powerpc_init.c b/powerpc/powerpc_init.c
new file mode 100644
index 000000000..9bbae8b5b
--- /dev/null
+++ b/powerpc/powerpc_init.c
@@ -0,0 +1,122 @@
+
+/* powerpc_init.c - POWERPC optimised filter functions
+ *
+ * This code is released under the libpng license.
+ * For conditions of distribution and use, see the disclaimer
+ * and license in png.h
+ */
+/* Below, after checking __linux__, various non-C90 POSIX 1003.1 functions are
+ * called.
+ */
+#define _POSIX_SOURCE 1
+
+#include <stdio.h>
+#include "../pngpriv.h"
+
+#ifdef PNG_READ_SUPPORTED
+
+#if PNG_POWERPC_VSX_OPT > 0
+#ifdef PNG_POWERPC_VSX_CHECK_SUPPORTED /* Do run-time checks */
+/* WARNING: it is strongly recommended that you do not build libpng with
+ * run-time checks for CPU features if at all possible.  In the case of the
+ * PowerPC VSX instructions there is no processor-specific way of detecting
+ * the presence of the required support, therefore run-time detection is
+ * extremely OS specific.
+ *
+ * You may set the macro PNG_POWERPC_VSX_FILE to the file name of a file
+ * containing a fragment of C source code which defines the png_have_vsx
+ * function.  There are a number of implementations in contrib/powerpc-vsx,
+ * but the only one that has partial support is contrib/powerpc-vsx/linux.c -
+ * a generic Linux implementation which reads /proc/cpuinfo.
+ */
+#ifndef PNG_POWERPC_VSX_FILE
+#  ifdef __linux__
+#     define PNG_POWERPC_VSX_FILE "contrib/powerpc-vsx/linux_aux.c"
+#  endif
+#endif
+
+#ifdef PNG_POWERPC_VSX_FILE
+
+#include <signal.h> /* for sig_atomic_t */
+static int png_have_vsx(png_structp png_ptr);
+#include PNG_POWERPC_VSX_FILE
+
+#else /* PNG_POWERPC_VSX_FILE */
+#  error "PNG_POWERPC_VSX_FILE undefined: no support for run-time POWERPC VSX checks"
+#endif /* PNG_POWERPC_VSX_FILE */
+#endif /* PNG_POWERPC_VSX_CHECK_SUPPORTED */
+
+void
+png_init_filter_functions_vsx(png_structp pp, unsigned int bpp)
+{
+   /* The switch statement is compiled in for POWERPC_VSX_API, the call to
+    * png_have_vsx is compiled in for POWERPC_VSX_CHECK.  If both are defined
+    * the check is only performed if the API has not set the PowerPC option on
+    * or off explicitly.  In this case the check controls what happens.
+    */
+
+#ifdef PNG_POWERPC_VSX_API_SUPPORTED
+   switch ((pp->options >> PNG_POWERPC_VSX) & 3)
+   {
+      case PNG_OPTION_UNSET:
+         /* Allow the run-time check to execute if it has been enabled -
+          * thus both API and CHECK can be turned on.  If it isn't supported
+          * this case will fall through to the 'default' below, which just
+          * returns.
+          */
+#endif /* PNG_POWERPC_VSX_API_SUPPORTED */
+#ifdef PNG_POWERPC_VSX_CHECK_SUPPORTED
+         {
+            static volatile sig_atomic_t no_vsx = -1; /* not checked */
+
+            if (no_vsx < 0)
+               no_vsx = !png_have_vsx(pp);
+
+            if (no_vsx)
+               return;
+         }
+#ifdef PNG_POWERPC_VSX_API_SUPPORTED
+         break;
+#endif
+#endif /* PNG_POWERPC_VSX_CHECK_SUPPORTED */
+
+#ifdef PNG_POWERPC_VSX_API_SUPPORTED
+      default: /* OFF or INVALID */
+         return;
+
+      case PNG_OPTION_ON:
+         /* Option turned on */
+         break;
+   }
+#endif
+
+   /* IMPORTANT: any new internal functions used here must be declared using
+    * PNG_INTERNAL_FUNCTION in ../pngpriv.h.
+    * This is required so that the 'prefix' option to configure works:
+    *
+    *    ./configure --with-libpng-prefix=foobar_
+    *
+    * Verify you have got this right by running the above command, doing a
+    * build and examining pngprefix.h; it must contain a #define for every
+    * external function you add.  (Notice that this happens automatically for
+    * the initialization function.)
+    */
+   pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_vsx;
+
+   if (bpp == 3)
+   {
+      pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_vsx;
+      pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_vsx;
+      pp->read_filter[PNG_FILTER_VALUE_PAETH-1] =
+         png_read_filter_row_paeth3_vsx;
+   }
+
+   else if (bpp == 4)
+   {
+      pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_vsx;
+      pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_vsx;
+      pp->read_filter[PNG_FILTER_VALUE_PAETH-1] =
+         png_read_filter_row_paeth4_vsx;
+   }
+}
+#endif /* PNG_POWERPC_VSX_OPT > 0 */
+#endif /* READ */
diff --git a/scripts/pnglibconf.dfa b/scripts/pnglibconf.dfa
index 019c06d47..9df2a4311 100644
--- a/scripts/pnglibconf.dfa
+++ b/scripts/pnglibconf.dfa
@@ -229,6 +229,33 @@ option ARM_NEON_API disabled requires ALIGNED_MEMORY enables SET_OPTION,
 option ARM_NEON_CHECK disabled requires ALIGNED_MEMORY,
   sets ARM_NEON_OPT 1
 
+# These options are specific to the PowerPC VSX hardware optimizations.
+#
+# POWERPC_VSX_OPT: unset: check at compile time (__PPC64__, __ALTIVEC__ and
+#                     __VSX__ must be defined by the compiler, typically as a
+#                     result of specifying the "-mvsx -maltivec" compiler
+#                     flags)
+#                  0: disable (even if the CPU supports VSX.)
+#                  1: check at run time (via POWERPC_VSX_{API,CHECK})
+#                  2: switch on unconditionally (inadvisable - instead pass
+#                     -mvsx -maltivec to the compiler options)
+#                  When building libpng avoid using any setting other than
+#                  '0'; '1' is set automatically when either 'API' or 'CHECK'
+#                  are configured in, '2' should not be necessary as
+#                  "-mvsx -maltivec" will achieve the same effect as well as
+#                  applying VSX optimizations to the rest of the libpng code.
+# POWERPC_VSX_API: (POWERPC_VSX_OPT == 1) allow the optimization to be
+#                  switched on with png_set_option
+# POWERPC_VSX_CHECK: (POWERPC_VSX_OPT == 1) compile a run-time check to see
+#                  if the VSX extensions are supported.  This is not supported
+#                  on all OSes (see contrib/powerpc-vsx/README)
+setting POWERPC_VSX_OPT
+option POWERPC_VSX_API disabled enables SET_OPTION,
+  sets POWERPC_VSX_OPT 1
+option POWERPC_VSX_CHECK disabled,
+  sets POWERPC_VSX_OPT 1
+
+
 # These settings configure the default compression level (0-9) and 'strategy';
 # strategy is as defined by the implementors of zlib.  It describes the input
 # data and modifies the zlib parameters in an attempt to optimize the balance
diff --git a/scripts/pnglibconf.h.prebuilt b/scripts/pnglibconf.h.prebuilt
index 20c6873af..1cc5ed01f 100644
--- a/scripts/pnglibconf.h.prebuilt
+++ b/scripts/pnglibconf.h.prebuilt
@@ -20,6 +20,8 @@
 #define PNG_ALIGNED_MEMORY_SUPPORTED
 /*#undef PNG_ARM_NEON_API_SUPPORTED*/
 /*#undef PNG_ARM_NEON_CHECK_SUPPORTED*/
+/*#undef PNG_POWERPC_VSX_API_SUPPORTED*/
+/*#undef PNG_POWERPC_VSX_CHECK_SUPPORTED*/
 #define PNG_BENIGN_ERRORS_SUPPORTED
 #define PNG_BENIGN_READ_ERRORS_SUPPORTED
 /*#undef PNG_BENIGN_WRITE_ERRORS_SUPPORTED*/
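
Reviewer note (not part of the patch): with a libpng built using
--enable-powerpc-vsx=api, an application opts in to the VSX read-filter
optimizations through the standard png_set_option mechanism.  A minimal
sketch; the wrapper name is hypothetical, but png_set_option,
PNG_POWERPC_VSX and PNG_OPTION_ON come from the png.h changes above:

   #include <png.h>

   static png_structp
   create_read_struct_with_vsx(void)
   {
      png_structp png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING,
          NULL, NULL, NULL);

      if (png_ptr != NULL)
      {
   #ifdef PNG_POWERPC_VSX /* only defined when the API option is compiled in */
         /* Must be called before any row processing begins; the previous
          * value of the option (or PNG_OPTION_INVALID) is returned.
          */
         png_set_option(png_ptr, PNG_POWERPC_VSX, PNG_OPTION_ON);
   #endif
      }

      return png_ptr;
   }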