Merge branch 'libpng16' of git://github.com/barkovv/libpng into libpng16

This commit is contained in:
Glenn Randers-Pehrson 2017-02-21 20:41:27 -06:00
commit 7980c79d69
13 changed files with 1214 additions and 4 deletions

View File

@ -44,7 +44,7 @@ include(GNUInstallDirs)
# needed packages # needed packages
#Allow users to specify location of Zlib, #Allow users to specify location of Zlib,
# Useful if zlib is being built alongside this as a sub-project # Useful if zlib is being built alongside this as a sub-project
option(PNG_BUILD_ZLIB "Custom zlib Location, else find_package is used" OFF) option(PNG_BUILD_ZLIB "Custom zlib Location, else find_package is used" OFF)
@ -98,7 +98,7 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^arm" OR
arm/arm_init.c arm/arm_init.c
arm/filter_neon.S arm/filter_neon.S
arm/filter_neon_intrinsics.c) arm/filter_neon_intrinsics.c)
if(${PNG_ARM_NEON} STREQUAL "on") if(${PNG_ARM_NEON} STREQUAL "on")
add_definitions(-DPNG_ARM_NEON_OPT=2) add_definitions(-DPNG_ARM_NEON_OPT=2)
elseif(${PNG_ARM_NEON} STREQUAL "check") elseif(${PNG_ARM_NEON} STREQUAL "check")
@ -109,6 +109,38 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^arm" OR
endif() endif()
endif() endif()
# set definitions and sources for powerpc
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^powerpc*" OR
${CMAKE_SYSTEM_PROCESSOR} MATCHES "^ppc64*" )
set(PNG_POWERPC_VSX_POSSIBLE_VALUES check on off)
set(PNG_POWERPC_VSX "check" CACHE STRING "Enable POWERPC VSX optimizations:
check: (default) use internal checking code;
off: disable the optimizations;
on: turn on unconditionally.")
set_property(CACHE PNG_POWERPC_VSX PROPERTY STRINGS
${PNG_POWERPC_VSX_POSSIBLE_VALUES})
list(FIND PNG_POWERPC_VSX_POSSIBLE_VALUES ${PNG_POWERPC_VSX} index)
if(index EQUAL -1)
message(FATAL_ERROR
" PNG_POWERPC_VSX must be one of [${PNG_POWERPC_VSX_POSSIBLE_VALUES}]")
elseif(NOT ${PNG_POWERPC_VSX} STREQUAL "no")
set(libpng_powerpc_sources
powerpc/powerpc_init.c
powerpc/filter_vsx_intrinsics.c)
if(${PNG_POWERPC_VSX} STREQUAL "on")
add_definitions(-DPNG_POWERPC_VSX_OPT=2)
elseif(${PNG_POWERPC_VSX} STREQUAL "check")
add_definitions(-DPNG_POWERPC_VSX_CHECK_SUPPORTED)
message(WARNING
"[PNG_POWERPC_VSX==check] Please check contrib/powerpc/README file for the list of supported OSes.")
endif()
else()
add_definitions(-DPNG_POWERPC_VSX_OPT=0)
endif()
endif()
# SET LIBNAME # SET LIBNAME
set(PNG_LIB_NAME png${PNGLIB_MAJOR}${PNGLIB_MINOR}) set(PNG_LIB_NAME png${PNGLIB_MAJOR}${PNGLIB_MINOR})
@ -400,6 +432,7 @@ set(libpng_sources
pngwtran.c pngwtran.c
pngwutil.c pngwutil.c
${libpng_arm_sources} ${libpng_arm_sources}
${libpng_powerpc_sources}
) )
set(pngtest_sources set(pngtest_sources
pngtest.c pngtest.c
@ -842,4 +875,3 @@ endif()
# to create msvc import lib for mingw compiled shared lib # to create msvc import lib for mingw compiled shared lib
# pexports libpng.dll > libpng.def # pexports libpng.dll > libpng.def
# lib /def:libpng.def /machine:x86 # lib /def:libpng.def /machine:x86

View File

@ -107,6 +107,11 @@ libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES += intel/intel_init.c\
intel/filter_sse2_intrinsics.c intel/filter_sse2_intrinsics.c
endif endif
if PNG_POWERPC_VSX
libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES += powerpc/powerpc_init.c\
powerpc/filter_vsx_intrinsics.c
endif
nodist_libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES = pnglibconf.h nodist_libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES = pnglibconf.h
libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_LDFLAGS = -no-undefined -export-dynamic \ libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_LDFLAGS = -no-undefined -export-dynamic \

2
README
View File

@ -179,8 +179,10 @@ Files in this distribution:
pngwtran.c => Write data transformations pngwtran.c => Write data transformations
pngwutil.c => Write utility functions pngwutil.c => Write utility functions
arm => Contains optimized code for the ARM platform arm => Contains optimized code for the ARM platform
powerpc => Contains optimized code for the PowerPC platform
contrib => Contributions contrib => Contributions
arm-neon => Optimized code for ARM-NEON platform arm-neon => Optimized code for ARM-NEON platform
powerpc => Optimized code for POWERPC-VSX platform
examples => Example programs examples => Example programs
gregbook => source code for PNG reading and writing, from gregbook => source code for PNG reading and writing, from
Greg Roelofs' "PNG: The Definitive Guide", Greg Roelofs' "PNG: The Definitive Guide",

View File

@ -426,6 +426,54 @@ AM_CONDITIONAL([PNG_INTEL_SSE],
*) test "$enable_intel_sse" != '';; *) test "$enable_intel_sse" != '';;
esac]) esac])
# PowerPC
# ===
#
# PowerPC VSX (SIMD) support.
AC_ARG_ENABLE([powerpc-vsx],
AS_HELP_STRING([[[--enable-powerpc-vsx]]],
[Enable POWERPC VSX optimizations: =no/off, check, api, yes/on:]
[no/off: disable the optimizations; check: use internal checking code]
[api: disable by default, enable by a call to png_set_option]
[yes/on: turn on unconditionally.]
[If not specified: determined by the compiler.]),
[case "$enableval" in
no|off)
# disable the default enabling on __ppc64__ systems:
AC_DEFINE([PNG_POWERPC_VSX_OPT], [0],
[Disable POWERPC VSX optimizations])
# Prevent inclusion of the platform specific files below:
enable_powerpc_vsx=no;;
check)
AC_DEFINE([PNG_POWERPC_VSX_CHECK_SUPPORTED], [],
[Check for POWERPC VSX support at run-time])
AC_MSG_WARN([--enable-powerpc-vsx Please check contrib/powerpc/README file]
[for the list of supported OSes.]);;
api)
AC_DEFINE([PNG_POWERPC_VSX_API_SUPPORTED], [],
[Turn on POWERPC VSX optimizations at run-time]);;
yes|on)
AC_DEFINE([PNG_POWERPC_VSX_OPT], [2],
[Enable POWERPC VSX optimizations])
AC_MSG_WARN([--enable-powerpc-vsx: please specify 'check' or 'api', if]
[you want the optimizations unconditionally pass '-maltivec -mvsx']
[or '-mcpu=power8'to the compiler.]);;
*)
AC_MSG_ERROR([--enable-powerpc-vsx=${enable_powerpc_vsx}: invalid value])
esac])
# Add PowerPC specific files to all builds where the host_cpu is powerpc('powerpc*') or
# where POWERPC optimizations were explicitly requested (this allows a fallback if a
# future host CPU does not match 'powerpc*')
AM_CONDITIONAL([PNG_POWERPC_VSX],
[test "$enable_powerpc_vsx" != 'no' &&
case "$host_cpu" in
powerpc*|ppc64*) :;;
esac])
AC_MSG_NOTICE([[Extra options for compiler: $PNG_COPTS]]) AC_MSG_NOTICE([[Extra options for compiler: $PNG_COPTS]])
# Config files, substituting as above # Config files, substituting as above

View File

@ -0,0 +1,81 @@
OPERATING SYSTEM SPECIFIC POWERPC DETECTION
--------------------------------------------
Detection of the ability to execute POWERPC on processor requires
operating system support. (The information is not available in user mode.)
Currently only this feature is supported only for linux platform.
HOW TO USE THIS
---------------
This directory contains C code fragments that can be included in powerpc/powerpc_init.c
by setting the macro PNG_POWERPC_VSX_FILE to the file name in "" or <> at build
time. This setting is not recorded in pnglibconf.h and can be changed simply by
rebuilding arm/arm_init.o with the required macro definition.
For any of this code to be used the POWERPC code must be enabled and run time
checks must be supported. I.e.:
#if PNG_POWERPC_VSX_OPT > 0
#ifdef PNG_POWERPC_VSX_CHECK_SUPPORTED
This is done in a 'configure' build by passing configure the argument:
--enable-powerpc-vsx=check
FILE FORMAT
-----------
Each file documents its testing status as of the last time it was tested (which
may have been a long time ago):
STATUS: one of:
SUPPORTED: This indicates that the file is included in the regularly
performed test builds and bugs are fixed when discovered.
COMPILED: This indicates that the code did compile at least once. See the
more detailed description for the extent to which the result was
successful.
TESTED: This means the code was fully compiled into the libpng test programs
and these were run at least once.
BUG REPORTS: an email address to which to send reports of problems
The file is a fragment of C code. It should not define any 'extern' symbols;
everything should be static. It must define the function:
static int png_have_vsx(png_structp png_ptr);
That function must return 1 if ARM NEON instructions are supported, 0 if not.
It must not execute png_error unless it detects a bug. A png_error will prevent
the reading of the PNG and in the future, writing too.
BUG REPORTS
-----------
If you mail a bug report for any file that is not SUPPORTED there may only be
limited response. Consider fixing it and sending a patch to fix the problem -
this is more likely to result in action.
CONTRIBUTIONS
-------------
You may send contributions of new implementations to
png-mng-implement@sourceforge.net. Please write code in strict C90 C where
possible. Obviously OS dependencies are to be expected. If you submit code you
must have the authors permission and it must have a license that is acceptable
to the current maintainer; in particular that license must permit modification
and redistribution.
Please try to make the contribution a single file and give the file a clear and
unambiguous name that identifies the target OS. If multiple files really are
required put them all in a sub-directory.
You must also be prepared to handle bug reports from users of the code, either
by joining the png-mng-implement mailing list or by providing an email for the
"BUG REPORTS" entry or both. Please make sure that the header of the file
contains the STATUS and BUG REPORTS fields as above.
Please list the OS requirements as precisely as possible. Ideally you should
also list the environment in which the code has been tested and certainly list
any environments where you suspect it might not work.

View File

@ -0,0 +1,56 @@
/* contrib/powerpc-vsx/linux.c
*
* Copyright (c) 2016 Glenn Randers-Pehrson
* Written by Vadim Barkov, 2017.
*
* This code is released under the libpng license.
* For conditions of distribution and use, see the disclaimer
* and license in png.h
*
* STATUS: TESTED
* BUG REPORTS: png-mng-implement@sourceforge.net
*
* png_have_vsx implemented for Linux by reading the widely available
* pseudo-file /proc/cpuinfo.
*
* This code is strict ANSI-C and is probably moderately portable; it does
* however use <stdio.h> and it assumes that /proc/cpuinfo is never localized.
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "png.h"
#ifndef MAXLINE
# define MAXLINE 1024
#endif
static int
png_have_vsx(png_structp png_ptr)
{
FILE *f;
const char *string = "altivec supported";
char input[MAXLINE];
char *token = NULL;
PNG_UNUSED(png_ptr)
f = fopen("/proc/cpuinfo", "r");
if (f != NULL)
{
memset(input,0,MAXLINE);
while(fgets(input,MAXLINE,f) != NULL)
{
token = strstr(input,string);
if(token != NULL)
return 1;
}
}
#ifdef PNG_WARNINGS_SUPPORTED
else
png_warning(png_ptr, "/proc/cpuinfo open failed");
#endif
return 0;
}

View File

@ -0,0 +1,35 @@
/* contrib/powerpc-vsx/linux_aux.c
*
* Copyright (c) 2016 Glenn Randers-Pehrson
* Written by Vadim Barkov, 2017.
*
* This code is released under the libpng license.
* For conditions of distribution and use, see the disclaimer
* and license in png.h
*
* STATUS: TESTED
* BUG REPORTS: png-mng-implement@sourceforge.net
*
* png_have_vsx implemented for Linux by using the auxiliary vector mechanism.
*
* This code is strict ANSI-C and is probably moderately portable; it does
* however use <stdio.h> and it assumes that /proc/cpuinfo is never localized.
*/
#include "sys/auxv.h"
#include "png.h"
static int
png_have_vsx(png_structp png_ptr)
{
const unsigned long auxv = getauxval( AT_HWCAP );
PNG_UNUSED(png_ptr)
if(auxv & (PPC_FEATURE_HAS_ALTIVEC|PPC_FEATURE_HAS_VSX ))
return 1;
else
return 0;
}

5
png.h
View File

@ -3225,7 +3225,10 @@ PNG_EXPORT(245, int, png_image_write_to_memory, (png_imagep image, void *memory,
# define PNG_MIPS_MSA 6 /* HARDWARE: MIPS Msa SIMD instructions supported */ # define PNG_MIPS_MSA 6 /* HARDWARE: MIPS Msa SIMD instructions supported */
#endif #endif
#define PNG_IGNORE_ADLER32 8 #define PNG_IGNORE_ADLER32 8
#define PNG_OPTION_NEXT 10 /* Next option - numbers must be even */ #ifdef PNG_POWERPC_VSX_API_SUPPORTED
# define PNG_POWERPC_VSX 10 /* HARDWARE: PowerPC VSX SIMD instructions supported */
#endif
#define PNG_OPTION_NEXT 12 /* Next option - numbers must be even */
/* Return values: NOTE: there are four values and 'off' is *not* zero */ /* Return values: NOTE: there are four values and 'off' is *not* zero */
#define PNG_OPTION_UNSET 0 /* Unset - defaults to off */ #define PNG_OPTION_UNSET 0 /* Unset - defaults to off */

View File

@ -190,6 +190,14 @@
# endif # endif
#endif #endif
#ifndef PNG_POWERPC_VSX_OPT
# if defined(__PPC64__) && defined(__ALTIVEC__) && defined(__VSX__)
# define PNG_POWERPC_VSX_OPT 2
# else
# define PNG_POWERPC_VSX_OPT 0
# endif
#endif
#ifndef PNG_INTEL_SSE_OPT #ifndef PNG_INTEL_SSE_OPT
# ifdef PNG_INTEL_SSE # ifdef PNG_INTEL_SSE
/* Only check for SSE if the build configuration has been modified to /* Only check for SSE if the build configuration has been modified to
@ -246,6 +254,11 @@
# endif # endif
#endif /* PNG_MIPS_MSA_OPT > 0 */ #endif /* PNG_MIPS_MSA_OPT > 0 */
#if PNG_POWERPC_VSX_OPT > 0
# define PNG_FILTER_OPTIMIZATIONS png_init_filter_functions_vsx
# define PNG_POWERPC_VSX_IMPLEMENTATION 1
#endif
/* Is this a build of a DLL where compilation of the object modules requires /* Is this a build of a DLL where compilation of the object modules requires
* different preprocessor settings to those required for a simple library? If * different preprocessor settings to those required for a simple library? If
@ -1292,6 +1305,23 @@ PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_msa,(png_row_infop
row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
#endif #endif
#if PNG_POWERPC_VSX_OPT > 0
PNG_INTERNAL_FUNCTION(void,png_read_filter_row_up_vsx,(png_row_infop row_info,
png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub3_vsx,(png_row_infop
row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub4_vsx,(png_row_infop
row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg3_vsx,(png_row_infop
row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg4_vsx,(png_row_infop
row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_vsx,(png_row_infop
row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_vsx,(png_row_infop
row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
#endif
#if PNG_INTEL_SSE_IMPLEMENTATION > 0 #if PNG_INTEL_SSE_IMPLEMENTATION > 0
PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub3_sse2,(png_row_infop PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub3_sse2,(png_row_infop
row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);

View File

@ -0,0 +1,767 @@
/* filter_vsx_intrinsics.c - PowerPC optimised filter functions
*
* Copyright (c) 2016 Glenn Randers-Pehrson
* Written by Vadim Barkov, 2017.
* Last changed in libpng 1.6.25 [September 1, 2016]
*
* This code is released under the libpng license.
* For conditions of distribution and use, see the disclaimer
* and license in png.h
*/
#include <stdio.h>
#include <stdint.h>
#include "../pngpriv.h"
#ifdef PNG_READ_SUPPORTED
/* This code requires -maltivec and -mvsx on the command line: */
#if PNG_POWERPC_VSX_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */
#include <altivec.h>
#if PNG_POWERPC_VSX_OPT > 0
#ifndef __VSX__
# error "This code requires VSX support (POWER7 and later). Please provide -mvsx compiler flag."
#endif
#define vec_ld_unaligned(vec,data) vec = vec_vsx_ld(0,data)
#define vec_st_unaligned(vec,data) vec_vsx_st(vec,0,data)
/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
* They're positioned like this:
* prev: c b
* row: a d
* The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
* whichever of a, b, or c is closest to p=a+b-c.
* ( this is taken from ../intel/filter_sse2_intrinsics.c )
*/
#define vsx_declare_common_vars(row_info,row,prev_row,offset) \
png_byte i;\
png_bytep rp = row + offset;\
png_const_bytep pp = prev_row;\
png_size_t unaligned_top = 16 - (((png_size_t)rp % 16));\
png_size_t istop;\
if(unaligned_top == 16)\
unaligned_top = 0;\
istop = row_info->rowbytes;\
if((unaligned_top < istop))\
istop -= unaligned_top;\
else{\
unaligned_top = istop;\
istop = 0;\
}
void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row,
png_const_bytep prev_row)
{
vector unsigned char rp_vec;
vector unsigned char pp_vec;
vsx_declare_common_vars(row_info,row,prev_row,0)
/* Altivec operations require 16-byte aligned data
* but input can be unaligned. So we calculate
* unaligned part as usual.
*/
for (i = 0; i < unaligned_top; i++)
{
*rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
rp++;
}
/* Using SIMD while we can */
while( istop >= 16 )
{
rp_vec = vec_ld(0,rp);
vec_ld_unaligned(pp_vec,pp);
rp_vec = vec_add(rp_vec,pp_vec);
vec_st(rp_vec,0,rp);
pp += 16;
rp += 16;
istop -= 16;
}
if(istop > 0)
{
/* If byte count of row is not divisible by 16
* we will process remaining part as usual
*/
for (i = 0; i < istop; i++)
{
*rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
rp++;
}
}
}
static const vector unsigned char VSX_LEFTSHIFTED1_4 = {16,16,16,16, 0, 1, 2, 3,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED2_4 = {16,16,16,16,16,16,16,16, 4, 5, 6, 7,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 8, 9,10,11};
static const vector unsigned char VSX_LEFTSHIFTED1_3 = {16,16,16, 0, 1, 2,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED2_3 = {16,16,16,16,16,16, 3, 4, 5,16,16,16,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 6, 7, 8,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 9,10,11,16};
static const vector unsigned char VSX_NOT_SHIFTED1_4 = {16,16,16,16, 4, 5, 6, 7,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED2_4 = {16,16,16,16,16,16,16,16, 8, 9,10,11,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,15};
static const vector unsigned char VSX_NOT_SHIFTED1_3 = {16,16,16, 3, 4, 5,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED2_3 = {16,16,16,16,16,16, 6, 7, 8,16,16,16,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 9,10,11,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,16};
static const vector unsigned char VSX_CHAR_ZERO = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
#ifdef __LITTLE_ENDIAN__
static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = { 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = { 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {12,16,13,16,14,16,15,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 0, 2, 4, 6,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 0, 2, 4, 6,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4, 6};
static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = { 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = { 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = { 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {12,16,13,16,14,16,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 0, 2, 4,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 0, 2, 4,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 0, 2, 4,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4,16};
#elif defined(__BIG_ENDIAN__)
static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = {16, 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = {16, 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {16,12,16,13,16,14,16,15,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 1, 3, 5, 7,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 1, 3, 5, 7,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5, 7};
static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = {16, 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = {16, 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = {16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {16,12,16,13,16,14,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 1, 3, 5,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 1, 3, 5,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 1, 3, 5,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5,16};
#endif
#define vsx_char_to_short(vec,offset,bpp) (vector unsigned short)vec_perm((vec),VSX_CHAR_ZERO,VSX_CHAR_TO_SHORT##offset##_##bpp)
#define vsx_short_to_char(vec,offset,bpp) vec_perm(((vector unsigned char)(vec)),VSX_CHAR_ZERO,VSX_SHORT_TO_CHAR##offset##_##bpp)
#ifdef PNG_USE_ABS
# define vsx_abs(number) abs(number)
#else
# define vsx_abs(number) (number > 0) ? (number) : -(number)
#endif
void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row,
png_const_bytep prev_row)
{
const png_byte bpp = 4;
vector unsigned char rp_vec;
vector unsigned char part_vec;
vsx_declare_common_vars(row_info,row,prev_row,bpp)
PNG_UNUSED(pp)
/* Altivec operations require 16-byte aligned data
* but input can be unaligned. So we calculate
* unaligned part as usual.
*/
for (i = 0; i < unaligned_top; i++)
{
*rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
rp++;
}
/* Using SIMD while we can */
while( istop >= 16 )
{
for(i=0;i < bpp ; i++)
{
*rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
rp++;
}
rp -= bpp;
rp_vec = vec_ld(0,rp);
part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4);
rp_vec = vec_add(rp_vec,part_vec);
part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4);
rp_vec = vec_add(rp_vec,part_vec);
part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4);
rp_vec = vec_add(rp_vec,part_vec);
vec_st(rp_vec,0,rp);
rp += 16;
istop -= 16;
}
if(istop > 0)
for (i = 0; i < istop % 16; i++)
{
*rp = (png_byte)(((int)(*rp) + (int)(*(rp - bpp))) & 0xff);
rp++;
}
}
void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row,
png_const_bytep prev_row)
{
const png_byte bpp = 3;
vector unsigned char rp_vec;
vector unsigned char part_vec;
vsx_declare_common_vars(row_info,row,prev_row,bpp)
PNG_UNUSED(pp)
/* Altivec operations require 16-byte aligned data
* but input can be unaligned. So we calculate
* unaligned part as usual.
*/
for (i = 0; i < unaligned_top; i++)
{
*rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
rp++;
}
/* Using SIMD while we can */
while( istop >= 16 )
{
for(i=0;i < bpp ; i++)
{
*rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
rp++;
}
rp -= bpp;
rp_vec = vec_ld(0,rp);
part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3);
rp_vec = vec_add(rp_vec,part_vec);
part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3);
rp_vec = vec_add(rp_vec,part_vec);
part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3);
rp_vec = vec_add(rp_vec,part_vec);
part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3);
rp_vec = vec_add(rp_vec,part_vec);
vec_st(rp_vec,0,rp);
rp += 15;
istop -= 16;
/* Since 16 % bpp = 16 % 3 = 1, last element of array must
* be proceeded manually
*/
*rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
rp++;
}
if(istop > 0)
for (i = 0; i < istop % 16; i++)
{
*rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
rp++;
}
}
void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row,
png_const_bytep prev_row)
{
const png_byte bpp = 4;
vector unsigned char rp_vec;
vector unsigned char pp_vec;
vector unsigned char pp_part_vec;
vector unsigned char rp_part_vec;
vector unsigned char avg_vec;
vsx_declare_common_vars(row_info,row,prev_row,bpp)
rp -= bpp;
if(istop >= bpp)
istop -= bpp;
for (i = 0; i < bpp; i++)
{
*rp = (png_byte)(((int)(*rp) +
((int)(*pp++) / 2 )) & 0xff);
rp++;
}
/* Altivec operations require 16-byte aligned data
* but input can be unaligned. So we calculate
* unaligned part as usual.
*/
for (i = 0; i < unaligned_top; i++)
{
*rp = (png_byte)(((int)(*rp) +
(int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
rp++;
}
/* Using SIMD while we can */
while( istop >= 16 )
{
for(i=0;i < bpp ; i++)
{
*rp = (png_byte)(((int)(*rp) +
(int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
rp++;
}
rp -= bpp;
pp -= bpp;
vec_ld_unaligned(pp_vec,pp);
rp_vec = vec_ld(0,rp);
rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4);
pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_4);
avg_vec = vec_avg(rp_part_vec,pp_part_vec);
avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
rp_vec = vec_add(rp_vec,avg_vec);
rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4);
pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_4);
avg_vec = vec_avg(rp_part_vec,pp_part_vec);
avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
rp_vec = vec_add(rp_vec,avg_vec);
rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4);
pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_4);
avg_vec = vec_avg(rp_part_vec,pp_part_vec);
avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
rp_vec = vec_add(rp_vec,avg_vec);
vec_st(rp_vec,0,rp);
rp += 16;
pp += 16;
istop -= 16;
}
if(istop > 0)
for (i = 0; i < istop % 16; i++)
{
*rp = (png_byte)(((int)(*rp) +
(int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
rp++;
}
}
void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row,
png_const_bytep prev_row)
{
const png_byte bpp = 3;
vector unsigned char rp_vec;
vector unsigned char pp_vec;
vector unsigned char pp_part_vec;
vector unsigned char rp_part_vec;
vector unsigned char avg_vec;
vsx_declare_common_vars(row_info,row,prev_row,bpp)
rp -= bpp;
if(istop >= bpp)
istop -= bpp;
for (i = 0; i < bpp; i++)
{
*rp = (png_byte)(((int)(*rp) +
((int)(*pp++) / 2 )) & 0xff);
rp++;
}
/* Altivec operations require 16-byte aligned data
* but input can be unaligned. So we calculate
* unaligned part as usual.
*/
for (i = 0; i < unaligned_top; i++)
{
*rp = (png_byte)(((int)(*rp) +
(int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
rp++;
}
/* Using SIMD while we can */
while( istop >= 16 )
{
for(i=0;i < bpp ; i++)
{
*rp = (png_byte)(((int)(*rp) +
(int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
rp++;
}
rp -= bpp;
pp -= bpp;
vec_ld_unaligned(pp_vec,pp);
rp_vec = vec_ld(0,rp);
rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3);
pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_3);
avg_vec = vec_avg(rp_part_vec,pp_part_vec);
avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
rp_vec = vec_add(rp_vec,avg_vec);
rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3);
pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_3);
avg_vec = vec_avg(rp_part_vec,pp_part_vec);
avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
rp_vec = vec_add(rp_vec,avg_vec);
rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3);
pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_3);
avg_vec = vec_avg(rp_part_vec,pp_part_vec);
avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
rp_vec = vec_add(rp_vec,avg_vec);
rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3);
pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED4_3);
avg_vec = vec_avg(rp_part_vec,pp_part_vec);
avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
rp_vec = vec_add(rp_vec,avg_vec);
vec_st(rp_vec,0,rp);
rp += 15;
pp += 15;
istop -= 16;
/* Since 16 % bpp = 16 % 3 = 1, last element of array must
* be proceeded manually
*/
*rp = (png_byte)(((int)(*rp) +
(int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
rp++;
}
if(istop > 0)
for (i = 0; i < istop % 16; i++)
{
*rp = (png_byte)(((int)(*rp) +
(int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
rp++;
}
}
/* Bytewise c ? t : e. */
#define if_then_else(c,t,e) vec_sel(e,t,c)
#define vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) {\
c = *(pp - bpp);\
a = *(rp - bpp);\
b = *pp++;\
p = b - c;\
pc = a - c;\
pa = vsx_abs(p);\
pb = vsx_abs(pc);\
pc = vsx_abs(p + pc);\
if (pb < pa) pa = pb, a = b;\
if (pc < pa) a = c;\
a += *rp;\
*rp++ = (png_byte)a;\
}
void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row,
png_const_bytep prev_row)
{
const png_byte bpp = 4;
int a, b, c, pa, pb, pc, p;
vector unsigned char rp_vec;
vector unsigned char pp_vec;
vector unsigned short a_vec,b_vec,c_vec,nearest_vec;
vector signed short pa_vec,pb_vec,pc_vec,smallest_vec;
vsx_declare_common_vars(row_info,row,prev_row,bpp)
rp -= bpp;
if(istop >= bpp)
istop -= bpp;
/* Process the first pixel in the row completely (this is the same as 'up'
* because there is only one candidate predictor for the first row).
*/
for(i = 0; i < bpp ; i++)
{
*rp = (png_byte)( *rp + *pp);
rp++;
pp++;
}
for(i = 0; i < unaligned_top ; i++)
{
vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
}
while( istop >= 16)
{
for(i = 0; i < bpp ; i++)
{
vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
}
rp -= bpp;
pp -= bpp;
rp_vec = vec_ld(0,rp);
vec_ld_unaligned(pp_vec,pp);
a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_4),1,4);
b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED1_4),1,4);
c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_4),1,4);
pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
pc_vec = vec_add(pa_vec,pb_vec);
pa_vec = vec_abs(pa_vec);
pb_vec = vec_abs(pb_vec);
pc_vec = vec_abs(pc_vec);
smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
nearest_vec = if_then_else(
vec_cmpeq(pa_vec,smallest_vec),
a_vec,
if_then_else(
vec_cmpeq(pb_vec,smallest_vec),
b_vec,
c_vec
)
);
rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,4)));
a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_4),2,4);
b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED2_4),2,4);
c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_4),2,4);
pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
pc_vec = vec_add(pa_vec,pb_vec);
pa_vec = vec_abs(pa_vec);
pb_vec = vec_abs(pb_vec);
pc_vec = vec_abs(pc_vec);
smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
nearest_vec = if_then_else(
vec_cmpeq(pa_vec,smallest_vec),
a_vec,
if_then_else(
vec_cmpeq(pb_vec,smallest_vec),
b_vec,
c_vec
)
);
rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,4)));
a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_4),3,4);
b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED3_4),3,4);
c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_4),3,4);
pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
pc_vec = vec_add(pa_vec,pb_vec);
pa_vec = vec_abs(pa_vec);
pb_vec = vec_abs(pb_vec);
pc_vec = vec_abs(pc_vec);
smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
nearest_vec = if_then_else(
vec_cmpeq(pa_vec,smallest_vec),
a_vec,
if_then_else(
vec_cmpeq(pb_vec,smallest_vec),
b_vec,
c_vec
)
);
rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,4)));
vec_st(rp_vec,0,rp);
rp += 16;
pp += 16;
istop -= 16;
}
if(istop > 0)
for (i = 0; i < istop % 16; i++)
{
vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
}
}
void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row,
png_const_bytep prev_row)
{
const png_byte bpp = 3;
int a, b, c, pa, pb, pc, p;
vector unsigned char rp_vec;
vector unsigned char pp_vec;
vector unsigned short a_vec,b_vec,c_vec,nearest_vec;
vector signed short pa_vec,pb_vec,pc_vec,smallest_vec;
vsx_declare_common_vars(row_info,row,prev_row,bpp)
rp -= bpp;
if(istop >= bpp)
istop -= bpp;
/* Process the first pixel in the row completely (this is the same as 'up'
* because there is only one candidate predictor for the first row).
*/
for(i = 0; i < bpp ; i++)
{
*rp = (png_byte)( *rp + *pp);
rp++;
pp++;
}
for(i = 0; i < unaligned_top ; i++)
{
vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
}
while( istop >= 16)
{
for(i = 0; i < bpp ; i++)
{
vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
}
rp -= bpp;
pp -= bpp;
rp_vec = vec_ld(0,rp);
vec_ld_unaligned(pp_vec,pp);
a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_3),1,3);
b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED1_3),1,3);
c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_3),1,3);
pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
pc_vec = vec_add(pa_vec,pb_vec);
pa_vec = vec_abs(pa_vec);
pb_vec = vec_abs(pb_vec);
pc_vec = vec_abs(pc_vec);
smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
nearest_vec = if_then_else(
vec_cmpeq(pa_vec,smallest_vec),
a_vec,
if_then_else(
vec_cmpeq(pb_vec,smallest_vec),
b_vec,
c_vec
)
);
rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,3)));
a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_3),2,3);
b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED2_3),2,3);
c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_3),2,3);
pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
pc_vec = vec_add(pa_vec,pb_vec);
pa_vec = vec_abs(pa_vec);
pb_vec = vec_abs(pb_vec);
pc_vec = vec_abs(pc_vec);
smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
nearest_vec = if_then_else(
vec_cmpeq(pa_vec,smallest_vec),
a_vec,
if_then_else(
vec_cmpeq(pb_vec,smallest_vec),
b_vec,
c_vec
)
);
rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,3)));
a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_3),3,3);
b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED3_3),3,3);
c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_3),3,3);
pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
pc_vec = vec_add(pa_vec,pb_vec);
pa_vec = vec_abs(pa_vec);
pb_vec = vec_abs(pb_vec);
pc_vec = vec_abs(pc_vec);
smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
nearest_vec = if_then_else(
vec_cmpeq(pa_vec,smallest_vec),
a_vec,
if_then_else(
vec_cmpeq(pb_vec,smallest_vec),
b_vec,
c_vec
)
);
rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,3)));
a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED4_3),4,3);
b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED4_3),4,3);
c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED4_3),4,3);
pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
pc_vec = vec_add(pa_vec,pb_vec);
pa_vec = vec_abs(pa_vec);
pb_vec = vec_abs(pb_vec);
pc_vec = vec_abs(pc_vec);
smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
nearest_vec = if_then_else(
vec_cmpeq(pa_vec,smallest_vec),
a_vec,
if_then_else(
vec_cmpeq(pb_vec,smallest_vec),
b_vec,
c_vec
)
);
rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,4,3)));
vec_st(rp_vec,0,rp);
rp += 15;
pp += 15;
istop -= 16;
/* Since 16 % bpp = 16 % 3 = 1, last element of array must
* be proceeded manually
*/
vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
}
if(istop > 0)
for (i = 0; i < istop % 16; i++)
{
vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
}
}
#endif /* PNG_POWERPC_VSX_OPT > 0 */
#endif /* PNG_POWERPC_VSX_IMPLEMENTATION == 1 (intrinsics) */
#endif /* READ */

122
powerpc/powerpc_init.c Normal file
View File

@ -0,0 +1,122 @@
/* powerpc_init.c - POWERPC optimised filter functions
*
*
* This code is released under the libpng license.
* For conditions of distribution and use, see the disclaimer
* and license in png.h
*/
/* Below, after checking __linux__, various non-C90 POSIX 1003.1 functions are
* called.
*/
#define _POSIX_SOURCE 1
#include <stdio.h>
#include "../pngpriv.h"
#ifdef PNG_READ_SUPPORTED
#if PNG_POWERPC_VSX_OPT > 0
#ifdef PNG_POWERPC_VSX_CHECK_SUPPORTED /* Do run-time checks */
/* WARNING: it is strongly recommended that you do not build libpng with
* run-time checks for CPU features if at all possible. In the case of the PowerPC
* VSX instructions there is no processor-specific way of detecting the
* presence of the required support, therefore run-time detection is extremely
* OS specific.
*
* You may set the macro PNG_POWERPC_VSX_FILE to the file name of file containing
* a fragment of C source code which defines the png_have_vsx function. There
* are a number of implementations in contrib/powerpc-vsx, but the only one that
* has partial support is contrib/powerpc-vsx/linux.c - a generic Linux
* implementation which reads /proc/cpufino.
*/
#ifndef PNG_POWERPC_VSX_FILE
# ifdef __linux__
# define PNG_POWERPC_VSX_FILE "contrib/powerpc-vsx/linux_aux.c"
# endif
#endif
#ifdef PNG_POWERPC_VSX_FILE
#include <signal.h> /* for sig_atomic_t */
static int png_have_vsx(png_structp png_ptr);
#include PNG_POWERPC_VSX_FILE
#else /* PNG_POWERPC_VSX_FILE */
# error "PNG_POWERPC_VSX_FILE undefined: no support for run-time POWERPC VSX checks"
#endif /* PNG_POWERPC_VSX_FILE */
#endif /* PNG_POWERPC_VSX_CHECK_SUPPORTED */
void
png_init_filter_functions_vsx(png_structp pp, unsigned int bpp)
{
/* The switch statement is compiled in for POWERPC_VSX_API, the call to
* png_have_vsx is compiled in for POWERPC_VSX_CHECK. If both are defined
* the check is only performed if the API has not set the PowerPC option on
* or off explicitly. In this case the check controls what happens.
*/
#ifdef PNG_POWERPC_VSX_API_SUPPORTED
switch ((pp->options >> PNG_POWERPC_VSX) & 3)
{
case PNG_OPTION_UNSET:
/* Allow the run-time check to execute if it has been enabled -
* thus both API and CHECK can be turned on. If it isn't supported
* this case will fall through to the 'default' below, which just
* returns.
*/
#endif /* PNG_POWERPC_VSX_API_SUPPORTED */
#ifdef PNG_POWERPC_VSX_CHECK_SUPPORTED
{
static volatile sig_atomic_t no_vsx = -1; /* not checked */
if (no_vsx < 0)
no_vsx = !png_have_vsx(pp);
if (no_vsx)
return;
}
#ifdef PNG_POWERPC_VSX_API_SUPPORTED
break;
#endif
#endif /* PNG_POWERPC_VSX_CHECK_SUPPORTED */
#ifdef PNG_POWERPC_VSX_API_SUPPORTED
default: /* OFF or INVALID */
return;
case PNG_OPTION_ON:
/* Option turned on */
break;
}
#endif
/* IMPORTANT: any new internal functions used here must be declared using
* PNG_INTERNAL_FUNCTION in ../pngpriv.h. This is required so that the
* 'prefix' option to configure works:
*
* ./configure --with-libpng-prefix=foobar_
*
* Verify you have got this right by running the above command, doing a build
* and examining pngprefix.h; it must contain a #define for every external
* function you add. (Notice that this happens automatically for the
* initialization function.)
*/
pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_vsx;
if (bpp == 3)
{
pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_vsx;
pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_vsx;
pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth3_vsx;
}
else if (bpp == 4)
{
pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_vsx;
pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_vsx;
pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth4_vsx;
}
}
#endif /* PNG_POWERPC_VSX_OPT > 0 */
#endif /* READ */

View File

@ -229,6 +229,33 @@ option ARM_NEON_API disabled requires ALIGNED_MEMORY enables SET_OPTION,
option ARM_NEON_CHECK disabled requires ALIGNED_MEMORY, option ARM_NEON_CHECK disabled requires ALIGNED_MEMORY,
sets ARM_NEON_OPT 1 sets ARM_NEON_OPT 1
# These options are specific to the PowerPC VSX hardware optimizations.
#
# POWERPC_VSX_OPT: unset: check at compile time (__PPC64__,__ALTIVEC__,__VSX__
# must be defined by the compiler, typically as a result
# of specifying
# "-mvsx -maltivec" compiler flags)
# 0: disable (even if the CPU supports VSX.)
# 1: check at run time (via POWERPC_VSX_{API,CHECK})
# 2: switch on unconditionally (inadvisable - instead pass
# -mvsx -maltivec to compiler options)
# When building libpng avoid using any setting other than '0'; '1' is
# set automatically when either 'API' or 'CHECK' are configured in,
# '2' should not be necessary as "-mvsx -maltivec" will achieve the same
# effect as well as applying VSX optimizations to the rest of the
# libpng code.
# POWERPC_VSX_API: (PNG_POWERPC_VSX == 1) allow the optimization to be switched on
# with png_set_option
# POWERPC_VSX_CHECK: (PNG_POWERPC_VSX == 1) compile a run-time check to see if VSX
# extensions are supported. This is supported not for all OSes
# (see contrib/powerpc/README)
setting POWERPC_VSX_OPT
option POWERPC_VSX_API disabled enables SET_OPTION,
sets POWERPC_VSX_OPT 1
option POWERPC_VSX_CHECK disabled,
sets POWERPC_VSX_OPT 1
# These settings configure the default compression level (0-9) and 'strategy'; # These settings configure the default compression level (0-9) and 'strategy';
# strategy is as defined by the implementors of zlib. It describes the input # strategy is as defined by the implementors of zlib. It describes the input
# data and modifies the zlib parameters in an attempt to optimize the balance # data and modifies the zlib parameters in an attempt to optimize the balance

View File

@ -20,6 +20,8 @@
#define PNG_ALIGNED_MEMORY_SUPPORTED #define PNG_ALIGNED_MEMORY_SUPPORTED
/*#undef PNG_ARM_NEON_API_SUPPORTED*/ /*#undef PNG_ARM_NEON_API_SUPPORTED*/
/*#undef PNG_ARM_NEON_CHECK_SUPPORTED*/ /*#undef PNG_ARM_NEON_CHECK_SUPPORTED*/
/*#undef PNG_POWERPC_VSX_API_SUPPORTED*/
/*#undef PNG_POWERPC_VSX_CHECK_SUPPORTED*/
#define PNG_BENIGN_ERRORS_SUPPORTED #define PNG_BENIGN_ERRORS_SUPPORTED
#define PNG_BENIGN_READ_ERRORS_SUPPORTED #define PNG_BENIGN_READ_ERRORS_SUPPORTED
/*#undef PNG_BENIGN_WRITE_ERRORS_SUPPORTED*/ /*#undef PNG_BENIGN_WRITE_ERRORS_SUPPORTED*/