[libpng16] Fix copyright in sse code, use C-style comments

This commit is contained in:
Glenn Randers-Pehrson 2016-02-19 09:46:51 -06:00
parent d06d66e899
commit 52846504da
7 changed files with 119 additions and 92 deletions

View File

@ -67,6 +67,7 @@ Version 1.6.22beta03 [February 19, 2016]
Updated LICENSE to say files in the contrib directory are not
necessarily under the libpng license, and that some makefiles have
other copyright owners.
Added INTEL-SSE2 support (Mike Klein and Matt Sarett, Google, Inc.).
Send comments/corrections/commendations to png-mng-implement at lists.sf.net
(subscription required; visit

View File

@ -5526,6 +5526,7 @@ Version 1.6.22beta03 [February 19, 2016]
Updated LICENSE to say files in the contrib directory are not
necessarily under the libpng license, and that some makefiles have
other copyright owners.
Added INTEL-SSE2 support (Mike Klein and Matt Sarett, Google, Inc.).
Send comments/corrections/commendations to png-mng-implement at lists.sf.net
(subscription required; visit

View File

@ -1,5 +1,4 @@
Copyright (c) 2016 Google, Inc.
To enable SSE support, manually edit configure.ac and Makefile.am, following
the instructions in the configure.ac.patch and Makefile.am.patch files.
To enable SSE support in libpng, manually edit configure.ac and Makefile.am,
following the instructions in the configure.ac.patch and Makefile.am.patch
files, then configure with -DPNG_INTEL_SSE in CPPFLAGS.

View File

@ -1,15 +1,18 @@
#
# Copyright (c) 2016 Google, Inc.
# Written by Mike Klein and Matt Sarett
# Derived from the ARM supporting code in libpng/Makefile.am, which was
# Copyright (c) 2004-2015 Glenn Randers-Pehrson
#
# Last changed in libpng 1.6.22 [(PENDING RELEASE)]
#
# This code is released under the libpng license.
# For conditions of distribution and use, see the disclaimer
# and license in png.h
#
# In order to compile Intel SSE optimizations for libpng, please add
# the following code to Makefile.am directly beneath the
# "if PNG_ARM_NEON ... endif" statement.
# the following code to Makefile.am under HOST SPECIFIC OPTIONS
# directly beneath the "if PNG_ARM_NEON ... endif" statement.
if PNG_INTEL_SSE
libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES += contrib/intel/intel_init.c\

View File

@ -1,17 +1,21 @@
#
# Copyright (c) 2016 Google, Inc.
# Written by Mike Klein and Matt Sarett
# Derived from the ARM supporting code in libpng/configure.ac, which was
# Copyright (c) 2004-2015 Glenn Randers-Pehrson
#
# Last changed in libpng 1.6.22 [(PENDING RELEASE)]
#
# This code is released under the libpng license.
# For conditions of distribution and use, see the disclaimer
# and license in png.h
#
# In order to compile Intel SSE optimizations for libpng, please add
# the following code to configure.ac under HOST SPECIFIC OPTIONS
# directly beneath the section for ARM.
# INTEL
# ===
# =====
#
# INTEL SSE (SIMD) support.

View File

@ -2,6 +2,9 @@
/* filter_sse2_intrinsics.c - SSE2 optimized filter functions
*
* Copyright (c) 2016 Google, Inc.
* Written by Mike Klein and Matt Sarett
* Derived from arm/filter_neon_intrinsics.c, which was
* Copyright (c) 2014 Glenn Randers-Pehrson
*
* Last changed in libpng 1.6.22 [(PENDING RELEASE)]
*
@ -18,12 +21,13 @@
#include <immintrin.h>
// Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
// They're positioned like this:
// prev: c b
// row: a d
// The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
// whichever of a, b, or c is closest to p=a+b-c.
/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
* They're positioned like this:
* prev: c b
* row: a d
* The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
* whichever of a, b, or c is closest to p=a+b-c.
*/
static __m128i load3(const void* p) {
png_uint_32 packed;
@ -47,9 +51,10 @@ static void store4(void* p, __m128i v) {
void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,
png_const_bytep prev)
{
// The Sub filter predicts each pixel as the previous pixel, a.
// There is no pixel to the left of the first pixel. It's encoded directly.
// That works with our main loop if we just say that left pixel was zero.
/* The Sub filter predicts each pixel as the previous pixel, a.
* There is no pixel to the left of the first pixel. It's encoded directly.
* That works with our main loop if we just say that left pixel was zero.
*/
__m128i a, d = _mm_setzero_si128();
int rb = row_info->rowbytes;
@ -66,9 +71,10 @@ void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,
void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row,
png_const_bytep prev)
{
// The Sub filter predicts each pixel as the previous pixel, a.
// There is no pixel to the left of the first pixel. It's encoded directly.
// That works with our main loop if we just say that left pixel was zero.
/* The Sub filter predicts each pixel as the previous pixel, a.
* There is no pixel to the left of the first pixel. It's encoded directly.
* That works with our main loop if we just say that left pixel was zero.
*/
__m128i a, d = _mm_setzero_si128();
int rb = row_info->rowbytes;
@ -85,10 +91,11 @@ void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row,
void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
png_const_bytep prev)
{
// The Avg filter predicts each pixel as the (truncated) average of a and b.
// There's no pixel to the left of the first pixel. Luckily, it's
// predicted to be half of the pixel above it. So again, this works
// perfectly with our loop if we make sure a starts at zero.
/* The Avg filter predicts each pixel as the (truncated) average of a and b.
* There's no pixel to the left of the first pixel. Luckily, it's
* predicted to be half of the pixel above it. So again, this works
* perfectly with our loop if we make sure a starts at zero.
*/
const __m128i zero = _mm_setzero_si128();
__m128i b;
__m128i a, d = zero;
@ -98,9 +105,9 @@ void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
b = load3(prev);
a = d; d = load3(row );
// PNG requires a truncating average, so we can't just use _mm_avg_epu8...
/* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
__m128i avg = _mm_avg_epu8(a,b);
// ...but we can fix it up by subtracting off 1 if it rounded up.
/* ...but we can fix it up by subtracting off 1 if it rounded up. */
avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
_mm_set1_epi8(1)));
@ -116,10 +123,11 @@ void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row,
png_const_bytep prev)
{
// The Avg filter predicts each pixel as the (truncated) average of a and b.
// There's no pixel to the left of the first pixel. Luckily, it's
// predicted to be half of the pixel above it. So again, this works
// perfectly with our loop if we make sure a starts at zero.
/* The Avg filter predicts each pixel as the (truncated) average of a and b.
* There's no pixel to the left of the first pixel. Luckily, it's
* predicted to be half of the pixel above it. So again, this works
* perfectly with our loop if we make sure a starts at zero.
*/
const __m128i zero = _mm_setzero_si128();
__m128i b;
__m128i a, d = zero;
@ -129,9 +137,9 @@ void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row,
b = load4(prev);
a = d; d = load4(row );
// PNG requires a truncating average, so we can't just use _mm_avg_epu8...
/* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
__m128i avg = _mm_avg_epu8(a,b);
// ...but we can fix it up by subtracting off 1 if it rounded up.
/* ...but we can fix it up by subtracting off 1 if it rounded up. */
avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
_mm_set1_epi8(1)));
@ -144,25 +152,26 @@ void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row,
}
}
// Returns |x| for 16-bit lanes.
/* Returns |x| for 16-bit lanes. */
static __m128i abs_i16(__m128i x) {
#if PNG_INTEL_SSE_IMPLEMENTATION >= 2
return _mm_abs_epi16(x);
#else
// Read this all as, return x<0 ? -x : x.
// To negate two's complement, you flip all the bits then add 1.
/* Read this all as, return x<0 ? -x : x.
* To negate two's complement, you flip all the bits then add 1.
*/
__m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());
// Flip negative lanes.
/* Flip negative lanes. */
x = _mm_xor_si128(x, is_negative);
// +1 to negative lanes, else +0.
/* +1 to negative lanes, else +0. */
x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15));
return x;
#endif
}
// Bytewise c ? t : e.
/* Bytewise c ? t : e. */
static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {
#if PNG_INTEL_SSE_IMPLEMENTATION >= 3
return _mm_blendv_epi8(e,t,c);
@ -174,50 +183,52 @@ static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {
void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row,
png_const_bytep prev)
{
// Paeth tries to predict pixel d using the pixel to the left of it, a,
// and two pixels from the previous row, b and c:
// prev: c b
// row: a d
// The Paeth function predicts d to be whichever of a, b, or c is nearest to
// p=a+b-c.
// The first pixel has no left context, and so uses an Up filter, p = b.
// This works naturally with our main loop's p = a+b-c if we force a and c
// to zero.
// Here we zero b and d, which become c and a respectively at the start of
// the loop.
/* Paeth tries to predict pixel d using the pixel to the left of it, a,
* and two pixels from the previous row, b and c:
* prev: c b
* row: a d
* The Paeth function predicts d to be whichever of a, b, or c is nearest to
* p=a+b-c.
*
* The first pixel has no left context, and so uses an Up filter, p = b.
* This works naturally with our main loop's p = a+b-c if we force a and c
* to zero.
* Here we zero b and d, which become c and a respectively at the start of
* the loop.
*/
const __m128i zero = _mm_setzero_si128();
__m128i c, b = zero,
a, d = zero;
int rb = row_info->rowbytes;
while (rb > 0) {
// It's easiest to do this math (particularly, deal with pc) with 16-bit
// intermediates.
/* It's easiest to do this math (particularly, deal with pc) with 16-bit
* intermediates.
*/
c = b; b = _mm_unpacklo_epi8(load3(prev), zero);
a = d; d = _mm_unpacklo_epi8(load3(row ), zero);
// (p-a) == (a+b-c - a) == (b-c)
/* (p-a) == (a+b-c - a) == (b-c) */
__m128i pa = _mm_sub_epi16(b,c);
// (p-b) == (a+b-c - b) == (a-c)
/* (p-b) == (a+b-c - b) == (a-c) */
__m128i pb = _mm_sub_epi16(a,c);
// (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c)
/* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
__m128i pc = _mm_add_epi16(pa,pb);
pa = abs_i16(pa); // |p-a|
pb = abs_i16(pb); // |p-b|
pc = abs_i16(pc); // |p-c|
pa = abs_i16(pa); /* |p-a| */
pb = abs_i16(pb); /* |p-b| */
pc = abs_i16(pc); /* |p-c| */
__m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
// Paeth breaks ties favoring a over b over c.
/* Paeth breaks ties favoring a over b over c. */
__m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
c));
// Note `_epi8`: we need addition to wrap modulo 255.
/* Note `_epi8`: we need addition to wrap modulo 256. */
d = _mm_add_epi8(d, nearest);
store3(row, _mm_packus_epi16(d,d));
@ -230,50 +241,52 @@ void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row,
void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,
png_const_bytep prev)
{
// Paeth tries to predict pixel d using the pixel to the left of it, a,
// and two pixels from the previous row, b and c:
// prev: c b
// row: a d
// The Paeth function predicts d to be whichever of a, b, or c is nearest to
// p=a+b-c.
// The first pixel has no left context, and so uses an Up filter, p = b.
// This works naturally with our main loop's p = a+b-c if we force a and c
// to zero.
// Here we zero b and d, which become c and a respectively at the start of
// the loop.
/* Paeth tries to predict pixel d using the pixel to the left of it, a,
* and two pixels from the previous row, b and c:
* prev: c b
* row: a d
* The Paeth function predicts d to be whichever of a, b, or c is nearest to
* p=a+b-c.
*
* The first pixel has no left context, and so uses an Up filter, p = b.
* This works naturally with our main loop's p = a+b-c if we force a and c
* to zero.
* Here we zero b and d, which become c and a respectively at the start of
* the loop.
*/
const __m128i zero = _mm_setzero_si128();
__m128i c, b = zero,
a, d = zero;
int rb = row_info->rowbytes;
while (rb > 0) {
// It's easiest to do this math (particularly, deal with pc) with 16-bit
// intermediates.
/* It's easiest to do this math (particularly, deal with pc) with 16-bit
* intermediates.
*/
c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
// (p-a) == (a+b-c - a) == (b-c)
/* (p-a) == (a+b-c - a) == (b-c) */
__m128i pa = _mm_sub_epi16(b,c);
// (p-b) == (a+b-c - b) == (a-c)
/* (p-b) == (a+b-c - b) == (a-c) */
__m128i pb = _mm_sub_epi16(a,c);
// (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c)
/* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
__m128i pc = _mm_add_epi16(pa,pb);
pa = abs_i16(pa); // |p-a|
pb = abs_i16(pb); // |p-b|
pc = abs_i16(pc); // |p-c|
pa = abs_i16(pa); /* |p-a| */
pb = abs_i16(pb); /* |p-b| */
pc = abs_i16(pc); /* |p-c| */
__m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
// Paeth breaks ties favoring a over b over c.
/* Paeth breaks ties favoring a over b over c. */
__m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
c));
// Note `_epi8`: we need addition to wrap modulo 255.
/* Note `_epi8`: we need addition to wrap modulo 256. */
d = _mm_add_epi8(d, nearest);
store4(row, _mm_packus_epi16(d,d));

View File

@ -2,6 +2,9 @@
/* intel_init.c - SSE2 optimized filter functions
*
* Copyright (c) 2016 Google, Inc.
* Written by Mike Klein and Matt Sarett
* Derived from arm/arm_init.c, which was
* Copyright (c) 2014 Glenn Randers-Pehrson
*
* Last changed in libpng 1.6.22 [(PENDING RELEASE)]
*
@ -18,13 +21,14 @@
void
png_init_filter_functions_sse2(png_structp pp, unsigned int bpp)
{
// The techniques used to implement each of these filters in SSE operate on
// one pixel at a time.
// So they generally speed up 3bpp images about 3x, 4bpp images about 4x.
// They can scale up to 6 and 8 bpp images and down to 2 bpp images,
// but they'd not likely have any benefit for 1bpp images.
// Most of these can be implemented using only MMX and 64-bit registers,
// but they end up a bit slower than using the equally-ubiquitous SSE2.
/* The techniques used to implement each of these filters in SSE operate on
* one pixel at a time.
* So they generally speed up 3bpp images about 3x, 4bpp images about 4x.
* They can scale up to 6 and 8 bpp images and down to 2 bpp images,
* but they'd not likely have any benefit for 1bpp images.
* Most of these can be implemented using only MMX and 64-bit registers,
* but they end up a bit slower than using the equally-ubiquitous SSE2.
*/
if (bpp == 3)
{
pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_sse2;
@ -40,7 +44,9 @@ png_init_filter_functions_sse2(png_structp pp, unsigned int bpp)
png_read_filter_row_paeth4_sse2;
}
// No need optimize PNG_FILTER_VALUE_UP. The compiler should autovectorize.
/* No need to optimize PNG_FILTER_VALUE_UP. The compiler should
 * autovectorize.
 */
}
#endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */