[libpng16] Fix copyright in sse code, use C-style comments

This commit is contained in:
Glenn Randers-Pehrson 2016-02-19 09:46:51 -06:00
parent d06d66e899
commit 52846504da
7 changed files with 119 additions and 92 deletions

View File

@ -67,6 +67,7 @@ Version 1.6.22beta03 [February 19, 2016]
Updated LICENSE to say files in the contrib directory are not Updated LICENSE to say files in the contrib directory are not
necessarily under the libpng license, and that some makefiles have necessarily under the libpng license, and that some makefiles have
other copyright owners. other copyright owners.
Added INTEL-SSE2 support (Mike Klein and Matt Sarett, Google, Inc.).
Send comments/corrections/commendations to png-mng-implement at lists.sf.net Send comments/corrections/commendations to png-mng-implement at lists.sf.net
(subscription required; visit (subscription required; visit

View File

@ -5526,6 +5526,7 @@ Version 1.6.22beta03 [February 19, 2016]
Updated LICENSE to say files in the contrib directory are not Updated LICENSE to say files in the contrib directory are not
necessarily under the libpng license, and that some makefiles have necessarily under the libpng license, and that some makefiles have
other copyright owners. other copyright owners.
Added INTEL-SSE2 support (Mike Klein and Matt Sarett, Google, Inc.).
Send comments/corrections/commendations to png-mng-implement at lists.sf.net Send comments/corrections/commendations to png-mng-implement at lists.sf.net
(subscription required; visit (subscription required; visit

View File

@ -1,5 +1,4 @@
Copyright (c) 2016 Google, Inc. To enable SSE support in libpng, manually edit configure.ac and Makefile.am,
following the instructions in the configure.ac.patch and Makefile.am.patch
To enable SSE support, manually edit configure.ac and Makefile.am, following files, then configure with -DPNG_INTEL_SSE in CPPFLAGS.
the instructions in the configure.ac.patch and Makefile.am.patch files.

View File

@ -1,15 +1,18 @@
#
# Copyright (c) 2016 Google, Inc. # Copyright (c) 2016 Google, Inc.
# Written by Mike Klein and Matt Sarett
# Derived from the ARM supporting code in libpng/Makefile.am, which was
# Copyright (c) 2004-2015 Glenn Randers-Pehrson
#
# Last changed in libpng 1.6.22 [(PENDING RELEASE)]
# #
# This code is released under the libpng license. # This code is released under the libpng license.
# For conditions of distribution and use, see the disclaimer # For conditions of distribution and use, see the disclaimer
# and license in png.h # and license in png.h
# #
# In order to compile Intel SSE optimizations for libpng, please add # In order to compile Intel SSE optimizations for libpng, please add
# the following code to Makefile.am directly beneath the # the following code to Makefile.am under HOST SPECIFIC OPTIONS
# "if PNG_ARM_NEON ... endif" statement. # directly beneath the "if PNG_ARM_NEON ... endif" statement.
if PNG_INTEL_SSE if PNG_INTEL_SSE
libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES += contrib/intel/intel_init.c\ libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES += contrib/intel/intel_init.c\

View File

@ -1,17 +1,21 @@
# #
# Copyright (c) 2016 Google, Inc. # Copyright (c) 2016 Google, Inc.
# Written by Mike Klein and Matt Sarett
# Derived from the ARM supporting code in libpng/configure.ac, which was
# Copyright (c) 2004-2015 Glenn Randers-Pehrson
#
# Last changed in libpng 1.6.22 [(PENDING RELEASE)]
# #
# This code is released under the libpng license. # This code is released under the libpng license.
# For conditions of distribution and use, see the disclaimer # For conditions of distribution and use, see the disclaimer
# and license in png.h # and license in png.h
# #
# In order to compile Intel SSE optimizations for libpng, please add # In order to compile Intel SSE optimizations for libpng, please add
# the following code to configure.ac under HOST SPECIFIC OPTIONS # the following code to configure.ac under HOST SPECIFIC OPTIONS
# directly beneath the section for ARM. # directly beneath the section for ARM.
# INTEL # INTEL
# === # =====
# #
# INTEL SSE (SIMD) support. # INTEL SSE (SIMD) support.

View File

@ -2,6 +2,9 @@
/* filter_sse2_intrinsics.c - SSE2 optimized filter functions /* filter_sse2_intrinsics.c - SSE2 optimized filter functions
* *
* Copyright (c) 2016 Google, Inc. * Copyright (c) 2016 Google, Inc.
* Written by Mike Klein and Matt Sarett
* Derived from arm/filter_neon_intrinsics.c, which was
* Copyright (c) 2014 Glenn Randers-Pehrson
* *
* Last changed in libpng 1.6.22 [(PENDING RELEASE)] * Last changed in libpng 1.6.22 [(PENDING RELEASE)]
* *
@ -18,12 +21,13 @@
#include <immintrin.h> #include <immintrin.h>
// Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d). /* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
// They're positioned like this: * They're positioned like this:
// prev: c b * prev: c b
// row: a d * row: a d
// The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
// whichever of a, b, or c is closest to p=a+b-c. * whichever of a, b, or c is closest to p=a+b-c.
*/
static __m128i load3(const void* p) { static __m128i load3(const void* p) {
png_uint_32 packed; png_uint_32 packed;
@ -47,9 +51,10 @@ static void store4(void* p, __m128i v) {
void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row, void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,
png_const_bytep prev) png_const_bytep prev)
{ {
// The Sub filter predicts each pixel as the previous pixel, a. /* The Sub filter predicts each pixel as the previous pixel, a.
// There is no pixel to the left of the first pixel. It's encoded directly. * There is no pixel to the left of the first pixel. It's encoded directly.
// That works with our main loop if we just say that left pixel was zero. * That works with our main loop if we just say that left pixel was zero.
*/
__m128i a, d = _mm_setzero_si128(); __m128i a, d = _mm_setzero_si128();
int rb = row_info->rowbytes; int rb = row_info->rowbytes;
@ -66,9 +71,10 @@ void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,
void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row, void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row,
png_const_bytep prev) png_const_bytep prev)
{ {
// The Sub filter predicts each pixel as the previous pixel, a. /* The Sub filter predicts each pixel as the previous pixel, a.
// There is no pixel to the left of the first pixel. It's encoded directly. * There is no pixel to the left of the first pixel. It's encoded directly.
// That works with our main loop if we just say that left pixel was zero. * That works with our main loop if we just say that left pixel was zero.
*/
__m128i a, d = _mm_setzero_si128(); __m128i a, d = _mm_setzero_si128();
int rb = row_info->rowbytes; int rb = row_info->rowbytes;
@ -85,10 +91,11 @@ void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row,
void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row, void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
png_const_bytep prev) png_const_bytep prev)
{ {
// The Avg filter predicts each pixel as the (truncated) average of a and b. /* The Avg filter predicts each pixel as the (truncated) average of a and b.
// There's no pixel to the left of the first pixel. Luckily, it's * There's no pixel to the left of the first pixel. Luckily, it's
// predicted to be half of the pixel above it. So again, this works * predicted to be half of the pixel above it. So again, this works
// perfectly with our loop if we make sure a starts at zero. * perfectly with our loop if we make sure a starts at zero.
*/
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
__m128i b; __m128i b;
__m128i a, d = zero; __m128i a, d = zero;
@ -98,9 +105,9 @@ void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
b = load3(prev); b = load3(prev);
a = d; d = load3(row ); a = d; d = load3(row );
// PNG requires a truncating average, so we can't just use _mm_avg_epu8... /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
__m128i avg = _mm_avg_epu8(a,b); __m128i avg = _mm_avg_epu8(a,b);
// ...but we can fix it up by subtracting off 1 if it rounded up. /* ...but we can fix it up by subtracting off 1 if it rounded up. */
avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
_mm_set1_epi8(1))); _mm_set1_epi8(1)));
@ -116,10 +123,11 @@ void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row, void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row,
png_const_bytep prev) png_const_bytep prev)
{ {
// The Avg filter predicts each pixel as the (truncated) average of a and b. /* The Avg filter predicts each pixel as the (truncated) average of a and b.
// There's no pixel to the left of the first pixel. Luckily, it's * There's no pixel to the left of the first pixel. Luckily, it's
// predicted to be half of the pixel above it. So again, this works * predicted to be half of the pixel above it. So again, this works
// perfectly with our loop if we make sure a starts at zero. * perfectly with our loop if we make sure a starts at zero.
*/
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
__m128i b; __m128i b;
__m128i a, d = zero; __m128i a, d = zero;
@ -129,9 +137,9 @@ void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row,
b = load4(prev); b = load4(prev);
a = d; d = load4(row ); a = d; d = load4(row );
// PNG requires a truncating average, so we can't just use _mm_avg_epu8... /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
__m128i avg = _mm_avg_epu8(a,b); __m128i avg = _mm_avg_epu8(a,b);
// ...but we can fix it up by subtracting off 1 if it rounded up. /* ...but we can fix it up by subtracting off 1 if it rounded up. */
avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
_mm_set1_epi8(1))); _mm_set1_epi8(1)));
@ -144,25 +152,26 @@ void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row,
} }
} }
// Returns |x| for 16-bit lanes. /* Returns |x| for 16-bit lanes. */
static __m128i abs_i16(__m128i x) { static __m128i abs_i16(__m128i x) {
#if PNG_INTEL_SSE_IMPLEMENTATION >= 2 #if PNG_INTEL_SSE_IMPLEMENTATION >= 2
return _mm_abs_epi16(x); return _mm_abs_epi16(x);
#else #else
// Read this all as, return x<0 ? -x : x. /* Read this all as, return x<0 ? -x : x.
// To negate two's complement, you flip all the bits then add 1. * To negate two's complement, you flip all the bits then add 1.
*/
__m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128()); __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());
// Flip negative lanes. /* Flip negative lanes. */
x = _mm_xor_si128(x, is_negative); x = _mm_xor_si128(x, is_negative);
// +1 to negative lanes, else +0. /* +1 to negative lanes, else +0. */
x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15)); x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15));
return x; return x;
#endif #endif
} }
// Bytewise c ? t : e. /* Bytewise c ? t : e. */
static __m128i if_then_else(__m128i c, __m128i t, __m128i e) { static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {
#if PNG_INTEL_SSE_IMPLEMENTATION >= 3 #if PNG_INTEL_SSE_IMPLEMENTATION >= 3
return _mm_blendv_epi8(e,t,c); return _mm_blendv_epi8(e,t,c);
@ -174,50 +183,52 @@ static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {
void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row, void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row,
png_const_bytep prev) png_const_bytep prev)
{ {
// Paeth tries to predict pixel d using the pixel to the left of it, a, /* Paeth tries to predict pixel d using the pixel to the left of it, a,
// and two pixels from the previous row, b and c: * and two pixels from the previous row, b and c:
// prev: c b * prev: c b
// row: a d * row: a d
// The Paeth function predicts d to be whichever of a, b, or c is nearest to * The Paeth function predicts d to be whichever of a, b, or c is nearest to
// p=a+b-c. * p=a+b-c.
*
// The first pixel has no left context, and so uses an Up filter, p = b. * The first pixel has no left context, and so uses an Up filter, p = b.
// This works naturally with our main loop's p = a+b-c if we force a and c * This works naturally with our main loop's p = a+b-c if we force a and c
// to zero. * to zero.
// Here we zero b and d, which become c and a respectively at the start of * Here we zero b and d, which become c and a respectively at the start of
// the loop. * the loop.
*/
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
__m128i c, b = zero, __m128i c, b = zero,
a, d = zero; a, d = zero;
int rb = row_info->rowbytes; int rb = row_info->rowbytes;
while (rb > 0) { while (rb > 0) {
// It's easiest to do this math (particularly, deal with pc) with 16-bit /* It's easiest to do this math (particularly, deal with pc) with 16-bit
// intermediates. * intermediates.
*/
c = b; b = _mm_unpacklo_epi8(load3(prev), zero); c = b; b = _mm_unpacklo_epi8(load3(prev), zero);
a = d; d = _mm_unpacklo_epi8(load3(row ), zero); a = d; d = _mm_unpacklo_epi8(load3(row ), zero);
// (p-a) == (a+b-c - a) == (b-c) /* (p-a) == (a+b-c - a) == (b-c) */
__m128i pa = _mm_sub_epi16(b,c); __m128i pa = _mm_sub_epi16(b,c);
// (p-b) == (a+b-c - b) == (a-c) /* (p-b) == (a+b-c - b) == (a-c) */
__m128i pb = _mm_sub_epi16(a,c); __m128i pb = _mm_sub_epi16(a,c);
// (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
__m128i pc = _mm_add_epi16(pa,pb); __m128i pc = _mm_add_epi16(pa,pb);
pa = abs_i16(pa); // |p-a| pa = abs_i16(pa); /* |p-a| */
pb = abs_i16(pb); // |p-b| pb = abs_i16(pb); /* |p-b| */
pc = abs_i16(pc); // |p-c| pc = abs_i16(pc); /* |p-c| */
__m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
// Paeth breaks ties favoring a over b over c. /* Paeth breaks ties favoring a over b over c. */
__m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
if_then_else(_mm_cmpeq_epi16(smallest, pb), b, if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
c)); c));
// Note `_epi8`: we need addition to wrap modulo 255. /* Note `_epi8`: we need addition to wrap modulo 255. */
d = _mm_add_epi8(d, nearest); d = _mm_add_epi8(d, nearest);
store3(row, _mm_packus_epi16(d,d)); store3(row, _mm_packus_epi16(d,d));
@ -230,50 +241,52 @@ void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row,
void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row, void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,
png_const_bytep prev) png_const_bytep prev)
{ {
// Paeth tries to predict pixel d using the pixel to the left of it, a, /* Paeth tries to predict pixel d using the pixel to the left of it, a,
// and two pixels from the previous row, b and c: * and two pixels from the previous row, b and c:
// prev: c b * prev: c b
// row: a d * row: a d
// The Paeth function predicts d to be whichever of a, b, or c is nearest to * The Paeth function predicts d to be whichever of a, b, or c is nearest to
// p=a+b-c. * p=a+b-c.
*
// The first pixel has no left context, and so uses an Up filter, p = b. * The first pixel has no left context, and so uses an Up filter, p = b.
// This works naturally with our main loop's p = a+b-c if we force a and c * This works naturally with our main loop's p = a+b-c if we force a and c
// to zero. * to zero.
// Here we zero b and d, which become c and a respectively at the start of * Here we zero b and d, which become c and a respectively at the start of
// the loop. * the loop.
*/
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
__m128i c, b = zero, __m128i c, b = zero,
a, d = zero; a, d = zero;
int rb = row_info->rowbytes; int rb = row_info->rowbytes;
while (rb > 0) { while (rb > 0) {
// It's easiest to do this math (particularly, deal with pc) with 16-bit /* It's easiest to do this math (particularly, deal with pc) with 16-bit
// intermediates. * intermediates.
*/
c = b; b = _mm_unpacklo_epi8(load4(prev), zero); c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
a = d; d = _mm_unpacklo_epi8(load4(row ), zero); a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
// (p-a) == (a+b-c - a) == (b-c) /* (p-a) == (a+b-c - a) == (b-c) */
__m128i pa = _mm_sub_epi16(b,c); __m128i pa = _mm_sub_epi16(b,c);
// (p-b) == (a+b-c - b) == (a-c) /* (p-b) == (a+b-c - b) == (a-c) */
__m128i pb = _mm_sub_epi16(a,c); __m128i pb = _mm_sub_epi16(a,c);
// (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
__m128i pc = _mm_add_epi16(pa,pb); __m128i pc = _mm_add_epi16(pa,pb);
pa = abs_i16(pa); // |p-a| pa = abs_i16(pa); /* |p-a| */
pb = abs_i16(pb); // |p-b| pb = abs_i16(pb); /* |p-b| */
pc = abs_i16(pc); // |p-c| pc = abs_i16(pc); /* |p-c| */
__m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
// Paeth breaks ties favoring a over b over c. /* Paeth breaks ties favoring a over b over c. */
__m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
if_then_else(_mm_cmpeq_epi16(smallest, pb), b, if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
c)); c));
// Note `_epi8`: we need addition to wrap modulo 255. /* Note `_epi8`: we need addition to wrap modulo 255. */
d = _mm_add_epi8(d, nearest); d = _mm_add_epi8(d, nearest);
store4(row, _mm_packus_epi16(d,d)); store4(row, _mm_packus_epi16(d,d));

View File

@ -2,6 +2,9 @@
/* intel_init.c - SSE2 optimized filter functions /* intel_init.c - SSE2 optimized filter functions
* *
* Copyright (c) 2016 Google, Inc. * Copyright (c) 2016 Google, Inc.
* Written by Mike Klein and Matt Sarett
* Derived from arm/arm_init.c, which was
* Copyright (c) 2014 Glenn Randers-Pehrson
* *
* Last changed in libpng 1.6.22 [(PENDING RELEASE)] * Last changed in libpng 1.6.22 [(PENDING RELEASE)]
* *
@ -18,13 +21,14 @@
void void
png_init_filter_functions_sse2(png_structp pp, unsigned int bpp) png_init_filter_functions_sse2(png_structp pp, unsigned int bpp)
{ {
// The techniques used to implement each of these filters in SSE operate on /* The techniques used to implement each of these filters in SSE operate on
// one pixel at a time. * one pixel at a time.
// So they generally speed up 3bpp images about 3x, 4bpp images about 4x. * So they generally speed up 3bpp images about 3x, 4bpp images about 4x.
// They can scale up to 6 and 8 bpp images and down to 2 bpp images, * They can scale up to 6 and 8 bpp images and down to 2 bpp images,
// but they'd not likely have any benefit for 1bpp images. * but they'd not likely have any benefit for 1bpp images.
// Most of these can be implemented using only MMX and 64-bit registers, * Most of these can be implemented using only MMX and 64-bit registers,
// but they end up a bit slower than using the equally-ubiquitous SSE2. * but they end up a bit slower than using the equally-ubiquitous SSE2.
*/
if (bpp == 3) if (bpp == 3)
{ {
pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_sse2; pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_sse2;
@ -40,7 +44,9 @@ png_init_filter_functions_sse2(png_structp pp, unsigned int bpp)
png_read_filter_row_paeth4_sse2; png_read_filter_row_paeth4_sse2;
} }
// No need to optimize PNG_FILTER_VALUE_UP. The compiler should autovectorize. /* No need to optimize PNG_FILTER_VALUE_UP. The compiler should
 * autovectorize.
*/
} }
#endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */ #endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */