From 52846504da1f519cf1bb89636caf9110e8677443 Mon Sep 17 00:00:00 2001 From: Glenn Randers-Pehrson Date: Fri, 19 Feb 2016 09:46:51 -0600 Subject: [PATCH] [libpng16] Fix copyright in sse code, use C-style comments --- ANNOUNCE | 1 + CHANGES | 1 + contrib/intel/INSTALL | 7 +- contrib/intel/Makefile.am.patch | 11 +- contrib/intel/configure.ac.patch | 8 +- contrib/intel/filter_sse2_intrinsics.c | 161 +++++++++++++------------ contrib/intel/intel_init.c | 22 ++-- 7 files changed, 119 insertions(+), 92 deletions(-) diff --git a/ANNOUNCE b/ANNOUNCE index c5ba025f5..8ac8b3a1b 100644 --- a/ANNOUNCE +++ b/ANNOUNCE @@ -67,6 +67,7 @@ Version 1.6.22beta03 [February 19, 2016] Updated LICENSE to say files in the contrib directory are not necessarily under the libpng license, and that some makefiles have other copyright owners. + Added INTEL-SSE2 support (Mike Klein and Matt Sarett, Google, Inc.). Send comments/corrections/commendations to png-mng-implement at lists.sf.net (subscription required; visit diff --git a/CHANGES b/CHANGES index f9c8cf2de..3bf817905 100644 --- a/CHANGES +++ b/CHANGES @@ -5526,6 +5526,7 @@ Version 1.6.22beta03 [February 19, 2016] Updated LICENSE to say files in the contrib directory are not necessarily under the libpng license, and that some makefiles have other copyright owners. + Added INTEL-SSE2 support (Mike Klein and Matt Sarett, Google, Inc.). Send comments/corrections/commendations to png-mng-implement at lists.sf.net (subscription required; visit diff --git a/contrib/intel/INSTALL b/contrib/intel/INSTALL index abcfae8e6..d13675449 100644 --- a/contrib/intel/INSTALL +++ b/contrib/intel/INSTALL @@ -1,5 +1,4 @@ -Copyright (c) 2016 Google, Inc. - -To enable SSE support, manually edit configure.ac and Makefile.am, following -the instructions in the configure.ac.patch and Makefile.am.patch files. 
+To enable SSE support in libpng, manually edit configure.ac and Makefile.am, +following the instructions in the configure.ac.patch and Makefile.am.patch +files, then configure with -DPNG_INTEL_SSE in CPPFLAGS. diff --git a/contrib/intel/Makefile.am.patch b/contrib/intel/Makefile.am.patch index 3921f274d..3ac57ec82 100644 --- a/contrib/intel/Makefile.am.patch +++ b/contrib/intel/Makefile.am.patch @@ -1,15 +1,18 @@ -# # Copyright (c) 2016 Google, Inc. +# Written by Mike Klein and Matt Sarett +# Derived from the ARM supporting code in libpng/Makefile.am, which was +# Copyright (c) 2004-2015 Glenn Randers-Pehrson +# +# Last changed in libpng 1.6.22 [(PENDING RELEASE)] # # This code is released under the libpng license. # For conditions of distribution and use, see the disclaimer # and license in png.h # - # In order to compile Intel SSE optimizations for libpng, please add -# the following code to Makefile.am directly beneath the -# "if PNG_ARM_NEON ... endif" statement. +# the following code to Makefile.am under HOST SPECIFIC OPTIONS +# directly beneath the "if PNG_ARM_NEON ... endif" statement. if PNG_INTEL_SSE libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES += contrib/intel/intel_init.c\ diff --git a/contrib/intel/configure.ac.patch b/contrib/intel/configure.ac.patch index fec654953..cd5d91b27 100644 --- a/contrib/intel/configure.ac.patch +++ b/contrib/intel/configure.ac.patch @@ -1,17 +1,21 @@ # # Copyright (c) 2016 Google, Inc. +# Written by Mike Klein and Matt Sarett +# Derived from the ARM supporting code in libpng/configure.ac, which was +# Copyright (c) 2004-2015 Glenn Randers-Pehrson +# +# Last changed in libpng 1.6.22 [(PENDING RELEASE)] # # This code is released under the libpng license. # For conditions of distribution and use, see the disclaimer # and license in png.h # - # In order to compile Intel SSE optimizations for libpng, please add # the following code to configure.ac under HOST SPECIFIC OPTIONS # directly beneath the section for ARM. 
# INTEL -# === +# ===== # # INTEL SSE (SIMD) support. diff --git a/contrib/intel/filter_sse2_intrinsics.c b/contrib/intel/filter_sse2_intrinsics.c index b4ca61fa5..7c359b580 100644 --- a/contrib/intel/filter_sse2_intrinsics.c +++ b/contrib/intel/filter_sse2_intrinsics.c @@ -2,6 +2,9 @@ /* filter_sse2_intrinsics.c - SSE2 optimized filter functions * * Copyright (c) 2016 Google, Inc. + * Written by Mike Klein and Matt Sarett + * Derived from arm/filter_neon_intrinsics.c, which was + * Copyright (c) 2014 Glenn Randers-Pehrson * * Last changed in libpng 1.6.22 [(PENDING RELEASE)] * @@ -18,12 +21,13 @@ #include -// Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d). -// They're positioned like this: -// prev: c b -// row: a d -// The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be -// whichever of a, b, or c is closest to p=a+b-c. +/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d). + * They're positioned like this: + * prev: c b + * row: a d + * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be + * whichever of a, b, or c is closest to p=a+b-c. + */ static __m128i load3(const void* p) { png_uint_32 packed; @@ -47,9 +51,10 @@ static void store4(void* p, __m128i v) { void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row, png_const_bytep prev) { - // The Sub filter predicts each pixel as the previous pixel, a. - // There is no pixel to the left of the first pixel. It's encoded directly. - // That works with our main loop if we just say that left pixel was zero. + /* The Sub filter predicts each pixel as the previous pixel, a. + * There is no pixel to the left of the first pixel. It's encoded directly. + * That works with our main loop if we just say that left pixel was zero. 
+ */ __m128i a, d = _mm_setzero_si128(); int rb = row_info->rowbytes; @@ -66,9 +71,10 @@ void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row, void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row, png_const_bytep prev) { - // The Sub filter predicts each pixel as the previous pixel, a. - // There is no pixel to the left of the first pixel. It's encoded directly. - // That works with our main loop if we just say that left pixel was zero. + /* The Sub filter predicts each pixel as the previous pixel, a. + * There is no pixel to the left of the first pixel. It's encoded directly. + * That works with our main loop if we just say that left pixel was zero. + */ __m128i a, d = _mm_setzero_si128(); int rb = row_info->rowbytes; @@ -85,10 +91,11 @@ void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row, void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row, png_const_bytep prev) { - // The Avg filter predicts each pixel as the (truncated) average of a and b. - // There's no pixel to the left of the first pixel. Luckily, it's - // predicted to be half of the pixel above it. So again, this works - // perfectly with our loop if we make sure a starts at zero. + /* The Avg filter predicts each pixel as the (truncated) average of a and b. + * There's no pixel to the left of the first pixel. Luckily, it's + * predicted to be half of the pixel above it. So again, this works + * perfectly with our loop if we make sure a starts at zero. + */ const __m128i zero = _mm_setzero_si128(); __m128i b; __m128i a, d = zero; @@ -98,9 +105,9 @@ void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row, b = load3(prev); a = d; d = load3(row ); - // PNG requires a truncating average, so we can't just use _mm_avg_epu8... + /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */ __m128i avg = _mm_avg_epu8(a,b); - // ...but we can fix it up by subtracting off 1 if it rounded up. 
+ /* ...but we can fix it up by subtracting off 1 if it rounded up. */ avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), _mm_set1_epi8(1))); @@ -116,10 +123,11 @@ void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row, void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row, png_const_bytep prev) { - // The Avg filter predicts each pixel as the (truncated) average of a and b. - // There's no pixel to the left of the first pixel. Luckily, it's - // predicted to be half of the pixel above it. So again, this works - // perfectly with our loop if we make sure a starts at zero. + /* The Avg filter predicts each pixel as the (truncated) average of a and b. + * There's no pixel to the left of the first pixel. Luckily, it's + * predicted to be half of the pixel above it. So again, this works + * perfectly with our loop if we make sure a starts at zero. + */ const __m128i zero = _mm_setzero_si128(); __m128i b; __m128i a, d = zero; @@ -129,9 +137,9 @@ void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row, b = load4(prev); a = d; d = load4(row ); - // PNG requires a truncating average, so we can't just use _mm_avg_epu8... + /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */ __m128i avg = _mm_avg_epu8(a,b); - // ...but we can fix it up by subtracting off 1 if it rounded up. + /* ...but we can fix it up by subtracting off 1 if it rounded up. */ avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), _mm_set1_epi8(1))); @@ -144,25 +152,26 @@ void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row, } } -// Returns |x| for 16-bit lanes. +/* Returns |x| for 16-bit lanes. */ static __m128i abs_i16(__m128i x) { #if PNG_INTEL_SSE_IMPLEMENTATION >= 2 return _mm_abs_epi16(x); #else - // Read this all as, return x<0 ? -x : x. - // To negate two's complement, you flip all the bits then add 1. + /* Read this all as, return x<0 ? -x : x. 
+ * To negate two's complement, you flip all the bits then add 1. + */ __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128()); - // Flip negative lanes. + /* Flip negative lanes. */ x = _mm_xor_si128(x, is_negative); - // +1 to negative lanes, else +0. + /* +1 to negative lanes, else +0. */ x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15)); return x; #endif } -// Bytewise c ? t : e. +/* Bytewise c ? t : e. */ static __m128i if_then_else(__m128i c, __m128i t, __m128i e) { #if PNG_INTEL_SSE_IMPLEMENTATION >= 3 return _mm_blendv_epi8(e,t,c); @@ -174,50 +183,52 @@ static __m128i if_then_else(__m128i c, __m128i t, __m128i e) { void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row, png_const_bytep prev) { - // Paeth tries to predict pixel d using the pixel to the left of it, a, - // and two pixels from the previous row, b and c: - // prev: c b - // row: a d - // The Paeth function predicts d to be whichever of a, b, or c is nearest to - // p=a+b-c. - - // The first pixel has no left context, and so uses an Up filter, p = b. - // This works naturally with our main loop's p = a+b-c if we force a and c - // to zero. - // Here we zero b and d, which become c and a respectively at the start of - // the loop. + /* Paeth tries to predict pixel d using the pixel to the left of it, a, + * and two pixels from the previous row, b and c: + * prev: c b + * row: a d + * The Paeth function predicts d to be whichever of a, b, or c is nearest to + * p=a+b-c. + * + * The first pixel has no left context, and so uses an Up filter, p = b. + * This works naturally with our main loop's p = a+b-c if we force a and c + * to zero. + * Here we zero b and d, which become c and a respectively at the start of + * the loop. + */ const __m128i zero = _mm_setzero_si128(); __m128i c, b = zero, a, d = zero; int rb = row_info->rowbytes; while (rb > 0) { - // It's easiest to do this math (particularly, deal with pc) with 16-bit - // intermediates. 
+ /* It's easiest to do this math (particularly, deal with pc) with 16-bit + * intermediates. + */ c = b; b = _mm_unpacklo_epi8(load3(prev), zero); a = d; d = _mm_unpacklo_epi8(load3(row ), zero); - // (p-a) == (a+b-c - a) == (b-c) + /* (p-a) == (a+b-c - a) == (b-c) */ __m128i pa = _mm_sub_epi16(b,c); - // (p-b) == (a+b-c - b) == (a-c) + /* (p-b) == (a+b-c - b) == (a-c) */ __m128i pb = _mm_sub_epi16(a,c); - // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) + /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */ __m128i pc = _mm_add_epi16(pa,pb); - pa = abs_i16(pa); // |p-a| - pb = abs_i16(pb); // |p-b| - pc = abs_i16(pc); // |p-c| + pa = abs_i16(pa); /* |p-a| */ + pb = abs_i16(pb); /* |p-b| */ + pc = abs_i16(pc); /* |p-c| */ __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); - // Paeth breaks ties favoring a over b over c. + /* Paeth breaks ties favoring a over b over c. */ __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c)); - // Note `_epi8`: we need addition to wrap modulo 255. + /* Note `_epi8`: we need addition to wrap modulo 255. */ d = _mm_add_epi8(d, nearest); store3(row, _mm_packus_epi16(d,d)); @@ -230,50 +241,52 @@ void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row, void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row, png_const_bytep prev) { - // Paeth tries to predict pixel d using the pixel to the left of it, a, - // and two pixels from the previous row, b and c: - // prev: c b - // row: a d - // The Paeth function predicts d to be whichever of a, b, or c is nearest to - // p=a+b-c. - - // The first pixel has no left context, and so uses an Up filter, p = b. - // This works naturally with our main loop's p = a+b-c if we force a and c - // to zero. - // Here we zero b and d, which become c and a respectively at the start of - // the loop. 
+ /* Paeth tries to predict pixel d using the pixel to the left of it, a, + * and two pixels from the previous row, b and c: + * prev: c b + * row: a d + * The Paeth function predicts d to be whichever of a, b, or c is nearest to + * p=a+b-c. + * + * The first pixel has no left context, and so uses an Up filter, p = b. + * This works naturally with our main loop's p = a+b-c if we force a and c + * to zero. + * Here we zero b and d, which become c and a respectively at the start of + * the loop. + */ const __m128i zero = _mm_setzero_si128(); __m128i c, b = zero, a, d = zero; int rb = row_info->rowbytes; while (rb > 0) { - // It's easiest to do this math (particularly, deal with pc) with 16-bit - // intermediates. + /* It's easiest to do this math (particularly, deal with pc) with 16-bit + * intermediates. + */ c = b; b = _mm_unpacklo_epi8(load4(prev), zero); a = d; d = _mm_unpacklo_epi8(load4(row ), zero); - // (p-a) == (a+b-c - a) == (b-c) + /* (p-a) == (a+b-c - a) == (b-c) */ __m128i pa = _mm_sub_epi16(b,c); - // (p-b) == (a+b-c - b) == (a-c) + /* (p-b) == (a+b-c - b) == (a-c) */ __m128i pb = _mm_sub_epi16(a,c); - // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) + /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */ __m128i pc = _mm_add_epi16(pa,pb); - pa = abs_i16(pa); // |p-a| - pb = abs_i16(pb); // |p-b| - pc = abs_i16(pc); // |p-c| + pa = abs_i16(pa); /* |p-a| */ + pb = abs_i16(pb); /* |p-b| */ + pc = abs_i16(pc); /* |p-c| */ __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); - // Paeth breaks ties favoring a over b over c. + /* Paeth breaks ties favoring a over b over c. */ __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c)); - // Note `_epi8`: we need addition to wrap modulo 255. + /* Note `_epi8`: we need addition to wrap modulo 255. 
*/ d = _mm_add_epi8(d, nearest); store4(row, _mm_packus_epi16(d,d)); diff --git a/contrib/intel/intel_init.c b/contrib/intel/intel_init.c index 80fcfc2ec..fc0d9abfd 100644 --- a/contrib/intel/intel_init.c +++ b/contrib/intel/intel_init.c @@ -2,6 +2,9 @@ /* intel_init.c - SSE2 optimized filter functions * * Copyright (c) 2016 Google, Inc. + * Written by Mike Klein and Matt Sarett + * Derived from arm/arm_init.c, which was + * Copyright (c) 2014 Glenn Randers-Pehrson * * Last changed in libpng 1.6.22 [(PENDING RELEASE)] * @@ -18,13 +21,14 @@ void png_init_filter_functions_sse2(png_structp pp, unsigned int bpp) { - // The techniques used to implement each of these filters in SSE operate on - // one pixel at a time. - // So they generally speed up 3bpp images about 3x, 4bpp images about 4x. - // They can scale up to 6 and 8 bpp images and down to 2 bpp images, - // but they'd not likely have any benefit for 1bpp images. - // Most of these can be implemented using only MMX and 64-bit registers, - // but they end up a bit slower than using the equally-ubiquitous SSE2. + /* The techniques used to implement each of these filters in SSE operate on + * one pixel at a time. + * So they generally speed up 3bpp images about 3x, 4bpp images about 4x. + * They can scale up to 6 and 8 bpp images and down to 2 bpp images, + * but they'd not likely have any benefit for 1bpp images. + * Most of these can be implemented using only MMX and 64-bit registers, + * but they end up a bit slower than using the equally-ubiquitous SSE2. + */ if (bpp == 3) { pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_sse2; @@ -40,7 +44,9 @@ png_init_filter_functions_sse2(png_structp pp, unsigned int bpp) png_read_filter_row_paeth4_sse2; } - // No need optimize PNG_FILTER_VALUE_UP. The compiler should autovectorize. + /* No need to optimize PNG_FILTER_VALUE_UP. The compiler should + * autovectorize. + */ } #endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */