[libpng16] Moved SSE2 optimization code into the main libpng source directory.

Configure libpng with "configure --enable-intel-sse" or compile libpng with "-DPNG_INTEL_SSE" in CPPFLAGS to enable it. This patch was previously applied to libpng-1.6.28rc03 but withdrawn to allow time for QA.
2025-07-10 18:04:09 +02:00 · 2017-01-05 18:09:33 -06:00
parent ebede25ceb
commit bef76802de
8 changed files with 80 additions and 367 deletions
--- a/contrib/intel/INSTALL
+++ b/contrib/intel/INSTALL
@@ -1,158 +0,0 @@
-Enabling SSE support
-
-Copyright (c) 2016 Google, Inc.
-Written by Mike Klein, Matt Sarett
-
-This INSTALL file written by Glenn Randers-Pehrson, 2016.
-
-If you have moved intel_init.c and filter_sse2_intrinsics.c to a different
-directory, be sure to update the '#include "../../pngpriv.h"' line in both
-files if necessary to point to the correct relative location of pngpriv.h
-with respect to the new location of those files.
-
-To enable SSE support in libpng, follow the instructions in I, II, or III,
-below:
-
-I. Using patched "configure" scripts:
-
-First, apply intel_sse.patch in your build directory.
-
-   patch -i contrib/intel/intel_sse.patch -p1
-
-Then, if you are not building in a new GIT clone, e.g., in a tar
-distribution, remove any existing pre-built configure scripts:
-
-   ./configure --enable-maintainer-mode
-   make maintainer-clean
-   ./autogen.sh --maintainer --clean
-
-Finally, configure libpng with -DPNG_INTEL_SSE in CPPFLAGS:
-
-   ./autogen.sh --maintainer
-   CPPFLAGS="-DPNG_INTEL_SSE" ./configure [options]
-   make CPPFLAGS="-DPNG_INTEL_SSE" [options]
-   make
-
-II. Using a custom makefile:
-
-If you are using a custom makefile makefile, you will have to update it
-manually to include contrib/intel/*.o in the dependencies, and to define
-PNG_INTEL_SSE.
-
-III. Using manually updated "configure" scripts:
-
-If you prefer, manually edit pngpriv.h, configure.ac, and Makefile.am,
-following the instructions below, then follow the instructions in
-section II of INSTALL in the main libpng directory, then configure libpng
-with -DPNG_INTEL_SSE in CPPFLAGS.
-
-1. Add the following code to configure.ac under HOST SPECIFIC OPTIONS
-directly beneath the section for ARM:
-
-----------------cut----------------
-# INTEL
-# =====
-#
-# INTEL SSE (SIMD) support.
-
-AC_ARG_ENABLE([intel-sse],
-   AS_HELP_STRING([[[--enable-intel-sse]]],
-      [Enable Intel SSE optimizations: =no/off, yes/on:]
-      [no/off: disable the optimizations;]
-      [yes/on: enable the optimizations.]
-      [If not specified: determined by the compiler.]),
-   [case "$enableval" in
-      no|off)
-         # disable the default enabling:
-         AC_DEFINE([PNG_INTEL_SSE_OPT], [0],
-                   [Disable Intel SSE optimizations])
-         # Prevent inclusion of the assembler files below:
-         enable_intel_sse=no;;
-      yes|on)
-         AC_DEFINE([PNG_INTEL_SSE_OPT], [1],
-                   [Enable Intel SSE optimizations]);;
-      *)
-         AC_MSG_ERROR([--enable-intel-sse=${enable_intel_sse}: invalid value])
-   esac])
-
-# Add Intel specific files to all builds where the host_cpu is Intel ('x86*')
-# or where Intel optimizations were explicitly requested (this allows a
-# fallback if a future host CPU does not match 'x86*')
-AM_CONDITIONAL([PNG_INTEL_SSE],
-   [test "$enable_intel_sse" != 'no' &&
-    case "$host_cpu" in
-      i?86|x86_64) :;;
-      *)    test "$enable_intel_sse" != '';;
-    esac])
-----------------cut----------------
-
-2. Add the following code to Makefile.am under HOST SPECIFIC OPTIONS
-directly beneath the "if PNG_ARM_NEON ... endif" statement:
-
-----------------cut----------------
-if PNG_INTEL_SSE
-libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES += contrib/intel/intel_init.c\
-    contrib/intel/filter_sse2_intrinsics.c
-endif
-----------------cut----------------
-
-3. Add the following lines to pngpriv.h, following the PNG_ARM_NEON_OPT
-code:
-
-----------------cut----------------
-#ifndef PNG_INTEL_SSE_OPT
-#   ifdef PNG_INTEL_SSE
-      /* Only check for SSE if the build configuration has been modified to
-       * enable SSE optimizations.  This means that these optimizations will
-       * be off by default.  See contrib/intel for more details.
-       */
-#     if defined(__SSE4_1__) || defined(__AVX__) || defined(__SSSE3__) || \
-       defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) || \
-       (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
-#         define PNG_INTEL_SSE_OPT 1
-#      endif
-#   endif
-#endif
-
-#if PNG_INTEL_SSE_OPT > 0
-#   ifndef PNG_INTEL_SSE_IMPLEMENTATION
-#      if defined(__SSE4_1__) || defined(__AVX__)
-          /* We are not actually using AVX, but checking for AVX is the best
-             way we can detect SSE4.1 and SSSE3 on MSVC.
-          */
-#         define PNG_INTEL_SSE_IMPLEMENTATION 3
-#      elif defined(__SSSE3__)
-#         define PNG_INTEL_SSE_IMPLEMENTATION 2
-#      elif defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) || \
-       (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
-#         define PNG_INTEL_SSE_IMPLEMENTATION 1
-#      else
-#         define PNG_INTEL_SSE_IMPLEMENTATION 0
-#      endif
-#   endif
-
-#   if PNG_INTEL_SSE_IMPLEMENTATION > 0
-#      define PNG_FILTER_OPTIMIZATIONS png_init_filter_functions_sse2
-#   endif
-#endif
-
-----------------cut----------------
-
-4. Add the following lines to pngpriv.h, following the prototype for
-png_read_filter_row_paeth4_neon:
-
-----------------cut----------------
-PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub3_sse2,(png_row_infop
-    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
-PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub4_sse2,(png_row_infop
-    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
-PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg3_sse2,(png_row_infop
-    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
-PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg4_sse2,(png_row_infop
-    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
-PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_sse2,(png_row_infop
-    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
-PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_sse2,(png_row_infop
-    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
-
-----------------cut----------------
--- a/contrib/intel/filter_sse2_intrinsics.c
+++ b/contrib/intel/filter_sse2_intrinsics.c
@@ -1,379 +0,0 @@
-
-/* filter_sse2_intrinsics.c - SSE2 optimized filter functions
- *
- * Copyright (c) 2016 Google, Inc.
- * Written by Mike Klein and Matt Sarett
- * Derived from arm/filter_neon_intrinsics.c, which was
- * Copyright (c) 2014,2016 Glenn Randers-Pehrson
- *
- * Last changed in libpng 1.6.24 [August 4, 2016]
- *
- * This code is released under the libpng license.
- * For conditions of distribution and use, see the disclaimer
- * and license in png.h
- */
-
-#include "../../pngpriv.h"
-
-#ifdef PNG_READ_SUPPORTED
-
-#if PNG_INTEL_SSE_IMPLEMENTATION > 0
-
-#include <immintrin.h>
-
-/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
- * They're positioned like this:
- *    prev:  c b
- *    row:   a d
- * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
- * whichever of a, b, or c is closest to p=a+b-c.
- */
-
-static __m128i load4(const void* p) {
-   return _mm_cvtsi32_si128(*(const int*)p);
-}
-
-static void store4(void* p, __m128i v) {
-   *(int*)p = _mm_cvtsi128_si32(v);
-}
-
-static __m128i load3(const void* p) {
-   /* We'll load 2 bytes, then 1 byte,
-    * then mask them together, and finally load into SSE.
-    */
-   const png_uint_16* p01 = p;
-   const png_byte*    p2  = (const png_byte*)(p01+1);
-
-   png_uint_32 v012 = (png_uint_32)(*p01)
-                    | (png_uint_32)(*p2) << 16;
-   return load4(&v012);
-}
-
-static void store3(void* p, __m128i v) {
-   /* We'll pull from SSE as a 32-bit int, then write
-    * its bottom two bytes, then its third byte.
-    */
-   png_uint_32 v012;
-   store4(&v012, v);
-
-   png_uint_16* p01 = p;
-   png_byte*    p2  = (png_byte*)(p01+1);
-   *p01 = v012;
-   *p2  = v012 >> 16;
-}
-
-void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,
-   png_const_bytep prev)
-{
-   /* The Sub filter predicts each pixel as the previous pixel, a.
-    * There is no pixel to the left of the first pixel.  It's encoded directly.
-    * That works with our main loop if we just say that left pixel was zero.
-    */
-   png_debug(1, "in png_read_filter_row_sub3_sse2");
-   __m128i a, d = _mm_setzero_si128();
-
-   int rb = row_info->rowbytes;
-   while (rb >= 4) {
-      a = d; d = load4(row);
-      d = _mm_add_epi8(d, a);
-      store3(row, d);
-
-      row += 3;
-      rb  -= 3;
-   }
-   if (rb > 0) {
-      a = d; d = load3(row);
-      d = _mm_add_epi8(d, a);
-      store3(row, d);
-
-      row += 3;
-      rb  -= 3;
-   }
-}
-
-void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row,
-   png_const_bytep prev)
-{
-   /* The Sub filter predicts each pixel as the previous pixel, a.
-    * There is no pixel to the left of the first pixel.  It's encoded directly.
-    * That works with our main loop if we just say that left pixel was zero.
-    */
-   png_debug(1, "in png_read_filter_row_sub4_sse2");
-   __m128i a, d = _mm_setzero_si128();
-
-   int rb = row_info->rowbytes;
-   while (rb > 0) {
-      a = d; d = load4(row);
-      d = _mm_add_epi8(d, a);
-      store4(row, d);
-
-      row += 4;
-      rb  -= 4;
-   }
-}
-
-void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
-   png_const_bytep prev)
-{
-   /* The Avg filter predicts each pixel as the (truncated) average of a and b.
-    * There's no pixel to the left of the first pixel.  Luckily, it's
-    * predicted to be half of the pixel above it.  So again, this works
-    * perfectly with our loop if we make sure a starts at zero.
-    */
-   png_debug(1, "in png_read_filter_row_avg3_sse2");
-   const __m128i zero = _mm_setzero_si128();
-   __m128i    b;
-   __m128i a, d = zero;
-
-   int rb = row_info->rowbytes;
-   while (rb >= 4) {
-             b = load4(prev);
-      a = d; d = load4(row );
-
-      /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
-      __m128i avg = _mm_avg_epu8(a,b);
-      /* ...but we can fix it up by subtracting off 1 if it rounded up. */
-      avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
-                                            _mm_set1_epi8(1)));
-      d = _mm_add_epi8(d, avg);
-      store3(row, d);
-
-      prev += 3;
-      row  += 3;
-      rb   -= 3;
-   }
-   if (rb > 0) {
-             b = load3(prev);
-      a = d; d = load3(row );
-
-      /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
-      __m128i avg = _mm_avg_epu8(a,b);
-      /* ...but we can fix it up by subtracting off 1 if it rounded up. */
-      avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
-                                            _mm_set1_epi8(1)));
-
-      d = _mm_add_epi8(d, avg);
-      store3(row, d);
-
-      prev += 3;
-      row  += 3;
-      rb   -= 3;
-   }
-}
-
-void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row,
-   png_const_bytep prev)
-{
-   /* The Avg filter predicts each pixel as the (truncated) average of a and b.
-    * There's no pixel to the left of the first pixel.  Luckily, it's
-    * predicted to be half of the pixel above it.  So again, this works
-    * perfectly with our loop if we make sure a starts at zero.
-    */
-   png_debug(1, "in png_read_filter_row_avg4_sse2");
-   const __m128i zero = _mm_setzero_si128();
-   __m128i    b;
-   __m128i a, d = zero;
-
-   int rb = row_info->rowbytes;
-   while (rb > 0) {
-             b = load4(prev);
-      a = d; d = load4(row );
-
-      /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
-      __m128i avg = _mm_avg_epu8(a,b);
-      /* ...but we can fix it up by subtracting off 1 if it rounded up. */
-      avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
-                                            _mm_set1_epi8(1)));
-
-      d = _mm_add_epi8(d, avg);
-      store4(row, d);
-
-      prev += 4;
-      row  += 4;
-      rb   -= 4;
-   }
-}
-
-/* Returns |x| for 16-bit lanes. */
-static __m128i abs_i16(__m128i x) {
-#if PNG_INTEL_SSE_IMPLEMENTATION >= 2
-   return _mm_abs_epi16(x);
-#else
-   /* Read this all as, return x<0 ? -x : x.
-   * To negate two's complement, you flip all the bits then add 1.
-    */
-   __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());
-
-   /* Flip negative lanes. */
-   x = _mm_xor_si128(x, is_negative);
-
-   /* +1 to negative lanes, else +0. */
-   x = _mm_sub_epi16(x, is_negative);
-   return x;
-#endif
-}
-
-/* Bytewise c ? t : e. */
-static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {
-#if PNG_INTEL_SSE_IMPLEMENTATION >= 3
-   return _mm_blendv_epi8(e,t,c);
-#else
-   return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));
-#endif
-}
-
-void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row,
-   png_const_bytep prev)
-{
-   /* Paeth tries to predict pixel d using the pixel to the left of it, a,
-    * and two pixels from the previous row, b and c:
-    *   prev: c b
-    *   row:  a d
-    * The Paeth function predicts d to be whichever of a, b, or c is nearest to
-    * p=a+b-c.
-    *
-    * The first pixel has no left context, and so uses an Up filter, p = b.
-    * This works naturally with our main loop's p = a+b-c if we force a and c
-    * to zero.
-    * Here we zero b and d, which become c and a respectively at the start of
-    * the loop.
-    */
-   png_debug(1, "in png_read_filter_row_paeth3_sse2");
-   const __m128i zero = _mm_setzero_si128();
-   __m128i c, b = zero,
-           a, d = zero;
-
-   int rb = row_info->rowbytes;
-   while (rb >= 4) {
-      /* It's easiest to do this math (particularly, deal with pc) with 16-bit
-       * intermediates.
-       */
-      c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
-      a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
-
-      /* (p-a) == (a+b-c - a) == (b-c) */
-      __m128i pa = _mm_sub_epi16(b,c);
-
-      /* (p-b) == (a+b-c - b) == (a-c) */
-      __m128i pb = _mm_sub_epi16(a,c);
-
-      /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
-      __m128i pc = _mm_add_epi16(pa,pb);
-
-      pa = abs_i16(pa);  /* |p-a| */
-      pb = abs_i16(pb);  /* |p-b| */
-      pc = abs_i16(pc);  /* |p-c| */
-
-      __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
-
-      /* Paeth breaks ties favoring a over b over c. */
-      __m128i nearest  = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
-                         if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
-                                                                     c));
-
-      /* Note `_epi8`: we need addition to wrap modulo 255. */
-      d = _mm_add_epi8(d, nearest);
-      store3(row, _mm_packus_epi16(d,d));
-
-      prev += 3;
-      row  += 3;
-      rb   -= 3;
-   }
-   if (rb > 0) {
-      /* It's easiest to do this math (particularly, deal with pc) with 16-bit
-       * intermediates.
-       */
-      c = b; b = _mm_unpacklo_epi8(load3(prev), zero);
-      a = d; d = _mm_unpacklo_epi8(load3(row ), zero);
-
-      /* (p-a) == (a+b-c - a) == (b-c) */
-      __m128i pa = _mm_sub_epi16(b,c);
-
-      /* (p-b) == (a+b-c - b) == (a-c) */
-      __m128i pb = _mm_sub_epi16(a,c);
-
-      /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
-      __m128i pc = _mm_add_epi16(pa,pb);
-
-      pa = abs_i16(pa);  /* |p-a| */
-      pb = abs_i16(pb);  /* |p-b| */
-      pc = abs_i16(pc);  /* |p-c| */
-
-      __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
-
-      /* Paeth breaks ties favoring a over b over c. */
-      __m128i nearest  = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
-                         if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
-                                                                     c));
-
-      /* Note `_epi8`: we need addition to wrap modulo 255. */
-      d = _mm_add_epi8(d, nearest);
-      store3(row, _mm_packus_epi16(d,d));
-
-      prev += 3;
-      row  += 3;
-      rb   -= 3;
-   }
-}
-
-void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,
-   png_const_bytep prev)
-{
-   /* Paeth tries to predict pixel d using the pixel to the left of it, a,
-    * and two pixels from the previous row, b and c:
-    *   prev: c b
-    *   row:  a d
-    * The Paeth function predicts d to be whichever of a, b, or c is nearest to
-    * p=a+b-c.
-    *
-    * The first pixel has no left context, and so uses an Up filter, p = b.
-    * This works naturally with our main loop's p = a+b-c if we force a and c
-    * to zero.
-    * Here we zero b and d, which become c and a respectively at the start of
-    * the loop.
-    */
-   png_debug(1, "in png_read_filter_row_paeth4_sse2");
-   const __m128i zero = _mm_setzero_si128();
-   __m128i c, b = zero,
-           a, d = zero;
-
-   int rb = row_info->rowbytes;
-   while (rb > 0) {
-      /* It's easiest to do this math (particularly, deal with pc) with 16-bit
-       * intermediates.
-       */
-      c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
-      a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
-
-      /* (p-a) == (a+b-c - a) == (b-c) */
-      __m128i pa = _mm_sub_epi16(b,c);
-
-      /* (p-b) == (a+b-c - b) == (a-c) */
-      __m128i pb = _mm_sub_epi16(a,c);
-
-      /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
-      __m128i pc = _mm_add_epi16(pa,pb);
-
-      pa = abs_i16(pa);  /* |p-a| */
-      pb = abs_i16(pb);  /* |p-b| */
-      pc = abs_i16(pc);  /* |p-c| */
-
-      __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
-
-      /* Paeth breaks ties favoring a over b over c. */
-      __m128i nearest  = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
-                         if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
-                                                                     c));
-
-      /* Note `_epi8`: we need addition to wrap modulo 255. */
-      d = _mm_add_epi8(d, nearest);
-      store4(row, _mm_packus_epi16(d,d));
-
-      prev += 4;
-      row  += 4;
-      rb   -= 4;
-   }
-}
-
-#endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */
-#endif /* READ */
--- a/contrib/intel/intel_init.c
+++ b/contrib/intel/intel_init.c
@@ -1,54 +0,0 @@
-
-/* intel_init.c - SSE2 optimized filter functions
- *
- * Copyright (c) 2016 Google, Inc.
- * Written by Mike Klein and Matt Sarett
- * Derived from arm/arm_init.c, which was
- * Copyright (c) 2014,2016 Glenn Randers-Pehrson
- *
- * Last changed in libpng 1.6.22 [May 26, 2016]
- *
- * This code is released under the libpng license.
- * For conditions of distribution and use, see the disclaimer
- * and license in png.h
- */
-
-#include "../../pngpriv.h"
-
-#ifdef PNG_READ_SUPPORTED
-#if PNG_INTEL_SSE_IMPLEMENTATION > 0
-
-void
-png_init_filter_functions_sse2(png_structp pp, unsigned int bpp)
-{
-   /* The techniques used to implement each of these filters in SSE operate on
-    * one pixel at a time.
-    * So they generally speed up 3bpp images about 3x, 4bpp images about 4x.
-    * They can scale up to 6 and 8 bpp images and down to 2 bpp images,
-    * but they'd not likely have any benefit for 1bpp images.
-    * Most of these can be implemented using only MMX and 64-bit registers,
-    * but they end up a bit slower than using the equally-ubiquitous SSE2.
-   */
-   png_debug(1, "in png_init_filter_functions_sse2");
-   if (bpp == 3)
-   {
-      pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_sse2;
-      pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_sse2;
-      pp->read_filter[PNG_FILTER_VALUE_PAETH-1] =
-         png_read_filter_row_paeth3_sse2;
-   }
-   else if (bpp == 4)
-   {
-      pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_sse2;
-      pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_sse2;
-      pp->read_filter[PNG_FILTER_VALUE_PAETH-1] =
-          png_read_filter_row_paeth4_sse2;
-   }
-
-   /* No need optimize PNG_FILTER_VALUE_UP.  The compiler should
-    * autovectorize.
-    */
-}
-
-#endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */
-#endif /* PNG_READ_SUPPORTED */
--- a/contrib/intel/intel_sse.patch
+++ b/contrib/intel/intel_sse.patch
@@ -1,190 +0,0 @@
-diff --git a/configure.ac b/configure.ac
--- a/configure.ac	2016-08-29 11:46:27.000000000 -0400
-+++ b/configure.ac	2016-08-29 16:57:03.866355018 -0400
-@@ -386,16 +386,51 @@ AC_ARG_ENABLE([mips-msa],
- # future host CPU does not match 'mips*')
- 
- AM_CONDITIONAL([PNG_MIPS_MSA],
-    [test "$enable_mips_msa" != 'no' &&
-     case "$host_cpu" in
-       mipsel*|mips64el*) :;;
-     esac])
- 
-+# INTEL
-+# =====
-+#
-+# INTEL SSE (SIMD) support.
-+
-+AC_ARG_ENABLE([intel-sse],
-+   AS_HELP_STRING([[[--enable-intel-sse]]],
-+      [Enable Intel SSE optimizations: =no/off, yes/on:]
-+      [no/off: disable the optimizations;]
-+      [yes/on: enable the optimizations.]
-+      [If not specified: determined by the compiler.]),
-+   [case "$enableval" in
-+      no|off)
-+         # disable the default enabling:
-+         AC_DEFINE([PNG_INTEL_SSE_OPT], [0],
-+                   [Disable Intel SSE optimizations])
-+         # Prevent inclusion of the assembler files below:
-+         enable_intel_sse=no;;
-+      yes|on)
-+         AC_DEFINE([PNG_INTEL_SSE_OPT], [1],
-+                   [Enable Intel SSE optimizations]);;
-+      *)
-+         AC_MSG_ERROR([--enable-intel-sse=${enable_intel_sse}: invalid value])
-+   esac])
-+
-+# Add Intel specific files to all builds where the host_cpu is Intel ('x86*')
-+# or where Intel optimizations were explicitly requested (this allows a
-+# fallback if a future host CPU does not match 'x86*')
-+AM_CONDITIONAL([PNG_INTEL_SSE],
-+   [test "$enable_intel_sse" != 'no' &&
-+    case "$host_cpu" in
-+      i?86|x86_64) :;;
-+      *)    test "$enable_intel_sse" != '';;
-+    esac])
-+
- AC_MSG_NOTICE([[Extra options for compiler: $PNG_COPTS]])
- 
- # Config files, substituting as above
- AC_CONFIG_FILES([Makefile libpng.pc:libpng.pc.in])
- AC_CONFIG_FILES([libpng-config:libpng-config.in],
-    [chmod +x libpng-config])
- 
- AC_OUTPUT
-diff --git a/Makefile.am b/Makefile.am
--- a/Makefile.am	2016-08-29 11:46:27.000000000 -0400
-+++ b/Makefile.am	2016-08-29 16:57:45.955528215 -0400
-@@ -97,16 +97,21 @@ libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SO
- 	arm/filter_neon.S arm/filter_neon_intrinsics.c
- endif
- 
- if PNG_MIPS_MSA
- libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES += mips/mips_init.c\
- 	mips/filter_msa_intrinsics.c
- endif
- 
-+if PNG_INTEL_SSE
-+libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES += contrib/intel/intel_init.c\
-+    contrib/intel/filter_sse2_intrinsics.c
-+endif
-+
- nodist_libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES = pnglibconf.h
- 
- libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_LDFLAGS = -no-undefined -export-dynamic \
- 	-version-number @PNGLIB_MAJOR@@PNGLIB_MINOR@:@PNGLIB_RELEASE@:0
- 
- if HAVE_LD_VERSION_SCRIPT
- #   Versioned symbols and restricted exports
- if HAVE_SOLARIS_LD
-diff --git a/pngpriv.h b/pngpriv.h
--- debug16/pngpriv.h	2016-08-30 10:46:36.000000000 -0400
-+++ libpng16/pngpriv.h	2016-08-30 11:57:25.672280202 -0400
-@@ -185,16 +185,52 @@
- #ifndef PNG_MIPS_MSA_OPT
- #  if defined(__mips_msa) && (__mips_isa_rev >= 5) && defined(PNG_ALIGNED_MEMORY_SUPPORTED)
- #     define PNG_MIPS_MSA_OPT 2
- #  else
- #     define PNG_MIPS_MSA_OPT 0
- #  endif
- #endif
- 
-+#ifndef PNG_INTEL_SSE_OPT
-+#   ifdef PNG_INTEL_SSE
-+      /* Only check for SSE if the build configuration has been modified to
-+       * enable SSE optimizations.  This means that these optimizations will
-+       * be off by default.  See contrib/intel for more details.
-+       */
-+#     if defined(__SSE4_1__) || defined(__AVX__) || defined(__SSSE3__) || \
-+       defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) || \
-+       (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
-+#         define PNG_INTEL_SSE_OPT 1
-+#      endif
-+#   endif
-+#endif
-+
-+#if PNG_INTEL_SSE_OPT > 0
-+#   ifndef PNG_INTEL_SSE_IMPLEMENTATION
-+#      if defined(__SSE4_1__) || defined(__AVX__)
-+          /* We are not actually using AVX, but checking for AVX is the best
-+             way we can detect SSE4.1 and SSSE3 on MSVC.
-+          */
-+#         define PNG_INTEL_SSE_IMPLEMENTATION 3
-+#      elif defined(__SSSE3__)
-+#         define PNG_INTEL_SSE_IMPLEMENTATION 2
-+#      elif defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) || \
-+       (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
-+#         define PNG_INTEL_SSE_IMPLEMENTATION 1
-+#      else
-+#         define PNG_INTEL_SSE_IMPLEMENTATION 0
-+#      endif
-+#   endif
-+
-+#   if PNG_INTEL_SSE_IMPLEMENTATION > 0
-+#      define PNG_FILTER_OPTIMIZATIONS png_init_filter_functions_sse2
-+#   endif
-+#endif
-+
- #if PNG_MIPS_MSA_OPT > 0
- #  define PNG_FILTER_OPTIMIZATIONS png_init_filter_functions_msa
- #  ifndef PNG_MIPS_MSA_IMPLEMENTATION
- #     if defined(__mips_msa)
- #        if defined(__clang__)
- #        elif defined(__GNUC__)
- #           if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 7)
- #              define PNG_MIPS_MSA_IMPLEMENTATION 2
-@@ -1251,16 +1287,31 @@ PNG_INTERNAL_FUNCTION(void,png_read_filt
- PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg4_msa,(png_row_infop
-     row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
- PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_msa,(png_row_infop
-     row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
- PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_msa,(png_row_infop
-     row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
- #endif
- 
-+#if PNG_INTEL_SSE_IMPLEMENTATION > 0
-+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub3_sse2,(png_row_infop
-+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
-+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub4_sse2,(png_row_infop
-+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
-+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg3_sse2,(png_row_infop
-+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
-+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg4_sse2,(png_row_infop
-+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
-+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_sse2,(png_row_infop
-+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
-+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_sse2,(png_row_infop
-+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
-+#endif
-+
- /* Choose the best filter to use and filter the row data */
- PNG_INTERNAL_FUNCTION(void,png_write_find_filter,(png_structrp png_ptr,
-     png_row_infop row_info),PNG_EMPTY);
- 
- #ifdef PNG_SEQUENTIAL_READ_SUPPORTED
- PNG_INTERNAL_FUNCTION(void,png_read_IDAT_data,(png_structrp png_ptr,
-    png_bytep output, png_alloc_size_t avail_out),PNG_EMPTY);
-    /* Read 'avail_out' bytes of data from the IDAT stream.  If the output buffer
-@@ -1986,16 +2037,21 @@ PNG_INTERNAL_FUNCTION(void, PNG_FILTER_O
- PNG_INTERNAL_FUNCTION(void, png_init_filter_functions_neon,
-    (png_structp png_ptr, unsigned int bpp), PNG_EMPTY);
- #endif
- 
- #if PNG_MIPS_MSA_OPT > 0
- PNG_INTERNAL_FUNCTION(void, png_init_filter_functions_msa,
-    (png_structp png_ptr, unsigned int bpp), PNG_EMPTY);
- #endif
-+
-+#  if PNG_INTEL_SSE_IMPLEMENTATION > 0
-+PNG_INTERNAL_FUNCTION(void, png_init_filter_functions_sse2,
-+   (png_structp png_ptr, unsigned int bpp), PNG_EMPTY);
-+#  endif
- #endif
- 
- PNG_INTERNAL_FUNCTION(png_uint_32, png_check_keyword, (png_structrp png_ptr,
-    png_const_charp key, png_bytep new_key), PNG_EMPTY);
- 
- /* Maintainer: Put new private prototypes here ^ */
- 
- #include "pngdebug.h"