diff --git a/README.txt b/README.txt
index f327eeba7..1ea87fe8b 100644
--- a/README.txt
+++ b/README.txt
@@ -1,6 +1,8 @@
 Pngcrush documentation
 
-This is the copyright notice, disclaimer, and license:
+This is a copy of the copyright notice, disclaimer, and license, for
+your convenience (the actual notice appears in the file pngcrush.c; in
+case of any discrepancy, the copy in pngcrush.c shall prevail):
 
 /*
  * COPYRIGHT NOTICE, DISCLAIMER, AND LICENSE:
@@ -38,12 +40,13 @@ This is the copyright notice, disclaimer, and license:
 
 This is the output of "pngcrush" and "pngcrush -help":
 
- | pngcrush 1.5.1, Copyright (C) 1998, 1999, 2000 Glenn Randers-Pehrson
+
+ | pngcrush 1.5.2, Copyright (C) 1998, 1999, 2000 Glenn Randers-Pehrson
  | This is a free, open-source program.  Permission is
  | irrevocably granted to everyone to use this version
  | of pngcrush without payment of any fee.
- | This program was built with libpng version 1.0.8,
- | and is running with  libpng version 1.0.8 - July 24, 2000 (header)
+ | This program was built with libpng version 1.0.9beta2,
+ | and is running with  libpng version 1.0.9beta2 - November 19, 2000 (header)
  |    Copyright (C) 1998, 1999, 2000 Glenn Randers-Pehrson,
  |    Copyright (C) 1996, 1997 Andreas Dilger,
  |    Copyright (C) 1995, Guy Eric Schalnat, Group 42 Inc.,
@@ -66,14 +69,16 @@ options:
           -fix (fix otherwise fatal conditions such as bad CRCs)
         -force (Write a new output file even if larger than input)
             -g gamma (float or fixed*100000, e.g., 0.45455 or 45455)
+         -iccp length "Profile Name" iccp_file
          -itxt b[efore_IDAT]|a[fter_IDAT] "keyword" "text"
             -l zlib_compression_level [0-9]
             -m method [0 through 200]
-          -max maximum_IDAT_size [1 through 524288]
+          -max maximum_IDAT_size [default 524288]
         -no_cc (no color counting)
             -n (no save; does not do compression or write output PNG)
      -plte_len n (truncate PLTE)
             -q (quiet)
+       -reduce (do lossless color type or bit depth reduction)
           -rem chunkname (or "alla" or "allb")
 -replace_gamma gamma (float or fixed*100000) even if gAMA is present.
           -res dpi
@@ -86,29 +91,16 @@ options:
             -h (help and legal notices)
             -p (pause)
 
+options (Note: any option can be spelled out for clarity, e.g.,
+          "pngcrush -dir New -method 7 -remove bkgd *.png"
+          is the same as "pngcrush -d New -m 7 -rem bkgd *.png"):
 
- | pngcrush 1.5.1, Copyright (C) 1998, 1999, 2000 Glenn Randers-Pehrson
- | This is a free, open-source program.  Permission is
- | irrevocably granted to everyone to use this version
- | of pngcrush without payment of any fee.
- | This program was built with libpng version 1.0.8,
- | and is running with  libpng version 1.0.8 - July 24, 2000 (header)
- |    Copyright (C) 1998, 1999, 2000 Glenn Randers-Pehrson,
- |    Copyright (C) 1996, 1997 Andreas Dilger,
- |    Copyright (C) 1995, Guy Eric Schalnat, Group 42 Inc.,
- | and zlib version 1.1.3pc, Copyright (C) 1998,
- |    Jean-loup Gailly and Mark Adler.
-
-
-usage: pngcrush [options] infile.png outfile.png
-       pngcrush -e ext [other options] files.png ...
-       pngcrush -d dir [other options] files.png ...
-
-options:
       -already already_crushed_size [e.g., 8192]
 
                If file has an IDAT greater than this size, it
-               will be considered to be already crushed.
+               will be considered to be already crushed and will
+               not be processed, unless you are making other changes
+               or the "-force" option is present.
 
         -brute (Use brute-force, try 114 different methods [11-124])
 
@@ -168,6 +160,10 @@ options:
                file has no gAMA chunk.  To replace an existing
                gAMA chunk, use the '-replace_gamma' option.
 
+         -iccp length "Profile Name" iccp_file
+
+               file with ICC profile to insert in an iCCP chunk.
+
          -itxt b[efore_IDAT]|a[fter_IDAT] "keyword" "text"
 
                Uncompressed iTXt chunk to insert (see -text).
@@ -188,7 +184,7 @@ options:
                1, 4, and 7 use no filtering; methods 11 and up use 
                specified filter, compression level, and strategy.
 
-          -max maximum_IDAT_size [1 through 524288]
+          -max maximum_IDAT_size [default 524288]
 
         -no_cc (no color counting)
 
@@ -199,11 +195,14 @@ options:
      -plte_len n (truncate PLTE)
 
                Truncates the PLTE.  Be sure not to truncate it to
-
                less than the greatest index present in IDAT.
 
             -q (quiet)
 
+       -reduce (do lossless color type or bit depth reduction)
+
+          (if possible)
+
           -rem chunkname (or "alla" or "allb")
 
                Name of an ancillary chunk or optional PLTE to be
@@ -249,6 +248,9 @@ options:
 
       -version (display the pngcrush version)
 
+               Look for the most recent version of pngcrush at
+               http://pmt.sourceforge.net
+
             -w compression_window_size [32, 16, 8, 4, 2, 1, 512]
 
                Size of the sliding compression window, in kbytes
@@ -262,6 +264,8 @@ options:
                zlib compression strategy to use with the preceding
                '-m method' argument.
 
+         -zmem zlib_compression_mem_level [1-9, default 9]
+
         -zitxt b[efore_IDAT]|a[fter_IDAT] "keyword" "text"
 
                Compressed iTXt chunk to insert (see -text).
@@ -280,3 +284,4 @@ options:
                e.g., type 'pngcrush -pause -help', if the help
                screen scrolls out of sight.
 
+
diff --git a/compress.c b/compress.c
new file mode 100644
index 000000000..1cee47091
--- /dev/null
+++ b/compress.c
@@ -0,0 +1,68 @@
+/* compress.c -- compress a memory buffer
+ * Copyright (C) 1995-1998 Jean-loup Gailly.
+ * For conditions of distribution and use, see copyright notice in zlib.h 
+ */
+
+/* @(#) $Id$ */
+
+#include "zlib.h"
+
+/* ===========================================================================
+     Compresses the source buffer into the destination buffer. The level
+   parameter has the same meaning as in deflateInit.  sourceLen is the byte
+   length of the source buffer. Upon entry, destLen is the total size of the
+   destination buffer, which must be at least 0.1% larger than sourceLen plus
+   12 bytes. Upon exit, destLen is the actual size of the compressed buffer.
+
+     compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_BUF_ERROR if there was not enough room in the output buffer or
+   if either length does not fit in a uInt (the z_stream counters are uInt),
+   Z_STREAM_ERROR if the level parameter is invalid.
+*/
+int ZEXPORT compress2 (dest, destLen, source, sourceLen, level)
+    Bytef *dest;
+    uLongf *destLen;
+    const Bytef *source;
+    uLong sourceLen;
+    int level;
+{
+    z_stream stream;
+    int err;
+
+    stream.next_in = (Bytef*)source;
+    stream.avail_in = (uInt)sourceLen;
+    /* Reject a source that was truncated by the cast (uLong wider than uInt),
+       not only on 16-bit machines: otherwise only part of it is compressed. */
+    if ((uLong)stream.avail_in != sourceLen) return Z_BUF_ERROR;
+    stream.next_out = dest;
+    stream.avail_out = (uInt)*destLen;
+    if ((uLong)stream.avail_out != *destLen) return Z_BUF_ERROR;
+
+    stream.zalloc = (alloc_func)0;
+    stream.zfree = (free_func)0;
+    stream.opaque = (voidpf)0;
+
+    err = deflateInit(&stream, level);
+    if (err != Z_OK) return err;
+
+    err = deflate(&stream, Z_FINISH);
+    if (err != Z_STREAM_END) {
+        deflateEnd(&stream);
+        return err == Z_OK ? Z_BUF_ERROR : err;
+    }
+    *destLen = stream.total_out;
+
+    err = deflateEnd(&stream);
+    return err;
+}
+
+/* compress: same as compress2, at the Z_DEFAULT_COMPRESSION level. */
+int ZEXPORT compress (dest, destLen, source, sourceLen)
+    Bytef *dest;
+    uLongf *destLen;
+    const Bytef *source;
+    uLong sourceLen;
+{
+    int level = Z_DEFAULT_COMPRESSION;
+    return compress2(dest, destLen, source, sourceLen, level);
+}
diff --git a/png.c b/png.c
index 1b2531cd1..164ebf382 100644
--- a/png.c
+++ b/png.c
@@ -1,7 +1,7 @@
 
 /* png.c - location for general purpose libpng functions
  *
- * libpng version 1.0.8 - July 24, 2000
+ * libpng version 1.0.9beta2 - November 19, 2000
  * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
  * (Version 0.88 Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.)
@@ -13,18 +13,18 @@
 #include "png.h"
 
 /* Generate a compiler error if there is an old png.h in the search path. */
-typedef version_1_0_8 Your_png_h_is_not_version_1_0_8;
+typedef version_1_0_9beta2 Your_png_h_is_not_version_1_0_9beta2;
 
 /* Version information for C files.  This had better match the version
  * string defined in png.h.  */
 
 #ifdef PNG_USE_GLOBAL_ARRAYS
 /* png_libpng_ver was changed to a function in version 1.0.5c */
-char png_libpng_ver[12] = "1.0.8";
+const char png_libpng_ver[18] = "1.0.9beta2";
 
 /* png_sig was changed to a function in version 1.0.5c */
 /* Place to hold the signature string for a PNG file. */
-png_byte FARDATA png_sig[8] = {137, 80, 78, 71, 13, 10, 26, 10};
+const png_byte FARDATA png_sig[8] = {137, 80, 78, 71, 13, 10, 26, 10};
 
 /* Invoke global declarations for constant strings for known chunk types */
 PNG_IHDR;
@@ -52,32 +52,33 @@ PNG_zTXt;
 /* arrays to facilitate easy interlacing - use pass (0 - 6) as index */
 
 /* start of interlace block */
-int FARDATA png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
+const int FARDATA png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
 
 /* offset to next interlace block */
-int FARDATA png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
+const int FARDATA png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
 
 /* start of interlace block in the y direction */
-int FARDATA png_pass_ystart[] = {0, 0, 4, 0, 2, 0, 1};
+const int FARDATA png_pass_ystart[] = {0, 0, 4, 0, 2, 0, 1};
 
 /* offset to next interlace block in the y direction */
-int FARDATA png_pass_yinc[] = {8, 8, 8, 4, 4, 2, 2};
+const int FARDATA png_pass_yinc[] = {8, 8, 8, 4, 4, 2, 2};
 
 /* width of interlace block (used in assembler routines only) */
 #ifdef PNG_HAVE_ASSEMBLER_COMBINE_ROW
-int FARDATA png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
+const int FARDATA png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
 #endif
 
 /* Height of interlace block.  This is not currently used - if you need
  * it, uncomment it here and in png.h
-int FARDATA png_pass_height[] = {8, 8, 4, 4, 2, 2, 1};
+const int FARDATA png_pass_height[] = {8, 8, 4, 4, 2, 2, 1};
 */
 
 /* Mask to determine which pixels are valid in a pass */
-int FARDATA png_pass_mask[] = {0x80, 0x08, 0x88, 0x22, 0xaa, 0x55, 0xff};
+const int FARDATA png_pass_mask[] = {0x80, 0x08, 0x88, 0x22, 0xaa, 0x55, 0xff};
 
 /* Mask to determine which pixels to overwrite while displaying */
-int FARDATA png_pass_dsp_mask[] = {0xff, 0x0f, 0xff, 0x33, 0xff, 0x55, 0xff};
+const int FARDATA png_pass_dsp_mask[]
+   = {0xff, 0x0f, 0xff, 0x33, 0xff, 0x55, 0xff};
 
 #endif
 
@@ -625,7 +626,7 @@ png_charp PNGAPI
 png_get_copyright(png_structp png_ptr)
 {
    if (png_ptr != NULL || png_ptr == NULL)  /* silence compiler warning */
-   return ((png_charp) "\n libpng version 1.0.8 - July 24, 2000\n\
+   return ((png_charp) "\n libpng version 1.0.9beta2 - November 19, 2000\n\
    Copyright (c) 1998-2000 Glenn Randers-Pehrson\n\
    Copyright (c) 1996, 1997 Andreas Dilger\n\
    Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.\n");
@@ -643,8 +644,8 @@ png_get_libpng_ver(png_structp png_ptr)
 {
    /* Version of *.c files used when building libpng */
    if(png_ptr != NULL) /* silence compiler warning about unused png_ptr */
-      return((png_charp) "1.0.8");
-   return((png_charp) "1.0.8");
+      return((png_charp) "1.0.9beta2");
+   return((png_charp) "1.0.9beta2");
 }
 
 png_charp PNGAPI
@@ -689,9 +690,10 @@ png_reset_zstream(png_structp png_ptr)
    return (inflateReset(&png_ptr->zstream));
 }
 
+/* This function was added to libpng-1.0.7 */
 png_uint_32 PNGAPI
 png_access_version_number(void)
 {
    /* Version of *.c files used when building libpng */
-   return((png_uint_32) 10008L);
+   return((png_uint_32) 10009L);
 }
diff --git a/png.h b/png.h
index 0d8e0dc1b..ca4c8a945 100644
--- a/png.h
+++ b/png.h
@@ -1,7 +1,7 @@
 
 /* png.h - header file for PNG reference library
  *
- * libpng version 1.0.8 - July 24, 2000
+ * libpng version 1.0.9beta2 - November 19, 2000
  * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
  * (Version 0.88 Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.)
@@ -9,7 +9,7 @@
  * Authors and maintainers:
  *  libpng versions 0.71, May 1995, through 0.88, January 1996: Guy Schalnat
  *  libpng versions 0.89c, June 1996, through 0.96, May 1997: Andreas Dilger
- *  libpng versions 0.97, January 1998, through 1.0.8 - July 24, 2000: Glenn
+ *  libpng versions 0.97, January 1998, through 1.0.9beta2 - November 19, 2000: Glenn
  *  See also "Contributing Authors", below.
  *
  * Note about libpng version numbers:
@@ -58,6 +58,7 @@
  *    1.0.8beta1-4                  10008  2.1.0.8beta1-4
  *    1.0.8rc1                      10008  2.1.0.8rc1
  *    1.0.8                         10008  2.1.0.8
+ *    1.0.9beta1-2                  10009  2.1.0.9beta1-2
  *
  *    Henceforth the source version will match the shared-library major
  *    and minor numbers; the shared-library major version number will be
@@ -84,7 +85,7 @@
  * If you modify libpng you may insert additional notices immediately following
  * this sentence.
  *
- * libpng versions 1.0.7, July 1, 2000, through  1.0.8, July 24, 2000, are
+ * libpng versions 1.0.7, July 1, 2000, through  1.0.9beta2, November 19, 2000, are
  * Copyright (c) 2000 Glenn Randers-Pehrson, and are
  * distributed according to the same disclaimer and license as libpng-1.0.6
  * with the following individuals added to the list of Contributing Authors
@@ -189,13 +190,13 @@
  * Y2K compliance in libpng:
  * =========================
  *
- *    July 24, 2000
+ *    November 19, 2000
  *
  *    Since the PNG Development group is an ad-hoc body, we can't make
  *    an official declaration.
  *
  *    This is your unofficial assurance that libpng from version 0.71 and
- *    upward through 1.0.8 are Y2K compliant.  It is my belief that earlier
+ *    upward through 1.0.9beta2 are Y2K compliant.  It is my belief that earlier
  *    versions were also Y2K compliant.
  *
  *    Libpng only has three year fields.  One is a 2-byte unsigned integer
@@ -251,26 +252,25 @@
  */
 
 /* Version information for png.h - this should match the version in png.c */
-#define PNG_LIBPNG_VER_STRING "1.0.8"
+#define PNG_LIBPNG_VER_STRING "1.0.9beta2"
 
 #define PNG_LIBPNG_VER_SONUM   2
 
 /* These should match the first 3 components of PNG_LIBPNG_VER_STRING: */
 #define PNG_LIBPNG_VER_MAJOR   1
 #define PNG_LIBPNG_VER_MINOR   0
-#define PNG_LIBPNG_VER_RELEASE 8
+#define PNG_LIBPNG_VER_RELEASE 9
 /* This should match the numeric part of the final component of
  * PNG_LIBPNG_VER_STRING, omitting any leading zero: */
-#define PNG_LIBPNG_VER_BUILD   0
+#define PNG_LIBPNG_VER_BUILD  2
+#define PNG_LIBPNG_BUILD_TYPE beta /* alpha, beta, rc, stable, patched */
 
 /* Careful here.  At one time, Guy wanted to use 082, but that would be octal.
  * We must not include leading zeros.
  * Versions 0.7 through 1.0.0 were in the range 0 to 100 here (only
  * version 1.0.0 was mis-numbered 100 instead of 10000).  From
  * version 1.0.1 it's    xxyyzz, where x=major, y=minor, z=release */
-#define PNG_LIBPNG_VER 10008 /* 1.0.8 */
-
-/* Note to maintainer: update this number in scripts/pngdef.pas as well */
+#define PNG_LIBPNG_VER 10009 /* 1.0.9 */
 
 #ifndef PNG_VERSION_INFO_ONLY
 
@@ -302,8 +302,8 @@ extern "C" {
  * the version above.
  */
 #ifdef PNG_USE_GLOBAL_ARRAYS
-PNG_EXPORT_VAR (char) png_libpng_ver[12]; /* need room for 99.99.99-patch-aa0*/
-  /* Note to maintainer: increase to 18 at the next opportunity */
+PNG_EXPORT_VAR (const char) png_libpng_ver[18];
+  /* need room for 99.99.99beta99z*/
 #else
 #define png_libpng_ver png_get_header_ver(NULL)
 #endif
@@ -311,17 +311,17 @@ PNG_EXPORT_VAR (char) png_libpng_ver[12]; /* need room for 99.99.99-patch-aa0*/
 #ifdef PNG_USE_GLOBAL_ARRAYS
 /* This was removed in version 1.0.5c */
 /* Structures to facilitate easy interlacing.  See png.c for more details */
-PNG_EXPORT_VAR (int FARDATA) png_pass_start[7];
-PNG_EXPORT_VAR (int FARDATA) png_pass_inc[7];
-PNG_EXPORT_VAR (int FARDATA) png_pass_ystart[7];
-PNG_EXPORT_VAR (int FARDATA) png_pass_yinc[7];
-PNG_EXPORT_VAR (int FARDATA) png_pass_mask[7];
-PNG_EXPORT_VAR (int FARDATA) png_pass_dsp_mask[7];
+PNG_EXPORT_VAR (const int FARDATA) png_pass_start[7];
+PNG_EXPORT_VAR (const int FARDATA) png_pass_inc[7];
+PNG_EXPORT_VAR (const int FARDATA) png_pass_ystart[7];
+PNG_EXPORT_VAR (const int FARDATA) png_pass_yinc[7];
+PNG_EXPORT_VAR (const int FARDATA) png_pass_mask[7];
+PNG_EXPORT_VAR (const int FARDATA) png_pass_dsp_mask[7];
 #ifdef PNG_HAVE_ASSEMBLER_COMBINE_ROW
-extern int FARDATA png_pass_width[7];   /* now used in pngvcrd.c, pnggccrd.c */
+PNG_EXPORT_VAR (const int FARDATA) png_pass_width[7];
 #endif
 /* This isn't currently used.  If you need it, see png.c for more details.
-extern int FARDATA png_pass_height[7];
+PNG_EXPORT_VAR (const int FARDATA) png_pass_height[7];
 */
 #endif
 
@@ -523,6 +523,7 @@ typedef struct png_info_struct
    png_uint_16 num_trans;   /* number of transparent palette color (tRNS) */
    png_byte bit_depth;      /* 1, 2, 4, 8, or 16 bits/channel (from IHDR) */
    png_byte color_type;     /* see PNG_COLOR_TYPE_ below (from IHDR) */
+   /* The following three should have been named *_method not *_type */
    png_byte compression_type; /* must be PNG_COMPRESSION_TYPE_BASE (IHDR) */
    png_byte filter_type;    /* must be PNG_FILTER_TYPE_BASE (from IHDR) */
    png_byte interlace_type; /* One of PNG_INTERLACE_NONE, PNG_INTERLACE_ADAM7 */
@@ -764,6 +765,9 @@ typedef png_info FAR * FAR * png_infopp;
 #define PNG_COLOR_TYPE_RGB        (PNG_COLOR_MASK_COLOR)
 #define PNG_COLOR_TYPE_RGB_ALPHA  (PNG_COLOR_MASK_COLOR | PNG_COLOR_MASK_ALPHA)
 #define PNG_COLOR_TYPE_GRAY_ALPHA (PNG_COLOR_MASK_ALPHA)
+/* aliases */
+#define PNG_COLOR_TYPE_RGBA  PNG_COLOR_TYPE_RGB_ALPHA
+#define PNG_COLOR_TYPE_GA  PNG_COLOR_TYPE_GRAY_ALPHA
 
 /* This is for compression type. PNG 1.0-1.2 only define the single type. */
 #define PNG_COMPRESSION_TYPE_BASE 0 /* Deflate method 8, 32K window */
@@ -904,6 +908,10 @@ typedef void (PNGAPI *png_unknown_chunk_ptr) PNGARG((png_structp));
 #define PNG_TRANSFORM_INVERT_ALPHA   0x0400    /* read and write */
 #define PNG_TRANSFORM_STRIP_FILLER   0x0800    /* WRITE only */
 
+/* Flags for MNG supported features */
+#define PNG_FLAG_MNG_EMPTY_PLTE 0x01
+#define PNG_ALL_MNG_FEATURES    0x01
+
 typedef png_voidp (*png_malloc_ptr) PNGARG((png_structp, png_size_t));
 typedef void (*png_free_ptr) PNGARG((png_structp, png_voidp));
 
@@ -1137,9 +1145,11 @@ struct png_struct_def
    png_uint_16 rgb_to_gray_blue_coeff;
 #endif
 
-#if defined(PNG_READ_EMPTY_PLTE_SUPPORTED) || \
+#if defined(PNG_MNG_FEATURES_SUPPORTED) || \
+    defined(PNG_READ_EMPTY_PLTE_SUPPORTED) || \
     defined(PNG_WRITE_EMPTY_PLTE_SUPPORTED)
-   png_byte empty_plte_permitted;
+/* Note to maintainer: change this to png_uint_32 at next opportunity */
+   png_byte mng_features_permitted;
 #endif
 
 #if defined(PNG_READ_GAMMA_SUPPORTED) || defined(PNG_READ_BACKGROUND_SUPPORTED)
@@ -1149,9 +1159,9 @@ struct png_struct_def
 };
 
 /* This prevents a compiler error in png_get_copyright() in png.c if png.c
-and png.h are both at * version 1.0.8
+and png.h are both at version 1.0.9beta2
  */
-typedef png_structp version_1_0_8;
+typedef png_structp version_1_0_9beta2;
 
 typedef png_struct FAR * FAR * png_structpp;
 
@@ -1387,6 +1397,7 @@ extern PNG_EXPORT(void,png_set_gamma) PNGARG((png_structp png_ptr,
 #if defined(PNG_READ_EMPTY_PLTE_SUPPORTED) || \
     defined(PNG_WRITE_EMPTY_PLTE_SUPPORTED)
 /* Permit or disallow empty PLTE (0: not permitted, 1: permitted) */
+/* Deprecated and will be removed.  Use png_permit_mng_features() instead. */
 extern PNG_EXPORT(void,png_permit_empty_plte) PNGARG((png_structp png_ptr,
    int empty_plte_permitted));
 #endif
@@ -1928,12 +1939,13 @@ extern PNG_EXPORT(void,png_set_hIST) PNGARG((png_structp png_ptr,
 
 extern PNG_EXPORT(png_uint_32,png_get_IHDR) PNGARG((png_structp png_ptr,
    png_infop info_ptr, png_uint_32 *width, png_uint_32 *height,
-   int *bit_depth, int *color_type, int *interlace_type,
-   int *compression_type, int *filter_type));
+   int *bit_depth, int *color_type, int *interlace_method,
+   int *compression_method, int *filter_method));
 
 extern PNG_EXPORT(void,png_set_IHDR) PNGARG((png_structp png_ptr,
    png_infop info_ptr, png_uint_32 width, png_uint_32 height, int bit_depth,
-   int color_type, int interlace_type, int compression_type, int filter_type));
+   int color_type, int interlace_method, int compression_method,
+   int filter_method));
 
 #if defined(PNG_READ_oFFs_SUPPORTED)
 extern PNG_EXPORT(png_uint_32,png_get_oFFs) PNGARG((png_structp png_ptr,
@@ -2117,11 +2129,11 @@ extern PNG_EXPORT(void, png_set_invalid) PNGARG((png_structp png_ptr,
 extern PNG_EXPORT(void, png_read_png) PNGARG((png_structp png_ptr,
                         png_infop info_ptr,
                         int transforms,
-                        voidp params));
+                        png_voidp params));
 extern PNG_EXPORT(void, png_write_png) PNGARG((png_structp png_ptr,
                         png_infop info_ptr,
                         int transforms,
-                        voidp params));
+                        png_voidp params));
 #endif
 
 /* Define PNG_DEBUG at compile time for debugging information.  Higher
@@ -2182,8 +2194,13 @@ extern PNG_EXPORT(png_charp,png_get_header_ver) PNGARG((png_structp png_ptr));
 extern PNG_EXPORT(png_charp,png_get_header_version) PNGARG((png_structp png_ptr));
 extern PNG_EXPORT(png_charp,png_get_libpng_ver) PNGARG((png_structp png_ptr));
 
+#ifdef PNG_MNG_FEATURES_SUPPORTED
+extern PNG_EXPORT(png_uint_32,png_permit_mng_features) PNGARG((png_structp
+   png_ptr, png_uint_32 mng_features_permitted));
+#endif
+
 #define PNG_HEADER_VERSION_STRING \
-   " libpng version 1.0.8 - July 24, 2000 (header)\n"
+   " libpng version 1.0.9beta2 - November 19, 2000 (header)\n"
 
 #ifdef PNG_READ_COMPOSITE_NODIV_SUPPORTED
 /* With these routines we avoid an integer divide, which will be slower on
@@ -2330,7 +2347,7 @@ extern PNG_EXPORT(png_charp,png_get_libpng_ver) PNGARG((png_structp png_ptr));
 #if !defined(PNG_NO_EXTERN) || defined(PNG_ALWAYS_EXTERN)
 /* place to hold the signature string for a PNG file. */
 #ifdef PNG_USE_GLOBAL_ARRAYS
-   PNG_EXPORT_VAR (png_byte FARDATA) png_sig[8];
+   PNG_EXPORT_VAR (const png_byte FARDATA) png_sig[8];
 #else
 #define png_sig png_sig_bytes(NULL)
 #endif
@@ -2516,8 +2533,8 @@ PNG_EXTERN void png_write_sig PNGARG((png_structp png_ptr));
  */
 PNG_EXTERN void png_write_IHDR PNGARG((png_structp png_ptr, png_uint_32 width,
    png_uint_32 height,
-   int bit_depth, int color_type, int compression_type, int filter_type,
-   int interlace_type));
+   int bit_depth, int color_type, int compression_method, int filter_method,
+   int interlace_method));
 
 PNG_EXTERN void png_write_PLTE PNGARG((png_structp png_ptr, png_colorp palette,
    png_uint_32 num_pal));
diff --git a/pngasmrd.h b/pngasmrd.h
index 35fe3b242..0293920bb 100644
--- a/pngasmrd.h
+++ b/pngasmrd.h
@@ -1,6 +1,6 @@
 /* pngasmrd.h - assembler version of utilities to read a PNG file
  *
- * libpng 1.0.8 - July 24, 2000
+ * libpng 1.0.9beta2 - November 19, 2000
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1999, 2000 Glenn Randers-Pehrson
  *
diff --git a/pngconf.h b/pngconf.h
index 4769cdf5f..9b64657f2 100644
--- a/pngconf.h
+++ b/pngconf.h
@@ -1,6 +1,6 @@
 /* pngconf.h - machine configurable file for libpng
  *
- * libpng 1.0.8 - July 24, 2000
+ * libpng 1.0.9beta2 - November 19, 2000
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
@@ -400,6 +400,7 @@
 #define PNG_NO_WRITE_USER_TRANSFORM
 #define PNG_NO_USER_MEM
 #define PNG_NO_READ_EMPTY_PLTE
+#define PNG_NO_MNG_FEATURES
 #define PNG_NO_FIXED_POINT_SUPPORTED
 #endif
 
@@ -492,8 +493,12 @@
 #endif
 #endif
 
+#ifndef PNG_NO_MNG_FEATURES
+#define PNG_MNG_FEATURES_SUPPORTED  /* Useful for MNG applications */
+#endif
+/* Deprecated, will be removed */
 #ifndef PNG_NO_READ_EMPTY_PLTE
-#define PNG_READ_EMPTY_PLTE_SUPPORTED  /* useful for MNG applications */
+#define PNG_READ_EMPTY_PLTE_SUPPORTED
 #endif
 
 #ifdef PNG_WRITE_TRANSFORMS_SUPPORTED
@@ -549,8 +554,9 @@ defined(PNG_WRITE_USER_TRANSFORM_SUPPORTED)
 #define PNG_WRITE_FLUSH_SUPPORTED
 #endif
 
+/* Deprecated, see PNG_MNG_FEATURES_SUPPORTED, above */
 #ifndef PNG_NO_WRITE_EMPTY_PLTE
-#define PNG_WRITE_EMPTY_PLTE_SUPPORTED  /* useful for MNG applications */
+#define PNG_WRITE_EMPTY_PLTE_SUPPORTED
 #endif
 
 #ifndef PNG_NO_STDIO
@@ -1134,6 +1140,13 @@ typedef z_stream FAR *  png_zstreamp;
 #endif
 #endif
 
+#ifndef PNGAPI
+#  define PNGAPI
+#endif
+#ifndef PNG_IMPEXP
+#  define PNG_IMPEXP
+#endif
+
 #ifndef PNG_EXPORT
 #  define PNG_EXPORT(type,symbol) PNG_IMPEXP type PNGAPI symbol
 #endif
diff --git a/pngcrush.c b/pngcrush.c
index 12f8b1f00..5272c7e3a 100644
--- a/pngcrush.c
+++ b/pngcrush.c
@@ -1,9 +1,12 @@
 /* pngcrush.c - recompresses png files
  * Copyright (C) 1998, 1999, 2000 Glenn Randers-Pehrson (randeg@alum.rpi.edu)
  *
+ * The most recent version of pngcrush can be found at
+ * http://pmt.sourceforge.net/pngcrush/
+ *
  * This program reads in a PNG image, and writes it out again, with the
- * optimum filter_type and zlib_level.  It uses brute force (trying
- * filter_type none, and libpng adaptive filtering, with compression
+ * optimum filter_method and zlib_level.  It uses brute force (trying
+ * filter_method none, and libpng adaptive filtering, with compression
  * levels 3 and 9).  It does the most time-consuming method last in case
  * it turns out to be the best.
  *
@@ -14,9 +17,13 @@
  *
  * Thanks to Greg Roelofs for various bug fixes, suggestions, and
  * occasionally creating Linux executables.
+ *
+ * Thanks to Stephan Levavej for some helpful suggestions about gcc compiler
+ * options and for a suggestion to increase the Z_MEM_LEVEL from default.
+ *
  */
 
-#define PNGCRUSH_VERSION "1.5.1"
+#define PNGCRUSH_VERSION "1.5.2"
 
 /*
 */
@@ -57,6 +64,26 @@
  */
 
 /* Change log:
+ *
+ * Version 1.5.2 (built with libpng-1.0.9beta1)
+ *
+ *   Added "-iccp" option.
+ *
+ *   Increased the zlib memory level, which improves compression (typically
+ *   about 1.3 percent for photos) at the expense of increased memory usage.
+ *
+ *   Enabled the "-max max_idat_size" option, even when max_idat_size
+ *   exceeds the default 1/2 megabyte size.
+ *
+ *   Added missing "png_ptr" argument to png_error() call
+ *
+ *   Revised the "-help" output slightly and improved the "-version" output.
+ *
+ *   The "-already[_crushed]" option is now ignored if the "-force" option
+ *   is present or if chunks are being added, deleted, or modified.
+ *
+ *   Improved "things_have_changed" behavior (now, when set in a particular
+ *   file, it is not set for all remaining files)
  *
  * Version 1.5.1 (built with libpng-1.0.8)
  *
@@ -272,26 +299,31 @@
 
 /* To do:
  *
- * Version 1.4.*: check for unused alpha channel and ok-to-reduce-depth.
- *   Rearrange palette to put most-used color first and transparent color
- *   second (see ImageMagick 5.1.1 and later).
- *   Finish pplt (partial palette) feature.
+ *   Check for unused alpha channel and ok-to-reduce-depth.
  *   Take care that sBIT and bKGD data aren't lost when reducing images
  *   from truecolor to grayscale.
  *
- * Version 1.4.*: Use an alternate write function for the trial passes, that
+ *   Rearrange palette to put most-used color first and transparent color
+ *   second (see ImageMagick 5.1.1 and later).
+ *
+ *   Finish pplt (partial palette) feature.
+ *
+ *   Use an alternate write function for the trial passes, that
  *   simply counts bytes rather than actually writing to a file, to save wear
  *   and tear on disk drives.
  *
- * Version 1.4.*: Allow in-place file replacement or as a filter, as in
+ *   Allow in-place file replacement or as a filter, as in
  *    "pngcrush -overwrite file.png"
  *    "pngcreator | pngcrush > output.png"
  *
- * Version 1.4.*: Remove text-handling and color-handling features and put
+ *   Remove text-handling and color-handling features and put
  *   those in a separate program or programs, to avoid unnecessary
  *   recompressing.
  *
+ *   Move the Photoshop-fixing stuff into a separate program.
+ *
  *   add "-time" directive
+ *
  */
 
 #define PNG_INTERNAL
@@ -301,6 +333,11 @@
  * so they are ifdef'ed out in a special version of pngconf.h, which
  * includes pngcrush.h and is included by png.h */
 
+/* defined so I can write to a file on gui/windowing platforms */
+/*  #define STDERR stderr  */
+#define STDERR stdout   /* for DOS */
+
+
 #ifndef PNGCRUSH_LIBPNG_VER
 #  define PNGCRUSH_LIBPNG_VER PNG_LIBPNG_VER
 #endif
@@ -427,10 +464,6 @@ main()
 #  define TIME_T float
 #endif
 
-/* defined so I can write to a file on gui/windowing platforms */
-/*  #define STDERR stderr  */
-#define STDERR stdout   /* for DOS */
-
 /* input and output filenames */
 static PNG_CONST char *progname = "pngtest" DOT "png";
 static PNG_CONST char *inname = "pngtest" DOT "png";
@@ -438,6 +471,7 @@ static PNG_CONST char *outname = "pngout" DOT "png";
 static PNG_CONST char *directory_name = "pngcrush" DOT "bak";
 static PNG_CONST char *extension = "_C" DOT "png";
 
+
 static png_uint_32 width, height;
 static png_uint_32 measured_idat_length;
 static int pngcrush_must_exit=0;
@@ -465,6 +499,15 @@ char text_keyword[800];
 char text_lang[800];
 char text_lang_key[800];
 #endif
+#if (PNG_LIBPNG_VER < 10009)
+#undef PNG_iCCP_SUPPORTED
+#endif
+#ifdef PNG_iCCP_SUPPORTED
+int iccp_length = 0;
+char *iccp_text;
+char *iccp_file;
+char iccp_name[80];
+#endif
 int best;
 
 char buffer[256];
@@ -498,8 +541,10 @@ static int verbose=1;
 static int help=0;
 static int fix=0;
 static int things_have_changed=0;
+static int global_things_have_changed=0;
 static int default_compression_window=15;
 static int force_compression_window=0;
+static int compression_mem_level=9;
 static int final_method=0;
 static int brute_force=0;
 static int brute_force_level=0;
@@ -562,6 +607,7 @@ static png_infop write_end_info_ptr;
 static FILE *fpin, *fpout;
 png_uint_32 measure_idats(FILE *fpin);
 static int do_color_count;
+static int reduction_ok=0;
 #ifdef PNGCRUSH_COUNT_COLORS
 int count_colors(FILE *fpin);
 static int num_rgba, reduce_to_gray, it_is_opaque;
@@ -571,11 +617,15 @@ png_uint_32 png_measure_idat(png_structp png_ptr);
 # define MAX_METHODSP1 201
 # define DEFAULT_METHODS 10
 static png_uint_32 idat_length[MAX_METHODSP1];
-static int filter_method, zlib_level;
+static int filter_type, zlib_level;
 static png_bytep png_row_filters=NULL;
 static TIME_T t_start, t_stop, t_decode, t_encode, t_misc;
 
+#if (PNG_LIBPNG_VER >= 10000)
+static png_uint_32 max_idat_size = 524288;
+#else
 static png_uint_32 max_idat_size = PNG_ZBUF_SIZE;
+#endif
 static png_uint_32 crushed_idat_size = 0x3ffffffL;
 static int already_crushed = 0;
 int ia;
@@ -622,8 +672,8 @@ png_set_compression_buffer_size(png_structp png_ptr, png_uint_32 size)
        png_free(png_ptr, png_ptr->zbuf); png_ptr->zbuf=NULL;
     png_ptr->zbuf_size = (png_size_t)size;
     png_ptr->zbuf = (png_bytep)png_malloc(png_ptr, size);
-    if(png_ptr->zbuf)
-       png_error("Unable to malloc zbuf");
+    if(!png_ptr->zbuf)
+       png_error(png_ptr,"Unable to malloc zbuf");
 }
 
 #if defined(PNG_UNKNOWN_CHUNKS_SUPPORTED)
@@ -1096,7 +1146,7 @@ main(int argc, char *argv[])
    else if(!strncmp(argv[i],"-dou",4))
       {
          double_gamma++;
-         things_have_changed=1;
+         global_things_have_changed=1;
       }
 #endif
    else if(!strncmp(argv[i],"-d",2))
@@ -1114,7 +1164,7 @@ main(int argc, char *argv[])
          extension= argv[names++];
       }
    else if(!strncmp(argv[i],"-force",6))
-         things_have_changed=1;
+         global_things_have_changed=1;
    else if(!strncmp(argv[i],"-fix",4))
       fix++;
    else if(!strncmp(argv[i],"-f",2))
@@ -1231,12 +1281,46 @@ main(int argc, char *argv[])
          help++;
          verbose++;
       }
+   else if(!strncmp(argv[i],"-iccp",5))
+      {
+#ifdef PNG_iCCP_SUPPORTED
+         /* -iccp length "Profile Name" iccp_file: load profile to insert */
+         FILE *iccp_fn;
+         if(iccp_length)
+            free(iccp_text);
+         iccp_length=atoi(argv[++i]);
+         names+=3;
+         /* iccp_name is a fixed-size buffer: truncate, never overflow */
+         strncpy(iccp_name,argv[++i],sizeof(iccp_name)-1);
+         iccp_name[sizeof(iccp_name)-1]=(char)0x00;
+         iccp_file=argv[++i];
+         if ((iccp_fn = FOPEN(iccp_file, "rb")) == NULL)
+         {
+            fprintf(STDERR, "Could not find file: %s\n", iccp_file);
+            iccp_length=0;
+         }
+         else
+         {
+            iccp_text=(iccp_length > 0) ? malloc(iccp_length+1) : NULL;
+            /* keep only the bytes actually read; zero disables the chunk */
+            iccp_length=iccp_text ? (int)fread(iccp_text,1,iccp_length,iccp_fn) : 0;
+            if(iccp_text)
+               iccp_text[iccp_length]=(char)0x00;
+            if(iccp_text && iccp_length == 0)
+               free(iccp_text);
+            fclose(iccp_fn);
+         }
+#else
+         names+=3;
+         i+=3;
+         fprintf(STDERR, "libpng-1.0.9 or later is required to support iCCP.\n");
+#endif
+      }
    else if(!strncmp(argv[i],"-max",4))
       {
          names++;
          BUMP_I;
          max_idat_size = (png_uint_32)atoi(argv[i]);
-         if (max_idat_size > PNG_ZBUF_SIZE) max_idat_size=PNG_ZBUF_SIZE;
       }
    else if(!strncmp(argv[i],"-m",2))
       {
@@ -1269,7 +1353,7 @@ main(int argc, char *argv[])
          do_pplt++;
          BUMP_I;
          strcpy(pplt_string,argv[i]);
-         things_have_changed=1;
+         global_things_have_changed=1;
       }
    else if(!strncmp(argv[i],"-p",2))
       {
@@ -1277,6 +1361,10 @@ main(int argc, char *argv[])
       }
    else if(!strncmp(argv[i],"-q",2))
          verbose=0;
+   else if(!strncmp(argv[i],"-reduce",7))
+      {
+        reduction_ok++;
+      }
 #ifdef PNG_gAMA_SUPPORTED
    else if(!strncmp(argv[i],"-rep",4))
       {
@@ -1309,7 +1397,7 @@ main(int argc, char *argv[])
             force_specified_gamma=atof(argv[i]);
 #endif
          }
-         things_have_changed=1;
+         global_things_have_changed=1;
       }
 #endif
 #ifdef PNG_pHYs_SUPPORTED
@@ -1318,6 +1406,7 @@ main(int argc, char *argv[])
          names++;
          BUMP_I;
          resolution=atoi(argv[i]);
+         global_things_have_changed=1;
       }
 #endif
 #ifdef PNGCRUSH_MULTIPLE_ROWS
@@ -1448,11 +1537,14 @@ main(int argc, char *argv[])
 #endif
    else if(!strncmp(argv[i],"-version",8))
       {
-         fprintf(STDERR,"libpng ");
+         fprintf(STDERR, " pngcrush ");
+         fprintf(STDERR, PNGCRUSH_VERSION );
+         fprintf(STDERR,", uses libpng ");
          fprintf(STDERR, PNG_LIBPNG_VER_STRING );
-         fprintf(STDERR,", uses zlib ");
+         fprintf(STDERR," and zlib "); /* leading space: follows the libpng version */
          fprintf(STDERR, ZLIB_VERSION );
-         fprintf(STDERR,"\n");
+         fprintf(STDERR, "\n Check http://pmt.sourceforge.net\n");
+         fprintf(STDERR, " for the most recent version.\n");
       }
    else if(!strncmp(argv[i],"-v",2))
       {
@@ -1464,6 +1556,11 @@ main(int argc, char *argv[])
          force_compression_window++;
          names++;
       }
+   else if(!strncmp(argv[i],"-zm",3))
+      {
+         compression_mem_level=atoi(argv[++i]);
+         names++;
+      }
    else if(!strncmp(argv[i],"-z",2))
       {
          int lev, strat, filt;
@@ -1593,7 +1690,12 @@ main(int argc, char *argv[])
      if(verbose > 1)
      {
       png_crush_pause();
-        fprintf(STDERR, "\noptions:\n");
+        fprintf(STDERR,
+      "\noptions (Note: any option can be spelled out for clarity, e.g.,\n");
+        fprintf(STDERR,
+      "          \"pngcrush -dir New -method 7 -remove bkgd *.png\"\n");
+        fprintf(STDERR,
+      "          is the same as \"pngcrush -d New -m 7 -rem bkgd *.png\"):\n\n");
      }
      else
         fprintf(STDERR, "options:\n");
@@ -1604,7 +1706,11 @@ main(int argc, char *argv[])
      fprintf(STDERR,
        "\n               If file has an IDAT greater than this size, it\n");
      fprintf(STDERR,
-       "               will be considered to be already crushed.\n\n");
+       "               will be considered to be already crushed and will\n");
+     fprintf(STDERR,
+       "               not be processed, unless you are making other changes\n");
+     fprintf(STDERR,
+       "               or the \"-force\" option is present.\n\n");
      }
      fprintf(STDERR,
        "        -brute (Use brute-force, try 114 different methods [11-124])\n");
@@ -1722,6 +1828,16 @@ main(int argc, char *argv[])
      fprintf(STDERR,
        "               gAMA chunk, use the '-replace_gamma' option.\n\n");
      png_crush_pause();
+#ifdef PNG_iCCP_SUPPORTED
+     fprintf(STDERR,
+       "         -iccp length \"Profile Name\" iccp_file\n");
+     if(verbose > 1)
+     {
+     fprintf(STDERR,
+       "\n               file with ICC profile to insert in an iCCP chunk.");
+     fprintf(STDERR, "\n\n");
+     }
+#endif
 #ifdef PNG_iTXt_SUPPORTED
      fprintf(STDERR,
        "         -itxt b[efore_IDAT]|a[fter_IDAT] \"keyword\" \"text\"\n");
@@ -1763,7 +1879,7 @@ main(int argc, char *argv[])
      }
 
      fprintf(STDERR,
-       "          -max maximum_IDAT_size [1 through %d]\n",PNG_ZBUF_SIZE);
+       "          -max maximum_IDAT_size [default %d]\n",PNG_ZBUF_SIZE);
      if(verbose > 1)
         fprintf(STDERR,"\n");
 #if 0
@@ -1791,13 +1907,18 @@ main(int argc, char *argv[])
      fprintf(STDERR,
        "\n               Truncates the PLTE.  Be sure not to truncate it to\n");
      fprintf(STDERR,
-       "\n               less than the greatest index present in IDAT.\n\n");
+       "               less than the greatest index present in IDAT.\n\n");
  
      }
      fprintf(STDERR,
        "            -q (quiet)\n");
      if(verbose > 1)
         fprintf(STDERR,"\n");
+     fprintf(STDERR,
+       "       -reduce (do lossless color type or bit depth reduction)\n");
+     if(verbose > 1)
+     fprintf(STDERR,
+       "\n          (if possible)\n\n");
      fprintf(STDERR,
        "          -rem chunkname (or \"alla\" or \"allb\")\n");
      if(verbose > 1)
@@ -1890,7 +2011,6 @@ main(int argc, char *argv[])
        "               color type, scaled to the output bit depth.\n\n");
      }
 #endif
-
      fprintf(STDERR,
        "            -v (display more detailed information)\n");
      if(verbose > 1)
@@ -1899,7 +2019,12 @@ main(int argc, char *argv[])
      fprintf(STDERR,
        "      -version (display the pngcrush version)\n");
      if(verbose > 1)
-        fprintf(STDERR,"\n");
+     {
+     fprintf(STDERR,
+       "\n               Look for the most recent version of pngcrush at\n");
+     fprintf(STDERR,
+       "               http://pmt.sourceforge.net\n\n");
+     }
      fprintf(STDERR,
        "            -w compression_window_size [32, 16, 8, 4, 2, 1, 512]\n");
      if(verbose > 1)
@@ -1923,6 +2048,10 @@ main(int argc, char *argv[])
      fprintf(STDERR,
        "               '-m method' argument.\n\n");
      }
+     fprintf(STDERR,
+       "         -zmem zlib_compression_mem_level [1-9, default 9]\n");
+     if(verbose > 1)
+        fprintf(STDERR,"\n");
 #ifdef PNG_iTXt_SUPPORTED
      fprintf(STDERR,
        "        -zitxt b[efore_IDAT]|a[fter_IDAT] \"keyword\" \"text\"\n");
@@ -2013,6 +2142,8 @@ main(int argc, char *argv[])
   {
       first_trial = 1;
 
+      things_have_changed=global_things_have_changed;
+
       if(png_row_filters != NULL)
       {
          free(png_row_filters); png_row_filters=NULL;
@@ -2121,7 +2252,7 @@ main(int argc, char *argv[])
          if(already_crushed)
          {
             fprintf(STDERR, "File has already been crushed: %s\n", inname);
-            continue;
+            if(!things_have_changed) continue;
          }
 
          if(verbose > 0)
@@ -2142,6 +2273,7 @@ main(int argc, char *argv[])
       if (do_color_count)
       {
       if (force_output_color_type == 8 && (input_color_type == 2 ||
+          (input_color_type == 3) ||
           input_color_type == 4 || input_color_type == 6))
       /* check for unused alpha channel or single transparent color */
       {
@@ -2298,7 +2430,7 @@ main(int argc, char *argv[])
          }
          else
          {
-             filter_method=fm[best];
+             filter_type=fm[best];
              zlib_level=lv[best];
              if(zs[best] == 0)z_strategy=Z_DEFAULT_STRATEGY;
              if(zs[best] == 1)z_strategy=Z_FILTERED;
@@ -2317,7 +2449,7 @@ main(int argc, char *argv[])
              if((trial == 6 || trial == 9 || trial == 10) && best_of_three != 3)
                 continue;
           }
-          filter_method=fm[trial];
+          filter_type=fm[trial];
           zlib_level=lv[trial];
           if(zs[trial] == 0)z_strategy=Z_DEFAULT_STRATEGY;
           if(zs[trial] == 1)z_strategy=Z_FILTERED;
@@ -2325,7 +2457,7 @@ main(int argc, char *argv[])
           final_method=trial;
           if(nosave == 0)
             P2("   Begin trial %d, filter %d, strategy %d, level %d\n",
-              trial, filter_method, z_strategy, zlib_level);
+              trial, filter_type, z_strategy, zlib_level);
       }
 
       P2("prepare to open files.\n");
@@ -2463,7 +2595,7 @@ main(int argc, char *argv[])
       }
       if(nosave == 0)
        {
-         if(png_get_compression_buffer_size(write_ptr) < max_idat_size)
+         if(png_get_compression_buffer_size(write_ptr) != max_idat_size)
          {
             P2("reinitializing write zbuf.\n");
             png_set_compression_buffer_size(write_ptr, max_idat_size);
@@ -2511,6 +2643,9 @@ main(int argc, char *argv[])
 
           png_set_keep_unknown_chunks(write_ptr, HANDLE_CHUNK_IF_SAFE,
             (png_bytep)NULL, 0);
+
+/* Process the following chunks as if safe-to-copy since it is known that
+   recompressing the IDAT chunks has no effect on them */
 #if !defined(PNG_cHRM_SUPPORTED)
           png_set_keep_unknown_chunks(write_ptr, HANDLE_CHUNK_ALWAYS, 
             (png_bytep)png_cHRM, 1);
@@ -2593,30 +2728,44 @@ main(int argc, char *argv[])
 #endif  /* PNG_WRITE_UNKNOWN_CHUNKS_SUPPORTED */
 
       png_debug(0, "Reading info struct\n");
+   {
+      png_byte png_signature[8] = {137, 80, 78, 71, 13, 10, 26, 10};
+
+      png_read_data(read_ptr, png_signature, 8);
+      png_set_sig_bytes(read_ptr, 8);
+
+      if (png_sig_cmp(png_signature, 0, 8))
+      {
+         if (png_sig_cmp(png_signature, 0, 4))
+            png_error(read_ptr, "Not a PNG file!");
+         else
+            png_error(read_ptr, "PNG file corrupted by ASCII conversion");
+      }
+   }
       png_read_info(read_ptr, read_info_ptr);
 
 #if (PNG_LIBPNG_VER > 90)
       png_debug(0, "Transferring info struct\n");
       {
-         int interlace_type, compression_type, filter_type;
+         int interlace_method, compression_method, filter_method;
 
          if (png_get_IHDR(read_ptr, read_info_ptr, &width, &height, &bit_depth,
-             &color_type, &interlace_type, &compression_type, &filter_type))
+             &color_type, &interlace_method, &compression_method, &filter_method))
          {
             int compression_window;
             int need_expand = 0;
-            int output_interlace_type=interlace_type;
+            int output_interlace_method=interlace_method;
             input_color_type=color_type;
             input_bit_depth=bit_depth;
             if(nointerlace)
-               output_interlace_type=0;
+               output_interlace_method=0;
             if(verbose > 1 && first_trial)
             {
                fprintf(STDERR, "   IHDR chunk data:\n");
                fprintf(STDERR, "      Width=%ld, height=%ld\n", width, height);
                fprintf(STDERR, "      Bit depth =%d\n", bit_depth);
                fprintf(STDERR, "      Color type=%d\n", color_type);
-               fprintf(STDERR, "      Interlace =%d\n", interlace_type);
+               fprintf(STDERR, "      Interlace =%d\n", interlace_method);
             }
 
             if(output_color_type > 7)
@@ -2720,6 +2869,7 @@ main(int argc, char *argv[])
                int channels=0;
 
                png_set_compression_strategy(write_ptr, z_strategy);
+               png_set_compression_mem_level(write_ptr, compression_mem_level);
 
                if (output_color_type == 0)channels=1;
                if (output_color_type == 2)channels=3;
@@ -2757,10 +2907,10 @@ main(int argc, char *argv[])
                 fprintf(STDERR, "   Setting IHDR\n");
 
             png_set_IHDR(write_ptr, write_info_ptr, width, height,
-              output_bit_depth, output_color_type, output_interlace_type,
-              compression_type, filter_type);
+              output_bit_depth, output_color_type, output_interlace_method,
+              compression_method, filter_method);
 
-            if(output_color_type != input_color_type) things_have_changed++;
+            if(output_color_type != input_color_type) things_have_changed=1;
          }
       }
 #if defined(PNG_READ_bKGD_SUPPORTED) && defined(PNG_WRITE_bKGD_SUPPORTED)
@@ -2953,15 +3103,26 @@ main(int argc, char *argv[])
       png_charp name;
       png_charp profile;
       png_uint_32 proflen;
-      int compression_type;
+      int compression_method;
 
-      if (png_get_iCCP(read_ptr, read_info_ptr, &name, &compression_type, 
+      if (png_get_iCCP(read_ptr, read_info_ptr, &name, &compression_method, 
                       &profile, &proflen))
       {
+         P1 ("Got iccp chunk, proflen=%lu\n",proflen);
          if(keep_chunk("iCCP",argv))
-            png_set_iCCP(write_ptr, write_info_ptr, name, compression_type, 
+            png_set_iCCP(write_ptr, write_info_ptr, name, compression_method, 
                       profile, proflen);
+         
       }
+#ifdef PNG_iCCP_SUPPORTED
+      else if (iccp_length)
+      {
+          png_set_iCCP(write_ptr, write_info_ptr, iccp_name, 0, 
+              iccp_text, iccp_length);
+         P1 ("Wrote iccp chunk, proflen=%d\n",iccp_length);
+      }
+#endif
+
    }
 #endif
 #if defined(PNG_READ_oFFs_SUPPORTED) && defined(PNG_WRITE_oFFs_SUPPORTED)
@@ -3014,9 +3175,17 @@ main(int argc, char *argv[])
          {
             if (png_get_pHYs(read_ptr, read_info_ptr, &res_x, &res_y,
                 &unit_type))
+            {
+            if(res_x == 0 && res_y == 0)
+            {
+               if(verbose > 0 && first_trial)
+                  fprintf(STDERR, "   Deleting useless pHYs 0 0 chunk\n");
+            }
+            else
             {
                if(keep_chunk("pHYs",argv))
-            png_set_pHYs(write_ptr, write_info_ptr, res_x, res_y, unit_type);
+               png_set_pHYs(write_ptr, write_info_ptr, res_x, res_y, unit_type);
+            }
             }
          }
          else
@@ -3366,12 +3535,12 @@ main(int argc, char *argv[])
       {
       png_set_compression_level(write_ptr, zlib_level);
 
-      if     (filter_method == 0)png_set_filter(write_ptr,0,PNG_FILTER_NONE);
-      else if(filter_method == 1)png_set_filter(write_ptr,0,PNG_FILTER_SUB);
-      else if(filter_method == 2)png_set_filter(write_ptr,0,PNG_FILTER_UP);
-      else if(filter_method == 3)png_set_filter(write_ptr,0,PNG_FILTER_AVG);
-      else if(filter_method == 4)png_set_filter(write_ptr,0,PNG_FILTER_PAETH);
-      else if(filter_method == 5)png_set_filter(write_ptr,0,PNG_ALL_FILTERS);
+      if     (filter_type == 0)png_set_filter(write_ptr,0,PNG_FILTER_NONE);
+      else if(filter_type == 1)png_set_filter(write_ptr,0,PNG_FILTER_SUB);
+      else if(filter_type == 2)png_set_filter(write_ptr,0,PNG_FILTER_UP);
+      else if(filter_type == 3)png_set_filter(write_ptr,0,PNG_FILTER_AVG);
+      else if(filter_type == 4)png_set_filter(write_ptr,0,PNG_FILTER_PAETH);
+      else if(filter_type == 5)png_set_filter(write_ptr,0,PNG_ALL_FILTERS);
       else                       png_set_filter(write_ptr,0,PNG_FILTER_NONE);
 
 
@@ -3464,7 +3633,7 @@ main(int argc, char *argv[])
 
       {
       /* check for sufficient memory: we need 2*zlib_window
-         and, if filter_method == 5, 4*rowbytes in separate allocations.
+         and, if filter_type == 5, 4*rowbytes in separate allocations.
          If it's not enough we can drop the "average" filter and
          we can reduce the zlib_window for writing.  We can't change
          the input zlib_window because the input file might have
@@ -3766,6 +3935,8 @@ main(int argc, char *argv[])
       if(nosave)
          break;
 
+      first_trial=0;
+
       if (nosave == 0)
       {
          png_debug(0, "Opening file for length measurement\n");
@@ -3789,11 +3960,10 @@ main(int argc, char *argv[])
          {
          fprintf(STDERR,
          "   IDAT length with method %d (fm %d zl %d zs %d)= %8lu\n",
-             trial,filter_method,zlib_level,z_strategy,idat_length[trial]);
+             trial,filter_type,zlib_level,z_strategy,idat_length[trial]);
          fflush(STDERR);
          }
 
-         first_trial=0;
       } /* end of trial-loop */
 
       if (fpin)
@@ -3847,6 +4017,10 @@ main(int argc, char *argv[])
             free(png_row_filters); png_row_filters=NULL;
          }
          if(verbose > 0) show_result();
+#ifdef PNG_iCCP_SUPPORTED
+         if(iccp_length)
+            free(iccp_text);
+#endif
          if(pngcrush_must_exit)
             exit(0);
          return(0);
@@ -3874,6 +4048,7 @@ measure_idats(FILE *fpin)
 #else
    png_set_read_fn(read_ptr, (png_voidp)fpin, png_default_read_data);
 #endif
+
    png_set_sig_bytes(read_ptr, 0);
    measured_idat_length=png_measure_idat(read_ptr);
    P2("measure_idats: IDAT length=%lu\n",measured_idat_length);
@@ -3898,7 +4073,6 @@ png_measure_idat(png_structp png_ptr)
    /* Copyright (C) 1999, 2000 Glenn Randers-Pehrson (randeg@alum.rpi.edu)
       See notice in pngcrush.c for conditions of use and distribution */
    png_uint_32 sum_idat_length=0;
-   png_debug(1, "in png_read_info\n");
 
    {
       png_byte png_signature[8] = {137, 80, 78, 71, 13, 10, 26, 10};
@@ -3909,7 +4083,7 @@ png_measure_idat(png_structp png_ptr)
       if (png_sig_cmp(png_signature, 0, 8))
       {
          if (png_sig_cmp(png_signature, 0, 4))
-            png_error(png_ptr, "Not a PNG file");
+            png_error(png_ptr, "Not a PNG file..");
          else
             png_error(png_ptr, "PNG file corrupted by ASCII conversion");
       }
@@ -3927,7 +4101,11 @@ png_measure_idat(png_structp png_ptr)
       PNG_IDAT;
       PNG_IEND;
       PNG_IHDR;
+#ifdef PNG_iCCP_SUPPORTED
       PNG_iCCP;
+#else
+      const png_byte png_iCCP[5]={105, 67, 67, 80, '\0'};
+#endif
 #endif
 #endif
       png_byte chunk_name[5];
@@ -3970,6 +4148,7 @@ png_measure_idat(png_structp png_ptr)
          input_color_type=buffer[9];
       }
 
+#ifdef PNG_iCCP_SUPPORTED
       /* check for bad photoshop iccp chunk */
 #ifdef PNG_UINT_IDAT
       if (png_get_uint_32(chunk_name) == PNG_UINT_iCCP)
@@ -3977,6 +4156,12 @@ png_measure_idat(png_structp png_ptr)
       if (!png_memcmp(chunk_name, png_iCCP, 4))
 #endif
       {
+        /* Check for bad Photoshop iCCP chunk.  Libpng will reject the
+         * bad chunk because the Adler-32 bytes are missing, but we check
+         * here to see if it's really the sRGB profile, and if so, set the
+         * "intent" flag and gamma so pngcrush will write an sRGB chunk
+         * and a gamma chunk.
+         */
          if (length == 2615)
          {
              png_crc_read(png_ptr, buffer, 22);
@@ -3997,6 +4182,7 @@ png_measure_idat(png_structp png_ptr)
              }
          }
       }
+#endif
 
       png_crc_finish(png_ptr, length);
 
@@ -4016,8 +4202,9 @@ count_colors(FILE *fpin)
 {
    /* Copyright (C) 2000 Glenn Randers-Pehrson (randeg@alum.rpi.edu)
       See notice in pngcrush.c for conditions of use and distribution */
-   int bit_depth, color_type, interlace_type, filter_type, compression_type;
-   png_uint_32 rowbytes, channels;
+   int bit_depth, color_type, interlace_method, filter_method, compression_method;
+   png_uint_32 rowbytes;
+   volatile png_uint_32 channels;
 
    int i;
    int pass, num_pass;
@@ -4025,6 +4212,7 @@ count_colors(FILE *fpin)
    volatile int result, hashmiss, hashinserts;
 
    png_uint_32 rgba_frequency[257];
+
    png_uint_32 rgba_hi[257]; /* Actually contains ARGB not RGBA */
 #if 0
    png_uint_32 rgba_lo[257]; /* Low bytes of ARGB in 16-bit PNGs */
@@ -4053,7 +4241,9 @@ count_colors(FILE *fpin)
 
    num_rgba=0;
    for (i=0; i<257; i++)
+   {
       rgba_frequency[i]=0;
+   }
 
    P2("Checking alphas:\n");
    png_debug(0, "Allocating read structure\n");
@@ -4075,10 +4265,12 @@ count_colors(FILE *fpin)
 
 #ifdef USE_HASHCODE
    int hash[16385];
+#endif
+
+#ifdef USE_HASHCODE
    for (i=0; i<16385; i++)
       hash[i]=-1;
 #endif
-
    end_info_ptr = NULL;
 
 #if !defined(PNG_NO_STDIO)
@@ -4087,6 +4279,20 @@ count_colors(FILE *fpin)
    png_set_read_fn(read_ptr, (png_voidp)fpin, png_default_read_data);
 #endif
 
+   {
+      png_byte png_signature[8] = {137, 80, 78, 71, 13, 10, 26, 10};
+
+      png_read_data(read_ptr, png_signature, 8);
+      png_set_sig_bytes(read_ptr, 8);
+
+      if (png_sig_cmp(png_signature, 0, 8))
+      {
+         if (png_sig_cmp(png_signature, 0, 4))
+            png_error(read_ptr, "Not a PNG file.");
+         else
+            png_error(read_ptr, "PNG file corrupted by ASCII conversion");
+      }
+   }
    png_read_info(read_ptr, read_info_ptr);
 
 #ifdef PNG_CRC_QUIET_USE
@@ -4094,7 +4300,7 @@ count_colors(FILE *fpin)
 #endif
 
    png_get_IHDR(read_ptr, read_info_ptr, &width, &height, &bit_depth,
-             &color_type, &interlace_type, &compression_type, &filter_type);
+             &color_type, &interlace_method, &compression_method, &filter_method);
 
    if (color_type == 2)
       channels = 3;
@@ -4106,11 +4312,11 @@ count_colors(FILE *fpin)
       channels=1;
 
    if(color_type == 0 || color_type == 3 || color_type == 4)
-      reduce_to_gray = 0;
+      reduce_to_gray = 1;
 
    if(bit_depth == 8)
    {
-      if(interlace_type)
+      if(interlace_method)
          num_pass=7;
       else
          num_pass = 1;
@@ -4126,7 +4332,7 @@ count_colors(FILE *fpin)
          png_uint_32 pass_height, pass_width, y;
          png_debug(0, "\nBegin Pass\n");
 
-         if (interlace_type)
+         if (interlace_method)
          {
             pass_height = (height - png_pass_ystart[pass]
                        + png_pass_yinc[pass] - 1) / png_pass_yinc[pass];
@@ -4143,7 +4349,8 @@ count_colors(FILE *fpin)
          {
             png_uint_32 x;
             png_read_row(read_ptr, row_buf, (png_bytep)NULL);
-            if(result < 2 || it_is_opaque || reduce_to_gray)
+            if(result < 2 || it_is_opaque || 
+                 reduce_to_gray)
             {
               if(color_type==2)
               {
@@ -4160,6 +4367,11 @@ count_colors(FILE *fpin)
                    if(reduce_to_gray &&
                      ((*(rp)) != (*(rp+1)) || (*(rp)) != (*(rp+2))))
                         reduce_to_gray=0;
+
+                   if (result > 1 || !it_is_opaque)
+                      continue;
+
+
 #ifdef USE_HASHCODE
                    /*
                     *      R      G      B     mask
@@ -4242,6 +4454,9 @@ count_colors(FILE *fpin)
                         reduce_to_gray=0;
                    if(it_is_opaque && (*(rp+3)) != 255)
                       it_is_opaque=0;
+
+                   if (result > 1)
+                      continue;
 #ifdef USE_HASHCODE
                    /*
                     *  A     R     G    B    mask
@@ -4384,8 +4599,6 @@ count_colors(FILE *fpin)
               }
               else /* other color type */
               {
-                  /* to do: check color type 3 for max sample that is present
-                     and reduce palette if possible */
                   result=2;
               }
             }
@@ -4436,14 +4649,30 @@ count_colors(FILE *fpin)
         P2 ("hashcode misses=%d, inserts=%d\n",hashmiss,
            hashinserts);
       }
-   if(reduce_to_gray)
-     P1 ("The truecolor image is all gray and will be reduced.\n");
    if(color_type == 0 || color_type == 2)
      it_is_opaque=0;
-   if(it_is_opaque)
-     P1 ("The image is opaque and the alpha channel will be removed.\n");
+   if(reduction_ok)
+   {
+     if(reduce_to_gray)
+        P1 ("The truecolor image is all gray and will be reduced.\n");
+     if(it_is_opaque)
+       P1 ("The image is opaque and the alpha channel will be removed.\n");
+   }
+   else
+   {
+     if(reduce_to_gray)
+        P1 ("The truecolor image is all gray and could be reduced.\n");
+     if(it_is_opaque)
+       P1 ("The image is opaque and the alpha channel could be removed.\n");
+     if (reduce_to_gray || it_is_opaque)
+       P1 ("Rerun pngcrush with the \"-reduce\" option to do so.\n");
+     reduce_to_gray = 0;
+     it_is_opaque = 0;
+
+   }
    P2 ("Finished checking alphas, result=%d\n",result);
    }
+
    ret=result;
    return (ret);
 }
diff --git a/pngcrush.h b/pngcrush.h
index 33b31e37e..845e52a57 100644
--- a/pngcrush.h
+++ b/pngcrush.h
@@ -26,6 +26,8 @@
 #  define PNG_USER_MEM_SUPPORTED
 #endif
 
+#define MNG_EXTENSIONS_SUPPORTED /* extra filter types */
+
 #ifndef PNG_NO_LEGACY_SUPPORTED
 #  define PNG_NO_LEGACY_SUPPORTED
 #endif
diff --git a/pngerror.c b/pngerror.c
index 3e4210f75..43bf597b3 100644
--- a/pngerror.c
+++ b/pngerror.c
@@ -1,7 +1,7 @@
 
 /* pngerror.c - stub functions for i/o and memory allocation
  *
- * libpng 1.0.8 - July 24, 2000
+ * libpng 1.0.9beta2 - November 19, 2000
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
diff --git a/pnggccrd.c b/pnggccrd.c
index ea4f972b5..d6e49b4bd 100644
--- a/pnggccrd.c
+++ b/pnggccrd.c
@@ -6,14 +6,14 @@
  *     and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
  *     for Intel's performance analysis of the MMX vs. non-MMX code.
  *
- * libpng version 1.0.8 - July 24, 2000
+ * libpng 1.0.9beta2 - November 19, 2000
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
  * Copyright (c) 1998, Intel Corporation
  *
  * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
  * Interface to libpng contributed by Gilles Vollant, 1999.
- * GNU C port by Greg Roelofs, 1999.
+ * GNU C port by Greg Roelofs, 1999-2000.
  *
  * Lines 2350-4300 converted in place with intel2gas 1.3.1:
  *
@@ -43,8 +43,8 @@
  */
 
 /*
- * NOTES (mostly by Greg Roelofs)
- * =====
+ * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
+ * =====================================
  *
  * 19991006:
  *  - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
@@ -55,13 +55,13 @@
  *     - write MMX code for 48-bit case (pixel_bytes == 6)
  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
  *        why subtract 8 from width_mmx in the pass 4/5 case?
- *        (only width_mmx case)
+ *        (only width_mmx case) (near line 1606)
  *     x [DONE] replace pixel_bytes within each block with the true
  *        constant value (or are compilers smart enough to do that?)
  *     - rewrite all MMX interlacing code so it's aligned with
  *        the *beginning* of the row buffer, not the end.  This
  *        would not only allow one to eliminate half of the memory
- *        writes for odd passes (i.e., pass == odd), it may also
+ *        writes for odd passes (that is, pass == odd), it may also
  *        eliminate some unaligned-data-access exceptions (assuming
  *        there's a penalty for not aligning 64-bit accesses on
  *        64-bit boundaries).  The only catch is that the "leftover"
@@ -113,7 +113,7 @@
  *
  * 19991107:
  *  - verified CPUID clobberage:  12-char string constant ("GenuineIntel",
- *     "AuthenticAMD", etc.) placed in EBX:ECX:EDX.  Still need to polish.
+ *     "AuthenticAMD", etc.) placed in ebx:ecx:edx.  Still need to polish.
  *
  * 19991120:
  *  - made "diff" variable (now "_dif") global to simplify conversion of
@@ -123,14 +123,14 @@
  *     macro determines which is used); original not yet tested.
  *
  * 20000213:
- *  - When compiling with gcc, be sure to use  -fomit-frame-pointer
+ *  - when compiling with gcc, be sure to use  -fomit-frame-pointer
  *
  * 20000319:
  *  - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
  *     pass == 4 or 5, that caused visible corruption of interlaced images
  *
  * 20000623:
- *  -  Various problems were reported with gcc 2.95.2 in the Cygwin environment,
+ *  - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
  *     many of the form "forbidden register 0 (ax) was spilled for class AREG."
  *     This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
  *     Chuck Wilson supplied a patch involving dummy output registers.  See
@@ -147,10 +147,78 @@
  *       pnggccrd.c:1177: more than 10 operands in `asm'
  *     They are all the same problem and can be worked around by using the
  *     global _unmask variable unconditionally, not just in the -fPIC case.
- *     Apparently earlier versions of gcc also have the problem with more than
+ *     Reportedly earlier versions of gcc also have the problem with more than
  *     10 operands; they just don't report it.  Much strangeness ensues, etc.
+ *
+ * 20000729:
+ *  - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
+ *     MMX routine); began converting png_read_filter_row_mmx_sub()
+ *  - to finish remaining sections:
+ *     - clean up indentation and comments
+ *     - preload local variables
+ *     - add output and input regs (order of former determines numerical
+ *        mapping of latter)
+ *     - avoid all usage of ebx (including bx, bh, bl) register [20000823]
+ *     - remove "$" from addressing of Shift and Mask variables [20000823]
+ *
+ * 20000731:
+ *  - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
+ *
+ * 20000822:
+ *  - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
+ *     shared-library (-fPIC) version!  Code works just fine as part of static
+ *     library.  Damn damn damn damn damn, should have tested that sooner.
+ *     ebx is getting clobbered again (explicitly this time); need to save it
+ *     on stack or rewrite asm code to avoid using it altogether.  Blargh!
+ *
+ * 20000823:
+ *  - first section was trickiest; all remaining sections have ebx -> edx now.
+ *     (-fPIC works again.)  Also added missing underscores to various Shift*
+ *     and *Mask* globals and got rid of leading "$" signs.
+ *
+ * 20000826:
+ *  - added visual separators to help navigate microscopic printed copies
+ *     (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
+ *     on png_read_filter_row_mmx_avg()
+ *
+ * 20000828:
+ *  - finished png_read_filter_row_mmx_avg():  only Paeth left! (930 lines...)
+ *     What the hell, did png_read_filter_row_mmx_paeth(), too.  Comments not
+ *     cleaned up/shortened in either routine, but functionality is complete
+ *     and seems to be working fine.
+ *
+ * 20000829:
+ *  - ahhh, figured out last(?) bit of gcc/gas asm-fu:  if register is listed
+ *     as an input reg (with dummy output variables, etc.), then it *cannot*
+ *     also appear in the clobber list or gcc 2.95.2 will barf.  The solution
+ *     is simple enough...
+ *
+ * 20000914:
+ *  - bug in png_read_filter_row_mmx_avg():  16-bit grayscale not handled
+ *     correctly (but 48-bit RGB just fine)
+ *
+ * 20000916:
+ *  - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:  
+ *     - "_ShiftBpp.use = 24;"      should have been   "_ShiftBpp.use = 16;"
+ *     - "_ShiftRem.use = 40;"      should have been   "_ShiftRem.use = 48;"
+ *     - "psllq _ShiftRem, %%mm2"   should have been   "psrlq _ShiftRem, %%mm2"
+ *
+ * STILL TO DO:
+ *     - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
+ *     - write MMX code for 48-bit case (pixel_bytes == 6)
+ *     - figure out what's up with 24-bit case (pixel_bytes == 3):
+ *        why subtract 8 from width_mmx in the pass 4/5 case?
+ *        (only width_mmx case) (near line 1606)
+ *     - rewrite all MMX interlacing code so it's aligned with beginning
+ *        of the row buffer, not the end (see 19991007 for details)
+ *     - pick one version of mmxsupport() and get rid of the other
+ *     - add error messages to any remaining bogus default cases
+ *     - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
+ *     - add support for runtime enable/disable/query of various MMX routines
  */
 
+//#define PNG_DEBUG 2   // GRR
+
 #define PNG_INTERNAL
 #include "png.h"
 
@@ -161,36 +229,46 @@ int mmxsupport(void);
 static int mmx_supported = 2;
 
 #ifdef PNG_USE_LOCAL_ARRAYS
-static const int png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
-static const int png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
-static const int png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
+static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
+static const int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
+static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
 #endif
 
 // djgpp, Win32, and Cygwin add their own underscores to global variables,
 // so define them without:
 #if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
-#  define _unmask      unmask
-#  define _const4      const4
-#  define _const6      const6
-#  define _mask8_0     mask8_0  
-#  define _mask16_1    mask16_1 
-#  define _mask16_0    mask16_0 
-#  define _mask24_2    mask24_2 
-#  define _mask24_1    mask24_1 
-#  define _mask24_0    mask24_0 
-#  define _mask32_3    mask32_3 
-#  define _mask32_2    mask32_2 
-#  define _mask32_1    mask32_1 
-#  define _mask32_0    mask32_0 
-#  define _mask48_5    mask48_5 
-#  define _mask48_4    mask48_4 
-#  define _mask48_3    mask48_3 
-#  define _mask48_2    mask48_2 
-#  define _mask48_1    mask48_1 
-#  define _mask48_0    mask48_0 
-#  define _FullLength  FullLength
-#  define _MMXLength   MMXLength
-#  define _dif         dif
+#  define _unmask         unmask
+#  define _const4         const4
+#  define _const6         const6
+#  define _mask8_0        mask8_0  
+#  define _mask16_1       mask16_1 
+#  define _mask16_0       mask16_0 
+#  define _mask24_2       mask24_2 
+#  define _mask24_1       mask24_1 
+#  define _mask24_0       mask24_0 
+#  define _mask32_3       mask32_3 
+#  define _mask32_2       mask32_2 
+#  define _mask32_1       mask32_1 
+#  define _mask32_0       mask32_0 
+#  define _mask48_5       mask48_5 
+#  define _mask48_4       mask48_4 
+#  define _mask48_3       mask48_3 
+#  define _mask48_2       mask48_2 
+#  define _mask48_1       mask48_1 
+#  define _mask48_0       mask48_0 
+#  define _FullLength     FullLength
+#  define _MMXLength      MMXLength
+#  define _dif            dif
+#  define _LBCarryMask    LBCarryMask
+#  define _HBClearMask    HBClearMask
+#  define _ActiveMask     ActiveMask
+#  define _ActiveMask2    ActiveMask2
+#  define _ActiveMaskEnd  ActiveMaskEnd
+#  define _ShiftBpp       ShiftBpp
+#  define _ShiftRem       ShiftRem
+#  define _patemp         patemp
+#  define _pbtemp         pbtemp
+#  define _pctemp         pctemp
 #endif
 
 /* These constants are used in the inlined MMX assembly code.
@@ -235,6 +313,9 @@ static unsigned long long _const6   = 0x00000000000000FFLL;
 static png_uint_32  _FullLength;
 static png_uint_32  _MMXLength;
 static int          _dif;
+static int          _patemp;	// temp variables for Paeth routine
+static int          _pbtemp;
+static int          _pctemp;
 
 
 void /* PRIVATE */
@@ -242,6 +323,14 @@ png_read_filter_row_c(png_structp png_ptr, png_row_infop row_info,
    png_bytep row, png_bytep prev_row, int filter);
 
 
+
+
+//===========================================================================//
+//                                                                           //
+//                       P N G _ C O M B I N E _ R O W                       //
+//                                                                           //
+//===========================================================================//
+
 #if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
 
 /* Combines the row recently read in with the previous row.
@@ -266,10 +355,6 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
    if (mmx_supported == 2)
        mmx_supported = mmxsupport();
 
-/*
-fprintf(stderr, "GRR DEBUG:  png_combine_row() pixel_depth = %d, mask = 0x%02x, unmask = 0x%02x\n", png_ptr->row_info.pixel_depth, mask, ~mask);
-fflush(stderr);
- */
    if (mask == 0xff)
    {
       png_memcpy(row, png_ptr->row_buf + 1,
@@ -533,9 +618,8 @@ fflush(stderr);
                     "2" (len),         // ecx
                     "1" (mask)         // edx
 
-//                  :          // clobber list
 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
-                  : "%mm0", "%mm4", "%mm6", "%mm7"
+                  : "%mm0", "%mm4", "%mm6", "%mm7"  // clobber list
 #endif
                );
             }
@@ -652,23 +736,22 @@ fflush(stderr);
                 "end16:                       \n\t"
                   "EMMS                       \n\t" // DONE
 
-                  : "=a" (dummy_value_a),              // output regs (dummy)
-                    "=d" (dummy_value_d),
+                  : "=a" (dummy_value_a),           // output regs (dummy)
                     "=c" (dummy_value_c),
+                    "=d" (dummy_value_d),
                     "=S" (dummy_value_S),
                     "=D" (dummy_value_D)
 
-                  : "3" (srcptr),      // esi       // input regs
-                    "4" (dstptr),      // edi
-                    "0" (diff),        // eax
-// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
-                    "2" (len),         // ecx
-                    "1" (mask)         // edx
+                  : "0" (diff),        // eax       // input regs
+// was (unmask)     " "    RESERVED    // ebx       // Global Offset Table idx
+                    "1" (len),         // ecx
+                    "2" (mask),        // edx
+                    "3" (srcptr),      // esi
+                    "4" (dstptr)       // edi
 
-//                  :          // clobber list
 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
-                  : "%mm0", "%mm1",
-                    "%mm4", "%mm5", "%mm6", "%mm7"
+                  : "%mm0", "%mm1", "%mm4"          // clobber list
+                  , "%mm5", "%mm6", "%mm7"
 #endif
                );
             }
@@ -800,7 +883,7 @@ fflush(stderr);
                 "end24:                       \n\t"
                   "EMMS                       \n\t" // DONE
 
-                  : "=a" (dummy_value_a),              // output regs (dummy)
+                  : "=a" (dummy_value_a),           // output regs (dummy)
                     "=d" (dummy_value_d),
                     "=c" (dummy_value_c),
                     "=S" (dummy_value_S),
@@ -813,10 +896,9 @@ fflush(stderr);
                     "2" (len),         // ecx
                     "1" (mask)         // edx
 
-//                  :          // clobber list
 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
-                  : "%mm0", "%mm1", "%mm2",
-                    "%mm4", "%mm5", "%mm6", "%mm7"
+                  : "%mm0", "%mm1", "%mm2"          // clobber list
+                  , "%mm4", "%mm5", "%mm6", "%mm7"
 #endif
                );
             }
@@ -955,7 +1037,7 @@ fflush(stderr);
                 "end32:                       \n\t"
                   "EMMS                       \n\t" // DONE
 
-                  : "=a" (dummy_value_a),              // output regs (dummy)
+                  : "=a" (dummy_value_a),           // output regs (dummy)
                     "=d" (dummy_value_d),
                     "=c" (dummy_value_c),
                     "=S" (dummy_value_S),
@@ -968,10 +1050,9 @@ fflush(stderr);
                     "2" (len),         // ecx
                     "1" (mask)         // edx
 
-//                  :          // clobber list
 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
-                  : "%mm0", "%mm1", "%mm2", "%mm3",
-                    "%mm4", "%mm5", "%mm6", "%mm7"
+                  : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
+                  , "%mm4", "%mm5", "%mm6", "%mm7"
 #endif
                );
             }
@@ -1127,7 +1208,7 @@ fflush(stderr);
                 "end48:                       \n\t"
                   "EMMS                       \n\t" // DONE
 
-                  : "=a" (dummy_value_a),              // output regs (dummy)
+                  : "=a" (dummy_value_a),           // output regs (dummy)
                     "=d" (dummy_value_d),
                     "=c" (dummy_value_c),
                     "=S" (dummy_value_S),
@@ -1140,10 +1221,9 @@ fflush(stderr);
                     "2" (len),         // ecx
                     "1" (mask)         // edx
 
-//                  :         // clobber list
 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
-                  : "%mm0", "%mm1", "%mm2", "%mm3",
-                    "%mm4", "%mm5", "%mm6", "%mm7"
+                  : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
+                  , "%mm4", "%mm5", "%mm6", "%mm7"
 #endif
                );
             }
@@ -1216,6 +1296,13 @@ fflush(stderr);
 
 
 
+
+//===========================================================================//
+//                                                                           //
+//                 P N G _ D O _ R E A D _ I N T E R L A C E                 //
+//                                                                           //
+//===========================================================================//
+
 #if defined(PNG_READ_INTERLACING_SUPPORTED)
 #if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
 
@@ -1227,41 +1314,17 @@ void /* PRIVATE */
 png_do_read_interlace(png_row_infop row_info, png_bytep row, int pass,
    png_uint_32 transformations)
 {
-/*
-fprintf(stderr, "GRR DEBUG:  entering png_do_read_interlace()\n");
-if (row == NULL) fprintf(stderr, "GRR DEBUG:  row == NULL\n");
-if (row_info == NULL) fprintf(stderr, "GRR DEBUG:  row_info == NULL\n");
-fflush(stderr);
- */
    png_debug(1,"in png_do_read_interlace\n");
 
    if (mmx_supported == 2)
        mmx_supported = mmxsupport();
-/*
-{
-fprintf(stderr, "GRR DEBUG:  calling mmxsupport()\n");
-fprintf(stderr, "GRR DEBUG:  done with mmxsupport() (mmx_supported = %d)\n", mmx_supported);
-}
- */
 
-/*
-this one happened on first row due to weirdness with mmxsupport():
-if (row == NULL) fprintf(stderr, "GRR DEBUG:  now row == NULL!!!\n");
-  row was in ebx, and even though nothing touched ebx, it still got wiped...
-  [weird side effect of CPUID instruction?]
-if (row_info == NULL) fprintf(stderr, "GRR DEBUG:  now row_info == NULL!!!\n");
- */
    if (row != NULL && row_info != NULL)
    {
       png_uint_32 final_width;
 
       final_width = row_info->width * png_pass_inc[pass];
 
-/*
-fprintf(stderr, "GRR DEBUG:  png_do_read_interlace() row_info->width = %d, final_width = %d\n", row_info->width, final_width);
-fprintf(stderr, "GRR DEBUG:  png_do_read_interlace() pixel_depth = %d\n", row_info->pixel_depth);
-fflush(stderr);
- */
       switch (row_info->pixel_depth)
       {
          case 1:
@@ -1467,6 +1530,7 @@ fflush(stderr);
                      int dummy_value_c;   // fix 'forbidden register spilled'
                      int dummy_value_S;
                      int dummy_value_D;
+
                      __asm__ __volatile__ (
                         "subl $21, %%edi         \n\t"
                                      // (png_pass_inc[pass] - 1)*pixel_bytes
@@ -1496,7 +1560,7 @@ fflush(stderr);
                         "jnz .loop3_pass0        \n\t"
                         "EMMS                    \n\t" // DONE
 
-                        : "=c" (dummy_value_c),           // output regs (dummy)
+                        : "=c" (dummy_value_c),        // output regs (dummy)
                           "=S" (dummy_value_S),
                           "=D" (dummy_value_D)
 
@@ -1505,9 +1569,9 @@ fflush(stderr);
                           "0" (width)      // ecx
 // doesn't work           "i" (0x0000000000FFFFFFLL)   // %1 (a.k.a. _const4)
 
-//                        :        // clobber list
 #if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
-                        : "%mm0", "%mm1", "%mm2", "%mm3", "%mm4"
+                        : "%mm0", "%mm1", "%mm2"       // clobber list
+                        , "%mm3", "%mm4"
 #endif
                      );
                   }
@@ -1516,6 +1580,7 @@ fflush(stderr);
                      int dummy_value_c;   // fix 'forbidden register spilled'
                      int dummy_value_S;
                      int dummy_value_D;
+
                      __asm__ __volatile__ (
                         "subl $9, %%edi          \n\t"
                                      // (png_pass_inc[pass] - 1)*pixel_bytes
@@ -1539,7 +1604,7 @@ fflush(stderr);
                         "jnz .loop3_pass2        \n\t"
                         "EMMS                    \n\t" // DONE
 
-                        : "=c" (dummy_value_c),           // output regs (dummy)
+                        : "=c" (dummy_value_c),        // output regs (dummy)
                           "=S" (dummy_value_S),
                           "=D" (dummy_value_D)
 
@@ -1547,9 +1612,8 @@ fflush(stderr);
                           "2" (dp),        // edi
                           "0" (width)      // ecx
 
-//                        :        // clobber list
 #if 0  /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
-                        : "%mm0", "%mm1", "%mm2"
+                        : "%mm0", "%mm1", "%mm2"       // clobber list
 #endif
                      );
                   }
@@ -1567,6 +1631,7 @@ fflush(stderr);
                         int dummy_value_c;  // fix 'forbidden register spilled'
                         int dummy_value_S;
                         int dummy_value_D;
+
                         __asm__ __volatile__ (
                            "subl $3, %%esi          \n\t"
                            "subl $9, %%edi          \n\t"
@@ -1593,7 +1658,7 @@ fflush(stderr);
                            "jnz .loop3_pass4        \n\t"
                            "EMMS                    \n\t" // DONE
 
-                           : "=c" (dummy_value_c),           // output regs (dummy)
+                           : "=c" (dummy_value_c),        // output regs (dummy)
                              "=S" (dummy_value_S),
                              "=D" (dummy_value_D)
 
@@ -1601,9 +1666,9 @@ fflush(stderr);
                              "2" (dp),        // edi
                              "0" (width_mmx)  // ecx
 
-//                           :        // clobber list
 #if 0  /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
-                           : "%mm0", "%mm1", "%mm2", "%mm3"
+                           : "%mm0", "%mm1"               // clobber list
+                           , "%mm2", "%mm3"
 #endif
                         );
                      }
@@ -1638,6 +1703,7 @@ fflush(stderr);
                         int dummy_value_c;  // fix 'forbidden register spilled'
                         int dummy_value_S;
                         int dummy_value_D;
+
                         __asm__ __volatile__ (
                            "subl $3, %%esi          \n\t"
                            "subl $31, %%edi         \n\t"
@@ -1665,7 +1731,7 @@ fflush(stderr);
                            "jnz .loop1_pass0        \n\t"
                            "EMMS                    \n\t" // DONE
 
-                           : "=c" (dummy_value_c),           // output regs (dummy)
+                           : "=c" (dummy_value_c),        // output regs (dummy)
                              "=S" (dummy_value_S),
                              "=D" (dummy_value_D)
 
@@ -1673,9 +1739,9 @@ fflush(stderr);
                              "2" (dp),        // edi
                              "0" (width_mmx)  // ecx
 
-//                           :       // clobber list
 #if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
-                           : "%mm0", "%mm1", "%mm2", "%mm3", "%mm4"
+                           : "%mm0", "%mm1", "%mm2"       // clobber list
+                           , "%mm3", "%mm4"
 #endif
                         );
                      }
@@ -1718,6 +1784,7 @@ fflush(stderr);
                         int dummy_value_c;  // fix 'forbidden register spilled'
                         int dummy_value_S;
                         int dummy_value_D;
+
                         __asm__ __volatile__ (
                            "subl $3, %%esi          \n\t"
                            "subl $15, %%edi         \n\t"
@@ -1736,7 +1803,7 @@ fflush(stderr);
                            "jnz .loop1_pass2        \n\t"
                            "EMMS                    \n\t" // DONE
 
-                           : "=c" (dummy_value_c),           // output regs (dummy)
+                           : "=c" (dummy_value_c),        // output regs (dummy)
                              "=S" (dummy_value_S),
                              "=D" (dummy_value_D)
 
@@ -1744,9 +1811,8 @@ fflush(stderr);
                              "2" (dp),        // edi
                              "0" (width_mmx)  // ecx
 
-//                           :        // clobber list
 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
-                           : "%mm0", "%mm1"
+                           : "%mm0", "%mm1"               // clobber list
 #endif
                         );
                      }
@@ -1771,6 +1837,7 @@ fflush(stderr);
                         int dummy_value_c;  // fix 'forbidden register spilled'
                         int dummy_value_S;
                         int dummy_value_D;
+
                         __asm__ __volatile__ (
                            "subl $7, %%esi          \n\t"
                            "subl $15, %%edi         \n\t"
@@ -1788,7 +1855,7 @@ fflush(stderr);
                            "jnz .loop1_pass4        \n\t"
                            "EMMS                    \n\t" // DONE
 
-                           : "=c" (dummy_value_c),           // output regs (none)
+                           : "=c" (dummy_value_c),        // output regs (dummy)
                              "=S" (dummy_value_S),
                              "=D" (dummy_value_D)
 
@@ -1796,9 +1863,8 @@ fflush(stderr);
                              "2" (dp),        // edi
                              "0" (width_mmx)  // ecx
 
-//                           :        // clobber list
 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
-                           : "%mm0", "%mm1"
+                           : "%mm0", "%mm1"               // clobber list
 #endif
                         );
                      }
@@ -1828,6 +1894,7 @@ fflush(stderr);
                         int dummy_value_c;  // fix 'forbidden register spilled'
                         int dummy_value_S;
                         int dummy_value_D;
+
                         __asm__ __volatile__ (
                            "subl $2, %%esi          \n\t"
                            "subl $30, %%edi         \n\t"
@@ -1848,7 +1915,7 @@ fflush(stderr);
                            "jnz .loop2_pass0        \n\t"
                            "EMMS                    \n\t" // DONE
 
-                           : "=c" (dummy_value_c),           // output regs (dummy)
+                           : "=c" (dummy_value_c),        // output regs (dummy)
                              "=S" (dummy_value_S),
                              "=D" (dummy_value_D)
 
@@ -1856,9 +1923,8 @@ fflush(stderr);
                              "2" (dp),        // edi
                              "0" (width_mmx)  // ecx
 
-//                           :        // clobber list
 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
-                           : "%mm0", "%mm1"
+                           : "%mm0", "%mm1"               // clobber list
 #endif
                         );
                      }
@@ -1887,6 +1953,7 @@ fflush(stderr);
                         int dummy_value_c;  // fix 'forbidden register spilled'
                         int dummy_value_S;
                         int dummy_value_D;
+
                         __asm__ __volatile__ (
                            "subl $2, %%esi          \n\t"
                            "subl $14, %%edi         \n\t"
@@ -1905,7 +1972,7 @@ fflush(stderr);
                            "jnz .loop2_pass2        \n\t"
                            "EMMS                    \n\t" // DONE
 
-                           : "=c" (dummy_value_c),           // output regs (dummy)
+                           : "=c" (dummy_value_c),        // output regs (dummy)
                              "=S" (dummy_value_S),
                              "=D" (dummy_value_D)
 
@@ -1913,9 +1980,8 @@ fflush(stderr);
                              "2" (dp),        // edi
                              "0" (width_mmx)  // ecx
 
-//                           :        // clobber list
 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
-                           : "%mm0", "%mm1"
+                           : "%mm0", "%mm1"               // clobber list
 #endif
                         );
                      }
@@ -1944,6 +2010,7 @@ fflush(stderr);
                         int dummy_value_c;  // fix 'forbidden register spilled'
                         int dummy_value_S;
                         int dummy_value_D;
+
                         __asm__ __volatile__ (
                            "subl $2, %%esi          \n\t"
                            "subl $6, %%edi          \n\t"
@@ -1958,7 +2025,7 @@ fflush(stderr);
                            "jnz .loop2_pass4        \n\t"
                            "EMMS                    \n\t" // DONE
 
-                           : "=c" (dummy_value_c),           // output regs (dummy)
+                           : "=c" (dummy_value_c),        // output regs (dummy)
                              "=S" (dummy_value_S),
                              "=D" (dummy_value_D)
 
@@ -1966,9 +2033,8 @@ fflush(stderr);
                              "2" (dp),        // edi
                              "0" (width_mmx)  // ecx
 
-//                           :        // clobber list
 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
-                           : "%mm0"
+                           : "%mm0"                       // clobber list
 #endif
                         );
                      }
@@ -1997,21 +2063,12 @@ fflush(stderr);
                   {
                      int width_mmx = ((width >> 1) << 1);
                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
-/*
-fprintf(stderr, "GRR DEBUG:  png_do_read_interlace() pass = %d, width_mmx = %d, width = %d\n", pass, width_mmx, width);
-fprintf(stderr, "            sptr = 0x%08lx, dp = 0x%08lx\n", (unsigned long)sptr, (unsigned long)dp);
-fflush(stderr);
- */
                      if (width_mmx)
                      {
                         int dummy_value_c;  // fix 'forbidden register spilled'
                         int dummy_value_S;
                         int dummy_value_D;
-#ifdef GRR_DEBUG
-                        FILE *junk = fopen("junk.4bytes", "wb");
-                        if (junk)
-                           fclose(junk);
-#endif /* GRR_DEBUG */
+
                         __asm__ __volatile__ (
                            "subl $4, %%esi          \n\t"
                            "subl $60, %%edi         \n\t"
@@ -2035,7 +2092,7 @@ fflush(stderr);
                            "jnz .loop4_pass0        \n\t"
                            "EMMS                    \n\t" // DONE
 
-                           : "=c" (dummy_value_c),           // output regs (dummy)
+                           : "=c" (dummy_value_c),        // output regs (dummy)
                              "=S" (dummy_value_S),
                              "=D" (dummy_value_D)
  
@@ -2043,9 +2100,8 @@ fflush(stderr);
                              "2" (dp),        // edi
                              "0" (width_mmx)  // ecx
 
-//                           :        // clobber list
 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
-                           : "%mm0", "%mm1"
+                           : "%mm0", "%mm1"               // clobber list
 #endif
                         );
                      }
@@ -2074,6 +2130,7 @@ fflush(stderr);
                         int dummy_value_c;  // fix 'forbidden register spilled'
                         int dummy_value_S;
                         int dummy_value_D;
+
                         __asm__ __volatile__ (
                            "subl $4, %%esi          \n\t"
                            "subl $28, %%edi         \n\t"
@@ -2093,7 +2150,7 @@ fflush(stderr);
                            "jnz .loop4_pass2        \n\t"
                            "EMMS                    \n\t" // DONE
 
-                           : "=c" (dummy_value_c),           // output regs (dummy)
+                           : "=c" (dummy_value_c),        // output regs (dummy)
                              "=S" (dummy_value_S),
                              "=D" (dummy_value_D)
  
@@ -2101,9 +2158,8 @@ fflush(stderr);
                              "2" (dp),        // edi
                              "0" (width_mmx)  // ecx
 
-//                           :        // clobber list
 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
-                           : "%mm0", "%mm1"
+                           : "%mm0", "%mm1"               // clobber list
 #endif
                         );
                      }
@@ -2132,6 +2188,7 @@ fflush(stderr);
                         int dummy_value_c;  // fix 'forbidden register spilled'
                         int dummy_value_S;
                         int dummy_value_D;
+
                         __asm__ __volatile__ (
                            "subl $4, %%esi          \n\t"
                            "subl $12, %%edi         \n\t"
@@ -2149,7 +2206,7 @@ fflush(stderr);
                            "jnz .loop4_pass4        \n\t"
                            "EMMS                    \n\t" // DONE
 
-                           : "=c" (dummy_value_c),           // output regs (dummy)
+                           : "=c" (dummy_value_c),        // output regs (dummy)
                              "=S" (dummy_value_S),
                              "=D" (dummy_value_D)
 
@@ -2157,9 +2214,8 @@ fflush(stderr);
                              "2" (dp),        // edi
                              "0" (width_mmx)  // ecx
 
-//                           :        // clobber list
 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
-                           : "%mm0", "%mm1"
+                           : "%mm0", "%mm1"               // clobber list
 #endif
                         );
                      }
@@ -2181,58 +2237,50 @@ fflush(stderr);
                   }
                } /* end of pixel_bytes == 4 */
 
-#define STILL_WORKING_ON_THIS
-#ifdef STILL_WORKING_ON_THIS  // GRR: should work, but needs testing
-                              //      (special 64-bit version of rpng2)
-
                //--------------------------------------------------------------
                else if (pixel_bytes == 8)
                {
+// GRR TEST:  should work, but needs testing (special 64-bit version of rpng2?)
                   // GRR NOTE:  no need to combine passes here!
                   if (((pass == 0) || (pass == 1)) && width)
                   {
+                     int dummy_value_c;  // fix 'forbidden register spilled'
+                     int dummy_value_S;
+                     int dummy_value_D;
+
                      // source is 8-byte RRGGBBAA
                      // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
-                        int dummy_value_c;  // fix 'forbidden register spilled'
-                        int dummy_value_S;
-                        int dummy_value_D;
-#ifdef GRR_DEBUG
-                        FILE *junk = fopen("junk.8bytes", "wb");
-                        if (junk)
-                            fclose(junk);
-#endif /* GRR_DEBUG */
-                        __asm__ __volatile__ (
-                           "subl $56, %%edi         \n\t" // start of last block
+                     __asm__ __volatile__ (
+                        "subl $56, %%edi         \n\t" // start of last block
 
-                        ".loop8_pass0:              \n\t"
-                           "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
-                           "movq %%mm0, (%%edi)     \n\t"
-                           "movq %%mm0, 8(%%edi)    \n\t"
-                           "movq %%mm0, 16(%%edi)   \n\t"
-                           "movq %%mm0, 24(%%edi)   \n\t"
-                           "movq %%mm0, 32(%%edi)   \n\t"
-                           "movq %%mm0, 40(%%edi)   \n\t"
-                           "movq %%mm0, 48(%%edi)   \n\t"
-                           "subl $8, %%esi          \n\t"
-                           "movq %%mm0, 56(%%edi)   \n\t"
-                           "subl $64, %%edi         \n\t"
-                           "decl %%ecx              \n\t"
-                           "jnz .loop8_pass0        \n\t"
-                           "EMMS                    \n\t" // DONE
+                     ".loop8_pass0:              \n\t"
+                        "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
+                        "movq %%mm0, (%%edi)     \n\t"
+                        "movq %%mm0, 8(%%edi)    \n\t"
+                        "movq %%mm0, 16(%%edi)   \n\t"
+                        "movq %%mm0, 24(%%edi)   \n\t"
+                        "movq %%mm0, 32(%%edi)   \n\t"
+                        "movq %%mm0, 40(%%edi)   \n\t"
+                        "movq %%mm0, 48(%%edi)   \n\t"
+                        "subl $8, %%esi          \n\t"
+                        "movq %%mm0, 56(%%edi)   \n\t"
+                        "subl $64, %%edi         \n\t"
+                        "decl %%ecx              \n\t"
+                        "jnz .loop8_pass0        \n\t"
+                        "EMMS                    \n\t" // DONE
 
-                           : "=c" (dummy_value_c),           // output regs (dummy)
-                             "=S" (dummy_value_S),
-                             "=D" (dummy_value_D)
+                        : "=c" (dummy_value_c),        // output regs (dummy)
+                          "=S" (dummy_value_S),
+                          "=D" (dummy_value_D)
 
-                           : "1" (sptr),      // esi      // input regs
-                             "2" (dp),        // edi
-                             "0" (width)      // ecx
+                        : "1" (sptr),      // esi      // input regs
+                          "2" (dp),        // edi
+                          "0" (width)      // ecx
 
-//                           :        // clobber list
 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
-                           : "%mm0"
+                        : "%mm0"                       // clobber list
 #endif
-                        );
+                     );
                   }
                   else if (((pass == 2) || (pass == 3)) && width)
                   {
@@ -2245,6 +2293,7 @@ fflush(stderr);
                         int dummy_value_c;  // fix 'forbidden register spilled'
                         int dummy_value_S;
                         int dummy_value_D;
+
                         __asm__ __volatile__ (
                            "subl $24, %%edi         \n\t" // start of last block
 
@@ -2260,7 +2309,7 @@ fflush(stderr);
                            "jnz .loop8_pass2        \n\t"
                            "EMMS                    \n\t" // DONE
 
-                           : "=c" (dummy_value_c),           // output regs (dummy)
+                           : "=c" (dummy_value_c),        // output regs (dummy)
                              "=S" (dummy_value_S),
                              "=D" (dummy_value_D)
 
@@ -2268,9 +2317,8 @@ fflush(stderr);
                              "2" (dp),        // edi
                              "0" (width)      // ecx
 
-//                           :        // clobber list
 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
-                           : "%mm0"
+                           : "%mm0"                       // clobber list
 #endif
                         );
                      }
@@ -2286,6 +2334,7 @@ fflush(stderr);
                         int dummy_value_c;  // fix 'forbidden register spilled'
                         int dummy_value_S;
                         int dummy_value_D;
+
                         __asm__ __volatile__ (
                            "subl $8, %%edi          \n\t" // start of last block
 
@@ -2299,7 +2348,7 @@ fflush(stderr);
                            "jnz .loop8_pass4        \n\t"
                            "EMMS                    \n\t" // DONE
 
-                           : "=c" (dummy_value_c),           // output regs (dummy)
+                           : "=c" (dummy_value_c),        // output regs (dummy)
                              "=S" (dummy_value_S),
                              "=D" (dummy_value_D)
 
@@ -2307,9 +2356,8 @@ fflush(stderr);
                              "2" (dp),        // edi
                              "0" (width)      // ecx
 
-//                           :        // clobber list
 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
-                           : "%mm0"
+                           : "%mm0"                       // clobber list
 #endif
                         );
                      }
@@ -2317,8 +2365,6 @@ fflush(stderr);
 
                } /* end of pixel_bytes == 8 */
 
-#endif /* STILL_WORKING_ON_THIS */
-
                //--------------------------------------------------------------
                else if (pixel_bytes == 6)
                {
@@ -2477,19 +2523,30 @@ fflush(stderr);
 #endif /* PNG_READ_INTERLACING_SUPPORTED */
 
 
+
+
 // These variables are utilized in the functions below.  They are declared
 // globally here to ensure alignment on 8-byte boundaries.
 
 union uAll {
    long long use;
    double  align;
-} LBCarryMask = {0x0101010101010101LL},
-  HBClearMask = {0x7f7f7f7f7f7f7f7fLL},
-  ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
+} _LBCarryMask = {0x0101010101010101LL},
+  _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},
+  _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;
 
 
+
+
+//===========================================================================//
+//                                                                           //
+//           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G           //
+//                                                                           //
+//===========================================================================//
+
 // Optimized code for PNG Average filter decoder
-void /* PRIVATE */
+
+static void /* PRIVATE */
 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
                             png_bytep prev_row)
 {
@@ -2497,29 +2554,32 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
    int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
    int dummy_value_S;
    int dummy_value_D;
-// int diff;  GRR: global now (shortened to dif/_dif)
 
-   bpp = (row_info->pixel_depth + 7) >> 3;  // Get # bytes per pixel
-   _FullLength  = row_info->rowbytes;        // # of bytes to filter
+   bpp = (row_info->pixel_depth + 7) >> 3;  // get # bytes per pixel
+   _FullLength  = row_info->rowbytes;       // # of bytes to filter
+
    __asm__ __volatile__ (
-      // Init address pointers and offset
-//GRR "movl row, %%edi             \n\t" // edi ==> Avg(x)
-      "xorl %%ebx, %%ebx           \n\t" // ebx ==> x
+      // initialize address pointers and offset
+#ifdef __PIC__
+      "pushl %%ebx                 \n\t" // save index to Global Offset Table
+#endif
+//pre "movl row, %%edi             \n\t" // edi:  Avg(x)
+      "xorl %%ebx, %%ebx           \n\t" // ebx:  x
       "movl %%edi, %%edx           \n\t"
-//GRR "movl prev_row, %%esi        \n\t" // esi ==> Prior(x)
-//GRR "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
-      "subl %%ecx, %%edx           \n\t" // edx ==> Raw(x-bpp)
+//pre "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
+//pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
+      "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
 
       "xorl %%eax,%%eax            \n\t"
 
       // Compute the Raw value for the first bpp bytes
       //    Raw(x) = Avg(x) + (Prior(x)/2)
    "avg_rlp:                       \n\t"
-      "movb (%%esi,%%ebx,),%%al    \n\t" // Load al with Prior(x)
+      "movb (%%esi,%%ebx,),%%al    \n\t" // load al with Prior(x)
       "incl %%ebx                  \n\t"
       "shrb %%al                   \n\t" // divide by 2
       "addb -1(%%edi,%%ebx,),%%al  \n\t" // add Avg(x); -1 to offset inc ebx
-//GRR "cmpl bpp, %%ebx             \n\t" // (bpp is preloaded into ecx)
+//pre "cmpl bpp, %%ebx             \n\t" // (bpp is preloaded into ecx)
       "cmpl %%ecx, %%ebx           \n\t"
       "movb %%al,-1(%%edi,%%ebx,)  \n\t" // write Raw(x); -1 to offset inc ebx
       "jb avg_rlp                  \n\t" // mov does not affect flags
@@ -2529,13 +2589,14 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
       "addl %%ebx, _dif            \n\t" // add bpp
       "addl $0xf, _dif             \n\t" // add 7+8 to incr past alignment bdry
       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
-      "subl %%edi, _dif            \n\t" // subtract from start => value ebx at alignment
-      "jz avg_go                   \n\t"
+      "subl %%edi, _dif            \n\t" // subtract from start => value ebx at
+      "jz avg_go                   \n\t" //  alignment
 
       // fix alignment
       // Compute the Raw value for the bytes up to the alignment boundary
       //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
       "xorl %%ecx, %%ecx           \n\t"
+
    "avg_lp1:                       \n\t"
       "xorl %%eax, %%eax           \n\t"
       "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
@@ -2555,108 +2616,116 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
       "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
       "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
       "movl %%ecx, _MMXLength      \n\t"
+#ifdef __PIC__
+      "popl %%ebx                  \n\t" // restore index to Global Offset Table
+#endif
 
-      : "=c" (dummy_value_c), // output regs/vars here, e.g., "=m" (_MMXLength) instead of final instr
+      : "=c" (dummy_value_c),            // output regs (dummy)
         "=S" (dummy_value_S),
         "=D" (dummy_value_D)
 
-      : "1" (prev_row),  // esi          // input regs
-        "2" (row),       // edi
-        "0" (bpp)        // ecx
+      : "0" (bpp),       // ecx          // input regs
+        "1" (prev_row),  // esi
+        "2" (row)        // edi
 
-      : "%eax", "%ebx",          // clobber list
-        "%edx"
-// GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)     PROBABLY
+      : "%eax", "%edx"                   // clobber list
+#ifndef __PIC__
+      , "%ebx"
+#endif
+      // GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
+      // (seems to work fine without...)
    );
 
-#ifdef GRR_GCC_MMX_CONVERTED
-   // Now do the math for the rest of the row
-   switch ( bpp )
+   // now do the math for the rest of the row
+   switch (bpp)
    {
       case 3:
       {
-         ActiveMask.use  = 0x0000000000ffffff;
-         ShiftBpp.use = 24;    // == 3 * 8
-         ShiftRem.use = 40;    // == 64 - 24
-         __asm__ (
-            // Re-init address pointers and offset
-            "movq $ActiveMask, %%mm7     \n\t"
-            "movl _dif, %%ebx            \n\t" // ebx ==> x = offset to alignment boundary
-            "movq $LBCarryMask, %%mm5    \n\t"
-            "movl row, %%edi             \n\t" // edi ==> Avg(x)
-            "movq $HBClearMask, %%mm4    \n\t"
-            "movl prev_row, %%esi        \n\t" // esi ==> Prior(x)
-            // PRIME the pump (load the first Raw(x-bpp) data set)
-            "movq -8(%%edi,%%ebx,), %%mm2 \n\t" // Load previous aligned 8 bytes
-                                          // (we correct position in loop below)
-         "avg_3lp:                       \n\t"
-            "movq (%%edi,%%ebx,), %%mm0  \n\t" // Load mm0 with Avg(x)
-            // Add (Prev_row/2) to Average
-            "movq %%mm5, %%mm3           \n\t"
-            "psrlq $ShiftRem, %%mm2      \n\t" // Correct position Raw(x-bpp) data
-            "movq (%%esi,%%ebx,), %%mm1  \n\t" // Load mm1 with Prior(x)
-            "movq %%mm7, %%mm6           \n\t"
-            "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
-            "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
-            "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each byte
-            "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for each byte
-            // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
-            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting LBCarrys
-            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte where both
-                               // lsb's were == 1 (Only valid for active group)
-            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
-            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            "pand %%mm6, %%mm2           \n\t" // Leave only Active Group 1 bytes to add to Avg
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
+         _ActiveMask.use  = 0x0000000000ffffffLL;
+         _ShiftBpp.use = 24;    // == 3 * 8
+         _ShiftRem.use = 40;    // == 64 - 24
+
+         __asm__ __volatile__ (
+            // re-init address pointers and offset
+            "movq _ActiveMask, %%mm7      \n\t"
+            "movl _dif, %%ecx             \n\t" // ecx:  x = offset to
+            "movq _LBCarryMask, %%mm5     \n\t" //  alignment boundary
+// preload  "movl row, %%edi              \n\t" // edi:  Avg(x)
+            "movq _HBClearMask, %%mm4     \n\t"
+// preload  "movl prev_row, %%esi         \n\t" // esi:  Prior(x)
+
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
+                                                // (correct pos. in loop below)
+         "avg_3lp:                        \n\t"
+            "movq (%%edi,%%ecx,), %%mm0   \n\t" // load mm0 with Avg(x)
+            "movq %%mm5, %%mm3            \n\t"
+            "psrlq _ShiftRem, %%mm2       \n\t" // correct position Raw(x-bpp) data
+            "movq (%%esi,%%ecx,), %%mm1   \n\t" // load mm1 with Prior(x)
+            "movq %%mm7, %%mm6            \n\t"
+            "pand %%mm1, %%mm3            \n\t" // get lsb for each prev_row byte
+            "psrlq $1, %%mm1              \n\t" // divide prev_row bytes by 2
+            "pand  %%mm4, %%mm1           \n\t" // clear invalid bit 7 of each byte
+            "paddb %%mm1, %%mm0           \n\t" // add (Prev_row/2) to Avg for each byte
+            // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
+            "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting LBCarrys
+            "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte where both
+                               // lsb's were == 1 (only valid for active group)
+            "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
+            "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each byte
+            "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
+            "pand %%mm6, %%mm2            \n\t" // leave only Active Group 1 bytes to add to Avg
+            "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
                                //  byte
-            // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
-            "psllq $ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover bytes 3-5
-            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "psllq $ShiftBpp, %%mm2      \n\t" // shift data to position correctly
-            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting LBCarrys
-            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte where both
-                               // lsb's were == 1 (Only valid for active group)
-            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
-            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            "pand %%mm6, %%mm2           \n\t" // Leave only Active Group 2 bytes to add to Avg
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
+            // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
+            "psllq _ShiftBpp, %%mm6       \n\t" // shift the mm6 mask to cover bytes 3-5
+            "movq %%mm0, %%mm2            \n\t" // mov updated Raws to mm2
+            "psllq _ShiftBpp, %%mm2       \n\t" // shift data to pos. correctly
+            "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting LBCarrys
+            "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte where both
+                               // lsb's were == 1 (only valid for active group)
+            "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
+            "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each byte
+            "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
+            "pand %%mm6, %%mm2            \n\t" // leave only Active Group 2 bytes to add to Avg
+            "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
                                //  byte
 
-            // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
-            "psllq $ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover the last two
+            // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
+            "psllq _ShiftBpp, %%mm6       \n\t" // shift mm6 mask to cover last two
                                  // bytes
-            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "psllq $ShiftBpp, %%mm2      \n\t" // shift data to position correctly
+            "movq %%mm0, %%mm2            \n\t" // mov updated Raws to mm2
+            "psllq _ShiftBpp, %%mm2       \n\t" // shift data to pos. correctly
                               // Data only needs to be shifted once here to
                               // get the correct x-bpp offset.
-            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting LBCarrys
-            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte where both
-                              // lsb's were == 1 (Only valid for active group)
-            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
-            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            "pand %%mm6, %%mm2           \n\t" // Leave only Active Group 2 bytes to add to Avg
-            "addl $8, %%ebx              \n\t"
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
-                                               // byte
-            // Now ready to write back to memory
-            "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
-            // Move updated Raw(x) to use as Raw(x-bpp) for next loop
-            "cmpl _MMXLength, %%ebx      \n\t"
-            "movq %%mm0, %%mm2           \n\t" // mov updated Raw(x) to mm2
-            "jb avg_3lp                  \n\t"
+            "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting LBCarrys
+            "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte where both
+                              // lsb's were == 1 (only valid for active group)
+            "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
+            "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each byte
+            "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
+            "pand %%mm6, %%mm2            \n\t" // leave only Active Group 3 bytes to add to Avg
+            "addl $8, %%ecx               \n\t"
+            "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
+                                                // byte
+            // now ready to write back to memory
+            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
+            // move updated Raw(x) to use as Raw(x-bpp) for next loop
+            "cmpl _MMXLength, %%ecx       \n\t"
+            "movq %%mm0, %%mm2            \n\t" // mov updated Raw(x) to mm2
+            "jb avg_3lp                   \n\t"
 
-            : // output regs/vars go here, e.g.:  "=m" (memory_var)
+            : "=S" (dummy_value_S),             // output regs (dummy)
+              "=D" (dummy_value_D)
 
-            : "S" (prev_row),  // esi          // input regs
-              "D" (row)        // edi
+            : "0" (prev_row),  // esi           // input regs
+              "1" (row)        // edi
 
-            : "%ebx", "%edi", "%esi"           // clobber list
-//            GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)   PROBABLY
-//          , "%mm0", "%mm1", "%mm2", "%mm3",
-//            "%mm4", "%mm5", "%mm6", "%mm7"
+            : "%ecx"                            // clobber list
+#if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1", "%mm2", "%mm3"
+            , "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
          );
       }
       break;  // end 3 bpp
@@ -2664,189 +2733,207 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
       case 6:
       case 4:
       //case 7:   // who wrote this?  PNG doesn't support 5 or 7 bytes/pixel
-      //case 5:
+      //case 5:   // GRR BOGUS
       {
-         ActiveMask.use  = 0xffffffffffffffff;  // use shift below to clear
-                                                // appropriate inactive bytes
-         ShiftBpp.use = bpp << 3;
-         ShiftRem.use = 64 - ShiftBpp.use;
-         __asm__ (
-            "movq $HBClearMask, %%mm4    \n\t"
+         _ActiveMask.use  = 0xffffffffffffffffLL; // use shift below to clear
+                                                  // appropriate inactive bytes
+         _ShiftBpp.use = bpp << 3;
+         _ShiftRem.use = 64 - _ShiftBpp.use;
 
-            // Re-init address pointers and offset
-            "movl _dif, %%ebx            \n\t" // ebx ==> x = offset to alignment boundary
+         __asm__ __volatile__ (
+            "movq _HBClearMask, %%mm4    \n\t"
 
-            // Load ActiveMask and clear all bytes except for 1st active group
-            "movq $ActiveMask, %%mm7     \n\t"
-            "movl row, %%edi             \n\t" // edi ==> Avg(x)
-            "psrlq $ShiftRem, %%mm7      \n\t"
-            "movl prev_row, %%esi        \n\t" // esi ==> Prior(x)
+            // re-init address pointers and offset
+            "movl _dif, %%ecx            \n\t" // ecx:  x = offset to alignment boundary
+
+            // load _ActiveMask and clear all bytes except for 1st active group
+            "movq _ActiveMask, %%mm7     \n\t"
+// preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
+            "psrlq _ShiftRem, %%mm7      \n\t"
+// preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
             "movq %%mm7, %%mm6           \n\t"
-            "movq $LBCarryMask, %%mm5    \n\t"
-            "psllq $ShiftBpp, %%mm6      \n\t" // Create mask for 2nd active group
+            "movq _LBCarryMask, %%mm5    \n\t"
+            "psllq _ShiftBpp, %%mm6      \n\t" // create mask for 2nd active group
 
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            "movq -8(%%edi,%%ebx,), %%mm2 \n\t" // Load previous aligned 8 bytes
-                                          // (we correct position in loop below)
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
+                                          // (we correct pos. in loop below)
          "avg_4lp:                       \n\t"
-            "movq (%%edi,%%ebx,), %%mm0  \n\t"
-            "psrlq $ShiftRem, %%mm2      \n\t" // shift data to position correctly
-            "movq (%%esi,%%ebx,), %%mm1  \n\t"
-            // Add (Prev_row/2) to Average
+            "movq (%%edi,%%ecx,), %%mm0  \n\t"
+            "psrlq _ShiftRem, %%mm2      \n\t" // shift data to pos. correctly
+            "movq (%%esi,%%ecx,), %%mm1  \n\t"
+            // add (Prev_row/2) to average
             "movq %%mm5, %%mm3           \n\t"
             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each byte
             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for each byte
-            // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
+            // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting LBCarrys
             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte where both
-                              // lsb's were == 1 (Only valid for active group)
+                              // lsb's were == 1 (only valid for active group)
             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            "pand %%mm7, %%mm2           \n\t" // Leave only Active Group 1 bytes to add to Avg
+            "pand %%mm7, %%mm2           \n\t" // leave only Active Group 1 bytes to add to Avg
             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
                               // byte
-            // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
+            // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "psllq $ShiftBpp, %%mm2      \n\t" // shift data to position correctly
-            "addl $8, %%ebx              \n\t"
+            "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
+            "addl $8, %%ecx              \n\t"
             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting LBCarrys
             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte where both
-                              // lsb's were == 1 (Only valid for active group)
+                              // lsb's were == 1 (only valid for active group)
             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            "pand %%mm6, %%mm2           \n\t" // Leave only Active Group 2 bytes to add to Avg
+            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2 bytes to add to Avg
             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
                               // byte
-            "cmpl _MMXLength, %%ebx      \n\t"
-            // Now ready to write back to memory
-            "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
-            // Prep Raw(x-bpp) for next loop
+            "cmpl _MMXLength, %%ecx      \n\t"
+            // now ready to write back to memory
+            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
+            // prep Raw(x-bpp) for next loop
             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
             "jb avg_4lp                  \n\t"
 
-            : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
+            : "=S" (dummy_value_S),            // output regs (dummy)
+              "=D" (dummy_value_D)
 
-            : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
+            : "0" (prev_row),  // esi          // input regs
+              "1" (row)        // edi
 
-            : "%ebx", "%edi", "%esi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" // CHECKASM: clobber list
+            : "%ecx"                           // clobber list
+#if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1", "%mm2", "%mm3"
+            , "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
          );
       }
       break;  // end 4,6 bpp
 
       case 2:
       {
-         ActiveMask.use  = 0x000000000000ffff;
-         ShiftBpp.use = 24;   // == 3 * 8
-         ShiftRem.use = 40;   // == 64 - 24
-         __asm__ (
-            // Load ActiveMask
-            "movq $ActiveMask, %%mm7     \n\t"
-            // Re-init address pointers and offset
-            "movl _dif, %%ebx            \n\t" // ebx ==> x = offset to alignment boundary
-            "movq $LBCarryMask, %%mm5    \n\t"
-            "movl row, %%edi             \n\t" // edi ==> Avg(x)
-            "movq $HBClearMask, %%mm4    \n\t"
-            "movl prev_row, %%esi        \n\t" // esi ==> Prior(x)
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            "movq -8(%%edi,%%ebx,), %%mm2 \n\t" // Load previous aligned 8 bytes
-                              // (we correct position in loop below)
+         _ActiveMask.use  = 0x000000000000ffffLL;
+         _ShiftBpp.use = 16;   // == 2 * 8
+         _ShiftRem.use = 48;   // == 64 - 16
+
+         __asm__ __volatile__ (
+            // load _ActiveMask
+            "movq _ActiveMask, %%mm7     \n\t"
+            // re-init address pointers and offset
+            "movl _dif, %%ecx            \n\t" // ecx:  x = offset to alignment boundary
+            "movq _LBCarryMask, %%mm5    \n\t"
+// preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
+            "movq _HBClearMask, %%mm4    \n\t"
+// preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
+
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
+                              // (we correct pos. in loop below)
          "avg_2lp:                       \n\t"
-            "movq (%%edi,%%ebx,), %%mm0  \n\t"
-            "psllq $ShiftRem, %%mm2      \n\t" // shift data to position correctly
-            "movq (%%esi,%%ebx,), %%mm1  \n\t"
-            // Add (Prev_row/2) to Average
+            "movq (%%edi,%%ecx,), %%mm0  \n\t"
+            "psrlq _ShiftRem, %%mm2      \n\t" // shift data to pos. correctly
+            "movq (%%esi,%%ecx,), %%mm1  \n\t" //  (GRR BUGFIX:  was psllq)
+            // add (Prev_row/2) to average
             "movq %%mm5, %%mm3           \n\t"
             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each byte
             "movq %%mm7, %%mm6           \n\t"
             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for each byte
-            // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
+
+            // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting LBCarrys
             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte where both
-                              // lsb's were == 1 (Only valid for active group)
+                                               // lsb's were == 1 (only valid for active group)
             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            "pand %%mm6, %%mm2           \n\t" // Leave only Active Group 1 bytes to add to Avg
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
-            // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
-            "psllq $ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover bytes 2 & 3
-            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "psllq $ShiftBpp, %%mm2      \n\t" // shift data to position correctly
-            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting LBCarrys
-            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte where both
-                                // lsb's were == 1 (Only valid for active group)
-            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
-            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            "pand %%mm6, %%mm2           \n\t" // Leave only Active Group 2 bytes to add to Avg
+            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 1 bytes to add to Avg
             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
 
-            // Add rdd active group (Raw(x-bpp)/2) to Average with LBCarry
-            "psllq $ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover bytes 4 & 5
+            // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
+            "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover bytes 2 & 3
             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "psllq $ShiftBpp, %%mm2      \n\t" // shift data to position correctly
-                                // Data only needs to be shifted once here to
-                                // get the correct x-bpp offset.
+            "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting LBCarrys
             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte where both
-                                // lsb's were == 1 (Only valid for active group)
+                                               // lsb's were == 1 (only valid for active group)
             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            "pand %%mm6, %%mm2           \n\t" // Leave only Active Group 2 bytes to add to Avg
+            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2 bytes to add to Avg
             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
 
-            // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
-            "psllq $ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover bytes 6 & 7
+            // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
+            "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover bytes 4 & 5
             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "psllq $ShiftBpp, %%mm2      \n\t" // shift data to position correctly
-                                 // Data only needs to be shifted once here to
-                                 // get the correct x-bpp offset.
-            "addl $8, %%ebx              \n\t"
+            "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting LBCarrys
             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte where both
-                             // lsb's were == 1 (Only valid for active group)
+                                               // lsb's were == 1 (only valid for active group)
             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            "pand %%mm6, %%mm2           \n\t" // Leave only Active Group 2 bytes to add to Avg
+            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 3 bytes to add to Avg
             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
 
-            "cmpl _MMXLength, %%ebx      \n\t"
-            // Now ready to write back to memory
-            "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
-            // Prep Raw(x-bpp) for next loop
+            // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
+            "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover bytes 6 & 7
+            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
+            "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
+            "addl $8, %%ecx              \n\t"
+            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting LBCarrys
+            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte where both
+                                               // lsb's were == 1 (only valid for active group)
+            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
+            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
+            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
+            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 4 bytes to add to Avg
+            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
+
+            "cmpl _MMXLength, %%ecx      \n\t"
+            // now ready to write back to memory
+            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
+            // prep Raw(x-bpp) for next loop
             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
             "jb avg_2lp                  \n\t"
 
-            : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
+            : "=S" (dummy_value_S),            // output regs (dummy)
+              "=D" (dummy_value_D)
 
-            : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
+            : "0" (prev_row),  // esi          // input regs
+              "1" (row)        // edi
 
-            : "%ebx", "%edi", "%esi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" // CHECKASM: clobber list
+            : "%ecx"                           // clobber list
+#if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1", "%mm2", "%mm3"
+            , "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
          );
       }
       break;  // end 2 bpp
 
       case 1:
       {
-         __asm__ (
-            // Re-init address pointers and offset
-            "movl _dif, %%ebx            \n\t" // ebx ==> x = offset to alignment boundary
-            "movl row, %%edi             \n\t" // edi ==> Avg(x)
-            "cmpl _FullLength, %%ebx     \n\t" // Test if offset at end of array
+         __asm__ __volatile__ (
+            // re-init address pointers and offset
+#ifdef __PIC__
+            "pushl %%ebx                 \n\t" // save Global Offset Table index
+#endif
+            "movl _dif, %%ebx            \n\t" // ebx:  x = offset to alignment boundary
+// preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
+            "cmpl _FullLength, %%ebx     \n\t" // test if offset at end of array
             "jnb avg_1end                \n\t"
-            // Do Paeth decode for remaining bytes
-            "movl prev_row, %%esi        \n\t" // esi ==> Prior(x)
+            // do Avg decode for remaining bytes
+// preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
             "movl %%edi, %%edx           \n\t"
-            "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx in loop below
-            "subl bpp, %%edx             \n\t" // edx ==> Raw(x-bpp)
+// preload  "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
+            "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
+            "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
+                                               //  in loop below
          "avg_1lp:                       \n\t"
             // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
             "xorl %%eax, %%eax           \n\t"
@@ -2855,77 +2942,99 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
             "addw %%cx, %%ax             \n\t"
             "incl %%ebx                  \n\t"
             "shrw %%ax                   \n\t" // divide by 2
-            "addb -1(%%edi,%%ebx,), %%al \n\t" // Add Avg(x); -1 to offset inc ebx
-            "cmpl _FullLength, %%ebx     \n\t" // Check if at end of array
-            "movb %%al, -1(%%edi,%%ebx,) \n\t" // Write back Raw(x);
+            "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
+            "cmpl _FullLength, %%ebx     \n\t" // check if at end of array
+            "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
                          // mov does not affect flags; -1 to offset inc ebx
             "jb avg_1lp                  \n\t"
+
          "avg_1end:                      \n\t"
+#ifdef __PIC__
+            "popl %%ebx                  \n\t" // Global Offset Table index
+#endif
 
-            : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
+            : "=c" (dummy_value_c),            // output regs (dummy)
+              "=S" (dummy_value_S),
+              "=D" (dummy_value_D)
 
-            : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
+            : "0" (bpp),       // ecx          // input regs
+              "1" (prev_row),  // esi
+              "2" (row)        // edi
 
-            : "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
+            : "%eax", "%edx"                   // clobber list
+#ifndef __PIC__
+            , "%ebx"
+#endif
          );
       }
       return;  // end 1 bpp
 
       case 8:
       {
-         __asm__ (
-            // Re-init address pointers and offset
-            "movl _dif, %%ebx            \n\t" // ebx ==> x = offset to alignment boundary
-            "movq $LBCarryMask, %%mm5    \n\t"
-            "movl row, %%edi             \n\t" // edi ==> Avg(x)
-            "movq $HBClearMask, %%mm4    \n\t"
-            "movl prev_row, %%esi        \n\t" // esi ==> Prior(x)
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            "movq -8(%%edi,%%ebx,), %%mm2 \n\t" // Load previous aligned 8 bytes
-                                // (NO NEED to correct position in loop below)
+         __asm__ __volatile__ (
+            // re-init address pointers and offset
+            "movl _dif, %%ecx            \n\t" // ecx:  x == offset to alignment
+            "movq _LBCarryMask, %%mm5    \n\t" //            boundary
+// preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
+            "movq _HBClearMask, %%mm4    \n\t"
+// preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
+
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
+                                      // (NO NEED to correct pos. in loop below)
+
          "avg_8lp:                       \n\t"
-            "movq (%%edi,%%ebx,), %%mm0  \n\t"
+            "movq (%%edi,%%ecx,), %%mm0  \n\t"
             "movq %%mm5, %%mm3           \n\t"
-            "movq (%%esi,%%ebx,), %%mm1  \n\t"
-            "addl $8, %%ebx              \n\t"
+            "movq (%%esi,%%ecx,), %%mm1  \n\t"
+            "addl $8, %%ecx              \n\t"
             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
-            "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte where both
-                                // lsb's were == 1
+            "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte
+                                               //  where both lsb's were == 1
             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each byte
-            "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg for each byte
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
-            "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for each byte
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each byte
-            "cmpl _MMXLength, %%ebx      \n\t"
-            "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
+            "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7, each byte
+            "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg, each byte
+            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7, each byte
+            "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg, each
+            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each
+            "cmpl _MMXLength, %%ecx      \n\t"
+            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
             "movq %%mm0, %%mm2           \n\t" // reuse as Raw(x-bpp)
             "jb avg_8lp                  \n\t"
 
-            : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
+            : "=S" (dummy_value_S),            // output regs (dummy)
+              "=D" (dummy_value_D)
 
-            : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
+            : "0" (prev_row),  // esi          // input regs
+              "1" (row)        // edi
 
-            : "%ebx", "%edi", "%esi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5" // CHECKASM: clobber list
+            : "%ecx"                           // clobber list
+#if 0  /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1", "%mm2"
+            , "%mm3", "%mm4", "%mm5"
+#endif
          );
       }
       break;  // end 8 bpp
 
-      default:                  // bpp greater than 8 (!= 1,2,3,4,6,8)
+      default:                  // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
       {
 
-      GRR:  PRINT ERROR HERE:  SHOULD NEVER BE REACHED (unless smaller than 1?)
+         // GRR:  PRINT ERROR HERE:  SHOULD NEVER BE REACHED
+         fprintf(stderr,
+           "libpng:  internal logic error (png_read_filter_row_mmx_avg())\n");
 
-        __asm__ (
-            "movq $LBCarryMask, %%mm5    \n\t"
-            // Re-init address pointers and offset
-            "movl _dif, %%ebx            \n\t" // ebx ==> x = offset to alignment boundary
-            "movl row, %%edi             \n\t" // edi ==> Avg(x)
-            "movq $HBClearMask, %%mm4    \n\t"
+#if 0
+        __asm__ __volatile__ (
+            "movq _LBCarryMask, %%mm5    \n\t"
+            // re-init address pointers and offset
+            "movl _dif, %%ebx            \n\t" // ebx:  x = offset to alignment boundary
+            "movl row, %%edi             \n\t" // edi:  Avg(x)
+            "movq _HBClearMask, %%mm4    \n\t"
             "movl %%edi, %%edx           \n\t"
-            "movl prev_row, %%esi        \n\t" // esi ==> Prior(x)
-            "subl bpp, %%edx             \n\t" // edx ==> Raw(x-bpp)
+            "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
+            "subl bpp, %%edx             \n\t" // edx:  Raw(x-bpp)
          "avg_Alp:                       \n\t"
             "movq (%%edi,%%ebx,), %%mm0  \n\t"
             "movq %%mm5, %%mm3           \n\t"
@@ -2950,24 +3059,32 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
 
             : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
 
-            : "%ebx", "%edx", "%edi", "%esi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5" // CHECKASM: clobber list
+            : "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
          );
+#endif /* 0 - NEVER REACHED */
       }
       break;
-   }                         // end switch ( bpp )
 
-   __asm__ (
-      // MMX acceleration complete now do clean-up
-      // Check if any remaining bytes left to decode
-      "movl _MMXLength, %%ebx      \n\t" // ebx ==> x = offset bytes remaining after MMX
-      "movl row, %%edi             \n\t" // edi ==> Avg(x)
-      "cmpl _FullLength, %%ebx     \n\t" // Test if offset at end of array
+   } // end switch (bpp)
+
+   __asm__ __volatile__ (
+      // MMX acceleration complete; now do clean-up
+      // check if any remaining bytes left to decode
+#ifdef __PIC__
+      "pushl %%ebx                 \n\t" // save index to Global Offset Table
+#endif
+      "movl _MMXLength, %%ebx      \n\t" // ebx:  x == offset bytes after MMX
+//pre "movl row, %%edi             \n\t" // edi:  Avg(x)
+      "cmpl _FullLength, %%ebx     \n\t" // test if offset at end of array
       "jnb avg_end                 \n\t"
-      // Do Paeth decode for remaining bytes
-      "movl prev_row, %%esi        \n\t" // esi ==> Prior(x)
+
+      // do Avg decode for remaining bytes
+//pre "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
       "movl %%edi, %%edx           \n\t"
-      "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx in loop below
-      "subl bpp, %%edx             \n\t" // edx ==> Raw(x-bpp)
+//pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
+      "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
+      "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
+
    "avg_lp2:                       \n\t"
       // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
       "xorl %%eax, %%eax           \n\t"
@@ -2976,39 +3093,64 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
       "addw %%cx, %%ax             \n\t"
       "incl %%ebx                  \n\t"
       "shrw %%ax                   \n\t" // divide by 2
-      "addb -1(%%edi,%%ebx,), %%al \n\t" // Add Avg(x); -1 to offset inc ebx
-      "cmpl _FullLength, %%ebx     \n\t" // Check if at end of array
-      "movb %%al, -1(%%edi,%%ebx,) \n\t" // Write back Raw(x);
-                       // mov does not affect flags; -1 to offset inc ebx
-      "jb avg_lp2                  \n\t"
+      "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
+      "cmpl _FullLength, %%ebx     \n\t" // check if at end of array
+      "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
+      "jb avg_lp2                  \n\t" //  affect flags; -1 to offset inc ebx]
+
    "avg_end:                       \n\t"
-      "emms                        \n\t" // End MMX instructions; prep for possible FP instrs.
+      "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
+#ifdef __PIC__
+      "popl %%ebx                  \n\t" // restore index to Global Offset Table
+#endif
 
-      : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
+      : "=c" (dummy_value_c),            // output regs (dummy)
+        "=S" (dummy_value_S),
+        "=D" (dummy_value_D)
 
-      : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
+      : "0" (bpp),       // ecx          // input regs
+        "1" (prev_row),  // esi
+        "2" (row)        // edi
 
-      : "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
+      : "%eax", "%edx"                   // clobber list
+#ifndef __PIC__
+      , "%ebx"
+#endif
    );
-#endif /* GRR_GCC_MMX_CONVERTED */
-}
+
+} /* end png_read_filter_row_mmx_avg() */
+
+
+
+
+//===========================================================================//
+//                                                                           //
+//         P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H         //
+//                                                                           //
+//===========================================================================//
 
 // Optimized code for PNG Paeth filter decoder
-void /* PRIVATE */
+
+static void /* PRIVATE */
 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
                               png_bytep prev_row)
 {
-#ifdef GRR_GCC_MMX_CONVERTED
    int bpp;
-   int patemp, pbtemp, pctemp;
+   int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
+   int dummy_value_S;
+   int dummy_value_D;
 
    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
    _FullLength  = row_info->rowbytes; // # of bytes to filter
-   __asm__ (
-      "xorl %%ebx, %%ebx           \n\t" // ebx ==> x offset
-      "movl row, %%edi             \n\t"
-      "xorl %%edx, %%edx           \n\t" // edx ==> x-bpp offset
-      "movl prev_row, %%esi        \n\t"
+
+   __asm__ __volatile__ (
+#ifdef __PIC__
+      "pushl %%ebx                 \n\t" // save index to Global Offset Table
+#endif
+      "xorl %%ebx, %%ebx           \n\t" // ebx:  x offset
+//pre "movl row, %%edi             \n\t"
+      "xorl %%edx, %%edx           \n\t" // edx:  x-bpp offset
+//pre "movl prev_row, %%esi        \n\t"
       "xorl %%eax, %%eax           \n\t"
 
       // Compute the Raw value for the first bpp bytes
@@ -3018,7 +3160,8 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
       "movb (%%edi,%%ebx,), %%al   \n\t"
       "addb (%%esi,%%ebx,), %%al   \n\t"
       "incl %%ebx                  \n\t"
-      "cmpl bpp, %%ebx             \n\t"
+//pre "cmpl bpp, %%ebx             \n\t" (bpp is preloaded into ecx)
+      "cmpl %%ecx, %%ebx           \n\t"
       "movb %%al, -1(%%edi,%%ebx,) \n\t"
       "jb paeth_rlp                \n\t"
       // get # of bytes to alignment
@@ -3030,62 +3173,70 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
       "subl %%edi, _dif            \n\t" // subtract from start ==> value ebx at alignment
       "jz paeth_go                 \n\t"
       // fix alignment
+
    "paeth_lp1:                     \n\t"
       "xorl %%eax, %%eax           \n\t"
       // pav = p - a = (a + b - c) - a = b - c
       "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
-      "movl %%eax, patemp          \n\t" // Save pav for later use
+      "movl %%eax, _patemp         \n\t" // Save pav for later use
       "xorl %%eax, %%eax           \n\t"
       // pbv = p - b = (a + b - c) - b = a - c
       "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
       "movl %%eax, %%ecx           \n\t"
       // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-      "addl patemp, %%eax          \n\t" // pcv = pav + pbv
+      "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
       // pc = abs(pcv)
       "testl $0x80000000, %%eax    \n\t"
       "jz paeth_pca                \n\t"
       "negl %%eax                  \n\t" // reverse sign of neg values
+
    "paeth_pca:                     \n\t"
-      "movl %%eax, pctemp          \n\t" // save pc for later use
+      "movl %%eax, _pctemp         \n\t" // save pc for later use
       // pb = abs(pbv)
       "testl $0x80000000, %%ecx    \n\t"
       "jz paeth_pba                \n\t"
       "negl %%ecx                  \n\t" // reverse sign of neg values
+
    "paeth_pba:                     \n\t"
-      "movl %%ecx, pbtemp          \n\t" // save pb for later use
+      "movl %%ecx, _pbtemp         \n\t" // save pb for later use
       // pa = abs(pav)
-      "movl patemp, %%eax          \n\t"
+      "movl _patemp, %%eax         \n\t"
       "testl $0x80000000, %%eax    \n\t"
       "jz paeth_paa                \n\t"
       "negl %%eax                  \n\t" // reverse sign of neg values
+
    "paeth_paa:                     \n\t"
-      "movl %%eax, patemp          \n\t" // save pa for later use
+      "movl %%eax, _patemp         \n\t" // save pa for later use
       // test if pa <= pb
       "cmpl %%ecx, %%eax           \n\t"
       "jna paeth_abb               \n\t"
       // pa > pb; now test if pb <= pc
-      "cmpl pctemp, %%ecx          \n\t"
+      "cmpl _pctemp, %%ecx         \n\t"
       "jna paeth_bbc               \n\t"
       // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
       "jmp paeth_paeth             \n\t"
+
    "paeth_bbc:                     \n\t"
       // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
       "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
       "jmp paeth_paeth             \n\t"
+
    "paeth_abb:                     \n\t"
       // pa <= pb; now test if pa <= pc
-      "cmpl pctemp, %%eax          \n\t"
+      "cmpl _pctemp, %%eax         \n\t"
       "jna paeth_abc               \n\t"
       // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
       "jmp paeth_paeth             \n\t"
+
    "paeth_abc:                     \n\t"
       // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
       "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
+
    "paeth_paeth:                   \n\t"
       "incl %%ebx                  \n\t"
       "incl %%edx                  \n\t"
@@ -3093,6 +3244,7 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
       "addb %%cl, -1(%%edi,%%ebx,) \n\t"
       "cmpl _dif, %%ebx            \n\t"
       "jb paeth_lp1                \n\t"
+
    "paeth_go:                      \n\t"
       "movl _FullLength, %%ecx     \n\t"
       "movl %%ecx, %%eax           \n\t"
@@ -3100,40 +3252,51 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
       "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
       "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
       "movl %%ecx, _MMXLength      \n\t"
+#ifdef __PIC__
+      "popl %%ebx                  \n\t" // restore index to Global Offset Table
+#endif
 
-      : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
+      : "=c" (dummy_value_c),            // output regs (dummy)
+        "=S" (dummy_value_S),
+        "=D" (dummy_value_D)
 
-      : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
+      : "0" (bpp),       // ecx          // input regs
+        "1" (prev_row),  // esi
+        "2" (row)        // edi
 
-      : "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
+      : "%eax", "%edx"                   // clobber list
+#ifndef __PIC__
+      , "%ebx"
+#endif
    );
 
-   // Now do the math for the rest of the row
-   switch ( bpp )
+   // now do the math for the rest of the row
+   switch (bpp)
    {
       case 3:
       {
-         ActiveMask.use = 0x0000000000ffffff;
-         ActiveMaskEnd.use = 0xffff000000000000;
-         ShiftBpp.use = 24;    // == bpp(3) * 8
-         ShiftRem.use = 40;    // == 64 - 24
-         __asm__ (
-            "movl _dif, %%ebx            \n\t"
-            "movl row, %%edi             \n\t"
-            "movl prev_row, %%esi        \n\t"
+         _ActiveMask.use = 0x0000000000ffffffLL;
+         _ActiveMaskEnd.use = 0xffff000000000000LL;
+         _ShiftBpp.use = 24;    // == bpp(3) * 8
+         _ShiftRem.use = 40;    // == 64 - 24
+
+         __asm__ __volatile__ (
+            "movl _dif, %%ecx            \n\t"
+// preload  "movl row, %%edi             \n\t"
+// preload  "movl prev_row, %%esi        \n\t"
             "pxor %%mm0, %%mm0           \n\t"
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            "movq -8(%%edi,%%ebx,), %%mm1 \n\t"
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
          "paeth_3lp:                     \n\t"
-            "psrlq $ShiftRem, %%mm1      \n\t" // shift last 3 bytes to 1st 3 bytes
-            "movq (%%esi,%%ebx,), %%mm2  \n\t" // load b=Prior(x)
-            "punpcklbw %%mm0, %%mm1      \n\t" // Unpack High bytes of a
-            "movq -8(%%esi,%%ebx,), %%mm3 \n\t" // Prep c=Prior(x-bpp) bytes
-            "punpcklbw %%mm0, %%mm2      \n\t" // Unpack High bytes of b
-            "psrlq $ShiftRem, %%mm3      \n\t" // shift last 3 bytes to 1st 3 bytes
+            "psrlq _ShiftRem, %%mm1      \n\t" // shift last 3 bytes to 1st 3 bytes
+            "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
+            "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
+            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
+            "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
+            "psrlq _ShiftRem, %%mm3      \n\t" // shift last 3 bytes to 1st 3 bytes
             // pav = p - a = (a + b - c) - a = b - c
             "movq %%mm2, %%mm4           \n\t"
-            "punpcklbw %%mm0, %%mm3      \n\t" // Unpack High bytes of c
+            "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
             // pbv = p - b = (a + b - c) - b = a - c
             "movq %%mm1, %%mm5           \n\t"
             "psubw %%mm3, %%mm4          \n\t"
@@ -3145,17 +3308,17 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
             // pa = abs(p-a) = abs(pav)
             // pb = abs(p-b) = abs(pbv)
             // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // Create mask pav bytes < 0
+            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
             "paddw %%mm5, %%mm6          \n\t"
-            "pand %%mm4, %%mm0           \n\t" // Only pav bytes < 0 in mm7
-            "pcmpgtw %%mm5, %%mm7        \n\t" // Create mask pbv bytes < 0
+            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
+            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
             "psubw %%mm0, %%mm4          \n\t"
-            "pand %%mm5, %%mm7           \n\t" // Only pbv bytes < 0 in mm0
+            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
             "psubw %%mm0, %%mm4          \n\t"
             "psubw %%mm7, %%mm5          \n\t"
             "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // Create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // Only pav bytes < 0 in mm7
+            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
+            "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
             "psubw %%mm7, %%mm5          \n\t"
             "psubw %%mm0, %%mm6          \n\t"
             //  test pa <= pb
@@ -3179,18 +3342,18 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
             "paddw %%mm3, %%mm7          \n\t"
             "pxor %%mm0, %%mm0           \n\t"
             "packuswb %%mm1, %%mm7       \n\t"
-            "movq (%%esi,%%ebx,), %%mm3  \n\t" // load c=Prior(x-bpp)
-            "pand $ActiveMask, %%mm7     \n\t"
+            "movq (%%esi,%%ecx,), %%mm3  \n\t" // load c=Prior(x-bpp)
+            "pand _ActiveMask, %%mm7     \n\t"
             "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
-            "paddb (%%edi,%%ebx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
-            "punpcklbw %%mm0, %%mm3      \n\t" // Unpack High bytes of c
-            "movq %%mm7, (%%edi,%%ebx,)  \n\t" // write back updated value
-            "movq %%mm7, %%mm1           \n\t" // Now mm1 will be used as Raw(x-bpp)
-            // Now do Paeth for 2nd set of bytes (3-5)
-            "psrlq $ShiftBpp, %%mm2      \n\t" // load b=Prior(x) step 2
-            "punpcklbw %%mm0, %%mm1      \n\t" // Unpack High bytes of a
+            "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
+            "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
+            "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
+            "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as Raw(x-bpp)
+            // now do Paeth for 2nd set of bytes (3-5)
+            "psrlq _ShiftBpp, %%mm2      \n\t" // load b=Prior(x) step 2
+            "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
             "pxor %%mm7, %%mm7           \n\t"
-            "punpcklbw %%mm0, %%mm2      \n\t" // Unpack High bytes of b
+            "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
             // pbv = p - b = (a + b - c) - b = a - c
             "movq %%mm1, %%mm5           \n\t"
             // pav = p - a = (a + b - c) - a = b - c
@@ -3205,17 +3368,17 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
             // pa = abs(p-a) = abs(pav)
             // pb = abs(p-b) = abs(pbv)
             // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm5, %%mm0        \n\t" // Create mask pbv bytes < 0
-            "pcmpgtw %%mm4, %%mm7        \n\t" // Create mask pav bytes < 0
-            "pand %%mm5, %%mm0           \n\t" // Only pbv bytes < 0 in mm0
-            "pand %%mm4, %%mm7           \n\t" // Only pav bytes < 0 in mm7
+            "pcmpgtw %%mm5, %%mm0        \n\t" // create mask pbv bytes < 0
+            "pcmpgtw %%mm4, %%mm7        \n\t" // create mask pav bytes < 0
+            "pand %%mm5, %%mm0           \n\t" // only pbv bytes < 0 in mm0
+            "pand %%mm4, %%mm7           \n\t" // only pav bytes < 0 in mm7
             "psubw %%mm0, %%mm5          \n\t"
             "psubw %%mm7, %%mm4          \n\t"
             "psubw %%mm0, %%mm5          \n\t"
             "psubw %%mm7, %%mm4          \n\t"
             "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // Create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // Only pav bytes < 0 in mm7
+            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
+            "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
             "psubw %%mm0, %%mm6          \n\t"
             //  test pa <= pb
             "movq %%mm4, %%mm7           \n\t"
@@ -3232,7 +3395,7 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
             "paddw %%mm2, %%mm0          \n\t"
             //  test  ((pa <= pb)? pa:pb) <= pc
             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
-            "movq (%%esi,%%ebx,), %%mm2  \n\t" // load b=Prior(x)
+            "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
             "pand %%mm7, %%mm3           \n\t"
             "pandn %%mm0, %%mm7          \n\t"
             "pxor %%mm1, %%mm1           \n\t"
@@ -3240,21 +3403,21 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
             "pxor %%mm0, %%mm0           \n\t"
             "packuswb %%mm1, %%mm7       \n\t"
             "movq %%mm2, %%mm3           \n\t" // load c=Prior(x-bpp) step 1
-            "pand $ActiveMask, %%mm7     \n\t"
-            "punpckhbw %%mm0, %%mm2      \n\t" // Unpack High bytes of b
-            "psllq $ShiftBpp, %%mm7      \n\t" // Shift bytes to 2nd group of 3 bytes
+            "pand _ActiveMask, %%mm7     \n\t"
+            "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
+            "psllq _ShiftBpp, %%mm7      \n\t" // shift bytes to 2nd group of 3 bytes
              // pav = p - a = (a + b - c) - a = b - c
             "movq %%mm2, %%mm4           \n\t"
-            "paddb (%%edi,%%ebx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
-            "psllq $ShiftBpp, %%mm3      \n\t" // load c=Prior(x-bpp) step 2
-            "movq %%mm7, (%%edi,%%ebx,)  \n\t" // write back updated value
+            "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
+            "psllq _ShiftBpp, %%mm3      \n\t" // load c=Prior(x-bpp) step 2
+            "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
             "movq %%mm7, %%mm1           \n\t"
-            "punpckhbw %%mm0, %%mm3      \n\t" // Unpack High bytes of c
-            "psllq $ShiftBpp, %%mm1      \n\t" // Shift bytes
-                                    // Now mm1 will be used as Raw(x-bpp)
-            // Now do Paeth for 3rd, and final, set of bytes (6-7)
+            "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
+            "psllq _ShiftBpp, %%mm1      \n\t" // shift bytes
+                                    // now mm1 will be used as Raw(x-bpp)
+            // now do Paeth for 3rd, and final, set of bytes (6-7)
             "pxor %%mm7, %%mm7           \n\t"
-            "punpckhbw %%mm0, %%mm1      \n\t" // Unpack High bytes of a
+            "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
             "psubw %%mm3, %%mm4          \n\t"
             // pbv = p - b = (a + b - c) - b = a - c
             "movq %%mm1, %%mm5           \n\t"
@@ -3267,17 +3430,17 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
             // pa = abs(p-a) = abs(pav)
             // pb = abs(p-b) = abs(pbv)
             // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // Create mask pav bytes < 0
-            "pcmpgtw %%mm5, %%mm7        \n\t" // Create mask pbv bytes < 0
-            "pand %%mm4, %%mm0           \n\t" // Only pav bytes < 0 in mm7
-            "pand %%mm5, %%mm7           \n\t" // Only pbv bytes < 0 in mm0
+            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
+            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
+            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
+            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
             "psubw %%mm0, %%mm4          \n\t"
             "psubw %%mm7, %%mm5          \n\t"
             "psubw %%mm0, %%mm4          \n\t"
             "psubw %%mm7, %%mm5          \n\t"
             "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // Create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // Only pav bytes < 0 in mm7
+            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
+            "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
             "psubw %%mm0, %%mm6          \n\t"
             //  test pa <= pb
             "movq %%mm4, %%mm7           \n\t"
@@ -3299,55 +3462,63 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
             "paddw %%mm3, %%mm7          \n\t"
             "pxor %%mm1, %%mm1           \n\t"
             "packuswb %%mm7, %%mm1       \n\t"
-            // Step ebx to next set of 8 bytes and repeat loop til done
-            "addl $8, %%ebx              \n\t"
-            "pand $ActiveMaskEnd, %%mm1  \n\t"
-            "paddb -8(%%edi,%%ebx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
+            // step ecx to next set of 8 bytes and repeat loop til done
+            "addl $8, %%ecx              \n\t"
+            "pand _ActiveMaskEnd, %%mm1  \n\t"
+            "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
 
-            "cmpl _MMXLength, %%ebx      \n\t"
+            "cmpl _MMXLength, %%ecx      \n\t"
             "pxor %%mm0, %%mm0           \n\t" // pxor does not affect flags
-            "movq %%mm1, -8(%%edi,%%ebx,) \n\t" // write back updated value
+            "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
                                  // mm1 will be used as Raw(x-bpp) next loop
                            // mm3 ready to be used as Prior(x-bpp) next loop
             "jb paeth_3lp                \n\t"
 
-            : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
+            : "=S" (dummy_value_S),             // output regs (dummy)
+              "=D" (dummy_value_D)
 
-            : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
+            : "0" (prev_row),  // esi           // input regs
+              "1" (row)        // edi
 
-            : "%ebx", "%edi", "%esi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" // CHECKASM: clobber list
+            : "%ecx"                            // clobber list
+#if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1", "%mm2", "%mm3"
+            , "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
          );
       }
-      break;
+      break;  // end 3 bpp
 
       case 6:
       //case 7:   // GRR BOGUS
       //case 5:   // GRR BOGUS
       {
-         ActiveMask.use  = 0x00000000ffffffff;
-         ActiveMask2.use = 0xffffffff00000000;
-         ShiftBpp.use = bpp << 3;    // == bpp * 8
-         ShiftRem.use = 64 - ShiftBpp.use;
-         __asm__ (
-            "movl _dif, %%ebx            \n\t"
-            "movl row, %%edi             \n\t"
-            "movl prev_row, %%esi        \n\t"
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            "movq -8(%%edi,%%ebx,), %%mm1 \n\t"
+         _ActiveMask.use  = 0x00000000ffffffffLL;
+         _ActiveMask2.use = 0xffffffff00000000LL;
+         _ShiftBpp.use = bpp << 3;    // == bpp * 8
+         _ShiftRem.use = 64 - _ShiftBpp.use;
+
+         __asm__ __volatile__ (
+            "movl _dif, %%ecx            \n\t"
+// preload  "movl row, %%edi             \n\t"
+// preload  "movl prev_row, %%esi        \n\t"
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
             "pxor %%mm0, %%mm0           \n\t"
+
          "paeth_6lp:                     \n\t"
-            // Must shift to position Raw(x-bpp) data
-            "psrlq $ShiftRem, %%mm1      \n\t"
-            // Do first set of 4 bytes
-            "movq -8(%%esi,%%ebx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
-            "punpcklbw %%mm0, %%mm1      \n\t" // Unpack Low bytes of a
-            "movq (%%esi,%%ebx,), %%mm2  \n\t" // load b=Prior(x)
-            "punpcklbw %%mm0, %%mm2      \n\t" // Unpack Low bytes of b
-            // Must shift to position Prior(x-bpp) data
-            "psrlq $ShiftRem, %%mm3      \n\t"
+            // must shift to position Raw(x-bpp) data
+            "psrlq _ShiftRem, %%mm1      \n\t"
+            // do first set of 4 bytes
+            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
+            "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
+            "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
+            "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
+            // must shift to position Prior(x-bpp) data
+            "psrlq _ShiftRem, %%mm3      \n\t"
             // pav = p - a = (a + b - c) - a = b - c
             "movq %%mm2, %%mm4           \n\t"
-            "punpcklbw %%mm0, %%mm3      \n\t" // Unpack Low bytes of c
+            "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
             // pbv = p - b = (a + b - c) - b = a - c
             "movq %%mm1, %%mm5           \n\t"
             "psubw %%mm3, %%mm4          \n\t"
@@ -3358,17 +3529,17 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
             // pa = abs(p-a) = abs(pav)
             // pb = abs(p-b) = abs(pbv)
             // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // Create mask pav bytes < 0
+            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
             "paddw %%mm5, %%mm6          \n\t"
-            "pand %%mm4, %%mm0           \n\t" // Only pav bytes < 0 in mm7
-            "pcmpgtw %%mm5, %%mm7        \n\t" // Create mask pbv bytes < 0
+            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
+            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
             "psubw %%mm0, %%mm4          \n\t"
-            "pand %%mm5, %%mm7           \n\t" // Only pbv bytes < 0 in mm0
+            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
             "psubw %%mm0, %%mm4          \n\t"
             "psubw %%mm7, %%mm5          \n\t"
             "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // Create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // Only pav bytes < 0 in mm7
+            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
+            "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
             "psubw %%mm7, %%mm5          \n\t"
             "psubw %%mm0, %%mm6          \n\t"
             //  test pa <= pb
@@ -3392,24 +3563,24 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
             "paddw %%mm3, %%mm7          \n\t"
             "pxor %%mm0, %%mm0           \n\t"
             "packuswb %%mm1, %%mm7       \n\t"
-            "movq -8(%%esi,%%ebx,), %%mm3 \n\t" // load c=Prior(x-bpp)
-            "pand $ActiveMask, %%mm7     \n\t"
-            "psrlq $ShiftRem, %%mm3      \n\t"
-            "movq (%%esi,%%ebx,), %%mm2  \n\t" // load b=Prior(x) step 1
-            "paddb (%%edi,%%ebx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
+            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
+            "pand _ActiveMask, %%mm7     \n\t"
+            "psrlq _ShiftRem, %%mm3      \n\t"
+            "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x) step 1
+            "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
             "movq %%mm2, %%mm6           \n\t"
-            "movq %%mm7, (%%edi,%%ebx,)  \n\t" // write back updated value
-            "movq -8(%%edi,%%ebx,), %%mm1 \n\t"
-            "psllq $ShiftBpp, %%mm6      \n\t"
+            "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
+            "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
+            "psllq _ShiftBpp, %%mm6      \n\t"
             "movq %%mm7, %%mm5           \n\t"
-            "psrlq $ShiftRem, %%mm1      \n\t"
+            "psrlq _ShiftRem, %%mm1      \n\t"
             "por %%mm6, %%mm3            \n\t"
-            "psllq $ShiftBpp, %%mm5      \n\t"
-            "punpckhbw %%mm0, %%mm3      \n\t" // Unpack High bytes of c
+            "psllq _ShiftBpp, %%mm5      \n\t"
+            "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
             "por %%mm5, %%mm1            \n\t"
-            // Do second set of 4 bytes
-            "punpckhbw %%mm0, %%mm2      \n\t" // Unpack High bytes of b
-            "punpckhbw %%mm0, %%mm1      \n\t" // Unpack High bytes of a
+            // do second set of 4 bytes
+            "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
+            "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
             // pav = p - a = (a + b - c) - a = b - c
             "movq %%mm2, %%mm4           \n\t"
             // pbv = p - b = (a + b - c) - b = a - c
@@ -3422,17 +3593,17 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
             // pa = abs(p-a) = abs(pav)
             // pb = abs(p-b) = abs(pbv)
             // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // Create mask pav bytes < 0
+            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
             "paddw %%mm5, %%mm6          \n\t"
-            "pand %%mm4, %%mm0           \n\t" // Only pav bytes < 0 in mm7
-            "pcmpgtw %%mm5, %%mm7        \n\t" // Create mask pbv bytes < 0
+            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
+            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
             "psubw %%mm0, %%mm4          \n\t"
-            "pand %%mm5, %%mm7           \n\t" // Only pbv bytes < 0 in mm0
+            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
             "psubw %%mm0, %%mm4          \n\t"
             "psubw %%mm7, %%mm5          \n\t"
             "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // Create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // Only pav bytes < 0 in mm7
+            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
+            "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
             "psubw %%mm7, %%mm5          \n\t"
             "psubw %%mm0, %%mm6          \n\t"
             //  test pa <= pb
@@ -3456,44 +3627,51 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
             "pxor %%mm1, %%mm1           \n\t"
             "paddw %%mm3, %%mm7          \n\t"
             "pxor %%mm0, %%mm0           \n\t"
-            // Step ex to next set of 8 bytes and repeat loop til done
-            "addl $8, %%ebx              \n\t"
+            // step ecx to next set of 8 bytes and repeat loop til done
+            "addl $8, %%ecx              \n\t"
             "packuswb %%mm7, %%mm1       \n\t"
-            "paddb -8(%%edi,%%ebx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
-            "cmpl _MMXLength, %%ebx      \n\t"
-            "movq %%mm1, -8(%%edi,%%ebx,) \n\t" // write back updated value
+            "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
+            "cmpl _MMXLength, %%ecx      \n\t"
+            "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
                                 // mm1 will be used as Raw(x-bpp) next loop
             "jb paeth_6lp                \n\t"
 
-            : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
+            : "=S" (dummy_value_S),             // output regs (dummy)
+              "=D" (dummy_value_D)
 
-            : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
+            : "0" (prev_row),  // esi           // input regs
+              "1" (row)        // edi
 
-            : "%ebx", "%edi", "%esi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" // CHECKASM: clobber list
+            : "%ecx"                            // clobber list
+#if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1", "%mm2", "%mm3"
+            , "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
          );
       }
-      break;
+      break;  // end 6 bpp
 
       case 4:
       {
-         ActiveMask.use  = 0x00000000ffffffff;
-         __asm__ (
-            "movl _dif, %%ebx            \n\t"
-            "movl row, %%edi             \n\t"
-            "movl prev_row, %%esi        \n\t"
+         _ActiveMask.use  = 0x00000000ffffffffLL;
+
+         __asm__ __volatile__ (
+            "movl _dif, %%ecx            \n\t"
+// preload  "movl row, %%edi             \n\t"
+// preload  "movl prev_row, %%esi        \n\t"
             "pxor %%mm0, %%mm0           \n\t"
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            "movq -8(%%edi,%%ebx,), %%mm1 \n\t" // Only time should need to read
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
                                      //  a=Raw(x-bpp) bytes
          "paeth_4lp:                     \n\t"
-            // Do first set of 4 bytes
-            "movq -8(%%esi,%%ebx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
-            "punpckhbw %%mm0, %%mm1      \n\t" // Unpack Low bytes of a
-            "movq (%%esi,%%ebx,), %%mm2  \n\t" // load b=Prior(x)
-            "punpcklbw %%mm0, %%mm2      \n\t" // Unpack High bytes of b
+            // do first set of 4 bytes
+            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
+            "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
+            "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
+            "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
             // pav = p - a = (a + b - c) - a = b - c
             "movq %%mm2, %%mm4           \n\t"
-            "punpckhbw %%mm0, %%mm3      \n\t" // Unpack High bytes of c
+            "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
             // pbv = p - b = (a + b - c) - b = a - c
             "movq %%mm1, %%mm5           \n\t"
             "psubw %%mm3, %%mm4          \n\t"
@@ -3504,17 +3682,17 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
             // pa = abs(p-a) = abs(pav)
             // pb = abs(p-b) = abs(pbv)
             // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // Create mask pav bytes < 0
+            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
             "paddw %%mm5, %%mm6          \n\t"
-            "pand %%mm4, %%mm0           \n\t" // Only pav bytes < 0 in mm7
-            "pcmpgtw %%mm5, %%mm7        \n\t" // Create mask pbv bytes < 0
+            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
+            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
             "psubw %%mm0, %%mm4          \n\t"
-            "pand %%mm5, %%mm7           \n\t" // Only pbv bytes < 0 in mm0
+            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
             "psubw %%mm0, %%mm4          \n\t"
             "psubw %%mm7, %%mm5          \n\t"
             "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // Create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // Only pav bytes < 0 in mm7
+            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
+            "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
             "psubw %%mm7, %%mm5          \n\t"
             "psubw %%mm0, %%mm6          \n\t"
             //  test pa <= pb
@@ -3538,16 +3716,16 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
             "paddw %%mm3, %%mm7          \n\t"
             "pxor %%mm0, %%mm0           \n\t"
             "packuswb %%mm1, %%mm7       \n\t"
-            "movq (%%esi,%%ebx,), %%mm3  \n\t" // load c=Prior(x-bpp)
-            "pand $ActiveMask, %%mm7     \n\t"
+            "movq (%%esi,%%ecx,), %%mm3  \n\t" // load c=Prior(x-bpp)
+            "pand _ActiveMask, %%mm7     \n\t"
             "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
-            "paddb (%%edi,%%ebx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
-            "punpcklbw %%mm0, %%mm3      \n\t" // Unpack High bytes of c
-            "movq %%mm7, (%%edi,%%ebx,)  \n\t" // write back updated value
-            "movq %%mm7, %%mm1           \n\t" // Now mm1 will be used as Raw(x-bpp)
-            // Do second set of 4 bytes
-            "punpckhbw %%mm0, %%mm2      \n\t" // Unpack Low bytes of b
-            "punpcklbw %%mm0, %%mm1      \n\t" // Unpack Low bytes of a
+            "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
+            "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
+            "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
+            "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as Raw(x-bpp)
+            // do second set of 4 bytes
+            "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
+            "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
             // pav = p - a = (a + b - c) - a = b - c
             "movq %%mm2, %%mm4           \n\t"
             // pbv = p - b = (a + b - c) - b = a - c
@@ -3560,17 +3738,17 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
             // pa = abs(p-a) = abs(pav)
             // pb = abs(p-b) = abs(pbv)
             // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // Create mask pav bytes < 0
+            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
             "paddw %%mm5, %%mm6          \n\t"
-            "pand %%mm4, %%mm0           \n\t" // Only pav bytes < 0 in mm7
-            "pcmpgtw %%mm5, %%mm7        \n\t" // Create mask pbv bytes < 0
+            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
+            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
             "psubw %%mm0, %%mm4          \n\t"
-            "pand %%mm5, %%mm7           \n\t" // Only pbv bytes < 0 in mm0
+            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
             "psubw %%mm0, %%mm4          \n\t"
             "psubw %%mm7, %%mm5          \n\t"
             "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // Create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // Only pav bytes < 0 in mm7
+            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
+            "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
             "psubw %%mm7, %%mm5          \n\t"
             "psubw %%mm0, %%mm6          \n\t"
             //  test pa <= pb
@@ -3594,43 +3772,51 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
             "pxor %%mm1, %%mm1           \n\t"
             "paddw %%mm3, %%mm7          \n\t"
             "pxor %%mm0, %%mm0           \n\t"
-            // Step ex to next set of 8 bytes and repeat loop til done
-            "addl $8, %%ebx              \n\t"
+            // step ecx to next set of 8 bytes and repeat loop til done
+            "addl $8, %%ecx              \n\t"
             "packuswb %%mm7, %%mm1       \n\t"
-            "paddb -8(%%edi,%%ebx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
-            "cmpl _MMXLength, %%ebx      \n\t"
-            "movq %%mm1, -8(%%edi,%%ebx,) \n\t" // write back updated value
+            "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
+            "cmpl _MMXLength, %%ecx      \n\t"
+            "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
                                 // mm1 will be used as Raw(x-bpp) next loop
             "jb paeth_4lp                \n\t"
 
-            : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
+            : "=S" (dummy_value_S),             // output regs (dummy)
+              "=D" (dummy_value_D)
 
-            : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
+            : "0" (prev_row),  // esi           // input regs
+              "1" (row)        // edi
 
-            : "%ebx", "%edi", "%esi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" // CHECKASM: clobber list
+            : "%ecx"                            // clobber list
+#if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1", "%mm2", "%mm3"
+            , "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
          );
       }
-      break;
+      break;  // end 4 bpp
+
       case 8:                          // bpp == 8
       {
-         ActiveMask.use  = 0x00000000ffffffff;
-         __asm__ (
-            "movl _dif, %%ebx            \n\t"
-            "movl row, %%edi             \n\t"
-            "movl prev_row, %%esi        \n\t"
+         _ActiveMask.use  = 0x00000000ffffffffLL;
+
+         __asm__ __volatile__ (
+            "movl _dif, %%ecx            \n\t"
+// preload  "movl row, %%edi             \n\t"
+// preload  "movl prev_row, %%esi        \n\t"
             "pxor %%mm0, %%mm0           \n\t"
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            "movq -8(%%edi,%%ebx,), %%mm1 \n\t" // Only time should need to read
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
                                        //  a=Raw(x-bpp) bytes
          "paeth_8lp:                     \n\t"
-            // Do first set of 4 bytes
-            "movq -8(%%esi,%%ebx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
-            "punpcklbw %%mm0, %%mm1      \n\t" // Unpack Low bytes of a
-            "movq (%%esi,%%ebx,), %%mm2  \n\t" // load b=Prior(x)
-            "punpcklbw %%mm0, %%mm2      \n\t" // Unpack Low bytes of b
+            // do first set of 4 bytes
+            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
+            "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
+            "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
+            "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
             // pav = p - a = (a + b - c) - a = b - c
             "movq %%mm2, %%mm4           \n\t"
-            "punpcklbw %%mm0, %%mm3      \n\t" // Unpack Low bytes of c
+            "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
             // pbv = p - b = (a + b - c) - b = a - c
             "movq %%mm1, %%mm5           \n\t"
             "psubw %%mm3, %%mm4          \n\t"
@@ -3641,17 +3827,17 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
             // pa = abs(p-a) = abs(pav)
             // pb = abs(p-b) = abs(pbv)
             // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // Create mask pav bytes < 0
+            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
             "paddw %%mm5, %%mm6          \n\t"
-            "pand %%mm4, %%mm0           \n\t" // Only pav bytes < 0 in mm7
-            "pcmpgtw %%mm5, %%mm7        \n\t" // Create mask pbv bytes < 0
+            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
+            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
             "psubw %%mm0, %%mm4          \n\t"
-            "pand %%mm5, %%mm7           \n\t" // Only pbv bytes < 0 in mm0
+            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
             "psubw %%mm0, %%mm4          \n\t"
             "psubw %%mm7, %%mm5          \n\t"
             "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // Create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // Only pav bytes < 0 in mm7
+            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
+            "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
             "psubw %%mm7, %%mm5          \n\t"
             "psubw %%mm0, %%mm6          \n\t"
             //  test pa <= pb
@@ -3675,17 +3861,17 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
             "paddw %%mm3, %%mm7          \n\t"
             "pxor %%mm0, %%mm0           \n\t"
             "packuswb %%mm1, %%mm7       \n\t"
-            "movq -8(%%esi,%%ebx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
-            "pand $ActiveMask, %%mm7     \n\t"
-            "movq (%%esi,%%ebx,), %%mm2  \n\t" // load b=Prior(x)
-            "paddb (%%edi,%%ebx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
-            "punpckhbw %%mm0, %%mm3      \n\t" // Unpack High bytes of c
-            "movq %%mm7, (%%edi,%%ebx,)  \n\t" // write back updated value
-            "movq -8(%%edi,%%ebx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
+            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
+            "pand _ActiveMask, %%mm7     \n\t"
+            "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
+            "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
+            "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
+            "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
+            "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
 
-            // Do second set of 4 bytes
-            "punpckhbw %%mm0, %%mm2      \n\t" // Unpack High bytes of b
-            "punpckhbw %%mm0, %%mm1      \n\t" // Unpack High bytes of a
+            // do second set of 4 bytes
+            "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
+            "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
             // pav = p - a = (a + b - c) - a = b - c
             "movq %%mm2, %%mm4           \n\t"
             // pbv = p - b = (a + b - c) - b = a - c
@@ -3698,17 +3884,17 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
             // pa = abs(p-a) = abs(pav)
             // pb = abs(p-b) = abs(pbv)
             // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // Create mask pav bytes < 0
+            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
             "paddw %%mm5, %%mm6          \n\t"
-            "pand %%mm4, %%mm0           \n\t" // Only pav bytes < 0 in mm7
-            "pcmpgtw %%mm5, %%mm7        \n\t" // Create mask pbv bytes < 0
+            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
+            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
             "psubw %%mm0, %%mm4          \n\t"
-            "pand %%mm5, %%mm7           \n\t" // Only pbv bytes < 0 in mm0
+            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
             "psubw %%mm0, %%mm4          \n\t"
             "psubw %%mm7, %%mm5          \n\t"
             "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // Create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // Only pav bytes < 0 in mm7
+            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
+            "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
             "psubw %%mm7, %%mm5          \n\t"
             "psubw %%mm0, %%mm6          \n\t"
             //  test pa <= pb
@@ -3732,94 +3918,113 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
             "pxor %%mm1, %%mm1           \n\t"
             "paddw %%mm3, %%mm7          \n\t"
             "pxor %%mm0, %%mm0           \n\t"
-            // Step ex to next set of 8 bytes and repeat loop til done
-            "addl $8, %%ebx              \n\t"
+            // step ecx to next set of 8 bytes and repeat loop til done
+            "addl $8, %%ecx              \n\t"
             "packuswb %%mm7, %%mm1       \n\t"
-            "paddb -8(%%edi,%%ebx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
-            "cmpl _MMXLength, %%ebx      \n\t"
-            "movq %%mm1, -8(%%edi,%%ebx,) \n\t" // write back updated value
+            "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
+            "cmpl _MMXLength, %%ecx      \n\t"
+            "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
                             // mm1 will be used as Raw(x-bpp) next loop
             "jb paeth_8lp                \n\t"
 
-            : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
+            : "=S" (dummy_value_S),             // output regs (dummy)
+              "=D" (dummy_value_D)
 
-            : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
+            : "0" (prev_row),  // esi           // input regs
+              "1" (row)        // edi
 
-            : "%ebx", "%edi", "%esi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" // CHECKASM: clobber list
+            : "%ecx"                            // clobber list
+#if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1", "%mm2", "%mm3"
+            , "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
          );
       }
-      break;
+      break;  // end 8 bpp
 
       case 1:                // bpp = 1
       case 2:                // bpp = 2
       default:               // bpp > 8
       {
-         __asm__ (
+         __asm__ __volatile__ (
+#ifdef __PIC__
+            "pushl %%ebx                 \n\t" // save Global Offset Table index
+#endif
             "movl _dif, %%ebx            \n\t"
             "cmpl _FullLength, %%ebx     \n\t"
             "jnb paeth_dend              \n\t"
-            "movl row, %%edi             \n\t"
-            "movl prev_row, %%esi        \n\t"
-            // Do Paeth decode for remaining bytes
+
+// preload  "movl row, %%edi             \n\t"
+// preload  "movl prev_row, %%esi        \n\t"
+            // do Paeth decode for remaining bytes
             "movl %%ebx, %%edx           \n\t"
-            "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx in loop below
-            "subl bpp, %%edx             \n\t" // Set edx = ebx - bpp
+// preload  "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
+            "subl %%ecx, %%edx           \n\t" // edx = ebx - bpp
+            "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
+
          "paeth_dlp:                     \n\t"
             "xorl %%eax, %%eax           \n\t"
             // pav = p - a = (a + b - c) - a = b - c
             "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
             "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
-            "movl %%eax, patemp          \n\t" // Save pav for later use
+            "movl %%eax, _patemp         \n\t" // Save pav for later use
             "xorl %%eax, %%eax           \n\t"
             // pbv = p - b = (a + b - c) - b = a - c
             "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
             "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
             "movl %%eax, %%ecx           \n\t"
             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            "addl patemp, %%eax          \n\t" // pcv = pav + pbv
+            "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
             // pc = abs(pcv)
             "testl $0x80000000, %%eax    \n\t"
             "jz paeth_dpca               \n\t"
             "negl %%eax                  \n\t" // reverse sign of neg values
+
          "paeth_dpca:                    \n\t"
-            "movl %%eax, pctemp          \n\t" // save pc for later use
+            "movl %%eax, _pctemp         \n\t" // save pc for later use
             // pb = abs(pbv)
             "testl $0x80000000, %%ecx    \n\t"
             "jz paeth_dpba               \n\t"
             "negl %%ecx                  \n\t" // reverse sign of neg values
+
          "paeth_dpba:                    \n\t"
-            "movl %%ecx, pbtemp          \n\t" // save pb for later use
+            "movl %%ecx, _pbtemp         \n\t" // save pb for later use
             // pa = abs(pav)
-            "movl patemp, %%eax          \n\t"
+            "movl _patemp, %%eax         \n\t"
             "testl $0x80000000, %%eax    \n\t"
             "jz paeth_dpaa               \n\t"
             "negl %%eax                  \n\t" // reverse sign of neg values
+
          "paeth_dpaa:                    \n\t"
-            "movl %%eax, patemp          \n\t" // save pa for later use
+            "movl %%eax, _patemp         \n\t" // save pa for later use
             // test if pa <= pb
             "cmpl %%ecx, %%eax           \n\t"
             "jna paeth_dabb              \n\t"
             // pa > pb; now test if pb <= pc
-            "cmpl pctemp, %%ecx          \n\t"
+            "cmpl _pctemp, %%ecx         \n\t"
             "jna paeth_dbbc              \n\t"
             // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
             "jmp paeth_dpaeth            \n\t"
+
          "paeth_dbbc:                    \n\t"
             // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
             "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
             "jmp paeth_dpaeth            \n\t"
+
          "paeth_dabb:                    \n\t"
             // pa <= pb; now test if pa <= pc
-            "cmpl pctemp, %%eax          \n\t"
+            "cmpl _pctemp, %%eax         \n\t"
             "jna paeth_dabc              \n\t"
             // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
             "jmp paeth_dpaeth            \n\t"
+
          "paeth_dabc:                    \n\t"
             // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
             "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
+
          "paeth_dpaeth:                  \n\t"
             "incl %%ebx                  \n\t"
             "incl %%edx                  \n\t"
@@ -3827,85 +4032,110 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
             "addb %%cl, -1(%%edi,%%ebx,) \n\t"
             "cmpl _FullLength, %%ebx     \n\t"
             "jb paeth_dlp                \n\t"
+
          "paeth_dend:                    \n\t"
+#ifdef __PIC__
+            "popl %%ebx                  \n\t" // index to Global Offset Table
+#endif
 
-            : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
+            : "=c" (dummy_value_c),            // output regs (dummy)
+              "=S" (dummy_value_S),
+              "=D" (dummy_value_D)
 
-            : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
+            : "0" (bpp),       // ecx          // input regs
+              "1" (prev_row),  // esi
+              "2" (row)        // edi
 
-            : "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
+            : "%eax", "%edx"                   // clobber list
+#ifndef __PIC__
+            , "%ebx"
+#endif
          );
       }
       return;                   // No need to go further with this one
-   }                         // end switch ( bpp )
-   __asm__ (
-      // MMX acceleration complete now do clean-up
-      // Check if any remaining bytes left to decode
+
+   } // end switch (bpp)
+
+   __asm__ __volatile__ (
+      // MMX acceleration complete; now do clean-up
+      // check if any remaining bytes left to decode
+#ifdef __PIC__
+      "pushl %%ebx                 \n\t" // save index to Global Offset Table
+#endif
       "movl _MMXLength, %%ebx      \n\t"
       "cmpl _FullLength, %%ebx     \n\t"
       "jnb paeth_end               \n\t"
-      "movl row, %%edi             \n\t"
-      "movl prev_row, %%esi        \n\t"
-      // Do Paeth decode for remaining bytes
+//pre "movl row, %%edi             \n\t"
+//pre "movl prev_row, %%esi        \n\t"
+      // do Paeth decode for remaining bytes
       "movl %%ebx, %%edx           \n\t"
-      "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx in loop below
-      "subl bpp, %%edx             \n\t" // Set edx = ebx - bpp
+//pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
+      "subl %%ecx, %%edx           \n\t" // edx = ebx - bpp
+      "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
+
    "paeth_lp2:                     \n\t"
       "xorl %%eax, %%eax           \n\t"
       // pav = p - a = (a + b - c) - a = b - c
       "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
-      "movl %%eax, patemp          \n\t" // Save pav for later use
+      "movl %%eax, _patemp         \n\t" // Save pav for later use
       "xorl %%eax, %%eax           \n\t"
       // pbv = p - b = (a + b - c) - b = a - c
       "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
       "movl %%eax, %%ecx           \n\t"
       // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-      "addl patemp, %%eax          \n\t" // pcv = pav + pbv
+      "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
       // pc = abs(pcv)
       "testl $0x80000000, %%eax    \n\t"
       "jz paeth_pca2               \n\t"
       "negl %%eax                  \n\t" // reverse sign of neg values
+
    "paeth_pca2:                    \n\t"
-      "movl %%eax, pctemp          \n\t" // save pc for later use
+      "movl %%eax, _pctemp         \n\t" // save pc for later use
       // pb = abs(pbv)
       "testl $0x80000000, %%ecx    \n\t"
       "jz paeth_pba2               \n\t"
       "negl %%ecx                  \n\t" // reverse sign of neg values
+
    "paeth_pba2:                    \n\t"
-      "movl %%ecx, pbtemp          \n\t" // save pb for later use
+      "movl %%ecx, _pbtemp         \n\t" // save pb for later use
       // pa = abs(pav)
-      "movl patemp, %%eax          \n\t"
+      "movl _patemp, %%eax         \n\t"
       "testl $0x80000000, %%eax    \n\t"
       "jz paeth_paa2               \n\t"
       "negl %%eax                  \n\t" // reverse sign of neg values
+
    "paeth_paa2:                    \n\t"
-      "movl %%eax, patemp          \n\t" // save pa for later use
+      "movl %%eax, _patemp         \n\t" // save pa for later use
       // test if pa <= pb
       "cmpl %%ecx, %%eax           \n\t"
       "jna paeth_abb2              \n\t"
       // pa > pb; now test if pb <= pc
-      "cmpl pctemp, %%ecx          \n\t"
+      "cmpl _pctemp, %%ecx         \n\t"
       "jna paeth_bbc2              \n\t"
       // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
       "jmp paeth_paeth2            \n\t"
+
    "paeth_bbc2:                    \n\t"
       // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
       "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
       "jmp paeth_paeth2            \n\t"
+
    "paeth_abb2:                    \n\t"
       // pa <= pb; now test if pa <= pc
-      "cmpl pctemp, %%eax          \n\t"
+      "cmpl _pctemp, %%eax         \n\t"
       "jna paeth_abc2              \n\t"
       // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
       "jmp paeth_paeth2            \n\t"
+
    "paeth_abc2:                    \n\t"
       // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
       "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
+
    "paeth_paeth2:                  \n\t"
       "incl %%ebx                  \n\t"
       "incl %%edx                  \n\t"
@@ -3913,491 +4143,602 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
       "addb %%cl, -1(%%edi,%%ebx,) \n\t"
       "cmpl _FullLength, %%ebx     \n\t"
       "jb paeth_lp2                \n\t"
+
    "paeth_end:                     \n\t"
-      "emms                        \n\t" // End MMX instructions; prep for possible FP instrs.
+      "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
+#ifdef __PIC__
+      "popl %%ebx                  \n\t" // restore index to Global Offset Table
+#endif
 
-      : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
+      : "=c" (dummy_value_c),            // output regs (dummy)
+        "=S" (dummy_value_S),
+        "=D" (dummy_value_D)
 
-      : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
+      : "0" (bpp),       // ecx          // input regs
+        "1" (prev_row),  // esi
+        "2" (row)        // edi
 
-      : "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
+      : "%eax", "%edx"                   // clobber list (no input regs!)
+#ifndef __PIC__
+      , "%ebx"
+#endif
    );
-#endif /* GRR_GCC_MMX_CONVERTED */
-}
+
+} /* end png_read_filter_row_mmx_paeth() */
+
+
+
+
+//===========================================================================//
+//                                                                           //
+//           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B           //
+//                                                                           //
+//===========================================================================//
 
 // Optimized code for PNG Sub filter decoder
-void /* PRIVATE */
+
+static void /* PRIVATE */
 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
 {
-#ifdef GRR_GCC_MMX_CONVERTED
    int bpp;
+   int dummy_value_a;
+   int dummy_value_D;
 
-   bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
-   _FullLength  = row_info->rowbytes - bpp; // # of bytes to filter
-   __asm__ (
-      "movl row, %%edi             \n\t"
+   bpp = (row_info->pixel_depth + 7) >> 3;   // calc number of bytes per pixel
+   _FullLength = row_info->rowbytes - bpp;   // number of bytes to filter
+
+   __asm__ __volatile__ (
+//pre "movl row, %%edi             \n\t"
       "movl %%edi, %%esi           \n\t" // lp = row
-      "addl bpp, %%edi             \n\t" // rp = row + bpp
-      "xorl %%eax, %%eax           \n\t"
+//pre "movl bpp, %%eax             \n\t"
+      "addl %%eax, %%edi           \n\t" // rp = row + bpp
+//irr "xorl %%eax, %%eax           \n\t"
       // get # of bytes to alignment
       "movl %%edi, _dif            \n\t" // take start of row
       "addl $0xf, _dif             \n\t" // add 7 + 8 to incr past
-                                         // alignment boundary
-      "xorl %%ebx, %%ebx           \n\t"
+                                         //  alignment boundary
+      "xorl %%ecx, %%ecx           \n\t"
       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
       "subl %%edi, _dif            \n\t" // subtract from start ==> value
-                                         //  ebx at alignment
-      "jz sub_go                   \n\t"
-      // fix alignment
-   "sub_lp1:                       \n\t"
-      "movb (%%esi,%%ebx,), %%al   \n\t"
-      "addb %%al, (%%edi,%%ebx,)   \n\t"
-      "incl %%ebx                  \n\t"
-      "cmpl _dif, %%ebx            \n\t"
+      "jz sub_go                   \n\t" //  ecx at alignment
+
+   "sub_lp1:                       \n\t" // fix alignment
+      "movb (%%esi,%%ecx,), %%al   \n\t"
+      "addb %%al, (%%edi,%%ecx,)   \n\t"
+      "incl %%ecx                  \n\t"
+      "cmpl _dif, %%ecx            \n\t"
       "jb sub_lp1                  \n\t"
+
    "sub_go:                        \n\t"
-      "movl _FullLength, %%ecx     \n\t"
-      "movl %%ecx, %%edx           \n\t"
-      "subl %%ebx, %%edx           \n\t" // subtract alignment fix
+      "movl _FullLength, %%eax     \n\t"
+      "movl %%eax, %%edx           \n\t"
+      "subl %%ecx, %%edx           \n\t" // subtract alignment fix
       "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
-      "subl %%edx, %%ecx           \n\t" // drop over bytes from length
-      "movl %%ecx, _MMXLength      \n\t"
+      "subl %%edx, %%eax           \n\t" // drop over bytes from length
+      "movl %%eax, _MMXLength      \n\t"
 
-      : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
+      : "=a" (dummy_value_a),   // 0      // output regs (dummy)
+        "=D" (dummy_value_D)    // 1
 
-      : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
+      : "0" (bpp),              // eax    // input regs
+        "1" (row)               // edi
 
-      : "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
+      : "%ebx", "%ecx", "%edx"            // clobber list
+      , "%esi"
+
+#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
+      , "%mm0", "%mm1", "%mm2", "%mm3"
+      , "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
    );
 
-   // Now do the math for the rest of the row
-   switch ( bpp )
+   // now do the math for the rest of the row
+   switch (bpp)
    {
-        case 3:
-        {
-         ActiveMask.use  = 0x0000ffffff000000;
-         ShiftBpp.use = 24;       // == 3 * 8
-         ShiftRem.use  = 40;      // == 64 - 24
-         __asm__ (
-            "movl row, %%edi             \n\t"
-            "movq $ActiveMask, %%mm7     \n\t" // Load ActiveMask for 2nd active byte group
-            "movl %%edi, %%esi           \n\t" // lp = row
-            "addl bpp, %%edi             \n\t" // rp = row + bpp
-            "movq %%mm7, %%mm6           \n\t"
-            "movl _dif, %%ebx            \n\t"
-            "psllq $ShiftBpp, %%mm6      \n\t" // Move mask in mm6 to cover 3rd active
-                                  // byte group
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            "movq -8(%%edi,%%ebx,), %%mm1 \n\t"
-         "sub_3lp:                       \n\t"
-            "psrlq $ShiftRem, %%mm1      \n\t" // Shift data for adding 1st bpp bytes
-                          // no need for mask; shift clears inactive bytes
-            // Add 1st active group
-            "movq (%%edi,%%ebx,), %%mm0  \n\t"
-            "paddb %%mm1, %%mm0          \n\t"
-            // Add 2nd active group
-            "movq %%mm0, %%mm1           \n\t" // mov updated Raws to mm1
-            "psllq $ShiftBpp, %%mm1      \n\t" // shift data to position correctly
-            "pand %%mm7, %%mm1           \n\t" // mask to use only 2nd active group
-            "paddb %%mm1, %%mm0          \n\t"
-            // Add 3rd active group
-            "movq %%mm0, %%mm1           \n\t" // mov updated Raws to mm1
-            "psllq $ShiftBpp, %%mm1      \n\t" // shift data to position correctly
-            "pand %%mm6, %%mm1           \n\t" // mask to use only 3rd active group
-            "addl $8, %%ebx              \n\t"
-            "paddb %%mm1, %%mm0          \n\t"
-            "cmpl _MMXLength, %%ebx      \n\t"
-            "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // Write updated Raws back to array
-            // Prep for doing 1st add at top of loop
-            "movq %%mm0, %%mm1           \n\t"
-            "jb sub_3lp                  \n\t"
+      case 3:
+      {
+         _ActiveMask.use  = 0x0000ffffff000000LL;
+         _ShiftBpp.use = 24;       // == 3 * 8
+         _ShiftRem.use  = 40;      // == 64 - 24
 
-            : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
+         __asm__ __volatile__ (
+// preload  "movl row, %%edi              \n\t"
+            "movq _ActiveMask, %%mm7       \n\t" // load _ActiveMask for 2nd
+                                                //  active byte group
+            "movl %%edi, %%esi            \n\t" // lp = row
+// preload  "movl bpp, %%eax              \n\t"
+            "addl %%eax, %%edi            \n\t" // rp = row + bpp
+            "movq %%mm7, %%mm6            \n\t"
+            "movl _dif, %%edx             \n\t"
+            "psllq _ShiftBpp, %%mm6       \n\t" // move mask in mm6 to cover
+                                                //  3rd active byte group
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%edx,), %%mm1 \n\t"
 
-            : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
+         "sub_3lp:                        \n\t" // shift data for adding first
+            "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
+                                                //  shift clears inactive bytes)
+            // add 1st active group
+            "movq (%%edi,%%edx,), %%mm0   \n\t"
+            "paddb %%mm1, %%mm0           \n\t"
 
-            : "%ebx", "%edi", "%esi", "%mm0", "%mm1", "%mm6", "%mm7" // CHECKASM: clobber list
+            // add 2nd active group
+            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
+            "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
+            "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
+            "paddb %%mm1, %%mm0           \n\t"
+
+            // add 3rd active group
+            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
+            "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
+            "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
+            "addl $8, %%edx               \n\t"
+            "paddb %%mm1, %%mm0           \n\t"
+
+            "cmpl _MMXLength, %%edx       \n\t"
+            "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
+            "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
+            "jb sub_3lp                   \n\t"
+
+            : "=a" (dummy_value_a),   // 0      // output regs (dummy)
+              "=D" (dummy_value_D)    // 1
+
+            : "0" (bpp),              // eax    // input regs
+              "1" (row)               // edi
+
+            : "%edx", "%esi"                    // clobber list
+#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1", "%mm6", "%mm7"
+#endif
          );
       }
       break;
 
       case 1:
       {
-         // Placed here just in case this is a duplicate of the
-         // non-MMX code for the SUB filter in png_read_filter_row above
-         //
-         //         png_bytep rp;
-         //         png_bytep lp;
-         //         png_uint_32 i;
-         //         bpp = (row_info->pixel_depth + 7) >> 3;
-         //         for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
-         //            i < row_info->rowbytes; i++, rp++, lp++)
-         //      {
-         //            *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
-         //      }
-         __asm__ (
-            "movl _dif, %%ebx            \n\t"
-            "movl row, %%edi             \n\t"
-            "cmpl _FullLength, %%ebx     \n\t"
+         __asm__ __volatile__ (
+            "movl _dif, %%edx            \n\t"
+// preload  "movl row, %%edi             \n\t"
+            "cmpl _FullLength, %%edx     \n\t"
             "jnb sub_1end                \n\t"
             "movl %%edi, %%esi           \n\t" // lp = row
             "xorl %%eax, %%eax           \n\t"
-            "addl bpp, %%edi             \n\t" // rp = row + bpp
+// preload  "movl bpp, %%eax             \n\t"
+            "addl %%eax, %%edi           \n\t" // rp = row + bpp
+
          "sub_1lp:                       \n\t"
-            "movb (%%esi,%%ebx,), %%al   \n\t"
-            "addb %%al, (%%edi,%%ebx,)   \n\t"
-            "incl %%ebx                  \n\t"
-            "cmpl _FullLength, %%ebx     \n\t"
+            "movb (%%esi,%%edx,), %%al   \n\t"
+            "addb %%al, (%%edi,%%edx,)   \n\t"
+            "incl %%edx                  \n\t"
+            "cmpl _FullLength, %%edx     \n\t"
             "jb sub_1lp                  \n\t"
+
          "sub_1end:                      \n\t"
 
-            : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
+            : "=a" (dummy_value_a),   // 0      // output regs (dummy)
+              "=D" (dummy_value_D)    // 1
 
-            : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
+            : "0" (bpp),              // eax    // input regs
+              "1" (row)               // edi
 
-            : "%eax", "%ebx", "%edi", "%esi" // CHECKASM: clobber list
+            : "%edx", "%esi"                    // clobber list
          );
       }
       return;
 
       case 6:
-      case 7:
       case 4:
-      case 5:
+      //case 7:   // GRR BOGUS
+      //case 5:   // GRR BOGUS
       {
-         ShiftBpp.use = bpp << 3;
-         ShiftRem.use = 64 - ShiftBpp.use;
-         __asm__ (
-            "movl row, %%edi             \n\t"
-            "movl _dif, %%ebx            \n\t"
-            "movl %%edi, %%esi           \n\t" // lp = row
-            "addl bpp, %%edi             \n\t" // rp = row + bpp
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            "movq -8(%%edi,%%ebx,), %%mm1 \n\t"
-         "sub_4lp:                       \n\t"
-            "psrlq $ShiftRem, %%mm1      \n\t" // Shift data for adding 1st bpp bytes
-                          // no need for mask; shift clears inactive bytes
-            "movq (%%edi,%%ebx,), %%mm0  \n\t"
-            "paddb %%mm1, %%mm0          \n\t"
-            // Add 2nd active group
-            "movq %%mm0, %%mm1           \n\t" // mov updated Raws to mm1
-            "psllq $ShiftBpp, %%mm1      \n\t" // shift data to position correctly
-                                   // there is no need for any mask
-                                   // since shift clears inactive bits/bytes
-            "addl $8, %%ebx              \n\t"
-            "paddb %%mm1, %%mm0          \n\t"
-            "cmpl _MMXLength, %%ebx      \n\t"
-            "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
-            "movq %%mm0, %%mm1           \n\t" // Prep for doing 1st add at top of loop
-            "jb sub_4lp                  \n\t"
+         _ShiftBpp.use = bpp << 3;
+         _ShiftRem.use = 64 - _ShiftBpp.use;
 
-            : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
+         __asm__ __volatile__ (
+// preload  "movl row, %%edi              \n\t"
+            "movl _dif, %%edx             \n\t"
+            "movl %%edi, %%esi            \n\t" // lp = row
+// preload  "movl bpp, %%eax              \n\t"
+            "addl %%eax, %%edi            \n\t" // rp = row + bpp
 
-            : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%edx,), %%mm1 \n\t"
 
-            : "%ebx", "%edi", "%esi", "%mm0", "%mm1" // CHECKASM: clobber list
+         "sub_4lp:                        \n\t" // shift data for adding first
+            "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
+                                                //  shift clears inactive bytes)
+            "movq (%%edi,%%edx,), %%mm0   \n\t"
+            "paddb %%mm1, %%mm0           \n\t"
+
+            // add 2nd active group
+            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
+            "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
+            "addl $8, %%edx               \n\t"
+            "paddb %%mm1, %%mm0           \n\t"
+
+            "cmpl _MMXLength, %%edx       \n\t"
+            "movq %%mm0, -8(%%edi,%%edx,) \n\t"
+            "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
+            "jb sub_4lp                   \n\t"
+
+            : "=a" (dummy_value_a),   // 0      // output regs (dummy)
+              "=D" (dummy_value_D)    // 1
+
+            : "0" (bpp),              // eax    // input regs
+              "1" (row)               // edi
+
+            : "%edx", "%esi"                    // clobber list
+#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1"
+#endif
          );
       }
       break;
 
       case 2:
       {
-         ActiveMask.use  = 0x00000000ffff0000;
-         ShiftBpp.use = 16;       // == 2 * 8
-         ShiftRem.use = 48;       // == 64 - 16
-         __asm__ (
-            "movq $ActiveMask, %%mm7     \n\t" // Load ActiveMask for 2nd active byte group
-            "movl _dif, %%ebx            \n\t"
-            "movq %%mm7, %%mm6           \n\t"
-            "movl row, %%edi             \n\t"
-            "psllq $ShiftBpp, %%mm6      \n\t" // Move mask in mm6 to cover 3rd active
-                                    //  byte group
-            "movl %%edi, %%esi           \n\t" // lp = row
-            "movq %%mm6, %%mm5           \n\t"
-            "addl bpp, %%edi             \n\t" // rp = row + bpp
-            "psllq $ShiftBpp, %%mm5      \n\t" // Move mask in mm5 to cover 4th active
-                                    //  byte group
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            "movq -8(%%edi,%%ebx,), %%mm1 \n\t"
-         "sub_2lp:                       \n\t"
-            // Add 1st active group
-            "psrlq $ShiftRem, %%mm1      \n\t" // Shift data for adding 1st bpp bytes
-                                    // no need for mask; shift clears inactive
-                                    //  bytes
-            "movq (%%edi,%%ebx,), %%mm0  \n\t"
-            "paddb %%mm1, %%mm0          \n\t"
-            // Add 2nd active group
-            "movq %%mm0, %%mm1           \n\t" // mov updated Raws to mm1
-            "psllq $ShiftBpp, %%mm1      \n\t" // shift data to position correctly
-            "pand %%mm7, %%mm1           \n\t" // mask to use only 2nd active group
-            "paddb %%mm1, %%mm0          \n\t"
-            // Add 3rd active group
-            "movq %%mm0, %%mm1           \n\t" // mov updated Raws to mm1
-            "psllq $ShiftBpp, %%mm1      \n\t" // shift data to position correctly
-            "pand %%mm6, %%mm1           \n\t" // mask to use only 3rd active group
-            "paddb %%mm1, %%mm0          \n\t"
-            // Add 4th active group
-            "movq %%mm0, %%mm1           \n\t" // mov updated Raws to mm1
-            "psllq $ShiftBpp, %%mm1      \n\t" // shift data to position correctly
-            "pand %%mm5, %%mm1           \n\t" // mask to use only 4th active group
-            "addl $8, %%ebx              \n\t"
-            "paddb %%mm1, %%mm0          \n\t"
-            "cmpl _MMXLength, %%ebx      \n\t"
-            "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // Write updated Raws back to array
-            "movq %%mm0, %%mm1           \n\t" // Prep for doing 1st add at top of loop
-            "jb sub_2lp                  \n\t"
+         _ActiveMask.use = 0x00000000ffff0000LL;
+         _ShiftBpp.use = 16;       // == 2 * 8
+         _ShiftRem.use = 48;       // == 64 - 16
 
-            : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
+         __asm__ __volatile__ (
+            "movq _ActiveMask, %%mm7      \n\t" // load _ActiveMask for 2nd
+                                                //  active byte group
+            "movl _dif, %%edx             \n\t"
+            "movq %%mm7, %%mm6            \n\t"
+// preload  "movl row, %%edi              \n\t"
+            "psllq _ShiftBpp, %%mm6       \n\t" // move mask in mm6 to cover
+                                                //  3rd active byte group
+            "movl %%edi, %%esi            \n\t" // lp = row
+            "movq %%mm6, %%mm5            \n\t"
+// preload  "movl bpp, %%eax              \n\t"
+            "addl %%eax, %%edi            \n\t" // rp = row + bpp
+            "psllq _ShiftBpp, %%mm5       \n\t" // move mask in mm5 to cover
+                                                //  4th active byte group
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%edx,), %%mm1 \n\t"
 
-            : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
+         "sub_2lp:                        \n\t" // shift data for adding first
+            "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
+                                                //  shift clears inactive bytes)
+            // add 1st active group
+            "movq (%%edi,%%edx,), %%mm0   \n\t"
+            "paddb %%mm1, %%mm0           \n\t"
 
-            : "%ebx", "%edi", "%esi", "%mm0", "%mm1", "%mm5", "%mm6", "%mm7" // CHECKASM: clobber list
+            // add 2nd active group
+            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
+            "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
+            "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
+            "paddb %%mm1, %%mm0           \n\t"
+
+            // add 3rd active group
+            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
+            "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
+            "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
+            "paddb %%mm1, %%mm0           \n\t"
+
+            // add 4th active group
+            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
+            "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
+            "pand %%mm5, %%mm1            \n\t" // mask to use 4th active group
+            "addl $8, %%edx               \n\t"
+            "paddb %%mm1, %%mm0           \n\t"
+            "cmpl _MMXLength, %%edx       \n\t"
+            "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
+            "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
+            "jb sub_2lp                   \n\t"
+
+            : "=a" (dummy_value_a),   // 0      // output regs (dummy)
+              "=D" (dummy_value_D)    // 1
+
+            : "0" (bpp),              // eax    // input regs
+              "1" (row)               // edi
+
+            : "%edx", "%esi"                    // clobber list
+#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
+#endif
          );
       }
       break;
+
       case 8:
       {
-         __asm__ (
-            "movl row, %%edi             \n\t"
-            "movl _dif, %%ebx            \n\t"
-            "movl %%edi, %%esi           \n\t" // lp = row
-            "addl bpp, %%edi             \n\t" // rp = row + bpp
-            "movl _MMXLength, %%ecx      \n\t"
-            "movq -8(%%edi,%%ebx,), %%mm7 \n\t" // PRIME the pump (load the first
-                                    // Raw(x-bpp) data set
-            "andl $0x0000003f, %%ecx     \n\t" // calc bytes over mult of 64
-         "sub_8lp:                       \n\t"
-            "movq (%%edi,%%ebx,), %%mm0  \n\t" // Load Sub(x) for 1st 8 bytes
-            "paddb %%mm7, %%mm0          \n\t"
-            "movq 8(%%edi,%%ebx,), %%mm1 \n\t" // Load Sub(x) for 2nd 8 bytes
-            "movq %%mm0, (%%edi,%%ebx,)  \n\t" // Write Raw(x) for 1st 8 bytes
-                                   // Now mm0 will be used as Raw(x-bpp) for
-                                   // the 2nd group of 8 bytes.  This will be
-                                   // repeated for each group of 8 bytes with
-                                   // the 8th group being used as the Raw(x-bpp)
-                                   // for the 1st group of the next loop.
-            "paddb %%mm0, %%mm1          \n\t"
-            "movq 16(%%edi,%%ebx,), %%mm2 \n\t" // Load Sub(x) for 3rd 8 bytes
-            "movq %%mm1, 8(%%edi,%%ebx,) \n\t" // Write Raw(x) for 2nd 8 bytes
-            "paddb %%mm1, %%mm2          \n\t"
-            "movq 24(%%edi,%%ebx,), %%mm3 \n\t" // Load Sub(x) for 4th 8 bytes
-            "movq %%mm2, 16(%%edi,%%ebx,) \n\t" // Write Raw(x) for 3rd 8 bytes
-            "paddb %%mm2, %%mm3          \n\t"
-            "movq 32(%%edi,%%ebx,), %%mm4 \n\t" // Load Sub(x) for 5th 8 bytes
-            "movq %%mm3, 24(%%edi,%%ebx,) \n\t" // Write Raw(x) for 4th 8 bytes
-            "paddb %%mm3, %%mm4          \n\t"
-            "movq 40(%%edi,%%ebx,), %%mm5 \n\t" // Load Sub(x) for 6th 8 bytes
-            "movq %%mm4, 32(%%edi,%%ebx,) \n\t" // Write Raw(x) for 5th 8 bytes
-            "paddb %%mm4, %%mm5          \n\t"
-            "movq 48(%%edi,%%ebx,), %%mm6 \n\t" // Load Sub(x) for 7th 8 bytes
-            "movq %%mm5, 40(%%edi,%%ebx,) \n\t" // Write Raw(x) for 6th 8 bytes
-            "paddb %%mm5, %%mm6          \n\t"
-            "movq 56(%%edi,%%ebx,), %%mm7 \n\t" // Load Sub(x) for 8th 8 bytes
-            "movq %%mm6, 48(%%edi,%%ebx,) \n\t" // Write Raw(x) for 7th 8 bytes
-            "addl $64, %%ebx             \n\t"
-            "paddb %%mm6, %%mm7          \n\t"
-            "cmpl %%ecx, %%ebx           \n\t"
-            "movq %%mm7, -8(%%edi,%%ebx,) \n\t" // Write Raw(x) for 8th 8 bytes
-            "jb sub_8lp                  \n\t"
-            "cmpl _MMXLength, %%ebx      \n\t"
-            "jnb sub_8lt8                \n\t"
-         "sub_8lpA:                      \n\t"
-            "movq (%%edi,%%ebx,), %%mm0  \n\t"
-            "addl $8, %%ebx              \n\t"
-            "paddb %%mm7, %%mm0          \n\t"
-            "cmpl _MMXLength, %%ebx      \n\t"
-            "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // use -8 to offset early add to ebx
-            "movq %%mm0, %%mm7           \n\t" // Move calculated Raw(x) data to mm1 to
-                                    // be the new Raw(x-bpp) for the next loop
-            "jb sub_8lpA                 \n\t"
-         "sub_8lt8:                      \n\t"
+         __asm__ __volatile__ (
+// preload  "movl row, %%edi              \n\t"
+            "movl _dif, %%edx             \n\t"
+            "movl %%edi, %%esi            \n\t" // lp = row
+// preload  "movl bpp, %%eax              \n\t"
+            "addl %%eax, %%edi            \n\t" // rp = row + bpp
+            "movl _MMXLength, %%ecx       \n\t"
 
-            : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
+            // prime the pump:  load the first Raw(x-bpp) data set
+            "movq -8(%%edi,%%edx,), %%mm7 \n\t"
+            "andl $0x0000003f, %%ecx      \n\t" // calc bytes over mult of 64
 
-            : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
+         "sub_8lp:                        \n\t"
+            "movq (%%edi,%%edx,), %%mm0   \n\t" // load Sub(x) for 1st 8 bytes
+            "paddb %%mm7, %%mm0           \n\t"
+            "movq 8(%%edi,%%edx,), %%mm1  \n\t" // load Sub(x) for 2nd 8 bytes
+            "movq %%mm0, (%%edi,%%edx,)   \n\t" // write Raw(x) for 1st 8 bytes
 
-            : "%ebx", "%ecx", "%edi", "%esi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" // CHECKASM: clobber list
+            // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
+            // This will be repeated for each group of 8 bytes with the 8th
+            // group being used as the Raw(x-bpp) for the 1st group of the
+            // next loop.
+
+            "paddb %%mm0, %%mm1           \n\t"
+            "movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
+            "movq %%mm1, 8(%%edi,%%edx,)  \n\t" // write Raw(x) for 2nd 8 bytes
+            "paddb %%mm1, %%mm2           \n\t"
+            "movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
+            "movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
+            "paddb %%mm2, %%mm3           \n\t"
+            "movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
+            "movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
+            "paddb %%mm3, %%mm4           \n\t"
+            "movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
+            "movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
+            "paddb %%mm4, %%mm5           \n\t"
+            "movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
+            "movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
+            "paddb %%mm5, %%mm6           \n\t"
+            "movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
+            "movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
+            "addl $64, %%edx              \n\t"
+            "paddb %%mm6, %%mm7           \n\t"
+            "cmpl %%ecx, %%edx            \n\t"
+            "movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
+            "jb sub_8lp                   \n\t"
+
+            "cmpl _MMXLength, %%edx       \n\t"
+            "jnb sub_8lt8                 \n\t"
+
+         "sub_8lpA:                       \n\t"
+            "movq (%%edi,%%edx,), %%mm0   \n\t"
+            "addl $8, %%edx               \n\t"
+            "paddb %%mm7, %%mm0           \n\t"
+            "cmpl _MMXLength, %%edx       \n\t"
+            "movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
+            "movq %%mm0, %%mm7            \n\t" // move calculated Raw(x) data
+                                                //  to mm7 to be new Raw(x-bpp)
+                                                //  for next loop
+            "jb sub_8lpA                  \n\t"
+
+         "sub_8lt8:                       \n\t"
+
+            : "=a" (dummy_value_a),   // 0      // output regs (dummy)
+              "=D" (dummy_value_D)    // 1
+
+            : "0" (bpp),              // eax    // input regs
+              "1" (row)               // edi
+
+            : "%ecx", "%edx", "%esi"            // clobber list
+#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
          );
       }
       break;
 
-      default:                // bpp greater than 8 bytes
+      default:                // bpp greater than 8 bytes	GRR BOGUS
       {
-         __asm__ (
-            "movl _dif, %%ebx            \n\t"
-            "movl row, %%edi             \n\t"
-            "movl %%edi, %%esi           \n\t" // lp = row
-            "addl bpp, %%edi             \n\t" // rp = row + bpp
-         "sub_Alp:                       \n\t"
-            "movq (%%edi,%%ebx,), %%mm0  \n\t"
-            "movq (%%esi,%%ebx,), %%mm1  \n\t"
-            "addl $8, %%ebx              \n\t"
-            "paddb %%mm1, %%mm0          \n\t"
-            "cmpl _MMXLength, %%ebx      \n\t"
-            "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // mov does not affect flags; -8 to offset
-                                   //  add ebx
-            "jb sub_Alp                  \n\t"
+         __asm__ __volatile__ (
+            "movl _dif, %%edx             \n\t"
+// preload  "movl row, %%edi              \n\t"
+            "movl %%edi, %%esi            \n\t" // lp = row
+// preload  "movl bpp, %%eax              \n\t"
+            "addl %%eax, %%edi            \n\t" // rp = row + bpp
 
-            : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
+         "sub_Alp:                        \n\t"
+            "movq (%%edi,%%edx,), %%mm0   \n\t"
+            "movq (%%esi,%%edx,), %%mm1   \n\t"
+            "addl $8, %%edx               \n\t"
+            "paddb %%mm1, %%mm0           \n\t"
+            "cmpl _MMXLength, %%edx       \n\t"
+            "movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
+                                                //  -8 to offset addl edx
+            "jb sub_Alp                   \n\t"
 
-            : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
+            : "=a" (dummy_value_a),   // 0      // output regs (dummy)
+              "=D" (dummy_value_D)    // 1
 
-            : "%ebx", "%edi", "%esi", "%mm0", "%mm1" // CHECKASM: clobber list
+            : "0" (bpp),              // eax    // input regs
+              "1" (row)               // edi
+
+            : "%edx", "%esi"                    // clobber list
+#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
+            , "%mm0", "%mm1"
+#endif
          );
       }
       break;
 
-   } // end switch ( bpp )
+   } // end switch (bpp)
 
-   __asm__ (
-      "movl _MMXLength, %%ebx      \n\t"
-      "movl row, %%edi             \n\t"
-      "cmpl _FullLength, %%ebx     \n\t"
-      "jnb sub_end                 \n\t"
-      "movl %%edi, %%esi           \n\t" // lp = row
-      "xorl %%eax, %%eax           \n\t"
-      "addl bpp, %%edi             \n\t" // rp = row + bpp
-   "sub_lp2:                       \n\t"
-      "movb (%%esi,%%ebx,), %%al   \n\t"
-      "addb %%al, (%%edi,%%ebx,)   \n\t"
-      "incl %%ebx                  \n\t"
-      "cmpl _FullLength, %%ebx     \n\t"
-      "jb sub_lp2                  \n\t"
-   "sub_end:                       \n\t"
-      "emms                        \n\t" // end MMX instructions
+   __asm__ __volatile__ (
+      "movl _MMXLength, %%edx       \n\t"
+//pre "movl row, %%edi              \n\t"
+      "cmpl _FullLength, %%edx      \n\t"
+      "jnb sub_end                  \n\t"
 
-      : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
+      "movl %%edi, %%esi            \n\t" // lp = row
+//pre "movl bpp, %%eax              \n\t"
+      "addl %%eax, %%edi            \n\t" // rp = row + bpp
+      "xorl %%eax, %%eax            \n\t"
 
-      : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
+   "sub_lp2:                        \n\t"
+      "movb (%%esi,%%edx,), %%al    \n\t"
+      "addb %%al, (%%edi,%%edx,)    \n\t"
+      "incl %%edx                   \n\t"
+      "cmpl _FullLength, %%edx      \n\t"
+      "jb sub_lp2                   \n\t"
 
-      : "%eax", "%ebx", "%edi", "%esi" // CHECKASM: clobber list
+   "sub_end:                        \n\t"
+      "EMMS                         \n\t" // end MMX instructions
+
+      : "=a" (dummy_value_a),   // 0      // output regs (dummy)
+        "=D" (dummy_value_D)    // 1
+
+      : "0" (bpp),              // eax    // input regs
+        "1" (row)               // edi
+
+      : "%edx", "%esi"                    // clobber list
    );
-#endif /* GRR_GCC_MMX_CONVERTED */
-}
+
+} // end of png_read_filter_row_mmx_sub()
+
+
+
+
+//===========================================================================//
+//                                                                           //
+//            P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P            //
+//                                                                           //
+//===========================================================================//
 
 // Optimized code for PNG Up filter decoder
-void /* PRIVATE */
+
+static void /* PRIVATE */
 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
                            png_bytep prev_row)
 {
-#ifdef GRR_GCC_MMX_CONVERTED
    png_uint_32 len;
+   int dummy_value_d;   // fix 'forbidden register 3 (dx) was spilled' error
+   int dummy_value_S;
+   int dummy_value_D;
 
-   len = row_info->rowbytes;       // # of bytes to filter
-   __asm__ (
-      "movl row, %%edi             \n\t"
+   len = row_info->rowbytes;              // number of bytes to filter
+
+   __asm__ __volatile__ (
+//pre "movl row, %%edi              \n\t"
       // get # of bytes to alignment
-      "movl %%edi, %%ecx           \n\t"
-      "xorl %%ebx, %%ebx           \n\t"
-      "addl $0x7, %%ecx            \n\t"
-      "xorl %%eax, %%eax           \n\t"
-      "andl $0xfffffff8, %%ecx     \n\t"
-      "movl prev_row, %%esi        \n\t"
-      "subl %%edi, %%ecx           \n\t"
-      "jz up_go                    \n\t"
-      // fix alignment
-   "up_lp1:                        \n\t"
-      "movb (%%edi,%%ebx,), %%al   \n\t"
-      "addb (%%esi,%%ebx,), %%al   \n\t"
-      "incl %%ebx                  \n\t"
-      "cmpl %%ecx, %%ebx           \n\t"
-      "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to offset inc ebx
-      "jb up_lp1                   \n\t"
-   "up_go:                         \n\t"
-      "movl len, %%ecx             \n\t"
-      "movl %%ecx, %%edx           \n\t"
-      "subl %%ebx, %%edx           \n\t" // subtract alignment fix
-      "andl $0x0000003f, %%edx     \n\t" // calc bytes over mult of 64
-      "subl %%edx, %%ecx           \n\t" // drop over bytes from length
-      // Unrolled loop - use all MMX registers and interleave to reduce
+      "movl %%edi, %%ecx            \n\t"
+      "xorl %%ebx, %%ebx            \n\t"
+      "addl $0x7, %%ecx             \n\t"
+      "xorl %%eax, %%eax            \n\t"
+      "andl $0xfffffff8, %%ecx      \n\t"
+//pre "movl prev_row, %%esi         \n\t"
+      "subl %%edi, %%ecx            \n\t"
+      "jz up_go                     \n\t"
+
+   "up_lp1:                         \n\t" // fix alignment
+      "movb (%%edi,%%ebx,), %%al    \n\t"
+      "addb (%%esi,%%ebx,), %%al    \n\t"
+      "incl %%ebx                   \n\t"
+      "cmpl %%ecx, %%ebx            \n\t"
+      "movb %%al, -1(%%edi,%%ebx,)  \n\t" // mov does not affect flags; -1 to
+      "jb up_lp1                    \n\t" //  offset incl ebx
+
+   "up_go:                          \n\t"
+//pre "movl len, %%edx              \n\t"
+      "movl %%edx, %%ecx            \n\t"
+      "subl %%ebx, %%edx            \n\t" // subtract alignment fix
+      "andl $0x0000003f, %%edx      \n\t" // calc bytes over mult of 64
+      "subl %%edx, %%ecx            \n\t" // drop over bytes from length
+
+      // unrolled loop - use all MMX registers and interleave to reduce
       // number of branch instructions (loops) and reduce partial stalls
-   "up_loop:                       \n\t"
-      "movq (%%esi,%%ebx,), %%mm1  \n\t"
-      "movq (%%edi,%%ebx,), %%mm0  \n\t"
-      "movq 8(%%esi,%%ebx,), %%mm3 \n\t"
-      "paddb %%mm1, %%mm0          \n\t"
-      "movq 8(%%edi,%%ebx,), %%mm2 \n\t"
-      "movq %%mm0, (%%edi,%%ebx,)  \n\t"
-      "paddb %%mm3, %%mm2          \n\t"
+   "up_loop:                        \n\t"
+      "movq (%%esi,%%ebx,), %%mm1   \n\t"
+      "movq (%%edi,%%ebx,), %%mm0   \n\t"
+      "movq 8(%%esi,%%ebx,), %%mm3  \n\t"
+      "paddb %%mm1, %%mm0           \n\t"
+      "movq 8(%%edi,%%ebx,), %%mm2  \n\t"
+      "movq %%mm0, (%%edi,%%ebx,)   \n\t"
+      "paddb %%mm3, %%mm2           \n\t"
       "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
-      "movq %%mm2, 8(%%edi,%%ebx,) \n\t"
+      "movq %%mm2, 8(%%edi,%%ebx,)  \n\t"
       "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
       "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
-      "paddb %%mm5, %%mm4          \n\t"
+      "paddb %%mm5, %%mm4           \n\t"
       "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
       "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
-      "paddb %%mm7, %%mm6          \n\t"
+      "paddb %%mm7, %%mm6           \n\t"
       "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
       "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
       "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
       "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
-      "paddb %%mm1, %%mm0          \n\t"
+      "paddb %%mm1, %%mm0           \n\t"
       "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
       "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
-      "paddb %%mm3, %%mm2          \n\t"
+      "paddb %%mm3, %%mm2           \n\t"
       "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
       "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
       "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
       "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
-      "paddb %%mm5, %%mm4          \n\t"
+      "paddb %%mm5, %%mm4           \n\t"
       "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
       "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
-      "addl $64, %%ebx             \n\t"
-      "paddb %%mm7, %%mm6          \n\t"
-      "cmpl %%ecx, %%ebx           \n\t"
+      "addl $64, %%ebx              \n\t"
+      "paddb %%mm7, %%mm6           \n\t"
+      "cmpl %%ecx, %%ebx            \n\t"
       "movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
-                                     // -8 to offset add ebx
-      "jb up_loop                  \n\t"
+      "jb up_loop                   \n\t" //  -8 to offset addl ebx
 
-      "cmpl $0, %%edx              \n\t" // Test for bytes over mult of 64
-      "jz up_end                   \n\t"
+      "cmpl $0, %%edx               \n\t" // test for bytes over mult of 64
+      "jz up_end                    \n\t"
 
+      "cmpl $8, %%edx               \n\t" // test for less than 8 bytes
+      "jb up_lt8                    \n\t" //  [added by lcreeve@netins.net]
 
-      // 2 lines added by lcreeve@netins.net
-      // (mail 11 Jul 98 in png-implement list)
-      "cmpl $8, %%edx              \n\t" //test for less than 8 bytes
-      "jb up_lt8                   \n\t"
+      "addl %%edx, %%ecx            \n\t"
+      "andl $0x00000007, %%edx      \n\t" // calc bytes over mult of 8
+      "subl %%edx, %%ecx            \n\t" // drop over bytes from length
+      "jz up_lt8                    \n\t"
 
+   "up_lpA:                         \n\t" // use MMX regs to update 8 bytes sim.
+      "movq (%%esi,%%ebx,), %%mm1   \n\t"
+      "movq (%%edi,%%ebx,), %%mm0   \n\t"
+      "addl $8, %%ebx               \n\t"
+      "paddb %%mm1, %%mm0           \n\t"
+      "cmpl %%ecx, %%ebx            \n\t"
+      "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
+      "jb up_lpA                    \n\t" //  offset addl ebx
+      "cmpl $0, %%edx               \n\t" // test for bytes over mult of 8
+      "jz up_end                    \n\t"
 
-      "addl %%edx, %%ecx           \n\t"
-      "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
-      "subl %%edx, %%ecx           \n\t" // drop over bytes from length
-      "jz up_lt8                   \n\t"
-      // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
-   "up_lpA:                        \n\t"
-      "movq (%%esi,%%ebx,), %%mm1  \n\t"
-      "movq (%%edi,%%ebx,), %%mm0  \n\t"
-      "addl $8, %%ebx              \n\t"
-      "paddb %%mm1, %%mm0          \n\t"
-      "cmpl %%ecx, %%ebx           \n\t"
-      "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to offset add ebx
-      "jb up_lpA                   \n\t"
-      "cmpl $0, %%edx              \n\t" // Test for bytes over mult of 8
-      "jz up_end                   \n\t"
-   "up_lt8:                        \n\t"
-      "xorl %%eax, %%eax           \n\t"
-      "addl %%edx, %%ecx           \n\t" // move over byte count into counter
-      // Loop using x86 registers to update remaining bytes
-   "up_lp2:                        \n\t"
-      "movb (%%edi,%%ebx,), %%al   \n\t"
-      "addb (%%esi,%%ebx,), %%al   \n\t"
-      "incl %%ebx                  \n\t"
-      "cmpl %%ecx, %%ebx           \n\t"
-      "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to offset inc ebx
-      "jb up_lp2                   \n\t"
-   "up_end:                        \n\t"
-      // Conversion of filtered row completed
-      "emms                        \n\t" // End MMX instructions; prep for possible FP instrs.
+   "up_lt8:                         \n\t"
+      "xorl %%eax, %%eax            \n\t"
+      "addl %%edx, %%ecx            \n\t" // move over byte count into counter
 
-      : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
+   "up_lp2:                         \n\t" // use x86 regs for remaining bytes
+      "movb (%%edi,%%ebx,), %%al    \n\t"
+      "addb (%%esi,%%ebx,), %%al    \n\t"
+      "incl %%ebx                   \n\t"
+      "cmpl %%ecx, %%ebx            \n\t"
+      "movb %%al, -1(%%edi,%%ebx,)  \n\t" // mov does not affect flags; -1 to
+      "jb up_lp2                    \n\t" //  offset incl ebx
 
-      : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
+   "up_end:                         \n\t"
+      "EMMS                         \n\t" // conversion of filtered row complete
 
-      : "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" // CHECKASM: clobber list
+      : "=d" (dummy_value_d),   // 0      // output regs (dummy)
+        "=S" (dummy_value_S),   // 1
+        "=D" (dummy_value_D)    // 2
+
+      : "0" (len),              // edx    // input regs
+        "1" (prev_row),         // esi
+        "2" (row)               // edi
+
+      : "%eax", "%ebx", "%ecx"            // clobber list (no input regs!)
+
+#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
+      , "%mm0", "%mm1", "%mm2", "%mm3"
+      , "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
    );
-#endif /* GRR_GCC_MMX_CONVERTED */
-}
 
+} // end of png_read_filter_row_mmx_up()
+
+
+
+
+//===========================================================================//
+//                                                                           //
+//                   P N G _ R E A D _ F I L T E R _ R O W                   //
+//                                                                           //
+//===========================================================================//
 
 #if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
 
@@ -4410,17 +4751,21 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
 #ifdef PNG_DEBUG
    char filnm[6];
 #endif
-   #define UseMMX 1
+
+#define UseMMX 1
+
+#define UseMMX_sub    1   // GRR:  converted 20000730
+#define UseMMX_up     1   // GRR:  converted 20000729
+#define UseMMX_avg    1   // GRR:  converted 20000828 (+ 16-bit bugfix 20000916)
+#define UseMMX_paeth  1   // GRR:  converted 20000828
 
    if (mmx_supported == 2)
        mmx_supported = mmxsupport();
 
-#ifdef GRR_GCC_MMX_CONVERTED
    if (!mmx_supported)
-#endif
    {
        png_read_filter_row_c(png_ptr, row_info, row, prev_row, filter);
-       return ;
+       return;
    }
 
 #ifdef PNG_DEBUG
@@ -4432,23 +4777,24 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
 #endif
    switch (filter)
    {
-      case 0: sprintf(filnm, "None ");
+      case 0: sprintf(filnm, "none");
          break;
-      case 1: sprintf(filnm, "Sub  ");
+      case 1: sprintf(filnm, "sub");
          break;
-      case 2: sprintf(filnm, "Up   ");
+      case 2: sprintf(filnm, "up");
          break;
-      case 3: sprintf(filnm, "Avg  ");
+      case 3: sprintf(filnm, "avg");
          break;
       case 4: sprintf(filnm, "Paeth");
          break;
-      default: sprintf(filnm, "Unknw");
+      default: sprintf(filnm, "unknw");
          break;
    }
-   png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
-   png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
+   png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
+   png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
+   png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
       (int)((row_info->pixel_depth + 7) >> 3));
-   png_debug1(0,"len=%8d, ", row_info->rowbytes);
+   png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
 #endif
 
    switch (filter)
@@ -4457,7 +4803,7 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
          break;
 
       case PNG_FILTER_VALUE_SUB:
-#if (UseMMX == 1)
+#if (UseMMX_sub == 1)
          if ((row_info->pixel_depth > 8) && (row_info->rowbytes >= 128))
          {
             png_read_filter_row_mmx_sub(row_info, row);
@@ -4476,11 +4822,11 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
                *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
                rp++;
             }
-         }  //end !UseMMX
+         }  //end !UseMMX_sub
          break;
 
       case PNG_FILTER_VALUE_UP:
-#if (UseMMX == 1)
+#if (UseMMX_up == 1)
          if ((row_info->pixel_depth > 8) && (row_info->rowbytes >= 128))
          {
             png_read_filter_row_mmx_up(row_info, row, prev_row);
@@ -4496,11 +4842,11 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
             {
                   *rp = (png_byte)(((int)(*rp) + (int)(*pp)) & 0xff);
             }
-         }  //end !UseMMX
+         }  //end !UseMMX_up
          break;
 
       case PNG_FILTER_VALUE_AVG:
-#if (UseMMX == 1)
+#if (UseMMX_avg == 1)
          if ((row_info->pixel_depth > 8) && (row_info->rowbytes >= 128))
          {
             png_read_filter_row_mmx_avg(row_info, row, prev_row);
@@ -4528,11 +4874,11 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
                   ((int)(*pp++ + *lp++) >> 1)) & 0xff);
                rp++;
             }
-         }  //end !UseMMX
+         }  //end !UseMMX_avg
          break;
 
       case PNG_FILTER_VALUE_PAETH:
-#if (UseMMX == 1)
+#if (UseMMX_paeth == 1)
          if ((row_info->pixel_depth > 8) && (row_info->rowbytes >= 128))
          {
             png_read_filter_row_mmx_paeth(row_info, row, prev_row);
@@ -4589,7 +4935,7 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
                *rp = (png_byte)(((int)(*rp) + p) & 0xff);
                rp++;
             }
-         }  //end !UseMMX
+         }  //end !UseMMX_paeth
          break;
 
       default:
@@ -4602,6 +4948,14 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
 #endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */
 
 
+
+
+//===========================================================================//
+//                                                                           //
+//                            M M X S U P P O R T                            //
+//                                                                           //
+//===========================================================================//
+
 // GRR NOTES:  (1) the following code assumes 386 or better (pushfl/popfl)
 //             (2) all instructions compile with gcc 2.7.2.3 and later
 //             (3) the function is moved down here to prevent gcc from
@@ -4618,7 +4972,7 @@ int mmxsupport(void)
 {
     int mmx_supported_local = 0;
 
-    __asm__ (
+    __asm__ __volatile__ (
 //      ".byte  0x66          \n\t"  // convert 16-bit pushf to 32-bit pushfd
 //      "pushf                \n\t"  // save Eflag to stack
         "pushfl               \n\t"  // save Eflag to stack
@@ -4670,11 +5024,13 @@ int mmxsupport(void)
     return mmx_supported_local;
 }
 
+
 #else /* !ORIG_THAT_USED_TO_CLOBBER_EBX */
 
+
 int mmxsupport(void)
 {
-    __asm__ (
+    __asm__ __volatile__ (
         "pushl %%ebx          \n\t"  // ebx gets clobbered by CPUID instruction
         "pushl %%ecx          \n\t"  // so does ecx...
         "pushl %%edx          \n\t"  // ...and edx (but ecx & edx safe on Linux)
@@ -4740,5 +5096,3 @@ int mmxsupport(void)
 #endif /* ?ORIG_THAT_USED_TO_CLOBBER_EBX */
 
 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGGCCRD */
-
-
diff --git a/pngget.c b/pngget.c
index 266dc3b17..42ba52897 100644
--- a/pngget.c
+++ b/pngget.c
@@ -1,7 +1,7 @@
 
 /* pngget.c - retrieval of values from info struct
  *
- * libpng 1.0.8 - July 24, 2000
+ * libpng 1.0.9beta2 - November 19, 2000
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
diff --git a/pngmem.c b/pngmem.c
index 22c77beed..6a0474f84 100644
--- a/pngmem.c
+++ b/pngmem.c
@@ -1,7 +1,7 @@
 
 /* pngmem.c - stub functions for memory allocation
  *
- * libpng 1.0.8 - July 24, 2000
+ * libpng 1.0.9beta2 - November 19, 2000
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
diff --git a/pngpread.c b/pngpread.c
index 6dfd01803..0e564666b 100644
--- a/pngpread.c
+++ b/pngpread.c
@@ -1,7 +1,7 @@
 
 /* pngpread.c - read a png file in push mode
  *
- * libpng 1.0.8 - July 24, 2000
+ * libpng 1.0.9beta2 - November 19, 2000
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
@@ -894,25 +894,25 @@ png_read_push_finish_row(png_structp png_ptr)
    /* arrays to facilitate easy interlacing - use pass (0 - 6) as index */
 
    /* start of interlace block */
-   const int png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
+   const int FARDATA png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
 
    /* offset to next interlace block */
-   const int png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
+   const int FARDATA png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
 
    /* start of interlace block in the y direction */
-   const int png_pass_ystart[] = {0, 0, 4, 0, 2, 0, 1};
+   const int FARDATA png_pass_ystart[] = {0, 0, 4, 0, 2, 0, 1};
 
    /* offset to next interlace block in the y direction */
-   const int png_pass_yinc[] = {8, 8, 8, 4, 4, 2, 2};
+   const int FARDATA png_pass_yinc[] = {8, 8, 8, 4, 4, 2, 2};
 
    /* Width of interlace block.  This is not currently used - if you need
     * it, uncomment it here and in png.h
-   const int png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
+   const int FARDATA png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
    */
 
    /* Height of interlace block.  This is not currently used - if you need
     * it, uncomment it here and in png.h
-   const int png_pass_height[] = {8, 8, 4, 4, 2, 2, 1};
+   const int FARDATA png_pass_height[] = {8, 8, 4, 4, 2, 2, 1};
    */
 #endif
 
@@ -1429,7 +1429,8 @@ png_progressive_combine_row (png_structp png_ptr,
    png_bytep old_row, png_bytep new_row)
 {
 #ifdef PNG_USE_LOCAL_ARRAYS
-   const int png_pass_dsp_mask[7] = {0xff, 0x0f, 0xff, 0x33, 0xff, 0x55, 0xff};
+   const int FARDATA png_pass_dsp_mask[7] =
+      {0xff, 0x0f, 0xff, 0x33, 0xff, 0x55, 0xff};
 #endif
    if (new_row != NULL)    /* new_row must == png_ptr->row_buf here. */
       png_combine_row(png_ptr, old_row, png_pass_dsp_mask[png_ptr->pass]);
diff --git a/pngread.c b/pngread.c
index d1cbc512d..af88de5ec 100644
--- a/pngread.c
+++ b/pngread.c
@@ -1,7 +1,7 @@
 
 /* pngread.c - read a PNG file
  *
- * libpng 1.0.8 - July 24, 2000
+ * libpng 1.0.9beta2 - November 19, 2000
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
@@ -440,6 +440,9 @@ png_read_update_info(png_structp png_ptr, png_infop info_ptr)
    /* save jump buffer and error functions */
    if (!(png_ptr->flags & PNG_FLAG_ROW_INIT))
       png_read_start_row(png_ptr);
+   else
+      png_warning(png_ptr,
+      "Ignoring extra png_read_update_info() call; row buffer not reallocated");
    png_read_transform_info(png_ptr, info_ptr);
 }
 
@@ -698,7 +701,7 @@ png_read_row(png_structp png_ptr, png_bytep row, png_bytep dsp_row)
  * not called png_set_interlace_handling(), the display_row buffer will
  * be ignored, so pass NULL to it.
  *
- * [*] png_handle_alpha() does not exist yet, as of libpng version 1.0.8
+ * [*] png_handle_alpha() does not exist yet, as of libpng version 1.0.9beta2
  */
 
 void PNGAPI
@@ -747,7 +750,7 @@ png_read_rows(png_structp png_ptr, png_bytepp row,
  * only call this function once.  If you desire to have an image for
  * each pass of a interlaced image, use png_read_rows() instead.
  *
- * [*] png_handle_alpha() does not exist yet, as of libpng version 1.0.8
+ * [*] png_handle_alpha() does not exist yet, as of libpng version 1.0.9beta2
  */
 void PNGAPI
 png_read_image(png_structp png_ptr, png_bytepp image)
diff --git a/pngrio.c b/pngrio.c
index b6e592254..5cd3ddc06 100644
--- a/pngrio.c
+++ b/pngrio.c
@@ -1,7 +1,7 @@
 
 /* pngrio.c - functions for data input
  *
- * libpng 1.0.8 - July 24, 2000
+ * libpng 1.0.9beta2 - November 19, 2000
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
diff --git a/pngrtran.c b/pngrtran.c
index da75f0a85..57391bac4 100644
--- a/pngrtran.c
+++ b/pngrtran.c
@@ -1,7 +1,7 @@
 
 /* pngrtran.c - transforms the data in a row for PNG readers
  *
- * libpng 1.0.8 - July 24, 2000
+ * libpng 1.0.9beta2 - November 19, 2000
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
@@ -1090,7 +1090,12 @@ png_read_transform_info(png_structp png_ptr, png_infop info_ptr)
    if ((png_ptr->transformations & PNG_FILLER) &&
        ((info_ptr->color_type == PNG_COLOR_TYPE_RGB) ||
        (info_ptr->color_type == PNG_COLOR_TYPE_GRAY)))
+   {
       info_ptr->channels++;
+#if 0 /* if adding a true alpha channel not just filler */
+      info_ptr->color_type |= PNG_COLOR_MASK_ALPHA;
+#endif
+   }
 #endif
 
 #if defined(PNG_USER_TRANSFORM_PTR_SUPPORTED) && \
diff --git a/pngrutil.c b/pngrutil.c
index fc6242050..e87ee5357 100644
--- a/pngrutil.c
+++ b/pngrutil.c
@@ -1,7 +1,7 @@
 
 /* pngrutil.c - utilities to read a PNG file
  *
- * libpng 1.0.8 - July 24, 2000
+ * libpng 1.0.9beta2 - November 19, 2000
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
@@ -976,6 +976,8 @@ png_handle_iCCP(png_structp png_ptr, png_infop info_ptr, png_uint_32 length)
    png_byte compression_type;
    png_charp profile;
    png_uint_32 skip = 0;
+   png_uint_32 profile_size = 0;
+   png_uint_32 profile_length = 0;
    png_size_t slength, prefix_length, data_length;
 
    png_debug(1, "in png_handle_iCCP\n");
@@ -1027,22 +1029,43 @@ png_handle_iCCP(png_structp png_ptr, png_infop info_ptr, png_uint_32 length)
 
    /* there should be at least one zero (the compression type byte)
       following the separator, and we should be on it  */
-   if (*profile || profile >= chunkdata + slength)
+   if ( profile >= chunkdata + slength)
    {
       png_free(png_ptr, chunkdata);
-      png_warning(png_ptr, "malformed iCCP chunk");
+      png_warning(png_ptr, "Malformed iCCP chunk");
       return;
    }
 
    /* compression_type should always be zero */
    compression_type = *profile++;
+   if (compression_type)
+   {
+      png_warning(png_ptr, "Ignoring nonzero compression type in iCCP chunk");
+      compression_type=0x00;  /* Reset it to zero (libpng-1.0.6 through 1.0.8
+                                 wrote nonzero) */
+   }
 
    prefix_length = profile - chunkdata;
    chunkdata = png_decompress_chunk(png_ptr, compression_type, chunkdata,
                                     slength, prefix_length, &data_length);
 
+   /* An ICC profile starts with its own 4-byte big-endian length.  Read it
+      via png_byte (no sign extension) and only if those bytes exist. */
+   if (data_length >= prefix_length + 4)
+   {
+      png_bytep pC = (png_bytep)(chunkdata + prefix_length);
+      profile_length = (png_uint_32)(data_length - prefix_length);
+      profile_size = ((png_uint_32)pC[0] << 24) | ((png_uint_32)pC[1] << 16) |
+                     ((png_uint_32)pC[2] <<  8) | ((png_uint_32)pC[3]      );
+   }
+   if (profile_length < 4 || profile_size > profile_length)
+   {
+      png_free(png_ptr, chunkdata);  /* do not leak the chunk buffer */
+      png_warning(png_ptr, "Ignoring truncated iCCP profile.\n");
+      return;
+   }
    png_set_iCCP(png_ptr, info_ptr, chunkdata, compression_type,
-                chunkdata + prefix_length, data_length);
+                chunkdata + prefix_length, profile_size);
    png_free(png_ptr, chunkdata);
 }
 #endif /* PNG_READ_iCCP_SUPPORTED */
@@ -1336,7 +1359,6 @@ png_handle_bKGD(png_structp png_ptr, png_infop info_ptr, png_uint_32 length)
           if(buf[0] > info_ptr->num_palette)
           {
              png_warning(png_ptr, "Incorrect bKGD chunk index value");
-             png_crc_finish(png_ptr, length);
              return;
           }
           png_ptr->background.red =
diff --git a/pngset.c b/pngset.c
index 60923288d..114012656 100644
--- a/pngset.c
+++ b/pngset.c
@@ -1,7 +1,7 @@
 
 /* pngset.c - storage of image information into info struct
  *
- * libpng 1.0.8 - July 24, 2000
+ * libpng 1.0.9beta2 - November 19, 2000
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
@@ -741,10 +741,27 @@ png_set_unknown_chunk_location(png_structp png_ptr, png_infop info_ptr,
 void PNGAPI
 png_permit_empty_plte (png_structp png_ptr, int empty_plte_permitted)
 {
-   png_debug(1, "in png_permit_empty_plte\n");
+   /* Deprecated: use png_permit_mng_features(); will be removed from
+      libpng-2.0.0.  Only the PNG_FLAG_MNG_EMPTY_PLTE bit is replaced. */
+   png_debug(1, "in png_permit_empty_plte, DEPRECATED.\n");
    if (png_ptr == NULL)
       return;
-   png_ptr->empty_plte_permitted=(png_byte)empty_plte_permitted;
+   png_ptr->mng_features_permitted = (png_byte)
+     ((png_ptr->mng_features_permitted & (~(PNG_FLAG_MNG_EMPTY_PLTE))) |
+     ((empty_plte_permitted & PNG_FLAG_MNG_EMPTY_PLTE)));
+}
+#endif
+
+#if defined(PNG_MNG_FEATURES_SUPPORTED)
+png_uint_32 PNGAPI   /* returns the feature bits that were accepted */
+png_permit_mng_features (png_structp png_ptr, png_uint_32 mng_features)
+{
+   png_debug(1, "in png_permit_mng_features\n");
+   if (png_ptr == NULL)
+      return (png_uint_32)0;   /* no struct to update: report nothing */
+   png_ptr->mng_features_permitted =   /* overwrites any earlier setting */
+     (png_byte)(mng_features & PNG_ALL_MNG_FEATURES);   /* drop other bits */
+   return (png_uint_32)png_ptr->mng_features_permitted;
 }
 #endif
 
@@ -825,6 +842,8 @@ png_set_compression_buffer_size(png_structp png_ptr, png_uint_32 size)
        png_free(png_ptr, png_ptr->zbuf);
     png_ptr->zbuf_size = (png_size_t)size;
     png_ptr->zbuf = (png_bytep)png_malloc(png_ptr, size);
+    if(!png_ptr->zbuf)
+       png_error(png_ptr,"Unable to malloc zbuf");
     png_ptr->zstream.next_out = png_ptr->zbuf;
     png_ptr->zstream.avail_out = (uInt)png_ptr->zbuf_size;
 }
diff --git a/pngtrans.c b/pngtrans.c
index b974a66ae..140905c6f 100644
--- a/pngtrans.c
+++ b/pngtrans.c
@@ -1,7 +1,7 @@
 
 /* pngtrans.c - transforms the data in a row (used by both readers and writers)
  *
- * libpng 1.0.8 - July 24, 2000
+ * libpng 1.0.9beta2 - November 19, 2000
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
diff --git a/pngvcrd.c b/pngvcrd.c
index ed09aaea8..4b85a1fd7 100644
--- a/pngvcrd.c
+++ b/pngvcrd.c
@@ -2,7 +2,7 @@
  *
  * For Intel x86 CPU and Microsoft Visual C++ compiler
  *
- * libpng 1.0.8 - July 24, 2000
+ * libpng 1.0.9beta2 - November 19, 2000
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
  * Copyright (c) 1998, Intel Corporation
@@ -10,6 +10,8 @@
  * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
  * Interface to libpng contributed by Gilles Vollant, 1999
  *
+ * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
+ *
  */
 
 #define PNG_INTERNAL
@@ -2117,8 +2119,8 @@ davg4lp:
       case 2:
       {
          ActiveMask.use  = 0x000000000000ffff;
-         ShiftBpp.use = 24;   // == 3 * 8
-         ShiftRem.use = 40;   // == 64 - 24
+         ShiftBpp.use = 16;   // == 2 * 8     [BUGFIX]
+         ShiftRem.use = 48;   // == 64 - 16   [BUGFIX]
          _asm {
             // Load ActiveMask
             movq mm7, ActiveMask
@@ -2133,7 +2135,7 @@ davg4lp:
                               // (we correct position in loop below)
 davg2lp:
             movq mm0, [edi + ebx]
-            psllq mm2, ShiftRem  // shift data to position correctly
+            psrlq mm2, ShiftRem  // shift data to position correctly   [BUGFIX]
             movq mm1, [esi + ebx]
             // Add (Prev_row/2) to Average
             movq mm3, mm5
diff --git a/pngwio.c b/pngwio.c
index c70062426..d62fe85ba 100644
--- a/pngwio.c
+++ b/pngwio.c
@@ -1,7 +1,7 @@
 
 /* pngwio.c - functions for data output
  *
- * libpng 1.0.8 - July 24, 2000
+ * libpng 1.0.9beta2 - November 19, 2000
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
diff --git a/pngwrite.c b/pngwrite.c
index 167e77056..ba79c7b83 100644
--- a/pngwrite.c
+++ b/pngwrite.c
@@ -1,7 +1,7 @@
 
 /* pngwrite.c - general routines to write a PNG file
  *
- * libpng 1.0.8 - July 24, 2000
+ * libpng 1.0.9beta2 - November 19, 2000
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
@@ -964,7 +964,6 @@ void PNGAPI
 png_set_filter(png_structp png_ptr, int method, int filters)
 {
    png_debug(1, "in png_set_filter\n");
-   /* We allow 'method' only for future expansion of the base filter method. */
    if (method == PNG_FILTER_TYPE_BASE)
    {
       switch (filters & (PNG_ALL_FILTERS | 0x07))
diff --git a/pngwtran.c b/pngwtran.c
index 257ccd13c..d0f5dd7fb 100644
--- a/pngwtran.c
+++ b/pngwtran.c
@@ -1,7 +1,7 @@
 
 /* pngwtran.c - transforms the data in a row for PNG writers
  *
- * libpng 1.0.8 - July 24, 2000
+ * libpng 1.0.9beta2 - November 19, 2000
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
diff --git a/pngwutil.c b/pngwutil.c
index fb702c986..ef81008d6 100644
--- a/pngwutil.c
+++ b/pngwutil.c
@@ -1,7 +1,7 @@
 
 /* pngwutil.c - utilities to write a PNG file
  *
- * libpng 1.0.8 - July 24, 2000
+ * libpng 1.0.9beta2 - November 19, 2000
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
@@ -514,8 +514,8 @@ png_write_PLTE(png_structp png_ptr, png_colorp palette, png_uint_32 num_pal)
 
    png_debug(1, "in png_write_PLTE\n");
    if ((
-#ifdef PNG_WRITE_EMPTY_PLTE_SUPPORTED
-        !png_ptr->empty_plte_permitted &&
+#if defined(PNG_MNG_FEATURES_SUPPORTED)
+        !(png_ptr->mng_features_permitted & PNG_FLAG_MNG_EMPTY_PLTE) &&
 #endif
         num_pal == 0) || num_pal > 256)
      {
@@ -670,6 +670,7 @@ png_write_iCCP(png_structp png_ptr, png_charp name, int compression_type,
    /* make sure we include the NULL after the name and the compression type */
    png_write_chunk_start(png_ptr, (png_bytep)png_iCCP,
           (png_uint_32)name_len+profile_len+2);
+   new_name[name_len+1]=0x00;
    png_write_chunk_data(png_ptr, (png_bytep)new_name, name_len + 2);
 
    if (profile_len)
@@ -996,9 +997,9 @@ png_write_bKGD(png_structp png_ptr, png_color_16p back, int color_type)
    if (color_type == PNG_COLOR_TYPE_PALETTE)
    {
       if (
-#ifdef PNG_WRITE_EMPTY_PLTE_SUPPORTED
-          (!png_ptr->empty_plte_permitted ||
-          (png_ptr->empty_plte_permitted && png_ptr->num_palette)) &&
+#if defined(PNG_MNG_FEATURES_SUPPORTED)
+          (png_ptr->num_palette ||
+          (!(png_ptr->mng_features_permitted & PNG_FLAG_MNG_EMPTY_PLTE))) &&
 #endif
          back->index > png_ptr->num_palette)
       {
@@ -1083,7 +1084,7 @@ png_check_keyword(png_structp png_ptr, png_charp key, png_charpp new_key)
 
    png_debug1(2, "Keyword to be checked is '%s'\n", key);
 
-   *new_key = (png_charp)png_malloc(png_ptr, (png_uint_32)(key_len + 1));
+   *new_key = (png_charp)png_malloc(png_ptr, (png_uint_32)(key_len + 2));
 
    /* Replace non-printing characters with a blank and print a warning */
    for (kp = key, dp = *new_key; *kp != '\0'; kp++, dp++)
@@ -2553,6 +2554,7 @@ png_write_filtered_row(png_structp png_ptr, png_bytep filtered_row)
    png_debug(1, "in png_write_filtered_row\n");
    png_debug1(2, "filter = %d\n", filtered_row[0]);
    /* set up the zlib input buffer */
+
    png_ptr->zstream.next_in = filtered_row;
    png_ptr->zstream.avail_in = (uInt)png_ptr->row_info.rowbytes + 1;
    /* repeat until we have compressed all the data */