[libpng15] Further optimization of png_combine_row() in the interlaced case.

This commit is contained in:
John Bowler
2011-10-11 16:01:33 -05:00
committed by Glenn Randers-Pehrson
parent 76b62317b5
commit 4e68aa7e40
4 changed files with 435 additions and 186 deletions

View File

@@ -2782,9 +2782,10 @@ png_check_chunk_name(png_structp png_ptr, png_uint_32 chunk_name)
void /* PRIVATE */
png_combine_row(png_structp png_ptr, png_bytep dp, int display)
{
int pixel_depth = png_ptr->transformed_pixel_depth;
png_bytep sp = png_ptr->row_buf + 1;
unsigned int pixel_depth = png_ptr->transformed_pixel_depth;
png_const_bytep sp = png_ptr->row_buf + 1;
png_uint_32 row_width = png_ptr->width;
unsigned int pass = png_ptr->pass;
png_debug(1, "in png_combine_row");
@@ -2812,197 +2813,391 @@ png_combine_row(png_structp png_ptr, png_bytep dp, int display)
*/
#ifdef PNG_READ_INTERLACING_SUPPORTED
if (png_ptr->interlaced && (png_ptr->transformations & PNG_INTERLACE) &&
png_ptr->pass < 6 && (display == 0 || display == 1))
pass < 6 && (display == 0 ||
/* The following copies everything for 'display' on passes 0, 2 and 4. */
(display == 1 && (pass & 1) != 0)))
{
/* These are reversed from the values used prior to libpng 1.5.6 to allow
* testing against '1' rather than 0x80
/* Narrow images may have no bits in a pass; the caller should handle
* this, but this test is cheap:
*/
static PNG_CONST png_byte png_pass_mask[2][6] = {
{0x01, 0x10, 0x11, 0x44, 0x55, 0xaa /*, 0xff*/}, /* regular */
{0xff, 0xf0, 0xff, 0xcc, 0xff, 0xaa /*, 0xff*/}};/* display */
unsigned int mask = png_pass_mask[display][png_ptr->pass] + 0x100;
if (row_width <= PNG_PASS_START_COL(pass))
return;
if (mask != 0x1ff)
if (pixel_depth < 8)
{
if (pixel_depth < 8)
{
/* Must write partial bytes, the 'shift' here is to the left, but
* the PNG bits go to the right, i.e. start at the most significant
* bit.
*/
unsigned int shift;
unsigned int inc = (unsigned int)pixel_depth;
unsigned int m = mask << 1;
unsigned int pixel_mask = (1 << pixel_depth) - 1;
/* For pixel depths up to 4bpp the 8-pixel mask can be expanded to fit
* into 32 bits, then a single loop over the bytes using the four byte
* values in the 32 bit mask can be used. For the 'display' option the
* expanded mask may also not require any masking within a byte. To
* make this work the PACKSWAP option must be taken into account - it
* simply requires the pixels to be reversed in each byte.
*
* The 'regular' case requires a mask for each of the first 6 passes,
* the 'display' case does a copy for the even passes in the range
       * 0..6.  This has already been handled in the test above.
*
* The masks are arranged as four bytes with the first byte to use in
* the lowest bits (little-endian) regardless of the order (PACKSWAP or
* not) of the pixels in each byte.
*
* NOTE: the whole of this logic depends on the caller of this function
* only calling it on rows appropriate to the pass. This function only
* understands the 'x' logic, the 'y' logic is handled by the caller.
*
* The following defines allow generation of compile time constant bit
* masks for each pixel depth and each possibility of swapped or not
* swapped bytes. Pass is in the range 0..6, 'x', a pixel index, is in
* the range 0..7, the result is 1 if the pixel is to be copied in the
* pass, 0 if not. 'S' is for the sparkle method, 'B' for the block
* method.
*/
# define S_COPY(p,x) (((p)<4 ? 0x80088822 >> ((3-(p))*8+(7-(x))) :\
0xaa55ff00 >> ((7-(p))*8+(7-(x)))) & 1)
# define B_COPY(p,x) (((p)<4 ? 0xff0fff33 >> ((3-(p))*8+(7-(x))) :\
0xff55ff00 >> ((7-(p))*8+(7-(x)))) & 1)
# ifdef PNG_READ_PACKSWAP_SUPPORTED
if (png_ptr->transformations & PNG_PACKSWAP)
{
/* The bytes have been swapped; start at the other end and
* move in the opposite direction.
*/
shift = 0;
/* inc is already correct */
}
else
# endif
/* Return a mask for pass 'p' pixel 'x' at depth 'd'. The mask is
* little endian - the first pixel is at bit 0 - however the extra
* parameter 's' can be set to cause the mask position to be swapped
* within each byte, to match the PNG format. This is done by XOR of
* the shift with 7, 6 or 4 for bit depths 1, 2 and 4.
*/
# define PIXEL_MASK(p,x,d,s) (((1U<<(d))-1)<<(((x)*(d))^((s)?8-(d):0)))
/* Bits not swapped: normal case */
{
shift = 8 - inc;
inc = -inc; /* but note, unsigned */
}
/* Hence generate the appropriate 'block' or 'sparkle' pixel copy mask.
*/
# define S_MASKx(p,x,d,s) (S_COPY(p,x)?PIXEL_MASK(p,x,d,s):0)
# define B_MASKx(p,x,d,s) (B_COPY(p,x)?PIXEL_MASK(p,x,d,s):0)
for (;;)
{
m >>= 1;
/* Combine 8 of these to get the full mask. For the 1 and 2 bpp cases
* the result needs replicating, for the 4bpp case the above generates
* a full 32 bits.
*/
# define MASK_EXPAND(m,d) ((m)*((d)==1?0x01010101:((d)==2?0x00010001:1)))
if (m == 1)
m = mask;
# define S_MASK(p,d,s) MASK_EXPAND(S_MASKx(p,0,d,s) + S_MASKx(p,1,d,s) +\
S_MASKx(p,2,d,s) + S_MASKx(p,3,d,s) + S_MASKx(p,4,d,s) +\
S_MASKx(p,5,d,s) + S_MASKx(p,6,d,s) + S_MASKx(p,7,d,s), d)
if (m & 1)
{
/* Find the bits to select and copy those over: */
unsigned int bit_mask = pixel_mask << shift;
*dp = (png_byte)((*dp & ~bit_mask) | (*sp & bit_mask));
}
# define B_MASK(p,d,s) MASK_EXPAND(B_MASKx(p,0,d,s) + B_MASKx(p,1,d,s) +\
B_MASKx(p,2,d,s) + B_MASKx(p,3,d,s) + B_MASKx(p,4,d,s) +\
B_MASKx(p,5,d,s) + B_MASKx(p,6,d,s) + B_MASKx(p,7,d,s), d)
if (--row_width == 0)
break;
#if PNG_USE_COMPILE_TIME_MASKS
/* Utility macros to construct all the masks for a depth/swap
* combination. The 's' parameter says whether the format is PNG
* (big endian bytes) or not. Only the three odd numbered passes are
* required for the display/block algorithm.
*/
# define S_MASKS(d,s) { S_MASK(0,d,s), S_MASK(1,d,s), S_MASK(2,d,s),\
S_MASK(3,d,s), S_MASK(4,d,s), S_MASK(5,d,s) }
/* And move to the next set of bits, checking for the end of this
* byte.
*/
shift += inc;
if (shift > 7) /* because it is unsigned */
{
++sp;
++dp;
}
shift &= 7;
}
}
# define B_MASKS(d,s) { B_MASK(1,d,s), S_MASK(3,d,s), S_MASK(5,d,s) }
else /* pixel_depth >= 8 */
{
unsigned int m;
# define DEPTH_INDEX(d) ((d)==1?0:((d)==2?1:2))
pixel_depth >>= 3; /* now in bytes */
m = mask << 1;
/* Hence the pre-compiled masks indexed by PACKSWAP (or not), depth and
* then pass:
*/
static PNG_CONST png_uint_32 row_mask[2/*PACKSWAP*/][3/*depth*/][6] = {
/* Little-endian byte masks for PACKSWAP */
{ S_MASKS(1,0), S_MASKS(2,0), S_MASKS(4,0) },
/* Normal (big-endian byte) masks - PNG format */
{ S_MASKS(1,1), S_MASKS(2,1), S_MASKS(4,1) }
};
/* This is here to give the compiler some help in the common cases
* where there are very few bytes.
*/
if (pixel_depth == 1)
{
do
{
m >>= 1;
/* display_mask has only three entries for the odd passes, so index by
* pass>>1.
*/
static PNG_CONST png_uint_32 display_mask[2][3][3] = {
/* Little-endian byte masks for PACKSWAP */
{ B_MASKS(1,0), B_MASKS(2,0), B_MASKS(4,0) },
/* Normal (big-endian byte) masks - PNG format */
{ B_MASKS(1,1), B_MASKS(2,1), B_MASKS(4,1) }
};
if (m == 1)
m = mask;
# define MASK(pass,depth,display,png)\
((display)?display_mask[png][DEPTH_INDEX(depth)][pass>>1]:\
row_mask[png][DEPTH_INDEX(depth)][pass])
if (m & 1)
*dp = *sp;
#else /* !PNG_USE_COMPILE_TIME_MASKS */
/* This is the runtime alternative: it seems unlikely that this will
* ever be either smaller or faster than the compile time approach.
*/
# define MASK(pass,depth,display,png)\
((display)?B_MASK(pass,depth,png):S_MASK(pass,depth,png))
#endif /* !PNG_USE_COMPILE_TIME_MASKS */
++dp;
++sp;
}
while (--row_width > 0);
}
/* Use the appropriate mask to copy the required bits. In some cases
* the byte mask will be 0 or 0xff, optimize these cases. row_width is
* the number of pixels, but the code copies bytes, so it is necessary
* to special case the end.
*/
png_uint_32 pixels_per_byte = 8 / pixel_depth;
png_uint_32 mask;
else if (pixel_depth == 3)
{
do
{
m >>= 1;
if (m == 1)
m = mask;
if (m & 1)
dp[0] = sp[0], dp[1] = sp[1], dp[2] = sp[2];
dp += 3;
sp += 3;
}
while (--row_width > 0);
}
/* This is a common optimization for 2 and 4 byte pixels, for other
* values rely on the toolchain memcpy being optimized.
*/
else if (pixel_depth == sizeof (png_uint_16) &&
png_isaligned(sp, png_uint_16) && png_isaligned(dp, png_uint_16))
{
png_uint_16p dp16 = (png_uint_16p)dp;
png_uint_16p sp16 = (png_uint_16p)sp;
do
{
m >>= 1;
if (m == 1)
m = mask;
if (m & 1)
*dp16 = *sp16;
++dp16;
++sp16;
}
while (--row_width > 0);
}
else if (pixel_depth == sizeof (png_uint_32) &&
png_isaligned(sp, png_uint_32) && png_isaligned(dp, png_uint_32))
{
png_uint_32p dp32 = (png_uint_32p)dp;
png_uint_32p sp32 = (png_uint_32p)sp;
do
{
m >>= 1;
if (m == 1)
m = mask;
if (m & 1)
*dp32 = *sp32;
++dp32;
++sp32;
}
while (--row_width > 0);
}
# ifdef PNG_READ_PACKSWAP_SUPPORTED
if (png_ptr->transformations & PNG_PACKSWAP)
mask = MASK(pass, pixel_depth, display, 0);
else
# endif
mask = MASK(pass, pixel_depth, display, 1);
for (;;)
{
png_uint_32 m;
/* It doesn't matter in the following if png_uint_32 has more than
* 32 bits because the high bits always match those in m<<24, it is,
* however, essential to use OR here, not +, because of this.
*/
m = mask;
            mask = (m >> 8) | (m << 24); /* rotate right; good compilers emit a rotate */
m &= 0xff;
if (m != 0) /* something to copy */
{
do
{
m >>= 1;
if (m == 1)
m = mask;
if (m & 1)
png_memcpy(dp, sp, pixel_depth);
sp += pixel_depth;
dp += pixel_depth;
}
while (--row_width > 0);
if (m != 0xff)
*dp = (png_byte)((*dp & ~m) | (*sp & m));
else
*dp = *sp;
}
/* NOTE: this may overwrite the last byte with garbage if the image
* is not an exact number of bytes wide, libpng has always done
* this.
*/
if (row_width <= pixels_per_byte)
return;
row_width -= pixels_per_byte;
++dp;
++sp;
}
}
else /* pixel_depth >= 8 */
{
unsigned int bytes_to_copy, bytes_to_jump;
/* Validate the depth - it must be a multiple of 8 */
if (pixel_depth & 7)
png_error(png_ptr, "invalid user transform pixel depth");
pixel_depth >>= 3; /* now in bytes */
row_width *= pixel_depth;
/* Regardless of pass number the Adam 7 interlace always results in a
* fixed number of pixels to copy then to skip. There may be a
* different number of pixels to skip at the start though.
*/
{
unsigned int offset = PNG_PASS_START_COL(pass) * pixel_depth;
row_width -= offset;
dp += offset;
sp += offset;
}
return;
}
/* else mask is 0xff */
/* Work out the bytes to copy. */
if (display)
{
/* When doing the 'block' algorithm the pixel in the pass gets
* replicated to adjacent pixels. This is why the even (0,2,4,6)
* passes are skipped above - the entire expanded row is copied.
*/
bytes_to_copy = (1<<((6-pass)>>1)) * pixel_depth;
/* But don't allow this number to exceed the actual row width. */
if (bytes_to_copy > row_width)
bytes_to_copy = row_width;
}
else /* normal row; Adam7 only ever gives us one pixel to copy. */
bytes_to_copy = pixel_depth;
/* In Adam7 there is a constant offset between where the pixels go. */
bytes_to_jump = PNG_PASS_COL_OFFSET(pass) * pixel_depth;
/* And simply copy these bytes. Some optimization is possible here,
          * depending on the value of 'bytes_to_copy'.  Special case the low
* byte counts, which we know to be frequent.
*/
switch (bytes_to_copy)
{
case 1:
for (;;)
{
*dp = *sp;
if (row_width <= bytes_to_jump)
return;
dp += bytes_to_jump;
sp += bytes_to_jump;
row_width -= bytes_to_jump;
}
case 2:
/* There is a possibility of a partial copy at the end here, this
* slows the code down somewhat.
*/
do
{
dp[0] = sp[0], dp[1] = sp[1];
if (row_width <= bytes_to_jump)
return;
sp += bytes_to_jump;
dp += bytes_to_jump;
row_width -= bytes_to_jump;
}
while (row_width > 1);
/* And there can only be one byte left at this point: */
*dp = *sp;
return;
case 3:
/* This can only be the RGB case, so each copy is exactly one
* pixel and it is not necessary to check for a partial copy.
*/
for(;;)
{
dp[0] = sp[0], dp[1] = sp[1], dp[2] = sp[2];
if (row_width <= bytes_to_jump)
return;
sp += bytes_to_jump;
dp += bytes_to_jump;
row_width -= bytes_to_jump;
}
default:
#if PNG_ALIGN_TYPE != PNG_ALIGN_NONE
/* Check for double byte alignment and, if possible, use a 16
* bit copy. Don't attempt this for narrow images - ones that
* are less than an interlace panel wide. Don't attempt it for
* wide bytes-to-copy either - use the memcpy there.
*/
if (bytes_to_copy < 16 /*else use memcpy*/ &&
png_isaligned(dp, png_uint_16) &&
png_isaligned(sp, png_uint_16) &&
bytes_to_copy % sizeof (png_uint_16) == 0 &&
bytes_to_jump % sizeof (png_uint_16) == 0)
{
/* Everything is aligned for png_uint_16 copies, but try for
* png_uint_32 first.
*/
if (png_isaligned(dp, png_uint_32) &&
png_isaligned(sp, png_uint_32) &&
bytes_to_copy % sizeof (png_uint_32) == 0 &&
bytes_to_jump % sizeof (png_uint_32) == 0)
{
png_uint_32p dp32 = (png_uint_32p)dp;
png_const_uint_32p sp32 = (png_const_uint_32p)sp;
unsigned int skip = (bytes_to_jump-bytes_to_copy) /
sizeof (png_uint_32);
do
{
size_t c = bytes_to_copy;
do
{
*dp32++ = *sp32++;
c -= sizeof (png_uint_32);
}
while (c > 0);
if (row_width <= bytes_to_jump)
return;
dp32 += skip;
sp32 += skip;
row_width -= bytes_to_jump;
}
while (bytes_to_copy <= row_width);
/* Get to here when the row_width truncates the final copy.
* There will be 1-3 bytes left to copy, so don't try the
* 16bit loop below.
*/
dp = (png_bytep)dp32;
sp = (png_const_bytep)sp32;
do
*dp++ = *sp++;
while (--row_width > 0);
return;
}
/* Else do it in 16 bit quantities, but only if the size is
* not too large.
*/
else
{
png_uint_16p dp16 = (png_uint_16p)dp;
png_const_uint_16p sp16 = (png_const_uint_16p)sp;
unsigned int skip = (bytes_to_jump-bytes_to_copy) /
sizeof (png_uint_16);
do
{
size_t c = bytes_to_copy;
do
{
*dp16++ = *sp16++;
c -= sizeof (png_uint_16);
}
while (c > 0);
if (row_width <= bytes_to_jump)
return;
dp16 += skip;
sp16 += skip;
row_width -= bytes_to_jump;
}
while (bytes_to_copy <= row_width);
/* End of row - 1 byte left, bytes_to_copy>row_width: */
dp = (png_bytep)dp16;
sp = (png_const_bytep)sp16;
do
*dp++ = *sp++;
while (--row_width > 0);
return;
}
}
#endif /* PNG_ALIGN_ code */
/* The true default - use a memcpy: */
for (;;)
{
png_memcpy(dp, sp, bytes_to_copy);
if (row_width <= bytes_to_jump)
return;
sp += bytes_to_jump;
dp += bytes_to_jump;
row_width -= bytes_to_jump;
if (bytes_to_copy > row_width)
bytes_to_copy = row_width;
}
}
} /* pixel_depth >= 8 */
/* NOT REACHED*/
}
else
#endif
/* If here then the switch above wasn't used so just memcpy the whole row
* from the temporary row buffer:
* from the temporary row buffer (notice that this overwrites the end of the
* destination row if it is a partial byte.)
*/
png_memcpy(dp, sp, PNG_ROWBYTES(pixel_depth, row_width));
}