/*
* Copyright © 2008 Rodrigo Kumpera
* Copyright © 2008 André Tupinambá
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
* the above copyright notice appear in all copies and that both that
* copyright notice and this permission notice appear in supporting
* documentation, and that the name of Red Hat not be used in advertising or
* publicity pertaining to distribution of the software without specific,
* written prior permission. Red Hat makes no representations about the
* suitability of this software for any purpose. It is provided "as is"
* without express or implied warranty.
*
* THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
* SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
* SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
* SOFTWARE.
*
* Author: Rodrigo Kumpera ([email protected])
*         André Tupinambá ([email protected])
*
* Based on work by Owen Taylor and Søren Sandmann
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"
/*
 * File-scope vectors read by the helpers in this file.  None of them has
 * an initializer here, so each starts out zeroed and has to be filled in
 * before the helpers are used; the code doing that is not in this part of
 * the file.
 */

/* Read by pix_multiply_2x128 (). */
static __m128i mask_0080;
static __m128i mask_0101;

/* Read by negate_2x128 (). */
static __m128i mask_00ff;

/* Not referenced by the helpers visible in this part of the file. */
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

/* Read by pack_565_2x128_128 (). */
static __m128i mask_565_r;
static __m128i mask_565_g1;
static __m128i mask_565_g2;
static __m128i mask_565_b;

/* Read by unpack_565_to_8888 (). */
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;
static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;
/*
 * Widen one packed 32-bit value to 16 bits per byte.
 *
 * The four bytes of `data` are zero-extended into the four lowest 16-bit
 * lanes of the result (least significant byte in lane 0); the upper
 * 64 bits of the result are zero.
 */
static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    const __m128i zero = _mm_setzero_si128 ();
    const __m128i pixel = _mm_cvtsi32_si128 (data);

    return _mm_unpacklo_epi8 (pixel, zero);
}
/*
 * Split a register of sixteen bytes into two registers of eight
 * zero-extended 16-bit lanes each.
 *
 * data:    the packed bytes to widen
 * data_lo: receives bytes 0..7 of `data`, one per 16-bit lane
 * data_hi: receives bytes 8..15 of `data`, one per 16-bit lane
 */
static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    const __m128i zero = _mm_setzero_si128 ();

    *data_lo = _mm_unpacklo_epi8 (data, zero);
    *data_hi = _mm_unpackhi_epi8 (data, zero);
}
/*
 * Spread the fields of 16-bit pixels, one pixel per 32-bit lane, out to
 * wider channel positions.
 *
 * Three shifted copies of the input (left by 8, 5 and 3 bits) are isolated
 * with mask_red, mask_green and mask_blue.  The low bits opened up by the
 * shifts are then filled in: the bits of red|blue selected by
 * mask_565_fix_rb are OR-ed back in shifted right by 5, and the bits of
 * green selected by mask_565_fix_g shifted right by 6.
 *
 * NOTE(review): the mask values are assigned outside this part of the
 * file.  Going by the names this is an r5g6b5 -> x8r8g8b8 expansion with
 * top-bit replication -- confirm against the mask initializers.
 */
static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    const __m128i red   = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    const __m128i green = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    const __m128i blue  = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
    const __m128i red_blue = _mm_or_si128 (red, blue);

    /* Low-order fill bits taken from the top of each channel. */
    const __m128i red_blue_fill =
        _mm_srli_epi32 (_mm_and_si128 (red_blue, mask_565_fix_rb), 5);
    const __m128i green_fill =
        _mm_srli_epi32 (_mm_and_si128 (green, mask_565_fix_g), 6);

    return _mm_or_si128 (_mm_or_si128 (red_blue, red_blue_fill),
                         _mm_or_si128 (green, green_fill));
}
static force_inline void
unpack_565_128_4x128 (__m128i data,
__m128i* data0,
__m128i* data1,
__m128i* data2,
__m128i* data3)
{
__m128i lo, hi;
lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
lo = unpack_565_to_8888 (lo);
hi = unpack_565_to_8888 (hi);
unpack_128_2x128 (lo, data0, data1);
unpack_128_2x128 (hi, data2, data3);
}
/*
 * Reduce one 32-bit pixel to a 16-bit 5-6-5 value.
 *
 * Keeps the top 5 bits of byte 2, the top 6 bits of byte 1 and the top
 * 5 bits of byte 0 of `pixel`; byte 3 is discarded.
 */
static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    const uint32_t field_hi  = (pixel >> 8) & 0xf800;   /* bits 23..19 -> 15..11 */
    const uint32_t field_mid = (pixel >> 5) & 0x07e0;   /* bits 15..10 -> 10..5  */
    const uint32_t field_lo  = (pixel >> 3) & 0x001f;   /* bits  7..3  ->  4..0  */

    return (uint16_t) (field_hi | field_mid | field_lo);
}
/*
 * Narrow two registers of 16-bit lanes into one register of bytes.
 *
 * Each lane is read as a signed 16-bit value and clamped to 0..255.
 * The lanes of `lo` become bytes 0..7 of the result and the lanes of
 * `hi` become bytes 8..15.
 */
static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    const __m128i packed = _mm_packus_epi16 (lo, hi);

    return packed;
}
static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
__m128i data;
__m128i r, g1, g2, b;
data = pack_2x128_128 (lo, hi);
r = _mm_and_si128 (data, mask_565_r);
g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}
static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
pack_565_2x128_128 (*xmm2, *xmm3));
}
/*
 * Return 1 when the most significant byte of every 32-bit lane of `x`
 * is 0xff, and 0 otherwise.  The other three bytes of each lane are
 * ignored.
 */
static force_inline int
is_opaque (__m128i x)
{
    /* Comparing a register with itself yields all bits set. */
    const __m128i all_ones = _mm_cmpeq_epi8 (x, x);
    const int ff_bytes = _mm_movemask_epi8 (_mm_cmpeq_epi8 (x, all_ones));

    /* Bits 3, 7, 11 and 15 stand for the top byte of each 32-bit lane. */
    return (ff_bytes & 0x8888) == 0x8888;
}
/*
 * Return 1 when all 128 bits of `x` are zero, and 0 otherwise.
 */
static force_inline int
is_zero (__m128i x)
{
    const __m128i zero_bytes = _mm_cmpeq_epi8 (x, _mm_setzero_si128 ());

    /* One mask bit per byte: all sixteen must report "equal to zero". */
    return _mm_movemask_epi8 (zero_bytes) == 0xffff;
}
/*
 * Return 1 when the most significant byte of every 32-bit lane of `x`
 * is zero, and 0 otherwise.  The other three bytes of each lane are
 * ignored.
 */
static force_inline int
is_transparent (__m128i x)
{
    const __m128i zero_bytes = _mm_cmpeq_epi8 (x, _mm_setzero_si128 ());
    const int zero_bits = _mm_movemask_epi8 (zero_bytes);

    /* Bits 3, 7, 11 and 15 stand for the top byte of each 32-bit lane. */
    return (zero_bits & 0x8888) == 0x8888;
}
static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}
/*
 * Broadcast the highest 16-bit lane of each 64-bit half across that half:
 * lane 3 is copied over lanes 0..3 and lane 7 over lanes 4..7.
 */
static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    /* Low half first; the high half passes through this step untouched. */
    const __m128i low_done =
        _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 3, 3, 3));

    return _mm_shufflehi_epi16 (low_done, _MM_SHUFFLE (3, 3, 3, 3));
}
/*
 * Two-register form of the lane-3 broadcast.
 *
 * For each input, 16-bit lane 3 is copied over lanes 0..3 and lane 7 over
 * lanes 4..7.
 *
 * alpha_lo: receives the broadcast of `data_lo`
 * alpha_hi: receives the broadcast of `data_hi`
 */
static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    /* The shuffle selector has to be a compile-time constant. */
    enum { TOP_LANE = _MM_SHUFFLE (3, 3, 3, 3) };

    *alpha_lo = _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data_lo, TOP_LANE),
                                     TOP_LANE);
    *alpha_hi = _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data_hi, TOP_LANE),
                                     TOP_LANE);
}
/*
 * Two-register form of the lane-0 broadcast.
 *
 * For each input, 16-bit lane 0 is copied over lanes 0..3 and lane 4 over
 * lanes 4..7, i.e. the lowest lane of each 64-bit half fills that half.
 *
 * alpha_lo: receives the broadcast of `data_lo`
 * alpha_hi: receives the broadcast of `data_hi`
 */
static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    /* The shuffle selector has to be a compile-time constant. */
    enum { BOTTOM_LANE = _MM_SHUFFLE (0, 0, 0, 0) };

    *alpha_lo = _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data_lo, BOTTOM_LANE),
                                     BOTTOM_LANE);
    *alpha_hi = _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data_hi, BOTTOM_LANE),
                                     BOTTOM_LANE);
}
/*
 * Lane-wise multiply of two register pairs holding 16-bit lanes.
 *
 * For every 16-bit lane the result is
 *     high16 ((low16 (data * alpha) +sat mask_0080) * mask_0101)
 * where +sat is an unsigned saturating add.
 *
 * NOTE(review): the mask values are assigned outside this part of the
 * file.  If the lanes hold 0x0080 and 0x0101, as the names suggest, this
 * is the usual rounded (data * alpha) / 255 for 8-bit channel values --
 * confirm against the mask initializers.
 *
 * All four inputs are read before either output is written, so ret_lo /
 * ret_hi may point at the same registers as the inputs.
 */
static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    const __m128i biased_lo =
        _mm_adds_epu16 (_mm_mullo_epi16 (*data_lo, *alpha_lo), mask_0080);
    const __m128i biased_hi =
        _mm_adds_epu16 (_mm_mullo_epi16 (*data_hi, *alpha_hi), mask_0080);

    *ret_lo = _mm_mulhi_epu16 (biased_lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (biased_hi, mask_0101);
}
/*
 * Compute src * alpha_dst + dst * alpha_src on two register pairs.
 *
 * Both products come from pix_multiply_2x128 (); they are summed with an
 * unsigned saturating byte-wise add.  The products are held in locals
 * until both are done, so ret_lo / ret_hi may point at the same registers
 * as the inputs.
 */
static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i src_term_lo, src_term_hi;
    __m128i dst_term_lo, dst_term_hi;

    pix_multiply_2x128 (src_lo, src_hi,
                        alpha_dst_lo, alpha_dst_hi,
                        &src_term_lo, &src_term_hi);
    pix_multiply_2x128 (dst_lo, dst_hi,
                        alpha_src_lo, alpha_src_hi,
                        &dst_term_lo, &dst_term_hi);

    *ret_lo = _mm_adds_epu8 (src_term_lo, dst_term_lo);
    *ret_hi = _mm_adds_epu8 (src_term_hi, dst_term_hi);
}
/*
 * XOR both registers with mask_00ff, flipping exactly the bits set in
 * that mask.
 *
 * NOTE(review): the mask value is assigned outside this part of the
 * file.  If every 16-bit lane holds 0x00ff, as the name suggests, this
 * turns an 8-bit channel value v stored in a 16-bit lane into 255 - v --
 * confirm against the mask initializer.
 *
 * neg_lo: receives data_lo ^ mask_00ff
 * neg_hi: receives data_hi ^ mask_00ff
 */
static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (mask_00ff, data_lo);
    *neg_hi = _mm_xor_si128 (mask_00ff, data_hi);
}
static force_inline void
invert_colors_2x128 (__m128i data_lo,
__m128i data_hi,
__m128i* inv_lo,
__m128i* inv_hi)
{
__m128i lo, hi;
lo = _mm_shufflelo_ep