A few missing SSE intrinsics

C and C++ programmers already have many SSE intrinsics available to them. Most of those map straightforwardly to their matching hardware instructions, but there are holes where the hardware doesn't natively support a given operation or type. For instance, there's an _mm_cmpgt_epi8 intrinsic, but not an _mm_cmpgt_epu8. The first one exists because there's a hardware primitive; the second one is missing because its implementation would have to be a composite.

The set of intrinsics that we do have is quite rich, and can be combined in many ways to make up for the ones we don't. Because SSE programming is tricky enough as it is, I'm a big fan of building an ever-growing set of small, self-contained "primitives" that are composited inline into ever larger functional units. We create the building blocks, forge them into subassemblies, and then let the compiler inline everything into an optimized function body.

This page contains some of the new "missing intrinsics" that I built on top of the existing ones, and gradually also on top of the ones defined earlier on this page. I purposefully followed the existing naming conventions for type and operator names, so that these functions act as a kind of "polyfill" for the holes in the existing set (to borrow a JavaScript term). The naming convention probably violates a standard somewhere, but I think it's defensible: most of these functions are small enough to be considered almost-primitives, some of the original intrinsics are also implemented as composites, and most operate in the same way as the actual intrinsics, namely in vertical fashion (x0 and y0 form a pair, x1 and y1 form a pair, and so on). With these new "primitives", SSE code becomes easier to read, test and reason about.
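
To give a taste of how these compose, here's a quick sketch (a hypothetical helper, not part of the set below) that counts the bytes in one buffer that are less than or equal to their counterparts in another, using the _mm_cmple_epu8 polyfill defined further down. It assumes the usual headers (<emmintrin.h>, <stdint.h>, <stddef.h>) are included, both buffers are 16-byte aligned, the length is a multiple of sixteen, and __builtin_popcount (the GCC/Clang builtin) is available:

static inline size_t
count_le (const uint8_t *a, const uint8_t *b, size_t len)
{
	size_t count = 0;

	for (size_t i = 0; i < len; i += 16) {
		__m128i x = _mm_load_si128((const __m128i *) (a + i));
		__m128i y = _mm_load_si128((const __m128i *) (b + i));

		// 0xFF where a <= b, 0x00 elsewhere:
		__m128i mask = _mm_cmple_epu8(x, y);

		// Take the MSB of every byte and count the set bits:
		count += __builtin_popcount(_mm_movemask_epi8(mask));
	}
	return count;
}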

License

These are fundamental building blocks as far as I'm concerned, and I wouldn't want to claim any rights to them. For what it's worth, I hereby place them in the public domain. Do what you want with this stuff.

GitHub repo

There's a small GitHub repo for this code, which also contains a small test harness for checking correctness.

July 14, 2014

_mm_cmple_epu8

static inline __m128i
_mm_cmple_epu8 (__m128i x, __m128i y)
{
	// Returns 0xFF where x <= y:
	return _mm_cmpeq_epi8(_mm_min_epu8(x, y), x);
}

Compare each of the 8-bit unsigned ints in x to those in y and set the result to 0xFF where x <= y. Equivalent to taking the minimum of both vectors and checking where x equals this minimum.

R0  = (x0 <= y0) ? 0xFF : 0x00
R1  = (x1 <= y1) ? 0xFF : 0x00
…
R15 = (x15 <= y15) ? 0xFF : 0x00
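
Because it's so close to a primitive, it's also easy to verify against a scalar reference. A rough sketch of such a check (a hypothetical helper, not the actual harness from the repo; assumes <emmintrin.h>, <stdint.h> and <string.h>):

static int
test_cmple_epu8 (const uint8_t x[16], const uint8_t y[16])
{
	uint8_t expect[16], actual[16];

	for (int i = 0; i < 16; i++)
		expect[i] = (x[i] <= y[i]) ? 0xFF : 0x00;

	_mm_storeu_si128((__m128i *) actual,
		_mm_cmple_epu8(
			_mm_loadu_si128((const __m128i *) x),
			_mm_loadu_si128((const __m128i *) y)));

	// Returns nonzero when the vector version matches the scalar one:
	return memcmp(expect, actual, 16) == 0;
}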

_mm_cmpge_epu8

static inline __m128i
_mm_cmpge_epu8 (__m128i x, __m128i y)
{
	// Returns 0xFF where x >= y:
	return _mm_cmple_epu8(y, x);
}

Compare each of the 8-bit unsigned ints in x to those in y and set the result to 0xFF where x >= y. Equivalent to calling _mm_cmple_epu8 above with the arguments swapped.

R0  = (x0 >= y0) ? 0xFF : 0x00
R1  = (x1 >= y1) ? 0xFF : 0x00
…
R15 = (x15 >= y15) ? 0xFF : 0x00

_mm_cmpgt_epu8

static inline __m128i
_mm_cmpgt_epu8 (__m128i x, __m128i y)
{
	// Returns 0xFF where x > y:
	return _mm_andnot_si128(
		_mm_cmpeq_epi8(x, y),
		_mm_cmpeq_epi8(_mm_max_epu8(x, y), x)
	);
}

Compare each of the 8-bit unsigned ints in x to those in y and set the result to 0xFF where x > y. Equivalent to checking whether x is equal to the maximum of x and y, but not equal to y itself.

R0  = (x0 > y0) ? 0xFF : 0x00
R1  = (x1 > y1) ? 0xFF : 0x00
…
R15 = (x15 > y15) ? 0xFF : 0x00

_mm_cmplt_epu8

static inline __m128i
_mm_cmplt_epu8 (__m128i x, __m128i y)
{
	// Returns 0xFF where x < y:
	return _mm_cmpgt_epu8(y, x);
}

Compare each of the 8-bit unsigned ints in x to those in y and set the result to 0xFF where x < y. Equivalent to calling _mm_cmpgt_epu8 above with swapped arguments.

R0  = (x0 < y0) ? 0xFF : 0x00
R1  = (x1 < y1) ? 0xFF : 0x00
…
R15 = (x15 < y15) ? 0xFF : 0x00

_mm_cmple_epu16

static inline __m128i
_mm_cmple_epu16 (__m128i x, __m128i y)
{
	// Returns 0xFFFF where x <= y:
	return _mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128());
}

Compare each of the eight 16-bit unsigned ints in x to those in y and set the result to 0xFFFF where x <= y. Equivalent to doing the saturating subtraction x − y and checking where the result is zero; since saturating subtraction clamps negative results to zero, a zero result means x was less than or equal to y. For example, subs(3, 7) = 0, so 3 <= 7; but subs(7, 3) = 4, which is nonzero, so 7 is not <= 3.

R0 = (x0 <= y0) ? 0xFFFF : 0x0000
R1 = (x1 <= y1) ? 0xFFFF : 0x0000
…
R7 = (x7 <= y7) ? 0xFFFF : 0x0000

_mm_cmpge_epu16

static inline __m128i
_mm_cmpge_epu16 (__m128i x, __m128i y)
{
	// Returns 0xFFFF where x >= y:
	return _mm_cmple_epu16(y, x);
}

Compare each of the eight 16-bit unsigned ints in x to those in y and set the result to 0xFFFF where x >= y. Equivalent to calling _mm_cmple_epu16 above with the arguments swapped.

R0 = (x0 >= y0) ? 0xFFFF : 0x0000
R1 = (x1 >= y1) ? 0xFFFF : 0x0000
…
R7 = (x7 >= y7) ? 0xFFFF : 0x0000

_mm_cmpgt_epu16

static inline __m128i
_mm_cmpgt_epu16 (__m128i x, __m128i y)
{
	// Returns 0xFFFF where x > y:
	return _mm_andnot_si128(_mm_cmpeq_epi16(x, y), _mm_cmple_epu16(y, x));
}

Compare each of the eight 16-bit unsigned ints in x to those in y and set the result to 0xFFFF where x > y. Equivalent to checking that x is at least y, but not equal to y.

R0 = (x0 > y0) ? 0xFFFF : 0x0000
R1 = (x1 > y1) ? 0xFFFF : 0x0000
…
R7 = (x7 > y7) ? 0xFFFF : 0x0000

_mm_cmplt_epu16

static inline __m128i
_mm_cmplt_epu16 (__m128i x, __m128i y)
{
	// Returns 0xFFFF where x < y:
	return _mm_cmpgt_epu16(y, x);
}

Compare each of the 16-bit unsigned ints in x to those in y and set the result to 0xFFFF where x < y. Equivalent to calling _mm_cmpgt_epu16 above with swapped arguments.

R0 = (x0 < y0) ? 0xFFFF : 0x0000
R1 = (x1 < y1) ? 0xFFFF : 0x0000
…
R7 = (x7 < y7) ? 0xFFFF : 0x0000

_mm_cmpge_epi16

static inline __m128i
_mm_cmpge_epi16 (__m128i x, __m128i y)
{
	// Returns 0xFFFF where x >= y:
	return _mm_or_si128(_mm_cmpeq_epi16(x, y), _mm_cmpgt_epi16(x, y));
}

Compare each of the eight 16-bit signed ints in x to those in y and set the result to 0xFFFF where x >= y. Equivalent to checking whether x is either larger than or equal to y.

R0 = (x0 >= y0) ? 0xFFFF : 0x0000
R1 = (x1 >= y1) ? 0xFFFF : 0x0000
…
R7 = (x7 >= y7) ? 0xFFFF : 0x0000

_mm_not_si128

static inline __m128i
_mm_not_si128 (__m128i x)
{
	// Returns ~x, the bitwise complement of x:
	return _mm_xor_si128(x, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
}

Returns ~x, the bitwise complement (inverse) of x. Perform an xor of every bit with 1, because 0 xor 1 = 1 and 1 xor 1 = 0; the all-ones constant is generated by comparing a zero register with itself for equality. This function can usually be avoided by applying De Morgan's laws to your logic statements, so that they use the native _mm_andnot_si128 instead.

R0 = ~x0
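
For instance, masking with the complement of a condition doesn't need an explicit inversion. A small sketch of the rewrite (hypothetical helpers):

static inline __m128i
keep_where_not_naive (__m128i x, __m128i mask)
{
	// Keep the bytes of x where the mask does NOT hold, the obvious way:
	return _mm_and_si128(_mm_not_si128(mask), x);
}

static inline __m128i
keep_where_not (__m128i x, __m128i mask)
{
	// Same result with the native andnot, one operation fewer:
	return _mm_andnot_si128(mask, x);
}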

_mm_setone_epi8

static inline __m128i
_mm_setone_epi8 (void)
{
	// Returns a vector where every byte has the value 1:
	__m128i x = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());
	return _mm_xor_si128(_mm_add_epi8(x, x), x);
}

Just like _mm_setzero_si128 sets everything to zero, this function sets all bytes in the vector to the value 1 without touching memory: start with all ones (0xFF in every byte), add the vector to itself to get 0xFE, then xor with the all-ones vector to get 0x01. A vector where every lane is 1 can be useful for incrementing a set of counters or performing a shift. However, a _mm_set1_epi8(1) will probably still be faster.

R0  = 0x01
R1  = 0x01
…
R15 = 0x01

_mm_setone_epi16

static inline __m128i
_mm_setone_epi16 (void)
{
	// Returns a vector where every word has the value 1:
	__m128i x = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());
	return _mm_xor_si128(_mm_add_epi16(x, x), x);
}

See above. Sets all words in the vector to the value 1 without touching memory. A _mm_set1_epi16(1) is probably faster.

R0 = 0x0001
R1 = 0x0001
…
R7 = 0x0001

_mm_blendv_si128

static inline __m128i
_mm_blendv_si128 (__m128i x, __m128i y, __m128i mask)
{
	// Replace bit in x with bit in y when matching bit in mask is set:
	return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(mask, y));
}

Copies the bits from x to the output where mask is not set, and the bits from y where mask is set. Combines the values in x with those in y based on the bitmask. This is useful for replacing branching logic with masking logic, because the SSE truth operations return all-ones for the bytes where a given condition holds.

R0 = (x0 & ~mask) | (y0 & mask)
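
A small sketch of what that looks like in practice: a per-byte version of "r = (x < y) ? a : b", with the branch replaced by a mask (a hypothetical helper built on the functions above):

static inline __m128i
select_lt_epu8 (__m128i x, __m128i y, __m128i a, __m128i b)
{
	// 0xFF where x < y, 0x00 elsewhere:
	__m128i mask = _mm_cmplt_epu8(x, y);

	// Take the byte from a where the mask is set, else from b:
	return _mm_blendv_si128(b, a, mask);
}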

_mm_blendv_epi8

#ifndef __SSE4_1__
static inline __m128i
_mm_blendv_epi8 (__m128i x, __m128i y, __m128i mask)
{
	// Replace byte in x with byte in y when the MSB of the corresponding
	// byte in mask is set:
	return _mm_blendv_si128(x, y, _mm_cmplt_epi8(mask, _mm_setzero_si128()));
}
#endif

This function was introduced as a hardware primitive in SSE 4.1. For each of the 16 bytes, copy either the byte from x or the byte from y to the output, depending on whether the matching byte in mask has its most significant bit (MSB) set. If the MSB is set, the byte in y is copied, else the one from x is passed through.

In practice with older SSE versions, when you're using logic masks in which each byte is known to be 0x00 or 0xFF, it's faster to use _mm_blendv_si128.

R0  = (mask0 & 0x80) ? y0 : x0
R1  = (mask1 & 0x80) ? y1 : x1
…
R15 = (mask15 & 0x80) ? y15 : x15

_mm_min_epu16

#ifndef __SSE4_1__
static inline __m128i
_mm_min_epu16 (__m128i x, __m128i y)
{
	// Returns x where x <= y, else y:
	return _mm_blendv_si128(y, x, _mm_cmple_epu16(x, y));
}
#endif

Compare each of the eight 16-bit unsigned ints in x to those in y and place the one with the lowest value in the result. This demonstrates the use of _mm_blendv_si128 defined above to sculpt the output based on a condition mask. This instruction was introduced as a hardware primitive in SSE 4.1.

R0 = min(x0, y0)
R1 = min(x1, y1)
…
R7 = min(x7, y7)

_mm_max_epu16

#ifndef __SSE4_1__
static inline __m128i
_mm_max_epu16 (__m128i x, __m128i y)
{
	// Returns x where x >= y, else y:
	return _mm_blendv_si128(x, y, _mm_cmple_epu16(x, y));
}
#endif

Compare each of the eight 16-bit unsigned ints in x to those in y and place the one with the highest value in the result. Equivalent to _mm_min_epu16 above with the inverse result. This instruction was introduced as a hardware primitive in SSE 4.1.

R0 = max(x0, y0)
R1 = max(x1, y1)
…
R7 = max(x7, y7)
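
Together, these two make a handy unsigned clamp. A sketch (hypothetical helper):

static inline __m128i
clamp_epu16 (__m128i x, __m128i lo, __m128i hi)
{
	// Clamp each 16-bit unsigned int in x to the range [lo, hi]:
	return _mm_min_epu16(_mm_max_epu16(x, lo), hi);
}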

_mm_absdiff_epu8

static inline __m128i
_mm_absdiff_epu8 (__m128i x, __m128i y)
{
	// Calculate absolute difference: abs(x - y):
	return _mm_or_si128(_mm_subs_epu8(x, y), _mm_subs_epu8(y, x));
}

Calculate the absolute difference between the 8-bit unsigned ints in x and y. Use unsigned saturating subtraction for a fast calculation: saturating subtraction clamps negative results to zero, so the absolute difference is subs(x, y) + subs(y, x). For example, the absolute difference between 16 and 7 is subs(16, 7) + subs(7, 16), or 9 + 0. Since at least one of the two subtractions is always zero, we can combine them with an or.

R0  = abs(x0 − y0)
R1  = abs(x1 − y1)
…
R15 = abs(x15 − y15)
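
This composes nicely with the comparison functions above; for example, a rough sketch of a per-byte "changed pixel" mask against a threshold (hypothetical helper):

static inline __m128i
changed_mask_epu8 (__m128i x, __m128i y, __m128i thresh)
{
	// 0xFF for each byte where x and y differ by more than thresh:
	return _mm_cmpgt_epu8(_mm_absdiff_epu8(x, y), thresh);
}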

_mm_absdiff_epu16

static inline __m128i
_mm_absdiff_epu16 (__m128i x, __m128i y)
{
	// Calculate absolute difference: abs(x - y):
	return _mm_or_si128(_mm_subs_epu16(x, y), _mm_subs_epu16(y, x));
}

Same as above, but for 16-bit unsigned ints.

R0 = abs(x0 − y0)
R1 = abs(x1 − y1)
…
R7 = abs(x7 − y7)

_mm_div255_epu16

static inline __m128i
_mm_div255_epu16 (__m128i x)
{
	// Divide 8 16-bit uints by 255:
	// x := ((x + 1) + (x >> 8)) >> 8:
	return _mm_srli_epi16(_mm_adds_epu16(
		_mm_adds_epu16(x, _mm_set1_epi16(1)),
		_mm_srli_epi16(x, 8)), 8);
}

Divides all eight 16-bit unsigned ints in x by the constant value 255, using the formula x := ((x + 1) + (x >> 8)) >> 8.

R0 = x0 ∕ 255
R1 = x1 ∕ 255
…
R7 = x7 ∕ 255
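
A quick scalar sketch of why the formula is good enough here: the check below runs it (ignoring the saturating adds, which never kick in for these inputs) against true division for every product of two bytes, which is exactly the range that _mm_scale_epu8 below will feed it, and should print nothing:

#include <stdio.h>

int
main (void)
{
	for (unsigned a = 0; a < 256; a++)
		for (unsigned b = 0; b < 256; b++) {
			unsigned x = a * b;
			unsigned approx = ((x + 1) + (x >> 8)) >> 8;
			if (approx != x / 255)
				printf("mismatch at %u * %u\n", a, b);
		}
	return 0;
}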

_mm_scale_epu8

// Probably too big to want to inline:
static __m128i
_mm_scale_epu8 (__m128i x, __m128i y)
{
	// Returns an "alpha blend" of x scaled by y/255;
	//   x := x * (y / 255)
	// Reorder: x := (x * y) / 255

	// Unpack x and y into 16-bit uints:
	__m128i xlo = _mm_unpacklo_epi8(x, _mm_setzero_si128());
	__m128i ylo = _mm_unpacklo_epi8(y, _mm_setzero_si128());
	__m128i xhi = _mm_unpackhi_epi8(x, _mm_setzero_si128());
	__m128i yhi = _mm_unpackhi_epi8(y, _mm_setzero_si128());

	// Multiply x with y, keeping the low 16 bits:
	xlo = _mm_mullo_epi16(xlo, ylo);
	xhi = _mm_mullo_epi16(xhi, yhi);

	// Divide by 255:
	xlo = _mm_div255_epu16(xlo);
	xhi = _mm_div255_epu16(xhi);

	// Repack the 16-bit uints to clamped 8-bit values:
	return _mm_packus_epi16(xlo, xhi);
}

"Alpha blend" the 8-bit unsigned ints in x with the 8-bit unsigned "opacity" value in y. Calculates x := x * (y / 255), or scaling x by the ratio in y. Could be useful for image alpha blending.

R0  = (x0 * y0) ∕ 255
R1  = (x1 * y1) ∕ 255
…
R15 = (x15 * y15) ∕ 255
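
As a usage example, a conventional "source over destination" blend can be sketched with two scales and a saturating add (a hypothetical helper; it assumes the alpha value has already been broadcast to every channel of a):

static inline __m128i
blend_over_epu8 (__m128i src, __m128i dst, __m128i a)
{
	// out = src * (a / 255) + dst * ((255 - a) / 255), per byte:
	__m128i inv_a = _mm_sub_epi8(_mm_set1_epi8((char) 0xFF), a);

	return _mm_adds_epu8(
		_mm_scale_epu8(src, a),
		_mm_scale_epu8(dst, inv_a));
}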

_mm_divfast_epu8

// Probably too big to want to inline:
static __m128i
_mm_divfast_epu8 (__m128i x, uint8_t d)
{
	// Find shift factor:
	// This is actually much faster than using __builtin_clz():
	uint8_t n
		= (d >= 128) ? 15
		: (d >= 64) ? 14
		: (d >= 32) ? 13
		: (d >= 16) ? 12
		: (d >= 8) ? 11
		: (d >= 4) ? 10
		: (d >= 2) ? 9
		: 8;

	// Set 8 words of "inverse sensitivity":
	// Multiplying by this amount and right-shifting will give a
	// very good approximation of the result:
	__m128i inv = _mm_set1_epi16((1 << n) / d + 1);

	// Unpack input into two 16-bit uints:
	__m128i lo = _mm_unpacklo_epi8(x, _mm_setzero_si128());
	__m128i hi = _mm_unpackhi_epi8(x, _mm_setzero_si128());

	// Multiply with the "inverse sensitivity" and divide:
	lo = _mm_srli_epi16(_mm_mullo_epi16(lo, inv), n);
	hi = _mm_srli_epi16(_mm_mullo_epi16(hi, inv), n);

	// Repack:
	return _mm_packus_epi16(lo, hi);
}

Divide the 8-bit unsigned ints in x by the (scalar, non-vector) 8-bit unsigned integer d, accepting a slight error for 0.12% of the input space. This works on the principle that a / b is equal to (a * c) / (b * c), where c is some arbitrary constant. After regrouping, that is (a * (c / b)) / c: one constant division to find c / b, one multiplication, and one final division by c. The trick is finding a properly sized constant c such that c / b does not lose too much precision (implying that c should be at least 256 * b), but not so big that the intermediate product a * (c / b) overflows the 16-bit lanes. And c should be a power of two so that the final division by c is just a right shift.

For all 65280 possible inputs (256 numerators with 255 denominators, since zero is not a denominator), this function is wrong in just 78 cases, as compared to regular truncating integer division. In each of those cases, the error is the same, namely off by +1. If that is acceptable, this method is fast. Or we can correct the error at a small performance cost as shown below with _mm_div_epu8.

R0  = x0 ∕ d
R1  = x1 ∕ d
…
R15 = x15 ∕ d
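
To make the constants concrete, here's the same trick in scalar form for a single example divisor, printing only the inputs where the fast result differs from true division (it should list a handful of off-by-one cases):

#include <stdio.h>

int
main (void)
{
	unsigned d = 7;                    // example divisor
	unsigned n = 10;                   // because 7 >= 4, per the ladder above
	unsigned inv = (1u << n) / d + 1;  // 1024 / 7 + 1 = 147

	for (unsigned x = 0; x < 256; x++) {
		unsigned fast = (x * inv) >> n;
		if (fast != x / d)
			printf("%u / %u: got %u, expected %u\n", x, d, fast, x / d);
	}
	return 0;
}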

_mm_div_epu8

// Probably too big to want to inline:
static __m128i
_mm_div_epu8 (__m128i x, uint8_t d)
{
	// This is the same routine as above, but exact; it includes a
	// correction step where we back-substitute the quotient and correct
	// the off-by-one errors of the faster routine.
	uint8_t n
		= (d >= 128) ? 15
		: (d >= 64) ? 14
		: (d >= 32) ? 13
		: (d >= 16) ? 12
		: (d >= 8) ? 11
		: (d >= 4) ? 10
		: (d >= 2) ? 9
		: 8;

	// Set 8 words of "inverse sensitivity":
	// Multiplying by this amount and right-shifting will give a
	// very good approximation of the result:
	__m128i inv = _mm_set1_epi16((1 << n) / d + 1);

	// Unpack input into two 16-bit uints:
	__m128i xlo = _mm_unpacklo_epi8(x, _mm_setzero_si128());
	__m128i xhi = _mm_unpackhi_epi8(x, _mm_setzero_si128());

	// Multiply with the "inverse sensitivity" and divide:
	__m128i alo = _mm_srli_epi16(_mm_mullo_epi16(xlo, inv), n);
	__m128i ahi = _mm_srli_epi16(_mm_mullo_epi16(xhi, inv), n);

	// alo and ahi contain the quotients. The result is sometimes too large
	// by one, so we introduce a correction step. Decrement the quotients:
	__m128i blo = _mm_subs_epu16(alo, _mm_set1_epi16(1));
	__m128i bhi = _mm_subs_epu16(ahi, _mm_set1_epi16(1));

	// Multiply with the divisor to get something back in the neighbourhood
	// of the original numbers:
	__m128i mdd = _mm_set1_epi16(d);
	__m128i plo = _mm_mullo_epi16(blo, mdd);
	__m128i phi = _mm_mullo_epi16(bhi, mdd);

	// If (orig - new) >= d, the existing quotient is a better fit:
	// We can use native epi16 ops because these are all 8-bit numbers:
	__m128i masklo = _mm_cmpeq_epi16(_mm_min_epi16(_mm_subs_epu16(xlo, plo), mdd), mdd);
	__m128i maskhi = _mm_cmpeq_epi16(_mm_min_epi16(_mm_subs_epu16(xhi, phi), mdd), mdd);

	// Decrement the original quotients where the mask is not set:
	alo = _mm_subs_epu16(alo, _mm_andnot_si128(masklo, _mm_set1_epi16(1)));
	ahi = _mm_subs_epu16(ahi, _mm_andnot_si128(maskhi, _mm_set1_epi16(1)));

	// Repack:
	return _mm_packus_epi16(alo, ahi);
}

Divide the 8-bit unsigned ints in x by the (non-vector) 8-bit unsigned integer d, doing back-substitution to check for and correct the 78 error cases of the faster but inaccurate method above. Once we've found the result the fast way, we multiply it by the divisor and check whether the result is close enough to the original numerator. If not, we splice in the result minus one. Surprisingly, this is just 10 to 20 percent slower than the fast version above.

R0  = x0 ∕ d
R1  = x1 ∕ d
…
R15 = x15 ∕ d
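
The correction step is perhaps easier to follow in scalar form. A sketch of the same logic (hypothetical helper; fast is assumed to be either the true quotient or the true quotient plus one):

static unsigned
correct_quotient (unsigned x, unsigned d, unsigned fast)
{
	// Back-substitute the decremented quotient:
	unsigned down = (fast > 0) ? fast - 1 : 0;	// saturating decrement
	unsigned p = down * d;

	// If x - p is still at least d, then 'fast' was the right quotient;
	// otherwise it was one too big and 'down' is the answer:
	return (x - p >= d) ? fast : down;
}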

_mm_bswap_epi16

static inline __m128i
_mm_bswap_epi16 (__m128i x)
{
	// Swap the high and low byte in each 16-bit word:
	return _mm_or_si128(
		_mm_slli_epi16(x, 8),
		_mm_srli_epi16(x, 8));
}

Change endianness (reverse byte order) in each 16-bit word by exchanging the high and the low byte.

R0  = x1
R1  = x0
R2  = x3
R3  = x2
R4  = x5
…
R15 = x14

_mm_bswap_epi32

static inline __m128i
_mm_bswap_epi32 (__m128i x)
{
	// Reverse order of bytes in each 32-bit word.

#ifdef __SSSE3__
	return _mm_shuffle_epi8(x,
		_mm_set_epi8(
			12, 13, 14, 15,
			 8,  9, 10, 11,
			 4,  5,  6,  7,
			 0,  1,  2,  3));
#else
	// First swap bytes in each 16-bit word:
	__m128i a = _mm_or_si128(
		_mm_slli_epi16(x, 8),
		_mm_srli_epi16(x, 8));

	// Then swap all 16-bit words:
	a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 3, 0, 1));
	a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(2, 3, 0, 1));

	return a;
#endif
}

Change endianness (reverse byte order) in each 32-bit word by reversing all four bytes. If SSSE3 is available, a single _mm_shuffle_epi8 with a literal shuffle mask does the job faster.

R0  = x3
R1  = x2
R2  = x1
R3  = x0
R4  = x7
…
R15 = x12
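
As a usage example, this is handy for converting big-endian data in bulk. A quick sketch (hypothetical helper; assumes a 16-byte-aligned buffer whose length is a multiple of four, on a little-endian host):

static void
be32_to_host (uint32_t *buf, size_t len)
{
	for (size_t i = 0; i < len; i += 4) {
		__m128i v = _mm_load_si128((__m128i *) (buf + i));

		// Reverse the bytes of four 32-bit words at once:
		_mm_store_si128((__m128i *) (buf + i), _mm_bswap_epi32(v));
	}
}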

_mm_bswap_epi64

static inline __m128i
_mm_bswap_epi64 (__m128i x)
{
	// Reverse order of bytes in each 64-bit word.

#ifdef __SSSE3__
	return _mm_shuffle_epi8(x,
		_mm_set_epi8(
			 8,  9, 10, 11,
			12, 13, 14, 15,
			 0,  1,  2,  3,
			 4,  5,  6,  7));
#else
	// Swap bytes in each 16-bit word:
	__m128i a = _mm_or_si128(
		_mm_slli_epi16(x, 8),
		_mm_srli_epi16(x, 8));

	// Reverse all 16-bit words in 64-bit halves:
	a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(0, 1, 2, 3));
	a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(0, 1, 2, 3));

	return a;
#endif
}

Change endianness (reverse byte order) in each 64-bit word by reversing all eight bytes.

R0  = x7
R1  = x6
R2  = x5
R3  = x4
R4  = x3
…
R15 = x8

_mm_bswap_si128

static inline __m128i
_mm_bswap_si128 (__m128i x)
{
	// Reverse order of all bytes in the 128-bit word.

#ifdef __SSSE3__
	return _mm_shuffle_epi8(x,
		_mm_set_epi8(
			 0,  1,  2,  3,
			 4,  5,  6,  7,
			 8,  9, 10, 11,
			12, 13, 14, 15));
#else
	// Swap bytes in each 16-bit word:
	__m128i a = _mm_or_si128(
		_mm_slli_epi16(x, 8),
		_mm_srli_epi16(x, 8));

	// Reverse all 16-bit words in 64-bit halves:
	a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(0, 1, 2, 3));
	a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(0, 1, 2, 3));

	// Reverse 64-bit halves:
	return _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2));
#endif
}

Change endianness (reverse byte order) in the 128-bit vector by reversing all sixteen bytes.

R0  = x15
R1  = x14
R2  = x13
R3  = x12
R4  = x11
…
R15 = x0