15 #include "visiontransfer/bitconversions.h" 16 #include "visiontransfer/exceptions.h" 20 # include <immintrin.h> 22 # include <smmintrin.h> 24 # include <emmintrin.h> 32 void BitConversions::decode12BitSplit(
int startRow,
int stopRow,
unsigned const char* src,
33 unsigned char* dst,
int srcStride,
int dstStride,
int rowWidth) {
35 const unsigned char* dispStart = src;
36 const unsigned char* subpixStart = &src[rowWidth];
#   ifdef __AVX2__
    if(rowWidth % 32 == 0) {
        if(srcStride % 32 == 0 && reinterpret_cast<size_t>(src) % 32 == 0) {
            decode12BitSplitAVX2<true>(startRow, stopRow, dispStart, subpixStart,
                rowWidth, reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
        } else {
            decode12BitSplitAVX2<false>(startRow, stopRow, dispStart, subpixStart,
                rowWidth, reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
        }
    } else // Fall back if the row width is not a multiple of 32
#   endif
#   ifdef __SSE2__
    if(rowWidth % 16 == 0) {
        if(srcStride % 16 == 0 && reinterpret_cast<size_t>(src) % 16 == 0) {
            decode12BitSplitSSE2<true>(startRow, stopRow, dispStart, subpixStart,
                rowWidth, reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
        } else {
            decode12BitSplitSSE2<false>(startRow, stopRow, dispStart, subpixStart,
                rowWidth, reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
        }
    } else // Fall back if the row width is not a multiple of 16
#   endif
    {
        decode12BitSplitFallback(startRow, stopRow, dispStart, subpixStart, rowWidth,
            reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
    }
}
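// Note on the "split" 12-bit layout, as implied by the decoders below: each source row first
// stores rowWidth bytes of integer disparities, followed by rowWidth/2 bytes of packed 4-bit
// subpixel offsets (even pixels in the low nibble, odd pixels in the high nibble). Every output
// value is (integer << 4) | offset. Illustrative values: an integer byte of 0x5A with a
// subpixel nibble of 0x7 decodes to 0x05A7.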
#ifdef __SSE2__
template <bool alignedLoad>
void BitConversions::decode12BitSplitSSE2(int startRow, int stopRow, const unsigned char* dispStart,
        const unsigned char* subpixStart, int width, unsigned short* dst, int srcStride, int dstStride) {

    // SSE2 optimized path
    __m128i zero = _mm_set1_epi8(0x00);
    __m128i subpixMask = _mm_set1_epi8(0x0f);
    unsigned char* outPos = &reinterpret_cast<unsigned char*>(dst)[startRow*dstStride];
    int outRowPadding = dstStride - 2*width;
    for(int y = startRow; y < stopRow; y++) {
        const unsigned char* intPos = &dispStart[y*srcStride];
        const unsigned char* intEndPos = &dispStart[y*srcStride + width];
        const unsigned char* subpixPos = &subpixStart[y*srcStride];

        for(; intPos < intEndPos;) {
            // Load 16 bytes of packed subpixel offsets (two 4-bit values per byte)
            __m128i subpixOffsets;
            if(alignedLoad) {
                subpixOffsets = _mm_load_si128(reinterpret_cast<const __m128i*>(subpixPos));
            } else {
                subpixOffsets = _mm_loadu_si128(reinterpret_cast<const __m128i*>(subpixPos));
            }
            subpixPos += 16;
            __m128i offsetsEven = _mm_and_si128(subpixOffsets, subpixMask);
            __m128i offsetsUneven = _mm_and_si128(_mm_srli_epi16(subpixOffsets, 4), subpixMask);
            for(int i = 0; i < 2; i++) {
                // Load 16 integer disparities, zero-extend them to 16 bit and shift left by 4
                __m128i intDisps;
                if(alignedLoad) {
                    intDisps = _mm_load_si128(reinterpret_cast<const __m128i*>(intPos));
                } else {
                    intDisps = _mm_loadu_si128(reinterpret_cast<const __m128i*>(intPos));
                }
                intPos += 16;
                __m128i disps1 = _mm_slli_epi16(_mm_unpacklo_epi8(intDisps, zero), 4);
                __m128i disps2 = _mm_slli_epi16(_mm_unpackhi_epi8(intDisps, zero), 4);

                // Interleave even/odd subpixel offsets and OR them into the low bits
                __m128i offsets;
                if(i == 0) {
                    offsets = _mm_unpacklo_epi8(offsetsEven, offsetsUneven);
                } else {
                    offsets = _mm_unpackhi_epi8(offsetsEven, offsetsUneven);
                }
                disps1 = _mm_or_si128(disps1, _mm_unpacklo_epi8(offsets, zero));
                disps2 = _mm_or_si128(disps2, _mm_unpackhi_epi8(offsets, zero));
                // Store two blocks of 8 output pixels each
                _mm_store_si128(reinterpret_cast<__m128i*>(outPos), disps1);
                outPos += 16;
                _mm_store_si128(reinterpret_cast<__m128i*>(outPos), disps2);
                outPos += 16;

                if(intPos >= intEndPos) {
                    break; // The row ends in the middle of this block
                }
            }
        }

        outPos += outRowPadding;
    }
}
#endif
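// The SSE2 kernel above is a vectorized form of the scalar expression used by
// decode12BitSplitFallback(): (disp << 4) | subpixel for every pixel. Bytes are zero-extended
// to 16 bit by unpacking against a zero register, shifted left by 4, and the 4-bit offsets
// (separated into even/odd streams via the mask and the 4-bit right shift) are OR-ed into the
// low bits. Each outer iteration consumes 16 offset bytes plus 32 integer bytes and emits
// 32 16-bit pixels, unless the row ends after the first half.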
#ifdef __AVX2__
template <bool alignedLoad>
void BitConversions::decode12BitSplitAVX2(int startRow, int stopRow, const unsigned char* dispStart,
        const unsigned char* subpixStart, int width, unsigned short* dst, int srcStride, int dstStride) {
    if(width % 32 != 0) {
        throw ProtocolException("Image width must be a multiple of 32!");
    }
    __m256i zero = _mm256_set1_epi8(0x00);
    __m256i subpixMask = _mm256_set1_epi8(0x0f);
    unsigned char* outPos = &reinterpret_cast<unsigned char*>(dst)[startRow*dstStride];
    int outRowPadding = dstStride - 2*width;
    for(int y = startRow; y < stopRow; y++) {
        const unsigned char* intPos = &dispStart[y*srcStride];
        const unsigned char* intEndPos = &dispStart[y*srcStride + width];
        const unsigned char* subpixPos = &subpixStart[y*srcStride];

        for(; intPos < intEndPos;) {
            // Load 32 bytes of packed subpixel offsets (two 4-bit values per byte)
            __m256i subpixOffsets;
            if(alignedLoad) {
                subpixOffsets = _mm256_load_si256(reinterpret_cast<const __m256i*>(subpixPos));
            } else {
                subpixOffsets = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(subpixPos));
            }
            subpixPos += 32;
            __m256i offsetsEven = _mm256_and_si256(subpixOffsets, subpixMask);
            __m256i offsetsUneven = _mm256_and_si256(_mm256_srli_epi16(subpixOffsets, 4), subpixMask);
            for(int i = 0; i < 2; i++) {
                // Load 32 integer disparities
                __m256i intDisps;
                if(alignedLoad) {
                    intDisps = _mm256_load_si256(reinterpret_cast<const __m256i*>(intPos));
                } else {
                    intDisps = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(intPos));
                }
                intPos += 32;

                // Reorder the 64-bit blocks to 0,2,1,3 so the per-lane unpacks yield pixels in order
                __m256i intDispsMixup = _mm256_permute4x64_epi64(intDisps, 0xd8);
                __m256i disps1 = _mm256_slli_epi16(_mm256_unpacklo_epi8(intDispsMixup, zero), 4);
                __m256i disps2 = _mm256_slli_epi16(_mm256_unpackhi_epi8(intDispsMixup, zero), 4);

                __m256i offsetsEvenMixup = _mm256_permute4x64_epi64(offsetsEven, 0xd8);
                __m256i offsetsUnevenMixup = _mm256_permute4x64_epi64(offsetsUneven, 0xd8);

                // Interleave even/odd subpixel offsets and OR them into the low bits
                __m256i offsets;
                if(i == 0) {
                    offsets = _mm256_unpacklo_epi8(offsetsEvenMixup, offsetsUnevenMixup);
                } else {
                    offsets = _mm256_unpackhi_epi8(offsetsEvenMixup, offsetsUnevenMixup);
                }
                __m256i offsetsMixup = _mm256_permute4x64_epi64(offsets, 0xd8);
                disps1 = _mm256_or_si256(disps1, _mm256_unpacklo_epi8(offsetsMixup, zero));
                disps2 = _mm256_or_si256(disps2, _mm256_unpackhi_epi8(offsetsMixup, zero));
                // Store two blocks of 16 output pixels each
                _mm256_store_si256(reinterpret_cast<__m256i*>(outPos), disps1);
                outPos += 32;
                _mm256_store_si256(reinterpret_cast<__m256i*>(outPos), disps2);
                outPos += 32;

                if(intPos >= intEndPos) {
                    break; // The row ends in the middle of this block
                }
            }
        }

        outPos += outRowPadding;
    }
}
#endif
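// Why the 0xd8 permutes: AVX2 unpack instructions interleave within each 128-bit lane rather
// than across the full 256-bit register. _mm256_permute4x64_epi64(v, 0xd8) reorders the four
// 64-bit blocks to the order 0,2,1,3, so that the per-lane unpacklo/unpackhi results end up in
// sequential pixel order; the interleaved subpixel offsets receive the same fix-up before they
// are OR-ed into the shifted disparities.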
void BitConversions::decode12BitSplitFallback(int startRow, int stopRow, const unsigned char* dispStart,
        const unsigned char* subpixStart, int width, unsigned short* dst, int srcStride, int dstStride) {

    int dstStrideShort = dstStride/2;
    // Plain C++ fallback
    for(int y = startRow; y < stopRow; y++) {
        for(int x = 0; x < width; x++) {
            // Even pixels use the low nibble, odd pixels the high nibble of the shared byte
            unsigned short subpix = 0;
            if(x % 2 == 0) {
                subpix = subpixStart[y*srcStride + x/2] & 0x0F;
            } else {
                subpix = subpixStart[y*srcStride + x/2] >> 4;
            }
            dst[y*dstStrideShort + x] =
                (static_cast<unsigned short>(dispStart[y*srcStride + x]) << 4) | subpix;
        }
    }
}
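// Worked example for the fallback path (illustrative values): for x = 2 the integer byte is
// dispStart[y*srcStride + 2] and the offsets of pixels 2 and 3 share the byte at
// subpixStart[y*srcStride + 1]. With an integer byte of 0x34 and an offset byte of 0xB6, the
// even pixel 2 takes the low nibble 0x6 and decodes to (0x34 << 4) | 0x6 = 0x0346, while the
// odd pixel 3 would take the high nibble 0xB.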
void BitConversions::decode12BitPacked(int startRow, int stopRow, unsigned const char* src,
        unsigned char* dst, int srcStride, int dstStride, int rowWidth) {

    const unsigned char* dispStart = src;
#   ifdef __SSE4_1__
    if(rowWidth % 32 == 0) {
        if(srcStride % 16 == 0 && reinterpret_cast<size_t>(src) % 16 == 0) {
            decode12BitPackedSSE4<true>(startRow, stopRow, dispStart,
                rowWidth, reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
        } else {
            decode12BitPackedSSE4<false>(startRow, stopRow, dispStart,
                rowWidth, reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
        }
    } else // Fall back if the row width is not a multiple of 32
#   endif
#   if defined(__ARM_NEON) && defined(__ARM_ARCH_ISA_A64)
    if(rowWidth % 32 == 0) {
        if(srcStride % 16 == 0 && reinterpret_cast<size_t>(src) % 16 == 0) {
            decode12BitPackedNEON<true>(startRow, stopRow, dispStart,
                rowWidth, reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
        } else {
            decode12BitPackedNEON<false>(startRow, stopRow, dispStart,
                rowWidth, reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
        }
    } else // Fall back if the row width is not a multiple of 32
#   endif
    {
        decode12BitPackedFallback(startRow, stopRow, dispStart, rowWidth,
            reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
    }
}
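// Note on the "packed" 12-bit layout, as implemented by the decoders below: every 3 consecutive
// bytes b0, b1, b2 hold two pixels, p0 = b0 | ((b1 & 0x0F) << 8) and p1 = (b1 >> 4) | (b2 << 4),
// so a row of rowWidth pixels occupies rowWidth * 3/2 bytes. Illustrative values: the byte
// sequence 0xAB, 0xCD, 0xEF decodes to p0 = 0x0DAB and p1 = 0x0EFC.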
#ifdef __SSE4_1__
template <bool alignedLoad>
void BitConversions::decode12BitPackedSSE4(int startRow, int stopRow, const unsigned char* dispStart,
        int width, unsigned short* dst, int srcStride, int dstStride) {
    if(width % 32 != 0) {
        throw ProtocolException("Image width must be a multiple of 32!");
    }
    unsigned char* outPos = &reinterpret_cast<unsigned char*>(dst)[startRow*dstStride];
    int outRowPadding = dstStride - 2*width;
    // Shuffle masks that copy each byte pair (with the shared middle byte duplicated)
    // into one 16-bit output word
    const __m128i shuffleMask1a = _mm_set_epi8(11, 10, 10, 9, 8, 7, 7, 6, 5, 4, 4, 3, 2, 1, 1, 0);
    const __m128i shuffleMask1b = _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 15, 14, 13, 13, 12);

    const __m128i shuffleMask2a = _mm_set_epi8(7, 6, 6, 5, 4, 3, 3, 2, 1, 0, 0, 0xff, 0xff, 0xff, 0xff, 0xff);
    const __m128i shuffleMask2b = _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 15, 15, 14, 13, 12, 12, 11, 10, 9, 9, 8);

    const __m128i shuffleMask3a = _mm_set_epi8(3, 2, 2, 1, 0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff);
    const __m128i shuffleMask3b = _mm_set_epi8(15, 14, 14, 13, 12, 11, 11, 10, 9, 8, 8, 7, 6, 5, 5, 4);

    // Multiplying by 16 shifts even pixels left by 4; odd pixels stay unchanged
    const __m128i shiftMultiplyMask = _mm_set_epi16(1, 16, 1, 16, 1, 16, 1, 16);

    // Blend masks for stitching together the partial shuffles of adjacent source registers
    const __m128i blendMask1 = _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0);
    const __m128i blendMask2 = _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    int dispRowWidth = width * 3/2;

    for(int y = startRow; y < stopRow; y++) {
        const unsigned char* rowPos = &dispStart[y*srcStride];
        const unsigned char* rowEnd = &dispStart[y*srcStride + dispRowWidth];

        while(rowPos < rowEnd) {
            // Load 48 bytes = 32 packed pixels
            __m128i rowPixels1, rowPixels2, rowPixels3;
            if(alignedLoad) {
                rowPixels1 = _mm_load_si128(reinterpret_cast<const __m128i*>(rowPos));
                rowPixels2 = _mm_load_si128(reinterpret_cast<const __m128i*>(rowPos + 16));
                rowPixels3 = _mm_load_si128(reinterpret_cast<const __m128i*>(rowPos + 32));
            } else {
                rowPixels1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(rowPos));
                rowPixels2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(rowPos + 16));
                rowPixels3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(rowPos + 32));
            }
            rowPos += 48;
            // Rearrange into four registers holding one (still unaligned) pixel per 16-bit word
            __m128i part1 = _mm_shuffle_epi8(rowPixels1, shuffleMask1a);
            __m128i part2a = _mm_shuffle_epi8(rowPixels1, shuffleMask1b);
            __m128i part2b = _mm_shuffle_epi8(rowPixels2, shuffleMask2a);
            __m128i part3a = _mm_shuffle_epi8(rowPixels2, shuffleMask2b);
            __m128i part3b = _mm_shuffle_epi8(rowPixels3, shuffleMask3a);
            __m128i part4 = _mm_shuffle_epi8(rowPixels3, shuffleMask3b);

            __m128i part2 = _mm_blendv_epi8(part2a, part2b, blendMask1);
            __m128i part3 = _mm_blendv_epi8(part3a, part3b, blendMask2);
            // Multiply even words by 16 (i.e. shift left by 4) and odd words by 1 ...
            __m128i shift1a = _mm_mullo_epi16(part1, shiftMultiplyMask);
            __m128i shift2a = _mm_mullo_epi16(part2, shiftMultiplyMask);
            __m128i shift3a = _mm_mullo_epi16(part3, shiftMultiplyMask);
            __m128i shift4a = _mm_mullo_epi16(part4, shiftMultiplyMask);

            // ... then shift everything right by 4 to align the 12-bit values
            __m128i shift1b = _mm_srli_epi16(shift1a, 4);
            __m128i shift2b = _mm_srli_epi16(shift2a, 4);
            __m128i shift3b = _mm_srli_epi16(shift3a, 4);
            __m128i shift4b = _mm_srli_epi16(shift4a, 4);

            // Store 4 x 8 output pixels
            _mm_storeu_si128(reinterpret_cast<__m128i*>(outPos), shift1b);
            outPos += 16;
            _mm_storeu_si128(reinterpret_cast<__m128i*>(outPos), shift2b);
            outPos += 16;
            _mm_storeu_si128(reinterpret_cast<__m128i*>(outPos), shift3b);
            outPos += 16;
            _mm_storeu_si128(reinterpret_cast<__m128i*>(outPos), shift4b);
            outPos += 16;
        }

        outPos += outRowPadding;
    }
}
#endif
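// The multiply/shift pair above emulates a per-element variable shift, which SSE does not offer
// for 16-bit lanes: even words (holding b1:b0) are multiplied by 16, i.e. shifted left by 4, so
// the following logical right shift by 4 yields ((b1 & 0x0F) << 8) | b0; odd words (holding
// b2:b1) are multiplied by 1, and the right shift alone yields (b2 << 4) | (b1 >> 4). This
// matches the scalar logic of decode12BitPackedFallback() below.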
#if defined(__ARM_NEON) && defined(__ARM_ARCH_ISA_A64)
// Converts a linear byte index (x + y*16) into the corresponding vqtbl3q_u8 table index after
// vld3q_u8 has de-interleaved the 48 source bytes into three registers
#define TX(y,x) ((x + y*16)/3 + ((x + y*16)%3)*16)

template <bool alignedLoad>
void BitConversions::decode12BitPackedNEON(int startRow, int stopRow, const unsigned char* dispStart,
        int width, unsigned short* dst, int srcStride, int dstStride) {
    if(width % 32 != 0) {
        throw ProtocolException("Image width must be a multiple of 32!");
    }
    unsigned char* outPos = &reinterpret_cast<unsigned char*>(dst)[startRow*dstStride];
    int outRowPadding = dstStride - 2*width;
    // Table lookup masks that place each byte pair (with the shared middle byte duplicated)
    // into one 16-bit output word
    const uint8x16_t shuffleMask1 = {TX(0,0), TX(0,1), TX(0,1), TX(0,2), TX(0,3), TX(0,4),
        TX(0,4), TX(0,5), TX(0,6), TX(0,7), TX(0,7), TX(0,8), TX(0,9), TX(0,10), TX(0,10), TX(0,11)};
    const uint8x16_t shuffleMask2 = {TX(0,12), TX(0,13), TX(0,13), TX(0,14), TX(0,15), TX(1,0),
        TX(1,0), TX(1,1), TX(1,2), TX(1,3), TX(1,3), TX(1,4), TX(1,5), TX(1,6), TX(1,6), TX(1,7)};
    const uint8x16_t shuffleMask3 = {TX(1,8), TX(1,9), TX(1,9), TX(1,10), TX(1,11), TX(1,12),
        TX(1,12), TX(1,13), TX(1,14), TX(1,15), TX(1,15), TX(2,0), TX(2,1), TX(2,2), TX(2,2), TX(2,3)};
    const uint8x16_t shuffleMask4 = {TX(2,4), TX(2,5), TX(2,5), TX(2,6), TX(2,7), TX(2,8),
        TX(2,8), TX(2,9), TX(2,10), TX(2,11), TX(2,11), TX(2,12), TX(2,13), TX(2,14), TX(2,14), TX(2,15)};

    // Left-shift amounts: 4 for even pixels, 0 for odd pixels
    const int16x8_t shiftMask = {4, 0, 4, 0, 4, 0, 4, 0};
    int dispRowWidth = width * 3/2;

    for(int y = startRow; y < stopRow; y++) {
        const unsigned char* rowPos = &dispStart[y*srcStride];
        const unsigned char* rowEnd = &dispStart[y*srcStride + dispRowWidth];

        while(rowPos < rowEnd) {
            // Load 48 bytes (32 packed pixels), de-interleaved into three registers
            uint8x16x3_t rowPixels;
            if(alignedLoad) {
                rowPixels = vld3q_u8(reinterpret_cast<const uint8_t*>(
                    __builtin_assume_aligned(rowPos, 16)));
            } else {
                rowPixels = vld3q_u8(reinterpret_cast<const uint8_t*>(rowPos));
            }
            rowPos += 48;

            // Gather one (still unaligned) pixel per 16-bit word
            uint8x16_t part1 = vqtbl3q_u8(rowPixels, shuffleMask1);
            uint8x16_t part2 = vqtbl3q_u8(rowPixels, shuffleMask2);
            uint8x16_t part3 = vqtbl3q_u8(rowPixels, shuffleMask3);
            uint8x16_t part4 = vqtbl3q_u8(rowPixels, shuffleMask4);
            // Shift even words left by 4 (odd words by 0) ...
            uint16x8_t shift1a = vshlq_u16(vreinterpretq_u16_u8(part1), shiftMask);
            uint16x8_t shift2a = vshlq_u16(vreinterpretq_u16_u8(part2), shiftMask);
            uint16x8_t shift3a = vshlq_u16(vreinterpretq_u16_u8(part3), shiftMask);
            uint16x8_t shift4a = vshlq_u16(vreinterpretq_u16_u8(part4), shiftMask);

            // ... then shift everything right by 4 to align the 12-bit values
            uint16x8_t shift1b = vshrq_n_u16(shift1a, 4);
            uint16x8_t shift2b = vshrq_n_u16(shift2a, 4);
            uint16x8_t shift3b = vshrq_n_u16(shift3a, 4);
            uint16x8_t shift4b = vshrq_n_u16(shift4a, 4);

            // Store 4 x 8 output pixels
            vst1q_u16(reinterpret_cast<uint16_t*>(outPos), shift1b);
            outPos += 16;
            vst1q_u16(reinterpret_cast<uint16_t*>(outPos), shift2b);
            outPos += 16;
            vst1q_u16(reinterpret_cast<uint16_t*>(outPos), shift3b);
            outPos += 16;
            vst1q_u16(reinterpret_cast<uint16_t*>(outPos), shift4b);
            outPos += 16;
        }

        outPos += outRowPadding;
    }
}
#endif
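// The NEON kernel mirrors the SSE4 version: vld3q_u8 de-interleaves the 48 source bytes so that
// byte n of the stream ends up in register n % 3 at position n / 3, which is exactly the index
// transformation the TX() macro encodes for the vqtbl3q_u8 lookups. vshlq_u16 with the
// alternating {4, 0, ...} vector takes the place of the SSE multiply by {16, 1, ...}, and the
// uniform right shift by 4 then aligns every 12-bit value.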
void BitConversions::decode12BitPackedFallback(int startRow, int stopRow, const unsigned char* dispStart,
        int width, unsigned short* dst, int srcStride, int dstStride) {

    int dstStrideShort = dstStride/2;
    // Plain C++ fallback: each 3-byte group holds two 12-bit pixels
    for(int y = startRow; y < stopRow; y++) {
        const unsigned char* srcPtr = &dispStart[y*srcStride];
        unsigned short* dstPtr = &dst[y*dstStrideShort];
        unsigned short* dstEndPtr = dstPtr + width;

        while(dstPtr != dstEndPtr) {
            // First pixel: full first byte plus the low nibble of the shared byte
            *dstPtr = static_cast<unsigned short>(*srcPtr);
            srcPtr++;
            *dstPtr |= static_cast<unsigned short>(*srcPtr & 0x0f) << 8;
            dstPtr++;

            // Second pixel: high nibble of the shared byte plus the full last byte
            *dstPtr = static_cast<unsigned short>(*srcPtr) >> 4;
            srcPtr++;
            *dstPtr |= static_cast<unsigned short>(*srcPtr) << 4;
            srcPtr++;
            dstPtr++;
        }
    }
}
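// Usage sketch (hypothetical sizes; assumes the decoders are static members, as the qualified
// definitions above suggest):
//
//   int width = 640, height = 480;
//   int srcStride = width * 3 / 2;            // packed input: 1.5 bytes per pixel
//   int dstStride = width * 2;                // decoded output: 2 bytes per pixel
//   std::vector<unsigned char> src(srcStride * height);   // received 12-bit packed image data
//   std::vector<unsigned char> dst(dstStride * height);
//   BitConversions::decode12BitPacked(0, height, src.data(), dst.data(),
//       srcStride, dstStride, width);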