12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172 |
- ///////////////////////////////////////////////////////////////////////////
- //
- // Copyright (c) 2009-2014 DreamWorks Animation LLC.
- //
- // All rights reserved.
- //
- // Redistribution and use in source and binary forms, with or without
- // modification, are permitted provided that the following conditions are
- // met:
- // * Redistributions of source code must retain the above copyright
- // notice, this list of conditions and the following disclaimer.
- // * Redistributions in binary form must reproduce the above
- // copyright notice, this list of conditions and the following disclaimer
- // in the documentation and/or other materials provided with the
- // distribution.
- // * Neither the name of DreamWorks Animation nor the names of
- // its contributors may be used to endorse or promote products derived
- // from this software without specific prior written permission.
- //
- // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- //
- ///////////////////////////////////////////////////////////////////////////
- #ifndef IMF_DWACOMPRESSORSIMD_H_HAS_BEEN_INCLUDED
- #define IMF_DWACOMPRESSORSIMD_H_HAS_BEEN_INCLUDED
- //
- // Various SSE accelerated functions, used by Imf::DwaCompressor.
- // These have been separated into a separate .h file, as the fast
- // paths are done with template specialization.
- //
- // Unless otherwise noted, all pointers are assumed to be 32-byte
- // aligned. Unaligned pointers may risk seg-faulting.
- //
- #include "ImfNamespace.h"
- #include "ImfSimd.h"
- #include "ImfSystemSpecific.h"
- #include "OpenEXRConfig.h"
- #include <half.h>
- #include <assert.h>
- #include <algorithm>
- OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_ENTER
- #define _SSE_ALIGNMENT 32 /* byte alignment that SimdAlignedBuffer64::alloc() requests and enforces */
- #define _SSE_ALIGNMENT_MASK 0x0F /* low-bit mask of a 16-byte boundary; NOTE(review): not referenced in the visible part of this file */
- #define _AVX_ALIGNMENT_MASK 0x1F /* low-bit mask of a 32-byte boundary; NOTE(review): not referenced in the visible part of this file */
- //
- // Test if we should enable GCC inline asm paths for AVX.
- // IMF_HAVE_GCC_INLINEASM gates the asm in convertFloatToHalf64_f16c; IMF_HAVE_GCC_INLINEASM_64
- // (defined only when __LP64__ is) gates the asm in fromHalfZigZag_f16c, which uses xmm8-xmm15.
- #ifdef OPENEXR_IMF_HAVE_GCC_INLINE_ASM_AVX
- #define IMF_HAVE_GCC_INLINEASM
- #ifdef __LP64__
- #define IMF_HAVE_GCC_INLINEASM_64
- #endif /* __LP64__ */
- #endif /* OPENEXR_IMF_HAVE_GCC_INLINE_ASM_AVX */
- //
- // A simple 64-element array, aligned properly for SIMD access.
- //
- template <class T>
- class SimdAlignedBuffer64
- {
- public:
- SimdAlignedBuffer64(): _buffer (0), _handle (0)
- {
- alloc();
- }
- SimdAlignedBuffer64(const SimdAlignedBuffer64 &rhs): _handle(0)
- {
- alloc();
- memcpy (_buffer, rhs._buffer, 64 * sizeof (T));
- }
- SimdAlignedBuffer64 &operator=(const SimdAlignedBuffer64 &rhs)
- {
- memcpy (_buffer, rhs._buffer, 64 * sizeof (T));
- return *this;
- }
- #if __cplusplus >= 201103L
- SimdAlignedBuffer64(SimdAlignedBuffer64 &&rhs) noexcept
- : _handle(rhs._handle), _buffer(rhs._buffer)
- {
- rhs._handle = nullptr;
- rhs._buffer = nullptr;
- }
- SimdAlignedBuffer64 &operator=(SimdAlignedBuffer64 &&rhs) noexcept
- {
- std::swap(_handle, rhs._handle);
- std::swap(_buffer, rhs._buffer);
- return *this;
- }
- #endif
- ~SimdAlignedBuffer64 ()
- {
- if (_handle)
- EXRFreeAligned (_handle);
- _handle = 0;
- _buffer = 0;
- }
- void alloc()
- {
- //
- // Try EXRAllocAligned first - but it might fallback to
- // unaligned allocs. If so, overalloc.
- //
- _handle = (char *) EXRAllocAligned
- (64 * sizeof(T), _SSE_ALIGNMENT);
- if (((size_t)_handle & (_SSE_ALIGNMENT - 1)) == 0)
- {
- _buffer = (T *)_handle;
- return;
- }
- EXRFreeAligned(_handle);
- _handle = (char *) EXRAllocAligned
- (64 * sizeof(T) + _SSE_ALIGNMENT, _SSE_ALIGNMENT);
- char *aligned = _handle;
- while ((size_t)aligned & (_SSE_ALIGNMENT - 1))
- aligned++;
- _buffer = (T *)aligned;
- }
- T *_buffer;
- private:
- char *_handle;
- };
- typedef SimdAlignedBuffer64<float> SimdAlignedBuffer64f;
- typedef SimdAlignedBuffer64<unsigned short> SimdAlignedBuffer64us;
- namespace {
//
// Inverse 709 color space conversion of a single sample,
// Y'CbCr -> R'G'B'.
//
// On entry comp0, comp1, comp2 hold Y', Cb and Cr; on return they
// hold R', G' and B'. Only the 3x3 matrix is applied - no scaling
// and no offsets.
//

void
csc709Inverse (float &comp0, float &comp1, float &comp2)
{
    //
    // Snapshot the inputs first: the outputs overwrite them in place.
    //

    const float luma = comp0;
    const float cb   = comp1;
    const float cr   = comp2;

    comp0 = luma + 1.5747f * cr;
    comp1 = luma - 0.1873f * cb - 0.4682f * cr;
    comp2 = luma + 1.8556f * cb;
}
- #ifndef IMF_HAVE_SSE2
- //
- // Scalar color space conversion, based on 709 primiary chromaticies.
- // No scaling or offsets, just the matrix
- //
- void
- csc709Inverse64 (float *comp0, float *comp1, float *comp2)
- {
- for (int i = 0; i < 64; ++i)
- csc709Inverse (comp0[i], comp1[i], comp2[i]);
- }
- #else /* IMF_HAVE_SSE2 */
- //
- // SSE2 color space conversion
- //
- void
- csc709Inverse64 (float *comp0, float *comp1, float *comp2)
- {
- __m128 c0 = { 1.5747f, 1.5747f, 1.5747f, 1.5747f};
- __m128 c1 = { 1.8556f, 1.8556f, 1.8556f, 1.8556f};
- __m128 c2 = {-0.1873f, -0.1873f, -0.1873f, -0.1873f};
- __m128 c3 = {-0.4682f, -0.4682f, -0.4682f, -0.4682f};
- __m128 *r = (__m128 *)comp0;
- __m128 *g = (__m128 *)comp1;
- __m128 *b = (__m128 *)comp2;
- __m128 src[3];
- #define CSC_INVERSE_709_SSE2_LOOP(i) \
- src[0] = r[i]; \
- src[1] = g[i]; \
- src[2] = b[i]; \
- \
- r[i] = _mm_add_ps (r[i], _mm_mul_ps (src[2], c0)); \
- \
- g[i] = _mm_mul_ps (g[i], c2); \
- src[2] = _mm_mul_ps (src[2], c3); \
- g[i] = _mm_add_ps (g[i], src[0]); \
- g[i] = _mm_add_ps (g[i], src[2]); \
- \
- b[i] = _mm_mul_ps (c1, src[1]); \
- b[i] = _mm_add_ps (b[i], src[0]);
- CSC_INVERSE_709_SSE2_LOOP (0)
- CSC_INVERSE_709_SSE2_LOOP (1)
- CSC_INVERSE_709_SSE2_LOOP (2)
- CSC_INVERSE_709_SSE2_LOOP (3)
- CSC_INVERSE_709_SSE2_LOOP (4)
- CSC_INVERSE_709_SSE2_LOOP (5)
- CSC_INVERSE_709_SSE2_LOOP (6)
- CSC_INVERSE_709_SSE2_LOOP (7)
- CSC_INVERSE_709_SSE2_LOOP (8)
- CSC_INVERSE_709_SSE2_LOOP (9)
- CSC_INVERSE_709_SSE2_LOOP (10)
- CSC_INVERSE_709_SSE2_LOOP (11)
- CSC_INVERSE_709_SSE2_LOOP (12)
- CSC_INVERSE_709_SSE2_LOOP (13)
- CSC_INVERSE_709_SSE2_LOOP (14)
- CSC_INVERSE_709_SSE2_LOOP (15)
- }
- #endif /* IMF_HAVE_SSE2 */
//
// Color space conversion, Forward 709 CSC, R'G'B' -> Y'CbCr
//
// Plain floating-point implementation operating in place on 64
// samples. On entry comp0/comp1/comp2 hold R', G', B'; on return
// they hold Y', Cb, Cr. Based on the 709 primary chromaticities,
// with no scaling or offsets.
//

void
csc709Forward64 (float *comp0, float *comp1, float *comp2)
{
    float *c0 = comp0;
    float *c1 = comp1;
    float *c2 = comp2;

    for (float *const end = comp0 + 64; c0 != end; ++c0, ++c1, ++c2)
    {
        //
        // Take copies of the inputs - they are overwritten below.
        //

        const float red   = *c0;
        const float green = *c1;
        const float blue  = *c2;

        *c0 =  0.2126f * red + 0.7152f * green + 0.0722f * blue;
        *c1 = -0.1146f * red - 0.3854f * green + 0.5000f * blue;
        *c2 =  0.5000f * red - 0.4542f * green - 0.0458f * blue;
    }
}
//
// Byte interleaving of 2 byte arrays:
//    src0 = AAAA
//    src1 = BBBB
//    dst  = ABABABAB
//
// numBytes is the size of each of the source buffers, so dst
// receives 2 * numBytes bytes.
//

#ifndef IMF_HAVE_SSE2

//
// Scalar default implementation
//

void
interleaveByte2 (char *dst, char *src0, char *src1, int numBytes)
{
    char *out = dst;

    for (int i = 0; i < numBytes; ++i)
    {
        *out++ = src0[i];
        *out++ = src1[i];
    }
}

#else  /* IMF_HAVE_SSE2 */

//
// SSE2 byte interleaving
//
// Three strategies, chosen from the addresses modulo 16:
//   1) everything 16-byte aligned: aligned loads, streaming stores
//   2) dst aligned, both sources 8 bytes past a 16-byte boundary:
//      interleave 8 bytes by hand, after which (1) applies
//   3) anything else: unaligned loads and stores
//
// Each strategy records how many source bytes it has consumed, and
// a single scalar loop at the end finishes whatever is left.
//

void
interleaveByte2 (char *dst, char *src0, char *src1, int numBytes)
{
    const bool dstAligned = ((size_t)dst % 16) == 0;
    const int  src0Offset = (size_t)src0 % 16;
    const int  src1Offset = (size_t)src1 % 16;

    int done = 0;

    if (dstAligned && src0Offset == 0 && src1Offset == 0)
    {
        //
        // Aligned loads and stores
        //

        const __m128i *in0    = (const __m128i *)src0;
        const __m128i *in1    = (const __m128i *)src1;
        __m128i       *out    = (__m128i *)dst;
        const int      blocks = numBytes / 16;

        for (int blk = 0; blk < blocks; ++blk)
        {
            const __m128i a = in0[blk];
            const __m128i b = in1[blk];

            _mm_stream_si128 (out + 2 * blk,     _mm_unpacklo_epi8 (a, b));
            _mm_stream_si128 (out + 2 * blk + 1, _mm_unpackhi_epi8 (a, b));
        }

        done = 16 * blocks;
    }
    else if (dstAligned && src0Offset == 8 && src1Offset == 8)
    {
        //
        // Aligned stores, but catch up a few values so we can
        // use aligned loads
        //

        const int head = std::min (numBytes, 8);

        for (int i = 0; i < head; ++i)
        {
            dst[2 * i]     = src0[i];
            dst[2 * i + 1] = src1[i];
        }

        done = head;

        if (numBytes > 8)
        {
            const __m128i *in0    = (const __m128i *)(src0 + 8);
            const __m128i *in1    = (const __m128i *)(src1 + 8);
            __m128i       *out    = (__m128i *)(dst + 16);
            const int      blocks = (numBytes - 8) / 16;

            for (int blk = 0; blk < blocks; ++blk)
            {
                const __m128i a = in0[blk];
                const __m128i b = in1[blk];

                _mm_stream_si128 (out + 2 * blk,
                                  _mm_unpacklo_epi8 (a, b));
                _mm_stream_si128 (out + 2 * blk + 1,
                                  _mm_unpackhi_epi8 (a, b));
            }

            done = 8 + 16 * blocks;
        }
    }
    else
    {
        //
        // Unaligned everything
        //

        const __m128i *in0    = (const __m128i *)src0;
        const __m128i *in1    = (const __m128i *)src1;
        __m128i       *out    = (__m128i *)dst;
        const int      blocks = numBytes / 16;

        for (int blk = 0; blk < blocks; ++blk)
        {
            const __m128i a = _mm_loadu_si128 (in0 + blk);
            const __m128i b = _mm_loadu_si128 (in1 + blk);

            _mm_storeu_si128 (out + 2 * blk,     _mm_unpacklo_epi8 (a, b));
            _mm_storeu_si128 (out + 2 * blk + 1, _mm_unpackhi_epi8 (a, b));
        }

        done = 16 * blocks;
    }

    //
    // Then run the leftovers one at a time
    //

    for (int i = done; i < numBytes; ++i)
    {
        dst[2 * i]     = src0[i];
        dst[2 * i + 1] = src1[i];
    }
}

#endif /* IMF_HAVE_SSE2 */
- //
- // Float -> half float conversion
- //
- // To enable F16C based conversion, we can't rely on compile-time
- // detection, hence the multiple defined versions. Pick one based
- // on runtime cpuid detection.
- //
- //
- // Default boring conversion
- //
- void
- convertFloatToHalf64_scalar (unsigned short *dst, float *src)
- {
- for (int i=0; i<64; ++i)
- dst[i] = ((half)src[i]).bits();
- }
- // F16C conversion of the 64 floats at src into 64 half bit patterns at dst.
- // Assumes aligned src and dst: the asm uses vmovaps for its 32-byte loads and vmovdqa for its 16-byte stores.
- // When IMF_HAVE_GCC_INLINEASM is not defined this simply calls convertFloatToHalf64_scalar().
- void
- convertFloatToHalf64_f16c (unsigned short *dst, float *src)
- {
- //
- // Ordinarily, I'd avoid using inline asm and prefer intrinsics.
- // However, in order to get the intrinsics, we need to tell
- // the compiler to generate VEX instructions.
- //
- // (On the GCC side, -mf16c goes ahead and activates -mavx,
- // resulting in VEX code. Without -mf16c, no intrinsics..)
- //
- // Now, it's quite likely that we'll find ourselves in situations
- // where we want to build *without* VEX, in order to maintain
- // maximum compatibility. But to get there with intrinsics,
- // we'd need to break out code into a separate file. Bleh.
- // I'll take the asm.
- // The asm runs two batches: load 4 ymm (32 floats), vcvtps2ph each to an xmm, store 4 xmm (32 halfs).
- #if defined IMF_HAVE_GCC_INLINEASM
- __asm__
- ("vmovaps (%0), %%ymm0 \n"
- "vmovaps 0x20(%0), %%ymm1 \n"
- "vmovaps 0x40(%0), %%ymm2 \n"
- "vmovaps 0x60(%0), %%ymm3 \n"
- "vcvtps2ph $0, %%ymm0, %%xmm0 \n"
- "vcvtps2ph $0, %%ymm1, %%xmm1 \n"
- "vcvtps2ph $0, %%ymm2, %%xmm2 \n"
- "vcvtps2ph $0, %%ymm3, %%xmm3 \n"
- "vmovdqa %%xmm0, 0x00(%1) \n"
- "vmovdqa %%xmm1, 0x10(%1) \n"
- "vmovdqa %%xmm2, 0x20(%1) \n"
- "vmovdqa %%xmm3, 0x30(%1) \n"
- "vmovaps 0x80(%0), %%ymm0 \n"
- "vmovaps 0xa0(%0), %%ymm1 \n"
- "vmovaps 0xc0(%0), %%ymm2 \n"
- "vmovaps 0xe0(%0), %%ymm3 \n"
- "vcvtps2ph $0, %%ymm0, %%xmm0 \n"
- "vcvtps2ph $0, %%ymm1, %%xmm1 \n"
- "vcvtps2ph $0, %%ymm2, %%xmm2 \n"
- "vcvtps2ph $0, %%ymm3, %%xmm3 \n"
- "vmovdqa %%xmm0, 0x40(%1) \n"
- "vmovdqa %%xmm1, 0x50(%1) \n"
- "vmovdqa %%xmm2, 0x60(%1) \n"
- "vmovdqa %%xmm3, 0x70(%1) \n"
- #ifndef __AVX__
- "vzeroupper \n"
- #endif /* __AVX__ */
- : /* Output: none declared; dst is written through the pointer, covered by the "memory" clobber */
- : /* Input: %0 = src, %1 = dst */ "r"(src), "r"(dst)
- #ifndef __AVX__
- : /* Clobber (build without __AVX__: registers are named as xmm) */ "%xmm0", "%xmm1", "%xmm2", "%xmm3", "memory"
- #else
- : /* Clobber (build with __AVX__: registers are named as ymm) */ "%ymm0", "%ymm1", "%ymm2", "%ymm3", "memory"
- #endif /* __AVX__ */
- );
- #else
- convertFloatToHalf64_scalar (dst, src);
- #endif /* IMF_HAVE_GCC_INLINEASM */
- }
- //
- // Convert an 8x8 block of HALF from zig-zag order to
- // FLOAT in normal order. The order we want is:
- //
- // src dst
- // 0 1 2 3 4 5 6 7 0 1 5 6 14 15 27 28
- // 8 9 10 11 12 13 14 15 2 4 7 13 16 26 29 42
- // 16 17 18 19 20 21 22 23 3 8 12 17 25 30 41 43
- // 24 25 26 27 28 29 30 31 9 11 18 24 31 40 44 53
- // 32 33 34 35 36 37 38 39 10 19 23 32 39 45 52 54
- // 40 41 42 43 44 45 46 47 20 22 33 38 46 51 55 60
- // 48 49 50 51 52 53 54 55 21 34 37 47 50 56 59 61
- // 56 57 58 59 60 61 62 63 35 36 48 49 57 58 62 63
- //
- void
- fromHalfZigZag_scalar (unsigned short *src, float *dst)
- {
- half *srcHalf = (half *)src;
- dst[0] = (float)srcHalf[0];
- dst[1] = (float)srcHalf[1];
- dst[2] = (float)srcHalf[5];
- dst[3] = (float)srcHalf[6];
- dst[4] = (float)srcHalf[14];
- dst[5] = (float)srcHalf[15];
- dst[6] = (float)srcHalf[27];
- dst[7] = (float)srcHalf[28];
- dst[8] = (float)srcHalf[2];
- dst[9] = (float)srcHalf[4];
- dst[10] = (float)srcHalf[7];
- dst[11] = (float)srcHalf[13];
- dst[12] = (float)srcHalf[16];
- dst[13] = (float)srcHalf[26];
- dst[14] = (float)srcHalf[29];
- dst[15] = (float)srcHalf[42];
- dst[16] = (float)srcHalf[3];
- dst[17] = (float)srcHalf[8];
- dst[18] = (float)srcHalf[12];
- dst[19] = (float)srcHalf[17];
- dst[20] = (float)srcHalf[25];
- dst[21] = (float)srcHalf[30];
- dst[22] = (float)srcHalf[41];
- dst[23] = (float)srcHalf[43];
- dst[24] = (float)srcHalf[9];
- dst[25] = (float)srcHalf[11];
- dst[26] = (float)srcHalf[18];
- dst[27] = (float)srcHalf[24];
- dst[28] = (float)srcHalf[31];
- dst[29] = (float)srcHalf[40];
- dst[30] = (float)srcHalf[44];
- dst[31] = (float)srcHalf[53];
- dst[32] = (float)srcHalf[10];
- dst[33] = (float)srcHalf[19];
- dst[34] = (float)srcHalf[23];
- dst[35] = (float)srcHalf[32];
- dst[36] = (float)srcHalf[39];
- dst[37] = (float)srcHalf[45];
- dst[38] = (float)srcHalf[52];
- dst[39] = (float)srcHalf[54];
- dst[40] = (float)srcHalf[20];
- dst[41] = (float)srcHalf[22];
- dst[42] = (float)srcHalf[33];
- dst[43] = (float)srcHalf[38];
- dst[44] = (float)srcHalf[46];
- dst[45] = (float)srcHalf[51];
- dst[46] = (float)srcHalf[55];
- dst[47] = (float)srcHalf[60];
- dst[48] = (float)srcHalf[21];
- dst[49] = (float)srcHalf[34];
- dst[50] = (float)srcHalf[37];
- dst[51] = (float)srcHalf[47];
- dst[52] = (float)srcHalf[50];
- dst[53] = (float)srcHalf[56];
- dst[54] = (float)srcHalf[59];
- dst[55] = (float)srcHalf[61];
- dst[56] = (float)srcHalf[35];
- dst[57] = (float)srcHalf[36];
- dst[58] = (float)srcHalf[48];
- dst[59] = (float)srcHalf[49];
- dst[60] = (float)srcHalf[57];
- dst[61] = (float)srcHalf[58];
- dst[62] = (float)srcHalf[62];
- dst[63] = (float)srcHalf[63];
- }
- //
- // If we can form the correct ordering in xmm registers,
- // we can use F16C to convert from HALF -> FLOAT. However,
- // making the correct order isn't trivial.
- //
- // We want to re-order a source 8x8 matrix from:
- //
- // 0 1 2 3 4 5 6 7 0 1 5 6 14 15 27 28
- // 8 9 10 11 12 13 14 15 2 4 7 13 16 26 29 42
- // 16 17 18 19 20 21 22 23 3 8 12 17 25 30 41 43
- // 24 25 26 27 28 29 30 31 9 11 18 24 31 40 44 53 (A)
- // 32 33 34 35 36 37 38 39 --> 10 19 23 32 39 45 52 54
- // 40 41 42 43 44 45 46 47 20 22 33 38 46 51 55 60
- // 48 49 50 51 52 53 54 55 21 34 37 47 50 56 59 61
- // 56 57 58 59 60 61 62 63 35 36 48 49 57 58 62 63
- //
- // Which looks like a mess, right?
- //
- // Now, check out the NE/SW diagonals of (A). Along those lines,
- // we have runs of contiguous values! If we rewrite (A) a bit, we get:
- //
- // 0
- // 1 2
- // 5 4 3
- // 6 7 8 9
- // 14 13 12 11 10
- // 15 16 17 18 19 20
- // 27 26 25 24 23 22 21 (B)
- // 28 29 30 31 32 33 34 35
- // 42 41 40 39 38 37 36
- // 43 44 45 46 47 48
- // 53 52 51 50 49
- // 54 55 56 57
- // 60 59 58
- // 61 62
- // 63
- //
- // In this ordering, the columns are the rows (A). If we can 'transpose'
- // (B), we'll achieve our goal. But we want this to fit nicely into
- // xmm registers and still be able to load large runs efficiently.
- // Also, notice that the odd rows are in ascending order, while
- // the even rows are in descending order.
- //
- // If we 'fold' the bottom half up into the top, we can preserve ordered
- // runs across rows, and still keep all the correct values in columns.
- // After transposing, we'll need to rotate things back into place.
- // This gives us:
- //
- // 0 | 42 41 40 39 38 37 36
- // 1 2 | 43 44 45 46 47 48
- // 5 4 3 | 53 52 51 50 49
- // 6 7 8 9 | 54 55 56 57 (C)
- // 14 13 12 11 10 | 60 59 58
- // 15 16 17 18 19 20 | 61 62
- // 27 26 25 24 23 22 21 | 63
- // 28 29 30 31 32 33 34 35
- //
- // But hang on. We still have the backwards descending rows to deal with.
- // Let's reverse the even rows so that all values are in ascending order:
- //
- // 36 37 38 39 40 41 42 | 0
- // 1 2 | 43 44 45 46 47 48
- // 49 50 51 52 53 | 3 4 5
- // 6 7 8 9 | 54 55 56 57 (D)
- // 58 59 60 | 10 11 12 13 14
- // 15 16 17 18 19 20 | 61 62
- // 63 | 21 22 23 24 25 26 27
- // 28 29 30 31 32 33 34 35
- //
- // If we can form (D), we will then:
- // 1) Reverse the even rows
- // 2) Transpose
- // 3) Rotate the rows
- //
- // and we'll have (A).
- //
- void
- fromHalfZigZag_f16c (unsigned short *src, float *dst)
- {
- #if defined IMF_HAVE_GCC_INLINEASM_64
- __asm__
- /* Loads. Displacements on %0 are byte offsets into src (2 bytes per half), so N(%0) starts at element N/2.
- * x3 <- 0
- * x8 <- [ 0- 7]
- * x6 <- [56-63]
- * x9 <- [21-28]
- * x7 <- [28-35]
- * x3 <- [ 6- 9] (lower half) */
- 
- ("vpxor %%xmm3, %%xmm3, %%xmm3 \n"
- "vmovdqa (%0), %%xmm8 \n"
- "vmovdqa 112(%0), %%xmm6 \n"
- "vmovdqu 42(%0), %%xmm9 \n"
- "vmovdqu 56(%0), %%xmm7 \n"
- "vmovq 12(%0), %%xmm3 \n"
- /* Setup rows 0-2 of A in xmm0-xmm2
- * x1 <- x8 >> 16 (1 value)
- * x2 <- x8 << 32 (2 values)
- * x0 <- alignr([35-42], x8, 2)
- * x1 <- blend(x1, [41-48])
- * x2 <- blend(x2, [49-56]) */
- "vpsrldq $2, %%xmm8, %%xmm1 \n"
- "vpslldq $4, %%xmm8, %%xmm2 \n"
- "vpalignr $2, 70(%0), %%xmm8, %%xmm0 \n"
- "vpblendw $0xfc, 82(%0), %%xmm1, %%xmm1 \n"
- "vpblendw $0x1f, 98(%0), %%xmm2, %%xmm2 \n"
- 
- /* Setup rows 4-6 of A in xmm4-xmm6
- * x4 <- x6 >> 32 (2 values)
- * x5 <- x6 << 16 (1 value)
- * x6 <- alignr(x6,x9,14)
- * x4 <- blend(x4, [ 7-14])
- * x5 <- blend(x5, [15-22]) */
- "vpsrldq $4, %%xmm6, %%xmm4 \n"
- "vpslldq $2, %%xmm6, %%xmm5 \n"
- "vpalignr $14, %%xmm6, %%xmm9, %%xmm6 \n"
- "vpblendw $0xf8, 14(%0), %%xmm4, %%xmm4 \n"
- "vpblendw $0x3f, 30(%0), %%xmm5, %%xmm5 \n"
- /* Load the upper half of row 3 into xmm3
- * x3 <- [54-57] (upper half) */
- "vpinsrq $1, 108(%0), %%xmm3, %%xmm3\n"
- /* Reverse the even rows (xmm0, xmm2, xmm4, xmm6). We're not using
- * PSHUFB as that requires loading an extra constant all the time,
- * and we're already pretty memory bound.
- */
- "vpshuflw $0x1b, %%xmm0, %%xmm0 \n"
- "vpshuflw $0x1b, %%xmm2, %%xmm2 \n"
- "vpshuflw $0x1b, %%xmm4, %%xmm4 \n"
- "vpshuflw $0x1b, %%xmm6, %%xmm6 \n"
- "vpshufhw $0x1b, %%xmm0, %%xmm0 \n"
- "vpshufhw $0x1b, %%xmm2, %%xmm2 \n"
- "vpshufhw $0x1b, %%xmm4, %%xmm4 \n"
- "vpshufhw $0x1b, %%xmm6, %%xmm6 \n"
- "vpshufd $0x4e, %%xmm0, %%xmm0 \n"
- "vpshufd $0x4e, %%xmm2, %%xmm2 \n"
- "vpshufd $0x4e, %%xmm4, %%xmm4 \n"
- "vpshufd $0x4e, %%xmm6, %%xmm6 \n"
- /* Transpose xmm0-xmm7 into xmm8-xmm15, in three unpack passes: words, then dwords, then qwords */
- "vpunpcklwd %%xmm1, %%xmm0, %%xmm8 \n"
- "vpunpcklwd %%xmm3, %%xmm2, %%xmm9 \n"
- "vpunpcklwd %%xmm5, %%xmm4, %%xmm10 \n"
- "vpunpcklwd %%xmm7, %%xmm6, %%xmm11 \n"
- "vpunpckhwd %%xmm1, %%xmm0, %%xmm12 \n"
- "vpunpckhwd %%xmm3, %%xmm2, %%xmm13 \n"
- "vpunpckhwd %%xmm5, %%xmm4, %%xmm14 \n"
- "vpunpckhwd %%xmm7, %%xmm6, %%xmm15 \n"
- 
- "vpunpckldq %%xmm9, %%xmm8, %%xmm0 \n"
- "vpunpckldq %%xmm11, %%xmm10, %%xmm1 \n"
- "vpunpckhdq %%xmm9, %%xmm8, %%xmm2 \n"
- "vpunpckhdq %%xmm11, %%xmm10, %%xmm3 \n"
- "vpunpckldq %%xmm13, %%xmm12, %%xmm4 \n"
- "vpunpckldq %%xmm15, %%xmm14, %%xmm5 \n"
- "vpunpckhdq %%xmm13, %%xmm12, %%xmm6 \n"
- "vpunpckhdq %%xmm15, %%xmm14, %%xmm7 \n"
- 
- "vpunpcklqdq %%xmm1, %%xmm0, %%xmm8 \n"
- "vpunpckhqdq %%xmm1, %%xmm0, %%xmm9 \n"
- "vpunpcklqdq %%xmm3, %%xmm2, %%xmm10 \n"
- "vpunpckhqdq %%xmm3, %%xmm2, %%xmm11 \n"
- "vpunpcklqdq %%xmm4, %%xmm5, %%xmm12 \n"
- "vpunpckhqdq %%xmm5, %%xmm4, %%xmm13 \n"
- "vpunpcklqdq %%xmm7, %%xmm6, %%xmm14 \n"
- "vpunpckhqdq %%xmm7, %%xmm6, %%xmm15 \n"
- /* Rotate the rows to get the correct final order.
- * Rotating xmm12 isn't needed, as we can handle
- * the rotation in the PUNPCKLQDQ above (note its swapped
- * source operands). Rotating xmm8 isn't needed as it's
- * already in the right order.
- */
- "vpalignr $2, %%xmm9, %%xmm9, %%xmm9 \n"
- "vpalignr $4, %%xmm10, %%xmm10, %%xmm10 \n"
- "vpalignr $6, %%xmm11, %%xmm11, %%xmm11 \n"
- "vpalignr $10, %%xmm13, %%xmm13, %%xmm13 \n"
- "vpalignr $12, %%xmm14, %%xmm14, %%xmm14 \n"
- "vpalignr $14, %%xmm15, %%xmm15, %%xmm15 \n"
- /* Convert from half -> float: each xmm (8 halfs) widens into the matching ymm (8 floats) */
- "vcvtph2ps %%xmm8, %%ymm8 \n"
- "vcvtph2ps %%xmm9, %%ymm9 \n"
- "vcvtph2ps %%xmm10, %%ymm10 \n"
- "vcvtph2ps %%xmm11, %%ymm11 \n"
- "vcvtph2ps %%xmm12, %%ymm12 \n"
- "vcvtph2ps %%xmm13, %%ymm13 \n"
- "vcvtph2ps %%xmm14, %%ymm14 \n"
- "vcvtph2ps %%xmm15, %%ymm15 \n"
- 
- /* Move float values to dst: eight 32-byte vmovaps stores, one row of 8 floats each */
- "vmovaps %%ymm8, (%1) \n"
- "vmovaps %%ymm9, 32(%1) \n"
- "vmovaps %%ymm10, 64(%1) \n"
- "vmovaps %%ymm11, 96(%1) \n"
- "vmovaps %%ymm12, 128(%1) \n"
- "vmovaps %%ymm13, 160(%1) \n"
- "vmovaps %%ymm14, 192(%1) \n"
- "vmovaps %%ymm15, 224(%1) \n"
- #ifndef __AVX__
- "vzeroupper \n"
- #endif /* __AVX__ */
- : /* Output: none declared; dst is written through the pointer, covered by the "memory" clobber */
- : /* Input: %0 = src, %1 = dst */ "r"(src), "r"(dst)
- : /* Clobber: all sixteen vector registers, named xmm or ymm depending on __AVX__ */ "memory",
- #ifndef __AVX__
- "%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15"
- #else
- "%ymm0", "%ymm1", "%ymm2", "%ymm3",
- "%ymm4", "%ymm5", "%ymm6", "%ymm7",
- "%ymm8", "%ymm9", "%ymm10", "%ymm11",
- "%ymm12", "%ymm13", "%ymm14", "%ymm15"
- #endif /* __AVX__ */
- );
- #else
- fromHalfZigZag_scalar(src, dst);
- #endif /* defined IMF_HAVE_GCC_INLINEASM_64 */
- }
//
// Inverse 8x8 DCT, only inverting the DC. This assumes that
// all AC frequencies are 0.
//
// With only the DC term present every output sample has the same
// value, data[0] * 3.535536e-01 * 3.535536e-01, so the whole 64-element
// block is simply filled with it, in place.
//

#ifndef IMF_HAVE_SSE2

void
dctInverse8x8DcOnly (float *data)
{
    const float dc = data[0] * 3.535536e-01f * 3.535536e-01f;

    std::fill_n (data, 64, dc);
}

#else  /* IMF_HAVE_SSE2 */

void
dctInverse8x8DcOnly (float *data)
{
    const __m128 dc = _mm_set1_ps (data[0] * 3.535536e-01f * 3.535536e-01f);

    //
    // 16 aligned stores of 4 floats each cover the 64-element block.
    //

    for (int i = 0; i < 16; ++i)
        _mm_store_ps (data + 4 * i, dc);
}

#endif /* IMF_HAVE_SSE2 */
- //
- // Full 8x8 Inverse DCT:
- //
- // Simple inverse DCT on an 8x8 block, with scalar ops only.
- // Operates on data in-place.
- //
- // This is based on the iDCT formulation (y = frequency domain,
- // x = spatial domain)
- //
- // [x0] [ ][y0] [ ][y1]
- // [x1] = [ M1 ][y2] + [ M2 ][y3]
- // [x2] [ ][y4] [ ][y5]
- // [x3] [ ][y6] [ ][y7]
- //
- // [x7] [ ][y0] [ ][y1]
- // [x6] = [ M1 ][y2] - [ M2 ][y3]
- // [x5] [ ][y4] [ ][y5]
- // [x4] [ ][y6] [ ][y7]
- //
- // where M1: M2:
- //
- // [a c a f] [b d e g]
- // [a f -a -c] [d -g -b -e]
- // [a -f -a c] [e -b g d]
- // [a -c a -f] [g -e d -b]
- //
- // and the constants are as defined below..
- //
- // If you know how many of the lower rows are zero, that can
- // be passed in to help speed things up. If you don't know,
- // just set zeroedRows=0.
- //
- //
- // Default implementation
- //
//
// Scalar version of the in-place 8x8 inverse DCT.
//
// data points at 64 floats in row-major order. The same 8-point 1D
// transform is applied twice: first along each of the top
// (8 - zeroedRows) rows, then along all eight columns. Rows skipped by
// the first pass are not touched before the column pass reads them.
//
template <int zeroedRows>
void
dctInverse8x8_scalar (float *data)
{
    // Matrix entries: a, c, f belong to M1 and b, d, e, g to M2.
    const float a = .5f * cosf (3.14159f / 4.0f);
    const float b = .5f * cosf (3.14159f / 16.0f);
    const float c = .5f * cosf (3.14159f / 8.0f);
    const float d = .5f * cosf (3.f*3.14159f / 16.0f);
    const float e = .5f * cosf (5.f*3.14159f / 16.0f);
    const float f = .5f * cosf (3.f*3.14159f / 8.0f);
    const float g = .5f * cosf (7.f*3.14159f / 16.0f);

    //
    // pass 0 walks the rows (elements 1 apart, vectors 8 apart),
    // pass 1 walks the columns (elements 8 apart, vectors 1 apart).
    //
    for (int pass = 0; pass < 2; ++pass)
    {
        const bool rowPass  = (pass == 0);
        const int  elemStep = rowPass ? 1 : 8;
        const int  vecStep  = rowPass ? 8 : 1;
        const int  vecCount = rowPass ? 8 - zeroedRows : 8;

        for (int n = 0; n < vecCount; ++n)
        {
            float *v = data + n * vecStep;

            // Pull all eight inputs out before anything is overwritten.
            const float y0 = v[0 * elemStep];
            const float y1 = v[1 * elemStep];
            const float y2 = v[2 * elemStep];
            const float y3 = v[3 * elemStep];
            const float y4 = v[4 * elemStep];
            const float y5 = v[5 * elemStep];
            const float y6 = v[6 * elemStep];
            const float y7 = v[7 * elemStep];

            // Products shared by the even (M1) half.
            const float cy2 = c * y2;
            const float fy2 = f * y2;
            const float cy6 = c * y6;
            const float fy6 = f * y6;

            // Odd (M2) half: one dot product per output pair.
            const float odd0 = b * y1 + d * y3 + e * y5 + g * y7;
            const float odd1 = d * y1 - g * y3 - b * y5 - e * y7;
            const float odd2 = e * y1 - b * y3 + g * y5 + d * y7;
            const float odd3 = g * y1 - e * y3 + d * y5 - b * y7;

            // Even (M1) half, built from the shared sub-expressions.
            const float dcSum  = a * (y0 + y4);
            const float dcDiff = a * (y0 - y4);
            const float rotSum = cy2 + fy6;
            const float rotDif = fy2 - cy6;

            const float even0 = dcSum  + rotSum;
            const float even1 = dcDiff + rotDif;
            const float even2 = dcDiff - rotDif;
            const float even3 = dcSum  - rotSum;

            // x[k] = even + odd for the first half, x[7 - k] = even - odd.
            v[0 * elemStep] = even0 + odd0;
            v[1 * elemStep] = even1 + odd1;
            v[2 * elemStep] = even2 + odd2;
            v[3 * elemStep] = even3 + odd3;
            v[4 * elemStep] = even3 - odd3;
            v[5 * elemStep] = even2 - odd2;
            v[6 * elemStep] = even1 - odd1;
            v[7 * elemStep] = even0 - odd0;
        }
    }
}
- //
- // SSE2 Implementation
- //
- template <int zeroedRows>
- void
- dctInverse8x8_sse2 (float *data)
- {
- #ifdef IMF_HAVE_SSE2
- __m128 a = {3.535536e-01f,3.535536e-01f,3.535536e-01f,3.535536e-01f};
- __m128 b = {4.903927e-01f,4.903927e-01f,4.903927e-01f,4.903927e-01f};
- __m128 c = {4.619398e-01f,4.619398e-01f,4.619398e-01f,4.619398e-01f};
- __m128 d = {4.157349e-01f,4.157349e-01f,4.157349e-01f,4.157349e-01f};
- __m128 e = {2.777855e-01f,2.777855e-01f,2.777855e-01f,2.777855e-01f};
- __m128 f = {1.913422e-01f,1.913422e-01f,1.913422e-01f,1.913422e-01f};
- __m128 g = {9.754573e-02f,9.754573e-02f,9.754573e-02f,9.754573e-02f};
- __m128 c0 = {3.535536e-01f, 3.535536e-01f, 3.535536e-01f, 3.535536e-01f};
- __m128 c1 = {4.619398e-01f, 1.913422e-01f,-1.913422e-01f,-4.619398e-01f};
- __m128 c2 = {3.535536e-01f,-3.535536e-01f,-3.535536e-01f, 3.535536e-01f};
- __m128 c3 = {1.913422e-01f,-4.619398e-01f, 4.619398e-01f,-1.913422e-01f};
- __m128 c4 = {4.903927e-01f, 4.157349e-01f, 2.777855e-01f, 9.754573e-02f};
- __m128 c5 = {4.157349e-01f,-9.754573e-02f,-4.903927e-01f,-2.777855e-01f};
- __m128 c6 = {2.777855e-01f,-4.903927e-01f, 9.754573e-02f, 4.157349e-01f};
- __m128 c7 = {9.754573e-02f,-2.777855e-01f, 4.157349e-01f,-4.903927e-01f};
- __m128 *srcVec = (__m128 *)data;
- __m128 x[8], evenSum, oddSum;
- __m128 in[8], alpha[4], beta[4], theta[4], gamma[4];
-
- //
- // Rows -
- //
- // Treat this just like matrix-vector multiplication. The
- // trick is to note that:
- //
- // [M00 M01 M02 M03][v0] [(v0 M00) + (v1 M01) + (v2 M02) + (v3 M03)]
- // [M10 M11 M12 M13][v1] = [(v0 M10) + (v1 M11) + (v2 M12) + (v3 M13)]
- // [M20 M21 M22 M23][v2] [(v0 M20) + (v1 M21) + (v2 M22) + (v3 M23)]
- // [M30 M31 M32 M33][v3] [(v0 M30) + (v1 M31) + (v2 M32) + (v3 M33)]
- //
- // Then, we can fill a register with v_i and multiply by the i-th column
- // of M, accumulating across all i-s.
- //
- // The kids refer to the populating of a register with a single value
- // "broadcasting", and it can be done with a shuffle instruction. It
- // seems to be the slowest part of the whole ordeal.
- //
- // Our matrix columns are stored above in c0-c7. c0-3 make up M1, and
- // c4-7 are from M2.
- //
- #define DCT_INVERSE_8x8_SS2_ROW_LOOP(i) \
- /* \
- * Broadcast the components of the row \
- */ \
- \
- x[0] = _mm_shuffle_ps (srcVec[2 * i], \
- srcVec[2 * i], \
- _MM_SHUFFLE (0, 0, 0, 0)); \
- \
- x[1] = _mm_shuffle_ps (srcVec[2 * i], \
- srcVec[2 * i], \
- _MM_SHUFFLE (1, 1, 1, 1)); \
- \
- x[2] = _mm_shuffle_ps (srcVec[2 * i], \
- srcVec[2 * i], \
- _MM_SHUFFLE (2, 2, 2, 2)); \
- \
- x[3] = _mm_shuffle_ps (srcVec[2 * i], \
- srcVec[2 * i], \
- _MM_SHUFFLE (3, 3, 3, 3)); \
- \
- x[4] = _mm_shuffle_ps (srcVec[2 * i + 1], \
- srcVec[2 * i + 1], \
- _MM_SHUFFLE (0, 0, 0, 0)); \
- \
- x[5] = _mm_shuffle_ps (srcVec[2 * i + 1], \
- srcVec[2 * i + 1], \
- _MM_SHUFFLE (1, 1, 1, 1)); \
- \
- x[6] = _mm_shuffle_ps (srcVec[2 * i + 1], \
- srcVec[2 * i + 1], \
- _MM_SHUFFLE (2, 2, 2, 2)); \
- \
- x[7] = _mm_shuffle_ps (srcVec[2 * i + 1], \
- srcVec[2 * i + 1], \
- _MM_SHUFFLE (3, 3, 3, 3)); \
- /* \
- * Multiply the components by each column of the matrix \
- */ \
- \
- x[0] = _mm_mul_ps (x[0], c0); \
- x[2] = _mm_mul_ps (x[2], c1); \
- x[4] = _mm_mul_ps (x[4], c2); \
- x[6] = _mm_mul_ps (x[6], c3); \
- \
- x[1] = _mm_mul_ps (x[1], c4); \
- x[3] = _mm_mul_ps (x[3], c5); \
- x[5] = _mm_mul_ps (x[5], c6); \
- x[7] = _mm_mul_ps (x[7], c7); \
- \
- /* \
- * Add across \
- */ \
- \
- evenSum = _mm_setzero_ps(); \
- evenSum = _mm_add_ps (evenSum, x[0]); \
- evenSum = _mm_add_ps (evenSum, x[2]); \
- evenSum = _mm_add_ps (evenSum, x[4]); \
- evenSum = _mm_add_ps (evenSum, x[6]); \
- \
- oddSum = _mm_setzero_ps(); \
- oddSum = _mm_add_ps (oddSum, x[1]); \
- oddSum = _mm_add_ps (oddSum, x[3]); \
- oddSum = _mm_add_ps (oddSum, x[5]); \
- oddSum = _mm_add_ps (oddSum, x[7]); \
- \
- /* \
- * Final Sum: \
- * out [0, 1, 2, 3] = evenSum + oddSum \
- * out [7, 6, 5, 4] = evenSum - oddSum \
- */ \
- \
- srcVec[2 * i] = _mm_add_ps (evenSum, oddSum); \
- srcVec[2 * i + 1] = _mm_sub_ps (evenSum, oddSum); \
- srcVec[2 * i + 1] = _mm_shuffle_ps (srcVec[2 * i + 1], \
- srcVec[2 * i + 1], \
- _MM_SHUFFLE (0, 1, 2, 3));
- switch (zeroedRows)
- {
- case 0:
- default:
- DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (4)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (5)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (6)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (7)
- break;
- case 1:
- DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (4)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (5)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (6)
- break;
- case 2:
- DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (4)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (5)
- break;
- case 3:
- DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (4)
- break;
- case 4:
- DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
- break;
- case 5:
- DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
- break;
- case 6:
- DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
- DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
- break;
- case 7:
- DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
- break;
- }
- //
- // Columns -
- //
- // This is slightly more straightforward, if less readable. Here
- // we just operate on 4 columns at a time, in two batches.
- //
- // The slight mess is to try and cache sub-expressions, which
- // we ignore in the row-wise pass.
- //
- for (int col = 0; col < 2; ++col)
- {
- for (int i = 0; i < 8; ++i)
- in[i] = srcVec[2 * i + col];
- alpha[0] = _mm_mul_ps (c, in[2]);
- alpha[1] = _mm_mul_ps (f, in[2]);
- alpha[2] = _mm_mul_ps (c, in[6]);
- alpha[3] = _mm_mul_ps (f, in[6]);
- beta[0] = _mm_add_ps (_mm_add_ps (_mm_mul_ps (in[1], b),
- _mm_mul_ps (in[3], d)),
- _mm_add_ps (_mm_mul_ps (in[5], e),
- _mm_mul_ps (in[7], g)));
- beta[1] = _mm_sub_ps (_mm_sub_ps (_mm_mul_ps (in[1], d),
- _mm_mul_ps (in[3], g)),
- _mm_add_ps (_mm_mul_ps (in[5], b),
- _mm_mul_ps (in[7], e)));
- beta[2] = _mm_add_ps (_mm_sub_ps (_mm_mul_ps (in[1], e),
- _mm_mul_ps (in[3], b)),
- _mm_add_ps (_mm_mul_ps (in[5], g),
- _mm_mul_ps (in[7], d)));
- beta[3] = _mm_add_ps (_mm_sub_ps (_mm_mul_ps (in[1], g),
- _mm_mul_ps (in[3], e)),
- _mm_sub_ps (_mm_mul_ps (in[5], d),
- _mm_mul_ps (in[7], b)));
- theta[0] = _mm_mul_ps (a, _mm_add_ps (in[0], in[4]));
- theta[3] = _mm_mul_ps (a, _mm_sub_ps (in[0], in[4]));
- theta[1] = _mm_add_ps (alpha[0], alpha[3]);
- theta[2] = _mm_sub_ps (alpha[1], alpha[2]);
- gamma[0] = _mm_add_ps (theta[0], theta[1]);
- gamma[1] = _mm_add_ps (theta[3], theta[2]);
- gamma[2] = _mm_sub_ps (theta[3], theta[2]);
- gamma[3] = _mm_sub_ps (theta[0], theta[1]);
- srcVec[ col] = _mm_add_ps (gamma[0], beta[0]);
- srcVec[2+col] = _mm_add_ps (gamma[1], beta[1]);
- srcVec[4+col] = _mm_add_ps (gamma[2], beta[2]);
- srcVec[6+col] = _mm_add_ps (gamma[3], beta[3]);
- srcVec[ 8+col] = _mm_sub_ps (gamma[3], beta[3]);
- srcVec[10+col] = _mm_sub_ps (gamma[2], beta[2]);
- srcVec[12+col] = _mm_sub_ps (gamma[1], beta[1]);
- srcVec[14+col] = _mm_sub_ps (gamma[0], beta[0]);
- }
- #else /* IMF_HAVE_SSE2 */
- dctInverse8x8_scalar<zeroedRows> (data);
- #endif /* IMF_HAVE_SSE2 */
- }
- //
- // AVX Implementation
- //
- #define STR(A) #A /* Turn A into a string literal so it can be spliced into the asm templates below */
- #define IDCT_AVX_SETUP_2_ROWS(_DST0, _DST1, _TMP0, _TMP1, \
- _OFF00, _OFF01, _OFF10, _OFF11) /* Gather two rows of the block at (%0).
- * The 16-byte loads at byte offsets _OFF00 and _OFF01 fill the low
- * halves of ymm<_TMP0> and ymm<_TMP1>; the 16 bytes at _OFF10 and
- * _OFF11 are inserted into their high halves. The unpack sequence
- * then leaves the even-indexed floats of both rows in ymm<_DST0>
- * and the odd-indexed floats in ymm<_DST1>. All four register
- * arguments are bare register numbers; both temporaries are
- * overwritten.
- */ \
- "vmovaps " STR(_OFF00) "(%0), %%xmm" STR(_TMP0) " \n" \
- "vmovaps " STR(_OFF01) "(%0), %%xmm" STR(_TMP1) " \n" \
- " \n" \
- "vinsertf128 $1, " STR(_OFF10) "(%0), %%ymm" STR(_TMP0) ", %%ymm" STR(_TMP0) " \n" \
- "vinsertf128 $1, " STR(_OFF11) "(%0), %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP1) " \n" \
- " \n" \
- "vunpcklpd %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP0) ", %%ymm" STR(_DST0) " \n" \
- "vunpckhpd %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP0) ", %%ymm" STR(_DST1) " \n" \
- " \n" \
- "vunpcklps %%ymm" STR(_DST1) ", %%ymm" STR(_DST0) ", %%ymm" STR(_TMP0) " \n" \
- "vunpckhps %%ymm" STR(_DST1) ", %%ymm" STR(_DST0) ", %%ymm" STR(_TMP1) " \n" \
- " \n" \
- "vunpcklpd %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP0) ", %%ymm" STR(_DST0) " \n" \
- "vunpckhpd %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP0) ", %%ymm" STR(_DST1) " \n"
- #define IDCT_AVX_MMULT_ROWS(_SRC) \
- /* Broadcast elements 0, 1, 2 and 3 of each 128-bit half of _SRC \
- * across ymm12, ymm13, ymm14 and ymm15 respectively */ \
- "vpermilps $0x00, " STR(_SRC) ", %%ymm12 \n" \
- "vpermilps $0x55, " STR(_SRC) ", %%ymm13 \n" \
- "vpermilps $0xaa, " STR(_SRC) ", %%ymm14 \n" \
- "vpermilps $0xff, " STR(_SRC) ", %%ymm15 \n" \
- \
- /* Multiply the broadcast values by the coefficient vectors that \
- * the caller has already placed in ymm8-ymm11 */ \
- "vmulps %%ymm12, %%ymm8, %%ymm12 \n" \
- "vmulps %%ymm13, %%ymm9, %%ymm13 \n" \
- "vmulps %%ymm14, %%ymm10, %%ymm14 \n" \
- "vmulps %%ymm15, %%ymm11, %%ymm15 \n" \
- \
- /* Sum the four products and write the result back into the \
- * source register; ymm12-ymm15 are left clobbered */ \
- "vaddps %%ymm13, %%ymm12, %%ymm12 \n" \
- "vaddps %%ymm15, %%ymm14, %%ymm14 \n" \
- "vaddps %%ymm14, %%ymm12, " STR(_SRC) "\n"
- #define IDCT_AVX_EO_TO_ROW_HALVES(_EVEN, _ODD, _FRONT, _BACK) /* _BACK = _EVEN - _ODD is computed first, so _FRONT = _EVEN + _ODD may reuse the _EVEN register */ \
- "vsubps " STR(_ODD) "," STR(_EVEN) "," STR(_BACK) "\n" \
- "vaddps " STR(_ODD) "," STR(_EVEN) "," STR(_FRONT) "\n" \
- /* Reverse the element order within each 128-bit half of the back half */ \
- "vpermilps $0x1b," STR(_BACK) "," STR(_BACK) "\n"
- /* In order to allow for fast paths when we know certain rows
- * of the 8x8 block are zero, most of the body of the DCT is
- * in the following macro. Statements are wrapped in a ROWn()
- * macro, where n is the lowest row in the 8x8 block on which
- * they depend. Defining ROWn() as empty removes those statements.
- *
- * This should work for the cases where we have 2-8 full rows.
- * The 1-row case is special, and we'll handle it separately.
- */
- #define IDCT_AVX_BODY \
- /* ==============================================
- * Row 1D DCT
- * ----------------------------------------------
- */ \
- \
- /* Setup for the row-oriented 1D DCT. Assuming that (%0) holds
- * the row-major 8x8 block, load ymm0-3 with the even columns
- * and ymm4-7 with the odd columns. The lower half of the ymm
- * holds one row, while the upper half holds the next row.
- *
- * If our source is:
- *     a0 a1 a2 a3 a4 a5 a6 a7
- *     b0 b1 b2 b3 b4 b5 b6 b7
- *
- * We'll be forming:
- *     a0 a2 a4 a6 b0 b2 b4 b6
- *     a1 a3 a5 a7 b1 b3 b5 b7
- */ \
- ROW0( IDCT_AVX_SETUP_2_ROWS(0, 4, 14, 15, 0, 16, 32, 48) ) \
- ROW2( IDCT_AVX_SETUP_2_ROWS(1, 5, 12, 13, 64, 80, 96, 112) ) \
- ROW4( IDCT_AVX_SETUP_2_ROWS(2, 6, 10, 11, 128, 144, 160, 176) ) \
- ROW6( IDCT_AVX_SETUP_2_ROWS(3, 7, 8, 9, 192, 208, 224, 240) ) \
- \
- /* Multiply the even columns (ymm0-3) by the matrix M1
- * storing the results back in ymm0-3
- *
- * Assume that (%1) holds the matrix in column major order
- */ \
- "vbroadcastf128 (%1), %%ymm8 \n" \
- "vbroadcastf128 16(%1), %%ymm9 \n" \
- "vbroadcastf128 32(%1), %%ymm10 \n" \
- "vbroadcastf128 48(%1), %%ymm11 \n" \
- \
- ROW0( IDCT_AVX_MMULT_ROWS(%%ymm0) ) \
- ROW2( IDCT_AVX_MMULT_ROWS(%%ymm1) ) \
- ROW4( IDCT_AVX_MMULT_ROWS(%%ymm2) ) \
- ROW6( IDCT_AVX_MMULT_ROWS(%%ymm3) ) \
- \
- /* Repeat, but with the odd columns (ymm4-7) and the
- * matrix M2
- */ \
- "vbroadcastf128 64(%1), %%ymm8 \n" \
- "vbroadcastf128 80(%1), %%ymm9 \n" \
- "vbroadcastf128 96(%1), %%ymm10 \n" \
- "vbroadcastf128 112(%1), %%ymm11 \n" \
- \
- ROW0( IDCT_AVX_MMULT_ROWS(%%ymm4) ) \
- ROW2( IDCT_AVX_MMULT_ROWS(%%ymm5) ) \
- ROW4( IDCT_AVX_MMULT_ROWS(%%ymm6) ) \
- ROW6( IDCT_AVX_MMULT_ROWS(%%ymm7) ) \
- \
- /* Sum the M1 (ymm0-3) and M2 (ymm4-7) results to get the
- * front halves of the results, and difference to get the
- * back halves. The front halves end up in ymm0-3, the back
- * halves end up in ymm12-15.
- */ \
- ROW0( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm0, %%ymm4, %%ymm0, %%ymm12) ) \
- ROW2( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm1, %%ymm5, %%ymm1, %%ymm13) ) \
- ROW4( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm2, %%ymm6, %%ymm2, %%ymm14) ) \
- ROW6( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm3, %%ymm7, %%ymm3, %%ymm15) ) \
- \
- /* Reassemble the row halves into ymm0-7. Selector $0x02 joins the low halves of the front/back pair (the even row), $0x13 joins the high halves (the odd row). Done from row 7 down so that ymm3, ymm2 and ymm1 are read before they are overwritten. */ \
- ROW7( "vperm2f128 $0x13, %%ymm3, %%ymm15, %%ymm7 \n" ) \
- ROW6( "vperm2f128 $0x02, %%ymm3, %%ymm15, %%ymm6 \n" ) \
- ROW5( "vperm2f128 $0x13, %%ymm2, %%ymm14, %%ymm5 \n" ) \
- ROW4( "vperm2f128 $0x02, %%ymm2, %%ymm14, %%ymm4 \n" ) \
- ROW3( "vperm2f128 $0x13, %%ymm1, %%ymm13, %%ymm3 \n" ) \
- ROW2( "vperm2f128 $0x02, %%ymm1, %%ymm13, %%ymm2 \n" ) \
- ROW1( "vperm2f128 $0x13, %%ymm0, %%ymm12, %%ymm1 \n" ) \
- ROW0( "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0 \n" ) \
- \
- \
- /* ==============================================
- * Column 1D DCT
- * ----------------------------------------------
- */ \
- \
- /* Rows should be in ymm0-7, and M2 columns should still be
- * preserved in ymm8-11. M2 has 4 unique values (and +-
- * versions of each), and all (positive) values appear in
- * the first column (and row), which is in ymm8.
- *
- * For the column-wise DCT, we need to:
- * 1) Broadcast each element of a row of M2 into 4 vectors
- * 2) Multiply the odd rows (ymm1,3,5,7) by the broadcasts.
- * 3) Accumulate into ymm12-15 for the odd outputs.
- *
- * Instead of doing 16 broadcasts for each element in M2,
- * do 4, filling y8-11 with:
- *
- *     ymm8:  [ b b b b | b b b b ]
- *     ymm9:  [ d d d d | d d d d ]
- *     ymm10: [ e e e e | e e e e ]
- *     ymm11: [ g g g g | g g g g ]
- *
- * And deal with the negative values by subtracting during accum.
- * ymm8 is broadcast last because it is the source of all four.
- */ \
- "vpermilps $0xff, %%ymm8, %%ymm11 \n" \
- "vpermilps $0xaa, %%ymm8, %%ymm10 \n" \
- "vpermilps $0x55, %%ymm8, %%ymm9 \n" \
- "vpermilps $0x00, %%ymm8, %%ymm8 \n" \
- \
- /* This one is easy, since we have ymm12-15 open for scratch
- * ymm12 = b ymm1 + d ymm3 + e ymm5 + g ymm7
- */ \
- ROW1( "vmulps %%ymm1, %%ymm8, %%ymm12 \n" ) \
- ROW3( "vmulps %%ymm3, %%ymm9, %%ymm13 \n" ) \
- ROW5( "vmulps %%ymm5, %%ymm10, %%ymm14 \n" ) \
- ROW7( "vmulps %%ymm7, %%ymm11, %%ymm15 \n" ) \
- \
- ROW3( "vaddps %%ymm12, %%ymm13, %%ymm12 \n" ) \
- ROW7( "vaddps %%ymm14, %%ymm15, %%ymm14 \n" ) \
- ROW5( "vaddps %%ymm12, %%ymm14, %%ymm12 \n" ) \
- \
- /* Trickier, since only y13-15 are open for scratch
- * ymm13 = d ymm1 - g ymm3 - b ymm5 - e ymm7
- */ \
- ROW1( "vmulps %%ymm1, %%ymm9, %%ymm13 \n" ) \
- ROW3( "vmulps %%ymm3, %%ymm11, %%ymm14 \n" ) \
- ROW5( "vmulps %%ymm5, %%ymm8, %%ymm15 \n" ) \
- \
- ROW5( "vaddps %%ymm14, %%ymm15, %%ymm14 \n" ) \
- ROW3( "vsubps %%ymm14, %%ymm13, %%ymm13 \n" ) \
- \
- ROW7( "vmulps %%ymm7, %%ymm10, %%ymm15 \n" ) \
- ROW7( "vsubps %%ymm15, %%ymm13, %%ymm13 \n" ) \
- \
- /* Trickier still, as only y14-15 are open for scratch
- * ymm14 = e ymm1 - b ymm3 + g ymm5 + d ymm7
- */ \
- ROW1( "vmulps %%ymm1, %%ymm10, %%ymm14 \n" ) \
- ROW3( "vmulps %%ymm3, %%ymm8, %%ymm15 \n" ) \
- \
- ROW3( "vsubps %%ymm15, %%ymm14, %%ymm14 \n" ) \
- \
- ROW5( "vmulps %%ymm5, %%ymm11, %%ymm15 \n" ) \
- ROW5( "vaddps %%ymm15, %%ymm14, %%ymm14 \n" ) \
- \
- ROW7( "vmulps %%ymm7, %%ymm9, %%ymm15 \n" ) \
- ROW7( "vaddps %%ymm15, %%ymm14, %%ymm14 \n" ) \
- \
- \
- /* Easy, as we can blow away ymm1,3,5,7 for scratch
- * ymm15 = g ymm1 - e ymm3 + d ymm5 - b ymm7
- */ \
- ROW1( "vmulps %%ymm1, %%ymm11, %%ymm15 \n" ) \
- ROW3( "vmulps %%ymm3, %%ymm10, %%ymm3 \n" ) \
- ROW5( "vmulps %%ymm5, %%ymm9, %%ymm5 \n" ) \
- ROW7( "vmulps %%ymm7, %%ymm8, %%ymm7 \n" ) \
- \
- ROW5( "vaddps %%ymm15, %%ymm5, %%ymm15 \n" ) \
- ROW7( "vaddps %%ymm3, %%ymm7, %%ymm3 \n" ) \
- ROW3( "vsubps %%ymm3, %%ymm15, %%ymm15 \n" ) \
- \
- \
- /* Load coefs for M1. Because we're going to broadcast
- * coefs, we don't need to load the actual structure from
- * M1. Instead, just load enough that we can broadcast.
- * There are only 6 unique values in M1, but they're in +-
- * pairs, leaving only 3 unique coefs if we add and subtract
- * properly.
- *
- * Fill ymm1 with coef[2] = [ a a c f | a a c f ]
- * Broadcast ymm5 with [ f f f f | f f f f ]
- * Broadcast ymm3 with [ c c c c | c c c c ]
- * Broadcast ymm1 with [ a a a a | a a a a ]
- */ \
- "vbroadcastf128 8(%1), %%ymm1 \n" \
- "vpermilps $0xff, %%ymm1, %%ymm5 \n" \
- "vpermilps $0xaa, %%ymm1, %%ymm3 \n" \
- "vpermilps $0x00, %%ymm1, %%ymm1 \n" \
- \
- /* If we expand E = [M1] [x0 x2 x4 x6]^t, we get the following
- * common expressions:
- *
- * E_0 = ymm8 = (a ymm0 + a ymm4) + (c ymm2 + f ymm6)
- * E_3 = ymm11 = (a ymm0 + a ymm4) - (c ymm2 + f ymm6)
- *
- * E_1 = ymm9 = (a ymm0 - a ymm4) + (f ymm2 - c ymm6)
- * E_2 = ymm10 = (a ymm0 - a ymm4) - (f ymm2 - c ymm6)
- *
- * Afterwards, ymm8-11 will hold the even outputs.
- */ \
- \
- /* ymm11 = (a ymm0 + a ymm4), ymm1 = (a ymm0 - a ymm4) */ \
- ROW0( "vmulps %%ymm1, %%ymm0, %%ymm11 \n" ) \
- ROW4( "vmulps %%ymm1, %%ymm4, %%ymm4 \n" ) \
- ROW0( "vmovaps %%ymm11, %%ymm1 \n" ) \
- ROW4( "vaddps %%ymm4, %%ymm11, %%ymm11 \n" ) \
- ROW4( "vsubps %%ymm4, %%ymm1, %%ymm1 \n" ) \
- \
- /* ymm7 = (c ymm2 + f ymm6) */ \
- ROW2( "vmulps %%ymm3, %%ymm2, %%ymm7 \n" ) \
- ROW6( "vmulps %%ymm5, %%ymm6, %%ymm9 \n" ) \
- ROW6( "vaddps %%ymm9, %%ymm7, %%ymm7 \n" ) \
- \
- /* E_0 = ymm8 = (a ymm0 + a ymm4) + (c ymm2 + f ymm6)
- * E_3 = ymm11 = (a ymm0 + a ymm4) - (c ymm2 + f ymm6)
- */ \
- ROW0( "vmovaps %%ymm11, %%ymm8 \n" ) \
- ROW2( "vaddps %%ymm7, %%ymm8, %%ymm8 \n" ) \
- ROW2( "vsubps %%ymm7, %%ymm11, %%ymm11 \n" ) \
- \
- /* ymm7 = (f ymm2 - c ymm6) */ \
- ROW2( "vmulps %%ymm5, %%ymm2, %%ymm7 \n" ) \
- ROW6( "vmulps %%ymm3, %%ymm6, %%ymm9 \n" ) \
- ROW6( "vsubps %%ymm9, %%ymm7, %%ymm7 \n" ) \
- \
- /* E_1 = ymm9 = (a ymm0 - a ymm4) + (f ymm2 - c ymm6)
- * E_2 = ymm10 = (a ymm0 - a ymm4) - (f ymm2 - c ymm6)
- */ \
- ROW0( "vmovaps %%ymm1, %%ymm9 \n" ) \
- ROW0( "vmovaps %%ymm1, %%ymm10 \n" ) \
- ROW2( "vaddps %%ymm7, %%ymm1, %%ymm9 \n" ) \
- ROW2( "vsubps %%ymm7, %%ymm1, %%ymm10 \n" ) \
- \
- /* Add the even (ymm8-11) and the odds (ymm12-15),
- * placing the results into ymm0-7
- */ \
- "vaddps %%ymm12, %%ymm8, %%ymm0 \n" \
- "vaddps %%ymm13, %%ymm9, %%ymm1 \n" \
- "vaddps %%ymm14, %%ymm10, %%ymm2 \n" \
- "vaddps %%ymm15, %%ymm11, %%ymm3 \n" \
- \
- "vsubps %%ymm12, %%ymm8, %%ymm7 \n" \
- "vsubps %%ymm13, %%ymm9, %%ymm6 \n" \
- "vsubps %%ymm14, %%ymm10, %%ymm5 \n" \
- "vsubps %%ymm15, %%ymm11, %%ymm4 \n" \
- \
- /* Copy out the results from ymm0-7, one full 8-float row per register */ \
- "vmovaps %%ymm0, (%0) \n" \
- "vmovaps %%ymm1, 32(%0) \n" \
- "vmovaps %%ymm2, 64(%0) \n" \
- "vmovaps %%ymm3, 96(%0) \n" \
- "vmovaps %%ymm4, 128(%0) \n" \
- "vmovaps %%ymm5, 160(%0) \n" \
- "vmovaps %%ymm6, 192(%0) \n" \
- "vmovaps %%ymm7, 224(%0) \n"
- /* Output, input, and clobber (OIC) sections of the inline asm.
- * _IN0 becomes operand %0 (the pointer to the 8x8 block) and
- * sAvxCoef, which must be in scope where the macro is expanded,
- * becomes operand %1 (the coefficient table).
- */
- #define IDCT_AVX_OIC(_IN0) \
- : /* Output: none declared; the block is modified through %0, which the "memory" clobber accounts for */ \
- : /* Input */ "r"(_IN0), "r"(sAvxCoef) \
- : /* Clobber: memory plus all sixteen vector registers */ "memory", \
- "%xmm0", "%xmm1", "%xmm2", "%xmm3", \
- "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",\
- "%xmm12", "%xmm13", "%xmm14", "%xmm15"
- /* IDCT_AVX_ASM(_IN0) expands to the complete asm statement:
- * IDCT_AVX_BODY followed by the operand lists from IDCT_AVX_OIC.
- *
- * Include vzeroupper for non-AVX builds, i.e. when __AVX__ is not
- * defined for this translation unit.
- * NOTE(review): presumably this avoids AVX-to-SSE transition
- * penalties in the surrounding code - confirm.
- */
- #ifndef __AVX__
- #define IDCT_AVX_ASM(_IN0) \
- __asm__( \
- IDCT_AVX_BODY \
- "vzeroupper \n" \
- IDCT_AVX_OIC(_IN0) \
- );
- #else /* __AVX__ */
- #define IDCT_AVX_ASM(_IN0) \
- __asm__( \
- IDCT_AVX_BODY \
- IDCT_AVX_OIC(_IN0) \
- );
- #endif /* __AVX__ */
- template <int zeroedRows>
- void
- dctInverse8x8_avx (float *data)
- { /* In-place 8x8 inverse DCT built from GCC-style inline AVX assembly.
- *
- * zeroedRows (0-7) selects one of eight asm variants; any other
- * value reaches the assert at the end of the chain. When
- * IMF_HAVE_GCC_INLINEASM_64 is not defined the call is forwarded
- * to dctInverse8x8_scalar.
- *
- * NOTE(review): the asm stores whole ymm registers to data with
- * vmovaps, so data presumably has to be 32-byte aligned - confirm
- * against the callers.
- */
- #if defined IMF_HAVE_GCC_INLINEASM_64
- /* The column-major version of M1, followed by the
- * column-major version of M2:
- *
- *      [ a  c  a  f ]        [ b  d  e  g ]
- * M1 = [ a  f -a -c ]   M2 = [ d -g -b -e ]
- *      [ a -f -a  c ]        [ e -b  g  d ]
- *      [ a -c  a -f ]        [ g -e  d -b ]
- */
- const float sAvxCoef[32] __attribute__((aligned(32))) = {
- 3.535536e-01, 3.535536e-01, 3.535536e-01, 3.535536e-01, /* a a a a */
- 4.619398e-01, 1.913422e-01, -1.913422e-01, -4.619398e-01, /* c f -f -c */
- 3.535536e-01, -3.535536e-01, -3.535536e-01, 3.535536e-01, /* a -a -a a */
- 1.913422e-01, -4.619398e-01, 4.619398e-01, -1.913422e-01, /* f -c c -f */
- 4.903927e-01, 4.157349e-01, 2.777855e-01, 9.754573e-02, /* b d e g */
- 4.157349e-01, -9.754573e-02, -4.903927e-01, -2.777855e-01, /* d -g -b -e */
- 2.777855e-01, -4.903927e-01, 9.754573e-02, 4.157349e-01, /* e -b g d */
- 9.754573e-02, -2.777855e-01, 4.157349e-01, -4.903927e-01 /* g -e d -b */
- };
- #define ROW0(_X) _X /* ROWn(x) emits x while row n is in use; the branches below redefine them to emit nothing */
- #define ROW1(_X) _X
- #define ROW2(_X) _X
- #define ROW3(_X) _X
- #define ROW4(_X) _X
- #define ROW5(_X) _X
- #define ROW6(_X) _X
- #define ROW7(_X) _X
- if (zeroedRows == 0) { /* all eight rows present */
- IDCT_AVX_ASM(data)
- } else if (zeroedRows == 1) {
- #undef ROW7 /* Preprocessor directives are not scoped by the if/else chain: each redefinition stays in force for every later branch, so each branch drops one more row than the one before it */
- #define ROW7(_X)
- IDCT_AVX_ASM(data)
- } else if (zeroedRows == 2) {
- #undef ROW6
- #define ROW6(_X)
- IDCT_AVX_ASM(data)
- } else if (zeroedRows == 3) {
- #undef ROW5
- #define ROW5(_X)
- IDCT_AVX_ASM(data)
- } else if (zeroedRows == 4) {
- #undef ROW4
- #define ROW4(_X)
- IDCT_AVX_ASM(data)
- } else if (zeroedRows == 5) {
- #undef ROW3
- #define ROW3(_X)
- IDCT_AVX_ASM(data)
- } else if (zeroedRows == 6) {
- #undef ROW2
- #define ROW2(_X)
- IDCT_AVX_ASM(data)
- } else if (zeroedRows == 7) { /* only row 0 is non-zero: hand-written variant instead of IDCT_AVX_BODY */
- __asm__(
- /* ==============================================
- * Row 1D DCT
- * ----------------------------------------------
- */
- IDCT_AVX_SETUP_2_ROWS(0, 4, 14, 15, 0, 16, 32, 48)
- "vbroadcastf128 (%1), %%ymm8 \n"
- "vbroadcastf128 16(%1), %%ymm9 \n"
- "vbroadcastf128 32(%1), %%ymm10 \n"
- "vbroadcastf128 48(%1), %%ymm11 \n"
- /* Stash a vector of [a a a a | a a a a] away in ymm2 */
- "vinsertf128 $1, %%xmm8, %%ymm8, %%ymm2 \n"
- IDCT_AVX_MMULT_ROWS(%%ymm0)
- "vbroadcastf128 64(%1), %%ymm8 \n"
- "vbroadcastf128 80(%1), %%ymm9 \n"
- "vbroadcastf128 96(%1), %%ymm10 \n"
- "vbroadcastf128 112(%1), %%ymm11 \n"
- IDCT_AVX_MMULT_ROWS(%%ymm4)
- IDCT_AVX_EO_TO_ROW_HALVES(%%ymm0, %%ymm4, %%ymm0, %%ymm12)
- "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0 \n"
- /* ==============================================
- * Column 1D DCT
- * ----------------------------------------------
- */
- /* DC only, so multiply by a and we're done */
- "vmulps %%ymm2, %%ymm0, %%ymm0 \n"
- /* Copy out results: the same transformed row goes to all eight rows */
- "vmovaps %%ymm0, (%0) \n"
- "vmovaps %%ymm0, 32(%0) \n"
- "vmovaps %%ymm0, 64(%0) \n"
- "vmovaps %%ymm0, 96(%0) \n"
- "vmovaps %%ymm0, 128(%0) \n"
- "vmovaps %%ymm0, 160(%0) \n"
- "vmovaps %%ymm0, 192(%0) \n"
- "vmovaps %%ymm0, 224(%0) \n"
- #ifndef __AVX__
- "vzeroupper \n"
- #endif /* __AVX__ */
- IDCT_AVX_OIC(data)
- );
- } else {
- assert(false); // Invalid template instance parameter
- }
- #else /* IMF_HAVE_GCC_INLINEASM_64 */
- dctInverse8x8_scalar<zeroedRows>(data);
- #endif /* IMF_HAVE_GCC_INLINEASM_64 */
- }
- //
- // Full 8x8 Forward DCT:
- //
- // Base forward 8x8 DCT implementation. Works on the data in-place
- //
- // The implementation described in Pennebaker + Mitchell,
- // section 4.3.2, and illustrated in figure 4-7
- //
- // The basic idea is that the 1D DCT math reduces to:
- //
- // 2*out_0 = c_4 [(s_07 + s_34) + (s_12 + s_56)]
- // 2*out_4 = c_4 [(s_07 + s_34) - (s_12 + s_56)]
- //
- // {2*out_2, 2*out_6} = rot_6 ((d_12 - d_56), (s_07 - s_34))
- //
- // {2*out_3, 2*out_5} = rot_-3 (d_07 - c_4 (s_12 - s_56),
- // d_34 - c_4 (d_12 + d_56))
- //
- // {2*out_1, 2*out_7} = rot_-1 (d_07 + c_4 (s_12 - s_56),
- // -d_34 - c_4 (d_12 + d_56))
- //
- // where:
- //
- // c_i = cos(i*pi/16)
- // s_i = sin(i*pi/16)
- //
- // s_ij = in_i + in_j
- // d_ij = in_i - in_j
- //
- // rot_i(x, y) = {c_i*x + s_i*y, -s_i*x + c_i*y}
- //
- // We'll run the DCT in two passes. First, run the 1D DCT on
- // the rows, in-place. Then, run over the columns in-place,
- // and be done with it.
- //
- #ifndef IMF_HAVE_SSE2
- //
- // Default implementation
- //
//
// Scalar forward 8x8 DCT, computed in place on 64 row-major floats.
//
// A 1D DCT is run over every row, then over every column. The two
// passes deliberately keep their own rounding: the row pass multiplies
// by pre-halved cosines, the column pass halves the finished sum.
//
void
dctForward8x8 (float *data)
{
    const float c1 = cosf (3.14159f * 1.0f / 16.0f);
    const float c2 = cosf (3.14159f * 2.0f / 16.0f);
    const float c3 = cosf (3.14159f * 3.0f / 16.0f);
    const float c4 = cosf (3.14159f * 4.0f / 16.0f);
    const float c5 = cosf (3.14159f * 5.0f / 16.0f);
    const float c6 = cosf (3.14159f * 6.0f / 16.0f);
    const float c7 = cosf (3.14159f * 7.0f / 16.0f);

    const float c1Half = .5f * c1;
    const float c2Half = .5f * c2;
    const float c3Half = .5f * c3;
    const float c5Half = .5f * c5;
    const float c6Half = .5f * c6;
    const float c7Half = .5f * c7;

    //
    // Pass 1: rows. s_ij / d_ij are the sum / difference of inputs
    // i and j; every input is read before any output is stored.
    //
    for (int r = 0; r < 8; ++r)
    {
        float *line = data + 8 * r;

        const float s07 = line[0] + line[7];
        const float s12 = line[1] + line[2];
        const float d12 = line[1] - line[2];
        const float s34 = line[3] + line[4];
        const float d34 = line[3] - line[4];
        const float s56 = line[5] + line[6];
        const float d56 = line[5] - line[6];
        const float d07 = line[0] - line[7];

        // out_0, out_4
        const float lowA = c4 * (s07 + s34);
        const float lowB = c4 * (s12 + s56);

        line[0] = .5f * (lowA + lowB);
        line[4] = .5f * (lowA - lowB);

        // (2*out_2, 2*out_6) = rot 6 (d12 - d56, s07 - s34)
        float px = d12 - d56;
        float py = s07 - s34;

        line[2] = c6Half * px + c2Half * py;
        line[6] = c6Half * py - c2Half * px;

        // Shared by the two remaining rotations.
        const float k0 = c4 * (s12 - s56);
        const float k1 = -1 * c4 * (d12 + d56);

        // (2*out_3, 2*out_5) = rot -3 (d07 - k0, d34 + k1)
        px = d07 - k0;
        py = d34 + k1;

        line[3] = c3Half * px - c5Half * py;
        line[5] = c5Half * px + c3Half * py;

        // (2*out_1, 2*out_7) = rot -1 (d07 + k0, k1 - d34)
        px = d07 + k0;
        py = k1 - d34;

        line[1] = c1Half * px - c7Half * py;
        line[7] = c7Half * px + c1Half * py;
    }

    //
    // Pass 2: columns, same butterfly with a stride of 8 floats.
    //
    for (int col = 0; col < 8; ++col)
    {
        float *top = data + col;

        const float s07 = top[ 0] + top[56];
        const float d07 = top[ 0] - top[56];
        const float s12 = top[ 8] + top[16];
        const float d12 = top[ 8] - top[16];
        const float s34 = top[24] + top[32];
        const float d34 = top[24] - top[32];
        const float s56 = top[40] + top[48];
        const float d56 = top[40] - top[48];

        // out_0, out_4
        const float lowA = c4 * (s07 + s34);
        const float lowB = c4 * (s12 + s56);

        top[ 0] = .5f * (lowA + lowB);
        top[32] = .5f * (lowA - lowB);

        // (2*out_2, 2*out_6) = rot 6 (d12 - d56, s07 - s34)
        float px = d12 - d56;
        float py = s07 - s34;

        top[16] = .5f * (c6 * px + c2 * py);
        top[48] = .5f * (c6 * py - c2 * px);

        // Shared by the two remaining rotations.
        const float k0 = c4 * (s12 - s56);
        const float k1 = -1 * c4 * (d12 + d56);

        // (2*out_3, 2*out_5) = rot -3 (d07 - k0, d34 + k1)
        px = d07 - k0;
        py = d34 + k1;

        top[24] = .5f * (c3 * px - c5 * py);
        top[40] = .5f * (c5 * px + c3 * py);

        // (2*out_1, 2*out_7) = rot -1 (d07 + k0, k1 - d34)
        px = d07 + k0;
        py = k1 - d34;

        top[ 8] = .5f * (c1 * px - c7 * py);
        top[56] = .5f * (c7 * px + c1 * py);
    }
}
- #else /* IMF_HAVE_SSE2 */
- //
- // SSE2 implementation
- //
- // Here, we're always doing a column-wise operation
- // plus transposes. This might be faster to do differently
- // between rows-wise and column-wise
- //
- void
- dctForward8x8 (float *data)
- {
- __m128 *srcVec = (__m128 *)data;
- __m128 a0Vec, a1Vec, a2Vec, a3Vec, a4Vec, a5Vec, a6Vec, a7Vec;
- __m128 k0Vec, k1Vec, rotXVec, rotYVec;
- __m128 transTmp[4], transTmp2[4];
- __m128 c4Vec = { .70710678f, .70710678f, .70710678f, .70710678f};
- __m128 c4NegVec = {-.70710678f, -.70710678f, -.70710678f, -.70710678f};
- __m128 c1HalfVec = {.490392640f, .490392640f, .490392640f, .490392640f};
- __m128 c2HalfVec = {.461939770f, .461939770f, .461939770f, .461939770f};
- __m128 c3HalfVec = {.415734810f, .415734810f, .415734810f, .415734810f};
- __m128 c5HalfVec = {.277785120f, .277785120f, .277785120f, .277785120f};
- __m128 c6HalfVec = {.191341720f, .191341720f, .191341720f, .191341720f};
- __m128 c7HalfVec = {.097545161f, .097545161f, .097545161f, .097545161f};
- __m128 halfVec = {.5f, .5f, .5f, .5f};
- for (int iter = 0; iter < 2; ++iter)
- {
- //
- // Operate on 4 columns at a time. The
- // offsets into our row-major array are:
- // 0: 0 1
- // 1: 2 3
- // 2: 4 5
- // 3: 6 7
- // 4: 8 9
- // 5: 10 11
- // 6: 12 13
- // 7: 14 15
- //
- for (int pass=0; pass<2; ++pass)
- {
- a0Vec = _mm_add_ps (srcVec[ 0 + pass], srcVec[14 + pass]);
- a1Vec = _mm_add_ps (srcVec[ 2 + pass], srcVec[ 4 + pass]);
- a3Vec = _mm_add_ps (srcVec[ 6 + pass], srcVec[ 8 + pass]);
- a5Vec = _mm_add_ps (srcVec[10 + pass], srcVec[12 + pass]);
-
- a7Vec = _mm_sub_ps (srcVec[ 0 + pass], srcVec[14 + pass]);
- a2Vec = _mm_sub_ps (srcVec[ 2 + pass], srcVec[ 4 + pass]);
- a4Vec = _mm_sub_ps (srcVec[ 6 + pass], srcVec[ 8 + pass]);
- a6Vec = _mm_sub_ps (srcVec[10 + pass], srcVec[12 + pass]);
- //
- // First stage; Compute out_0 and out_4
- //
- k0Vec = _mm_add_ps (a0Vec, a3Vec);
- k1Vec = _mm_add_ps (a1Vec, a5Vec);
- k0Vec = _mm_mul_ps (c4Vec, k0Vec);
- k1Vec = _mm_mul_ps (c4Vec, k1Vec);
- srcVec[0 + pass] = _mm_add_ps (k0Vec, k1Vec);
- srcVec[8 + pass] = _mm_sub_ps (k0Vec, k1Vec);
- srcVec[0 + pass] = _mm_mul_ps (srcVec[0 + pass], halfVec );
- srcVec[8 + pass] = _mm_mul_ps (srcVec[8 + pass], halfVec );
- //
- // Second stage; Compute out_2 and out_6
- //
-
- k0Vec = _mm_sub_ps (a2Vec, a6Vec);
- k1Vec = _mm_sub_ps (a0Vec, a3Vec);
- srcVec[ 4 + pass] = _mm_add_ps (_mm_mul_ps (c6HalfVec, k0Vec),
- _mm_mul_ps (c2HalfVec, k1Vec));
- srcVec[12 + pass] = _mm_sub_ps (_mm_mul_ps (c6HalfVec, k1Vec),
- _mm_mul_ps (c2HalfVec, k0Vec));
- //
- // Precompute K0 and K1 for the remaining stages
- //
- k0Vec = _mm_mul_ps (_mm_sub_ps (a1Vec, a5Vec), c4Vec);
- k1Vec = _mm_mul_ps (_mm_add_ps (a2Vec, a6Vec), c4NegVec);
- //
- // Third Stage, compute out_3 and out_5
- //
- rotXVec = _mm_sub_ps (a7Vec, k0Vec);
- rotYVec = _mm_add_ps (a4Vec, k1Vec);
- srcVec[ 6 + pass] = _mm_sub_ps (_mm_mul_ps (c3HalfVec, rotXVec),
- _mm_mul_ps (c5HalfVec, rotYVec));
- srcVec[10 + pass] = _mm_add_ps (_mm_mul_ps (c5HalfVec, rotXVec),
- _mm_mul_ps (c3HalfVec, rotYVec));
- //
- // Fourth Stage, compute out_1 and out_7
- //
- rotXVec = _mm_add_ps (a7Vec, k0Vec);
- rotYVec = _mm_sub_ps (k1Vec, a4Vec);
- srcVec[ 2 + pass] = _mm_sub_ps (_mm_mul_ps (c1HalfVec, rotXVec),
- _mm_mul_ps (c7HalfVec, rotYVec));
- srcVec[14 + pass] = _mm_add_ps (_mm_mul_ps (c7HalfVec, rotXVec),
- _mm_mul_ps (c1HalfVec, rotYVec));
- }
- //
- // Transpose the matrix, in 4x4 blocks. So, if we have our
- // 8x8 matrix divided into 4x4 blocks:
- //
- // M0 | M1 M0t | M2t
- // ----+--- --> -----+------
- // M2 | M3 M1t | M3t
- //
- //
- // M0t, done in place, the first half.
- //
- transTmp[0] = _mm_shuffle_ps (srcVec[0], srcVec[2], 0x44);
- transTmp[1] = _mm_shuffle_ps (srcVec[4], srcVec[6], 0x44);
- transTmp[3] = _mm_shuffle_ps (srcVec[4], srcVec[6], 0xEE);
- transTmp[2] = _mm_shuffle_ps (srcVec[0], srcVec[2], 0xEE);
- //
- // M3t, also done in place, the first half.
- //
- transTmp2[0] = _mm_shuffle_ps (srcVec[ 9], srcVec[11], 0x44);
- transTmp2[1] = _mm_shuffle_ps (srcVec[13], srcVec[15], 0x44);
- transTmp2[2] = _mm_shuffle_ps (srcVec[ 9], srcVec[11], 0xEE);
- transTmp2[3] = _mm_shuffle_ps (srcVec[13], srcVec[15], 0xEE);
- //
- // M0t, the second half.
- //
- srcVec[0] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0x88);
- srcVec[4] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0x88);
- srcVec[2] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0xDD);
- srcVec[6] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0xDD);
- //
- // M3t, the second half.
- //
- srcVec[ 9] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0x88);
- srcVec[13] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0x88);
- srcVec[11] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0xDD);
- srcVec[15] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0xDD);
- //
- // M1 and M2 need to be done at the same time, because we're
- // swapping.
- //
- // First, the first half of M1t
- //
- transTmp[0] = _mm_shuffle_ps (srcVec[1], srcVec[3], 0x44);
- transTmp[1] = _mm_shuffle_ps (srcVec[5], srcVec[7], 0x44);
- transTmp[2] = _mm_shuffle_ps (srcVec[1], srcVec[3], 0xEE);
- transTmp[3] = _mm_shuffle_ps (srcVec[5], srcVec[7], 0xEE);
- //
- // And the first half of M2t
- //
- transTmp2[0] = _mm_shuffle_ps (srcVec[ 8], srcVec[10], 0x44);
- transTmp2[1] = _mm_shuffle_ps (srcVec[12], srcVec[14], 0x44);
- transTmp2[2] = _mm_shuffle_ps (srcVec[ 8], srcVec[10], 0xEE);
- transTmp2[3] = _mm_shuffle_ps (srcVec[12], srcVec[14], 0xEE);
- //
- // Second half of M1t
- //
- srcVec[ 8] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0x88);
- srcVec[12] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0x88);
- srcVec[10] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0xDD);
- srcVec[14] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0xDD);
- //
- // Second half of M2t
- //
- srcVec[1] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0x88);
- srcVec[5] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0x88);
- srcVec[3] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0xDD);
- srcVec[7] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0xDD);
- }
- }
- #endif /* IMF_HAVE_SSE2 */
- } // anonymous namespace
- OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_EXIT
- #endif
|