///////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2009-2014 DreamWorks Animation LLC.
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//   * Redistributions of source code must retain the above copyright
//     notice, this list of conditions and the following disclaimer.
//   * Redistributions in binary form must reproduce the above
//     copyright notice, this list of conditions and the following disclaimer
//     in the documentation and/or other materials provided with the
//     distribution.
//   * Neither the name of DreamWorks Animation nor the names of
//     its contributors may be used to endorse or promote products derived
//     from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
///////////////////////////////////////////////////////////////////////////
#ifndef IMF_DWACOMPRESSORSIMD_H_HAS_BEEN_INCLUDED
#define IMF_DWACOMPRESSORSIMD_H_HAS_BEEN_INCLUDED

//
// Various SSE accelerated functions, used by Imf::DwaCompressor.
// These have been split out into their own .h file, as the fast
// paths are implemented with template specialization.
//
// Unless otherwise noted, all pointers are assumed to be 32-byte
// aligned. Passing unaligned pointers may result in seg-faults.
//

#include "ImfNamespace.h"
#include "ImfSimd.h"
#include "ImfSystemSpecific.h"
#include "OpenEXRConfig.h"

#include <half.h>

#include <assert.h>
#include <string.h>     // memcpy, used by SimdAlignedBuffer64
#include <algorithm>

OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_ENTER

#define _SSE_ALIGNMENT      32
#define _SSE_ALIGNMENT_MASK 0x0F
#define _AVX_ALIGNMENT_MASK 0x1F

//
// Test if we should enable GCC inline asm paths for AVX
//

#ifdef OPENEXR_IMF_HAVE_GCC_INLINE_ASM_AVX

    #define IMF_HAVE_GCC_INLINEASM

    #ifdef __LP64__
        #define IMF_HAVE_GCC_INLINEASM_64
    #endif /* __LP64__ */

#endif /* OPENEXR_IMF_HAVE_GCC_INLINE_ASM_AVX */
//
// A simple 64-element array, aligned properly for SIMD access.
//

template <class T>
class SimdAlignedBuffer64
{
  public:

    SimdAlignedBuffer64 () : _buffer (0), _handle (0)
    {
        alloc ();
    }

    SimdAlignedBuffer64 (const SimdAlignedBuffer64 &rhs) : _handle (0)
    {
        alloc ();
        memcpy (_buffer, rhs._buffer, 64 * sizeof (T));
    }

    SimdAlignedBuffer64 &operator= (const SimdAlignedBuffer64 &rhs)
    {
        memcpy (_buffer, rhs._buffer, 64 * sizeof (T));
        return *this;
    }

#if __cplusplus >= 201103L
    SimdAlignedBuffer64 (SimdAlignedBuffer64 &&rhs) noexcept
        : _buffer (rhs._buffer), _handle (rhs._handle)
    {
        rhs._handle = nullptr;
        rhs._buffer = nullptr;
    }

    SimdAlignedBuffer64 &operator= (SimdAlignedBuffer64 &&rhs) noexcept
    {
        std::swap (_handle, rhs._handle);
        std::swap (_buffer, rhs._buffer);
        return *this;
    }
#endif

    ~SimdAlignedBuffer64 ()
    {
        if (_handle)
            EXRFreeAligned (_handle);
        _handle = 0;
        _buffer = 0;
    }

    void alloc ()
    {
        //
        // Try EXRAllocAligned first - but it might fall back to
        // unaligned allocs. If so, over-allocate and align by hand.
        //

        _handle = (char *) EXRAllocAligned
            (64 * sizeof (T), _SSE_ALIGNMENT);

        if (((size_t)_handle & (_SSE_ALIGNMENT - 1)) == 0)
        {
            _buffer = (T *)_handle;
            return;
        }

        EXRFreeAligned (_handle);
        _handle = (char *) EXRAllocAligned
            (64 * sizeof (T) + _SSE_ALIGNMENT, _SSE_ALIGNMENT);

        char *aligned = _handle;

        while ((size_t)aligned & (_SSE_ALIGNMENT - 1))
            aligned++;

        _buffer = (T *)aligned;
    }

    T *_buffer;

  private:

    char *_handle;
};

typedef SimdAlignedBuffer64<float>          SimdAlignedBuffer64f;
typedef SimdAlignedBuffer64<unsigned short> SimdAlignedBuffer64us;
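
//
// A minimal usage sketch (illustrative only, not a required pattern --
// the aligned buffers are simply convenient holders for the 64-element
// helpers defined below):
//
//     SimdAlignedBuffer64f  floatBlock;   // 64 floats, 32-byte aligned
//     SimdAlignedBuffer64us halfBlock;    // 64 shorts, 32-byte aligned
//
//     // ... fill floatBlock._buffer with an 8x8 block ...
//     convertFloatToHalf64_scalar (halfBlock._buffer, floatBlock._buffer);
//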

namespace {

//
// Color space conversion, Inverse 709 CSC, Y'CbCr -> R'G'B'
//

void
csc709Inverse (float &comp0, float &comp1, float &comp2)
{
    float src[3];

    src[0] = comp0;
    src[1] = comp1;
    src[2] = comp2;

    comp0 = src[0]                    + 1.5747f * src[2];
    comp1 = src[0] - 0.1873f * src[1] - 0.4682f * src[2];
    comp2 = src[0] + 1.8556f * src[1];
}
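
//
// Written as a matrix (read directly off the code above), with input
// (Y', Cb, Cr) and output (R', G', B'), the inverse transform is:
//
//     [comp0]   [ 1    0       1.5747 ] [comp0]
//     [comp1] = [ 1   -0.1873 -0.4682 ] [comp1]
//     [comp2]   [ 1    1.8556  0      ] [comp2]
//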

#ifndef IMF_HAVE_SSE2

//
// Scalar color space conversion, based on 709 primary chromaticities.
// No scaling or offsets, just the matrix.
//

void
csc709Inverse64 (float *comp0, float *comp1, float *comp2)
{
    for (int i = 0; i < 64; ++i)
        csc709Inverse (comp0[i], comp1[i], comp2[i]);
}

#else /* IMF_HAVE_SSE2 */

//
// SSE2 color space conversion
//

void
csc709Inverse64 (float *comp0, float *comp1, float *comp2)
{
    __m128 c0 = { 1.5747f,  1.5747f,  1.5747f,  1.5747f};
    __m128 c1 = { 1.8556f,  1.8556f,  1.8556f,  1.8556f};
    __m128 c2 = {-0.1873f, -0.1873f, -0.1873f, -0.1873f};
    __m128 c3 = {-0.4682f, -0.4682f, -0.4682f, -0.4682f};

    __m128 *r = (__m128 *)comp0;
    __m128 *g = (__m128 *)comp1;
    __m128 *b = (__m128 *)comp2;
    __m128 src[3];

    #define CSC_INVERSE_709_SSE2_LOOP(i)                     \
        src[0] = r[i];                                       \
        src[1] = g[i];                                       \
        src[2] = b[i];                                       \
                                                             \
        r[i] = _mm_add_ps (r[i], _mm_mul_ps (src[2], c0));   \
                                                             \
        g[i]   = _mm_mul_ps (g[i], c2);                      \
        src[2] = _mm_mul_ps (src[2], c3);                    \
        g[i]   = _mm_add_ps (g[i], src[0]);                  \
        g[i]   = _mm_add_ps (g[i], src[2]);                  \
                                                             \
        b[i] = _mm_mul_ps (c1, src[1]);                      \
        b[i] = _mm_add_ps (b[i], src[0]);

    CSC_INVERSE_709_SSE2_LOOP (0)
    CSC_INVERSE_709_SSE2_LOOP (1)
    CSC_INVERSE_709_SSE2_LOOP (2)
    CSC_INVERSE_709_SSE2_LOOP (3)
    CSC_INVERSE_709_SSE2_LOOP (4)
    CSC_INVERSE_709_SSE2_LOOP (5)
    CSC_INVERSE_709_SSE2_LOOP (6)
    CSC_INVERSE_709_SSE2_LOOP (7)
    CSC_INVERSE_709_SSE2_LOOP (8)
    CSC_INVERSE_709_SSE2_LOOP (9)
    CSC_INVERSE_709_SSE2_LOOP (10)
    CSC_INVERSE_709_SSE2_LOOP (11)
    CSC_INVERSE_709_SSE2_LOOP (12)
    CSC_INVERSE_709_SSE2_LOOP (13)
    CSC_INVERSE_709_SSE2_LOOP (14)
    CSC_INVERSE_709_SSE2_LOOP (15)
}

#endif /* IMF_HAVE_SSE2 */

//
// Color space conversion, Forward 709 CSC, R'G'B' -> Y'CbCr
//
// Simple FPU color space conversion. Based on the 709
// primary chromaticities, with no scaling or offsets.
//

void
csc709Forward64 (float *comp0, float *comp1, float *comp2)
{
    float src[3];

    for (int i = 0; i < 64; ++i)
    {
        src[0] = comp0[i];
        src[1] = comp1[i];
        src[2] = comp2[i];

        comp0[i] =  0.2126f * src[0] + 0.7152f * src[1] + 0.0722f * src[2];
        comp1[i] = -0.1146f * src[0] - 0.3854f * src[1] + 0.5000f * src[2];
        comp2[i] =  0.5000f * src[0] - 0.4542f * src[1] - 0.0458f * src[2];
    }
}
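
//
// Sanity check, worked by hand: feeding R'G'B' = (1, 0, 0) through the
// forward transform above gives roughly (0.2126, -0.1146, 0.5000), and
// pushing that through csc709Inverse recovers approximately
// (0.99995, -0.00004, -0.00005), i.e. the original color up to the
// precision of these truncated coefficients.
//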

//
// Byte interleaving of 2 byte arrays:
//
//    src0 = AAAA
//    src1 = BBBB
//    dst  = ABABABAB
//
// numBytes is the size of each of the source buffers
//

#ifndef IMF_HAVE_SSE2

//
// Scalar default implementation
//

void
interleaveByte2 (char *dst, char *src0, char *src1, int numBytes)
{
    for (int x = 0; x < numBytes; ++x)
    {
        dst[2 * x]     = src0[x];
        dst[2 * x + 1] = src1[x];
    }
}

#else /* IMF_HAVE_SSE2 */

//
// SSE2 byte interleaving
//

void
interleaveByte2 (char *dst, char *src0, char *src1, int numBytes)
{
    int dstAlignment  = (size_t)dst  % 16;
    int src0Alignment = (size_t)src0 % 16;
    int src1Alignment = (size_t)src1 % 16;

    __m128i *dst_epi8  = (__m128i*)dst;
    __m128i *src0_epi8 = (__m128i*)src0;
    __m128i *src1_epi8 = (__m128i*)src1;
    int sseWidth = numBytes / 16;

    if ((!dstAlignment) && (!src0Alignment) && (!src1Alignment))
    {
        __m128i tmp0, tmp1;

        //
        // Aligned loads and stores
        //

        for (int x = 0; x < sseWidth; ++x)
        {
            tmp0 = src0_epi8[x];
            tmp1 = src1_epi8[x];

            _mm_stream_si128 (&dst_epi8[2 * x],
                              _mm_unpacklo_epi8 (tmp0, tmp1));

            _mm_stream_si128 (&dst_epi8[2 * x + 1],
                              _mm_unpackhi_epi8 (tmp0, tmp1));
        }

        //
        // Then run the leftovers one at a time
        //

        for (int x = 16 * sseWidth; x < numBytes; ++x)
        {
            dst[2 * x]     = src0[x];
            dst[2 * x + 1] = src1[x];
        }
    }
    else if ((!dstAlignment) && (src0Alignment == 8) && (src1Alignment == 8))
    {
        //
        // Aligned stores, but catch up a few values so we can
        // use aligned loads
        //

        for (int x = 0; x < std::min (numBytes, 8); ++x)
        {
            dst[2 * x]     = src0[x];
            dst[2 * x + 1] = src1[x];
        }

        if (numBytes > 8)
        {
            dst_epi8  = (__m128i*)&dst[16];
            src0_epi8 = (__m128i*)&src0[8];
            src1_epi8 = (__m128i*)&src1[8];
            sseWidth  = (numBytes - 8) / 16;

            for (int x = 0; x < sseWidth; ++x)
            {
                _mm_stream_si128 (&dst_epi8[2 * x],
                                  _mm_unpacklo_epi8 (src0_epi8[x], src1_epi8[x]));

                _mm_stream_si128 (&dst_epi8[2 * x + 1],
                                  _mm_unpackhi_epi8 (src0_epi8[x], src1_epi8[x]));
            }

            //
            // Then run the leftovers one at a time
            //

            for (int x = 16 * sseWidth + 8; x < numBytes; ++x)
            {
                dst[2 * x]     = src0[x];
                dst[2 * x + 1] = src1[x];
            }
        }
    }
    else
    {
        //
        // Unaligned everything
        //

        for (int x = 0; x < sseWidth; ++x)
        {
            __m128i tmpSrc0_epi8 = _mm_loadu_si128 (&src0_epi8[x]);
            __m128i tmpSrc1_epi8 = _mm_loadu_si128 (&src1_epi8[x]);

            _mm_storeu_si128 (&dst_epi8[2 * x],
                              _mm_unpacklo_epi8 (tmpSrc0_epi8, tmpSrc1_epi8));

            _mm_storeu_si128 (&dst_epi8[2 * x + 1],
                              _mm_unpackhi_epi8 (tmpSrc0_epi8, tmpSrc1_epi8));
        }

        //
        // Then run the leftovers one at a time
        //

        for (int x = 16 * sseWidth; x < numBytes; ++x)
        {
            dst[2 * x]     = src0[x];
            dst[2 * x + 1] = src1[x];
        }
    }
}

#endif /* IMF_HAVE_SSE2 */
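
//
// For example (illustrative), calling interleaveByte2 with numBytes = 4 and
//
//     src0 = { a0, a1, a2, a3 }
//     src1 = { b0, b1, b2, b3 }
//
// produces
//
//     dst  = { a0, b0, a1, b1, a2, b2, a3, b3 }
//
// so dst must have room for 2 * numBytes bytes.
//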

//
// Float -> half float conversion
//
// To enable F16C based conversion, we can't rely on compile-time
// detection, hence the multiple defined versions. Pick one based
// on runtime cpuid detection.
//

//
// Default boring conversion
//

void
convertFloatToHalf64_scalar (unsigned short *dst, float *src)
{
    for (int i = 0; i < 64; ++i)
        dst[i] = ((half)src[i]).bits();
}

//
// F16C conversion - assumes aligned src and dst
//

void
convertFloatToHalf64_f16c (unsigned short *dst, float *src)
{
    //
    // Ordinarily, I'd avoid using inline asm and prefer intrinsics.
    // However, in order to get the intrinsics, we need to tell
    // the compiler to generate VEX instructions.
    //
    // (On the GCC side, -mf16c goes ahead and activates -mavx,
    // resulting in VEX code. Without -mf16c, no intrinsics.)
    //
    // Now, it's quite likely that we'll find ourselves in situations
    // where we want to build *without* VEX, in order to maintain
    // maximum compatibility. But to get there with intrinsics,
    // we'd need to break the code out into a separate file. Bleh.
    // I'll take the asm.
    //

#if defined IMF_HAVE_GCC_INLINEASM
    __asm__
       ("vmovaps       (%0), %%ymm0          \n"
        "vmovaps   0x20(%0), %%ymm1          \n"
        "vmovaps   0x40(%0), %%ymm2          \n"
        "vmovaps   0x60(%0), %%ymm3          \n"

        "vcvtps2ph $0, %%ymm0, %%xmm0        \n"
        "vcvtps2ph $0, %%ymm1, %%xmm1        \n"
        "vcvtps2ph $0, %%ymm2, %%xmm2        \n"
        "vcvtps2ph $0, %%ymm3, %%xmm3        \n"

        "vmovdqa   %%xmm0, 0x00(%1)          \n"
        "vmovdqa   %%xmm1, 0x10(%1)          \n"
        "vmovdqa   %%xmm2, 0x20(%1)          \n"
        "vmovdqa   %%xmm3, 0x30(%1)          \n"

        "vmovaps   0x80(%0), %%ymm0          \n"
        "vmovaps   0xa0(%0), %%ymm1          \n"
        "vmovaps   0xc0(%0), %%ymm2          \n"
        "vmovaps   0xe0(%0), %%ymm3          \n"

        "vcvtps2ph $0, %%ymm0, %%xmm0        \n"
        "vcvtps2ph $0, %%ymm1, %%xmm1        \n"
        "vcvtps2ph $0, %%ymm2, %%xmm2        \n"
        "vcvtps2ph $0, %%ymm3, %%xmm3        \n"

        "vmovdqa   %%xmm0, 0x40(%1)          \n"
        "vmovdqa   %%xmm1, 0x50(%1)          \n"
        "vmovdqa   %%xmm2, 0x60(%1)          \n"
        "vmovdqa   %%xmm3, 0x70(%1)          \n"
#ifndef __AVX__
        "vzeroupper                          \n"
#endif /* __AVX__ */
        : /* Output  */
        : /* Input   */ "r"(src), "r"(dst)
#ifndef __AVX__
        : /* Clobber */ "%xmm0", "%xmm1", "%xmm2", "%xmm3", "memory"
#else
        : /* Clobber */ "%ymm0", "%ymm1", "%ymm2", "%ymm3", "memory"
#endif /* __AVX__ */
       );
#else
    convertFloatToHalf64_scalar (dst, src);
#endif /* IMF_HAVE_GCC_INLINEASM */
}
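
//
// For reference, a rough intrinsics equivalent of the asm above (a sketch
// only, not used here; it would have to live in a translation unit built
// with -mf16c, which is exactly what the comment above is trying to avoid):
//
//     for (int i = 0; i < 64; i += 8)
//     {
//         __m256  f = _mm256_load_ps (src + i);
//         __m128i h = _mm256_cvtps_ph (f, 0);
//         _mm_store_si128 ((__m128i *)(dst + i), h);
//     }
//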

//
// Convert an 8x8 block of HALF from zig-zag order to
// FLOAT in normal order. The order we want is:
//
//            src                              dst
//   0  1  2  3  4  5  6  7         0  1  5  6 14 15 27 28
//   8  9 10 11 12 13 14 15         2  4  7 13 16 26 29 42
//  16 17 18 19 20 21 22 23         3  8 12 17 25 30 41 43
//  24 25 26 27 28 29 30 31         9 11 18 24 31 40 44 53
//  32 33 34 35 36 37 38 39        10 19 23 32 39 45 52 54
//  40 41 42 43 44 45 46 47        20 22 33 38 46 51 55 60
//  48 49 50 51 52 53 54 55        21 34 37 47 50 56 59 61
//  56 57 58 59 60 61 62 63        35 36 48 49 57 58 62 63
//

void
fromHalfZigZag_scalar (unsigned short *src, float *dst)
{
    half *srcHalf = (half *)src;

    dst[0]  = (float)srcHalf[0];
    dst[1]  = (float)srcHalf[1];
    dst[2]  = (float)srcHalf[5];
    dst[3]  = (float)srcHalf[6];
    dst[4]  = (float)srcHalf[14];
    dst[5]  = (float)srcHalf[15];
    dst[6]  = (float)srcHalf[27];
    dst[7]  = (float)srcHalf[28];
    dst[8]  = (float)srcHalf[2];
    dst[9]  = (float)srcHalf[4];
    dst[10] = (float)srcHalf[7];
    dst[11] = (float)srcHalf[13];
    dst[12] = (float)srcHalf[16];
    dst[13] = (float)srcHalf[26];
    dst[14] = (float)srcHalf[29];
    dst[15] = (float)srcHalf[42];
    dst[16] = (float)srcHalf[3];
    dst[17] = (float)srcHalf[8];
    dst[18] = (float)srcHalf[12];
    dst[19] = (float)srcHalf[17];
    dst[20] = (float)srcHalf[25];
    dst[21] = (float)srcHalf[30];
    dst[22] = (float)srcHalf[41];
    dst[23] = (float)srcHalf[43];
    dst[24] = (float)srcHalf[9];
    dst[25] = (float)srcHalf[11];
    dst[26] = (float)srcHalf[18];
    dst[27] = (float)srcHalf[24];
    dst[28] = (float)srcHalf[31];
    dst[29] = (float)srcHalf[40];
    dst[30] = (float)srcHalf[44];
    dst[31] = (float)srcHalf[53];
    dst[32] = (float)srcHalf[10];
    dst[33] = (float)srcHalf[19];
    dst[34] = (float)srcHalf[23];
    dst[35] = (float)srcHalf[32];
    dst[36] = (float)srcHalf[39];
    dst[37] = (float)srcHalf[45];
    dst[38] = (float)srcHalf[52];
    dst[39] = (float)srcHalf[54];
    dst[40] = (float)srcHalf[20];
    dst[41] = (float)srcHalf[22];
    dst[42] = (float)srcHalf[33];
    dst[43] = (float)srcHalf[38];
    dst[44] = (float)srcHalf[46];
    dst[45] = (float)srcHalf[51];
    dst[46] = (float)srcHalf[55];
    dst[47] = (float)srcHalf[60];
    dst[48] = (float)srcHalf[21];
    dst[49] = (float)srcHalf[34];
    dst[50] = (float)srcHalf[37];
    dst[51] = (float)srcHalf[47];
    dst[52] = (float)srcHalf[50];
    dst[53] = (float)srcHalf[56];
    dst[54] = (float)srcHalf[59];
    dst[55] = (float)srcHalf[61];
    dst[56] = (float)srcHalf[35];
    dst[57] = (float)srcHalf[36];
    dst[58] = (float)srcHalf[48];
    dst[59] = (float)srcHalf[49];
    dst[60] = (float)srcHalf[57];
    dst[61] = (float)srcHalf[58];
    dst[62] = (float)srcHalf[62];
    dst[63] = (float)srcHalf[63];
}
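
//
// The mapping above can also be expressed with a lookup table; a sketch of
// an equivalent (but less unrolled) implementation would be:
//
//     static const int zigZagSrc[64] = {
//          0,  1,  5,  6, 14, 15, 27, 28,
//          2,  4,  7, 13, 16, 26, 29, 42,
//          3,  8, 12, 17, 25, 30, 41, 43,
//          9, 11, 18, 24, 31, 40, 44, 53,
//         10, 19, 23, 32, 39, 45, 52, 54,
//         20, 22, 33, 38, 46, 51, 55, 60,
//         21, 34, 37, 47, 50, 56, 59, 61,
//         35, 36, 48, 49, 57, 58, 62, 63
//     };
//
//     for (int i = 0; i < 64; ++i)
//         dst[i] = (float)srcHalf[zigZagSrc[i]];
//
// The fully unrolled form above just avoids the extra indirection.
//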

//
// If we can form the correct ordering in xmm registers,
// we can use F16C to convert from HALF -> FLOAT. However,
// making the correct order isn't trivial.
//
// We want to re-order a source 8x8 matrix from:
//
//   0  1  2  3  4  5  6  7         0  1  5  6 14 15 27 28
//   8  9 10 11 12 13 14 15         2  4  7 13 16 26 29 42
//  16 17 18 19 20 21 22 23         3  8 12 17 25 30 41 43
//  24 25 26 27 28 29 30 31         9 11 18 24 31 40 44 53   (A)
//  32 33 34 35 36 37 38 39  -->   10 19 23 32 39 45 52 54
//  40 41 42 43 44 45 46 47        20 22 33 38 46 51 55 60
//  48 49 50 51 52 53 54 55        21 34 37 47 50 56 59 61
//  56 57 58 59 60 61 62 63        35 36 48 49 57 58 62 63
//
// Which looks like a mess, right?
//
// Now, check out the NE/SW diagonals of (A). Along those lines,
// we have runs of contiguous values! If we rewrite (A) a bit, we get:
//
//   0
//   1  2
//   5  4  3
//   6  7  8  9
//  14 13 12 11 10
//  15 16 17 18 19 20
//  27 26 25 24 23 22 21            (B)
//  28 29 30 31 32 33 34 35
//  42 41 40 39 38 37 36
//  43 44 45 46 47 48
//  53 52 51 50 49
//  54 55 56 57
//  60 59 58
//  61 62
//  63
//
// In this ordering, the columns are the rows of (A). If we can 'transpose'
// (B), we'll achieve our goal. But we want this to fit nicely into
// xmm registers and still be able to load large runs efficiently.
// Also, notice that the odd rows are in ascending order, while
// the even rows are in descending order.
//
// If we 'fold' the bottom half up into the top, we can preserve ordered
// runs across rows, and still keep all the correct values in columns.
// After transposing, we'll need to rotate things back into place.
// This gives us:
//
//   0 | 42 41 40 39 38 37 36
//   1  2 | 43 44 45 46 47 48
//   5  4  3 | 53 52 51 50 49
//   6  7  8  9 | 54 55 56 57       (C)
//  14 13 12 11 10 | 60 59 58
//  15 16 17 18 19 20 | 61 62
//  27 26 25 24 23 22 21 | 63
//  28 29 30 31 32 33 34 35
//
// But hang on. We still have the backwards, descending rows to deal with.
// Let's reverse the even rows so that all values are in ascending order:
//
//  36 37 38 39 40 41 42 | 0
//   1  2 | 43 44 45 46 47 48
//  49 50 51 52 53 |  3  4  5
//   6  7  8  9 | 54 55 56 57       (D)
//  58 59 60 | 10 11 12 13 14
//  15 16 17 18 19 20 | 61 62
//  63 | 21 22 23 24 25 26 27
//  28 29 30 31 32 33 34 35
//
// If we can form (D), we will then:
//
//   1) Reverse the even rows
//   2) Transpose
//   3) Rotate the rows
//
// and we'll have (A).
//

void
fromHalfZigZag_f16c (unsigned short *src, float *dst)
{
#if defined IMF_HAVE_GCC_INLINEASM_64
    __asm__

       /* x3 <- 0
        * x8 <- [ 0- 7]
        * x6 <- [56-63]
        * x9 <- [21-28]
        * x7 <- [28-35]
        * x3 <- [ 6- 9] (lower half) */

       ("vpxor   %%xmm3, %%xmm3, %%xmm3          \n"
        "vmovdqa    (%0), %%xmm8                 \n"
        "vmovdqa 112(%0), %%xmm6                 \n"
        "vmovdqu  42(%0), %%xmm9                 \n"
        "vmovdqu  56(%0), %%xmm7                 \n"
        "vmovq    12(%0), %%xmm3                 \n"

        /* Setup rows 0-2 of A in xmm0-xmm2
         * x1 <- x8 >> 16 (1 value)
         * x2 <- x8 << 32 (2 values)
         * x0 <- alignr([35-42], x8, 2)
         * x1 <- blend(x1, [41-48])
         * x2 <- blend(x2, [49-56])           */

        "vpsrldq  $2, %%xmm8, %%xmm1             \n"
        "vpslldq  $4, %%xmm8, %%xmm2             \n"
        "vpalignr $2, 70(%0), %%xmm8, %%xmm0     \n"
        "vpblendw $0xfc, 82(%0), %%xmm1, %%xmm1  \n"
        "vpblendw $0x1f, 98(%0), %%xmm2, %%xmm2  \n"

        /* Setup rows 4-6 of A in xmm4-xmm6
         * x4 <- x6 >> 32 (2 values)
         * x5 <- x6 << 16 (1 value)
         * x6 <- alignr(x6, x9, 14)
         * x4 <- blend(x4, [ 7-14])
         * x5 <- blend(x5, [15-22])           */

        "vpsrldq  $4, %%xmm6, %%xmm4             \n"
        "vpslldq  $2, %%xmm6, %%xmm5             \n"
        "vpalignr $14, %%xmm6, %%xmm9, %%xmm6    \n"
        "vpblendw $0xf8, 14(%0), %%xmm4, %%xmm4  \n"
        "vpblendw $0x3f, 30(%0), %%xmm5, %%xmm5  \n"

        /* Load the upper half of row 3 into xmm3
         * x3 <- [54-57] (upper half)         */

        "vpinsrq $1, 108(%0), %%xmm3, %%xmm3     \n"

        /* Reverse the even rows. We're not using PSHUFB as
         * that requires loading an extra constant all the time,
         * and we're already pretty memory bound.
         */

        "vpshuflw $0x1b, %%xmm0, %%xmm0          \n"
        "vpshuflw $0x1b, %%xmm2, %%xmm2          \n"
        "vpshuflw $0x1b, %%xmm4, %%xmm4          \n"
        "vpshuflw $0x1b, %%xmm6, %%xmm6          \n"

        "vpshufhw $0x1b, %%xmm0, %%xmm0          \n"
        "vpshufhw $0x1b, %%xmm2, %%xmm2          \n"
        "vpshufhw $0x1b, %%xmm4, %%xmm4          \n"
        "vpshufhw $0x1b, %%xmm6, %%xmm6          \n"

        "vpshufd $0x4e, %%xmm0, %%xmm0           \n"
        "vpshufd $0x4e, %%xmm2, %%xmm2           \n"
        "vpshufd $0x4e, %%xmm4, %%xmm4           \n"
        "vpshufd $0x4e, %%xmm6, %%xmm6           \n"

        /* Transpose xmm0-xmm7 into xmm8-xmm15 */

        "vpunpcklwd %%xmm1,  %%xmm0,  %%xmm8     \n"
        "vpunpcklwd %%xmm3,  %%xmm2,  %%xmm9     \n"
        "vpunpcklwd %%xmm5,  %%xmm4,  %%xmm10    \n"
        "vpunpcklwd %%xmm7,  %%xmm6,  %%xmm11    \n"
        "vpunpckhwd %%xmm1,  %%xmm0,  %%xmm12    \n"
        "vpunpckhwd %%xmm3,  %%xmm2,  %%xmm13    \n"
        "vpunpckhwd %%xmm5,  %%xmm4,  %%xmm14    \n"
        "vpunpckhwd %%xmm7,  %%xmm6,  %%xmm15    \n"

        "vpunpckldq %%xmm9,  %%xmm8,  %%xmm0     \n"
        "vpunpckldq %%xmm11, %%xmm10, %%xmm1     \n"
        "vpunpckhdq %%xmm9,  %%xmm8,  %%xmm2     \n"
        "vpunpckhdq %%xmm11, %%xmm10, %%xmm3     \n"
        "vpunpckldq %%xmm13, %%xmm12, %%xmm4     \n"
        "vpunpckldq %%xmm15, %%xmm14, %%xmm5     \n"
        "vpunpckhdq %%xmm13, %%xmm12, %%xmm6     \n"
        "vpunpckhdq %%xmm15, %%xmm14, %%xmm7     \n"

        "vpunpcklqdq %%xmm1, %%xmm0,  %%xmm8     \n"
        "vpunpckhqdq %%xmm1, %%xmm0,  %%xmm9     \n"
        "vpunpcklqdq %%xmm3, %%xmm2,  %%xmm10    \n"
        "vpunpckhqdq %%xmm3, %%xmm2,  %%xmm11    \n"
        "vpunpcklqdq %%xmm4, %%xmm5,  %%xmm12    \n"
        "vpunpckhqdq %%xmm5, %%xmm4,  %%xmm13    \n"
        "vpunpcklqdq %%xmm7, %%xmm6,  %%xmm14    \n"
        "vpunpckhqdq %%xmm7, %%xmm6,  %%xmm15    \n"

        /* Rotate the rows to get the correct final order.
         * Rotating xmm12 isn't needed, as we can handle
         * the rotation in the PUNPCKLQDQ above. Rotating
         * xmm8 isn't needed as it's already in the right order.
         */

        "vpalignr $2,  %%xmm9,  %%xmm9,  %%xmm9  \n"
        "vpalignr $4,  %%xmm10, %%xmm10, %%xmm10 \n"
        "vpalignr $6,  %%xmm11, %%xmm11, %%xmm11 \n"
        "vpalignr $10, %%xmm13, %%xmm13, %%xmm13 \n"
        "vpalignr $12, %%xmm14, %%xmm14, %%xmm14 \n"
        "vpalignr $14, %%xmm15, %%xmm15, %%xmm15 \n"

        /* Convert from half -> float */

        "vcvtph2ps %%xmm8,  %%ymm8               \n"
        "vcvtph2ps %%xmm9,  %%ymm9               \n"
        "vcvtph2ps %%xmm10, %%ymm10              \n"
        "vcvtph2ps %%xmm11, %%ymm11              \n"
        "vcvtph2ps %%xmm12, %%ymm12              \n"
        "vcvtph2ps %%xmm13, %%ymm13              \n"
        "vcvtph2ps %%xmm14, %%ymm14              \n"
        "vcvtph2ps %%xmm15, %%ymm15              \n"

        /* Move float values to dst */

        "vmovaps %%ymm8,     (%1)                \n"
        "vmovaps %%ymm9,   32(%1)                \n"
        "vmovaps %%ymm10,  64(%1)                \n"
        "vmovaps %%ymm11,  96(%1)                \n"
        "vmovaps %%ymm12, 128(%1)                \n"
        "vmovaps %%ymm13, 160(%1)                \n"
        "vmovaps %%ymm14, 192(%1)                \n"
        "vmovaps %%ymm15, 224(%1)                \n"
#ifndef __AVX__
        "vzeroupper                              \n"
#endif /* __AVX__ */
        : /* Output  */
        : /* Input   */ "r"(src), "r"(dst)
        : /* Clobber */ "memory",
#ifndef __AVX__
                        "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3",
                        "%xmm4",  "%xmm5",  "%xmm6",  "%xmm7",
                        "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",
                        "%xmm12", "%xmm13", "%xmm14", "%xmm15"
#else
                        "%ymm0",  "%ymm1",  "%ymm2",  "%ymm3",
                        "%ymm4",  "%ymm5",  "%ymm6",  "%ymm7",
                        "%ymm8",  "%ymm9",  "%ymm10", "%ymm11",
                        "%ymm12", "%ymm13", "%ymm14", "%ymm15"
#endif /* __AVX__ */
       );
#else
    fromHalfZigZag_scalar (src, dst);
#endif /* defined IMF_HAVE_GCC_INLINEASM_64 */
}

//
// Inverse 8x8 DCT, only inverting the DC. This assumes that
// all AC frequencies are 0.
//

#ifndef IMF_HAVE_SSE2

void
dctInverse8x8DcOnly (float *data)
{
    float val = data[0] * 3.535536e-01f * 3.535536e-01f;

    for (int i = 0; i < 64; ++i)
        data[i] = val;
}

#else /* IMF_HAVE_SSE2 */

void
dctInverse8x8DcOnly (float *data)
{
    __m128 src = _mm_set1_ps (data[0] * 3.535536e-01f * 3.535536e-01f);
    __m128 *dst = (__m128 *)data;

    for (int i = 0; i < 16; ++i)
        dst[i] = src;
}

#endif /* IMF_HAVE_SSE2 */
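
//
// Note: 3.535536e-01 is 0.5 * cos (pi / 4) = 1 / (2 * sqrt (2)), the DC
// basis weight of the 1D inverse DCT used below. It is applied twice
// (once for the row pass, once for the column pass), so a DC-only block
// decodes to the flat value data[0] / 8.
//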

//
// Full 8x8 Inverse DCT:
//
// Simple inverse DCT on an 8x8 block, with scalar ops only.
// Operates on data in-place.
//
// This is based on the iDCT formulation (y = frequency domain,
// x = spatial domain):
//
//    [x0]     [    ][y0]     [    ][y1]
//    [x1]  =  [ M1 ][y2]  +  [ M2 ][y3]
//    [x2]     [    ][y4]     [    ][y5]
//    [x3]     [    ][y6]     [    ][y7]
//
//    [x7]     [    ][y0]     [    ][y1]
//    [x6]  =  [ M1 ][y2]  -  [ M2 ][y3]
//    [x5]     [    ][y4]     [    ][y5]
//    [x4]     [    ][y6]     [    ][y7]
//
// where M1:              M2:
//
//    [ a  c  a  f ]      [ b  d  e  g ]
//    [ a  f -a -c ]      [ d -g -b -e ]
//    [ a -f -a  c ]      [ e -b  g  d ]
//    [ a -c  a -f ]      [ g -e  d -b ]
//
// and the constants are as defined below.
//
// If you know how many of the lower rows are zero, that can
// be passed in to help speed things up. If you don't know,
// just set zeroedRows=0.
//

//
// Default implementation
//

template <int zeroedRows>
void
dctInverse8x8_scalar (float *data)
{
    const float a = .5f * cosf (3.14159f / 4.0f);
    const float b = .5f * cosf (3.14159f / 16.0f);
    const float c = .5f * cosf (3.14159f / 8.0f);
    const float d = .5f * cosf (3.f * 3.14159f / 16.0f);
    const float e = .5f * cosf (5.f * 3.14159f / 16.0f);
    const float f = .5f * cosf (3.f * 3.14159f / 8.0f);
    const float g = .5f * cosf (7.f * 3.14159f / 16.0f);

    float alpha[4], beta[4], theta[4], gamma[4];

    float *rowPtr = NULL;

    //
    // First pass - row wise.
    //
    // This looks less compact than the description above in
    // an attempt to fold together common sub-expressions.
    //

    for (int row = 0; row < 8 - zeroedRows; ++row)
    {
        rowPtr = data + row * 8;

        alpha[0] = c * rowPtr[2];
        alpha[1] = f * rowPtr[2];
        alpha[2] = c * rowPtr[6];
        alpha[3] = f * rowPtr[6];

        beta[0] = b * rowPtr[1] + d * rowPtr[3] + e * rowPtr[5] + g * rowPtr[7];
        beta[1] = d * rowPtr[1] - g * rowPtr[3] - b * rowPtr[5] - e * rowPtr[7];
        beta[2] = e * rowPtr[1] - b * rowPtr[3] + g * rowPtr[5] + d * rowPtr[7];
        beta[3] = g * rowPtr[1] - e * rowPtr[3] + d * rowPtr[5] - b * rowPtr[7];

        theta[0] = a * (rowPtr[0] + rowPtr[4]);
        theta[3] = a * (rowPtr[0] - rowPtr[4]);
        theta[1] = alpha[0] + alpha[3];
        theta[2] = alpha[1] - alpha[2];

        gamma[0] = theta[0] + theta[1];
        gamma[1] = theta[3] + theta[2];
        gamma[2] = theta[3] - theta[2];
        gamma[3] = theta[0] - theta[1];

        rowPtr[0] = gamma[0] + beta[0];
        rowPtr[1] = gamma[1] + beta[1];
        rowPtr[2] = gamma[2] + beta[2];
        rowPtr[3] = gamma[3] + beta[3];

        rowPtr[4] = gamma[3] - beta[3];
        rowPtr[5] = gamma[2] - beta[2];
        rowPtr[6] = gamma[1] - beta[1];
        rowPtr[7] = gamma[0] - beta[0];
    }

    //
    // Second pass - column wise.
    //

    for (int column = 0; column < 8; ++column)
    {
        alpha[0] = c * data[16 + column];
        alpha[1] = f * data[16 + column];
        alpha[2] = c * data[48 + column];
        alpha[3] = f * data[48 + column];

        beta[0] = b * data[ 8 + column] + d * data[24 + column] +
                  e * data[40 + column] + g * data[56 + column];

        beta[1] = d * data[ 8 + column] - g * data[24 + column] -
                  b * data[40 + column] - e * data[56 + column];

        beta[2] = e * data[ 8 + column] - b * data[24 + column] +
                  g * data[40 + column] + d * data[56 + column];

        beta[3] = g * data[ 8 + column] - e * data[24 + column] +
                  d * data[40 + column] - b * data[56 + column];

        theta[0] = a * (data[column] + data[32 + column]);
        theta[3] = a * (data[column] - data[32 + column]);

        theta[1] = alpha[0] + alpha[3];
        theta[2] = alpha[1] - alpha[2];

        gamma[0] = theta[0] + theta[1];
        gamma[1] = theta[3] + theta[2];
        gamma[2] = theta[3] - theta[2];
        gamma[3] = theta[0] - theta[1];

        data[     column] = gamma[0] + beta[0];
        data[ 8 + column] = gamma[1] + beta[1];
        data[16 + column] = gamma[2] + beta[2];
        data[24 + column] = gamma[3] + beta[3];

        data[32 + column] = gamma[3] - beta[3];
        data[40 + column] = gamma[2] - beta[2];
        data[48 + column] = gamma[1] - beta[1];
        data[56 + column] = gamma[0] - beta[0];
    }
}
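
//
// For reference, the numeric values of the constants above (evaluated with
// the truncated value of pi used above), which also appear as literals in
// the SSE2 and AVX variants below:
//
//    a = 0.5 * cos (    pi / 4 ) = 3.535536e-01
//    b = 0.5 * cos (    pi / 16) = 4.903927e-01
//    c = 0.5 * cos (    pi / 8 ) = 4.619398e-01
//    d = 0.5 * cos (3 * pi / 16) = 4.157349e-01
//    e = 0.5 * cos (5 * pi / 16) = 2.777855e-01
//    f = 0.5 * cos (3 * pi / 8 ) = 1.913422e-01
//    g = 0.5 * cos (7 * pi / 16) = 9.754573e-02
//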

//
// SSE2 Implementation
//

template <int zeroedRows>
void
dctInverse8x8_sse2 (float *data)
{
#ifdef IMF_HAVE_SSE2
    __m128 a  = {3.535536e-01f, 3.535536e-01f, 3.535536e-01f, 3.535536e-01f};
    __m128 b  = {4.903927e-01f, 4.903927e-01f, 4.903927e-01f, 4.903927e-01f};
    __m128 c  = {4.619398e-01f, 4.619398e-01f, 4.619398e-01f, 4.619398e-01f};
    __m128 d  = {4.157349e-01f, 4.157349e-01f, 4.157349e-01f, 4.157349e-01f};
    __m128 e  = {2.777855e-01f, 2.777855e-01f, 2.777855e-01f, 2.777855e-01f};
    __m128 f  = {1.913422e-01f, 1.913422e-01f, 1.913422e-01f, 1.913422e-01f};
    __m128 g  = {9.754573e-02f, 9.754573e-02f, 9.754573e-02f, 9.754573e-02f};

    __m128 c0 = {3.535536e-01f,  3.535536e-01f,  3.535536e-01f,  3.535536e-01f};
    __m128 c1 = {4.619398e-01f,  1.913422e-01f, -1.913422e-01f, -4.619398e-01f};
    __m128 c2 = {3.535536e-01f, -3.535536e-01f, -3.535536e-01f,  3.535536e-01f};
    __m128 c3 = {1.913422e-01f, -4.619398e-01f,  4.619398e-01f, -1.913422e-01f};

    __m128 c4 = {4.903927e-01f,  4.157349e-01f,  2.777855e-01f,  9.754573e-02f};
    __m128 c5 = {4.157349e-01f, -9.754573e-02f, -4.903927e-01f, -2.777855e-01f};
    __m128 c6 = {2.777855e-01f, -4.903927e-01f,  9.754573e-02f,  4.157349e-01f};
    __m128 c7 = {9.754573e-02f, -2.777855e-01f,  4.157349e-01f, -4.903927e-01f};

    __m128 *srcVec = (__m128 *)data;
    __m128 x[8], evenSum, oddSum;
    __m128 in[8], alpha[4], beta[4], theta[4], gamma[4];

    //
    // Rows -
    //
    // Treat this just like matrix-vector multiplication. The
    // trick is to note that:
    //
    //    [M00 M01 M02 M03][v0]   [(v0 M00) + (v1 M01) + (v2 M02) + (v3 M03)]
    //    [M10 M11 M12 M13][v1] = [(v0 M10) + (v1 M11) + (v2 M12) + (v3 M13)]
    //    [M20 M21 M22 M23][v2]   [(v0 M20) + (v1 M21) + (v2 M22) + (v3 M23)]
    //    [M30 M31 M32 M33][v3]   [(v0 M30) + (v1 M31) + (v2 M32) + (v3 M33)]
    //
    // Then, we can fill a register with v_i and multiply by the i-th column
    // of M, accumulating across all i-s.
    //
    // The kids refer to the populating of a register with a single value
    // as "broadcasting", and it can be done with a shuffle instruction. It
    // seems to be the slowest part of the whole ordeal.
    //
    // Our matrix columns are stored above in c0-c7. c0-3 make up M1, and
    // c4-7 are from M2.
    //

#define DCT_INVERSE_8x8_SS2_ROW_LOOP(i)                           \
    /*                                                            \
     * Broadcast the components of the row                        \
     */                                                           \
                                                                  \
    x[0] = _mm_shuffle_ps (srcVec[2 * i],                         \
                           srcVec[2 * i],                         \
                           _MM_SHUFFLE (0, 0, 0, 0));             \
                                                                  \
    x[1] = _mm_shuffle_ps (srcVec[2 * i],                         \
                           srcVec[2 * i],                         \
                           _MM_SHUFFLE (1, 1, 1, 1));             \
                                                                  \
    x[2] = _mm_shuffle_ps (srcVec[2 * i],                         \
                           srcVec[2 * i],                         \
                           _MM_SHUFFLE (2, 2, 2, 2));             \
                                                                  \
    x[3] = _mm_shuffle_ps (srcVec[2 * i],                         \
                           srcVec[2 * i],                         \
                           _MM_SHUFFLE (3, 3, 3, 3));             \
                                                                  \
    x[4] = _mm_shuffle_ps (srcVec[2 * i + 1],                     \
                           srcVec[2 * i + 1],                     \
                           _MM_SHUFFLE (0, 0, 0, 0));             \
                                                                  \
    x[5] = _mm_shuffle_ps (srcVec[2 * i + 1],                     \
                           srcVec[2 * i + 1],                     \
                           _MM_SHUFFLE (1, 1, 1, 1));             \
                                                                  \
    x[6] = _mm_shuffle_ps (srcVec[2 * i + 1],                     \
                           srcVec[2 * i + 1],                     \
                           _MM_SHUFFLE (2, 2, 2, 2));             \
                                                                  \
    x[7] = _mm_shuffle_ps (srcVec[2 * i + 1],                     \
                           srcVec[2 * i + 1],                     \
                           _MM_SHUFFLE (3, 3, 3, 3));             \
                                                                  \
    /*                                                            \
     * Multiply the components by each column of the matrix       \
     */                                                           \
                                                                  \
    x[0] = _mm_mul_ps (x[0], c0);                                 \
    x[2] = _mm_mul_ps (x[2], c1);                                 \
    x[4] = _mm_mul_ps (x[4], c2);                                 \
    x[6] = _mm_mul_ps (x[6], c3);                                 \
                                                                  \
    x[1] = _mm_mul_ps (x[1], c4);                                 \
    x[3] = _mm_mul_ps (x[3], c5);                                 \
    x[5] = _mm_mul_ps (x[5], c6);                                 \
    x[7] = _mm_mul_ps (x[7], c7);                                 \
                                                                  \
    /*                                                            \
     * Add across                                                 \
     */                                                           \
                                                                  \
    evenSum = _mm_setzero_ps();                                   \
    evenSum = _mm_add_ps (evenSum, x[0]);                         \
    evenSum = _mm_add_ps (evenSum, x[2]);                         \
    evenSum = _mm_add_ps (evenSum, x[4]);                         \
    evenSum = _mm_add_ps (evenSum, x[6]);                         \
                                                                  \
    oddSum = _mm_setzero_ps();                                    \
    oddSum = _mm_add_ps (oddSum, x[1]);                           \
    oddSum = _mm_add_ps (oddSum, x[3]);                           \
    oddSum = _mm_add_ps (oddSum, x[5]);                           \
    oddSum = _mm_add_ps (oddSum, x[7]);                           \
                                                                  \
    /*                                                            \
     * Final Sum:                                                 \
     *    out [0, 1, 2, 3] = evenSum + oddSum                     \
     *    out [7, 6, 5, 4] = evenSum - oddSum                     \
     */                                                           \
                                                                  \
    srcVec[2 * i]     = _mm_add_ps (evenSum, oddSum);             \
    srcVec[2 * i + 1] = _mm_sub_ps (evenSum, oddSum);             \
    srcVec[2 * i + 1] = _mm_shuffle_ps (srcVec[2 * i + 1],        \
                                        srcVec[2 * i + 1],        \
                                        _MM_SHUFFLE (0, 1, 2, 3));

    switch (zeroedRows)
    {
      case 0:
      default:
          DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (4)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (5)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (6)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (7)
          break;

      case 1:
          DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (4)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (5)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (6)
          break;

      case 2:
          DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (4)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (5)
          break;

      case 3:
          DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (4)
          break;

      case 4:
          DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
          break;

      case 5:
          DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
          break;

      case 6:
          DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
          DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
          break;

      case 7:
          DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
          break;
    }

    //
    // Columns -
    //
    // This is slightly more straightforward, if less readable. Here
    // we just operate on 4 columns at a time, in two batches.
    //
    // The slight mess is to try and cache sub-expressions, which
    // we ignore in the row-wise pass.
    //

    for (int col = 0; col < 2; ++col)
    {
        for (int i = 0; i < 8; ++i)
            in[i] = srcVec[2 * i + col];

        alpha[0] = _mm_mul_ps (c, in[2]);
        alpha[1] = _mm_mul_ps (f, in[2]);
        alpha[2] = _mm_mul_ps (c, in[6]);
        alpha[3] = _mm_mul_ps (f, in[6]);

        beta[0] = _mm_add_ps (_mm_add_ps (_mm_mul_ps (in[1], b),
                                          _mm_mul_ps (in[3], d)),
                              _mm_add_ps (_mm_mul_ps (in[5], e),
                                          _mm_mul_ps (in[7], g)));

        beta[1] = _mm_sub_ps (_mm_sub_ps (_mm_mul_ps (in[1], d),
                                          _mm_mul_ps (in[3], g)),
                              _mm_add_ps (_mm_mul_ps (in[5], b),
                                          _mm_mul_ps (in[7], e)));

        beta[2] = _mm_add_ps (_mm_sub_ps (_mm_mul_ps (in[1], e),
                                          _mm_mul_ps (in[3], b)),
                              _mm_add_ps (_mm_mul_ps (in[5], g),
                                          _mm_mul_ps (in[7], d)));

        beta[3] = _mm_add_ps (_mm_sub_ps (_mm_mul_ps (in[1], g),
                                          _mm_mul_ps (in[3], e)),
                              _mm_sub_ps (_mm_mul_ps (in[5], d),
                                          _mm_mul_ps (in[7], b)));

        theta[0] = _mm_mul_ps (a, _mm_add_ps (in[0], in[4]));
        theta[3] = _mm_mul_ps (a, _mm_sub_ps (in[0], in[4]));

        theta[1] = _mm_add_ps (alpha[0], alpha[3]);
        theta[2] = _mm_sub_ps (alpha[1], alpha[2]);

        gamma[0] = _mm_add_ps (theta[0], theta[1]);
        gamma[1] = _mm_add_ps (theta[3], theta[2]);
        gamma[2] = _mm_sub_ps (theta[3], theta[2]);
        gamma[3] = _mm_sub_ps (theta[0], theta[1]);

        srcVec[     col] = _mm_add_ps (gamma[0], beta[0]);
        srcVec[ 2 + col] = _mm_add_ps (gamma[1], beta[1]);
        srcVec[ 4 + col] = _mm_add_ps (gamma[2], beta[2]);
        srcVec[ 6 + col] = _mm_add_ps (gamma[3], beta[3]);

        srcVec[ 8 + col] = _mm_sub_ps (gamma[3], beta[3]);
        srcVec[10 + col] = _mm_sub_ps (gamma[2], beta[2]);
        srcVec[12 + col] = _mm_sub_ps (gamma[1], beta[1]);
        srcVec[14 + col] = _mm_sub_ps (gamma[0], beta[0]);
    }

#else /* IMF_HAVE_SSE2 */

    dctInverse8x8_scalar<zeroedRows> (data);

#endif /* IMF_HAVE_SSE2 */
}
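
//
// Usage sketch (illustrative): the zeroedRows template argument is the
// number of trailing rows of the 8x8 block known to be zero, e.g.
//
//     dctInverse8x8_sse2<0> (block);   // all 8 rows present
//     dctInverse8x8_sse2<3> (block);   // the last 3 rows are zero
//
// where block points to 64 floats holding the 8x8 coefficients.
//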

//
// AVX Implementation
//

#define STR(A) #A

#define IDCT_AVX_SETUP_2_ROWS(_DST0,  _DST1,  _TMP0,  _TMP1, \
                              _OFF00, _OFF01, _OFF10, _OFF11) \
    "vmovaps " STR(_OFF00) "(%0), %%xmm" STR(_TMP0) " \n" \
    "vmovaps " STR(_OFF01) "(%0), %%xmm" STR(_TMP1) " \n" \
    "                                                 \n" \
    "vinsertf128 $1, " STR(_OFF10) "(%0), %%ymm" STR(_TMP0) ", %%ymm" STR(_TMP0) " \n" \
    "vinsertf128 $1, " STR(_OFF11) "(%0), %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP1) " \n" \
    "                                                 \n" \
    "vunpcklpd %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP0) ", %%ymm" STR(_DST0) " \n" \
    "vunpckhpd %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP0) ", %%ymm" STR(_DST1) " \n" \
    "                                                 \n" \
    "vunpcklps %%ymm" STR(_DST1) ", %%ymm" STR(_DST0) ", %%ymm" STR(_TMP0) " \n" \
    "vunpckhps %%ymm" STR(_DST1) ", %%ymm" STR(_DST0) ", %%ymm" STR(_TMP1) " \n" \
    "                                                 \n" \
    "vunpcklpd %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP0) ", %%ymm" STR(_DST0) " \n" \
    "vunpckhpd %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP0) ", %%ymm" STR(_DST1) " \n"

#define IDCT_AVX_MMULT_ROWS(_SRC)                         \
    /* Broadcast the source values into y12-y15 */        \
    "vpermilps $0x00, " STR(_SRC) ", %%ymm12 \n"          \
    "vpermilps $0x55, " STR(_SRC) ", %%ymm13 \n"          \
    "vpermilps $0xaa, " STR(_SRC) ", %%ymm14 \n"          \
    "vpermilps $0xff, " STR(_SRC) ", %%ymm15 \n"          \
                                                          \
    /* Multiply the coefs by the broadcast values */      \
    "vmulps %%ymm12, %%ymm8,  %%ymm12 \n"                 \
    "vmulps %%ymm13, %%ymm9,  %%ymm13 \n"                 \
    "vmulps %%ymm14, %%ymm10, %%ymm14 \n"                 \
    "vmulps %%ymm15, %%ymm11, %%ymm15 \n"                 \
                                                          \
    /* Accumulate the result back into the source */      \
    "vaddps %%ymm13, %%ymm12, %%ymm12 \n"                 \
    "vaddps %%ymm15, %%ymm14, %%ymm14 \n"                 \
    "vaddps %%ymm14, %%ymm12, " STR(_SRC) "\n"

#define IDCT_AVX_EO_TO_ROW_HALVES(_EVEN, _ODD, _FRONT, _BACK)  \
    "vsubps " STR(_ODD) "," STR(_EVEN) "," STR(_BACK)  "\n"    \
    "vaddps " STR(_ODD) "," STR(_EVEN) "," STR(_FRONT) "\n"    \
    /* Reverse the back half */                                \
    "vpermilps $0x1b," STR(_BACK) "," STR(_BACK) "\n"

/* In order to allow for fast paths when we know certain rows
 * of the 8x8 block are zero, most of the body of the DCT is
 * in the following macro. Statements are wrapped in a ROWn()
 * macro, where n is the lowest row in the 8x8 block on which
 * they depend.
 *
 * This should work for the cases where we have 2-8 full rows.
 * The 1-row case is special, and we'll handle it separately.
 */
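
/* For example (a sketch of how the ROWn() wrappers are meant to be used --
 * the actual definitions accompany the AVX iDCT entry points further below):
 * with all 8 rows present, every ROWn(_X) simply expands to _X; when the
 * trailing rows of the block are known to be zero, the ROWn() macros for
 * those rows expand to nothing, dropping the statements that depend on them.
 */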
  1137. #define IDCT_AVX_BODY \
  1138. /* ==============================================
  1139. * Row 1D DCT
  1140. * ----------------------------------------------
  1141. */ \
  1142. \
  1143. /* Setup for the row-oriented 1D DCT. Assuming that (%0) holds
  1144. * the row-major 8x8 block, load ymm0-3 with the even columns
  1145. * and ymm4-7 with the odd columns. The lower half of the ymm
  1146. * holds one row, while the upper half holds the next row.
  1147. *
  1148. * If our source is:
  1149. * a0 a1 a2 a3 a4 a5 a6 a7
  1150. * b0 b1 b2 b3 b4 b5 b6 b7
  1151. *
  1152. * We'll be forming:
  1153. * a0 a2 a4 a6 b0 b2 b4 b6
  1154. * a1 a3 a5 a7 b1 b3 b5 b7
  1155. */ \
  1156. ROW0( IDCT_AVX_SETUP_2_ROWS(0, 4, 14, 15, 0, 16, 32, 48) ) \
  1157. ROW2( IDCT_AVX_SETUP_2_ROWS(1, 5, 12, 13, 64, 80, 96, 112) ) \
  1158. ROW4( IDCT_AVX_SETUP_2_ROWS(2, 6, 10, 11, 128, 144, 160, 176) ) \
  1159. ROW6( IDCT_AVX_SETUP_2_ROWS(3, 7, 8, 9, 192, 208, 224, 240) ) \
  1160. \
  1161. /* Multiple the even columns (ymm0-3) by the matrix M1
  1162. * storing the results back in ymm0-3
  1163. *
  1164. * Assume that (%1) holds the matrix in column major order
  1165. */ \
  1166. "vbroadcastf128 (%1), %%ymm8 \n" \
  1167. "vbroadcastf128 16(%1), %%ymm9 \n" \
  1168. "vbroadcastf128 32(%1), %%ymm10 \n" \
  1169. "vbroadcastf128 48(%1), %%ymm11 \n" \
  1170. \
  1171. ROW0( IDCT_AVX_MMULT_ROWS(%%ymm0) ) \
  1172. ROW2( IDCT_AVX_MMULT_ROWS(%%ymm1) ) \
  1173. ROW4( IDCT_AVX_MMULT_ROWS(%%ymm2) ) \
  1174. ROW6( IDCT_AVX_MMULT_ROWS(%%ymm3) ) \
  1175. \
  1176. /* Repeat, but with the odd columns (ymm4-7) and the
  1177. * matrix M2
  1178. */ \
  1179. "vbroadcastf128 64(%1), %%ymm8 \n" \
  1180. "vbroadcastf128 80(%1), %%ymm9 \n" \
  1181. "vbroadcastf128 96(%1), %%ymm10 \n" \
  1182. "vbroadcastf128 112(%1), %%ymm11 \n" \
  1183. \
  1184. ROW0( IDCT_AVX_MMULT_ROWS(%%ymm4) ) \
  1185. ROW2( IDCT_AVX_MMULT_ROWS(%%ymm5) ) \
  1186. ROW4( IDCT_AVX_MMULT_ROWS(%%ymm6) ) \
  1187. ROW6( IDCT_AVX_MMULT_ROWS(%%ymm7) ) \
  1188. \
  1189. /* Sum the M1 (ymm0-3) and M2 (ymm4-7) results to get the
  1190. * front halves of the results, and difference to get the
  1191. * back halves. The front halfs end up in ymm0-3, the back
  1192. * halves end up in ymm12-15.
  1193. */ \
  1194. ROW0( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm0, %%ymm4, %%ymm0, %%ymm12) ) \
  1195. ROW2( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm1, %%ymm5, %%ymm1, %%ymm13) ) \
  1196. ROW4( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm2, %%ymm6, %%ymm2, %%ymm14) ) \
  1197. ROW6( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm3, %%ymm7, %%ymm3, %%ymm15) ) \
  1198. \
  1199. /* Reassemble the rows halves into ymm0-7 */ \
  1200. ROW7( "vperm2f128 $0x13, %%ymm3, %%ymm15, %%ymm7 \n" ) \
  1201. ROW6( "vperm2f128 $0x02, %%ymm3, %%ymm15, %%ymm6 \n" ) \
  1202. ROW5( "vperm2f128 $0x13, %%ymm2, %%ymm14, %%ymm5 \n" ) \
  1203. ROW4( "vperm2f128 $0x02, %%ymm2, %%ymm14, %%ymm4 \n" ) \
  1204. ROW3( "vperm2f128 $0x13, %%ymm1, %%ymm13, %%ymm3 \n" ) \
  1205. ROW2( "vperm2f128 $0x02, %%ymm1, %%ymm13, %%ymm2 \n" ) \
  1206. ROW1( "vperm2f128 $0x13, %%ymm0, %%ymm12, %%ymm1 \n" ) \
  1207. ROW0( "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0 \n" ) \
  1208. \
  1209. \
  1210. /* ==============================================
  1211. * Column 1D DCT
  1212. * ----------------------------------------------
  1213. */ \
  1214. \
  1215. /* Rows should be in ymm0-7, and M2 columns should still be
  1216. * preserved in ymm8-11. M2 has 4 unique values (and +-
  1217. * versions of each), and all (positive) values appear in
  1218. * the first column (and row), which is in ymm8.
  1219. *
  1220. * For the column-wise DCT, we need to:
  1221. * 1) Broadcast each element a row of M2 into 4 vectors
  1222. * 2) Multiple the odd rows (ymm1,3,5,7) by the broadcasts.
  1223. * 3) Accumulate into ymm12-15 for the odd outputs.
  1224. *
  1225. * Instead of doing 16 broadcasts for each element in M2,
  1226. * do 4, filling y8-11 with:
  1227. *
  1228. * ymm8: [ b b b b | b b b b ]
  1229. * ymm9: [ d d d d | d d d d ]
  1230. * ymm10: [ e e e e | e e e e ]
  1231. * ymm11: [ g g g g | g g g g ]
  1232. *
  1233. * And deal with the negative values by subtracting during accum.
  1234. */ \
  1235. "vpermilps $0xff, %%ymm8, %%ymm11 \n" \
  1236. "vpermilps $0xaa, %%ymm8, %%ymm10 \n" \
  1237. "vpermilps $0x55, %%ymm8, %%ymm9 \n" \
  1238. "vpermilps $0x00, %%ymm8, %%ymm8 \n" \
  1239. \
  1240. /* This one is easy, since we have ymm12-15 open for scratch
  1241. * ymm12 = b ymm1 + d ymm3 + e ymm5 + g ymm7
  1242. */ \
  1243. ROW1( "vmulps %%ymm1, %%ymm8, %%ymm12 \n" ) \
  1244. ROW3( "vmulps %%ymm3, %%ymm9, %%ymm13 \n" ) \
  1245. ROW5( "vmulps %%ymm5, %%ymm10, %%ymm14 \n" ) \
  1246. ROW7( "vmulps %%ymm7, %%ymm11, %%ymm15 \n" ) \
  1247. \
  1248. ROW3( "vaddps %%ymm12, %%ymm13, %%ymm12 \n" ) \
  1249. ROW7( "vaddps %%ymm14, %%ymm15, %%ymm14 \n" ) \
  1250. ROW5( "vaddps %%ymm12, %%ymm14, %%ymm12 \n" ) \
  1251. \
  1252. /* Tricker, since only y13-15 are open for scratch
  1253. * ymm13 = d ymm1 - g ymm3 - b ymm5 - e ymm7
  1254. */ \
  1255. ROW1( "vmulps %%ymm1, %%ymm9, %%ymm13 \n" ) \
  1256. ROW3( "vmulps %%ymm3, %%ymm11, %%ymm14 \n" ) \
  1257. ROW5( "vmulps %%ymm5, %%ymm8, %%ymm15 \n" ) \
  1258. \
  1259. ROW5( "vaddps %%ymm14, %%ymm15, %%ymm14 \n" ) \
  1260. ROW3( "vsubps %%ymm14, %%ymm13, %%ymm13 \n" ) \
  1261. \
  1262. ROW7( "vmulps %%ymm7, %%ymm10, %%ymm15 \n" ) \
  1263. ROW7( "vsubps %%ymm15, %%ymm13, %%ymm13 \n" ) \
  1264. \
  1265. /* Tricker still, as only y14-15 are open for scratch
  1266. * ymm14 = e ymm1 - b ymm3 + g ymm5 + d ymm7
  1267. */ \
  1268. ROW1( "vmulps %%ymm1, %%ymm10, %%ymm14 \n" ) \
  1269. ROW3( "vmulps %%ymm3, %%ymm8, %%ymm15 \n" ) \
  1270. \
  1271. ROW3( "vsubps %%ymm15, %%ymm14, %%ymm14 \n" ) \
  1272. \
  1273. ROW5( "vmulps %%ymm5, %%ymm11, %%ymm15 \n" ) \
  1274. ROW5( "vaddps %%ymm15, %%ymm14, %%ymm14 \n" ) \
  1275. \
  1276. ROW7( "vmulps %%ymm7, %%ymm9, %%ymm15 \n" ) \
  1277. ROW7( "vaddps %%ymm15, %%ymm14, %%ymm14 \n" ) \
  1278. \
  1279. \
  1280. /* Easy, as we can blow away ymm1,3,5,7 for scratch
  1281. * ymm15 = g ymm1 - e ymm3 + d ymm5 - b ymm7
  1282. */ \
  1283. ROW1( "vmulps %%ymm1, %%ymm11, %%ymm15 \n" ) \
  1284. ROW3( "vmulps %%ymm3, %%ymm10, %%ymm3 \n" ) \
  1285. ROW5( "vmulps %%ymm5, %%ymm9, %%ymm5 \n" ) \
  1286. ROW7( "vmulps %%ymm7, %%ymm8, %%ymm7 \n" ) \
  1287. \
  1288. ROW5( "vaddps %%ymm15, %%ymm5, %%ymm15 \n" ) \
  1289. ROW7( "vaddps %%ymm3, %%ymm7, %%ymm3 \n" ) \
  1290. ROW3( "vsubps %%ymm3, %%ymm15, %%ymm15 \n" ) \
  1291. \
  1292. \
    /* Load coefs for M1. Because we're going to broadcast
     * coefs, we don't need to load the actual structure from
     * M1. Instead, just load enough that we can broadcast.
     * There are only 6 unique values in M1, but they're in +-
     * pairs, leaving only 3 unique coefs if we add and subtract
     * properly.
     *
     * Fill ymm1 with coef[2] = [ a a c f | a a c f ]
     * Broadcast ymm5 with     [ f f f f | f f f f ]
     * Broadcast ymm3 with     [ c c c c | c c c c ]
     * Broadcast ymm1 with     [ a a a a | a a a a ]
     */ \
    "vbroadcastf128 8(%1), %%ymm1 \n" \
    "vpermilps $0xff, %%ymm1, %%ymm5 \n" \
    "vpermilps $0xaa, %%ymm1, %%ymm3 \n" \
    "vpermilps $0x00, %%ymm1, %%ymm1 \n" \
    \
    /* If we expand E = [M1] [x0 x2 x4 x6]^t, we get the following
     * common expressions:
     *
     *    E_0 = ymm8  = (a ymm0 + a ymm4) + (c ymm2 + f ymm6)
     *    E_3 = ymm11 = (a ymm0 + a ymm4) - (c ymm2 + f ymm6)
     *
     *    E_1 = ymm9  = (a ymm0 - a ymm4) + (f ymm2 - c ymm6)
     *    E_2 = ymm10 = (a ymm0 - a ymm4) - (f ymm2 - c ymm6)
     *
     * Afterwards, ymm8-11 will hold the even outputs.
     */ \
    \
    /* ymm11 = (a ymm0 + a ymm4), ymm1 = (a ymm0 - a ymm4) */ \
    ROW0( "vmulps %%ymm1, %%ymm0, %%ymm11 \n" ) \
    ROW4( "vmulps %%ymm1, %%ymm4, %%ymm4 \n" ) \
    ROW0( "vmovaps %%ymm11, %%ymm1 \n" ) \
    ROW4( "vaddps %%ymm4, %%ymm11, %%ymm11 \n" ) \
    ROW4( "vsubps %%ymm4, %%ymm1, %%ymm1 \n" ) \
    \
    /* ymm7 = (c ymm2 + f ymm6) */ \
    ROW2( "vmulps %%ymm3, %%ymm2, %%ymm7 \n" ) \
    ROW6( "vmulps %%ymm5, %%ymm6, %%ymm9 \n" ) \
    ROW6( "vaddps %%ymm9, %%ymm7, %%ymm7 \n" ) \
    \
    /* E_0 = ymm8  = (a ymm0 + a ymm4) + (c ymm2 + f ymm6)
     * E_3 = ymm11 = (a ymm0 + a ymm4) - (c ymm2 + f ymm6)
     */ \
    ROW0( "vmovaps %%ymm11, %%ymm8 \n" ) \
    ROW2( "vaddps %%ymm7, %%ymm8, %%ymm8 \n" ) \
    ROW2( "vsubps %%ymm7, %%ymm11, %%ymm11 \n" ) \
    \
    /* ymm7 = (f ymm2 - c ymm6) */ \
    ROW2( "vmulps %%ymm5, %%ymm2, %%ymm7 \n" ) \
    ROW6( "vmulps %%ymm3, %%ymm6, %%ymm9 \n" ) \
    ROW6( "vsubps %%ymm9, %%ymm7, %%ymm7 \n" ) \
    \
    /* E_1 = ymm9  = (a ymm0 - a ymm4) + (f ymm2 - c ymm6)
     * E_2 = ymm10 = (a ymm0 - a ymm4) - (f ymm2 - c ymm6)
     */ \
    ROW0( "vmovaps %%ymm1, %%ymm9 \n" ) \
    ROW0( "vmovaps %%ymm1, %%ymm10 \n" ) \
    ROW2( "vaddps %%ymm7, %%ymm1, %%ymm9 \n" ) \
    ROW2( "vsubps %%ymm7, %%ymm1, %%ymm10 \n" ) \
    \
    /* Add the even (ymm8-11) and the odds (ymm12-15),
     * placing the results into ymm0-7
     */ \
    "vaddps %%ymm12, %%ymm8, %%ymm0 \n" \
    "vaddps %%ymm13, %%ymm9, %%ymm1 \n" \
    "vaddps %%ymm14, %%ymm10, %%ymm2 \n" \
    "vaddps %%ymm15, %%ymm11, %%ymm3 \n" \
    \
    "vsubps %%ymm12, %%ymm8, %%ymm7 \n" \
    "vsubps %%ymm13, %%ymm9, %%ymm6 \n" \
    "vsubps %%ymm14, %%ymm10, %%ymm5 \n" \
    "vsubps %%ymm15, %%ymm11, %%ymm4 \n" \
    \
    /* Copy out the results from ymm0-7 */ \
    "vmovaps %%ymm0, (%0) \n" \
    "vmovaps %%ymm1, 32(%0) \n" \
    "vmovaps %%ymm2, 64(%0) \n" \
    "vmovaps %%ymm3, 96(%0) \n" \
    "vmovaps %%ymm4, 128(%0) \n" \
    "vmovaps %%ymm5, 160(%0) \n" \
    "vmovaps %%ymm6, 192(%0) \n" \
    "vmovaps %%ymm7, 224(%0) \n"

/* Output, input, and clobber (OIC) sections of the inline asm */
#define IDCT_AVX_OIC(_IN0) \
    : /* Output */ \
    : /* Input */ "r"(_IN0), "r"(sAvxCoef) \
    : /* Clobber */ "memory", \
        "%xmm0", "%xmm1", "%xmm2", "%xmm3", \
        "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
        "%xmm8", "%xmm9", "%xmm10", "%xmm11", \
        "%xmm12", "%xmm13", "%xmm14", "%xmm15"

/* Include vzeroupper for non-AVX builds */
#ifndef __AVX__
    #define IDCT_AVX_ASM(_IN0) \
        __asm__( \
            IDCT_AVX_BODY \
            "vzeroupper \n" \
            IDCT_AVX_OIC(_IN0) \
        );
#else /* __AVX__ */
    #define IDCT_AVX_ASM(_IN0) \
        __asm__( \
            IDCT_AVX_BODY \
            IDCT_AVX_OIC(_IN0) \
        );
#endif /* __AVX__ */
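
/* For orientation (a sketch of the expansion, not literal preprocessor
 * output): IDCT_AVX_ASM(ptr) expands to a single statement of the shape
 *
 *     __asm__( <IDCT_AVX_BODY instruction strings>
 *              :
 *              : "r"(ptr), "r"(sAvxCoef)
 *              : "memory", "%xmm0", ..., "%xmm15" );
 *
 * with a trailing vzeroupper appended when __AVX__ is not defined by the
 * compiler. Because IDCT_AVX_OIC names sAvxCoef directly, any function
 * that invokes IDCT_AVX_ASM must have a variable called sAvxCoef in scope.
 */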

template <int zeroedRows>
void
dctInverse8x8_avx (float *data)
{
#if defined IMF_HAVE_GCC_INLINEASM_64

    /* The column-major version of M1, followed by the
     * column-major version of M2:
     *
     *          [ a  c  a  f ]          [ b  d  e  g ]
     *     M1 = [ a  f -a -c ]     M2 = [ d -g -b -e ]
     *          [ a -f -a  c ]          [ e -b  g  d ]
     *          [ a -c  a -f ]          [ g -e  d -b ]
     */
    const float sAvxCoef[32] __attribute__((aligned(32))) = {
        3.535536e-01,  3.535536e-01,  3.535536e-01,  3.535536e-01,  /* a  a  a  a */
        4.619398e-01,  1.913422e-01, -1.913422e-01, -4.619398e-01,  /* c  f -f -c */
        3.535536e-01, -3.535536e-01, -3.535536e-01,  3.535536e-01,  /* a -a -a  a */
        1.913422e-01, -4.619398e-01,  4.619398e-01, -1.913422e-01,  /* f -c  c -f */

        4.903927e-01,  4.157349e-01,  2.777855e-01,  9.754573e-02,  /* b  d  e  g */
        4.157349e-01, -9.754573e-02, -4.903927e-01, -2.777855e-01,  /* d -g -b -e */
        2.777855e-01, -4.903927e-01,  9.754573e-02,  4.157349e-01,  /* e -b  g  d */
        9.754573e-02, -2.777855e-01,  4.157349e-01, -4.903927e-01   /* g -e  d -b */
    };
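
    /* For reference: these literals are half-cosines of the DCT-II basis
     * (an observation cross-checked against the cNHalf constants used by
     * the scalar forward DCT later in this file; nothing here depends on
     * it at runtime):
     *
     *    a = cos(4 pi/16) / 2     b = cos(1 pi/16) / 2
     *    c = cos(2 pi/16) / 2     d = cos(3 pi/16) / 2
     *    e = cos(5 pi/16) / 2     f = cos(6 pi/16) / 2
     *    g = cos(7 pi/16) / 2
     */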

    #define ROW0(_X) _X
    #define ROW1(_X) _X
    #define ROW2(_X) _X
    #define ROW3(_X) _X
    #define ROW4(_X) _X
    #define ROW5(_X) _X
    #define ROW6(_X) _X
    #define ROW7(_X) _X

    if (zeroedRows == 0) {

        IDCT_AVX_ASM(data)

    } else if (zeroedRows == 1) {

        #undef ROW7
        #define ROW7(_X)
        IDCT_AVX_ASM(data)

    } else if (zeroedRows == 2) {

        #undef ROW6
        #define ROW6(_X)
        IDCT_AVX_ASM(data)

    } else if (zeroedRows == 3) {

        #undef ROW5
        #define ROW5(_X)
        IDCT_AVX_ASM(data)

    } else if (zeroedRows == 4) {

        #undef ROW4
        #define ROW4(_X)
        IDCT_AVX_ASM(data)

    } else if (zeroedRows == 5) {

        #undef ROW3
        #define ROW3(_X)
        IDCT_AVX_ASM(data)

    } else if (zeroedRows == 6) {

        #undef ROW2
        #define ROW2(_X)
        IDCT_AVX_ASM(data)

    } else if (zeroedRows == 7) {

        __asm__(

            /* ==============================================
             *                 Row 1D DCT
             * ----------------------------------------------
             */
            IDCT_AVX_SETUP_2_ROWS(0, 4, 14, 15, 0, 16, 32, 48)

            "vbroadcastf128 (%1), %%ymm8 \n"
            "vbroadcastf128 16(%1), %%ymm9 \n"
            "vbroadcastf128 32(%1), %%ymm10 \n"
            "vbroadcastf128 48(%1), %%ymm11 \n"

            /* Stash a vector of [a a a a | a a a a] away in ymm2 */
            "vinsertf128 $1, %%xmm8, %%ymm8, %%ymm2 \n"

            IDCT_AVX_MMULT_ROWS(%%ymm0)

            "vbroadcastf128 64(%1), %%ymm8 \n"
            "vbroadcastf128 80(%1), %%ymm9 \n"
            "vbroadcastf128 96(%1), %%ymm10 \n"
            "vbroadcastf128 112(%1), %%ymm11 \n"

            IDCT_AVX_MMULT_ROWS(%%ymm4)

            IDCT_AVX_EO_TO_ROW_HALVES(%%ymm0, %%ymm4, %%ymm0, %%ymm12)

            "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0 \n"

            /* ==============================================
             *                Column 1D DCT
             * ----------------------------------------------
             */
            /* DC only, so multiply by a and we're done */
            "vmulps %%ymm2, %%ymm0, %%ymm0 \n"

            /* Copy out results */
            "vmovaps %%ymm0, (%0) \n"
            "vmovaps %%ymm0, 32(%0) \n"
            "vmovaps %%ymm0, 64(%0) \n"
            "vmovaps %%ymm0, 96(%0) \n"
            "vmovaps %%ymm0, 128(%0) \n"
            "vmovaps %%ymm0, 160(%0) \n"
            "vmovaps %%ymm0, 192(%0) \n"
            "vmovaps %%ymm0, 224(%0) \n"

            #ifndef __AVX__
                "vzeroupper \n"
            #endif /* __AVX__ */
            IDCT_AVX_OIC(data)
        );

    } else {
        assert(false); // Invalid template instance parameter
    }

#else /* IMF_HAVE_GCC_INLINEASM_64 */

    dctInverse8x8_scalar<zeroedRows>(data);

#endif /* IMF_HAVE_GCC_INLINEASM_64 */
}
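
//
// Illustrative dispatch sketch (kept inside #if 0; the helper name and its
// parameters are hypothetical): zeroedRows must be a compile-time template
// argument, so a caller with a runtime row count has to branch to a fixed
// instantiation. The 8x8 block is assumed to be 32-byte aligned, since the
// asm body stores with vmovaps.
//
#if 0
static void
idctBlock (float *block, int numZeroedRows)
{
    switch (numZeroedRows)
    {
        case 0: dctInverse8x8_avx<0> (block); break;
        case 1: dctInverse8x8_avx<1> (block); break;
        case 2: dctInverse8x8_avx<2> (block); break;
        case 3: dctInverse8x8_avx<3> (block); break;
        case 4: dctInverse8x8_avx<4> (block); break;
        case 5: dctInverse8x8_avx<5> (block); break;
        case 6: dctInverse8x8_avx<6> (block); break;
        case 7: dctInverse8x8_avx<7> (block); break;
        default: assert (false); break;
    }
}
#endif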

//
// Full 8x8 Forward DCT:
//
// Base forward 8x8 DCT implementation. Works on the data in-place.
//
// The implementation is described in Pennebaker + Mitchell,
// section 4.3.2, and illustrated in figure 4-7.
//
// The basic idea is that the 1D DCT math reduces to:
//
//    2*out_0 = c_4 [(s_07 + s_34) + (s_12 + s_56)]
//    2*out_4 = c_4 [(s_07 + s_34) - (s_12 + s_56)]
//
//    {2*out_2, 2*out_6} = rot_6 ((d_12 - d_56), (s_07 - s_34))
//
//    {2*out_3, 2*out_5} = rot_-3 (d_07 - c_4 (s_12 - s_56),
//                                 d_34 - c_4 (d_12 + d_56))
//
//    {2*out_1, 2*out_7} = rot_-1 (d_07 + c_4 (s_12 - s_56),
//                                -d_34 - c_4 (d_12 + d_56))
//
// where:
//
//    c_i = cos(i*pi/16)
//    s_i = sin(i*pi/16)
//
//    s_ij = in_i + in_j
//    d_ij = in_i - in_j
//
//    rot_i(x, y) = {c_i*x + s_i*y, -s_i*x + c_i*y}
//
// We'll run the DCT in two passes. First, run the 1D DCT on the
// rows, in place. Then run the same 1D DCT over the columns, also
// in place, and be done with it. A naive reference version of the
// 1D transform is sketched below for cross-checking.
//
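
//
// Reference-only sketch: a naive O(N^2) 1D DCT that the butterfly math
// above is algebraically equivalent to, assuming the usual orthonormal
// DCT-II scaling. Kept inside #if 0 because it exists only for
// cross-checking; the function name is illustrative.
//
#if 0
static void
dctForward8Reference (const float in[8], float out[8])
{
    for (int k = 0; k < 8; ++k)
    {
        // DC gets sqrt(1/8), every other frequency gets sqrt(2/8)
        const float scale = (k == 0) ? sqrtf (1.f / 8.f) : sqrtf (2.f / 8.f);

        float acc = 0.f;
        for (int n = 0; n < 8; ++n)
            acc += in[n] * cosf (3.14159265f * (2 * n + 1) * k / 16.f);

        out[k] = scale * acc;
    }
}
#endif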

#ifndef IMF_HAVE_SSE2

//
// Default implementation
//

void
dctForward8x8 (float *data)
{
    float A0, A1, A2, A3, A4, A5, A6, A7;
    float K0, K1, rot_x, rot_y;

    float *srcPtr = data;
    float *dstPtr = data;

    const float c1 = cosf (3.14159f * 1.0f / 16.0f);
    const float c2 = cosf (3.14159f * 2.0f / 16.0f);
    const float c3 = cosf (3.14159f * 3.0f / 16.0f);
    const float c4 = cosf (3.14159f * 4.0f / 16.0f);
    const float c5 = cosf (3.14159f * 5.0f / 16.0f);
    const float c6 = cosf (3.14159f * 6.0f / 16.0f);
    const float c7 = cosf (3.14159f * 7.0f / 16.0f);

    const float c1Half = .5f * c1;
    const float c2Half = .5f * c2;
    const float c3Half = .5f * c3;
    const float c5Half = .5f * c5;
    const float c6Half = .5f * c6;
    const float c7Half = .5f * c7;

    //
    // First pass - do a 1D DCT over the rows and write the
    // results back in place.
    //

    for (int row = 0; row < 8; ++row)
    {
        float *srcRowPtr = srcPtr + 8 * row;
        float *dstRowPtr = dstPtr + 8 * row;

        A0 = srcRowPtr[0] + srcRowPtr[7];
        A1 = srcRowPtr[1] + srcRowPtr[2];
        A2 = srcRowPtr[1] - srcRowPtr[2];
        A3 = srcRowPtr[3] + srcRowPtr[4];
        A4 = srcRowPtr[3] - srcRowPtr[4];
        A5 = srcRowPtr[5] + srcRowPtr[6];
        A6 = srcRowPtr[5] - srcRowPtr[6];
        A7 = srcRowPtr[0] - srcRowPtr[7];

        K0 = c4 * (A0 + A3);
        K1 = c4 * (A1 + A5);

        dstRowPtr[0] = .5f * (K0 + K1);
        dstRowPtr[4] = .5f * (K0 - K1);

        //
        // (2*dst2, 2*dst6) = rot 6 (d12 - d56, s07 - s34)
        //

        rot_x = A2 - A6;
        rot_y = A0 - A3;

        dstRowPtr[2] = c6Half * rot_x + c2Half * rot_y;
        dstRowPtr[6] = c6Half * rot_y - c2Half * rot_x;

        //
        // K0 and K1 stay live until dst[1], dst[7], dst[3],
        // and dst[5] have all been computed.
        //

        K0 = c4 * (A1 - A5);
        K1 = -1 * c4 * (A2 + A6);

        //
        // Two ways to do a rotation:
        //
        //    rot i (x, y) =
        //       X =  c_i*x + s_i*y
        //       Y = -s_i*x + c_i*y
        //
        //    OR
        //
        //       X = c_i*(x+y) + (s_i-c_i)*y
        //       Y = c_i*y     - (s_i+c_i)*x
        //
        // The first case has 4 multiplies but fewer constants,
        // while the second has fewer multiplies but needs more
        // stored constants.
        //

        //
        // (2*dst3, 2*dst5) = rot -3 ( d07 - K0, d34 + K1 )
        //

        rot_x = A7 - K0;
        rot_y = A4 + K1;

        dstRowPtr[3] = c3Half * rot_x - c5Half * rot_y;
        dstRowPtr[5] = c5Half * rot_x + c3Half * rot_y;

        //
        // (2*dst1, 2*dst7) = rot -1 ( d07 + K0, K1 - d34 )
        //

        rot_x = A7 + K0;
        rot_y = K1 - A4;

        //
        // A4 and A7 are now inactive, so all of the A's are inactive.
        //

        dstRowPtr[1] = c1Half * rot_x - c7Half * rot_y;
        dstRowPtr[7] = c7Half * rot_x + c1Half * rot_y;
    }

    //
    // Second pass - do the same, but on the columns.
    //

    for (int column = 0; column < 8; ++column)
    {
        A0 = srcPtr[     column] + srcPtr[56 + column];
        A7 = srcPtr[     column] - srcPtr[56 + column];

        A1 = srcPtr[ 8 + column] + srcPtr[16 + column];
        A2 = srcPtr[ 8 + column] - srcPtr[16 + column];

        A3 = srcPtr[24 + column] + srcPtr[32 + column];
        A4 = srcPtr[24 + column] - srcPtr[32 + column];

        A5 = srcPtr[40 + column] + srcPtr[48 + column];
        A6 = srcPtr[40 + column] - srcPtr[48 + column];

        K0 = c4 * (A0 + A3);
        K1 = c4 * (A1 + A5);

        dstPtr[     column] = .5f * (K0 + K1);
        dstPtr[32 + column] = .5f * (K0 - K1);

        //
        // (2*dst2, 2*dst6) = rot 6 ( d12 - d56, s07 - s34 )
        //

        rot_x = A2 - A6;
        rot_y = A0 - A3;

        dstPtr[16 + column] = .5f * (c6 * rot_x + c2 * rot_y);
        dstPtr[48 + column] = .5f * (c6 * rot_y - c2 * rot_x);

        //
        // K0 and K1 stay live until dst[1], dst[7], dst[3],
        // and dst[5] have all been computed.
        //

        K0 = c4 * (A1 - A5);
        K1 = -1 * c4 * (A2 + A6);

        //
        // (2*dst3, 2*dst5) = rot -3 ( d07 - K0, d34 + K1 )
        //

        rot_x = A7 - K0;
        rot_y = A4 + K1;

        dstPtr[24 + column] = .5f * (c3 * rot_x - c5 * rot_y);
        dstPtr[40 + column] = .5f * (c5 * rot_x + c3 * rot_y);

        //
        // (2*dst1, 2*dst7) = rot -1 ( d07 + K0, K1 - d34 )
        //

        rot_x = A7 + K0;
        rot_y = K1 - A4;

        dstPtr[ 8 + column] = .5f * (c1 * rot_x - c7 * rot_y);
        dstPtr[56 + column] = .5f * (c7 * rot_x + c1 * rot_y);
    }
}

#else /* IMF_HAVE_SSE2 */

//
// SSE2 implementation
//
// Here, we always run the 1D DCT column-wise and transpose between
// passes. It might be faster to handle the row-wise and column-wise
// passes differently instead.
//

void
dctForward8x8 (float *data)
{
    __m128 *srcVec = (__m128 *)data;
    __m128 a0Vec, a1Vec, a2Vec, a3Vec, a4Vec, a5Vec, a6Vec, a7Vec;
    __m128 k0Vec, k1Vec, rotXVec, rotYVec;
    __m128 transTmp[4], transTmp2[4];

    __m128 c4Vec    = { .70710678f,  .70710678f,  .70710678f,  .70710678f};
    __m128 c4NegVec = {-.70710678f, -.70710678f, -.70710678f, -.70710678f};

    __m128 c1HalfVec = {.490392640f, .490392640f, .490392640f, .490392640f};
    __m128 c2HalfVec = {.461939770f, .461939770f, .461939770f, .461939770f};
    __m128 c3HalfVec = {.415734810f, .415734810f, .415734810f, .415734810f};
    __m128 c5HalfVec = {.277785120f, .277785120f, .277785120f, .277785120f};
    __m128 c6HalfVec = {.191341720f, .191341720f, .191341720f, .191341720f};
    __m128 c7HalfVec = {.097545161f, .097545161f, .097545161f, .097545161f};

    __m128 halfVec = {.5f, .5f, .5f, .5f};

    for (int iter = 0; iter < 2; ++iter)
    {
        //
        // Operate on 4 columns at a time. The
        // offsets into our row-major array are:
        //
        //    0:  0  1
        //    1:  2  3
        //    2:  4  5
        //    3:  6  7
        //    4:  8  9
        //    5: 10 11
        //    6: 12 13
        //    7: 14 15
        //

        for (int pass = 0; pass < 2; ++pass)
        {
            a0Vec = _mm_add_ps (srcVec[ 0 + pass], srcVec[14 + pass]);
            a1Vec = _mm_add_ps (srcVec[ 2 + pass], srcVec[ 4 + pass]);
            a3Vec = _mm_add_ps (srcVec[ 6 + pass], srcVec[ 8 + pass]);
            a5Vec = _mm_add_ps (srcVec[10 + pass], srcVec[12 + pass]);

            a7Vec = _mm_sub_ps (srcVec[ 0 + pass], srcVec[14 + pass]);
            a2Vec = _mm_sub_ps (srcVec[ 2 + pass], srcVec[ 4 + pass]);
            a4Vec = _mm_sub_ps (srcVec[ 6 + pass], srcVec[ 8 + pass]);
            a6Vec = _mm_sub_ps (srcVec[10 + pass], srcVec[12 + pass]);

            //
            // First stage; compute out_0 and out_4
            //

            k0Vec = _mm_add_ps (a0Vec, a3Vec);
            k1Vec = _mm_add_ps (a1Vec, a5Vec);

            k0Vec = _mm_mul_ps (c4Vec, k0Vec);
            k1Vec = _mm_mul_ps (c4Vec, k1Vec);

            srcVec[0 + pass] = _mm_add_ps (k0Vec, k1Vec);
            srcVec[8 + pass] = _mm_sub_ps (k0Vec, k1Vec);

            srcVec[0 + pass] = _mm_mul_ps (srcVec[0 + pass], halfVec);
            srcVec[8 + pass] = _mm_mul_ps (srcVec[8 + pass], halfVec);

            //
            // Second stage; compute out_2 and out_6
            //

            k0Vec = _mm_sub_ps (a2Vec, a6Vec);
            k1Vec = _mm_sub_ps (a0Vec, a3Vec);

            srcVec[ 4 + pass] = _mm_add_ps (_mm_mul_ps (c6HalfVec, k0Vec),
                                            _mm_mul_ps (c2HalfVec, k1Vec));

            srcVec[12 + pass] = _mm_sub_ps (_mm_mul_ps (c6HalfVec, k1Vec),
                                            _mm_mul_ps (c2HalfVec, k0Vec));

            //
            // Precompute K0 and K1 for the remaining stages
            //

            k0Vec = _mm_mul_ps (_mm_sub_ps (a1Vec, a5Vec), c4Vec);
            k1Vec = _mm_mul_ps (_mm_add_ps (a2Vec, a6Vec), c4NegVec);

            //
            // Third stage; compute out_3 and out_5
            //

            rotXVec = _mm_sub_ps (a7Vec, k0Vec);
            rotYVec = _mm_add_ps (a4Vec, k1Vec);

            srcVec[ 6 + pass] = _mm_sub_ps (_mm_mul_ps (c3HalfVec, rotXVec),
                                            _mm_mul_ps (c5HalfVec, rotYVec));

            srcVec[10 + pass] = _mm_add_ps (_mm_mul_ps (c5HalfVec, rotXVec),
                                            _mm_mul_ps (c3HalfVec, rotYVec));

            //
            // Fourth stage; compute out_1 and out_7
            //

            rotXVec = _mm_add_ps (a7Vec, k0Vec);
            rotYVec = _mm_sub_ps (k1Vec, a4Vec);

            srcVec[ 2 + pass] = _mm_sub_ps (_mm_mul_ps (c1HalfVec, rotXVec),
                                            _mm_mul_ps (c7HalfVec, rotYVec));

            srcVec[14 + pass] = _mm_add_ps (_mm_mul_ps (c7HalfVec, rotXVec),
                                            _mm_mul_ps (c1HalfVec, rotYVec));
        }

        //
        // Transpose the matrix, in 4x4 blocks. So, if we have our
        // 8x8 matrix divided into 4x4 blocks:
        //
        //     M0 | M1         M0t | M2t
        //    ----+----  -->  -----+-----
        //     M2 | M3         M1t | M3t
        //

        //
        // M0t, done in place, the first half.
        //

        transTmp[0] = _mm_shuffle_ps (srcVec[0], srcVec[2], 0x44);
        transTmp[1] = _mm_shuffle_ps (srcVec[4], srcVec[6], 0x44);
        transTmp[3] = _mm_shuffle_ps (srcVec[4], srcVec[6], 0xEE);
        transTmp[2] = _mm_shuffle_ps (srcVec[0], srcVec[2], 0xEE);

        //
        // M3t, also done in place, the first half.
        //

        transTmp2[0] = _mm_shuffle_ps (srcVec[ 9], srcVec[11], 0x44);
        transTmp2[1] = _mm_shuffle_ps (srcVec[13], srcVec[15], 0x44);
        transTmp2[2] = _mm_shuffle_ps (srcVec[ 9], srcVec[11], 0xEE);
        transTmp2[3] = _mm_shuffle_ps (srcVec[13], srcVec[15], 0xEE);

        //
        // M0t, the second half.
        //

        srcVec[0] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0x88);
        srcVec[4] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0x88);
        srcVec[2] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0xDD);
        srcVec[6] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0xDD);

        //
        // M3t, the second half.
        //

        srcVec[ 9] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0x88);
        srcVec[13] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0x88);
        srcVec[11] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0xDD);
        srcVec[15] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0xDD);

        //
        // M1 and M2 need to be done at the same time, because we're
        // swapping them.
        //
        // First, the first half of M1t
        //

        transTmp[0] = _mm_shuffle_ps (srcVec[1], srcVec[3], 0x44);
        transTmp[1] = _mm_shuffle_ps (srcVec[5], srcVec[7], 0x44);
        transTmp[2] = _mm_shuffle_ps (srcVec[1], srcVec[3], 0xEE);
        transTmp[3] = _mm_shuffle_ps (srcVec[5], srcVec[7], 0xEE);

        //
        // And the first half of M2t
        //

        transTmp2[0] = _mm_shuffle_ps (srcVec[ 8], srcVec[10], 0x44);
        transTmp2[1] = _mm_shuffle_ps (srcVec[12], srcVec[14], 0x44);
        transTmp2[2] = _mm_shuffle_ps (srcVec[ 8], srcVec[10], 0xEE);
        transTmp2[3] = _mm_shuffle_ps (srcVec[12], srcVec[14], 0xEE);

        //
        // Second half of M1t
        //

        srcVec[ 8] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0x88);
        srcVec[12] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0x88);
        srcVec[10] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0xDD);
        srcVec[14] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0xDD);

        //
        // Second half of M2t
        //

        srcVec[1] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0x88);
        srcVec[5] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0x88);
        srcVec[3] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0xDD);
        srcVec[7] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0xDD);
    }
}

#endif /* IMF_HAVE_SSE2 */
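
//
// Usage sketch (kept inside #if 0; names are illustrative): both the scalar
// and the SSE2 dctForward8x8 transform a row-major 8x8 block in place. The
// SSE2 path casts the pointer to __m128*, so the block should be 16-byte
// aligned. For an all-ones block the expected output is a DC term of 8.0
// with every other coefficient equal to 0.
//
#if 0
static void
dctForward8x8Example ()
{
    alignas (16) float block[64];

    for (int i = 0; i < 64; ++i)
        block[i] = 1.0f;

    dctForward8x8 (block);

    // block[0] is now ~8.0; block[1..63] are ~0.
}
#endif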

} // anonymous namespace

OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_EXIT

#endif