///////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2012, Autodesk, Inc.
//
// All rights reserved.
//
// Implementation of IIF-specific file format and speed optimizations
// provided by Innobec Technologies inc on behalf of Autodesk.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Industrial Light & Magic nor the names of
// its contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
///////////////////////////////////////////////////////////////////////////
#pragma once

#ifndef INCLUDED_IMF_OPTIMIZED_PIXEL_READING_H
#define INCLUDED_IMF_OPTIMIZED_PIXEL_READING_H

#include "ImfChannelList.h"
#include "ImfFrameBuffer.h"
#include "ImfSimd.h"
#include "ImfStringVectorAttribute.h"
#include "ImfSystemSpecific.h"

#include <iostream>
  45. OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_ENTER
  46. class OptimizationMode
  47. {
  48. public:
  49. bool _optimizable;
  50. int _ySampling;
  51. OptimizationMode() : _optimizable(false) {}
  52. };
  53. #ifdef IMF_HAVE_SSE2
//------------------------------------------------------------------------
// Test for SSE pointer alignment
//------------------------------------------------------------------------
  57. EXR_FORCEINLINE
  58. bool
  59. isPointerSSEAligned (const void* EXR_RESTRICT pPointer)
  60. {
  61. uintptr_t trailingBits = ((uintptr_t)pPointer) & 15;
  62. return trailingBits == 0;
  63. }
//------------------------------------------------------------------------
// Load SSE from address into register
//------------------------------------------------------------------------
//
// Load one SSE register from the given address.  Only the <true>
// (aligned) and <false> (unaligned) specializations below are
// meaningful; since the parameter is a bool they cover every possible
// instantiation, so this generic version is unreachable in practice.
// It falls back to an unaligned load as a safe default.
//
template<bool IS_ALIGNED>
EXR_FORCEINLINE
__m128i loadSSE (__m128i*& loadAddress)
{
    // Safe fallback: an unaligned load works for any address.
    return _mm_loadu_si128 (loadAddress);
}
//
// Unaligned load: the source address need not be 16-byte aligned.
//
template<>
EXR_FORCEINLINE
__m128i loadSSE<false> (__m128i*& loadAddress)
{
    return _mm_loadu_si128 (loadAddress);
}
//
// Aligned load: the source address must be 16-byte aligned.
//
template<>
EXR_FORCEINLINE
__m128i loadSSE<true> (__m128i*& loadAddress)
{
    return _mm_load_si128 (loadAddress);
}
//------------------------------------------------------------------------
// Store SSE from register into address
//------------------------------------------------------------------------
  89. template<bool IS_ALIGNED>
  90. EXR_FORCEINLINE
  91. void storeSSE (__m128i*& storeAddress, __m128i& dataToStore)
  92. {
  93. }
//
// Unaligned store: the destination address need not be 16-byte
// aligned.
//
template<>
EXR_FORCEINLINE
void
storeSSE<false> (__m128i*& storeAddress, __m128i& dataToStore)
{
    _mm_storeu_si128 (storeAddress, dataToStore);
}
//
// Aligned store.  Uses _mm_stream_si128, a non-temporal store that
// bypasses the cache (the destination is written once and not read
// back here); it requires a 16-byte-aligned address.
//
template<>
EXR_FORCEINLINE
void
storeSSE<true> (__m128i*& storeAddress, __m128i& dataToStore)
{
    _mm_stream_si128 (storeAddress, dataToStore);
}
//------------------------------------------------------------------------
//
// Write to RGBA
//
//------------------------------------------------------------------------

//
// Using SSE intrinsics
//
//
// Interleave planar R, G, B and A data (16-bit values; one __m128i
// holds 8 values of a channel) into packed RGBA order.  Each loop
// iteration consumes one SSE register per channel (8 pixels) and
// emits four SSE registers (8 RGBA pixels).  All pointers are passed
// by reference and are left advanced past the copied data.
//
template<bool READ_PTR_ALIGNED, bool WRITE_PTR_ALIGNED>
EXR_FORCEINLINE
void writeToRGBASSETemplate
    (__m128i*& readPtrSSERed,
     __m128i*& readPtrSSEGreen,
     __m128i*& readPtrSSEBlue,
     __m128i*& readPtrSSEAlpha,
     __m128i*& writePtrSSE,
     const size_t& lPixelsToCopySSE)
{
    for (size_t i = 0; i < lPixelsToCopySSE; ++i)
    {
        __m128i redRegister   = loadSSE<READ_PTR_ALIGNED> (readPtrSSERed);
        __m128i greenRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEGreen);
        __m128i blueRegister  = loadSSE<READ_PTR_ALIGNED> (readPtrSSEBlue);
        __m128i alphaRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEAlpha);

        // Interleave the low halves: R1 G1 R2 G2 ... and B1 A1 B2 A2 ...
        __m128i redGreenRegister  = _mm_unpacklo_epi16 (redRegister,
                                                        greenRegister);
        __m128i blueAlphaRegister = _mm_unpacklo_epi16 (blueRegister,
                                                        alphaRegister);

        // Combine the 32-bit RG and BA pairs into whole RGBA pixels.
        __m128i pixel12Register = _mm_unpacklo_epi32 (redGreenRegister,
                                                      blueAlphaRegister);
        __m128i pixel34Register = _mm_unpackhi_epi32 (redGreenRegister,
                                                      blueAlphaRegister);

        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register);
        ++writePtrSSE;

        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register);
        ++writePtrSSE;

        // Repeat for the high halves of the four source registers.
        redGreenRegister  = _mm_unpackhi_epi16 (redRegister, greenRegister);
        blueAlphaRegister = _mm_unpackhi_epi16 (blueRegister, alphaRegister);

        pixel12Register = _mm_unpacklo_epi32 (redGreenRegister,
                                              blueAlphaRegister);
        pixel34Register = _mm_unpackhi_epi32 (redGreenRegister,
                                              blueAlphaRegister);

        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register);
        ++writePtrSSE;

        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register);
        ++writePtrSSE;

        // Advance the read pointers to the next 8 pixels.
        ++readPtrSSEAlpha;
        ++readPtrSSEBlue;
        ++readPtrSSEGreen;
        ++readPtrSSERed;
    }
}
//
// Not using SSE intrinsics.  This is still faster than the alternative
// because we have multiple read pointers and therefore we are able to
// take advantage of data locality for write operations.
//
  165. EXR_FORCEINLINE
  166. void writeToRGBANormal (unsigned short*& readPtrRed,
  167. unsigned short*& readPtrGreen,
  168. unsigned short*& readPtrBlue,
  169. unsigned short*& readPtrAlpha,
  170. unsigned short*& writePtr,
  171. const size_t& lPixelsToCopy)
  172. {
  173. for (size_t i = 0; i < lPixelsToCopy; ++i)
  174. {
  175. *(writePtr++) = *(readPtrRed++);
  176. *(writePtr++) = *(readPtrGreen++);
  177. *(writePtr++) = *(readPtrBlue++);
  178. *(writePtr++) = *(readPtrAlpha++);
  179. }
  180. }
//
// Determine which (template) version to use by checking whether the
// pointers are aligned.
//
  185. EXR_FORCEINLINE
  186. void optimizedWriteToRGBA (unsigned short*& readPtrRed,
  187. unsigned short*& readPtrGreen,
  188. unsigned short*& readPtrBlue,
  189. unsigned short*& readPtrAlpha,
  190. unsigned short*& writePtr,
  191. const size_t& pixelsToCopySSE,
  192. const size_t& pixelsToCopyNormal)
  193. {
  194. bool readPtrAreAligned = true;
  195. readPtrAreAligned &= isPointerSSEAligned(readPtrRed);
  196. readPtrAreAligned &= isPointerSSEAligned(readPtrGreen);
  197. readPtrAreAligned &= isPointerSSEAligned(readPtrBlue);
  198. readPtrAreAligned &= isPointerSSEAligned(readPtrAlpha);
  199. bool writePtrIsAligned = isPointerSSEAligned(writePtr);
  200. if (!readPtrAreAligned && !writePtrIsAligned)
  201. {
  202. writeToRGBASSETemplate<false, false> ((__m128i*&)readPtrRed,
  203. (__m128i*&)readPtrGreen,
  204. (__m128i*&)readPtrBlue,
  205. (__m128i*&)readPtrAlpha,
  206. (__m128i*&)writePtr,
  207. pixelsToCopySSE);
  208. }
  209. else if (!readPtrAreAligned && writePtrIsAligned)
  210. {
  211. writeToRGBASSETemplate<false, true> ((__m128i*&)readPtrRed,
  212. (__m128i*&)readPtrGreen,
  213. (__m128i*&)readPtrBlue,
  214. (__m128i*&)readPtrAlpha,
  215. (__m128i*&)writePtr,
  216. pixelsToCopySSE);
  217. }
  218. else if (readPtrAreAligned && !writePtrIsAligned)
  219. {
  220. writeToRGBASSETemplate<true, false> ((__m128i*&)readPtrRed,
  221. (__m128i*&)readPtrGreen,
  222. (__m128i*&)readPtrBlue,
  223. (__m128i*&)readPtrAlpha,
  224. (__m128i*&)writePtr,
  225. pixelsToCopySSE);
  226. }
  227. else if(readPtrAreAligned && writePtrIsAligned)
  228. {
  229. writeToRGBASSETemplate<true, true> ((__m128i*&)readPtrRed,
  230. (__m128i*&)readPtrGreen,
  231. (__m128i*&)readPtrBlue,
  232. (__m128i*&)readPtrAlpha,
  233. (__m128i*&)writePtr,
  234. pixelsToCopySSE);
  235. }
  236. writeToRGBANormal (readPtrRed, readPtrGreen, readPtrBlue, readPtrAlpha,
  237. writePtr, pixelsToCopyNormal);
  238. }
//------------------------------------------------------------------------
//
// Write to RGBA Fill A
//
//------------------------------------------------------------------------

//
// Using SSE intrinsics
//
//
// Interleave planar R, G and B data (16-bit values; one __m128i holds
// 8 values of a channel) into packed RGBA order, filling the alpha
// channel of every pixel with the constant alphaFillValue.  Each loop
// iteration consumes one SSE register per channel (8 pixels) and
// emits four SSE registers (8 RGBA pixels).  All pointers are passed
// by reference and are left advanced past the copied data.
//
template<bool READ_PTR_ALIGNED, bool WRITE_PTR_ALIGNED>
EXR_FORCEINLINE
void
writeToRGBAFillASSETemplate (__m128i*& readPtrSSERed,
                             __m128i*& readPtrSSEGreen,
                             __m128i*& readPtrSSEBlue,
                             const unsigned short& alphaFillValue,
                             __m128i*& writePtrSSE,
                             const size_t& pixelsToCopySSE)
{
    // Broadcast the constant alpha value to all 8 lanes once, outside
    // the loop.
    const __m128i dummyAlphaRegister = _mm_set_epi16 (alphaFillValue,
                                                      alphaFillValue,
                                                      alphaFillValue,
                                                      alphaFillValue,
                                                      alphaFillValue,
                                                      alphaFillValue,
                                                      alphaFillValue,
                                                      alphaFillValue);

    for (size_t pixelCounter = 0; pixelCounter < pixelsToCopySSE; ++pixelCounter)
    {
        __m128i redRegister   = loadSSE<READ_PTR_ALIGNED> (readPtrSSERed);
        __m128i greenRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEGreen);
        __m128i blueRegister  = loadSSE<READ_PTR_ALIGNED> (readPtrSSEBlue);

        // Interleave the low halves: R1 G1 R2 G2 ... and B1 A B2 A ...
        __m128i redGreenRegister  = _mm_unpacklo_epi16 (redRegister,
                                                        greenRegister);
        __m128i blueAlphaRegister = _mm_unpacklo_epi16 (blueRegister,
                                                        dummyAlphaRegister);

        // Combine the 32-bit RG and BA pairs into whole RGBA pixels.
        __m128i pixel12Register = _mm_unpacklo_epi32 (redGreenRegister,
                                                      blueAlphaRegister);
        __m128i pixel34Register = _mm_unpackhi_epi32 (redGreenRegister,
                                                      blueAlphaRegister);

        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register);
        ++writePtrSSE;

        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register);
        ++writePtrSSE;

        // Repeat for the high halves of the source registers.
        redGreenRegister  = _mm_unpackhi_epi16 (redRegister,
                                                greenRegister);
        blueAlphaRegister = _mm_unpackhi_epi16 (blueRegister,
                                                dummyAlphaRegister);

        pixel12Register = _mm_unpacklo_epi32 (redGreenRegister,
                                              blueAlphaRegister);
        pixel34Register = _mm_unpackhi_epi32 (redGreenRegister,
                                              blueAlphaRegister);

        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register);
        ++writePtrSSE;

        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register);
        ++writePtrSSE;

        // Advance the read pointers to the next 8 pixels.
        ++readPtrSSEBlue;
        ++readPtrSSEGreen;
        ++readPtrSSERed;
    }
}
//
// Not using SSE intrinsics.  This is still faster than the alternative
// because we have multiple read pointers and therefore we are able to
// take advantage of data locality for write operations.
//
  304. EXR_FORCEINLINE
  305. void
  306. writeToRGBAFillANormal (unsigned short*& readPtrRed,
  307. unsigned short*& readPtrGreen,
  308. unsigned short*& readPtrBlue,
  309. const unsigned short& alphaFillValue,
  310. unsigned short*& writePtr,
  311. const size_t& pixelsToCopy)
  312. {
  313. for (size_t i = 0; i < pixelsToCopy; ++i)
  314. {
  315. *(writePtr++) = *(readPtrRed++);
  316. *(writePtr++) = *(readPtrGreen++);
  317. *(writePtr++) = *(readPtrBlue++);
  318. *(writePtr++) = alphaFillValue;
  319. }
  320. }
//
// Determine which (template) version to use by checking whether the
// pointers are aligned.
//
  325. EXR_FORCEINLINE
  326. void
  327. optimizedWriteToRGBAFillA (unsigned short*& readPtrRed,
  328. unsigned short*& readPtrGreen,
  329. unsigned short*& readPtrBlue,
  330. const unsigned short& alphaFillValue,
  331. unsigned short*& writePtr,
  332. const size_t& pixelsToCopySSE,
  333. const size_t& pixelsToCopyNormal)
  334. {
  335. bool readPtrAreAligned = true;
  336. readPtrAreAligned &= isPointerSSEAligned (readPtrRed);
  337. readPtrAreAligned &= isPointerSSEAligned (readPtrGreen);
  338. readPtrAreAligned &= isPointerSSEAligned (readPtrBlue);
  339. bool writePtrIsAligned = isPointerSSEAligned (writePtr);
  340. if (!readPtrAreAligned && !writePtrIsAligned)
  341. {
  342. writeToRGBAFillASSETemplate<false, false> ((__m128i*&)readPtrRed,
  343. (__m128i*&)readPtrGreen,
  344. (__m128i*&)readPtrBlue,
  345. alphaFillValue,
  346. (__m128i*&)writePtr,
  347. pixelsToCopySSE);
  348. }
  349. else if (!readPtrAreAligned && writePtrIsAligned)
  350. {
  351. writeToRGBAFillASSETemplate<false, true> ((__m128i*&)readPtrRed,
  352. (__m128i*&)readPtrGreen,
  353. (__m128i*&)readPtrBlue,
  354. alphaFillValue,
  355. (__m128i*&)writePtr,
  356. pixelsToCopySSE);
  357. }
  358. else if (readPtrAreAligned && !writePtrIsAligned)
  359. {
  360. writeToRGBAFillASSETemplate<true, false> ((__m128i*&)readPtrRed,
  361. (__m128i*&)readPtrGreen,
  362. (__m128i*&)readPtrBlue,
  363. alphaFillValue,
  364. (__m128i*&)writePtr,
  365. pixelsToCopySSE);
  366. }
  367. else if (readPtrAreAligned && writePtrIsAligned)
  368. {
  369. writeToRGBAFillASSETemplate<true, true> ((__m128i*&)readPtrRed,
  370. (__m128i*&)readPtrGreen,
  371. (__m128i*&)readPtrBlue,
  372. alphaFillValue,
  373. (__m128i*&)writePtr,
  374. pixelsToCopySSE);
  375. }
  376. writeToRGBAFillANormal (readPtrRed,
  377. readPtrGreen, readPtrBlue, alphaFillValue,
  378. writePtr, pixelsToCopyNormal);
  379. }
//------------------------------------------------------------------------
//
// Write to RGB
//
//------------------------------------------------------------------------

//
// Using SSE intrinsics
//
//
// Interleave planar R, G and B data (16-bit values; one __m128i holds
// 8 values of a channel) into packed RGB order.  Each loop iteration
// consumes one SSE register per channel (8 pixels) and emits three
// SSE registers (8 RGB pixels = 24 values).  All pointers are passed
// by reference and are left advanced past the copied data.
//
template<bool READ_PTR_ALIGNED, bool WRITE_PTR_ALIGNED>
EXR_FORCEINLINE
void
writeToRGBSSETemplate (__m128i*& readPtrSSERed,
                       __m128i*& readPtrSSEGreen,
                       __m128i*& readPtrSSEBlue,
                       __m128i*& writePtrSSE,
                       const size_t& pixelsToCopySSE)
{
    for (size_t pixelCounter = 0; pixelCounter < pixelsToCopySSE; ++pixelCounter)
    {
        //
        // Need to shuffle and unpack pointers to obtain my first register.
        // We must save 8 pixels at a time, so we must have the following
        // three registers at the end:
        // 1) R1 G1 B1 R2 G2 B2 R3 G3
        // 2) B3 R4 G4 B4 R5 G5 B5 R6
        // 3) G6 B6 R7 G7 B7 R8 G8 B8
        //
        __m128i redRegister   = loadSSE<READ_PTR_ALIGNED> (readPtrSSERed);
        __m128i greenRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEGreen);
        __m128i blueRegister  = loadSSE<READ_PTR_ALIGNED> (readPtrSSEBlue);

        //
        // First register: R1 G1 B1 R2 G2 B2 R3 G3
        // Construct 2 registers and then unpack them to obtain our final
        // result:
        //
        __m128i redGreenRegister  = _mm_unpacklo_epi16 (redRegister,
                                                        greenRegister);
        __m128i redBlueRegister   = _mm_unpacklo_epi16 (redRegister,
                                                        blueRegister);
        __m128i greenBlueRegister = _mm_unpacklo_epi16 (greenRegister,
                                                        blueRegister);

        // Left Part (R1 G1 B1 R2)
        __m128i quarterRight = _mm_shufflelo_epi16 (redBlueRegister,
                                                    _MM_SHUFFLE(3,0,2,1));
        __m128i halfLeft = _mm_unpacklo_epi32 (redGreenRegister,
                                               quarterRight);

        // Right Part (G2 B2 R3 G3)
        __m128i quarterLeft = _mm_shuffle_epi32 (greenBlueRegister,
                                                 _MM_SHUFFLE(3,2,0,1));
        quarterRight = _mm_shuffle_epi32 (redGreenRegister,
                                          _MM_SHUFFLE(3,0,1,2));
        __m128i halfRight = _mm_unpacklo_epi32 (quarterLeft, quarterRight);
        __m128i fullRegister = _mm_unpacklo_epi64 (halfLeft, halfRight);
        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, fullRegister);
        ++writePtrSSE;

        //
        // Second register: B3 R4 G4 B4 R5 G5 B5 R6
        //
        // Left Part (B3, R4, G4, B4)
        quarterLeft = _mm_shufflehi_epi16 (redBlueRegister,
                                           _MM_SHUFFLE(0, 3, 2, 1));
        quarterRight = _mm_shufflehi_epi16 (greenBlueRegister,
                                            _MM_SHUFFLE(1, 0, 3, 2));
        halfLeft = _mm_unpackhi_epi32 (quarterLeft, quarterRight);

        // Update the registers: switch to the high halves of the
        // channel data for pixels 5..8.
        redGreenRegister  = _mm_unpackhi_epi16 (redRegister, greenRegister);
        redBlueRegister   = _mm_unpackhi_epi16 (redRegister, blueRegister);
        greenBlueRegister = _mm_unpackhi_epi16 (greenRegister, blueRegister);

        // Right Part (R5 G5 B5 R6)
        quarterRight = _mm_shufflelo_epi16 (redBlueRegister,
                                            _MM_SHUFFLE(3,0,2,1));
        halfRight = _mm_unpacklo_epi32 (redGreenRegister, quarterRight);
        fullRegister = _mm_unpacklo_epi64 (halfLeft, halfRight);
        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, fullRegister);
        ++writePtrSSE;

        //
        // Third register: G6 B6 R7 G7 B7 R8 G8 B8
        //
        // Left part (G6 B6 R7 G7)
        quarterLeft = _mm_shuffle_epi32 (greenBlueRegister,
                                         _MM_SHUFFLE(3,2,0,1));
        quarterRight = _mm_shuffle_epi32 (redGreenRegister,
                                          _MM_SHUFFLE(3,0,1,2));
        halfLeft = _mm_unpacklo_epi32 (quarterLeft, quarterRight);

        // Right part (B7 R8 G8 B8)
        quarterLeft = _mm_shufflehi_epi16 (redBlueRegister,
                                           _MM_SHUFFLE(0, 3, 2, 1));
        quarterRight = _mm_shufflehi_epi16 (greenBlueRegister,
                                            _MM_SHUFFLE(1, 0, 3, 2));
        halfRight = _mm_unpackhi_epi32 (quarterLeft, quarterRight);
        fullRegister = _mm_unpacklo_epi64 (halfLeft, halfRight);
        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, fullRegister);
        ++writePtrSSE;

        //
        // Increment read pointers
        //
        ++readPtrSSEBlue;
        ++readPtrSSEGreen;
        ++readPtrSSERed;
    }
}
//
// Not using SSE intrinsics.  This is still faster than the alternative
// because we have multiple read pointers and therefore we are able to
// take advantage of data locality for write operations.
//
  484. EXR_FORCEINLINE
  485. void
  486. writeToRGBNormal (unsigned short*& readPtrRed,
  487. unsigned short*& readPtrGreen,
  488. unsigned short*& readPtrBlue,
  489. unsigned short*& writePtr,
  490. const size_t& pixelsToCopy)
  491. {
  492. for (size_t i = 0; i < pixelsToCopy; ++i)
  493. {
  494. *(writePtr++) = *(readPtrRed++);
  495. *(writePtr++) = *(readPtrGreen++);
  496. *(writePtr++) = *(readPtrBlue++);
  497. }
  498. }
//
// Determine which (template) version to use by checking whether the
// pointers are aligned.
//
  503. EXR_FORCEINLINE
  504. void optimizedWriteToRGB (unsigned short*& readPtrRed,
  505. unsigned short*& readPtrGreen,
  506. unsigned short*& readPtrBlue,
  507. unsigned short*& writePtr,
  508. const size_t& pixelsToCopySSE,
  509. const size_t& pixelsToCopyNormal)
  510. {
  511. bool readPtrAreAligned = true;
  512. readPtrAreAligned &= isPointerSSEAligned(readPtrRed);
  513. readPtrAreAligned &= isPointerSSEAligned(readPtrGreen);
  514. readPtrAreAligned &= isPointerSSEAligned(readPtrBlue);
  515. bool writePtrIsAligned = isPointerSSEAligned(writePtr);
  516. if (!readPtrAreAligned && !writePtrIsAligned)
  517. {
  518. writeToRGBSSETemplate<false, false> ((__m128i*&)readPtrRed,
  519. (__m128i*&)readPtrGreen,
  520. (__m128i*&)readPtrBlue,
  521. (__m128i*&)writePtr,
  522. pixelsToCopySSE);
  523. }
  524. else if (!readPtrAreAligned && writePtrIsAligned)
  525. {
  526. writeToRGBSSETemplate<false, true> ((__m128i*&)readPtrRed,
  527. (__m128i*&)readPtrGreen,
  528. (__m128i*&)readPtrBlue,
  529. (__m128i*&)writePtr,
  530. pixelsToCopySSE);
  531. }
  532. else if (readPtrAreAligned && !writePtrIsAligned)
  533. {
  534. writeToRGBSSETemplate<true, false> ((__m128i*&)readPtrRed,
  535. (__m128i*&)readPtrGreen,
  536. (__m128i*&)readPtrBlue,
  537. (__m128i*&)writePtr,
  538. pixelsToCopySSE);
  539. }
  540. else if (readPtrAreAligned && writePtrIsAligned)
  541. {
  542. writeToRGBSSETemplate<true, true> ((__m128i*&)readPtrRed,
  543. (__m128i*&)readPtrGreen,
  544. (__m128i*&)readPtrBlue,
  545. (__m128i*&)writePtr,
  546. pixelsToCopySSE);
  547. }
  548. writeToRGBNormal (readPtrRed, readPtrGreen, readPtrBlue,
  549. writePtr, pixelsToCopyNormal);
  550. }
  551. #else // ! defined IMF_HAVE_SSE2
  552. #endif // defined IMF_HAVE_SSE2
  553. OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_EXIT
  554. #endif