ImfDwaCompressor.cpp 102 KB


  1. ///////////////////////////////////////////////////////////////////////////
  2. //
  3. // Copyright (c) 2009-2014 DreamWorks Animation LLC.
  4. //
  5. // All rights reserved.
  6. //
  7. // Redistribution and use in source and binary forms, with or without
  8. // modification, are permitted provided that the following conditions are
  9. // met:
  10. // * Redistributions of source code must retain the above copyright
  11. // notice, this list of conditions and the following disclaimer.
  12. // * Redistributions in binary form must reproduce the above
  13. // copyright notice, this list of conditions and the following disclaimer
  14. // in the documentation and/or other materials provided with the
  15. // distribution.
  16. // * Neither the name of DreamWorks Animation nor the names of
  17. // its contributors may be used to endorse or promote products derived
  18. // from this software without specific prior written permission.
  19. //
  20. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  23. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  24. // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  25. // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  26. // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  27. // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  28. // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  29. // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  30. // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31. //
  32. ///////////////////////////////////////////////////////////////////////////
  33. //---------------------------------------------------
  34. //
  35. // class DwaCompressor -- Store lossy RGB data by quantizing
  36. // DCT components.
  37. //
  38. // First, we try and figure out what compression strategy to take
  39. // based on channel name. For RGB channels, we want a lossy method
  40. // described below. But, if we have alpha, we should do something
  41. // different (and probably using RLE). If we have depth, or velocity,
  42. // or something else, just fall back to ZIP. The rules for deciding
  43. // which strategy to use are setup in initializeDefaultChannelRules().
  44. // When writing a file, the relevant rules needed to decode are written
  45. // into the start of the data block, making a self-contained file.
  46. // If initializeDefaultChannelRules() doesn't quite suit your naming
  47. // conventions, you can adjust the rules without breaking decoder
  48. // compatibility.
  49. //
  50. // If we're going to lossy compress R, G, or B channels, it's easier
  51. // to toss bits in a more perceptual uniform space. One could argue
  52. // at length as to what constitutes perceptually uniform, especially
  53. // when storing either scene/input/focal plane referred and output referred
  54. // data.
  55. //
  56. // We'll compromise. For values <= 1, we use a traditional power function
  57. // (without any of that straight-line business at the bottom). For values > 1,
  58. // we want something more like a log function, since power functions blow
  59. // up. At 1, we want a smooth blend between the functions. So, we use a
  60. // piecewise function that does just that - see dwaLookups.cpp for
  61. // a little more detail.
  62. //
  63. // Also, if we find that we have R, G, and B channels from the same layer,
  64. // we can get a bit more compression efficiency by transforming to a Y'CbCr
  65. // space. We use the 709 transform, but with Cb,Cr = 0 for an input of
  66. // (0, 0, 0), instead of the traditional Cb,Cr = .5. Shifting the zero point
  67. // makes no sense with large range data. Transforms are done to and from
  68. // the perceptual space data, not the linear-light space data (R'G'B' ->
  69. // Y'CbCr, not RGB -> YCbCr).
  70. //
  71. // Next, we forward DCT the data. This is done with a floating
  72. // point DCT, as we don't really have control over the src range. The
  73. // resulting values are dropped to half-float precision.
  74. //
  75. // Now, we need to quantize. Quantization departs from the usual way
  76. // of dividing and rounding. Instead, we start with some floating
  77. // point "base-error" value. From this, we can derive quantization
  78. // error for each DCT component. Take the standard JPEG quantization
  79. // tables and normalize them by the smallest value. Then, multiply
  80. // the normalized quant tables by our base-error value. This gives
  81. // a range of errors for each DCT component.
  82. //
  83. // For each DCT component, we want to find a quantized value that
  84. // is within +- the per-component error. Pick the quantized value
  85. // that has the fewest bits set in its binary representation.
  86. // Brute-forcing the search would make for extremely inefficient
  87. // compression. Fortunately, we can precompute a table to assist
  88. // with this search.
  89. //
  90. // For each 16-bit float value, there are at most 15 other values with
  91. // fewer bits set. We can precompute these values in a compact form, since
  92. // many source values have far fewer than 15 possible quantized values.
  93. // Now, instead of searching the entire range +- the component error,
  94. // we can just search at most 15 quantization candidates. The search can
  95. // be accelerated a bit more by sorting the candidates by the
  96. // number of bits set, in increasing order. Then, the search can stop
  97. // once a candidate is found w/i the per-component quantization
  98. // error range.
  99. //
  100. // The quantization strategy has the side-benefit that there is no
  101. // de-quantization step upon decode, so we don't bother recording
  102. // the quantization table.
  103. //
  104. // Ok. So we now have quantized values. Time for entropy coding. We
  105. // can use either static Huffman or zlib/DEFLATE. The static Huffman
  106. // is more efficient at compacting data, but can have a greater
  107. // overhead, especially for smaller tile/strip sizes.
  108. //
  109. // There is some additional fun, like ZIP compressing the DC components
  110. // instead of Huffman/zlib, which helps make things slightly smaller.
  111. //
  112. // Compression level is controlled by setting an int/float/double attribute
  113. // on the header named "dwaCompressionLevel". This is a thinly veiled name for
  114. // the "base-error" value mentioned above. The "base-error" is just
  115. // dwaCompressionLevel / 100000. The default value of 45.0 is generally
  116. // pretty good at generating "visually lossless" values at reasonable
  117. // data rates. Setting dwaCompressionLevel to 0 should result in no additional
  118. // quantization at the quantization stage (though there may be
  119. // quantization in practice at the CSC/DCT steps). But if you really
  120. // want lossless compression, there are plenty of other choices
  121. // of compressors ;)
  122. //
  123. // When dealing with FLOAT source buffers, we first quantize the source
  124. // to HALF and continue down as we would for HALF source.
  125. //
  126. //---------------------------------------------------
  127. #include "ImfDwaCompressor.h"
  128. #include "ImfDwaCompressorSimd.h"
  129. #include "ImfChannelList.h"
  130. #include "ImfStandardAttributes.h"
  131. #include "ImfHeader.h"
  132. #include "ImfHuf.h"
  133. #include "ImfInt64.h"
  134. #include "ImfIntAttribute.h"
  135. #include "ImfIO.h"
  136. #include "ImfMisc.h"
  137. #include "ImfNamespace.h"
  138. #include "ImfRle.h"
  139. #include "ImfSimd.h"
  140. #include "ImfSystemSpecific.h"
  141. #include "ImfXdr.h"
  142. #include "ImfZip.h"
  143. #include "ImathFun.h"
  144. #include "ImathBox.h"
  145. #include "ImathVec.h"
  146. #include "half.h"
  147. #include "halfLimits.h"
  148. #include "dwaLookups.h"
  149. #include <vector>
  150. #include <string>
  151. #include <cctype>
  152. #include <cassert>
  153. #include <algorithm>
  154. // Windows specific addition to prevent the indirect import of the redefined min/max macros
  155. #if defined _WIN32 || defined _WIN64
  156. #ifdef NOMINMAX
  157. #undef NOMINMAX
  158. #endif
  159. #define NOMINMAX
  160. #endif
  161. #include <zlib.h>
  162. OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_ENTER
  163. namespace {
  164. //
  165. // Function pointer to dispatch to an approprate
  166. // convertFloatToHalf64_* impl, based on runtime cpu checking.
  167. // Should be initialized in DwaCompressor::initializeFuncs()
  168. //
  169. void (*convertFloatToHalf64)(unsigned short*, float*) =
  170. convertFloatToHalf64_scalar;
  171. //
  172. // Function pointer for dispatching a fromHalfZigZag_ impl
  173. //
  174. void (*fromHalfZigZag)(unsigned short*, float*) =
  175. fromHalfZigZag_scalar;
  176. //
  177. // Dispatch the inverse DCT on an 8x8 block, where the last
  178. // n rows can be all zeros. The n=0 case converts the full block.
  179. //
  180. void (*dctInverse8x8_0)(float*) = dctInverse8x8_scalar<0>;
  181. void (*dctInverse8x8_1)(float*) = dctInverse8x8_scalar<1>;
  182. void (*dctInverse8x8_2)(float*) = dctInverse8x8_scalar<2>;
  183. void (*dctInverse8x8_3)(float*) = dctInverse8x8_scalar<3>;
  184. void (*dctInverse8x8_4)(float*) = dctInverse8x8_scalar<4>;
  185. void (*dctInverse8x8_5)(float*) = dctInverse8x8_scalar<5>;
  186. void (*dctInverse8x8_6)(float*) = dctInverse8x8_scalar<6>;
  187. void (*dctInverse8x8_7)(float*) = dctInverse8x8_scalar<7>;
  188. } // namespace
  189. struct DwaCompressor::ChannelData
  190. {
  191. std::string name;
  192. CompressorScheme compression;
  193. int xSampling;
  194. int ySampling;
  195. PixelType type;
  196. bool pLinear;
  197. int width;
  198. int height;
  199. //
  200. // Incoming and outgoing data is scanline interleaved, and it's much
  201. // easier to operate on contiguous data. Assuming the planare unc
  202. // buffer is to hold RLE data, we need to rearrange to make bytes
  203. // adjacent.
  204. //
  205. char *planarUncBuffer;
  206. char *planarUncBufferEnd;
  207. char *planarUncRle[4];
  208. char *planarUncRleEnd[4];
  209. PixelType planarUncType;
  210. int planarUncSize;
  211. };
  212. struct DwaCompressor::CscChannelSet
  213. {
  214. int idx[3];
  215. };
  216. struct DwaCompressor::Classifier
  217. {
  218. Classifier (std::string suffix,
  219. CompressorScheme scheme,
  220. PixelType type,
  221. int cscIdx,
  222. bool caseInsensitive):
  223. _suffix(suffix),
  224. _scheme(scheme),
  225. _type(type),
  226. _cscIdx(cscIdx),
  227. _caseInsensitive(caseInsensitive)
  228. {
  229. if (caseInsensitive)
  230. std::transform(_suffix.begin(), _suffix.end(), _suffix.begin(), tolower);
  231. }
  232. Classifier (const char *&ptr, int size)
  233. {
  234. if (size <= 0)
  235. throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
  236. " (truncated rule).");
  237. {
  238. char suffix[Name::SIZE];
  239. memset (suffix, 0, Name::SIZE);
  240. Xdr::read<CharPtrIO> (ptr, std::min(size, Name::SIZE-1), suffix);
  241. _suffix = std::string(suffix);
  242. }
  243. if (size < _suffix.length() + 1 + 2*Xdr::size<char>())
  244. throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
  245. " (truncated rule).");
  246. char value;
  247. Xdr::read<CharPtrIO> (ptr, value);
  248. _cscIdx = (int)(value >> 4) - 1;
  249. if (_cscIdx < -1 || _cscIdx >= 3)
  250. throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
  251. " (corrupt cscIdx rule).");
  252. _scheme = (CompressorScheme)((value >> 2) & 3);
  253. if (_scheme < 0 || _scheme >= NUM_COMPRESSOR_SCHEMES)
  254. throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
  255. " (corrupt scheme rule).");
  256. _caseInsensitive = (value & 1 ? true : false);
  257. Xdr::read<CharPtrIO> (ptr, value);
  258. if (value < 0 || value >= NUM_PIXELTYPES)
  259. throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
  260. " (corrupt rule).");
  261. _type = (PixelType)value;
  262. }
  263. bool match (const std::string &suffix, const PixelType type) const
  264. {
  265. if (_type != type) return false;
  266. if (_caseInsensitive)
  267. {
  268. std::string tmp(suffix);
  269. std::transform(tmp.begin(), tmp.end(), tmp.begin(), tolower);
  270. return tmp == _suffix;
  271. }
  272. return suffix == _suffix;
  273. }
  274. size_t size () const
  275. {
  276. // string length + \0
  277. size_t sizeBytes = _suffix.length() + 1;
  278. // 1 byte for scheme / cscIdx / caseInsensitive, and 1 byte for type
  279. sizeBytes += 2 * Xdr::size<char>();
  280. return sizeBytes;
  281. }
  282. void write (char *&ptr) const
  283. {
  284. Xdr::write<CharPtrIO> (ptr, _suffix.c_str());
  285. // Encode _cscIdx (-1-3) in the upper 4 bits,
  286. // _scheme (0-2) in the next 2 bits
  287. // _caseInsen in the bottom bit
  288. unsigned char value = 0;
  289. value |= ((unsigned char)(_cscIdx+1) & 15) << 4;
  290. value |= ((unsigned char)_scheme & 3) << 2;
  291. value |= (unsigned char)_caseInsensitive & 1;
  292. Xdr::write<CharPtrIO> (ptr, value);
  293. Xdr::write<CharPtrIO> (ptr, (unsigned char)_type);
  294. }
  295. std::string _suffix;
  296. CompressorScheme _scheme;
  297. PixelType _type;
  298. int _cscIdx;
  299. bool _caseInsensitive;
  300. };
  301. //
  302. // Base class for the LOSSY_DCT decoder classes
  303. //
  304. class DwaCompressor::LossyDctDecoderBase
  305. {
  306. public:
  307. LossyDctDecoderBase
  308. (char *packedAc,
  309. char *packedDc,
  310. const unsigned short *toLinear,
  311. int width,
  312. int height);
  313. virtual ~LossyDctDecoderBase ();
  314. void execute();
  315. //
  316. // These return number of items, not bytes. Each item
  317. // is an unsigned short
  318. //
  319. int numAcValuesEncoded() const { return _packedAcCount; }
  320. int numDcValuesEncoded() const { return _packedDcCount; }
  321. protected:
  322. //
  323. // Un-RLE the packed AC components into
  324. // a half buffer. The half block should
  325. // be the full 8x8 block (in zig-zag order
  326. // still), not the first AC component.
  327. //
  328. // currAcComp is advanced as bytes are decoded.
  329. //
  330. // This returns the index of the last non-zero
  331. // value in the buffer - with the index into zig zag
  332. // order data. If we return 0, we have DC only data.
  333. //
  334. int unRleAc (unsigned short *&currAcComp,
  335. unsigned short *halfZigBlock);
  336. //
  337. // if NATIVE and XDR are really the same values, we can
  338. // skip some processing and speed things along
  339. //
  340. bool _isNativeXdr;
  341. //
  342. // Counts of how many items have been packed into the
  343. // AC and DC buffers
  344. //
  345. int _packedAcCount;
  346. int _packedDcCount;
  347. //
  348. // AC and DC buffers to pack
  349. //
  350. char *_packedAc;
  351. char *_packedDc;
  352. //
  353. // half -> half LUT to transform from nonlinear to linear
  354. //
  355. const unsigned short *_toLinear;
  356. //
  357. // image dimensions
  358. //
  359. int _width;
  360. int _height;
  361. //
  362. // Pointers to the start of each scanlines, to be filled on decode
  363. // Generally, these will be filled by the subclasses.
  364. //
  365. std::vector< std::vector<char *> > _rowPtrs;
  366. //
  367. // The type of each data that _rowPtrs[i] is referring. Layout
  368. // is in the same order as _rowPtrs[].
  369. //
  370. std::vector<PixelType> _type;
  371. std::vector<SimdAlignedBuffer64f> _dctData;
  372. };
  373. //
  374. // Used to decode a single channel of LOSSY_DCT data.
  375. //
  376. class DwaCompressor::LossyDctDecoder: public LossyDctDecoderBase
  377. {
  378. public:
  379. //
  380. // toLinear is a half-float LUT to convert the encoded values
  381. // back to linear light. If you want to skip this step, pass
  382. // in NULL here.
  383. //
  384. LossyDctDecoder
  385. (std::vector<char *> &rowPtrs,
  386. char *packedAc,
  387. char *packedDc,
  388. const unsigned short *toLinear,
  389. int width,
  390. int height,
  391. PixelType type)
  392. :
  393. LossyDctDecoderBase(packedAc, packedDc, toLinear, width, height)
  394. {
  395. _rowPtrs.push_back(rowPtrs);
  396. _type.push_back(type);
  397. }
  398. virtual ~LossyDctDecoder () {}
  399. };
  400. //
  401. // Used to decode 3 channels of LOSSY_DCT data that
  402. // are grouped together and color space converted.
  403. //
  404. class DwaCompressor::LossyDctDecoderCsc: public LossyDctDecoderBase
  405. {
  406. public:
  407. //
  408. // toLinear is a half-float LUT to convert the encoded values
  409. // back to linear light. If you want to skip this step, pass
  410. // in NULL here.
  411. //
  412. LossyDctDecoderCsc
  413. (std::vector<char *> &rowPtrsR,
  414. std::vector<char *> &rowPtrsG,
  415. std::vector<char *> &rowPtrsB,
  416. char *packedAc,
  417. char *packedDc,
  418. const unsigned short *toLinear,
  419. int width,
  420. int height,
  421. PixelType typeR,
  422. PixelType typeG,
  423. PixelType typeB)
  424. :
  425. LossyDctDecoderBase(packedAc, packedDc, toLinear, width, height)
  426. {
  427. _rowPtrs.push_back(rowPtrsR);
  428. _rowPtrs.push_back(rowPtrsG);
  429. _rowPtrs.push_back(rowPtrsB);
  430. _type.push_back(typeR);
  431. _type.push_back(typeG);
  432. _type.push_back(typeB);
  433. }
  434. virtual ~LossyDctDecoderCsc () {}
  435. };
  436. //
  437. // Base class for encoding using the lossy DCT scheme
  438. //
  439. class DwaCompressor::LossyDctEncoderBase
  440. {
  441. public:
  442. LossyDctEncoderBase
  443. (float quantBaseError,
  444. char *packedAc,
  445. char *packedDc,
  446. const unsigned short *toNonlinear,
  447. int width,
  448. int height);
  449. virtual ~LossyDctEncoderBase ();
  450. void execute ();
  451. //
  452. // These return number of items, not bytes. Each item
  453. // is an unsigned short
  454. //
  455. int numAcValuesEncoded () const {return _numAcComp;}
  456. int numDcValuesEncoded () const {return _numDcComp;}
  457. protected:
  458. void toZigZag (half *dst, half *src);
  459. int countSetBits (unsigned short src);
  460. half quantize (half src, float errorTolerance);
  461. void rleAc (half *block, unsigned short *&acPtr);
  462. float _quantBaseError;
  463. int _width,
  464. _height;
  465. const unsigned short *_toNonlinear;
  466. int _numAcComp,
  467. _numDcComp;
  468. std::vector< std::vector<const char *> > _rowPtrs;
  469. std::vector<PixelType> _type;
  470. std::vector<SimdAlignedBuffer64f> _dctData;
  471. //
  472. // Pointers to the buffers where AC and DC
  473. // DCT components should be packed for
  474. // lossless compression downstream
  475. //
  476. char *_packedAc;
  477. char *_packedDc;
  478. //
  479. // Our "quantization tables" - the example JPEG tables,
  480. // normalized so that the smallest value in each is 1.0.
  481. // This gives us a relationship between error in DCT
  482. // components
  483. //
  484. float _quantTableY[64];
  485. float _quantTableCbCr[64];
  486. };
  487. //
  488. // Single channel lossy DCT encoder
  489. //
  490. class DwaCompressor::LossyDctEncoder: public LossyDctEncoderBase
  491. {
  492. public:
  493. LossyDctEncoder
  494. (float quantBaseError,
  495. std::vector<const char *> &rowPtrs,
  496. char *packedAc,
  497. char *packedDc,
  498. const unsigned short *toNonlinear,
  499. int width,
  500. int height,
  501. PixelType type)
  502. :
  503. LossyDctEncoderBase
  504. (quantBaseError, packedAc, packedDc, toNonlinear, width, height)
  505. {
  506. _rowPtrs.push_back(rowPtrs);
  507. _type.push_back(type);
  508. }
  509. virtual ~LossyDctEncoder () {}
  510. };
  511. //
  512. // RGB channel lossy DCT encoder
  513. //
  514. class DwaCompressor::LossyDctEncoderCsc: public LossyDctEncoderBase
  515. {
  516. public:
  517. LossyDctEncoderCsc
  518. (float quantBaseError,
  519. std::vector<const char *> &rowPtrsR,
  520. std::vector<const char *> &rowPtrsG,
  521. std::vector<const char *> &rowPtrsB,
  522. char *packedAc,
  523. char *packedDc,
  524. const unsigned short *toNonlinear,
  525. int width,
  526. int height,
  527. PixelType typeR,
  528. PixelType typeG,
  529. PixelType typeB)
  530. :
  531. LossyDctEncoderBase
  532. (quantBaseError, packedAc, packedDc, toNonlinear, width, height)
  533. {
  534. _type.push_back(typeR);
  535. _type.push_back(typeG);
  536. _type.push_back(typeB);
  537. _rowPtrs.push_back(rowPtrsR);
  538. _rowPtrs.push_back(rowPtrsG);
  539. _rowPtrs.push_back(rowPtrsB);
  540. }
  541. virtual ~LossyDctEncoderCsc () {}
  542. };
  543. // ==============================================================
  544. //
  545. // LossyDctDecoderBase
  546. //
  547. // --------------------------------------------------------------
  548. DwaCompressor::LossyDctDecoderBase::LossyDctDecoderBase
  549. (char *packedAc,
  550. char *packedDc,
  551. const unsigned short *toLinear,
  552. int width,
  553. int height)
  554. :
  555. _isNativeXdr(false),
  556. _packedAcCount(0),
  557. _packedDcCount(0),
  558. _packedAc(packedAc),
  559. _packedDc(packedDc),
  560. _toLinear(toLinear),
  561. _width(width),
  562. _height(height)
  563. {
  564. if (_toLinear == 0)
  565. _toLinear = get_dwaCompressorNoOp();
  566. _isNativeXdr = GLOBAL_SYSTEM_LITTLE_ENDIAN;
  567. }
  568. DwaCompressor::LossyDctDecoderBase::~LossyDctDecoderBase () {}
  569. void
  570. DwaCompressor::LossyDctDecoderBase::execute ()
  571. {
  572. int numComp = _rowPtrs.size();
  573. int lastNonZero = 0;
  574. int numBlocksX = (int) ceil ((float)_width / 8.0f);
  575. int numBlocksY = (int) ceil ((float)_height / 8.0f);
  576. int leftoverX = _width - (numBlocksX-1) * 8;
  577. int leftoverY = _height - (numBlocksY-1) * 8;
  578. int numFullBlocksX = (int)floor ((float)_width / 8.0f);
  579. unsigned short tmpShortNative = 0;
  580. unsigned short tmpShortXdr = 0;
  581. const char *tmpConstCharPtr = 0;
  582. unsigned short *currAcComp = (unsigned short *)_packedAc;
  583. std::vector<unsigned short *> currDcComp (_rowPtrs.size());
  584. std::vector<SimdAlignedBuffer64us> halfZigBlock (_rowPtrs.size());
  585. if (_type.size() != _rowPtrs.size())
  586. throw IEX_NAMESPACE::BaseExc ("Row pointers and types mismatch in count");
  587. if ((_rowPtrs.size() != 3) && (_rowPtrs.size() != 1))
  588. throw IEX_NAMESPACE::NoImplExc ("Only 1 and 3 channel encoding is supported");
  589. _dctData.resize(numComp);
  590. //
  591. // Allocate a temp aligned buffer to hold a rows worth of full
  592. // 8x8 half-float blocks
  593. //
  594. unsigned char *rowBlockHandle = new unsigned char
  595. [numComp * numBlocksX * 64 * sizeof(unsigned short) + _SSE_ALIGNMENT];
  596. unsigned short *rowBlock[3];
  597. rowBlock[0] = (unsigned short*)rowBlockHandle;
  598. for (int i = 0; i < _SSE_ALIGNMENT; ++i)
  599. {
  600. if (((size_t)(rowBlockHandle + i) & _SSE_ALIGNMENT_MASK) == 0)
  601. rowBlock[0] = (unsigned short *)(rowBlockHandle + i);
  602. }
  603. for (int comp = 1; comp < numComp; ++comp)
  604. rowBlock[comp] = rowBlock[comp - 1] + numBlocksX * 64;
  605. //
  606. // Pack DC components together by common plane, so we can get
  607. // a little more out of differencing them. We'll always have
  608. // one component per block, so we can computed offsets.
  609. //
  610. currDcComp[0] = (unsigned short *)_packedDc;
  611. for (unsigned int comp = 1; comp < numComp; ++comp)
  612. currDcComp[comp] = currDcComp[comp - 1] + numBlocksX * numBlocksY;
  613. for (int blocky = 0; blocky < numBlocksY; ++blocky)
  614. {
  615. int maxY = 8;
  616. if (blocky == numBlocksY-1)
  617. maxY = leftoverY;
  618. int maxX = 8;
  619. for (int blockx = 0; blockx < numBlocksX; ++blockx)
  620. {
  621. if (blockx == numBlocksX-1)
  622. maxX = leftoverX;
  623. //
  624. // If we can detect that the block is constant values
  625. // (all components only have DC values, and all AC is 0),
  626. // we can do everything only on 1 value, instead of all
  627. // 64.
  628. //
  629. // This won't really help for regular images, but it is
  630. // meant more for layers with large swaths of black
  631. //
  632. bool blockIsConstant = true;
  633. for (unsigned int comp = 0; comp < numComp; ++comp)
  634. {
  635. //
  636. // DC component is stored separately
  637. //
  638. #ifdef IMF_HAVE_SSE2
  639. {
  640. __m128i *dst = (__m128i*)halfZigBlock[comp]._buffer;
  641. dst[7] = _mm_setzero_si128();
  642. dst[6] = _mm_setzero_si128();
  643. dst[5] = _mm_setzero_si128();
  644. dst[4] = _mm_setzero_si128();
  645. dst[3] = _mm_setzero_si128();
  646. dst[2] = _mm_setzero_si128();
  647. dst[1] = _mm_setzero_si128();
  648. dst[0] = _mm_insert_epi16
  649. (_mm_setzero_si128(), *currDcComp[comp]++, 0);
  650. }
  651. #else /* IMF_HAVE_SSE2 */
  652. memset (halfZigBlock[comp]._buffer, 0, 64 * 2);
  653. halfZigBlock[comp]._buffer[0] = *currDcComp[comp]++;
  654. #endif /* IMF_HAVE_SSE2 */
  655. _packedDcCount++;
  656. //
  657. // UnRLE the AC. This will modify currAcComp
  658. //
  659. lastNonZero = unRleAc (currAcComp, halfZigBlock[comp]._buffer);
  660. //
  661. // Convert from XDR to NATIVE
  662. //
  663. if (!_isNativeXdr)
  664. {
  665. for (int i = 0; i < 64; ++i)
  666. {
  667. tmpShortXdr = halfZigBlock[comp]._buffer[i];
  668. tmpConstCharPtr = (const char *)&tmpShortXdr;
  669. Xdr::read<CharPtrIO> (tmpConstCharPtr, tmpShortNative);
  670. halfZigBlock[comp]._buffer[i] = tmpShortNative;
  671. }
  672. }
  673. if (lastNonZero == 0)
  674. {
  675. //
  676. // DC only case - AC components are all 0
  677. //
  678. half h;
  679. h.setBits (halfZigBlock[comp]._buffer[0]);
  680. _dctData[comp]._buffer[0] = (float)h;
  681. dctInverse8x8DcOnly (_dctData[comp]._buffer);
  682. }
  683. else
  684. {
  685. //
  686. // We have some AC components that are non-zero.
  687. // Can't use the 'constant block' optimization
  688. //
  689. blockIsConstant = false;
  690. //
  691. // Un-Zig zag
  692. //
  693. (*fromHalfZigZag)
  694. (halfZigBlock[comp]._buffer, _dctData[comp]._buffer);
  695. //
  696. // Zig-Zag indices in normal layout are as follows:
  697. //
  698. // 0 1 5 6 14 15 27 28
  699. // 2 4 7 13 16 26 29 42
  700. // 3 8 12 17 25 30 41 43
  701. // 9 11 18 24 31 40 44 53
  702. // 10 19 23 32 39 45 52 54
  703. // 20 22 33 38 46 51 55 60
  704. // 21 34 37 47 50 56 59 61
  705. // 35 36 48 49 57 58 62 63
  706. //
  707. // If lastNonZero is less than the first item on
  708. // each row, we know that the whole row is zero and
  709. // can be skipped in the row-oriented part of the
  710. // iDCT.
  711. //
  712. // The unrolled logic here is:
  713. //
  714. // if lastNonZero < rowStartIdx[i],
  715. // zeroedRows = rowsEmpty[i]
  716. //
  717. // where:
  718. //
  719. // const int rowStartIdx[] = {2, 3, 9, 10, 20, 21, 35};
  720. // const int rowsEmpty[] = {7, 6, 5, 4, 3, 2, 1};
  721. //
  722. if (lastNonZero < 2)
  723. dctInverse8x8_7(_dctData[comp]._buffer);
  724. else if (lastNonZero < 3)
  725. dctInverse8x8_6(_dctData[comp]._buffer);
  726. else if (lastNonZero < 9)
  727. dctInverse8x8_5(_dctData[comp]._buffer);
  728. else if (lastNonZero < 10)
  729. dctInverse8x8_4(_dctData[comp]._buffer);
  730. else if (lastNonZero < 20)
  731. dctInverse8x8_3(_dctData[comp]._buffer);
  732. else if (lastNonZero < 21)
  733. dctInverse8x8_2(_dctData[comp]._buffer);
  734. else if (lastNonZero < 35)
  735. dctInverse8x8_1(_dctData[comp]._buffer);
  736. else
  737. dctInverse8x8_0(_dctData[comp]._buffer);
  738. }
  739. }
  740. //
  741. // Perform the CSC
  742. //
  743. if (numComp == 3)
  744. {
  745. if (!blockIsConstant)
  746. {
  747. csc709Inverse64 (_dctData[0]._buffer,
  748. _dctData[1]._buffer,
  749. _dctData[2]._buffer);
  750. }
  751. else
  752. {
  753. csc709Inverse (_dctData[0]._buffer[0],
  754. _dctData[1]._buffer[0],
  755. _dctData[2]._buffer[0]);
  756. }
  757. }
  758. //
  759. // Float -> Half conversion.
  760. //
  761. // If the block has a constant value, just convert the first pixel.
  762. //
  763. for (unsigned int comp = 0; comp < numComp; ++comp)
  764. {
  765. if (!blockIsConstant)
  766. {
  767. (*convertFloatToHalf64)
  768. (&rowBlock[comp][blockx*64], _dctData[comp]._buffer);
  769. }
  770. else
  771. {
  772. #ifdef IMF_HAVE_SSE2
  773. __m128i *dst = (__m128i*)&rowBlock[comp][blockx*64];
  774. dst[0] = _mm_set1_epi16
  775. (((half)_dctData[comp]._buffer[0]).bits());
  776. dst[1] = dst[0];
  777. dst[2] = dst[0];
  778. dst[3] = dst[0];
  779. dst[4] = dst[0];
  780. dst[5] = dst[0];
  781. dst[6] = dst[0];
  782. dst[7] = dst[0];
  783. #else /* IMF_HAVE_SSE2 */
  784. unsigned short *dst = &rowBlock[comp][blockx*64];
  785. dst[0] = ((half)_dctData[comp]._buffer[0]).bits();
  786. for (int i = 1; i < 64; ++i)
  787. {
  788. dst[i] = dst[0];
  789. }
  790. #endif /* IMF_HAVE_SSE2 */
  791. } // blockIsConstant
  792. } // comp
  793. } // blockx
  794. //
  795. // At this point, we have half-float nonlinear value blocked
  796. // in rowBlock[][]. We need to unblock the data, transfer
  797. // back to linear, and write the results in the _rowPtrs[].
  798. //
  799. // There is a fast-path for aligned rows, which helps
  800. // things a little. Since this fast path is only valid
  801. // for full 8-element wide blocks, the partial x blocks
  802. // are broken into a separate loop below.
  803. //
  804. // At the moment, the fast path requires:
  805. // * sse support
  806. // * aligned row pointers
  807. // * full 8-element wide blocks
  808. //
  809. for (int comp = 0; comp < numComp; ++comp)
  810. {
  811. //
  812. // Test if we can use the fast path
  813. //
  814. #ifdef IMF_HAVE_SSE2
  815. bool fastPath = true;
  816. for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
  817. {
  818. if ((size_t)_rowPtrs[comp][y] & _SSE_ALIGNMENT_MASK)
  819. fastPath = false;
  820. }
  821. if (fastPath)
  822. {
  823. //
  824. // Handle all the full X blocks, in a fast path with sse2 and
  825. // aligned row pointers
  826. //
  827. for (int y=8*blocky; y<8*blocky+maxY; ++y)
  828. {
  829. __m128i *dst = (__m128i *)_rowPtrs[comp][y];
  830. __m128i *src = (__m128i *)&rowBlock[comp][(y & 0x7) * 8];
  831. for (int blockx = 0; blockx < numFullBlocksX; ++blockx)
  832. {
  833. //
  834. // These may need some twiddling.
  835. // Run with multiples of 8
  836. //
  837. _mm_prefetch ((char *)(src + 16), _MM_HINT_NTA);
  838. unsigned short i0 = _mm_extract_epi16 (*src, 0);
  839. unsigned short i1 = _mm_extract_epi16 (*src, 1);
  840. unsigned short i2 = _mm_extract_epi16 (*src, 2);
  841. unsigned short i3 = _mm_extract_epi16 (*src, 3);
  842. unsigned short i4 = _mm_extract_epi16 (*src, 4);
  843. unsigned short i5 = _mm_extract_epi16 (*src, 5);
  844. unsigned short i6 = _mm_extract_epi16 (*src, 6);
  845. unsigned short i7 = _mm_extract_epi16 (*src, 7);
  846. i0 = _toLinear[i0];
  847. i1 = _toLinear[i1];
  848. i2 = _toLinear[i2];
  849. i3 = _toLinear[i3];
  850. i4 = _toLinear[i4];
  851. i5 = _toLinear[i5];
  852. i6 = _toLinear[i6];
  853. i7 = _toLinear[i7];
  854. *dst = _mm_insert_epi16 (_mm_setzero_si128(), i0, 0);
  855. *dst = _mm_insert_epi16 (*dst, i1, 1);
  856. *dst = _mm_insert_epi16 (*dst, i2, 2);
  857. *dst = _mm_insert_epi16 (*dst, i3, 3);
  858. *dst = _mm_insert_epi16 (*dst, i4, 4);
  859. *dst = _mm_insert_epi16 (*dst, i5, 5);
  860. *dst = _mm_insert_epi16 (*dst, i6, 6);
  861. *dst = _mm_insert_epi16 (*dst, i7, 7);
  862. src += 8;
  863. dst++;
  864. }
  865. }
  866. }
  867. else
  868. {
  869. #endif /* IMF_HAVE_SSE2 */
  870. //
  871. // Basic scalar kinda slow path for handling the full X blocks
  872. //
  873. for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
  874. {
  875. unsigned short *dst = (unsigned short *)_rowPtrs[comp][y];
  876. for (int blockx = 0; blockx < numFullBlocksX; ++blockx)
  877. {
  878. unsigned short *src =
  879. &rowBlock[comp][blockx * 64 + ((y & 0x7) * 8)];
  880. dst[0] = _toLinear[src[0]];
  881. dst[1] = _toLinear[src[1]];
  882. dst[2] = _toLinear[src[2]];
  883. dst[3] = _toLinear[src[3]];
  884. dst[4] = _toLinear[src[4]];
  885. dst[5] = _toLinear[src[5]];
  886. dst[6] = _toLinear[src[6]];
  887. dst[7] = _toLinear[src[7]];
  888. dst += 8;
  889. }
  890. }
  891. #ifdef IMF_HAVE_SSE2
  892. }
  893. #endif /* IMF_HAVE_SSE2 */
  894. //
  895. // If we have partial X blocks, deal with all those now
  896. // Since this should be minimal work, there currently
  897. // is only one path that should work for everyone.
  898. //
  899. if (numFullBlocksX != numBlocksX)
  900. {
  901. for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
  902. {
  903. unsigned short *src = (unsigned short *)
  904. &rowBlock[comp][numFullBlocksX * 64 + ((y & 0x7) * 8)];
  905. unsigned short *dst = (unsigned short *)_rowPtrs[comp][y];
  906. dst += 8 * numFullBlocksX;
  907. for (int x = 0; x < maxX; ++x)
  908. {
  909. *dst++ = _toLinear[*src++];
  910. }
  911. }
  912. }
  913. } // comp
  914. } // blocky
  915. //
  916. // Walk over all the channels that are of type FLOAT.
  917. // Convert from HALF XDR back to FLOAT XDR.
  918. //
  919. for (unsigned int chan = 0; chan < numComp; ++chan)
  920. {
  921. if (_type[chan] != FLOAT)
  922. continue;
  923. std::vector<unsigned short> halfXdr (_width);
  924. for (int y=0; y<_height; ++y)
  925. {
  926. char *floatXdrPtr = _rowPtrs[chan][y];
  927. memcpy(&halfXdr[0], floatXdrPtr, _width*sizeof(unsigned short));
  928. const char *halfXdrPtr = (const char *)(&halfXdr[0]);
  929. for (int x=0; x<_width; ++x)
  930. {
  931. half tmpHalf;
  932. Xdr::read<CharPtrIO> (halfXdrPtr, tmpHalf);
  933. Xdr::write<CharPtrIO> (floatXdrPtr, (float)tmpHalf);
  934. //
  935. // Xdr::write and Xdr::read will advance the ptrs
  936. //
  937. }
  938. }
  939. }
  940. delete[] rowBlockHandle;
  941. }
  942. //
  943. // Un-RLE the packed AC components into
  944. // a half buffer. The half block should
  945. // be the full 8x8 block (in zig-zag order
  946. // still), not the first AC component.
  947. //
  948. // currAcComp is advanced as bytes are decoded.
  949. //
  950. // This returns the index of the last non-zero
  951. // value in the buffer - with the index into zig zag
  952. // order data. If we return 0, we have DC only data.
  953. //
  954. // This is assuminging that halfZigBlock is zero'ed
  955. // prior to calling
  956. //
  957. int
  958. DwaCompressor::LossyDctDecoderBase::unRleAc
  959. (unsigned short *&currAcComp,
  960. unsigned short *halfZigBlock)
  961. {
  962. //
  963. // Un-RLE the RLE'd blocks. If we find an item whose
  964. // high byte is 0xff, then insert the number of 0's
  965. // as indicated by the low byte.
  966. //
  967. // Otherwise, just copy the number verbaitm.
  968. //
  969. int lastNonZero = 0;
  970. int dctComp = 1;
  971. //
  972. // Start with a zero'ed block, so we don't have to
  973. // write when we hit a run symbol
  974. //
  975. while (dctComp < 64)
  976. {
  977. if (*currAcComp == 0xff00)
  978. {
  979. //
  980. // End of block
  981. //
  982. dctComp = 64;
  983. }
  984. else if ((*currAcComp) >> 8 == 0xff)
  985. {
  986. //
  987. // Run detected! Insert 0's.
  988. //
  989. // Since the block has been zeroed, just advance the ptr
  990. //
  991. dctComp += (*currAcComp) & 0xff;
  992. }
  993. else
  994. {
  995. //
  996. // Not a run, just copy over the value
  997. //
  998. lastNonZero = dctComp;
  999. halfZigBlock[dctComp] = *currAcComp;
  1000. dctComp++;
  1001. }
  1002. _packedAcCount++;
  1003. currAcComp++;
  1004. }
  1005. return lastNonZero;
  1006. }
  1007. // ==============================================================
  1008. //
  1009. // LossyDctEncoderBase
  1010. //
  1011. // --------------------------------------------------------------
  1012. DwaCompressor::LossyDctEncoderBase::LossyDctEncoderBase
  1013. (float quantBaseError,
  1014. char *packedAc,
  1015. char *packedDc,
  1016. const unsigned short *toNonlinear,
  1017. int width,
  1018. int height)
  1019. :
  1020. _quantBaseError(quantBaseError),
  1021. _width(width),
  1022. _height(height),
  1023. _toNonlinear(toNonlinear),
  1024. _numAcComp(0),
  1025. _numDcComp(0),
  1026. _packedAc(packedAc),
  1027. _packedDc(packedDc)
  1028. {
  1029. //
  1030. // Here, we take the generic JPEG quantization tables and
  1031. // normalize them by the smallest component in each table.
  1032. // This gives us a relationship amongst the DCT components,
  1033. // in terms of how sensitive each component is to
  1034. // error.
  1035. //
  1036. // A higher normalized value means we can quantize more,
  1037. // and a small normalized value means we can quantize less.
  1038. //
  1039. // Eventually, we will want an acceptable quantization
  1040. // error range for each component. We find this by
  1041. // multiplying some user-specified level (_quantBaseError)
  1042. // by the normalized table (_quantTableY, _quantTableCbCr) to
  1043. // find the acceptable quantization error range.
  1044. //
  1045. // The quantization table is not needed for decoding, and
  1046. // is not transmitted. So, if you want to get really fancy,
  1047. // you could derive some content-dependent quantization
  1048. // table, and the decoder would not need to be changed. But,
  1049. // for now, we'll just use statice quantization tables.
  1050. //
  1051. int jpegQuantTableY[] =
  1052. {
  1053. 16, 11, 10, 16, 24, 40, 51, 61,
  1054. 12, 12, 14, 19, 26, 58, 60, 55,
  1055. 14, 13, 16, 24, 40, 57, 69, 56,
  1056. 14, 17, 22, 29, 51, 87, 80, 62,
  1057. 18, 22, 37, 56, 68, 109, 103, 77,
  1058. 24, 35, 55, 64, 81, 104, 113, 92,
  1059. 49, 64, 78, 87, 103, 121, 120, 101,
  1060. 72, 92, 95, 98, 112, 100, 103, 99
  1061. };
  1062. int jpegQuantTableYMin = 10;
  1063. int jpegQuantTableCbCr[] =
  1064. {
  1065. 17, 18, 24, 47, 99, 99, 99, 99,
  1066. 18, 21, 26, 66, 99, 99, 99, 99,
  1067. 24, 26, 56, 99, 99, 99, 99, 99,
  1068. 47, 66, 99, 99, 99, 99, 99, 99,
  1069. 99, 99, 99, 99, 99, 99, 99, 99,
  1070. 99, 99, 99, 99, 99, 99, 99, 99,
  1071. 99, 99, 99, 99, 99, 99, 99, 99,
  1072. 99, 99, 99, 99, 99, 99, 99, 99
  1073. };
  1074. int jpegQuantTableCbCrMin = 17;
  1075. for (int idx = 0; idx < 64; ++idx)
  1076. {
  1077. _quantTableY[idx] = static_cast<float> (jpegQuantTableY[idx]) /
  1078. static_cast<float> (jpegQuantTableYMin);
  1079. _quantTableCbCr[idx] = static_cast<float> (jpegQuantTableCbCr[idx]) /
  1080. static_cast<float> (jpegQuantTableCbCrMin);
  1081. }
  1082. if (_quantBaseError < 0)
  1083. quantBaseError = 0;
  1084. }
  //
  // Nothing to release explicitly: the destructor body is empty.
  //

  DwaCompressor::LossyDctEncoderBase::~LossyDctEncoderBase ()
  {
  }
  //
  // Encode the source channels as 8x8 DCT blocks.
  //
  // Given three channels of source data, encode by first applying
  // a color space conversion (csc709Forward64). Otherwise, if we only
  // have one channel, just encode it as is.
  //
  // Other numbers of channels are somewhat unexpected at this point;
  // they are checked with assert() only, not with an exception.
  //
  // For every block and channel, one DC value is appended to the
  // channel's plane in _packedDc and the run-length-encoded AC values
  // are appended to _packedAc. _numDcComp and _numAcComp are reset at
  // the start and count what was written.
  //
  // NOTE(review): for FLOAT channels, _rowPtrs[chan][y] is re-pointed
  // at tmpHalfBuffer, a vector local to this function. Those entries
  // are left pointing at freed storage once execute() returns.
  //

  void
  DwaCompressor::LossyDctEncoderBase::execute ()
  {
      // Number of 8x8 blocks needed to cover the region, rounding up.
      int numBlocksX = (int)ceil ((float)_width / 8.0f);
      int numBlocksY = (int)ceil ((float)_height/ 8.0f);

      half halfZigCoef[64];
      half halfCoef[64];

      // One DC write cursor per channel, plus a single shared AC cursor.
      std::vector<unsigned short *> currDcComp (_rowPtrs.size());
      unsigned short *currAcComp = (unsigned short *)_packedAc;

      _dctData.resize (_rowPtrs.size());
      _numAcComp = 0;
      _numDcComp = 0;

      assert (_type.size() == _rowPtrs.size());
      assert ((_rowPtrs.size() == 3) || (_rowPtrs.size() == 1));

      //
      // Allocate a temp half buffer to quantize into for
      // any FLOAT source channels.
      //

      int tmpHalfBufferElements = 0;

      for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
          if (_type[chan] == FLOAT)
              tmpHalfBufferElements += _width * _height;

      std::vector<unsigned short> tmpHalfBuffer (tmpHalfBufferElements);

      char *tmpHalfBufferPtr = 0;

      // Only take the address of element 0 when the vector is non-empty.
      if (tmpHalfBufferElements)
          tmpHalfBufferPtr = (char *)&tmpHalfBuffer[0];

      //
      // Run over all the float scanlines, quantizing,
      // and re-assigning _rowPtr[y]. We need to translate
      // FLOAT XDR to HALF XDR.
      //

      for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
      {
          if (_type[chan] != FLOAT)
              continue;

          for (int y = 0; y < _height; ++y)
          {
              float src = 0;
              const char *srcXdr = _rowPtrs[chan][y];
              char *dstXdr = tmpHalfBufferPtr;

              for (int x = 0; x < _width; ++x)
              {
                  Xdr::read<CharPtrIO> (srcXdr, src);

                  //
                  // Clamp to half ranges, instead of just casting. This
                  // avoids introducing Infs which end up getting zeroed later
                  //

                  src = std::max (
                      std::min ((float) std::numeric_limits<half>::max(), src),
                      (float)-std::numeric_limits<half>::max());

                  Xdr::write<CharPtrIO> (dstXdr, ((half)src).bits());

                  //
                  // Xdr::read and Xdr::write will advance the ptr
                  //
              }

              // From here on this row is read from the half copy.
              _rowPtrs[chan][y] = (const char *)tmpHalfBufferPtr;
              tmpHalfBufferPtr += _width * sizeof (unsigned short);
          }
      }

      //
      // Pack DC components together by common plane, so we can get
      // a little more out of differencing them. We'll always have
      // one component per block, so we can compute offsets.
      //

      currDcComp[0] = (unsigned short *)_packedDc;

      for (unsigned int chan = 1; chan < _rowPtrs.size(); ++chan)
          currDcComp[chan] = currDcComp[chan-1] + numBlocksX * numBlocksY;

      for (int blocky = 0; blocky < numBlocksY; ++blocky)
      {
          for (int blockx = 0; blockx < numBlocksX; ++blockx)
          {
              half h;
              unsigned short tmpShortXdr, tmpShortNative;
              char *tmpCharPtr;

              for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
              {
                  //
                  // Break the source into 8x8 blocks. If we don't
                  // fit at the edges, mirror.
                  //
                  // Also, convert from linear to nonlinear representation.
                  // Our source is assumed to be XDR, and we need to convert
                  // to NATIVE prior to converting to float.
                  //
                  // If we're converting linear -> nonlinear, assume that the
                  // XDR -> NATIVE conversion is built into the lookup. Otherwise,
                  // we'll need to explicitly do it.
                  //

                  for (int y = 0; y < 8; ++y)
                  {
                      for (int x = 0; x < 8; ++x)
                      {
                          int vx = 8 * blockx + x;
                          int vy = 8 * blocky + y;

                          // Reflect coordinates past the right/bottom edge
                          // back inside; if the reflection lands below 0,
                          // fall back to the last column/row.
                          if (vx >= _width)
                              vx = _width - (vx - (_width - 1));

                          if (vx < 0) vx = _width-1;

                          if (vy >=_height)
                              vy = _height - (vy - (_height - 1));

                          if (vy < 0) vy = _height-1;

                          tmpShortXdr =
                              ((const unsigned short *)(_rowPtrs[chan])[vy])[vx];

                          if (_toNonlinear)
                          {
                              h.setBits (_toNonlinear[tmpShortXdr]);
                          }
                          else
                          {
                              const char *tmpConstCharPtr =
                                  (const char *)(&tmpShortXdr);

                              Xdr::read<CharPtrIO>
                                  (tmpConstCharPtr, tmpShortNative);

                              h.setBits(tmpShortNative);
                          }

                          _dctData[chan]._buffer[y * 8 + x] = (float)h;
                      } // x
                  } // y
              } // chan

              //
              // Color space conversion
              //

              if (_rowPtrs.size() == 3)
              {
                  csc709Forward64 (_dctData[0]._buffer,
                                   _dctData[1]._buffer,
                                   _dctData[2]._buffer);
              }

              for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
              {
                  //
                  // Forward DCT
                  //

                  dctForward8x8(_dctData[chan]._buffer);

                  //
                  // Quantize to half, and zigzag. Channel 0 uses the Y
                  // table; every other channel uses the CbCr table.
                  //

                  if (chan == 0)
                  {
                      for (int i = 0; i < 64; ++i)
                      {
                          halfCoef[i] =
                              quantize ((half)_dctData[chan]._buffer[i],
                                        _quantBaseError*_quantTableY[i]);
                      }
                  }
                  else
                  {
                      for (int i = 0; i < 64; ++i)
                      {
                          halfCoef[i] =
                              quantize ((half)_dctData[chan]._buffer[i],
                                        _quantBaseError*_quantTableCbCr[i]);
                      }
                  }

                  toZigZag (halfZigCoef, halfCoef);

                  //
                  // Convert from NATIVE back to XDR, before we write out
                  //

                  for (int i = 0; i < 64; ++i)
                  {
                      tmpCharPtr = (char *)&tmpShortXdr;
                      Xdr::write<CharPtrIO>(tmpCharPtr, halfZigCoef[i].bits());
                      halfZigCoef[i].setBits(tmpShortXdr);
                  }

                  //
                  // Save the DC component separately, to be compressed on
                  // its own.
                  //

                  *currDcComp[chan]++ = halfZigCoef[0].bits();
                  _numDcComp++;

                  //
                  // Then RLE the AC components (which will record the count
                  // of the resulting number of items)
                  //

                  rleAc (halfZigCoef, currAcComp);
              } // chan
          } // blockx
      } // blocky
  }
  1275. //
  1276. // Reorder from zig-zag order to normal ordering
  1277. //
  1278. void
  1279. DwaCompressor::LossyDctEncoderBase::toZigZag (half *dst, half *src)
  1280. {
  1281. const int remap[] =
  1282. {
  1283. 0,
  1284. 1, 8,
  1285. 16, 9, 2,
  1286. 3, 10, 17, 24,
  1287. 32, 25, 18, 11, 4,
  1288. 5, 12, 19, 26, 33, 40,
  1289. 48, 41, 34, 27, 20, 13, 6,
  1290. 7, 14, 21, 28, 35, 42, 49, 56,
  1291. 57, 50, 43, 36, 29, 22, 15,
  1292. 23, 30, 37, 44, 51, 58,
  1293. 59, 52, 45, 38, 31,
  1294. 39, 46, 53, 60,
  1295. 61, 54, 47,
  1296. 55, 62,
  1297. 63
  1298. };
  1299. for (int i=0; i<64; ++i)
  1300. dst[i] = src[remap[i]];
  1301. }
  1302. //
  1303. // Precomputing the bit count runs faster than using
  1304. // the builtin instruction, at least in one case..
  1305. //
  1306. // Precomputing 8-bits is no slower than 16-bits,
  1307. // and saves a fair bit of overhead..
  1308. //
  1309. int
  1310. DwaCompressor::LossyDctEncoderBase::countSetBits (unsigned short src)
  1311. {
  1312. static const unsigned short numBitsSet[256] =
  1313. {
  1314. 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
  1315. 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  1316. 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  1317. 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  1318. 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  1319. 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  1320. 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  1321. 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  1322. 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  1323. 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  1324. 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  1325. 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  1326. 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  1327. 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  1328. 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  1329. 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
  1330. };
  1331. return numBitsSet[src & 0xff] + numBitsSet[src >> 8];
  1332. }
  1333. //
  1334. // Take a DCT coefficient, as well as an acceptable error. Search
  1335. // nearby values within the error tolerance, that have fewer
  1336. // bits set.
  1337. //
  1338. // The list of candidates has been pre-computed and sorted
  1339. // in order of increasing numbers of bits set. This way, we
  1340. // can stop searching as soon as we find a candidate that
  1341. // is within the error tolerance.
  1342. //
  1343. half
  1344. DwaCompressor::LossyDctEncoderBase::quantize (half src, float errorTolerance)
  1345. {
  1346. half tmp;
  1347. float srcFloat = (float)src;
  1348. int numSetBits = countSetBits(src.bits());
  1349. const unsigned short *closest = get_dwaClosest(src.bits());
  1350. for (int targetNumSetBits = numSetBits - 1;
  1351. targetNumSetBits >= 0;
  1352. --targetNumSetBits)
  1353. {
  1354. tmp.setBits (*closest);
  1355. if (fabs ((float)tmp - srcFloat) < errorTolerance)
  1356. return tmp;
  1357. closest++;
  1358. }
  1359. return src;
  1360. }
  1361. //
  1362. // RLE the zig-zag of the AC components + copy over
  1363. // into another tmp buffer
  1364. //
  1365. // Try to do a simple RLE scheme to reduce run's of 0's. This
  1366. // differs from the jpeg EOB case, since EOB just indicates that
  1367. // the rest of the block is zero. In our case, we have lots of
  1368. // NaN symbols, which shouldn't be allowed to occur in DCT
  1369. // coefficents - so we'll use them for encoding runs.
  1370. //
  1371. // If the high byte is 0xff, then we have a run of 0's, of length
  1372. // given by the low byte. For example, 0xff03 would be a run
  1373. // of 3 0's, starting at the current location.
  1374. //
  1375. // block is our block of 64 coefficients
  1376. // acPtr a pointer to back the RLE'd values into.
  1377. //
  1378. // This will advance the counter, _numAcComp.
  1379. //
  1380. void
  1381. DwaCompressor::LossyDctEncoderBase::rleAc
  1382. (half *block,
  1383. unsigned short *&acPtr)
  1384. {
  1385. int dctComp = 1;
  1386. unsigned short rleSymbol = 0x0;
  1387. while (dctComp < 64)
  1388. {
  1389. int runLen = 1;
  1390. //
  1391. // If we don't have a 0, output verbatim
  1392. //
  1393. if (block[dctComp].bits() != rleSymbol)
  1394. {
  1395. *acPtr++ = block[dctComp].bits();
  1396. _numAcComp++;
  1397. dctComp += runLen;
  1398. continue;
  1399. }
  1400. //
  1401. // We're sitting on a 0, so see how big the run is.
  1402. //
  1403. while ((dctComp+runLen < 64) &&
  1404. (block[dctComp+runLen].bits() == rleSymbol))
  1405. {
  1406. runLen++;
  1407. }
  1408. //
  1409. // If the run len is too small, just output verbatim
  1410. // otherwise output our run token
  1411. //
  1412. // Originally, we wouldn't have a separate symbol for
  1413. // "end of block". But in some experimentation, it looks
  1414. // like using 0xff00 for "end of block" can save a bit
  1415. // of space.
  1416. //
  1417. if (runLen == 1)
  1418. {
  1419. runLen = 1;
  1420. *acPtr++ = block[dctComp].bits();
  1421. _numAcComp++;
  1422. //
  1423. // Using 0xff00 for "end of block"
  1424. //
  1425. }
  1426. else if (runLen + dctComp == 64)
  1427. {
  1428. //
  1429. // Signal EOB
  1430. //
  1431. *acPtr++ = 0xff00;
  1432. _numAcComp++;
  1433. }
  1434. else
  1435. {
  1436. //
  1437. // Signal normal run
  1438. //
  1439. *acPtr++ = 0xff00 | runLen;
  1440. _numAcComp++;
  1441. }
  1442. //
  1443. // Advance by runLen
  1444. //
  1445. dctComp += runLen;
  1446. }
  1447. }
  1448. // ==============================================================
  1449. //
  1450. // DwaCompressor
  1451. //
  1452. // --------------------------------------------------------------
  1453. //
  1454. // DwaCompressor()
  1455. //
  1456. DwaCompressor::DwaCompressor
  1457. (const Header &hdr,
  1458. int maxScanLineSize,
  1459. int numScanLines,
  1460. AcCompression acCompression)
  1461. :
  1462. Compressor(hdr),
  1463. _acCompression(acCompression),
  1464. _maxScanLineSize(maxScanLineSize),
  1465. _numScanLines(numScanLines),
  1466. _channels(hdr.channels()),
  1467. _packedAcBuffer(0),
  1468. _packedAcBufferSize(0),
  1469. _packedDcBuffer(0),
  1470. _packedDcBufferSize(0),
  1471. _rleBuffer(0),
  1472. _rleBufferSize(0),
  1473. _outBuffer(0),
  1474. _outBufferSize(0),
  1475. _zip(0),
  1476. _dwaCompressionLevel(45.0)
  1477. {
  1478. _min[0] = hdr.dataWindow().min.x;
  1479. _min[1] = hdr.dataWindow().min.y;
  1480. _max[0] = hdr.dataWindow().max.x;
  1481. _max[1] = hdr.dataWindow().max.y;
  1482. for (int i=0; i < NUM_COMPRESSOR_SCHEMES; ++i)
  1483. {
  1484. _planarUncBuffer[i] = 0;
  1485. _planarUncBufferSize[i] = 0;
  1486. }
  1487. //
  1488. // Check the header for a quality attribute
  1489. //
  1490. if (hasDwaCompressionLevel (hdr))
  1491. _dwaCompressionLevel = dwaCompressionLevel (hdr);
  1492. }
  1493. DwaCompressor::~DwaCompressor()
  1494. {
  1495. delete[] _packedAcBuffer;
  1496. delete[] _packedDcBuffer;
  1497. delete[] _rleBuffer;
  1498. delete[] _outBuffer;
  1499. delete _zip;
  1500. for (int i=0; i<NUM_COMPRESSOR_SCHEMES; ++i)
  1501. delete[] _planarUncBuffer[i];
  1502. }
  //
  // Return the scanline count that was passed to the constructor.
  //

  int
  DwaCompressor::numScanLines() const
  {
      return _numScanLines;
  }
  1508. OPENEXR_IMF_NAMESPACE::Compressor::Format
  1509. DwaCompressor::format() const
  1510. {
  1511. if (GLOBAL_SYSTEM_LITTLE_ENDIAN)
  1512. return NATIVE;
  1513. else
  1514. return XDR;
  1515. }
  1516. int
  1517. DwaCompressor::compress
  1518. (const char *inPtr,
  1519. int inSize,
  1520. int minY,
  1521. const char *&outPtr)
  1522. {
  1523. return compress
  1524. (inPtr,
  1525. inSize,
  1526. IMATH_NAMESPACE::Box2i (IMATH_NAMESPACE::V2i (_min[0], minY),
  1527. IMATH_NAMESPACE::V2i (_max[0], minY + numScanLines() - 1)),
  1528. outPtr);
  1529. }
  //
  // Compress one tile. A tile is simply a pixel range, so this
  // forwards unchanged to the Box2i overload of compress().
  //

  int
  DwaCompressor::compressTile
      (const char *inPtr,
       int inSize,
       IMATH_NAMESPACE::Box2i range,
       const char *&outPtr)
  {
      return compress (inPtr, inSize, range, outPtr);
  }
  //
  // Compress the pixels in 'range' (clipped to the data window's
  // max x / max y) from inPtr into the internal _outBuffer.
  //
  // Layout of the chunk written to _outBuffer, in order:
  //
  //   * NUM_SIZES_SINGLE Int64 counters (version, sizes, counts,
  //     AC compression scheme), converted to XDR at the very end
  //   * the channel classification rules (file version >= 2)
  //   * zlib-compressed UNKNOWN channel data
  //   * the AC coefficients (Huffman or zlib, per _acCompression)
  //   * the DC coefficients (compressed via _zip)
  //   * RLE'd then zlib-compressed RLE channel data
  //
  //   inPtr   - source pixels, channels interleaved by scanline
  //   inSize  - not consulted by this function
  //   range   - pixel region to compress
  //   outPtr  - set to _outBuffer on return
  //
  // Returns the size reported for the packed chunk. Throws
  // IEX_NAMESPACE::BaseExc / InputExc if zlib compression fails.
  //

  int
  DwaCompressor::compress
      (const char *inPtr,
       int inSize,
       IMATH_NAMESPACE::Box2i range,
       const char *&outPtr)
  {
      const char *inDataPtr = inPtr;
      char *packedAcEnd = 0;
      char *packedDcEnd = 0;
      int fileVersion = 2; // Starting with 2, we write the channel
                           // classification rules into the file

      if (fileVersion < 2)
          initializeLegacyChannelRules();
      else
          initializeDefaultChannelRules();

      size_t outBufferSize = 0;
      initializeBuffers(outBufferSize);

      // Total serialized size of the rules: a leading unsigned short
      // (the size field itself) plus each rule.
      unsigned short channelRuleSize = 0;
      std::vector<Classifier> channelRules;

      if (fileVersion >= 2)
      {
          relevantChannelRules(channelRules);

          channelRuleSize = Xdr::size<unsigned short>();

          for (size_t i = 0; i < channelRules.size(); ++i)
              channelRuleSize += channelRules[i].size();
      }

      //
      // Remember to allocate _outBuffer, if we haven't done so already.
      // The buffer only ever grows; a large enough one is reused.
      //

      outBufferSize += channelRuleSize;

      if (outBufferSize > _outBufferSize)
      {
          _outBufferSize = outBufferSize;

          if (_outBuffer != 0)
              delete[] _outBuffer;

          _outBuffer = new char[outBufferSize];
      }

      // Payload data starts after the counter header and the rules.
      char *outDataPtr = &_outBuffer[NUM_SIZES_SINGLE * sizeof(OPENEXR_IMF_NAMESPACE::Int64) +
                                     channelRuleSize];

      //
      // We might not be dealing with any color data, in which
      // case the AC buffer size will be 0, and dereferencing
      // a vector will not be a good thing to do.
      //

      if (_packedAcBuffer)
          packedAcEnd = _packedAcBuffer;

      if (_packedDcBuffer)
          packedDcEnd = _packedDcBuffer;

      // OBIDX(x) is a pointer to the x'th Int64 counter in the header.
      #define OBIDX(x) (Int64 *)&_outBuffer[x * sizeof (Int64)]

      Int64 *version = OBIDX (VERSION);
      Int64 *unknownUncompressedSize = OBIDX (UNKNOWN_UNCOMPRESSED_SIZE);
      Int64 *unknownCompressedSize = OBIDX (UNKNOWN_COMPRESSED_SIZE);
      Int64 *acCompressedSize = OBIDX (AC_COMPRESSED_SIZE);
      Int64 *dcCompressedSize = OBIDX (DC_COMPRESSED_SIZE);
      Int64 *rleCompressedSize = OBIDX (RLE_COMPRESSED_SIZE);
      Int64 *rleUncompressedSize = OBIDX (RLE_UNCOMPRESSED_SIZE);
      Int64 *rleRawSize = OBIDX (RLE_RAW_SIZE);

      Int64 *totalAcUncompressedCount = OBIDX (AC_UNCOMPRESSED_COUNT);
      Int64 *totalDcUncompressedCount = OBIDX (DC_UNCOMPRESSED_COUNT);

      Int64 *acCompression = OBIDX (AC_COMPRESSION);

      // Clip the requested range against the data window's max corner.
      int minX = range.min.x;
      int maxX = std::min(range.max.x, _max[0]);
      int minY = range.min.y;
      int maxY = std::min(range.max.y, _max[1]);

      //
      // Zero all the numbers in the chunk header
      //

      memset (_outBuffer, 0, NUM_SIZES_SINGLE * sizeof (Int64));

      //
      // Setup the AC compression strategy and the version in the data block,
      // then write the relevant channel classification rules if needed
      //

      *version = fileVersion;
      *acCompression = _acCompression;

      setupChannelData (minX, minY, maxX, maxY);

      if (fileVersion >= 2)
      {
          char *writePtr = &_outBuffer[NUM_SIZES_SINGLE * sizeof(OPENEXR_IMF_NAMESPACE::Int64)];
          Xdr::write<CharPtrIO> (writePtr, channelRuleSize);

          for (size_t i = 0; i < channelRules.size(); ++i)
              channelRules[i].write(writePtr);
      }

      //
      // Determine the start of each row in the input buffer
      // Channels are interleaved by scanline
      //

      std::vector<bool> encodedChannels (_channelData.size());
      std::vector< std::vector<const char *> > rowPtrs (_channelData.size());

      for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
          encodedChannels[chan] = false;

      inDataPtr = inPtr;

      for (int y = minY; y <= maxY; ++y)
      {
          for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
          {
              ChannelData *cd = &_channelData[chan];

              // A channel only has data on rows that are a multiple of
              // its ySampling.
              if (IMATH_NAMESPACE::modp(y, cd->ySampling) != 0)
                  continue;

              rowPtrs[chan].push_back(inDataPtr);
              inDataPtr += cd->width * OPENEXR_IMF_NAMESPACE::pixelTypeSize(cd->type);
          }
      }

      inDataPtr = inPtr;

      //
      // Make a pass over all our CSC sets and try to encode them first
      //

      for (unsigned int csc = 0; csc < _cscSets.size(); ++csc)
      {
          LossyDctEncoderCsc encoder
              (_dwaCompressionLevel / 100000.f,
               rowPtrs[_cscSets[csc].idx[0]],
               rowPtrs[_cscSets[csc].idx[1]],
               rowPtrs[_cscSets[csc].idx[2]],
               packedAcEnd,
               packedDcEnd,
               get_dwaCompressorToNonlinear(),
               _channelData[_cscSets[csc].idx[0]].width,
               _channelData[_cscSets[csc].idx[0]].height,
               _channelData[_cscSets[csc].idx[0]].type,
               _channelData[_cscSets[csc].idx[1]].type,
               _channelData[_cscSets[csc].idx[2]].type);

          encoder.execute();

          // Account for what the encoder wrote and move the write
          // cursors past it.
          *totalAcUncompressedCount += encoder.numAcValuesEncoded();
          *totalDcUncompressedCount += encoder.numDcValuesEncoded();

          packedAcEnd += encoder.numAcValuesEncoded() * sizeof(unsigned short);
          packedDcEnd += encoder.numDcValuesEncoded() * sizeof(unsigned short);

          encodedChannels[_cscSets[csc].idx[0]] = true;
          encodedChannels[_cscSets[csc].idx[1]] = true;
          encodedChannels[_cscSets[csc].idx[2]] = true;
      }

      // Now handle every channel that was not part of a CSC set.
      for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
      {
          ChannelData *cd = &_channelData[chan];

          if (encodedChannels[chan])
              continue;

          switch (cd->compression)
          {
            case LOSSY_DCT:

              //
              // For LOSSY_DCT, treat this just like the CSC'd case,
              // but only operate on one channel
              //

              {
                  // Channels flagged pLinear are encoded without the
                  // nonlinear lookup table.
                  const unsigned short *nonlinearLut = 0;

                  if (!cd->pLinear)
                      nonlinearLut = get_dwaCompressorToNonlinear();

                  LossyDctEncoder encoder
                      (_dwaCompressionLevel / 100000.f,
                       rowPtrs[chan],
                       packedAcEnd,
                       packedDcEnd,
                       nonlinearLut,
                       cd->width,
                       cd->height,
                       cd->type);

                  encoder.execute();

                  *totalAcUncompressedCount += encoder.numAcValuesEncoded();
                  *totalDcUncompressedCount += encoder.numDcValuesEncoded();

                  packedAcEnd +=
                      encoder.numAcValuesEncoded() * sizeof (unsigned short);

                  packedDcEnd +=
                      encoder.numDcValuesEncoded() * sizeof (unsigned short);
              }

              break;

            case RLE:

              //
              // For RLE, bash the bytes up so that the first bytes of each
              // pixel are contiguous, as are the second bytes, and so on.
              //

              for (unsigned int y = 0; y < rowPtrs[chan].size(); ++y)
              {
                  const char *row = rowPtrs[chan][y];

                  for (int x = 0; x < cd->width; ++x)
                  {
                      for (int byte = 0;
                           byte < OPENEXR_IMF_NAMESPACE::pixelTypeSize (cd->type);
                           ++byte)
                      {
                          *cd->planarUncRleEnd[byte]++ = *row++;
                      }
                  }

                  *rleRawSize += cd->width * OPENEXR_IMF_NAMESPACE::pixelTypeSize(cd->type);
              }

              break;

            case UNKNOWN:

              //
              // Otherwise, just copy data over verbatim
              //

              {
                  int scanlineSize = cd->width * OPENEXR_IMF_NAMESPACE::pixelTypeSize(cd->type);

                  for (unsigned int y = 0; y < rowPtrs[chan].size(); ++y)
                  {
                      memcpy (cd->planarUncBufferEnd,
                              rowPtrs[chan][y],
                              scanlineSize);

                      cd->planarUncBufferEnd += scanlineSize;
                  }

                  *unknownUncompressedSize += cd->planarUncSize;
              }

              break;

            default:

              assert (false);
          }

          encodedChannels[chan] = true;
      }

      //
      // Pack the Unknown data into the output buffer first. Instead of
      // just copying it uncompressed, try zlib compression at least.
      //

      if (*unknownUncompressedSize > 0)
      {
          // Note: this local deliberately-or-not shadows the inSize
          // parameter for the remainder of this scope.
          uLongf inSize = (uLongf)(*unknownUncompressedSize);
          uLongf outSize = compressBound (inSize);

          if (Z_OK != ::compress2 ((Bytef *)outDataPtr,
                                   &outSize,
                                   (const Bytef *)_planarUncBuffer[UNKNOWN],
                                   inSize,
                                   9))
          {
              throw IEX_NAMESPACE::BaseExc ("Data compression (zlib) failed.");
          }

          outDataPtr += outSize;
          *unknownCompressedSize = outSize;
      }

      //
      // Now, pack all the Lossy DCT coefficients into our output
      // buffer, with Huffman encoding.
      //
      // Also, record the compressed size and the number of
      // uncompressed components we have.
      //

      if (*totalAcUncompressedCount > 0)
      {
          switch (_acCompression)
          {
            case STATIC_HUFFMAN:

              *acCompressedSize = (int)
                  hufCompress((unsigned short *)_packedAcBuffer,
                              (int)*totalAcUncompressedCount,
                              outDataPtr);
              break;

            case DEFLATE:

              {
                  uLongf destLen = compressBound (
                      (*totalAcUncompressedCount) * sizeof (unsigned short));

                  if (Z_OK != ::compress2
                                  ((Bytef *)outDataPtr,
                                   &destLen,
                                   (Bytef *)_packedAcBuffer,
                                   (uLong)(*totalAcUncompressedCount
                                                  * sizeof (unsigned short)),
                                   9))
                  {
                      throw IEX_NAMESPACE::InputExc ("Data compression (zlib) failed.");
                  }

                  *acCompressedSize = destLen;
              }

              break;

            default:

              assert (false);
          }

          outDataPtr += *acCompressedSize;
      }

      //
      // Handle the DC components separately
      //

      if (*totalDcUncompressedCount > 0)
      {
          *dcCompressedSize = _zip->compress
              (_packedDcBuffer,
               (int)(*totalDcUncompressedCount) * sizeof (unsigned short),
               outDataPtr);

          outDataPtr += *dcCompressedSize;
      }

      //
      // If we have RLE data, first RLE encode it and set the uncompressed
      // size. Then, deflate the results and set the compressed size.
      //

      if (*rleRawSize > 0)
      {
          *rleUncompressedSize = rleCompress
              ((int)(*rleRawSize),
               _planarUncBuffer[RLE],
               (signed char *)_rleBuffer);

          uLongf dstLen = compressBound ((uLongf)*rleUncompressedSize);

          if (Z_OK != ::compress2
                          ((Bytef *)outDataPtr,
                           &dstLen,
                           (Bytef *)_rleBuffer,
                           (uLong)(*rleUncompressedSize),
                           9))
          {
              throw IEX_NAMESPACE::BaseExc ("Error compressing RLE'd data.");
          }

          *rleCompressedSize = dstLen;
          outDataPtr += *rleCompressedSize;
      }

      //
      // Flip the counters to XDR format. Up to here they were held in
      // native Int64 form so they could be accumulated in place.
      //

      for (int i = 0; i < NUM_SIZES_SINGLE; ++i)
      {
          Int64 src = *(((Int64 *)_outBuffer) + i);
          char *dst = (char *)(((Int64 *)_outBuffer) + i);

          Xdr::write<CharPtrIO> (dst, src);
      }

      //
      // We're done - compute the number of bytes we packed
      //
      // NOTE(review): outDataPtr - _outBuffer is the number of bytes
      // written, so the "+ 1" reports one byte more than that. Confirm
      // whether this is intentional before changing it; it affects the
      // size of every chunk written.
      //

      outPtr = _outBuffer;

      return static_cast<int>(outDataPtr - _outBuffer + 1);
  }
  1852. int
  1853. DwaCompressor::uncompress
  1854. (const char *inPtr,
  1855. int inSize,
  1856. int minY,
  1857. const char *&outPtr)
  1858. {
  1859. return uncompress (inPtr,
  1860. inSize,
  1861. IMATH_NAMESPACE::Box2i (IMATH_NAMESPACE::V2i (_min[0], minY),
  1862. IMATH_NAMESPACE::V2i (_max[0], minY + numScanLines() - 1)),
  1863. outPtr);
  1864. }
  1865. int
  1866. DwaCompressor::uncompressTile
  1867. (const char *inPtr,
  1868. int inSize,
  1869. IMATH_NAMESPACE::Box2i range,
  1870. const char *&outPtr)
  1871. {
  1872. return uncompress (inPtr, inSize, range, outPtr);
  1873. }
  1874. int
  1875. DwaCompressor::uncompress
  1876. (const char *inPtr,
  1877. int inSize,
  1878. IMATH_NAMESPACE::Box2i range,
  1879. const char *&outPtr)
  1880. {
  1881. int minX = range.min.x;
  1882. int maxX = std::min (range.max.x, _max[0]);
  1883. int minY = range.min.y;
  1884. int maxY = std::min (range.max.y, _max[1]);
  1885. int headerSize = NUM_SIZES_SINGLE*sizeof(Int64);
  1886. if (inSize < headerSize)
  1887. {
  1888. throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
  1889. "(truncated header).");
  1890. }
  1891. //
  1892. // Flip the counters from XDR to NATIVE
  1893. //
  1894. for (int i = 0; i < NUM_SIZES_SINGLE; ++i)
  1895. {
  1896. Int64 *dst = (((Int64 *)inPtr) + i);
  1897. const char *src = (char *)(((Int64 *)inPtr) + i);
  1898. Xdr::read<CharPtrIO> (src, *dst);
  1899. }
  1900. //
  1901. // Unwind all the counter info
  1902. //
  1903. const Int64 *inPtr64 = (const Int64*) inPtr;
  1904. Int64 version = *(inPtr64 + VERSION);
  1905. Int64 unknownUncompressedSize = *(inPtr64 + UNKNOWN_UNCOMPRESSED_SIZE);
  1906. Int64 unknownCompressedSize = *(inPtr64 + UNKNOWN_COMPRESSED_SIZE);
  1907. Int64 acCompressedSize = *(inPtr64 + AC_COMPRESSED_SIZE);
  1908. Int64 dcCompressedSize = *(inPtr64 + DC_COMPRESSED_SIZE);
  1909. Int64 rleCompressedSize = *(inPtr64 + RLE_COMPRESSED_SIZE);
  1910. Int64 rleUncompressedSize = *(inPtr64 + RLE_UNCOMPRESSED_SIZE);
  1911. Int64 rleRawSize = *(inPtr64 + RLE_RAW_SIZE);
  1912. Int64 totalAcUncompressedCount = *(inPtr64 + AC_UNCOMPRESSED_COUNT);
  1913. Int64 totalDcUncompressedCount = *(inPtr64 + DC_UNCOMPRESSED_COUNT);
  1914. Int64 acCompression = *(inPtr64 + AC_COMPRESSION);
  1915. Int64 compressedSize = unknownCompressedSize +
  1916. acCompressedSize +
  1917. dcCompressedSize +
  1918. rleCompressedSize;
  1919. const char *dataPtr = inPtr + NUM_SIZES_SINGLE * sizeof(Int64);
  1920. /* Both the sum and individual sizes are checked in case of overflow. */
  1921. if (inSize < (headerSize + compressedSize) ||
  1922. inSize < unknownCompressedSize ||
  1923. inSize < acCompressedSize ||
  1924. inSize < dcCompressedSize ||
  1925. inSize < rleCompressedSize)
  1926. {
  1927. throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
  1928. "(truncated file).");
  1929. }
  1930. if ((SInt64)unknownUncompressedSize < 0 ||
  1931. (SInt64)unknownCompressedSize < 0 ||
  1932. (SInt64)acCompressedSize < 0 ||
  1933. (SInt64)dcCompressedSize < 0 ||
  1934. (SInt64)rleCompressedSize < 0 ||
  1935. (SInt64)rleUncompressedSize < 0 ||
  1936. (SInt64)rleRawSize < 0 ||
  1937. (SInt64)totalAcUncompressedCount < 0 ||
  1938. (SInt64)totalDcUncompressedCount < 0)
  1939. {
  1940. throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
  1941. " (corrupt header).");
  1942. }
  1943. if (version < 2)
  1944. initializeLegacyChannelRules();
  1945. else
  1946. {
  1947. unsigned short ruleSize = 0;
  1948. Xdr::read<CharPtrIO>(dataPtr, ruleSize);
  1949. if (ruleSize < 0)
  1950. throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
  1951. " (corrupt header file).");
  1952. headerSize += ruleSize;
  1953. if (inSize < headerSize + compressedSize)
  1954. throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
  1955. " (truncated file).");
  1956. _channelRules.clear();
  1957. ruleSize -= Xdr::size<unsigned short> ();
  1958. while (ruleSize > 0)
  1959. {
  1960. Classifier rule(dataPtr, ruleSize);
  1961. _channelRules.push_back(rule);
  1962. ruleSize -= rule.size();
  1963. }
  1964. }
  1965. size_t outBufferSize = 0;
  1966. initializeBuffers(outBufferSize);
  1967. //
  1968. // Allocate _outBuffer, if we haven't done so already
  1969. //
  1970. if (_maxScanLineSize * numScanLines() > _outBufferSize)
  1971. {
  1972. _outBufferSize = _maxScanLineSize * numScanLines();
  1973. if (_outBuffer != 0)
  1974. delete[] _outBuffer;
  1975. _outBuffer = new char[_maxScanLineSize * numScanLines()];
  1976. }
  1977. char *outBufferEnd = _outBuffer;
  1978. //
  1979. // Find the start of the RLE packed AC components and
  1980. // the DC components for each channel. This will be handy
  1981. // if you want to decode the channels in parallel later on.
  1982. //
  1983. char *packedAcBufferEnd = 0;
  1984. if (_packedAcBuffer)
  1985. packedAcBufferEnd = _packedAcBuffer;
  1986. char *packedDcBufferEnd = 0;
  1987. if (_packedDcBuffer)
  1988. packedDcBufferEnd = _packedDcBuffer;
  1989. //
  1990. // UNKNOWN data is packed first, followed by the
  1991. // Huffman-compressed AC, then the DC values,
  1992. // and then the zlib compressed RLE data.
  1993. //
  1994. const char *compressedUnknownBuf = dataPtr;
  1995. const char *compressedAcBuf = compressedUnknownBuf +
  1996. static_cast<ptrdiff_t>(unknownCompressedSize);
  1997. const char *compressedDcBuf = compressedAcBuf +
  1998. static_cast<ptrdiff_t>(acCompressedSize);
  1999. const char *compressedRleBuf = compressedDcBuf +
  2000. static_cast<ptrdiff_t>(dcCompressedSize);
  2001. //
  2002. // Sanity check that the version is something we expect. Right now,
  2003. // we can decode version 0, 1, and 2. v1 adds 'end of block' symbols
  2004. // to the AC RLE. v2 adds channel classification rules at the
  2005. // start of the data block.
  2006. //
  2007. if (version > 2)
  2008. throw IEX_NAMESPACE::InputExc ("Invalid version of compressed data block");
  2009. setupChannelData(minX, minY, maxX, maxY);
  2010. //
  2011. // Uncompress the UNKNOWN data into _planarUncBuffer[UNKNOWN]
  2012. //
  2013. if (unknownCompressedSize > 0)
  2014. {
  2015. if (unknownUncompressedSize > _planarUncBufferSize[UNKNOWN])
  2016. {
  2017. throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
  2018. "(corrupt header).");
  2019. }
  2020. uLongf outSize = (uLongf)unknownUncompressedSize;
  2021. if (Z_OK != ::uncompress
  2022. ((Bytef *)_planarUncBuffer[UNKNOWN],
  2023. &outSize,
  2024. (Bytef *)compressedUnknownBuf,
  2025. (uLong)unknownCompressedSize))
  2026. {
  2027. throw IEX_NAMESPACE::BaseExc("Error uncompressing UNKNOWN data.");
  2028. }
  2029. }
  2030. //
  2031. // Uncompress the AC data into _packedAcBuffer
  2032. //
  2033. if (acCompressedSize > 0)
  2034. {
  2035. if (totalAcUncompressedCount*sizeof(unsigned short) > _packedAcBufferSize)
  2036. {
  2037. throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
  2038. "(corrupt header).");
  2039. }
  2040. //
  2041. // Don't trust the user to get it right, look in the file.
  2042. //
  2043. switch (acCompression)
  2044. {
  2045. case STATIC_HUFFMAN:
  2046. hufUncompress
  2047. (compressedAcBuf,
  2048. (int)acCompressedSize,
  2049. (unsigned short *)_packedAcBuffer,
  2050. (int)totalAcUncompressedCount);
  2051. break;
  2052. case DEFLATE:
  2053. {
  2054. uLongf destLen =
  2055. (int)(totalAcUncompressedCount) * sizeof (unsigned short);
  2056. if (Z_OK != ::uncompress
  2057. ((Bytef *)_packedAcBuffer,
  2058. &destLen,
  2059. (Bytef *)compressedAcBuf,
  2060. (uLong)acCompressedSize))
  2061. {
  2062. throw IEX_NAMESPACE::InputExc ("Data decompression (zlib) failed.");
  2063. }
  2064. if (totalAcUncompressedCount * sizeof (unsigned short) !=
  2065. destLen)
  2066. {
  2067. throw IEX_NAMESPACE::InputExc ("AC data corrupt.");
  2068. }
  2069. }
  2070. break;
  2071. default:
  2072. throw IEX_NAMESPACE::NoImplExc ("Unknown AC Compression");
  2073. break;
  2074. }
  2075. }
  2076. //
  2077. // Uncompress the DC data into _packedDcBuffer
  2078. //
  2079. if (dcCompressedSize > 0)
  2080. {
  2081. if (totalDcUncompressedCount*sizeof(unsigned short) > _packedDcBufferSize)
  2082. {
  2083. throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
  2084. "(corrupt header).");
  2085. }
  2086. if (_zip->uncompress
  2087. (compressedDcBuf, (int)dcCompressedSize, _packedDcBuffer)
  2088. != (int)totalDcUncompressedCount * sizeof (unsigned short))
  2089. {
  2090. throw IEX_NAMESPACE::BaseExc("DC data corrupt.");
  2091. }
  2092. }
  2093. //
  2094. // Uncompress the RLE data into _rleBuffer, then unRLE the results
  2095. // into _planarUncBuffer[RLE]
  2096. //
  2097. if (rleRawSize > 0)
  2098. {
  2099. if (rleUncompressedSize > _rleBufferSize ||
  2100. rleRawSize > _planarUncBufferSize[RLE])
  2101. {
  2102. throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
  2103. "(corrupt header).");
  2104. }
  2105. uLongf dstLen = (uLongf)rleUncompressedSize;
  2106. if (Z_OK != ::uncompress
  2107. ((Bytef *)_rleBuffer,
  2108. &dstLen,
  2109. (Bytef *)compressedRleBuf,
  2110. (uLong)rleCompressedSize))
  2111. {
  2112. throw IEX_NAMESPACE::BaseExc("Error uncompressing RLE data.");
  2113. }
  2114. if (dstLen != rleUncompressedSize)
  2115. throw IEX_NAMESPACE::BaseExc("RLE data corrupted");
  2116. if (rleUncompress
  2117. ((int)rleUncompressedSize,
  2118. (int)rleRawSize,
  2119. (signed char *)_rleBuffer,
  2120. _planarUncBuffer[RLE]) != rleRawSize)
  2121. {
  2122. throw IEX_NAMESPACE::BaseExc("RLE data corrupted");
  2123. }
  2124. }
  2125. //
  2126. // Determine the start of each row in the output buffer
  2127. //
  2128. std::vector<bool> decodedChannels (_channelData.size());
  2129. std::vector< std::vector<char *> > rowPtrs (_channelData.size());
  2130. for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
  2131. decodedChannels[chan] = false;
  2132. outBufferEnd = _outBuffer;
  2133. for (int y = minY; y <= maxY; ++y)
  2134. {
  2135. for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
  2136. {
  2137. ChannelData *cd = &_channelData[chan];
  2138. if (IMATH_NAMESPACE::modp (y, cd->ySampling) != 0)
  2139. continue;
  2140. rowPtrs[chan].push_back (outBufferEnd);
  2141. outBufferEnd += cd->width * OPENEXR_IMF_NAMESPACE::pixelTypeSize (cd->type);
  2142. }
  2143. }
  2144. //
  2145. // Setup to decode each block of 3 channels that need to
  2146. // be handled together
  2147. //
  2148. for (unsigned int csc = 0; csc < _cscSets.size(); ++csc)
  2149. {
  2150. int rChan = _cscSets[csc].idx[0];
  2151. int gChan = _cscSets[csc].idx[1];
  2152. int bChan = _cscSets[csc].idx[2];
  2153. LossyDctDecoderCsc decoder
  2154. (rowPtrs[rChan],
  2155. rowPtrs[gChan],
  2156. rowPtrs[bChan],
  2157. packedAcBufferEnd,
  2158. packedDcBufferEnd,
  2159. get_dwaCompressorToLinear(),
  2160. _channelData[rChan].width,
  2161. _channelData[rChan].height,
  2162. _channelData[rChan].type,
  2163. _channelData[gChan].type,
  2164. _channelData[bChan].type);
  2165. decoder.execute();
  2166. packedAcBufferEnd +=
  2167. decoder.numAcValuesEncoded() * sizeof (unsigned short);
  2168. packedDcBufferEnd +=
  2169. decoder.numDcValuesEncoded() * sizeof (unsigned short);
  2170. decodedChannels[rChan] = true;
  2171. decodedChannels[gChan] = true;
  2172. decodedChannels[bChan] = true;
  2173. }
  2174. //
  2175. // Setup to handle the remaining channels by themselves
  2176. //
  2177. for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
  2178. {
  2179. if (decodedChannels[chan])
  2180. continue;
  2181. ChannelData *cd = &_channelData[chan];
  2182. int pixelSize = OPENEXR_IMF_NAMESPACE::pixelTypeSize (cd->type);
  2183. switch (cd->compression)
  2184. {
  2185. case LOSSY_DCT:
  2186. //
  2187. // Setup a single-channel lossy DCT decoder pointing
  2188. // at the output buffer
  2189. //
  2190. {
  2191. const unsigned short *linearLut = 0;
  2192. if (!cd->pLinear)
  2193. linearLut = get_dwaCompressorToLinear();
  2194. LossyDctDecoder decoder
  2195. (rowPtrs[chan],
  2196. packedAcBufferEnd,
  2197. packedDcBufferEnd,
  2198. linearLut,
  2199. cd->width,
  2200. cd->height,
  2201. cd->type);
  2202. decoder.execute();
  2203. packedAcBufferEnd +=
  2204. decoder.numAcValuesEncoded() * sizeof (unsigned short);
  2205. packedDcBufferEnd +=
  2206. decoder.numDcValuesEncoded() * sizeof (unsigned short);
  2207. }
  2208. break;
  2209. case RLE:
  2210. //
  2211. // For the RLE case, the data has been un-RLE'd into
  2212. // planarUncRleEnd[], but is still split out by bytes.
  2213. // We need to rearrange the bytes back into the correct
  2214. // order in the output buffer;
  2215. //
  2216. {
  2217. int row = 0;
  2218. for (int y = minY; y <= maxY; ++y)
  2219. {
  2220. if (IMATH_NAMESPACE::modp (y, cd->ySampling) != 0)
  2221. continue;
  2222. char *dst = rowPtrs[chan][row];
  2223. if (pixelSize == 2)
  2224. {
  2225. interleaveByte2 (dst,
  2226. cd->planarUncRleEnd[0],
  2227. cd->planarUncRleEnd[1],
  2228. cd->width);
  2229. cd->planarUncRleEnd[0] += cd->width;
  2230. cd->planarUncRleEnd[1] += cd->width;
  2231. }
  2232. else
  2233. {
  2234. for (int x = 0; x < cd->width; ++x)
  2235. {
  2236. for (int byte = 0; byte < pixelSize; ++byte)
  2237. {
  2238. *dst++ = *cd->planarUncRleEnd[byte]++;
  2239. }
  2240. }
  2241. }
  2242. row++;
  2243. }
  2244. }
  2245. break;
  2246. case UNKNOWN:
  2247. //
  2248. // In the UNKNOWN case, data is already in planarUncBufferEnd
  2249. // and just needs to copied over to the output buffer
  2250. //
  2251. {
  2252. int row = 0;
  2253. int dstScanlineSize = cd->width * OPENEXR_IMF_NAMESPACE::pixelTypeSize (cd->type);
  2254. for (int y = minY; y <= maxY; ++y)
  2255. {
  2256. if (IMATH_NAMESPACE::modp (y, cd->ySampling) != 0)
  2257. continue;
  2258. memcpy (rowPtrs[chan][row],
  2259. cd->planarUncBufferEnd,
  2260. dstScanlineSize);
  2261. cd->planarUncBufferEnd += dstScanlineSize;
  2262. row++;
  2263. }
  2264. }
  2265. break;
  2266. default:
  2267. throw IEX_NAMESPACE::NoImplExc ("Unhandled compression scheme case");
  2268. break;
  2269. }
  2270. decodedChannels[chan] = true;
  2271. }
  2272. //
  2273. // Return a ptr to _outBuffer
  2274. //
  2275. outPtr = _outBuffer;
  2276. return (int)(outBufferEnd - _outBuffer);
  2277. }
  2278. // static
  2279. void
  2280. DwaCompressor::initializeFuncs()
  2281. {
  2282. convertFloatToHalf64 = convertFloatToHalf64_scalar;
  2283. fromHalfZigZag = fromHalfZigZag_scalar;
  2284. CpuId cpuId;
  2285. //
  2286. // Setup HALF <-> FLOAT conversion implementations
  2287. //
  2288. if (cpuId.avx && cpuId.f16c)
  2289. {
  2290. convertFloatToHalf64 = convertFloatToHalf64_f16c;
  2291. fromHalfZigZag = fromHalfZigZag_f16c;
  2292. }
  2293. //
  2294. // Setup inverse DCT implementations
  2295. //
  2296. dctInverse8x8_0 = dctInverse8x8_scalar<0>;
  2297. dctInverse8x8_1 = dctInverse8x8_scalar<1>;
  2298. dctInverse8x8_2 = dctInverse8x8_scalar<2>;
  2299. dctInverse8x8_3 = dctInverse8x8_scalar<3>;
  2300. dctInverse8x8_4 = dctInverse8x8_scalar<4>;
  2301. dctInverse8x8_5 = dctInverse8x8_scalar<5>;
  2302. dctInverse8x8_6 = dctInverse8x8_scalar<6>;
  2303. dctInverse8x8_7 = dctInverse8x8_scalar<7>;
  2304. if (cpuId.avx)
  2305. {
  2306. dctInverse8x8_0 = dctInverse8x8_avx<0>;
  2307. dctInverse8x8_1 = dctInverse8x8_avx<1>;
  2308. dctInverse8x8_2 = dctInverse8x8_avx<2>;
  2309. dctInverse8x8_3 = dctInverse8x8_avx<3>;
  2310. dctInverse8x8_4 = dctInverse8x8_avx<4>;
  2311. dctInverse8x8_5 = dctInverse8x8_avx<5>;
  2312. dctInverse8x8_6 = dctInverse8x8_avx<6>;
  2313. dctInverse8x8_7 = dctInverse8x8_avx<7>;
  2314. }
  2315. else if (cpuId.sse2)
  2316. {
  2317. dctInverse8x8_0 = dctInverse8x8_sse2<0>;
  2318. dctInverse8x8_1 = dctInverse8x8_sse2<1>;
  2319. dctInverse8x8_2 = dctInverse8x8_sse2<2>;
  2320. dctInverse8x8_3 = dctInverse8x8_sse2<3>;
  2321. dctInverse8x8_4 = dctInverse8x8_sse2<4>;
  2322. dctInverse8x8_5 = dctInverse8x8_sse2<5>;
  2323. dctInverse8x8_6 = dctInverse8x8_sse2<6>;
  2324. dctInverse8x8_7 = dctInverse8x8_sse2<7>;
  2325. }
  2326. }
  2327. //
  2328. // Handle channel classification and buffer allocation once we know
  2329. // how to classify channels
  2330. //
  2331. void
  2332. DwaCompressor::initializeBuffers (size_t &outBufferSize)
  2333. {
  2334. classifyChannels (_channels, _channelData, _cscSets);
  2335. //
  2336. // _outBuffer needs to be big enough to hold all our
  2337. // compressed data - which could vary depending on what sort
  2338. // of channels we have.
  2339. //
  2340. int maxOutBufferSize = 0;
  2341. int numLossyDctChans = 0;
  2342. int unknownBufferSize = 0;
  2343. int rleBufferSize = 0;
  2344. int maxLossyDctAcSize = (int)ceil ((float)numScanLines() / 8.0f) *
  2345. (int)ceil ((float)(_max[0] - _min[0] + 1) / 8.0f) *
  2346. 63 * sizeof (unsigned short);
  2347. int maxLossyDctDcSize = (int)ceil ((float)numScanLines() / 8.0f) *
  2348. (int)ceil ((float)(_max[0] - _min[0] + 1) / 8.0f) *
  2349. sizeof (unsigned short);
  2350. for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
  2351. {
  2352. switch (_channelData[chan].compression)
  2353. {
  2354. case LOSSY_DCT:
  2355. //
  2356. // This is the size of the number of packed
  2357. // components, plus the requirements for
  2358. // maximum Huffman encoding size (for STATIC_HUFFMAN)
  2359. // or for zlib compression (for DEFLATE)
  2360. //
  2361. maxOutBufferSize += std::max(
  2362. (int)(2 * maxLossyDctAcSize + 65536),
  2363. (int)compressBound (maxLossyDctAcSize) );
  2364. numLossyDctChans++;
  2365. break;
  2366. case RLE:
  2367. {
  2368. //
  2369. // RLE, if gone horribly wrong, could double the size
  2370. // of the source data.
  2371. //
  2372. int rleAmount = 2 * numScanLines() * (_max[0] - _min[0] + 1) *
  2373. OPENEXR_IMF_NAMESPACE::pixelTypeSize (_channelData[chan].type);
  2374. rleBufferSize += rleAmount;
  2375. }
  2376. break;
  2377. case UNKNOWN:
  2378. unknownBufferSize += numScanLines() * (_max[0] - _min[0] + 1) *
  2379. OPENEXR_IMF_NAMESPACE::pixelTypeSize (_channelData[chan].type);
  2380. break;
  2381. default:
  2382. throw IEX_NAMESPACE::NoImplExc ("Unhandled compression scheme case");
  2383. break;
  2384. }
  2385. }
  2386. //
  2387. // Also, since the results of the RLE are packed into
  2388. // the output buffer, we need the extra room there. But
  2389. // we're going to zlib compress() the data we pack,
  2390. // which could take slightly more space
  2391. //
  2392. maxOutBufferSize += (int)compressBound ((uLongf)rleBufferSize);
  2393. //
  2394. // And the same goes for the UNKNOWN data
  2395. //
  2396. maxOutBufferSize += (int)compressBound ((uLongf)unknownBufferSize);
  2397. //
  2398. // Allocate a zip/deflate compressor big enought to hold the DC data
  2399. // and include it's compressed results in the size requirements
  2400. // for our output buffer
  2401. //
  2402. if (_zip == 0)
  2403. _zip = new Zip (maxLossyDctDcSize * numLossyDctChans);
  2404. else if (_zip->maxRawSize() < maxLossyDctDcSize * numLossyDctChans)
  2405. {
  2406. delete _zip;
  2407. _zip = new Zip (maxLossyDctDcSize * numLossyDctChans);
  2408. }
  2409. maxOutBufferSize += _zip->maxCompressedSize();
  2410. //
  2411. // We also need to reserve space at the head of the buffer to
  2412. // write out the size of our various packed and compressed data.
  2413. //
  2414. maxOutBufferSize += NUM_SIZES_SINGLE * sizeof (Int64);
  2415. //
  2416. // Later, we're going to hijack outBuffer for the result of
  2417. // both encoding and decoding. So it needs to be big enough
  2418. // to hold either a buffers' worth of uncompressed or
  2419. // compressed data
  2420. //
  2421. // For encoding, we'll need _outBuffer to hold maxOutBufferSize bytes,
  2422. // but for decoding, we only need it to be maxScanLineSize*numScanLines.
  2423. // Cache the max size for now, and alloc the buffer when we either
  2424. // encode or decode.
  2425. //
  2426. outBufferSize = maxOutBufferSize;
  2427. //
  2428. // _packedAcBuffer holds the quantized DCT coefficients prior
  2429. // to Huffman encoding
  2430. //
  2431. if (maxLossyDctAcSize * numLossyDctChans > _packedAcBufferSize)
  2432. {
  2433. _packedAcBufferSize = maxLossyDctAcSize * numLossyDctChans;
  2434. if (_packedAcBuffer != 0)
  2435. delete[] _packedAcBuffer;
  2436. _packedAcBuffer = new char[_packedAcBufferSize];
  2437. }
  2438. //
  2439. // _packedDcBuffer holds one quantized DCT coef per 8x8 block
  2440. //
  2441. if (maxLossyDctDcSize * numLossyDctChans > _packedDcBufferSize)
  2442. {
  2443. _packedDcBufferSize = maxLossyDctDcSize * numLossyDctChans;
  2444. if (_packedDcBuffer != 0)
  2445. delete[] _packedDcBuffer;
  2446. _packedDcBuffer = new char[_packedDcBufferSize];
  2447. }
  2448. if (rleBufferSize > _rleBufferSize)
  2449. {
  2450. _rleBufferSize = rleBufferSize;
  2451. if (_rleBuffer != 0)
  2452. delete[] _rleBuffer;
  2453. _rleBuffer = new char[rleBufferSize];
  2454. }
  2455. //
  2456. // The planar uncompressed buffer will hold float data for LOSSY_DCT
  2457. // compressed values, and whatever the native type is for other
  2458. // channels. We're going to use this to hold data in a planar
  2459. // format, as opposed to the native interleaved format we take
  2460. // into compress() and give back from uncompress().
  2461. //
  2462. // This also makes it easier to compress the UNKNOWN and RLE data
  2463. // all in one swoop (for each compression scheme).
  2464. //
  2465. int planarUncBufferSize[NUM_COMPRESSOR_SCHEMES];
  2466. for (int i=0; i<NUM_COMPRESSOR_SCHEMES; ++i)
  2467. planarUncBufferSize[i] = 0;
  2468. for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
  2469. {
  2470. switch (_channelData[chan].compression)
  2471. {
  2472. case LOSSY_DCT:
  2473. break;
  2474. case RLE:
  2475. planarUncBufferSize[RLE] +=
  2476. numScanLines() * (_max[0] - _min[0] + 1) *
  2477. OPENEXR_IMF_NAMESPACE::pixelTypeSize (_channelData[chan].type);
  2478. break;
  2479. case UNKNOWN:
  2480. planarUncBufferSize[UNKNOWN] +=
  2481. numScanLines() * (_max[0] - _min[0] + 1) *
  2482. OPENEXR_IMF_NAMESPACE::pixelTypeSize (_channelData[chan].type);
  2483. break;
  2484. default:
  2485. throw IEX_NAMESPACE::NoImplExc ("Unhandled compression scheme case");
  2486. break;
  2487. }
  2488. }
  2489. //
  2490. // UNKNOWN data is going to be zlib compressed, which needs
  2491. // a little extra headroom
  2492. //
  2493. if (planarUncBufferSize[UNKNOWN] > 0)
  2494. {
  2495. planarUncBufferSize[UNKNOWN] =
  2496. compressBound ((uLongf)planarUncBufferSize[UNKNOWN]);
  2497. }
  2498. for (int i = 0; i < NUM_COMPRESSOR_SCHEMES; ++i)
  2499. {
  2500. if (planarUncBufferSize[i] > _planarUncBufferSize[i])
  2501. {
  2502. _planarUncBufferSize[i] = planarUncBufferSize[i];
  2503. if (_planarUncBuffer[i] != 0)
  2504. delete[] _planarUncBuffer[i];
  2505. _planarUncBuffer[i] = new char[planarUncBufferSize[i]];
  2506. }
  2507. }
  2508. }
  2509. //
  2510. // Setup channel classification rules to use when writing files
  2511. //
  2512. void
  2513. DwaCompressor::initializeDefaultChannelRules ()
  2514. {
  2515. _channelRules.clear();
  2516. _channelRules.push_back (Classifier ("R", LOSSY_DCT, HALF, 0, false));
  2517. _channelRules.push_back (Classifier ("R", LOSSY_DCT, FLOAT, 0, false));
  2518. _channelRules.push_back (Classifier ("G", LOSSY_DCT, HALF, 1, false));
  2519. _channelRules.push_back (Classifier ("G", LOSSY_DCT, FLOAT, 1, false));
  2520. _channelRules.push_back (Classifier ("B", LOSSY_DCT, HALF, 2, false));
  2521. _channelRules.push_back (Classifier ("B", LOSSY_DCT, FLOAT, 2, false));
  2522. _channelRules.push_back (Classifier ("Y", LOSSY_DCT, HALF, -1, false));
  2523. _channelRules.push_back (Classifier ("Y", LOSSY_DCT, FLOAT, -1, false));
  2524. _channelRules.push_back (Classifier ("BY", LOSSY_DCT, HALF, -1, false));
  2525. _channelRules.push_back (Classifier ("BY", LOSSY_DCT, FLOAT, -1, false));
  2526. _channelRules.push_back (Classifier ("RY", LOSSY_DCT, HALF, -1, false));
  2527. _channelRules.push_back (Classifier ("RY", LOSSY_DCT, FLOAT, -1, false));
  2528. _channelRules.push_back (Classifier ("A", RLE, UINT, -1, false));
  2529. _channelRules.push_back (Classifier ("A", RLE, HALF, -1, false));
  2530. _channelRules.push_back (Classifier ("A", RLE, FLOAT, -1, false));
  2531. }
  2532. //
  2533. // Setup channel classification rules when reading files with VERSION < 2
  2534. //
  2535. void
  2536. DwaCompressor::initializeLegacyChannelRules ()
  2537. {
  2538. _channelRules.clear();
  2539. _channelRules.push_back (Classifier ("r", LOSSY_DCT, HALF, 0, true));
  2540. _channelRules.push_back (Classifier ("r", LOSSY_DCT, FLOAT, 0, true));
  2541. _channelRules.push_back (Classifier ("red", LOSSY_DCT, HALF, 0, true));
  2542. _channelRules.push_back (Classifier ("red", LOSSY_DCT, FLOAT, 0, true));
  2543. _channelRules.push_back (Classifier ("g", LOSSY_DCT, HALF, 1, true));
  2544. _channelRules.push_back (Classifier ("g", LOSSY_DCT, FLOAT, 1, true));
  2545. _channelRules.push_back (Classifier ("grn", LOSSY_DCT, HALF, 1, true));
  2546. _channelRules.push_back (Classifier ("grn", LOSSY_DCT, FLOAT, 1, true));
  2547. _channelRules.push_back (Classifier ("green", LOSSY_DCT, HALF, 1, true));
  2548. _channelRules.push_back (Classifier ("green", LOSSY_DCT, FLOAT, 1, true));
  2549. _channelRules.push_back (Classifier ("b", LOSSY_DCT, HALF, 2, true));
  2550. _channelRules.push_back (Classifier ("b", LOSSY_DCT, FLOAT, 2, true));
  2551. _channelRules.push_back (Classifier ("blu", LOSSY_DCT, HALF, 2, true));
  2552. _channelRules.push_back (Classifier ("blu", LOSSY_DCT, FLOAT, 2, true));
  2553. _channelRules.push_back (Classifier ("blue", LOSSY_DCT, HALF, 2, true));
  2554. _channelRules.push_back (Classifier ("blue", LOSSY_DCT, FLOAT, 2, true));
  2555. _channelRules.push_back (Classifier ("y", LOSSY_DCT, HALF, -1, true));
  2556. _channelRules.push_back (Classifier ("y", LOSSY_DCT, FLOAT, -1, true));
  2557. _channelRules.push_back (Classifier ("by", LOSSY_DCT, HALF, -1, true));
  2558. _channelRules.push_back (Classifier ("by", LOSSY_DCT, FLOAT, -1, true));
  2559. _channelRules.push_back (Classifier ("ry", LOSSY_DCT, HALF, -1, true));
  2560. _channelRules.push_back (Classifier ("ry", LOSSY_DCT, FLOAT, -1, true));
  2561. _channelRules.push_back (Classifier ("a", RLE, UINT, -1, true));
  2562. _channelRules.push_back (Classifier ("a", RLE, HALF, -1, true));
  2563. _channelRules.push_back (Classifier ("a", RLE, FLOAT, -1, true));
  2564. }
  2565. //
  2566. // Given a set of rules and ChannelData, figure out which rules apply
  2567. //
  2568. void
  2569. DwaCompressor::relevantChannelRules (std::vector<Classifier> &rules) const
  2570. {
  2571. rules.clear();
  2572. std::vector<std::string> suffixes;
  2573. for (size_t cd = 0; cd < _channelData.size(); ++cd)
  2574. {
  2575. std::string suffix = _channelData[cd].name;
  2576. size_t lastDot = suffix.find_last_of ('.');
  2577. if (lastDot != std::string::npos)
  2578. suffix = suffix.substr (lastDot+1, std::string::npos);
  2579. suffixes.push_back(suffix);
  2580. }
  2581. for (size_t i = 0; i < _channelRules.size(); ++i)
  2582. {
  2583. for (size_t cd = 0; cd < _channelData.size(); ++cd)
  2584. {
  2585. if (_channelRules[i].match (suffixes[cd], _channelData[cd].type ))
  2586. {
  2587. rules.push_back (_channelRules[i]);
  2588. break;
  2589. }
  2590. }
  2591. }
  2592. }
  2593. //
  2594. // Take our initial list of channels, and cache the contents.
  2595. //
  2596. // Determine approprate compression schemes for each channel,
  2597. // and figure out which sets should potentially be CSC'ed
  2598. // prior to lossy compression.
  2599. //
  2600. void
  2601. DwaCompressor::classifyChannels
  2602. (ChannelList channels,
  2603. std::vector<ChannelData> &chanData,
  2604. std::vector<CscChannelSet> &cscData)
  2605. {
  2606. //
  2607. // prefixMap used to map channel name prefixes to
  2608. // potential CSC-able sets of channels.
  2609. //
  2610. std::map<std::string, DwaCompressor::CscChannelSet> prefixMap;
  2611. std::vector<DwaCompressor::CscChannelSet> tmpCscSet;
  2612. unsigned int numChan = 0;
  2613. for (ChannelList::Iterator c = channels.begin(); c != channels.end(); ++c)
  2614. numChan++;
  2615. if (numChan)
  2616. chanData.resize (numChan);
  2617. //
  2618. // Cache the relevant data from the channel structs.
  2619. //
  2620. unsigned int offset = 0;
  2621. for (ChannelList::Iterator c = channels.begin(); c != channels.end(); ++c)
  2622. {
  2623. chanData[offset].name = std::string (c.name());
  2624. chanData[offset].compression = UNKNOWN;
  2625. chanData[offset].xSampling = c.channel().xSampling;
  2626. chanData[offset].ySampling = c.channel().ySampling;
  2627. chanData[offset].type = c.channel().type;
  2628. chanData[offset].pLinear = c.channel().pLinear;
  2629. offset++;
  2630. }
  2631. //
  2632. // Try and figure out which channels should be
  2633. // compressed by which means.
  2634. //
  2635. for (offset = 0; offset<numChan; ++offset)
  2636. {
  2637. std::string prefix = "";
  2638. std::string suffix = chanData[offset].name;
  2639. size_t lastDot = suffix.find_last_of ('.');
  2640. if (lastDot != std::string::npos)
  2641. {
  2642. prefix = suffix.substr (0, lastDot);
  2643. suffix = suffix.substr (lastDot+1, std::string::npos);
  2644. }
  2645. //
  2646. // Make sure we have an entry in our CSC set map
  2647. //
  2648. std::map<std::string, DwaCompressor::CscChannelSet>::iterator
  2649. theSet = prefixMap.find (prefix);
  2650. if (theSet == prefixMap.end())
  2651. {
  2652. DwaCompressor::CscChannelSet tmpSet;
  2653. tmpSet.idx[0] =
  2654. tmpSet.idx[1] =
  2655. tmpSet.idx[2] = -1;
  2656. prefixMap[prefix] = tmpSet;
  2657. }
  2658. //
  2659. // Check the suffix against the list of classifications
  2660. // we defined previously. If the _cscIdx is not negative,
  2661. // it indicates that we should be part of a CSC group.
  2662. //
  2663. for (std::vector<Classifier>::iterator i = _channelRules.begin();
  2664. i != _channelRules.end();
  2665. ++i)
  2666. {
  2667. if ( i->match(suffix, chanData[offset].type) )
  2668. {
  2669. chanData[offset].compression = i->_scheme;
  2670. if ( i->_cscIdx >= 0)
  2671. prefixMap[prefix].idx[i->_cscIdx] = offset;
  2672. }
  2673. }
  2674. }
  2675. //
  2676. // Finally, try and find RGB sets of channels which
  2677. // can be CSC'ed to a Y'CbCr space prior to loss, for
  2678. // better compression.
  2679. //
  2680. // Walk over our set of candidates, and see who has
  2681. // all three channels defined (and has common sampling
  2682. // patterns, etc).
  2683. //
  2684. for (std::map<std::string, DwaCompressor::CscChannelSet>::iterator
  2685. theItem = prefixMap.begin(); theItem != prefixMap.end();
  2686. ++theItem)
  2687. {
  2688. int red = (*theItem).second.idx[0];
  2689. int grn = (*theItem).second.idx[1];
  2690. int blu = (*theItem).second.idx[2];
  2691. if ((red < 0) || (grn < 0) || (blu < 0))
  2692. continue;
  2693. if ((chanData[red].xSampling != chanData[grn].xSampling) ||
  2694. (chanData[red].xSampling != chanData[blu].xSampling) ||
  2695. (chanData[grn].xSampling != chanData[blu].xSampling) ||
  2696. (chanData[red].ySampling != chanData[grn].ySampling) ||
  2697. (chanData[red].ySampling != chanData[blu].ySampling) ||
  2698. (chanData[grn].ySampling != chanData[blu].ySampling))
  2699. {
  2700. continue;
  2701. }
  2702. tmpCscSet.push_back ((*theItem).second);
  2703. }
  2704. size_t numCsc = tmpCscSet.size();
  2705. if (numCsc)
  2706. cscData.resize(numCsc);
  2707. for (offset = 0; offset < numCsc; ++offset)
  2708. cscData[offset] = tmpCscSet[offset];
  2709. }
  2710. //
  2711. // Setup some buffer pointers, determine channel sizes, things
  2712. // like that.
  2713. //
  2714. void
  2715. DwaCompressor::setupChannelData (int minX, int minY, int maxX, int maxY)
  2716. {
  2717. char *planarUncBuffer[NUM_COMPRESSOR_SCHEMES];
  2718. for (int i=0; i<NUM_COMPRESSOR_SCHEMES; ++i)
  2719. {
  2720. planarUncBuffer[i] = 0;
  2721. if (_planarUncBuffer[i])
  2722. planarUncBuffer[i] = _planarUncBuffer[i];
  2723. }
  2724. for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
  2725. {
  2726. ChannelData *cd = &_channelData[chan];
  2727. cd->width = OPENEXR_IMF_NAMESPACE::numSamples (cd->xSampling, minX, maxX);
  2728. cd->height = OPENEXR_IMF_NAMESPACE::numSamples (cd->ySampling, minY, maxY);
  2729. cd->planarUncSize =
  2730. cd->width * cd->height * OPENEXR_IMF_NAMESPACE::pixelTypeSize (cd->type);
  2731. cd->planarUncBuffer = planarUncBuffer[cd->compression];
  2732. cd->planarUncBufferEnd = cd->planarUncBuffer;
  2733. cd->planarUncRle[0] = cd->planarUncBuffer;
  2734. cd->planarUncRleEnd[0] = cd->planarUncRle[0];
  2735. for (int byte = 1; byte < OPENEXR_IMF_NAMESPACE::pixelTypeSize(cd->type); ++byte)
  2736. {
  2737. cd->planarUncRle[byte] =
  2738. cd->planarUncRle[byte-1] + cd->width * cd->height;
  2739. cd->planarUncRleEnd[byte] =
  2740. cd->planarUncRle[byte];
  2741. }
  2742. cd->planarUncType = cd->type;
  2743. if (cd->compression == LOSSY_DCT)
  2744. {
  2745. cd->planarUncType = FLOAT;
  2746. }
  2747. else
  2748. {
  2749. planarUncBuffer[cd->compression] +=
  2750. cd->width * cd->height * OPENEXR_IMF_NAMESPACE::pixelTypeSize (cd->planarUncType);
  2751. }
  2752. }
  2753. }
  2754. OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_EXIT