dwaLookups.cpp 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650
  1. ///////////////////////////////////////////////////////////////////////////
  2. //
  3. // Copyright (c) 2009-2014 DreamWorks Animation LLC.
  4. //
  5. // All rights reserved.
  6. //
  7. // Redistribution and use in source and binary forms, with or without
  8. // modification, are permitted provided that the following conditions are
  9. // met:
  10. // * Redistributions of source code must retain the above copyright
  11. // notice, this list of conditions and the following disclaimer.
  12. // * Redistributions in binary form must reproduce the above
  13. // copyright notice, this list of conditions and the following disclaimer
  14. // in the documentation and/or other materials provided with the
  15. // distribution.
  16. // * Neither the name of DreamWorks Animation nor the names of
  17. // its contributors may be used to endorse or promote products derived
  18. // from this software without specific prior written permission.
  19. //
  20. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  23. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  24. // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  25. // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  26. // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  27. // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  28. // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  29. // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  30. // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31. //
  32. ///////////////////////////////////////////////////////////////////////////
  33. #define OPENEXR_BUILTIN_TABLES
  34. //
  35. // A program to generate various acceleration lookup tables
  36. // for Imf::DwaCompressor
  37. //
  38. #include <cstddef>
  39. #include <stdio.h>
  40. #include <stdlib.h>
  41. #include <math.h>
  42. #include <vector>
  43. #include <OpenEXRConfig.h>
  44. #ifndef OPENEXR_BUILTIN_TABLES
  45. #ifdef OPENEXR_IMF_HAVE_SYSCONF_NPROCESSORS_ONLN
  46. #include <unistd.h>
  47. #endif
  48. #endif // OPENEXR_BUILTIN_TABLES
  49. #include <half.h>
  50. #include <IlmThread.h>
  51. #include <IlmThreadSemaphore.h>
  52. #include <ImfIO.h>
  53. #include <ImfXdr.h>
  54. #include "ImfNamespace.h"
  55. using namespace OPENEXR_IMF_NAMESPACE;
  56. namespace {
  57. #ifdef OPENEXR_BUILTIN_TABLES
  58. static unsigned short dwaCompressorNoOp[0x10000] = {};
  59. static unsigned short dwaCompressorToLinear[0x10000] = {};
  60. static unsigned short dwaCompressorToNonlinear[0x10000] = {};
  61. //static unsigned int closestDataOffset[0x10000] = {};
  62. //static unsigned short closestData[0x80000] = {};
  63. #else
  64. class LutHeaderWorker
  65. {
  66. public:
  67. class Runner : public ILMTHREAD_NAMESPACE::Thread
  68. {
  69. public:
  70. Runner(LutHeaderWorker &worker, bool output):
  71. ILMTHREAD_NAMESPACE::Thread(),
  72. _worker(worker),
  73. _output(output)
  74. {
  75. start();
  76. }
  77. virtual ~Runner()
  78. {
  79. _semaphore.wait();
  80. }
  81. virtual void run()
  82. {
  83. _semaphore.post();
  84. _worker.run(_output);
  85. }
  86. private:
  87. LutHeaderWorker &_worker;
  88. bool _output;
  89. ILMTHREAD_NAMESPACE::Semaphore _semaphore;
  90. }; // class LutHeaderWorker::Runner
  91. LutHeaderWorker(size_t startValue,
  92. size_t endValue):
  93. _lastCandidateCount(0),
  94. _startValue(startValue),
  95. _endValue(endValue),
  96. _numElements(0),
  97. _offset(new size_t[numValues()]),
  98. _elements(new unsigned short[1024*1024*2])
  99. {
  100. }
  101. ~LutHeaderWorker()
  102. {
  103. delete[] _offset;
  104. delete[] _elements;
  105. }
  106. size_t lastCandidateCount() const
  107. {
  108. return _lastCandidateCount;
  109. }
  110. size_t numValues() const
  111. {
  112. return _endValue - _startValue;
  113. }
  114. size_t numElements() const
  115. {
  116. return _numElements;
  117. }
  118. const size_t* offset() const
  119. {
  120. return _offset;
  121. }
  122. const unsigned short* elements() const
  123. {
  124. return _elements;
  125. }
  126. void run(bool outputProgress)
  127. {
  128. half candidate[16];
  129. int candidateCount = 0;
  130. for (size_t input=_startValue; input<_endValue; ++input) {
  131. if (outputProgress) {
  132. #ifdef __GNUC__
  133. if (input % 100 == 0) {
  134. fprintf(stderr,
  135. " Building acceleration for DwaCompressor, %.2f %% %c",
  136. 100.*(float)input/(float)numValues(), 13);
  137. }
  138. #else
  139. if (input % 1000 == 0) {
  140. fprintf(stderr,
  141. " Building acceleration for DwaCompressor, %.2f %%\n",
  142. 100.*(float)input/(float)numValues());
  143. }
  144. #endif
  145. }
  146. int numSetBits = countSetBits(input);
  147. half inputHalf, closestHalf;
  148. inputHalf.setBits(input);
  149. _offset[input - _startValue] = _numElements;
  150. // Gather candidates
  151. candidateCount = 0;
  152. for (int targetNumSetBits=numSetBits-1; targetNumSetBits>=0;
  153. --targetNumSetBits) {
  154. bool valueFound = false;
  155. for (int i=0; i<65536; ++i) {
  156. if (countSetBits(i) != targetNumSetBits) continue;
  157. if (!valueFound) {
  158. closestHalf.setBits(i);
  159. valueFound = true;
  160. } else {
  161. half tmpHalf;
  162. tmpHalf.setBits(i);
  163. if (fabs((float)inputHalf - (float)tmpHalf) <
  164. fabs((float)inputHalf - (float)closestHalf)) {
  165. closestHalf = tmpHalf;
  166. }
  167. }
  168. }
  169. if (valueFound == false) {
  170. fprintf(stderr, "bork bork bork!\n");
  171. }
  172. candidate[candidateCount] = closestHalf;
  173. candidateCount++;
  174. }
  175. // Sort candidates by increasing number of bits set
  176. for (int i=0; i<candidateCount; ++i) {
  177. for (int j=i+1; j<candidateCount; ++j) {
  178. int iCnt = countSetBits(candidate[i].bits());
  179. int jCnt = countSetBits(candidate[j].bits());
  180. if (jCnt < iCnt) {
  181. half tmp = candidate[i];
  182. candidate[i] = candidate[j];
  183. candidate[j] = tmp;
  184. }
  185. }
  186. }
  187. // Copy candidates to the data buffer;
  188. for (int i=0; i<candidateCount; ++i) {
  189. _elements[_numElements] = candidate[i].bits();
  190. _numElements++;
  191. }
  192. if (input == _endValue-1) {
  193. _lastCandidateCount = candidateCount;
  194. }
  195. }
  196. }
  197. private:
  198. size_t _lastCandidateCount;
  199. size_t _startValue;
  200. size_t _endValue;
  201. size_t _numElements;
  202. size_t *_offset;
  203. unsigned short *_elements;
  204. //
  205. // Precomputing the bit count runs faster than using
  206. // the builtin instruction, at least in one case..
  207. //
  208. // Precomputing 8-bits is no slower than 16-bits,
  209. // and saves a fair bit of overhead..
  210. //
  211. int countSetBits(unsigned short src)
  212. {
  213. static const unsigned short numBitsSet[256] =
  214. {
  215. 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
  216. 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  217. 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  218. 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  219. 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  220. 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  221. 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  222. 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  223. 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  224. 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  225. 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  226. 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  227. 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  228. 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  229. 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  230. 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
  231. };
  232. return numBitsSet[src & 0xff] + numBitsSet[src >> 8];
  233. }
  234. }; // class LutHeaderWorker
  235. #endif // OPENEXR_BUILTIN_TABLES
  236. } // namespace
  237. //
  238. // Generate a no-op LUT, to cut down in conditional branches
  239. //
  240. static void
  241. generateNoop()
  242. {
  243. #ifndef OPENEXR_BUILTIN_TABLES
  244. printf("const unsigned short dwaCompressorNoOp[] = \n");
  245. printf("{");
  246. #endif // OPENEXR_BUILTIN_TABLES
  247. for (int i=0; i<65536; ++i) {
  248. #ifndef OPENEXR_BUILTIN_TABLES
  249. if (i % 8 == 0) {
  250. printf("\n ");
  251. }
  252. #endif // OPENEXR_BUILTIN_TABLES
  253. unsigned short dst;
  254. char *tmp = (char *)(&dst);
  255. unsigned short src = (unsigned short)i;
  256. Xdr::write <CharPtrIO> (tmp, src);
  257. #ifndef OPENEXR_BUILTIN_TABLES
  258. printf("0x%04x, ", dst);
  259. #else
  260. dwaCompressorNoOp[i] = dst;
  261. #endif // OPENEXR_BUILTIN_TABLES
  262. }
  263. #ifndef OPENEXR_BUILTIN_TABLES
  264. printf("\n};\n");
  265. #endif // OPENEXR_BUILTIN_TABLES
  266. }
  267. //
  268. // Nonlinearly encode luminance. For values below 1.0, we want
  269. // to use a gamma 2.2 function to match what is fairly common
  270. // for storing output referred. However, > 1, gamma functions blow up,
  271. // and log functions are much better behaved. We could use a log
  272. // function everywhere, but it tends to over-sample dark
  273. // regions and undersample the brighter regions, when
  274. // compared to the way real devices reproduce values.
  275. //
  276. // So, above 1, use a log function which is a smooth blend
  277. // into the gamma function.
  278. //
  279. // Nonlinear(linear) =
  280. //
  281. // linear^(1./2.2) / linear <= 1.0
  282. // |
  283. // ln(linear)/ln(e^2.2) + 1 \ otherwise
  284. //
  285. //
  286. // toNonlinear[] needs to take in XDR format half float values,
  287. // and output NATIVE format float.
  288. //
  289. // toLinear[] does the opposite - takes in NATIVE half and
  290. // outputs XDR half values.
  291. //
  292. static void
  293. generateToLinear()
  294. {
  295. #ifndef OPENEXR_BUILTIN_TABLES
  296. unsigned short toLinear[65536];
  297. #else
  298. unsigned short* toLinear = dwaCompressorToLinear;
  299. #endif // OPENEXR_BUILTIN_TABLES
  300. toLinear[0] = 0;
  301. for (int i=1; i<65536; ++i) {
  302. half h;
  303. float sign = 1;
  304. float logBase = pow(2.7182818, 2.2);
  305. // map NaN and inf to 0
  306. if ((i & 0x7c00) == 0x7c00) {
  307. toLinear[i] = 0;
  308. continue;
  309. }
  310. //
  311. // _toLinear - assume i is NATIVE, but our output needs
  312. // to get flipped to XDR
  313. //
  314. h.setBits(i);
  315. sign = 1;
  316. if ((float)h < 0) {
  317. sign = -1;
  318. }
  319. if ( fabs( (float)h) <= 1.0 ) {
  320. h = (half)(sign * pow((float)fabs((float)h), 2.2f));
  321. } else {
  322. h = (half)(sign * pow(logBase, (float)(fabs((float)h) - 1.0)));
  323. }
  324. {
  325. char *tmp = (char *)(&toLinear[i]);
  326. Xdr::write <CharPtrIO> ( tmp, h.bits());
  327. }
  328. }
  329. #ifndef OPENEXR_BUILTIN_TABLES
  330. printf("const unsigned short dwaCompressorToLinear[] = \n");
  331. printf("{");
  332. for (int i=0; i<65536; ++i) {
  333. if (i % 8 == 0) {
  334. printf("\n ");
  335. }
  336. printf("0x%04x, ", toLinear[i]);
  337. }
  338. printf("\n};\n");
  339. #endif // OPENEXR_BUILTIN_TABLES
  340. }
  341. static void
  342. generateToNonlinear()
  343. {
  344. #ifndef OPENEXR_BUILTIN_TABLES
  345. unsigned short toNonlinear[65536];
  346. #else
  347. unsigned short* toNonlinear = dwaCompressorToNonlinear;
  348. #endif // OPENEXR_BUILTIN_TABLES
  349. toNonlinear[0] = 0;
  350. for (int i=1; i<65536; ++i) {
  351. unsigned short usNative, usXdr;
  352. half h;
  353. float sign = 1;
  354. float logBase = pow(2.7182818, 2.2);
  355. usXdr = i;
  356. {
  357. const char *tmp = (char *)(&usXdr);
  358. Xdr::read<CharPtrIO>(tmp, usNative);
  359. }
  360. // map NaN and inf to 0
  361. if ((usNative & 0x7c00) == 0x7c00) {
  362. toNonlinear[i] = 0;
  363. continue;
  364. }
  365. //
  366. // toNonlinear - assume i is XDR
  367. //
  368. h.setBits(usNative);
  369. sign = 1;
  370. if ((float)h < 0) {
  371. sign = -1;
  372. }
  373. if ( fabs( (float)h ) <= 1.0) {
  374. h = (half)(sign * pow(fabs((float)h), 1.f/2.2f));
  375. } else {
  376. h = (half)(sign * ( log(fabs((float)h)) / log(logBase) + 1.0) );
  377. }
  378. toNonlinear[i] = h.bits();
  379. }
  380. #ifndef OPENEXR_BUILTIN_TABLES
  381. printf("const unsigned short dwaCompressorToNonlinear[] = \n");
  382. printf("{");
  383. for (int i=0; i<65536; ++i) {
  384. if (i % 8 == 0) {
  385. printf("\n ");
  386. }
  387. printf("0x%04x, ", toNonlinear[i]);
  388. }
  389. printf("\n};\n");
  390. #endif // OPENEXR_BUILTIN_TABLES
  391. }
  392. #ifndef OPENEXR_BUILTIN_TABLES
  393. //
  394. // Attempt to get available CPUs in a somewhat portable way.
  395. //
  396. int
  397. cpuCount()
  398. {
  399. if (!ILMTHREAD_NAMESPACE::supportsThreads()) return 1;
  400. int cpuCount = 1;
  401. #if defined (OPENEXR_IMF_HAVE_SYSCONF_NPROCESSORS_ONLN)
  402. cpuCount = sysconf(_SC_NPROCESSORS_ONLN);
  403. #elif defined (_WIN32)
  404. SYSTEM_INFO sysinfo;
  405. GetSystemInfo( &sysinfo );
  406. cpuCount = sysinfo.dwNumberOfProcessors;
  407. #endif
  408. if (cpuCount < 1) cpuCount = 1;
  409. return cpuCount;
  410. }
  411. //
  412. // Generate acceleration luts for the quantization.
  413. //
  414. // For each possible input value, we want to find the closest numbers
  415. // which have one fewer bits set than before.
  416. //
  417. // This gives us num_bits(input)-1 values per input. If we alloc
  418. // space for everything, that's like a 2MB table. We can do better
  419. // by compressing all the values to be contigious and using offset
  420. // pointers.
  421. //
  422. // After we've found the candidates with fewer bits set, sort them
  423. // based on increasing numbers of bits set. This way, on quantize(),
  424. // we can scan through the list and halt once we find the first
  425. // candidate within the error range. For small values that can
  426. // be quantized to 0, 0 is the first value tested and the search
  427. // can exit fairly quickly.
  428. //
  429. void
  430. generateLutHeader()
  431. {
  432. std::vector<LutHeaderWorker*> workers;
  433. size_t numWorkers = cpuCount();
  434. size_t workerInterval = 65536 / numWorkers;
  435. for (size_t i=0; i<numWorkers; ++i) {
  436. if (i != numWorkers-1) {
  437. workers.push_back( new LutHeaderWorker( i *workerInterval,
  438. (i+1)*workerInterval) );
  439. } else {
  440. workers.push_back( new LutHeaderWorker(i*workerInterval, 65536) );
  441. }
  442. }
  443. if (ILMTHREAD_NAMESPACE::supportsThreads()) {
  444. std::vector<LutHeaderWorker::Runner*> runners;
  445. for (size_t i=0; i<workers.size(); ++i) {
  446. runners.push_back( new LutHeaderWorker::Runner(*workers[i], (i==0)) );
  447. }
  448. for (size_t i=0; i<workers.size(); ++i) {
  449. delete runners[i];
  450. }
  451. } else {
  452. for (size_t i=0; i<workers.size(); ++i) {
  453. workers[i]->run(i == 0);
  454. }
  455. }
  456. printf("static unsigned int closestDataOffset[] = {\n");
  457. int offsetIdx = 0;
  458. int offsetPrev = 0;
  459. for (size_t i=0; i<workers.size(); ++i) {
  460. for (size_t value=0; value<workers[i]->numValues(); ++value) {
  461. if (offsetIdx % 8 == 0) {
  462. printf(" ");
  463. }
  464. printf("%6lu, ", workers[i]->offset()[value] + offsetPrev);
  465. if (offsetIdx % 8 == 7) {
  466. printf("\n");
  467. }
  468. offsetIdx++;
  469. }
  470. offsetPrev += workers[i]->offset()[workers[i]->numValues()-1] +
  471. workers[i]->lastCandidateCount();
  472. }
  473. printf("};\n\n\n");
  474. printf("static unsigned short closestData[] = {\n");
  475. int elementIdx = 0;
  476. for (size_t i=0; i<workers.size(); ++i) {
  477. for (size_t element=0; element<workers[i]->numElements(); ++element) {
  478. if (elementIdx % 8 == 0) {
  479. printf(" ");
  480. }
  481. printf("%5d, ", workers[i]->elements()[element]);
  482. if (elementIdx % 8 == 7) {
  483. printf("\n");
  484. }
  485. elementIdx++;
  486. }
  487. }
  488. printf("};\n\n\n");
  489. for (size_t i=0; i<workers.size(); ++i) {
  490. delete workers[i];
  491. }
  492. }
  493. int
  494. main(int argc, char **argv)
  495. {
  496. printf("#include <cstddef>\n");
  497. printf("\n\n\n");
  498. generateNoop();
  499. printf("\n\n\n");
  500. generateToLinear();
  501. printf("\n\n\n");
  502. generateToNonlinear();
  503. printf("\n\n\n");
  504. generateLutHeader();
  505. return 0;
  506. }
  507. #else // OPENEXR_BUILTIN_TABLES
  508. #include "dwaLookups.h"
  509. OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_ENTER
  510. static void init_dwa_()
  511. {
  512. generateNoop();
  513. generateToLinear();
  514. generateToNonlinear();
  515. // N/A: generateLutHeader();
  516. }
  517. static inline void init_dwa()
  518. {
  519. static bool initialized = false;
  520. if (!initialized)
  521. {
  522. init_dwa_();
  523. initialized = true;
  524. }
  525. }
  526. const unsigned short* get_dwaCompressorNoOp()
  527. {
  528. init_dwa();
  529. return dwaCompressorNoOp;
  530. }
  531. const unsigned short* get_dwaCompressorToLinear()
  532. {
  533. init_dwa();
  534. return dwaCompressorToLinear;
  535. }
  536. const unsigned short* get_dwaCompressorToNonlinear()
  537. {
  538. init_dwa();
  539. return dwaCompressorToNonlinear;
  540. }
  541. OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_EXIT
  542. #endif // OPENEXR_BUILTIN_TABLES