opencl_kernels_test_gapi.hpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
  1. // This file is part of OpenCV project.
  2. // It is subject to the license terms in the LICENSE file found in the top-level directory
  3. // of this distribution and at http://opencv.org/license.html.
  4. //
  5. // Copyright (C) 2018 Intel Corporation
  6. #include <opencv2/core/ocl.hpp>
  7. #include <opencv2/core/ocl_genbase.hpp>
  8. #include <opencv2/core/opencl/ocl_defs.hpp>
  9. #ifdef HAVE_OPENCL
  10. const char* opencl_symm7x7_src =
  11. "#if BORDER_REPLICATE\n"
  12. "#define GET_BORDER(elem) (elem)\n"
  13. "#define SET_ALL(i, j) a0[i] = a0[j]; a1[i] = a1[j]; a2[i] = a2[j]; b[i] = b[j]; c0[i] = c0[j]; c1[i] = c1[j]; c2[i] = c2[j];\n"
  14. "#else\n"
  15. "#define GET_BORDER(elem) (BORDER_CONSTANT_VALUE)\n"
  16. "#define SET_ALL(i, j) a0[i] = a1[i] = a2[i] = c0[i] = c1[i] = c2[i] = BORDER_CONSTANT_VALUE; b[i] = BORDER_CONSTANT_VALUE;\n"
  17. "#endif\n"
  18. "#define GET_A0(id, x, l_edge, a1) ((x) <= (l_edge + 2) ? GET_BORDER(a1) : (((const __global uchar*)(id))[-3]))\n"
  19. "#define GET_A1(id, x, l_edge, a2) ((x) <= (l_edge + 1) ? GET_BORDER(a2) : (((const __global uchar*)(id))[-2]))\n"
  20. "#define GET_A2(id, x, l_edge, b) ((x) <= (l_edge) ? GET_BORDER(b.s0) : (((const __global uchar*)(id))[-1]))\n"
  21. "#define GET_C0(id, x, r_edge, b) ((x) >= (r_edge) ? GET_BORDER(b.s7) : (((const __global uchar*)(id))[8]))\n"
  22. "#define GET_C1(id, x, r_edge, c0) ((x) >= (r_edge - 1) ? GET_BORDER(c0) : (((const __global uchar*)(id))[8 + 1]))\n"
  23. "#define GET_C2(id, x, r_edge, c1) ((x) >= (r_edge - 2) ? GET_BORDER(c1) : (((const __global uchar*)(id))[8 + 2]))\n"
  24. "__kernel void symm_7x7_test(\n"
  25. "__global const uchar * srcptr,\n"
  26. "int srcStep, int srcEndX, int srcEndY,\n"
  27. "__global uchar * dstptr, int dstStep,\n"
  28. "int rows, int cols,\n"
  29. "int tile_y_coord,\n"
  30. "__constant int * coeff)\n"
  31. "{\n"
  32. "int lEdge = 0, rEdge = cols - 8;\n"
  33. "int x = (get_global_id(0) < cols/8) ? get_global_id(0) * 8: cols - 8;\n"
  34. "int y = get_global_id(1);\n"
  35. "int yd = min(3, tile_y_coord);\n"
  36. "int dst_id = mad24(y, dstStep, x);\n"
  37. "y+=yd;\n"
  38. "int src_id = mad24(y, srcStep, x);\n"
  39. "int y_limit = y + tile_y_coord;\n"
  40. "y_limit-=yd;\n"
  41. "const __global uchar* psrc = (const __global uchar*)(srcptr + src_id);\n"
  42. "__global uchar* pdst = (__global uchar*)(dstptr + dst_id);\n"
  43. "#define BSIZE (7)\n"
  44. "float a0[BSIZE]; float a1[BSIZE]; float a2[BSIZE];\n"
  45. "float8 b[BSIZE];\n"
  46. "float c0[BSIZE]; float c1[BSIZE]; float c2[BSIZE];\n"
  47. "b[3] = convert_float8(vload8(0, (const __global uchar*)psrc));\n"
  48. "if( (y_limit <=2 ) || (y_limit >= srcEndY - 3) || (x >= rEdge-2) || (x <= lEdge + 2) )\n"
  49. "{\n"
  50. "a2[3] = GET_A2(psrc, x, lEdge, b[3]);\n"
  51. "a1[3] = GET_A1(psrc, x, lEdge, a2[3]);\n"
  52. "a0[3] = GET_A0(psrc, x, lEdge, a1[3]);\n"
  53. "c0[3] = GET_C0(psrc, x, rEdge, b[3]);\n"
  54. "c1[3] = GET_C1(psrc, x, rEdge, c0[3]);\n"
  55. "c2[3] = GET_C2(psrc, x, rEdge, c1[3]);\n"
  56. "if(y_limit > 0)\n"
  57. "{\n"
  58. "b[2] = convert_float8(vload8(0, (const __global uchar*)(psrc - srcStep)));\n"
  59. "a2[2] = GET_A2(psrc - srcStep, x, lEdge, b[2]);\n"
  60. "a1[2] = GET_A1(psrc - srcStep, x, lEdge, a2[2]);\n"
  61. "a0[2] = GET_A0(psrc - srcStep, x, lEdge, a1[2]);\n"
  62. "c0[2] = GET_C0(psrc - srcStep, x, rEdge, b[2]);\n"
  63. "c1[2] = GET_C1(psrc - srcStep, x, rEdge, c0[2]);\n"
  64. "c2[2] = GET_C2(psrc - srcStep, x, rEdge, c1[2]);\n"
  65. "}\n"
  66. "else\n"
  67. "{\n"
  68. "SET_ALL(2, 3);\n"
  69. "}\n"
  70. "if( y_limit > 1 )\n"
  71. "{\n"
  72. "b[1] = convert_float8(vload8(0, (const __global uchar*)(psrc - srcStep*2)));\n"
  73. "a2[1] = GET_A2(psrc - srcStep*2, x, lEdge, b[1]);\n"
  74. "a1[1] = GET_A1(psrc - srcStep*2, x, lEdge, a2[1]);\n"
  75. "a0[1] = GET_A0(psrc - srcStep*2, x, lEdge, a1[1]);\n"
  76. "c0[1] = GET_C0(psrc - srcStep*2, x, rEdge, b[1]);\n"
  77. "c1[1] = GET_C1(psrc - srcStep*2, x, rEdge, c0[1]);\n"
  78. "c2[1] = GET_C2(psrc - srcStep*2, x, rEdge, c1[1]);\n"
  79. "}\n"
  80. "else\n"
  81. "{\n"
  82. "SET_ALL(1, 2);\n"
  83. "}\n"
  84. "if( y_limit > 2 )\n"
  85. "{\n"
  86. "b[0] = convert_float8(vload8(0, (const __global uchar*)(psrc - srcStep*3)));\n"
  87. "a2[0] = GET_A2(psrc - srcStep*3, x, lEdge, b[0]);\n"
  88. "a1[0] = GET_A1(psrc - srcStep*3, x, lEdge, a2[0]);\n"
  89. "a0[0] = GET_A0(psrc - srcStep*3, x, lEdge, a1[0]);\n"
  90. "c0[0] = GET_C0(psrc - srcStep*3, x, rEdge, b[0]);\n"
  91. "c1[0] = GET_C1(psrc - srcStep*3, x, rEdge, c0[0]);\n"
  92. "c2[0] = GET_C2(psrc - srcStep*3, x, rEdge, c1[0]);\n"
  93. "}\n"
  94. "else\n"
  95. "{\n"
  96. "SET_ALL(0, 1);\n"
  97. "}\n"
  98. "if( y_limit < srcEndY - 1 )\n"
  99. "{\n"
  100. "b[4] = convert_float8(vload8(0, (const __global uchar*)(psrc + srcStep)));\n"
  101. "a2[4] = GET_A2(psrc + srcStep, x, lEdge, b[4]);\n"
  102. "a1[4] = GET_A1(psrc + srcStep, x, lEdge, a2[4]);\n"
  103. "a0[4] = GET_A0(psrc + srcStep, x, lEdge, a1[4]);\n"
  104. "c0[4] = GET_C0(psrc + srcStep, x, rEdge, b[4]);\n"
  105. "c1[4] = GET_C1(psrc + srcStep, x, rEdge, c0[4]);\n"
  106. "c2[4] = GET_C2(psrc + srcStep, x, rEdge, c1[4]);\n"
  107. "}\n"
  108. "else\n"
  109. "{\n"
  110. "SET_ALL(4, 3);\n"
  111. "}\n"
  112. "if( y_limit < srcEndY - 2 )\n"
  113. "{\n"
  114. "b[5] = convert_float8(vload8(0, (const __global uchar*)(psrc + srcStep*2)));\n"
  115. "a2[5] = GET_A2(psrc + srcStep*2, x, lEdge, b[5]);\n"
  116. "a1[5] = GET_A1(psrc + srcStep*2, x, lEdge, a2[5]);\n"
  117. "a0[5] = GET_A0(psrc + srcStep*2, x, lEdge, a1[5]);\n"
  118. "c0[5] = GET_C0(psrc + srcStep*2, x, rEdge, b[5]);\n"
  119. "c1[5] = GET_C1(psrc + srcStep*2, x, rEdge, c0[5]);\n"
  120. "c2[5] = GET_C2(psrc + srcStep*2, x, rEdge, c1[5]);\n"
  121. "}\n"
  122. "else\n"
  123. "{\n"
  124. "SET_ALL(5, 4);\n"
  125. "}\n"
  126. "if( y_limit < srcEndY - 3 )\n"
  127. "{\n"
  128. "b[6] = convert_float8(vload8(0, (const __global uchar*)(psrc + srcStep*3)));\n"
  129. "a2[6] = GET_A2(psrc + srcStep*3, x, lEdge, b[6]);\n"
  130. "a1[6] = GET_A1(psrc + srcStep*3, x, lEdge, a2[6]);\n"
  131. "a0[6] = GET_A0(psrc + srcStep*3, x, lEdge, a1[6]);\n"
  132. "c0[6] = GET_C0(psrc + srcStep*3, x, rEdge, b[6]);\n"
  133. "c1[6] = GET_C1(psrc + srcStep*3, x, rEdge, c0[6]);\n"
  134. "c2[6] = GET_C2(psrc + srcStep*3, x, rEdge, c1[6]);\n"
  135. "}\n"
  136. "else\n"
  137. "{\n"
  138. "SET_ALL(6, 5);\n"
  139. "}\n"
  140. "}\n"
  141. "else\n"
  142. "{\n"
  143. "a2[3] = (((const __global uchar*)(psrc))[-1]);\n"
  144. "a1[3] = (((const __global uchar*)(psrc))[-2]);\n"
  145. "a0[3] = (((const __global uchar*)(psrc))[-3]);\n"
  146. "c0[3] = (((const __global uchar*)(psrc))[8]);\n"
  147. "c1[3] = (((const __global uchar*)(psrc))[8 + 1]);\n"
  148. "c2[3] = (((const __global uchar*)(psrc))[8 + 2]);\n"
  149. "b[2] = convert_float8(vload8(0, (const __global uchar*)(psrc - srcStep)));\n"
  150. "a2[2] = (((const __global uchar*)(psrc - srcStep))[-1]);\n"
  151. "a1[2] = (((const __global uchar*)(psrc - srcStep))[-2]);\n"
  152. "a0[2] = (((const __global uchar*)(psrc - srcStep))[-3]);\n"
  153. "c0[2] = (((const __global uchar*)(psrc - srcStep))[8]);\n"
  154. "c1[2] = (((const __global uchar*)(psrc - srcStep))[8 + 1]);\n"
  155. "c2[2] = (((const __global uchar*)(psrc - srcStep))[8 + 2]);\n"
  156. "b[1] = convert_float8(vload8(0, (const __global uchar*)(psrc - srcStep*2)));\n"
  157. "a2[1] = (((const __global uchar*)(psrc - srcStep*2))[-1]);\n"
  158. "a1[1] = (((const __global uchar*)(psrc - srcStep*2))[-2]);\n"
  159. "a0[1] = (((const __global uchar*)(psrc - srcStep*2))[-3]);\n"
  160. "c0[1] = (((const __global uchar*)(psrc - srcStep*2))[8]);\n"
  161. "c1[1] = (((const __global uchar*)(psrc - srcStep*2))[8 + 1]);\n"
  162. "c2[1] = (((const __global uchar*)(psrc - srcStep*2))[8 + 2]);\n"
  163. "b[0] = convert_float8(vload8(0, (const __global uchar*)(psrc - srcStep*3)));\n"
  164. "a2[0] = (((const __global uchar*)(psrc - srcStep*3))[-1]);\n"
  165. "a1[0] = (((const __global uchar*)(psrc - srcStep*3))[-2]);\n"
  166. "a0[0] = (((const __global uchar*)(psrc - srcStep*3))[-3]);\n"
  167. "c0[0] = (((const __global uchar*)(psrc - srcStep*3))[8]);\n"
  168. "c1[0] = (((const __global uchar*)(psrc - srcStep*3))[8 + 1]);\n"
  169. "c2[0] = (((const __global uchar*)(psrc - srcStep*3))[8 + 2]);\n"
  170. "b[4] = convert_float8(vload8(0, (const __global uchar*)(psrc + srcStep)));\n"
  171. "a2[4] = (((const __global uchar*)(psrc + srcStep))[-1]);\n"
  172. "a1[4] = (((const __global uchar*)(psrc + srcStep))[-2]);\n"
  173. "a0[4] = (((const __global uchar*)(psrc + srcStep))[-3]);\n"
  174. "c0[4] = (((const __global uchar*)(psrc + srcStep))[8]);\n"
  175. "c1[4] = (((const __global uchar*)(psrc + srcStep))[8 + 1]);\n"
  176. "c2[4] = (((const __global uchar*)(psrc + srcStep))[8 + 2]);\n"
  177. "b[5] = convert_float8(vload8(0, (const __global uchar*)(psrc + srcStep*2)));\n"
  178. "a2[5] = (((const __global uchar*)(psrc + srcStep*2))[-1]);\n"
  179. "a1[5] = (((const __global uchar*)(psrc + srcStep*2))[-2]);\n"
  180. "a0[5] = (((const __global uchar*)(psrc + srcStep*2))[-3]);\n"
  181. "c0[5] = (((const __global uchar*)(psrc + srcStep*2))[8]);\n"
  182. "c1[5] = (((const __global uchar*)(psrc + srcStep*2))[8 + 1]);\n"
  183. "c2[5] = (((const __global uchar*)(psrc + srcStep*2))[8 + 2]);\n"
  184. "b[6] = convert_float8(vload8(0, (const __global uchar*)(psrc + srcStep*3)));\n"
  185. "a2[6] = (((const __global uchar*)(psrc + srcStep*3))[-1]);\n"
  186. "a1[6] = (((const __global uchar*)(psrc + srcStep*3))[-2]);\n"
  187. "a0[6] = (((const __global uchar*)(psrc + srcStep*3))[-3]);\n"
  188. "c0[6] = (((const __global uchar*)(psrc + srcStep*3))[8]);\n"
  189. "c1[6] = (((const __global uchar*)(psrc + srcStep*3))[8 + 1]);\n"
  190. "c2[6] = (((const __global uchar*)(psrc + srcStep*3))[8 + 2]);\n"
  191. "}\n"
  192. "float a0_sum[3]; float a1_sum[3]; float a2_sum[3];\n"
  193. "float8 b_sum[3];\n"
  194. "float c0_sum[3]; float c1_sum[3]; float c2_sum[3];\n"
  195. "a0_sum[0] = a0[0] + a0[6];\n"
  196. "a0_sum[1] = a0[1] + a0[5];\n"
  197. "a0_sum[2] = a0[2] + a0[4];\n"
  198. "a1_sum[0] = a1[0] + a1[6];\n"
  199. "a1_sum[1] = a1[1] + a1[5];\n"
  200. "a1_sum[2] = a1[2] + a1[4];\n"
  201. "a2_sum[0] = a2[0] + a2[6];\n"
  202. "a2_sum[1] = a2[1] + a2[5];\n"
  203. "a2_sum[2] = a2[2] + a2[4];\n"
  204. "c0_sum[0] = c0[0] + c0[6];\n"
  205. "c0_sum[1] = c0[1] + c0[5];\n"
  206. "c0_sum[2] = c0[2] + c0[4];\n"
  207. "c1_sum[0] = c1[0] + c1[6];\n"
  208. "c1_sum[1] = c1[1] + c1[5];\n"
  209. "c1_sum[2] = c1[2] + c1[4];\n"
  210. "c2_sum[0] = c2[0] + c2[6];\n"
  211. "c2_sum[1] = c2[1] + c2[5];\n"
  212. "c2_sum[2] = c2[2] + c2[4];\n"
  213. "b_sum[0] = b[0] + b[6];\n"
  214. "b_sum[1] = b[1] + b[5];\n"
  215. "b_sum[2] = b[2] + b[4];\n"
  216. "float8 A = b[3];\n"
  217. "float8 intermediate = A * (float)coeff[0];\n"
  218. "float8 B = b_sum[2] +\n"
  219. "(float8)(a2[3], b[3].s0123, b[3].s456) +\n"
  220. "(float8)(b[3].s123, b[3].s4567, c0[3]);\n"
  221. "intermediate += B * (float)coeff[1];\n"
  222. "float8 C = (float8)(a2_sum[2], b_sum[2].s0123, b_sum[2].s456) +\n"
  223. "(float8)(b_sum[2].s123, b_sum[2].s4567, c0_sum[2]);\n"
  224. "intermediate += C * (float)coeff[2];\n"
  225. "float8 D = b_sum[1] +\n"
  226. "(float8)(a1[3], a2[3], b[3].s0123, b[3].s45) +\n"
  227. "(float8)(b[3].s23, b[3].s4567, c0[3], c1[3]);\n"
  228. "intermediate += D * (float)coeff[3];\n"
  229. "float8 E = (float8)(a2_sum[1], b_sum[1].s0123, b_sum[1].s456) +\n"
  230. "(float8)( b_sum[1].s123, b_sum[1].s4567, c0_sum[1]) +\n"
  231. "(float8)( a1_sum[2], a2_sum[2], b_sum[2].s0123, b_sum[2].s45) +\n"
  232. "(float8)( b_sum[2].s23, b_sum[2].s4567, c0_sum[2], c1_sum[2]);\n"
  233. "intermediate += E * (float)coeff[4];\n"
  234. "float8 F = (float8)(a1_sum[1], a2_sum[1], b_sum[1].s0123, b_sum[1].s45) +\n"
  235. "(float8)(b_sum[1].s23, b_sum[1].s4567, c0_sum[1], c1_sum[1]);\n"
  236. "intermediate += F * (float)coeff[5];\n"
  237. "float8 G = b_sum[0] +\n"
  238. "(float8)(a0[3], a1[3], a2[3], b[3].s0123, b[3].s4) +\n"
  239. "(float8)(b[3].s3, b[3].s4567, c0[3], c1[3], c2[3]);\n"
  240. "intermediate += G * (float)coeff[6];\n"
  241. "float8 H = (float8)(a2_sum[0], b_sum[0].s0123, b_sum[0].s456) +\n"
  242. "(float8)(b_sum[0].s123, b_sum[0].s4567, c0_sum[0]) +\n"
  243. "(float8)(a0_sum[2], a1_sum[2], a2_sum[2], b_sum[2].s0123, b_sum[2].s4) +\n"
  244. "(float8)(b_sum[2].s3, b_sum[2].s4567, c0_sum[2], c1_sum[2], c2_sum[2]);\n"
  245. "intermediate += H * (float)coeff[7];\n"
  246. "float8 I = (float8)(a1_sum[0], a2_sum[0], b_sum[0].s0123, b_sum[0].s45) +\n"
  247. "(float8)(b_sum[0].s23, b_sum[0].s4567, c0_sum[0], c1_sum[0]) +\n"
  248. "(float8)(a0_sum[1], a1_sum[1], a2_sum[1], b_sum[1].s0123, b_sum[1].s4) +\n"
  249. "(float8)(b_sum[1].s3, b_sum[1].s4567, c0_sum[1], c1_sum[1], c2_sum[1]);\n"
  250. "intermediate += I * (float)coeff[8];\n"
  251. "float8 J = (float8)(a0_sum[0], a1_sum[0], a2_sum[0], b_sum[0].s0123, b_sum[0].s4) +\n"
  252. "(float8)(b_sum[0].s3, b_sum[0].s4567, c0_sum[0], c1_sum[0], c2_sum[0]);\n"
  253. "intermediate += J * (float)coeff[9];\n"
  254. "intermediate *= SCALE;\n"
  255. "vstore8(convert_uchar8_sat(intermediate), 0, (__global uchar*)(pdst));\n"
  256. "}\n"
  257. ;
  258. #endif