// File: stereo_multi.cpp (14 KB)
  1. // This sample demonstrates working on one piece of data using two GPUs.
  2. // It splits input into two parts and processes them separately on different GPUs.
  3. #ifdef _WIN32
  4. #define NOMINMAX
  5. #include <windows.h>
  6. #else
  7. #include <pthread.h>
  8. #include <unistd.h>
  9. #endif
  10. #include <iostream>
  11. #include <iomanip>
  12. #include "opencv2/core.hpp"
  13. #include "opencv2/highgui.hpp"
  14. #include "opencv2/imgproc.hpp"
  15. #include "opencv2/cudastereo.hpp"
  16. using namespace std;
  17. using namespace cv;
  18. using namespace cv::cuda;
  19. ///////////////////////////////////////////////////////////
  20. // Thread
  21. // OS-specific wrappers for multi-threading
  22. #ifdef _WIN32
  // Win32 implementation of the thread wrapper.
  //
  // Starts func(userData) on a new thread as soon as the object is constructed.
  // wait() blocks until that call has returned. The destructor also waits,
  // because the running thread reads userData_, which is stored inside this
  // object and must outlive the call.
  //
  // If the thread cannot be created, func(userData) is executed synchronously
  // in the constructor so that the work is still done by the time wait() returns.
  class Thread
  {
      struct UserData
      {
          void (*func)(void* userData);
          void* param;
      };

      // Trampoline with the signature CreateThread expects.
      static DWORD WINAPI WinThreadFunction(LPVOID lpParam)
      {
          UserData* userData = static_cast<UserData*>(lpParam);
          userData->func(userData->param);
          return 0;
      }

      UserData userData_;
      HANDLE thread_;   // NULL when no thread was created
      DWORD threadId_;

      // Not copyable: the thread holds a pointer to userData_ and the handle
      // must be closed exactly once. Declared private and left undefined.
      Thread(const Thread&);
      Thread& operator=(const Thread&);

  public:
      // func:     function to run on the new thread
      // userData: opaque pointer handed to func unchanged
      Thread(void (*func)(void* userData), void* userData)
          : thread_(NULL), threadId_(0)
      {
          userData_.func = func;
          userData_.param = userData;

          thread_ = CreateThread(
              NULL,              // default security attributes
              0,                 // use default stack size
              WinThreadFunction, // thread function name
              &userData_,        // argument to thread function
              0,                 // use default creation flags
              &threadId_);       // returns the thread identifier

          // CreateThread returns NULL on failure; fall back to running inline.
          if (thread_ == NULL)
              func(userData);
      }

      ~Thread()
      {
          if (thread_ != NULL)
          {
              // Closing the handle does not stop the thread, so wait first:
              // the thread must not outlive userData_.
              WaitForSingleObject(thread_, INFINITE);
              CloseHandle(thread_);
          }
      }

      // Blocks until the thread function has returned. Safe to call repeatedly.
      void wait()
      {
          if (thread_ != NULL)
              WaitForSingleObject(thread_, INFINITE);
      }
  };
  61. #else
  // POSIX implementation of the thread wrapper.
  //
  // Starts func(userData) on a new thread as soon as the object is constructed.
  // wait() joins that thread; calling it more than once is a no-op. The
  // destructor joins as well if wait() was never called, because the running
  // thread reads userData_, which is stored inside this object.
  //
  // If the thread cannot be created, func(userData) is executed synchronously
  // in the constructor so that the work is still done by the time wait() returns.
  class Thread
  {
      struct UserData
      {
          void (*func)(void* userData);
          void* param;
      };

      // Trampoline with the signature pthread_create expects.
      static void* PThreadFunction(void* lpParam)
      {
          UserData* userData = static_cast<UserData*>(lpParam);
          userData->func(userData->param);
          return 0;
      }

      pthread_t thread_;
      UserData userData_;
      bool joinable_;   // true while thread_ refers to a started, not yet joined thread

      // Not copyable: the thread holds a pointer to userData_ and must be
      // joined exactly once. Declared private and left undefined.
      Thread(const Thread&);
      Thread& operator=(const Thread&);

  public:
      // func:     function to run on the new thread
      // userData: opaque pointer handed to func unchanged
      Thread(void (*func)(void* userData), void* userData)
          : joinable_(false)
      {
          userData_.func = func;
          userData_.param = userData;

          if (pthread_create(&thread_, NULL, PThreadFunction, &userData_) == 0)
              joinable_ = true;
          else
              func(userData); // could not start a thread: run inline instead
      }

      ~Thread()
      {
          // Join rather than detach: detaching an already joined thread is
          // undefined, and a detached thread would keep using userData_
          // after this object is gone.
          wait();
      }

      // Blocks until the thread function has returned. Safe to call repeatedly.
      void wait()
      {
          if (joinable_)
          {
              pthread_join(thread_, NULL);
              joinable_ = false;
          }
      }
  };
  93. #endif
  94. ///////////////////////////////////////////////////////////
  95. // StereoSingleGpu
  96. // Run Stereo algorithm on single GPU
  97. class StereoSingleGpu
  98. {
  99. public:
  100. explicit StereoSingleGpu(int deviceId = 0);
  101. ~StereoSingleGpu();
  102. void compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity);
  103. private:
  104. int deviceId_;
  105. GpuMat d_leftFrame;
  106. GpuMat d_rightFrame;
  107. GpuMat d_disparity;
  108. Ptr<cuda::StereoBM> d_alg;
  109. };
  110. StereoSingleGpu::StereoSingleGpu(int deviceId) : deviceId_(deviceId)
  111. {
  112. cuda::setDevice(deviceId_);
  113. d_alg = cuda::createStereoBM(256);
  114. }
  115. StereoSingleGpu::~StereoSingleGpu()
  116. {
  117. cuda::setDevice(deviceId_);
  118. d_leftFrame.release();
  119. d_rightFrame.release();
  120. d_disparity.release();
  121. d_alg.release();
  122. }
  123. void StereoSingleGpu::compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity)
  124. {
  125. cuda::setDevice(deviceId_);
  126. d_leftFrame.upload(leftFrame);
  127. d_rightFrame.upload(rightFrame);
  128. d_alg->compute(d_leftFrame, d_rightFrame, d_disparity);
  129. d_disparity.download(disparity);
  130. }
  131. ///////////////////////////////////////////////////////////
  132. // StereoMultiGpuThread
  133. // Run Stereo algorithm on two GPUs using different host threads
  134. class StereoMultiGpuThread
  135. {
  136. public:
  137. StereoMultiGpuThread();
  138. ~StereoMultiGpuThread();
  139. void compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity);
  140. private:
  141. GpuMat d_leftFrames[2];
  142. GpuMat d_rightFrames[2];
  143. GpuMat d_disparities[2];
  144. Ptr<cuda::StereoBM> d_algs[2];
  145. struct StereoLaunchData
  146. {
  147. int deviceId;
  148. Mat leftFrame;
  149. Mat rightFrame;
  150. Mat disparity;
  151. GpuMat* d_leftFrame;
  152. GpuMat* d_rightFrame;
  153. GpuMat* d_disparity;
  154. Ptr<cuda::StereoBM> d_alg;
  155. };
  156. static void launchGpuStereoAlg(void* userData);
  157. };
  158. StereoMultiGpuThread::StereoMultiGpuThread()
  159. {
  160. cuda::setDevice(0);
  161. d_algs[0] = cuda::createStereoBM(256);
  162. cuda::setDevice(1);
  163. d_algs[1] = cuda::createStereoBM(256);
  164. }
  165. StereoMultiGpuThread::~StereoMultiGpuThread()
  166. {
  167. cuda::setDevice(0);
  168. d_leftFrames[0].release();
  169. d_rightFrames[0].release();
  170. d_disparities[0].release();
  171. d_algs[0].release();
  172. cuda::setDevice(1);
  173. d_leftFrames[1].release();
  174. d_rightFrames[1].release();
  175. d_disparities[1].release();
  176. d_algs[1].release();
  177. }
  178. void StereoMultiGpuThread::compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity)
  179. {
  180. disparity.create(leftFrame.size(), CV_8UC1);
  181. // Split input data onto two parts for each GPUs.
  182. // We add small border for each part,
  183. // because original algorithm doesn't calculate disparity on image borders.
  184. // With such padding we will get output in the middle of final result.
  185. StereoLaunchData launchDatas[2];
  186. launchDatas[0].deviceId = 0;
  187. launchDatas[0].leftFrame = leftFrame.rowRange(0, leftFrame.rows / 2 + 32);
  188. launchDatas[0].rightFrame = rightFrame.rowRange(0, rightFrame.rows / 2 + 32);
  189. launchDatas[0].disparity = disparity.rowRange(0, leftFrame.rows / 2);
  190. launchDatas[0].d_leftFrame = &d_leftFrames[0];
  191. launchDatas[0].d_rightFrame = &d_rightFrames[0];
  192. launchDatas[0].d_disparity = &d_disparities[0];
  193. launchDatas[0].d_alg = d_algs[0];
  194. launchDatas[1].deviceId = 1;
  195. launchDatas[1].leftFrame = leftFrame.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows);
  196. launchDatas[1].rightFrame = rightFrame.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows);
  197. launchDatas[1].disparity = disparity.rowRange(leftFrame.rows / 2, leftFrame.rows);
  198. launchDatas[1].d_leftFrame = &d_leftFrames[1];
  199. launchDatas[1].d_rightFrame = &d_rightFrames[1];
  200. launchDatas[1].d_disparity = &d_disparities[1];
  201. launchDatas[1].d_alg = d_algs[1];
  202. Thread thread0(launchGpuStereoAlg, &launchDatas[0]);
  203. Thread thread1(launchGpuStereoAlg, &launchDatas[1]);
  204. thread0.wait();
  205. thread1.wait();
  206. }
  207. void StereoMultiGpuThread::launchGpuStereoAlg(void* userData)
  208. {
  209. StereoLaunchData* data = static_cast<StereoLaunchData*>(userData);
  210. cuda::setDevice(data->deviceId);
  211. data->d_leftFrame->upload(data->leftFrame);
  212. data->d_rightFrame->upload(data->rightFrame);
  213. data->d_alg->compute(*data->d_leftFrame, *data->d_rightFrame, *data->d_disparity);
  214. if (data->deviceId == 0)
  215. data->d_disparity->rowRange(0, data->d_disparity->rows - 32).download(data->disparity);
  216. else
  217. data->d_disparity->rowRange(32, data->d_disparity->rows).download(data->disparity);
  218. }
  219. ///////////////////////////////////////////////////////////
  220. // StereoMultiGpuStream
  221. // Run Stereo algorithm on two GPUs from single host thread using async API
  222. class StereoMultiGpuStream
  223. {
  224. public:
  225. StereoMultiGpuStream();
  226. ~StereoMultiGpuStream();
  227. void compute(const HostMem& leftFrame, const HostMem& rightFrame, HostMem& disparity);
  228. private:
  229. GpuMat d_leftFrames[2];
  230. GpuMat d_rightFrames[2];
  231. GpuMat d_disparities[2];
  232. Ptr<cuda::StereoBM> d_algs[2];
  233. Ptr<Stream> streams[2];
  234. };
  235. StereoMultiGpuStream::StereoMultiGpuStream()
  236. {
  237. cuda::setDevice(0);
  238. d_algs[0] = cuda::createStereoBM(256);
  239. streams[0] = makePtr<Stream>();
  240. cuda::setDevice(1);
  241. d_algs[1] = cuda::createStereoBM(256);
  242. streams[1] = makePtr<Stream>();
  243. }
  244. StereoMultiGpuStream::~StereoMultiGpuStream()
  245. {
  246. cuda::setDevice(0);
  247. d_leftFrames[0].release();
  248. d_rightFrames[0].release();
  249. d_disparities[0].release();
  250. d_algs[0].release();
  251. streams[0].release();
  252. cuda::setDevice(1);
  253. d_leftFrames[1].release();
  254. d_rightFrames[1].release();
  255. d_disparities[1].release();
  256. d_algs[1].release();
  257. streams[1].release();
  258. }
  259. void StereoMultiGpuStream::compute(const HostMem& leftFrame, const HostMem& rightFrame, HostMem& disparity)
  260. {
  261. disparity.create(leftFrame.size(), CV_8UC1);
  262. // Split input data onto two parts for each GPUs.
  263. // We add small border for each part,
  264. // because original algorithm doesn't calculate disparity on image borders.
  265. // With such padding we will get output in the middle of final result.
  266. Mat leftFrameHdr = leftFrame.createMatHeader();
  267. Mat rightFrameHdr = rightFrame.createMatHeader();
  268. Mat disparityHdr = disparity.createMatHeader();
  269. Mat disparityPart0 = disparityHdr.rowRange(0, leftFrame.rows / 2);
  270. Mat disparityPart1 = disparityHdr.rowRange(leftFrame.rows / 2, leftFrame.rows);
  271. cuda::setDevice(0);
  272. d_leftFrames[0].upload(leftFrameHdr.rowRange(0, leftFrame.rows / 2 + 32), *streams[0]);
  273. d_rightFrames[0].upload(rightFrameHdr.rowRange(0, leftFrame.rows / 2 + 32), *streams[0]);
  274. d_algs[0]->compute(d_leftFrames[0], d_rightFrames[0], d_disparities[0], *streams[0]);
  275. d_disparities[0].rowRange(0, leftFrame.rows / 2).download(disparityPart0, *streams[0]);
  276. cuda::setDevice(1);
  277. d_leftFrames[1].upload(leftFrameHdr.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows), *streams[1]);
  278. d_rightFrames[1].upload(rightFrameHdr.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows), *streams[1]);
  279. d_algs[1]->compute(d_leftFrames[1], d_rightFrames[1], d_disparities[1], *streams[1]);
  280. d_disparities[1].rowRange(32, d_disparities[1].rows).download(disparityPart1, *streams[1]);
  281. cuda::setDevice(0);
  282. streams[0]->waitForCompletion();
  283. cuda::setDevice(1);
  284. streams[1]->waitForCompletion();
  285. }
  286. ///////////////////////////////////////////////////////////
  287. // main
  288. int main(int argc, char** argv)
  289. {
  290. if (argc != 3)
  291. {
  292. cerr << "Usage: stereo_multi <left_video> <right_video>" << endl;
  293. return -1;
  294. }
  295. const int numDevices = getCudaEnabledDeviceCount();
  296. if (numDevices != 2)
  297. {
  298. cerr << "Two GPUs are required" << endl;
  299. return -1;
  300. }
  301. for (int i = 0; i < numDevices; ++i)
  302. {
  303. DeviceInfo devInfo(i);
  304. if (!devInfo.isCompatible())
  305. {
  306. cerr << "CUDA module wasn't built for GPU #" << i << " ("
  307. << devInfo.name() << ", CC " << devInfo.majorVersion()
  308. << devInfo.minorVersion() << endl;
  309. return -1;
  310. }
  311. printShortCudaDeviceInfo(i);
  312. }
  313. VideoCapture leftVideo(argv[1]);
  314. VideoCapture rightVideo(argv[2]);
  315. if (!leftVideo.isOpened())
  316. {
  317. cerr << "Can't open " << argv[1] << " video file" << endl;
  318. return -1;
  319. }
  320. if (!rightVideo.isOpened())
  321. {
  322. cerr << "Can't open " << argv[2] << " video file" << endl;
  323. return -1;
  324. }
  325. cout << endl;
  326. cout << "This sample demonstrates working on one piece of data using two GPUs." << endl;
  327. cout << "It splits input into two parts and processes them separately on different GPUs." << endl;
  328. cout << endl;
  329. Mat leftFrame, rightFrame;
  330. HostMem leftGrayFrame, rightGrayFrame;
  331. StereoSingleGpu gpu0Alg(0);
  332. StereoSingleGpu gpu1Alg(1);
  333. StereoMultiGpuThread multiThreadAlg;
  334. StereoMultiGpuStream multiStreamAlg;
  335. Mat disparityGpu0;
  336. Mat disparityGpu1;
  337. Mat disparityMultiThread;
  338. HostMem disparityMultiStream;
  339. Mat disparityGpu0Show;
  340. Mat disparityGpu1Show;
  341. Mat disparityMultiThreadShow;
  342. Mat disparityMultiStreamShow;
  343. TickMeter tm;
  344. cout << "-------------------------------------------------------------------" << endl;
  345. cout << "| Frame | GPU 0 ms | GPU 1 ms | Multi Thread ms | Multi Stream ms |" << endl;
  346. cout << "-------------------------------------------------------------------" << endl;
  347. for (int i = 0;; ++i)
  348. {
  349. leftVideo >> leftFrame;
  350. rightVideo >> rightFrame;
  351. if (leftFrame.empty() || rightFrame.empty())
  352. break;
  353. if (leftFrame.size() != rightFrame.size())
  354. {
  355. cerr << "Frames have different sizes" << endl;
  356. return -1;
  357. }
  358. leftGrayFrame.create(leftFrame.size(), CV_8UC1);
  359. rightGrayFrame.create(leftFrame.size(), CV_8UC1);
  360. cvtColor(leftFrame, leftGrayFrame.createMatHeader(), COLOR_BGR2GRAY);
  361. cvtColor(rightFrame, rightGrayFrame.createMatHeader(), COLOR_BGR2GRAY);
  362. tm.reset(); tm.start();
  363. gpu0Alg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
  364. disparityGpu0);
  365. tm.stop();
  366. const double gpu0Time = tm.getTimeMilli();
  367. tm.reset(); tm.start();
  368. gpu1Alg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
  369. disparityGpu1);
  370. tm.stop();
  371. const double gpu1Time = tm.getTimeMilli();
  372. tm.reset(); tm.start();
  373. multiThreadAlg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
  374. disparityMultiThread);
  375. tm.stop();
  376. const double multiThreadTime = tm.getTimeMilli();
  377. tm.reset(); tm.start();
  378. multiStreamAlg.compute(leftGrayFrame, rightGrayFrame, disparityMultiStream);
  379. tm.stop();
  380. const double multiStreamTime = tm.getTimeMilli();
  381. cout << "| " << setw(5) << i << " | "
  382. << setw(8) << setprecision(1) << fixed << gpu0Time << " | "
  383. << setw(8) << setprecision(1) << fixed << gpu1Time << " | "
  384. << setw(15) << setprecision(1) << fixed << multiThreadTime << " | "
  385. << setw(15) << setprecision(1) << fixed << multiStreamTime << " |" << endl;
  386. resize(disparityGpu0, disparityGpu0Show, Size(1024, 768), 0, 0, INTER_AREA);
  387. resize(disparityGpu1, disparityGpu1Show, Size(1024, 768), 0, 0, INTER_AREA);
  388. resize(disparityMultiThread, disparityMultiThreadShow, Size(1024, 768), 0, 0, INTER_AREA);
  389. resize(disparityMultiStream.createMatHeader(), disparityMultiStreamShow, Size(1024, 768), 0, 0, INTER_AREA);
  390. imshow("disparityGpu0", disparityGpu0Show);
  391. imshow("disparityGpu1", disparityGpu1Show);
  392. imshow("disparityMultiThread", disparityMultiThreadShow);
  393. imshow("disparityMultiStream", disparityMultiStreamShow);
  394. const int key = waitKey(30) & 0xff;
  395. if (key == 27)
  396. break;
  397. }
  398. cout << "-------------------------------------------------------------------" << endl;
  399. return 0;
  400. }