gaze_estimation.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351
  1. #include <algorithm>
  2. #include <iostream>
  3. #include <cctype>
  4. #include <opencv2/gapi.hpp>
  5. #include <opencv2/gapi/core.hpp>
  6. #include <opencv2/gapi/infer.hpp>
  7. #include <opencv2/gapi/infer/ie.hpp>
  8. #include <opencv2/gapi/streaming/cap.hpp>
  9. #include <opencv2/gapi/cpu/gcpukernel.hpp>
  10. #include <opencv2/highgui.hpp> // CommandLineParser
  11. #include <opencv2/gapi/infer/parsers.hpp>
  // One-line description printed at the top of the command-line help.
  const std::string about =
      "This is an OpenCV-based version of Gaze Estimation example";

  // Option table in cv::CommandLineParser syntax: "{ names | default | help }".
  // Each of the four networks has an "...m" option (path to its .xml model)
  // and an "...d" option (name of the device to run it on).
  const std::string keys =
      "{ h help | | Print this help message }"
      "{ input | | Path to the input video file }"
      "{ facem | face-detection-retail-0005.xml | Path to OpenVINO face detection model (.xml) }"
      "{ faced | CPU | Target device for the face detection (e.g. CPU, GPU, ...) }"
      "{ landm | facial-landmarks-35-adas-0002.xml | Path to OpenVINO landmarks detector model (.xml) }"
      "{ landd | CPU | Target device for the landmarks detector (e.g. CPU, GPU, ...) }"
      "{ headm | head-pose-estimation-adas-0001.xml | Path to OpenVINO head pose estimation model (.xml) }"
      "{ headd | CPU | Target device for the head pose estimation inference (e.g. CPU, GPU, ...) }"
      "{ gazem | gaze-estimation-adas-0002.xml | Path to OpenVINO gaze vector estimation model (.xml) }"
      "{ gazed | CPU | Target device for the gaze vector estimation inference (e.g. CPU, GPU, ...) }"
      ;
  26. namespace {
  27. std::string weights_path(const std::string &model_path) {
  28. const auto EXT_LEN = 4u;
  29. const auto sz = model_path.size();
  30. CV_Assert(sz > EXT_LEN);
  31. auto ext = model_path.substr(sz - EXT_LEN);
  32. auto lower = [](unsigned char c) {
  33. return static_cast<unsigned char>(std::tolower(c));
  34. };
  35. std::transform(ext.begin(), ext.end(), ext.begin(), lower);
  36. CV_Assert(ext == ".xml");
  37. return model_path.substr(0u, sz - EXT_LEN) + ".bin";
  38. }
  39. } // anonymous namespace
  40. namespace custom {
  41. namespace {
  42. using GMat3 = std::tuple<cv::GMat,cv::GMat,cv::GMat>;
  43. using GMats = cv::GArray<cv::GMat>;
  44. using GRects = cv::GArray<cv::Rect>;
  45. using GSize = cv::GOpaque<cv::Size>;
  46. G_API_NET(Faces, <cv::GMat(cv::GMat)>, "face-detector" );
  47. G_API_NET(Landmarks, <cv::GMat(cv::GMat)>, "facial-landmarks");
  48. G_API_NET(HeadPose, < GMat3(cv::GMat)>, "head-pose");
  49. G_API_NET(Gaze, <cv::GMat(cv::GMat,cv::GMat,cv::GMat)>, "gaze-vector");
  50. G_API_OP(Size, <GSize(cv::GMat)>, "custom.gapi.size") {
  51. static cv::GOpaqueDesc outMeta(const cv::GMatDesc &) {
  52. return cv::empty_gopaque_desc();
  53. }
  54. };
  55. // Left/Right eye per every face
  56. G_API_OP(ParseEyes,
  57. <std::tuple<GRects, GRects>(GMats, GRects, GSize)>,
  58. "custom.gaze_estimation.parseEyes") {
  59. static std::tuple<cv::GArrayDesc, cv::GArrayDesc>
  60. outMeta( const cv::GArrayDesc &
  61. , const cv::GArrayDesc &
  62. , const cv::GOpaqueDesc &) {
  63. return std::make_tuple(cv::empty_array_desc(), cv::empty_array_desc());
  64. }
  65. };
  66. // Combine three scalars into a 1x3 vector (per every face)
  67. G_API_OP(ProcessPoses,
  68. <GMats(GMats, GMats, GMats)>,
  69. "custom.gaze_estimation.processPoses") {
  70. static cv::GArrayDesc outMeta( const cv::GArrayDesc &
  71. , const cv::GArrayDesc &
  72. , const cv::GArrayDesc &) {
  73. return cv::empty_array_desc();
  74. }
  75. };
  76. void gazeVectorToGazeAngles(const cv::Point3f& gazeVector,
  77. cv::Point2f& gazeAngles) {
  78. auto r = cv::norm(gazeVector);
  79. double v0 = static_cast<double>(gazeVector.x);
  80. double v1 = static_cast<double>(gazeVector.y);
  81. double v2 = static_cast<double>(gazeVector.z);
  82. gazeAngles.x = static_cast<float>(180.0 / M_PI * (M_PI_2 + std::atan2(v2, v0)));
  83. gazeAngles.y = static_cast<float>(180.0 / M_PI * (M_PI_2 - std::acos(v1 / r)));
  84. }
  85. GAPI_OCV_KERNEL(OCVSize, Size) {
  86. static void run(const cv::Mat &in, cv::Size &out) {
  87. out = in.size();
  88. }
  89. };
  90. cv::Rect eyeBox(const cv::Rect &face_rc,
  91. float p1_x, float p1_y, float p2_x, float p2_y,
  92. float scale = 1.8f) {
  93. const auto &up = face_rc.size();
  94. const cv::Point p1 = {
  95. static_cast<int>(p1_x*up.width),
  96. static_cast<int>(p1_y*up.height)
  97. };
  98. const cv::Point p2 = {
  99. static_cast<int>(p2_x*up.width),
  100. static_cast<int>(p2_y*up.height)
  101. };
  102. cv::Rect result;
  103. const auto size = static_cast<float>(cv::norm(p1 - p2));
  104. const auto midpoint = (p1 + p2) / 2;
  105. result.width = static_cast<int>(scale * size);
  106. result.height = result.width;
  107. result.x = face_rc.x + midpoint.x - (result.width / 2);
  108. result.y = face_rc.y + midpoint.y - (result.height / 2);
  109. // Shift result to the original frame's absolute coordinates
  110. return result;
  111. }
  112. GAPI_OCV_KERNEL(OCVParseEyes, ParseEyes) {
  113. static void run(const std::vector<cv::Mat> &in_landmarks_per_face,
  114. const std::vector<cv::Rect> &in_face_rcs,
  115. const cv::Size &frame_size,
  116. std::vector<cv::Rect> &out_left_eyes,
  117. std::vector<cv::Rect> &out_right_eyes) {
  118. const size_t numFaces = in_landmarks_per_face.size();
  119. const cv::Rect surface(cv::Point(0,0), frame_size);
  120. GAPI_Assert(numFaces == in_face_rcs.size());
  121. out_left_eyes.clear();
  122. out_right_eyes.clear();
  123. out_left_eyes.reserve(numFaces);
  124. out_right_eyes.reserve(numFaces);
  125. for (std::size_t i = 0u; i < numFaces; i++) {
  126. const auto &lm = in_landmarks_per_face[i];
  127. const auto &rc = in_face_rcs[i];
  128. // Left eye is defined by points 0/1 (x2),
  129. // Right eye is defined by points 2/3 (x2)
  130. const float *data = lm.ptr<float>();
  131. out_left_eyes .push_back(surface & eyeBox(rc, data[0], data[1], data[2], data[3]));
  132. out_right_eyes.push_back(surface & eyeBox(rc, data[4], data[5], data[6], data[7]));
  133. }
  134. }
  135. };
  136. GAPI_OCV_KERNEL(OCVProcessPoses, ProcessPoses) {
  137. static void run(const std::vector<cv::Mat> &in_ys,
  138. const std::vector<cv::Mat> &in_ps,
  139. const std::vector<cv::Mat> &in_rs,
  140. std::vector<cv::Mat> &out_poses) {
  141. const std::size_t sz = in_ys.size();
  142. GAPI_Assert(sz == in_ps.size() && sz == in_rs.size());
  143. out_poses.clear();
  144. for (std::size_t idx = 0u; idx < sz; idx++) {
  145. cv::Mat pose(1, 3, CV_32FC1);
  146. float *ptr = pose.ptr<float>();
  147. ptr[0] = in_ys[idx].ptr<float>()[0];
  148. ptr[1] = in_ps[idx].ptr<float>()[0];
  149. ptr[2] = in_rs[idx].ptr<float>()[0];
  150. out_poses.push_back(std::move(pose));
  151. }
  152. }
  153. };
  154. } // anonymous namespace
  155. } // namespace custom
  156. namespace vis {
  157. namespace {
  158. cv::Point2f midp(const cv::Rect &rc) {
  159. return (rc.tl() + rc.br()) / 2;
  160. };
  161. void bbox(cv::Mat &m, const cv::Rect &rc) {
  162. cv::rectangle(m, rc, cv::Scalar{0,255,0}, 2, cv::LINE_8, 0);
  163. };
  164. void pose(cv::Mat &m, const cv::Mat &p, const cv::Rect &face_rc) {
  165. const auto *posePtr = p.ptr<float>();
  166. const auto yaw = static_cast<double>(posePtr[0]);
  167. const auto pitch = static_cast<double>(posePtr[1]);
  168. const auto roll = static_cast<double>(posePtr[2]);
  169. const auto sinY = std::sin(yaw * M_PI / 180.0);
  170. const auto sinP = std::sin(pitch * M_PI / 180.0);
  171. const auto sinR = std::sin(roll * M_PI / 180.0);
  172. const auto cosY = std::cos(yaw * M_PI / 180.0);
  173. const auto cosP = std::cos(pitch * M_PI / 180.0);
  174. const auto cosR = std::cos(roll * M_PI / 180.0);
  175. const auto axisLength = 0.4 * face_rc.width;
  176. const auto xCenter = face_rc.x + face_rc.width / 2;
  177. const auto yCenter = face_rc.y + face_rc.height / 2;
  178. const auto center = cv::Point{xCenter, yCenter};
  179. const auto axisln = cv::Point2d{axisLength, axisLength};
  180. const auto ctr = cv::Matx<double,2,2>(cosR*cosY, sinY*sinP*sinR, 0.f, cosP*sinR);
  181. const auto ctt = cv::Matx<double,2,2>(cosR*sinY*sinP, cosY*sinR, 0.f, -cosP*cosR);
  182. const auto ctf = cv::Matx<double,2,2>(sinY*cosP, 0.f, 0.f, sinP);
  183. // center to right
  184. cv::line(m, center, center + static_cast<cv::Point>(ctr*axisln), cv::Scalar(0, 0, 255), 2);
  185. // center to top
  186. cv::line(m, center, center + static_cast<cv::Point>(ctt*axisln), cv::Scalar(0, 255, 0), 2);
  187. // center to forward
  188. cv::line(m, center, center + static_cast<cv::Point>(ctf*axisln), cv::Scalar(255, 0, 255), 2);
  189. }
  190. void vvec(cv::Mat &m, const cv::Mat &v, const cv::Rect &face_rc,
  191. const cv::Rect &left_rc, const cv::Rect &right_rc) {
  192. const auto scale = 0.002 * face_rc.width;
  193. cv::Point3f gazeVector;
  194. const auto *gazePtr = v.ptr<float>();
  195. gazeVector.x = gazePtr[0];
  196. gazeVector.y = gazePtr[1];
  197. gazeVector.z = gazePtr[2];
  198. gazeVector = gazeVector / cv::norm(gazeVector);
  199. const double arrowLength = 0.4 * face_rc.width;
  200. const auto left_mid = midp(left_rc);
  201. const auto right_mid = midp(right_rc);
  202. cv::Point2f gazeArrow;
  203. gazeArrow.x = gazeVector.x;
  204. gazeArrow.y = -gazeVector.y;
  205. gazeArrow *= arrowLength;
  206. cv::arrowedLine(m, left_mid, left_mid + gazeArrow, cv::Scalar(255, 0, 0), 2);
  207. cv::arrowedLine(m, right_mid, right_mid + gazeArrow, cv::Scalar(255, 0, 0), 2);
  208. cv::Point2f gazeAngles;
  209. custom::gazeVectorToGazeAngles(gazeVector, gazeAngles);
  210. cv::putText(m,
  211. cv::format("gaze angles: (h=%0.0f, v=%0.0f)",
  212. static_cast<double>(std::round(gazeAngles.x)),
  213. static_cast<double>(std::round(gazeAngles.y))),
  214. cv::Point(static_cast<int>(face_rc.tl().x),
  215. static_cast<int>(face_rc.br().y + 12. * face_rc.width / 100.)),
  216. cv::FONT_HERSHEY_PLAIN, scale * 2, cv::Scalar::all(255), 1);
  217. };
  218. } // anonymous namespace
  219. } // namespace vis
  220. int main(int argc, char *argv[])
  221. {
  222. cv::CommandLineParser cmd(argc, argv, keys);
  223. cmd.about(about);
  224. if (cmd.has("help")) {
  225. cmd.printMessage();
  226. return 0;
  227. }
  228. cv::GMat in;
  229. cv::GMat faces = cv::gapi::infer<custom::Faces>(in);
  230. cv::GOpaque<cv::Size> sz = cv::gapi::streaming::size(in);
  231. cv::GArray<cv::Rect> faces_rc = cv::gapi::parseSSD(faces, sz, 0.5f, true, true);
  232. cv::GArray<cv::GMat> angles_y, angles_p, angles_r;
  233. std::tie(angles_y, angles_p, angles_r) = cv::gapi::infer<custom::HeadPose>(faces_rc, in);
  234. cv::GArray<cv::GMat> heads_pos = custom::ProcessPoses::on(angles_y, angles_p, angles_r);
  235. cv::GArray<cv::GMat> landmarks = cv::gapi::infer<custom::Landmarks>(faces_rc, in);
  236. cv::GArray<cv::Rect> left_eyes, right_eyes;
  237. std::tie(left_eyes, right_eyes) = custom::ParseEyes::on(landmarks, faces_rc, sz);
  238. cv::GArray<cv::GMat> gaze_vectors = cv::gapi::infer2<custom::Gaze>( in
  239. , left_eyes
  240. , right_eyes
  241. , heads_pos);
  242. cv::GComputation graph(cv::GIn(in),
  243. cv::GOut( cv::gapi::copy(in)
  244. , faces_rc
  245. , left_eyes
  246. , right_eyes
  247. , heads_pos
  248. , gaze_vectors));
  249. const auto input_file_name = cmd.get<std::string>("input");
  250. const auto face_model_path = cmd.get<std::string>("facem");
  251. const auto head_model_path = cmd.get<std::string>("headm");
  252. const auto lmrk_model_path = cmd.get<std::string>("landm");
  253. const auto gaze_model_path = cmd.get<std::string>("gazem");
  254. auto face_net = cv::gapi::ie::Params<custom::Faces> {
  255. face_model_path, // path to topology IR
  256. weights_path(face_model_path), // path to weights
  257. cmd.get<std::string>("faced"), /// device specifier
  258. };
  259. auto head_net = cv::gapi::ie::Params<custom::HeadPose> {
  260. head_model_path, // path to topology IR
  261. weights_path(head_model_path), // path to weights
  262. cmd.get<std::string>("headd"), // device specifier
  263. }.cfgOutputLayers({"angle_y_fc", "angle_p_fc", "angle_r_fc"});
  264. auto landmarks_net = cv::gapi::ie::Params<custom::Landmarks> {
  265. lmrk_model_path, // path to topology IR
  266. weights_path(lmrk_model_path), // path to weights
  267. cmd.get<std::string>("landd"), // device specifier
  268. };
  269. auto gaze_net = cv::gapi::ie::Params<custom::Gaze> {
  270. gaze_model_path, // path to topology IR
  271. weights_path(gaze_model_path), // path to weights
  272. cmd.get<std::string>("gazed"), // device specifier
  273. }.cfgInputLayers({"left_eye_image", "right_eye_image", "head_pose_angles"});
  274. auto kernels = cv::gapi::kernels< custom::OCVSize
  275. , custom::OCVParseEyes
  276. , custom::OCVProcessPoses>();
  277. auto networks = cv::gapi::networks(face_net, head_net, landmarks_net, gaze_net);
  278. auto pipeline = graph.compileStreaming(cv::compile_args(networks, kernels));
  279. cv::TickMeter tm;
  280. cv::Mat image;
  281. std::vector<cv::Rect> out_faces, out_right_eyes, out_left_eyes;
  282. std::vector<cv::Mat> out_poses;
  283. std::vector<cv::Mat> out_gazes;
  284. std::size_t frames = 0u;
  285. std::cout << "Reading " << input_file_name << std::endl;
  286. pipeline.setSource(cv::gapi::wip::make_src<cv::gapi::wip::GCaptureSource>(input_file_name));
  287. pipeline.start();
  288. tm.start();
  289. while (pipeline.pull(cv::gout( image
  290. , out_faces
  291. , out_left_eyes
  292. , out_right_eyes
  293. , out_poses
  294. , out_gazes))) {
  295. frames++;
  296. // Visualize results on the frame
  297. for (auto &&rc : out_faces) vis::bbox(image, rc);
  298. for (auto &&rc : out_left_eyes) vis::bbox(image, rc);
  299. for (auto &&rc : out_right_eyes) vis::bbox(image, rc);
  300. for (std::size_t i = 0u; i < out_faces.size(); i++) {
  301. vis::pose(image, out_poses[i], out_faces[i]);
  302. vis::vvec(image, out_gazes[i], out_faces[i], out_left_eyes[i], out_right_eyes[i]);
  303. }
  304. tm.stop();
  305. const auto fps_str = std::to_string(frames / tm.getTimeSec()) + " FPS";
  306. cv::putText(image, fps_str, {0,32}, cv::FONT_HERSHEY_SIMPLEX, 1.0, {0,255,0}, 2);
  307. cv::imshow("Out", image);
  308. cv::waitKey(1);
  309. tm.start();
  310. }
  311. tm.stop();
  312. std::cout << "Processed " << frames << " frames"
  313. << " (" << frames / tm.getTimeSec() << " FPS)" << std::endl;
  314. return 0;
  315. }