# gaze_estimation.py
import argparse
import time

import numpy as np
import cv2 as cv


# ------------------------Service operations------------------------
def weight_path(model_path):
    """ Get the path to the weights file from the path to the IR topology
    Params:
    model_path: path to the model's .xml (IR) file
    Return:
    Path to the corresponding .bin weights file
    """
    assert model_path.endswith('.xml'), "Wrong topology path was provided"
    return model_path[:-3] + 'bin'
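# For example, weight_path('face-detection-retail-0005.xml') returns
# 'face-detection-retail-0005.bin': an OpenVINO IR model ships as an .xml
# topology plus a .bin weights file sharing the same base name.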
def build_argparser():
    """ Build the command-line argument parser
    Return:
    A configured ArgumentParser object
    """
    parser = argparse.ArgumentParser(description='This is an OpenCV-based version of the Gaze Estimation example')
    parser.add_argument('--input',
                        help='Path to the input video file')
    parser.add_argument('--out',
                        help='Path to the output video file')
    parser.add_argument('--facem',
                        default='face-detection-retail-0005.xml',
                        help='Path to OpenVINO face detection model (.xml)')
    parser.add_argument('--faced',
                        default='CPU',
                        help='Target device for the face detection inference '
                             '(e.g. CPU, GPU, VPU, ...)')
    parser.add_argument('--headm',
                        default='head-pose-estimation-adas-0001.xml',
                        help='Path to OpenVINO head pose estimation model (.xml)')
    parser.add_argument('--headd',
                        default='CPU',
                        help='Target device for the head pose estimation inference '
                             '(e.g. CPU, GPU, VPU, ...)')
    parser.add_argument('--landm',
                        default='facial-landmarks-35-adas-0002.xml',
                        help='Path to OpenVINO landmarks detector model (.xml)')
    parser.add_argument('--landd',
                        default='CPU',
                        help='Target device for the landmarks detector (e.g. CPU, GPU, VPU, ...)')
    parser.add_argument('--gazem',
                        default='gaze-estimation-adas-0002.xml',
                        help='Path to OpenVINO gaze vector estimation model (.xml)')
    parser.add_argument('--gazed',
                        default='CPU',
                        help='Target device for the gaze vector estimation inference '
                             '(e.g. CPU, GPU, VPU, ...)')
    parser.add_argument('--eyem',
                        default='open-closed-eye-0001.xml',
                        help='Path to OpenVINO open-closed eye model (.xml)')
    parser.add_argument('--eyed',
                        default='CPU',
                        help='Target device for the eyes state inference (e.g. CPU, GPU, VPU, ...)')
    return parser
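# A typical invocation, assuming the models above were downloaded into the
# working directory ('video.mp4' is a placeholder input file):
#   python gaze_estimation.py --input video.mp4 --gazed CPU
# Any model path or device left unspecified falls back to the defaults above.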
# ------------------------Support functions for custom kernels------------------------
def intersection(surface, rect):
    """ Clip a ROI to the image bounds
    Params:
    surface: image bounds as a rect (top-left x and y, width, height)
    rect: region of interest in the same rect representation
    Return:
    The ROI clipped to the surface, or an empty rect if they do not overlap
    """
    l_x = max(surface[0], rect[0])
    l_y = max(surface[1], rect[1])
    width = min(surface[0] + surface[2], rect[0] + rect[2]) - l_x
    height = min(surface[1] + surface[3], rect[1] + rect[3]) - l_y
    if width < 0 or height < 0:
        return (0, 0, 0, 0)
    return (l_x, l_y, width, height)
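# Worked example: intersection((0, 0, 640, 480), (600, 400, 100, 100))
# returns (600, 400, 40, 80), trimming the box to the 640x480 frame;
# a fully out-of-bounds box collapses to (0, 0, 0, 0).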
def process_landmarks(r_x, r_y, r_w, r_h, landmarks):
    """ Map the output of the facial-landmarks network to points in the frame
    Params:
    r_x: x coordinate of the top-left corner of the face ROI
    r_y: y coordinate of the top-left corner of the face ROI
    r_w: width of the face ROI
    r_h: height of the face ROI
    landmarks: result of inference of the facial-landmarks network
    Return:
    Array of landmark points for one face
    """
    lmrks = landmarks[0]
    raw_x = lmrks[::2] * r_w + r_x
    raw_y = lmrks[1::2] * r_h + r_y
    return np.array([[int(x), int(y)] for x, y in zip(raw_x, raw_y)])
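# The network emits interleaved (x, y) pairs normalised to the face ROI, so a
# landmark (0.3, 0.4) inside a 200x200 ROI whose corner is at (100, 50) maps
# to the frame point (160, 130).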
def eye_box(p_1, p_2, scale=1.8):
    """ Get the bounding box of an eye
    Params:
    p_1: point of the left edge of the eye
    p_2: point of the right edge of the eye
    scale: multiplier applied to the eye width to size the box
    Return:
    Square bounding box of the eye and its midpoint
    """
    size = np.linalg.norm(p_1 - p_2)
    midpoint = (p_1 + p_2) / 2

    width = scale * size
    height = width
    p_x = midpoint[0] - (width / 2)
    p_y = midpoint[1] - (height / 2)
    return (int(p_x), int(p_y), int(width), int(height)), list(map(int, midpoint))
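# Worked example: eye corners (100, 100) and (120, 100) are 20 px apart, so
# with scale=1.8 the box is 36x36 centred on the midpoint (110, 100),
# giving ((92, 82, 36, 36), [110, 100]).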
# ------------------------Custom graph operations------------------------
@cv.gapi.op('custom.GProcessPoses',
            in_types=[cv.GArray.GMat, cv.GArray.GMat, cv.GArray.GMat],
            out_types=[cv.GArray.GMat])
class GProcessPoses:
    @staticmethod
    def outMeta(arr_desc0, arr_desc1, arr_desc2):
        return cv.empty_array_desc()


@cv.gapi.op('custom.GParseEyes',
            in_types=[cv.GArray.GMat, cv.GArray.Rect, cv.GOpaque.Size],
            out_types=[cv.GArray.Rect, cv.GArray.Rect, cv.GArray.Point, cv.GArray.Point])
class GParseEyes:
    @staticmethod
    def outMeta(arr_desc0, arr_desc1, arr_desc2):
        return cv.empty_array_desc(), cv.empty_array_desc(), \
               cv.empty_array_desc(), cv.empty_array_desc()


@cv.gapi.op('custom.GGetStates',
            in_types=[cv.GArray.GMat, cv.GArray.GMat],
            out_types=[cv.GArray.Int, cv.GArray.Int])
class GGetStates:
    @staticmethod
    def outMeta(arr_desc0, arr_desc1):
        return cv.empty_array_desc(), cv.empty_array_desc()
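# The @cv.gapi.op classes above only declare operation interfaces (argument
# types plus output metadata); the @cv.gapi.kernel classes below provide the
# Python code that actually runs for each operation during graph execution.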
# ------------------------Custom kernels------------------------
@cv.gapi.kernel(GProcessPoses)
class GProcessPosesImpl:
    """ Custom kernel. Packs the head pose angles into per-face vectors
    """
    @staticmethod
    def run(in_ys, in_ps, in_rs):
        """ Custom kernel executable code
        Params:
        in_ys: yaw angles of the heads
        in_ps: pitch angles of the heads
        in_rs: roll angles of the heads
        Return:
        Array of head pose vectors, one per face
        """
        return [np.array([ys[0], ps[0], rs[0]]).T for ys, ps, rs in zip(in_ys, in_ps, in_rs)]
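# Each input is a list of single-element Mats (one angle per detected face);
# the packed (yaw, pitch, roll) vectors feed the gaze network's
# 'head_pose_angles' input in the graph below, which takes all three angles
# together.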
@cv.gapi.kernel(GParseEyes)
class GParseEyesImpl:
    """ Custom kernel. Extracts eye information from facial landmarks
    """
    @staticmethod
    def run(in_landm_per_face, in_face_rcs, frame_size):
        """ Custom kernel executable code
        Params:
        in_landm_per_face: landmarks from inference of the facial-landmarks network for each face
        in_face_rcs: bounding box for each face
        frame_size: size of the input frame
        Return:
        Arrays of ROIs for the left and right eyes, an array of eye midpoints
        and an array of landmark points
        """
        left_eyes = []
        right_eyes = []
        midpoints = []
        lmarks = []
        surface = (0, 0, *frame_size)
        for landm_face, rect in zip(in_landm_per_face, in_face_rcs):
            points = process_landmarks(*rect, landm_face)
            lmarks.extend(points)

            rect, midpoint_l = eye_box(points[0], points[1])
            left_eyes.append(intersection(surface, rect))

            rect, midpoint_r = eye_box(points[2], points[3])
            right_eyes.append(intersection(surface, rect))

            midpoints.append(midpoint_l)
            midpoints.append(midpoint_r)
        return left_eyes, right_eyes, midpoints, lmarks
@cv.gapi.kernel(GGetStates)
class GGetStatesImpl:
    """ Custom kernel. Classifies each eye as open or closed
    """
    @staticmethod
    def run(eyesl, eyesr):
        """ Custom kernel executable code
        Params:
        eyesl: result of inference of the open-closed-eye network for the left eyes
        eyesr: result of inference of the open-closed-eye network for the right eyes
        Return:
        States of the left eyes and states of the right eyes
        """
        out_l_st = [int(st) for eye_l in eyesl for st in (eye_l[:, 0] < eye_l[:, 1]).ravel()]
        out_r_st = [int(st) for eye_r in eyesr for st in (eye_r[:, 0] < eye_r[:, 1]).ravel()]
        return out_l_st, out_r_st
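# The open-closed-eye network returns two class scores per eye crop; a state
# of 1 (second score above the first) is treated as "open" by the drawing
# code below, which draws gaze arrows only for open eyes.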
if __name__ == '__main__':
    ARGUMENTS = build_argparser().parse_args()

    # ------------------------Demo's graph------------------------
    g_in = cv.GMat()

    # Detect faces
    face_inputs = cv.GInferInputs()
    face_inputs.setInput('data', g_in)
    face_outputs = cv.gapi.infer('face-detection', face_inputs)
    faces = face_outputs.at('detection_out')

    # Parse faces
    sz = cv.gapi.streaming.size(g_in)
    faces_rc = cv.gapi.parseSSD(faces, sz, 0.5, False, False)

    # Detect poses
    head_inputs = cv.GInferInputs()
    head_inputs.setInput('data', g_in)
    head_outputs = cv.gapi.infer('head-pose', faces_rc, head_inputs)
    angles_y = head_outputs.at('angle_y_fc')
    angles_p = head_outputs.at('angle_p_fc')
    angles_r = head_outputs.at('angle_r_fc')

    # Parse poses
    heads_pos = GProcessPoses.on(angles_y, angles_p, angles_r)

    # Detect landmarks
    landmark_inputs = cv.GInferInputs()
    landmark_inputs.setInput('data', g_in)
    landmark_outputs = cv.gapi.infer('facial-landmarks', faces_rc,
                                     landmark_inputs)
    landmark = landmark_outputs.at('align_fc3')

    # Parse landmarks
    left_eyes, right_eyes, mids, lmarks = GParseEyes.on(landmark, faces_rc, sz)

    # Detect eyes
    eyes_inputs = cv.GInferInputs()
    eyes_inputs.setInput('input.1', g_in)
    eyesl_outputs = cv.gapi.infer('open-closed-eye', left_eyes, eyes_inputs)
    eyesr_outputs = cv.gapi.infer('open-closed-eye', right_eyes, eyes_inputs)
    eyesl = eyesl_outputs.at('19')
    eyesr = eyesr_outputs.at('19')
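    # '19' is simply the name of the output layer in the open-closed-eye-0001
    # IR; each output is a pair of class scores consumed by GGetStates below.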
    # Process eyes states
    l_eye_st, r_eye_st = GGetStates.on(eyesl, eyesr)

    # Gaze estimation
    gaze_inputs = cv.GInferListInputs()
    gaze_inputs.setInput('left_eye_image', left_eyes)
    gaze_inputs.setInput('right_eye_image', right_eyes)
    gaze_inputs.setInput('head_pose_angles', heads_pos)
    gaze_outputs = cv.gapi.infer2('gaze-estimation', g_in, gaze_inputs)
    gaze_vectors = gaze_outputs.at('gaze_vector')
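    # infer2 with GInferListInputs runs the gaze network once per list entry,
    # pairing the i-th left-eye ROI, right-eye ROI and head pose vector, so
    # one gaze vector comes out per detected face.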
    out = cv.gapi.copy(g_in)
    # ------------------------End of graph------------------------
    comp = cv.GComputation(cv.GIn(g_in), cv.GOut(out,
                                                 faces_rc,
                                                 left_eyes,
                                                 right_eyes,
                                                 gaze_vectors,
                                                 angles_y,
                                                 angles_p,
                                                 angles_r,
                                                 l_eye_st,
                                                 r_eye_st,
                                                 mids,
                                                 lmarks))
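    # The order of the cv.GOut outputs above is the order in which
    # ccomp.pull() unpacks them in the loop below.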
    # Networks
    face_net = cv.gapi.ie.params('face-detection', ARGUMENTS.facem,
                                 weight_path(ARGUMENTS.facem), ARGUMENTS.faced)
    head_pose_net = cv.gapi.ie.params('head-pose', ARGUMENTS.headm,
                                      weight_path(ARGUMENTS.headm), ARGUMENTS.headd)
    landmarks_net = cv.gapi.ie.params('facial-landmarks', ARGUMENTS.landm,
                                      weight_path(ARGUMENTS.landm), ARGUMENTS.landd)
    gaze_net = cv.gapi.ie.params('gaze-estimation', ARGUMENTS.gazem,
                                 weight_path(ARGUMENTS.gazem), ARGUMENTS.gazed)
    eye_net = cv.gapi.ie.params('open-closed-eye', ARGUMENTS.eyem,
                                weight_path(ARGUMENTS.eyem), ARGUMENTS.eyed)

    nets = cv.gapi.networks(face_net, head_pose_net, landmarks_net, gaze_net, eye_net)

    # Kernels pack
    kernels = cv.gapi.kernels(GParseEyesImpl, GProcessPosesImpl, GGetStatesImpl)
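    # Each tag passed to cv.gapi.ie.params ('face-detection', 'head-pose', ...)
    # must match the name used in the corresponding cv.gapi.infer call in the
    # graph; that is how G-API binds the models to the inference nodes.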
    # ------------------------Execution part------------------------
    ccomp = comp.compileStreaming(args=cv.gapi.compile_args(kernels, nets))
    source = cv.gapi.wip.make_capture_src(ARGUMENTS.input)
    ccomp.setSource(cv.gin(source))
    ccomp.start()

    frames = 0
    fps = 0
    print('Processing')
    START_TIME = time.time()
    while True:
        start_time_cycle = time.time()
        has_frame, (oimg,
                    outr,
                    l_eyes,
                    r_eyes,
                    outg,
                    out_y,
                    out_p,
                    out_r,
                    out_st_l,
                    out_st_r,
                    out_mids,
                    outl) = ccomp.pull()

        if not has_frame:
            break
        # Draw
        GREEN = (0, 255, 0)
        RED = (0, 0, 255)
        WHITE = (255, 255, 255)
        BLUE = (255, 0, 0)
        PINK = (255, 0, 255)
        YELLOW = (0, 255, 255)

        M_PI_180 = np.pi / 180
        M_PI_2 = np.pi / 2
        M_PI = np.pi

        FACES_SIZE = len(outr)
        for i, out_rect in enumerate(outr):
            # Face box
            cv.rectangle(oimg, out_rect, WHITE, 1)
            rx, ry, rwidth, rheight = out_rect

            # Landmarks
            lm_radius = int(0.01 * rwidth + 1)
            lmsize = int(len(outl) / FACES_SIZE)
            for j in range(lmsize):
                cv.circle(oimg, outl[j + i * lmsize], lm_radius, YELLOW, -1)
            # Headposes
            yaw = out_y[i]
            pitch = out_p[i]
            roll = out_r[i]
            sin_y = np.sin(yaw[:] * M_PI_180)
            sin_p = np.sin(pitch[:] * M_PI_180)
            sin_r = np.sin(roll[:] * M_PI_180)
            cos_y = np.cos(yaw[:] * M_PI_180)
            cos_p = np.cos(pitch[:] * M_PI_180)
            cos_r = np.cos(roll[:] * M_PI_180)
            axis_length = 0.4 * rwidth
            x_center = int(rx + rwidth / 2)
            y_center = int(ry + rheight / 2)
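            # The three lines below visualise the head-frame axes: the
            # endpoint offsets are entries of the combined yaw/pitch/roll
            # rotation matrix projected onto the image plane.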
            # center to right
            cv.line(oimg, [x_center, y_center],
                    [int(x_center + axis_length * (cos_r * cos_y + sin_y * sin_p * sin_r)),
                     int(y_center + axis_length * cos_p * sin_r)],
                    RED, 2)

            # center to top
            cv.line(oimg, [x_center, y_center],
                    [int(x_center + axis_length * (cos_r * sin_y * sin_p + cos_y * sin_r)),
                     int(y_center - axis_length * cos_p * cos_r)],
                    GREEN, 2)

            # center to forward
            cv.line(oimg, [x_center, y_center],
                    [int(x_center + axis_length * sin_y * cos_p),
                     int(y_center + axis_length * sin_p)],
                    PINK, 2)

            scale_box = 0.002 * rwidth
            cv.putText(oimg, "head pose: (y=%0.0f, p=%0.0f, r=%0.0f)" %
                       (np.round(yaw), np.round(pitch), np.round(roll)),
                       [int(rx), int(ry + rheight + 5 * rwidth / 100)],
                       cv.FONT_HERSHEY_PLAIN, scale_box * 2, WHITE, 1)
            # Eyes boxes
            color_l = GREEN if out_st_l[i] else RED
            cv.rectangle(oimg, l_eyes[i], color_l, 1)
            color_r = GREEN if out_st_r[i] else RED
            cv.rectangle(oimg, r_eyes[i], color_r, 1)
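            # Note on conventions: the y component of the gaze vector is
            # negated below, consistent with the model reporting y-up camera
            # coordinates while image y grows downward.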
            # Gaze vectors
            norm_gazes = np.linalg.norm(outg[i][0])
            gaze_vector = outg[i][0] / norm_gazes

            arrow_length = 0.4 * rwidth
            gaze_arrow = [arrow_length * gaze_vector[0], -arrow_length * gaze_vector[1]]
            left_arrow = [int(a + b) for a, b in zip(out_mids[0 + i * 2], gaze_arrow)]
            right_arrow = [int(a + b) for a, b in zip(out_mids[1 + i * 2], gaze_arrow)]
            if out_st_l[i]:
                cv.arrowedLine(oimg, out_mids[0 + i * 2], left_arrow, BLUE, 2)
            if out_st_r[i]:
                cv.arrowedLine(oimg, out_mids[1 + i * 2], right_arrow, BLUE, 2)

            v0, v1, v2 = outg[i][0]

            gaze_angles = [180 / M_PI * (M_PI_2 + np.arctan2(v2, v0)),
                           180 / M_PI * (M_PI_2 - np.arccos(v1 / norm_gazes))]
            cv.putText(oimg, "gaze angles: (h=%0.0f, v=%0.0f)" %
                       (np.round(gaze_angles[0]), np.round(gaze_angles[1])),
                       [int(rx), int(ry + rheight + 12 * rwidth / 100)],
                       cv.FONT_HERSHEY_PLAIN, scale_box * 2, WHITE, 1)
        # Add FPS value to frame
        cv.putText(oimg, "FPS: %0i" % (fps), [int(20), int(40)],
                   cv.FONT_HERSHEY_PLAIN, 2, RED, 2)

        # Show result
        cv.imshow('Gaze Estimation', oimg)
        cv.waitKey(1)

        fps = int(1. / (time.time() - start_time_cycle))
        frames += 1

    EXECUTION_TIME = time.time() - START_TIME
    print('Execution successful')
    print('Mean FPS is ', int(frames / EXECUTION_TIME))