object_detection.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
import argparse
import os
import sys
import time
from threading import Thread

if sys.version_info[0] == 2:
    import Queue as queue
else:
    import queue

import cv2 as cv
import numpy as np

from common import *
from tf_text_graph_common import readTextMessage
from tf_text_graph_ssd import createSSDGraph
from tf_text_graph_faster_rcnn import createFasterRCNNGraph
  15. backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV,
  16. cv.dnn.DNN_BACKEND_VKCOM, cv.dnn.DNN_BACKEND_CUDA)
  17. targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD, cv.dnn.DNN_TARGET_HDDL,
  18. cv.dnn.DNN_TARGET_VULKAN, cv.dnn.DNN_TARGET_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16)
  19. parser = argparse.ArgumentParser(add_help=False)
  20. parser.add_argument('--zoo', default=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models.yml'),
  21. help='An optional path to file with preprocessing parameters.')
  22. parser.add_argument('--input', help='Path to input image or video file. Skip this argument to capture frames from a camera.')
  23. parser.add_argument('--out_tf_graph', default='graph.pbtxt',
  24. help='For models from TensorFlow Object Detection API, you may '
  25. 'pass a .config file which was used for training through --config '
  26. 'argument. This way an additional .pbtxt file with TensorFlow graph will be created.')
  27. parser.add_argument('--framework', choices=['caffe', 'tensorflow', 'torch', 'darknet', 'dldt'],
  28. help='Optional name of an origin framework of the model. '
  29. 'Detect it automatically if it does not set.')
  30. parser.add_argument('--thr', type=float, default=0.5, help='Confidence threshold')
  31. parser.add_argument('--nms', type=float, default=0.4, help='Non-maximum suppression threshold')
  32. parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
  33. help="Choose one of computation backends: "
  34. "%d: automatically (by default), "
  35. "%d: Halide language (http://halide-lang.org/), "
  36. "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
  37. "%d: OpenCV implementation, "
  38. "%d: VKCOM, "
  39. "%d: CUDA" % backends)
  40. parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
  41. help='Choose one of target computation devices: '
  42. '%d: CPU target (by default), '
  43. '%d: OpenCL, '
  44. '%d: OpenCL fp16 (half-float precision), '
  45. '%d: NCS2 VPU, '
  46. '%d: HDDL VPU, '
  47. '%d: Vulkan, '
  48. '%d: CUDA, '
  49. '%d: CUDA fp16 (half-float preprocess)' % targets)
  50. parser.add_argument('--async', type=int, default=0,
  51. dest='asyncN',
  52. help='Number of asynchronous forwards at the same time. '
  53. 'Choose 0 for synchronous mode')
  54. args, _ = parser.parse_known_args()
  55. add_preproc_args(args.zoo, parser, 'object_detection')
  56. parser = argparse.ArgumentParser(parents=[parser],
  57. description='Use this script to run object detection deep learning networks using OpenCV.',
  58. formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  59. args = parser.parse_args()
  60. args.model = findFile(args.model)
  61. args.config = findFile(args.config)
  62. args.classes = findFile(args.classes)
  63. # If config specified, try to load it as TensorFlow Object Detection API's pipeline.
  64. config = readTextMessage(args.config)
  65. if 'model' in config:
  66. print('TensorFlow Object Detection API config detected')
  67. if 'ssd' in config['model'][0]:
  68. print('Preparing text graph representation for SSD model: ' + args.out_tf_graph)
  69. createSSDGraph(args.model, args.config, args.out_tf_graph)
  70. args.config = args.out_tf_graph
  71. elif 'faster_rcnn' in config['model'][0]:
  72. print('Preparing text graph representation for Faster-RCNN model: ' + args.out_tf_graph)
  73. createFasterRCNNGraph(args.model, args.config, args.out_tf_graph)
  74. args.config = args.out_tf_graph
  75. # Load names of classes
  76. classes = None
  77. if args.classes:
  78. with open(args.classes, 'rt') as f:
  79. classes = f.read().rstrip('\n').split('\n')
  80. # Load a network
  81. net = cv.dnn.readNet(cv.samples.findFile(args.model), cv.samples.findFile(args.config), args.framework)
  82. net.setPreferableBackend(args.backend)
  83. net.setPreferableTarget(args.target)
  84. outNames = net.getUnconnectedOutLayersNames()
  85. confThreshold = args.thr
  86. nmsThreshold = args.nms
  87. def postprocess(frame, outs):
  88. frameHeight = frame.shape[0]
  89. frameWidth = frame.shape[1]
  90. def drawPred(classId, conf, left, top, right, bottom):
  91. # Draw a bounding box.
  92. cv.rectangle(frame, (left, top), (right, bottom), (0, 255, 0))
  93. label = '%.2f' % conf
  94. # Print a label of class.
  95. if classes:
  96. assert(classId < len(classes))
  97. label = '%s: %s' % (classes[classId], label)
  98. labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
  99. top = max(top, labelSize[1])
  100. cv.rectangle(frame, (left, top - labelSize[1]), (left + labelSize[0], top + baseLine), (255, 255, 255), cv.FILLED)
  101. cv.putText(frame, label, (left, top), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))
  102. layerNames = net.getLayerNames()
  103. lastLayerId = net.getLayerId(layerNames[-1])
  104. lastLayer = net.getLayer(lastLayerId)
  105. classIds = []
  106. confidences = []
  107. boxes = []
  108. if lastLayer.type == 'DetectionOutput':
  109. # Network produces output blob with a shape 1x1xNx7 where N is a number of
  110. # detections and an every detection is a vector of values
  111. # [batchId, classId, confidence, left, top, right, bottom]
  112. for out in outs:
  113. for detection in out[0, 0]:
  114. confidence = detection[2]
  115. if confidence > confThreshold:
  116. left = int(detection[3])
  117. top = int(detection[4])
  118. right = int(detection[5])
  119. bottom = int(detection[6])
  120. width = right - left + 1
  121. height = bottom - top + 1
  122. if width <= 2 or height <= 2:
  123. left = int(detection[3] * frameWidth)
  124. top = int(detection[4] * frameHeight)
  125. right = int(detection[5] * frameWidth)
  126. bottom = int(detection[6] * frameHeight)
  127. width = right - left + 1
  128. height = bottom - top + 1
  129. classIds.append(int(detection[1]) - 1) # Skip background label
  130. confidences.append(float(confidence))
  131. boxes.append([left, top, width, height])
  132. elif lastLayer.type == 'Region':
  133. # Network produces output blob with a shape NxC where N is a number of
  134. # detected objects and C is a number of classes + 4 where the first 4
  135. # numbers are [center_x, center_y, width, height]
  136. for out in outs:
  137. for detection in out:
  138. scores = detection[5:]
  139. classId = np.argmax(scores)
  140. confidence = scores[classId]
  141. if confidence > confThreshold:
  142. center_x = int(detection[0] * frameWidth)
  143. center_y = int(detection[1] * frameHeight)
  144. width = int(detection[2] * frameWidth)
  145. height = int(detection[3] * frameHeight)
  146. left = int(center_x - width / 2)
  147. top = int(center_y - height / 2)
  148. classIds.append(classId)
  149. confidences.append(float(confidence))
  150. boxes.append([left, top, width, height])
  151. else:
  152. print('Unknown output layer type: ' + lastLayer.type)
  153. exit()
  154. # NMS is used inside Region layer only on DNN_BACKEND_OPENCV for another backends we need NMS in sample
  155. # or NMS is required if number of outputs > 1
  156. if len(outNames) > 1 or lastLayer.type == 'Region' and args.backend != cv.dnn.DNN_BACKEND_OPENCV:
  157. indices = []
  158. classIds = np.array(classIds)
  159. boxes = np.array(boxes)
  160. confidences = np.array(confidences)
  161. unique_classes = set(classIds)
  162. for cl in unique_classes:
  163. class_indices = np.where(classIds == cl)[0]
  164. conf = confidences[class_indices]
  165. box = boxes[class_indices].tolist()
  166. nms_indices = cv.dnn.NMSBoxes(box, conf, confThreshold, nmsThreshold)
  167. nms_indices = nms_indices[:, 0] if len(nms_indices) else []
  168. indices.extend(class_indices[nms_indices])
  169. else:
  170. indices = np.arange(0, len(classIds))
  171. for i in indices:
  172. box = boxes[i]
  173. left = box[0]
  174. top = box[1]
  175. width = box[2]
  176. height = box[3]
  177. drawPred(classIds[i], confidences[i], left, top, left + width, top + height)
  178. # Process inputs
  179. winName = 'Deep learning object detection in OpenCV'
  180. cv.namedWindow(winName, cv.WINDOW_NORMAL)
  181. def callback(pos):
  182. global confThreshold
  183. confThreshold = pos / 100.0
  184. cv.createTrackbar('Confidence threshold, %', winName, int(confThreshold * 100), 99, callback)
  185. cap = cv.VideoCapture(cv.samples.findFileOrKeep(args.input) if args.input else 0)
  186. class QueueFPS(queue.Queue):
  187. def __init__(self):
  188. queue.Queue.__init__(self)
  189. self.startTime = 0
  190. self.counter = 0
  191. def put(self, v):
  192. queue.Queue.put(self, v)
  193. self.counter += 1
  194. if self.counter == 1:
  195. self.startTime = time.time()
  196. def getFPS(self):
  197. return self.counter / (time.time() - self.startTime)
  198. process = True
  199. #
  200. # Frames capturing thread
  201. #
  202. framesQueue = QueueFPS()
  203. def framesThreadBody():
  204. global framesQueue, process
  205. while process:
  206. hasFrame, frame = cap.read()
  207. if not hasFrame:
  208. break
  209. framesQueue.put(frame)
  210. #
  211. # Frames processing thread
  212. #
  213. processedFramesQueue = queue.Queue()
  214. predictionsQueue = QueueFPS()
  215. def processingThreadBody():
  216. global processedFramesQueue, predictionsQueue, args, process
  217. futureOutputs = []
  218. while process:
  219. # Get a next frame
  220. frame = None
  221. try:
  222. frame = framesQueue.get_nowait()
  223. if args.asyncN:
  224. if len(futureOutputs) == args.asyncN:
  225. frame = None # Skip the frame
  226. else:
  227. framesQueue.queue.clear() # Skip the rest of frames
  228. except queue.Empty:
  229. pass
  230. if not frame is None:
  231. frameHeight = frame.shape[0]
  232. frameWidth = frame.shape[1]
  233. # Create a 4D blob from a frame.
  234. inpWidth = args.width if args.width else frameWidth
  235. inpHeight = args.height if args.height else frameHeight
  236. blob = cv.dnn.blobFromImage(frame, size=(inpWidth, inpHeight), swapRB=args.rgb, ddepth=cv.CV_8U)
  237. processedFramesQueue.put(frame)
  238. # Run a model
  239. net.setInput(blob, scalefactor=args.scale, mean=args.mean)
  240. if net.getLayer(0).outputNameToIndex('im_info') != -1: # Faster-RCNN or R-FCN
  241. frame = cv.resize(frame, (inpWidth, inpHeight))
  242. net.setInput(np.array([[inpHeight, inpWidth, 1.6]], dtype=np.float32), 'im_info')
  243. if args.asyncN:
  244. futureOutputs.append(net.forwardAsync())
  245. else:
  246. outs = net.forward(outNames)
  247. predictionsQueue.put(np.copy(outs))
  248. while futureOutputs and futureOutputs[0].wait_for(0):
  249. out = futureOutputs[0].get()
  250. predictionsQueue.put(np.copy([out]))
  251. del futureOutputs[0]
  252. framesThread = Thread(target=framesThreadBody)
  253. framesThread.start()
  254. processingThread = Thread(target=processingThreadBody)
  255. processingThread.start()
  256. #
  257. # Postprocessing and rendering loop
  258. #
  259. while cv.waitKey(1) < 0:
  260. try:
  261. # Request prediction first because they put after frames
  262. outs = predictionsQueue.get_nowait()
  263. frame = processedFramesQueue.get_nowait()
  264. postprocess(frame, outs)
  265. # Put efficiency information.
  266. if predictionsQueue.counter > 1:
  267. label = 'Camera: %.2f FPS' % (framesQueue.getFPS())
  268. cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))
  269. label = 'Network: %.2f FPS' % (predictionsQueue.getFPS())
  270. cv.putText(frame, label, (0, 30), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))
  271. label = 'Skipped frames: %d' % (framesQueue.counter - predictionsQueue.counter)
  272. cv.putText(frame, label, (0, 45), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))
  273. cv.imshow(winName, frame)
  274. except queue.Empty:
  275. pass
  276. process = False
  277. framesThread.join()
  278. processingThread.join()