webcam_demo.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408
  1. /*
  2. * webcam-demo.cpp
  3. *
  4. * A demo program of End-to-end Scene Text Detection and Recognition using webcam or video.
  5. *
  6. * Created on: Jul 31, 2014
  7. * Author: Lluis Gomez i Bigorda <lgomez AT cvc.uab.es>
  8. */
  9. #include "opencv2/text.hpp"
  10. #include "opencv2/highgui.hpp"
  11. #include "opencv2/imgproc.hpp"
  12. #include "opencv2/features2d.hpp"
  13. #include <iostream>
  14. using namespace std;
  15. using namespace cv;
  16. using namespace cv::text;
  17. //ERStat extraction is done in parallel for different channels
  18. class Parallel_extractCSER: public cv::ParallelLoopBody
  19. {
  20. private:
  21. vector<Mat> &channels;
  22. vector< vector<ERStat> > &regions;
  23. vector< Ptr<ERFilter> > er_filter1;
  24. vector< Ptr<ERFilter> > er_filter2;
  25. public:
  26. Parallel_extractCSER(vector<Mat> &_channels, vector< vector<ERStat> > &_regions,
  27. vector<Ptr<ERFilter> >_er_filter1, vector<Ptr<ERFilter> >_er_filter2)
  28. : channels(_channels),regions(_regions),er_filter1(_er_filter1),er_filter2(_er_filter2) {}
  29. virtual void operator()( const cv::Range &r ) const CV_OVERRIDE
  30. {
  31. for (int c=r.start; c < r.end; c++)
  32. {
  33. er_filter1[c]->run(channels[c], regions[c]);
  34. er_filter2[c]->run(channels[c], regions[c]);
  35. }
  36. }
  37. Parallel_extractCSER & operator=(const Parallel_extractCSER &a);
  38. };
  39. //OCR recognition is done in parallel for different detections
  40. template <class T>
  41. class Parallel_OCR: public cv::ParallelLoopBody
  42. {
  43. private:
  44. vector<Mat> &detections;
  45. vector<string> &outputs;
  46. vector< vector<Rect> > &boxes;
  47. vector< vector<string> > &words;
  48. vector< vector<float> > &confidences;
  49. vector< Ptr<T> > &ocrs;
  50. public:
  51. Parallel_OCR(vector<Mat> &_detections, vector<string> &_outputs, vector< vector<Rect> > &_boxes,
  52. vector< vector<string> > &_words, vector< vector<float> > &_confidences,
  53. vector< Ptr<T> > &_ocrs)
  54. : detections(_detections), outputs(_outputs), boxes(_boxes), words(_words),
  55. confidences(_confidences), ocrs(_ocrs)
  56. {}
  57. virtual void operator()( const cv::Range &r ) const CV_OVERRIDE
  58. {
  59. for (int c=r.start; c < r.end; c++)
  60. {
  61. ocrs[c%ocrs.size()]->run(detections[c], outputs[c], &boxes[c], &words[c], &confidences[c], OCR_LEVEL_WORD);
  62. }
  63. }
  64. Parallel_OCR & operator=(const Parallel_OCR &a);
  65. };
  66. //Discard wrongly recognised strings
  67. bool isRepetitive(const string& s);
  68. //Draw ER's in an image via floodFill
  69. void er_draw(vector<Mat> &channels, vector<vector<ERStat> > &regions, vector<Vec2i> group, Mat& segmentation);
  // Option specification passed to CommandLineParser in main(): a positional
  // input (default "0") and an optional "image" / "i" option.
  const char* keys =
      "{@input | 0 | camera index or video file name}"
      "{ image i | | specify input image}";
  75. //Perform text detection and recognition from webcam or video
  76. int main(int argc, char* argv[])
  77. {
  78. CommandLineParser parser(argc, argv, keys);
  79. cout << "A demo program of End-to-end Scene Text Detection and Recognition using webcam or video." << endl << endl;
  80. cout << " Keys: " << endl;
  81. cout << " Press 'r' to switch between MSER/CSER regions." << endl;
  82. cout << " Press 'g' to switch between Horizontal and Arbitrary oriented grouping." << endl;
  83. cout << " Press 'o' to switch between OCRTesseract/OCRHMMDecoder recognition." << endl;
  84. cout << " Press 's' to scale down frame size to 320x240." << endl;
  85. cout << " Press 'ESC' to exit." << endl << endl;
  86. parser.printMessage();
  87. VideoCapture cap;
  88. Mat frame, image, gray, out_img;
  89. String input = parser.get<String>("@input");
  90. String image_file_name = parser.get<String>("image");
  91. if (image_file_name != "")
  92. {
  93. image = imread(image_file_name);
  94. if (image.empty())
  95. {
  96. cout << "\nunable to open " << image_file_name << "\nprogram terminated!\n";
  97. return 1;
  98. }
  99. else
  100. {
  101. cout << "\nimage " << image_file_name << " loaded!\n";
  102. frame = image.clone();
  103. }
  104. }
  105. else
  106. {
  107. cout << "\nInitializing capturing... ";
  108. if (input.size() == 1 && isdigit(input[0]))
  109. cap.open(input[0] - '0');
  110. else
  111. cap.open(input);
  112. if (!cap.isOpened())
  113. {
  114. cout << "\nCould not initialize capturing!\n";
  115. return 1;
  116. }
  117. cout << " Done!" << endl;
  118. cap.read(frame);
  119. }
  120. namedWindow("recognition",WINDOW_NORMAL);
  121. imshow("recognition", frame);
  122. waitKey(1);
  123. bool downsize = false;
  124. int REGION_TYPE = 1;
  125. int GROUPING_ALGORITHM = 0;
  126. int RECOGNITION = 0;
  127. String region_types_str[2] = {"ERStats", "MSER"};
  128. String grouping_algorithms_str[2] = {"exhaustive_search", "multioriented"};
  129. String recognitions_str[2] = {"Tesseract", "NM_chain_features + KNN"};
  130. vector<Mat> channels;
  131. vector<vector<ERStat> > regions(2); //two channels
  132. // Create ERFilter objects with the 1st and 2nd stage default classifiers
  133. // since er algorithm is not reentrant we need one filter for channel
  134. vector< Ptr<ERFilter> > er_filters1;
  135. vector< Ptr<ERFilter> > er_filters2;
  136. for (int i=0; i<2; i++)
  137. {
  138. Ptr<ERFilter> er_filter1 = createERFilterNM1(loadClassifierNM1("trained_classifierNM1.xml"),8,0.00015f,0.13f,0.2f,true,0.1f);
  139. Ptr<ERFilter> er_filter2 = createERFilterNM2(loadClassifierNM2("trained_classifierNM2.xml"),0.5);
  140. er_filters1.push_back(er_filter1);
  141. er_filters2.push_back(er_filter2);
  142. }
  143. //Initialize OCR engine (we initialize 10 instances in order to work several recognitions in parallel)
  144. cout << "Initializing OCR engines ... ";
  145. int num_ocrs = 10;
  146. vector< Ptr<OCRTesseract> > ocrs;
  147. for (int o=0; o<num_ocrs; o++)
  148. {
  149. ocrs.push_back(OCRTesseract::create());
  150. }
  151. Mat transition_p;
  152. string filename = "OCRHMM_transitions_table.xml";
  153. FileStorage fs(filename, FileStorage::READ);
  154. fs["transition_probabilities"] >> transition_p;
  155. fs.release();
  156. Mat emission_p = Mat::eye(62,62,CV_64FC1);
  157. string voc = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
  158. vector< Ptr<OCRHMMDecoder> > decoders;
  159. for (int o=0; o<num_ocrs; o++)
  160. {
  161. decoders.push_back(OCRHMMDecoder::create(loadOCRHMMClassifierNM("OCRHMM_knn_model_data.xml.gz"),
  162. voc, transition_p, emission_p));
  163. }
  164. cout << " Done!" << endl;
  165. while ( true )
  166. {
  167. double t_all = (double)getTickCount();
  168. if (downsize)
  169. resize(frame,frame,Size(320,240),0,0,INTER_LINEAR_EXACT);
  170. /*Text Detection*/
  171. cvtColor(frame,gray,COLOR_BGR2GRAY);
  172. // Extract channels to be processed individually
  173. channels.clear();
  174. channels.push_back(gray);
  175. channels.push_back(255-gray);
  176. regions[0].clear();
  177. regions[1].clear();
  178. switch (REGION_TYPE)
  179. {
  180. case 0: // ERStats
  181. parallel_for_(cv::Range(0, (int)channels.size()), Parallel_extractCSER(channels, regions, er_filters1, er_filters2));
  182. break;
  183. case 1: // MSER
  184. vector<vector<Point> > contours;
  185. vector<Rect> bboxes;
  186. Ptr<MSER> mser = MSER::create(21, (int)(0.00002*gray.cols*gray.rows), (int)(0.05*gray.cols*gray.rows), 1, 0.7);
  187. mser->detectRegions(gray, contours, bboxes);
  188. //Convert the output of MSER to suitable input for the grouping/recognition algorithms
  189. if (contours.size() > 0)
  190. MSERsToERStats(gray, contours, regions);
  191. break;
  192. }
  193. // Detect character groups
  194. vector< vector<Vec2i> > nm_region_groups;
  195. vector<Rect> nm_boxes;
  196. switch (GROUPING_ALGORITHM)
  197. {
  198. case 0: // exhaustive_search
  199. erGrouping(frame, channels, regions, nm_region_groups, nm_boxes, ERGROUPING_ORIENTATION_HORIZ);
  200. break;
  201. case 1: //multioriented
  202. erGrouping(frame, channels, regions, nm_region_groups, nm_boxes, ERGROUPING_ORIENTATION_ANY, "./trained_classifier_erGrouping.xml", 0.5);
  203. break;
  204. }
  205. /*Text Recognition (OCR)*/
  206. int bottom_bar_height= out_img.rows/7 ;
  207. copyMakeBorder(frame, out_img, 0, bottom_bar_height, 0, 0, BORDER_CONSTANT, Scalar(150, 150, 150));
  208. float scale_font = (float)(bottom_bar_height /85.0);
  209. vector<string> words_detection;
  210. float min_confidence1 = 0.f, min_confidence2 = 0.f;
  211. if (RECOGNITION == 0)
  212. {
  213. min_confidence1 = 51.f;
  214. min_confidence2 = 60.f;
  215. }
  216. vector<Mat> detections;
  217. for (int i=0; i<(int)nm_boxes.size(); i++)
  218. {
  219. rectangle(out_img, nm_boxes[i].tl(), nm_boxes[i].br(), Scalar(255,255,0),3);
  220. Mat group_img = Mat::zeros(frame.rows+2, frame.cols+2, CV_8UC1);
  221. er_draw(channels, regions, nm_region_groups[i], group_img);
  222. group_img(nm_boxes[i]).copyTo(group_img);
  223. copyMakeBorder(group_img,group_img,15,15,15,15,BORDER_CONSTANT,Scalar(0));
  224. detections.push_back(group_img);
  225. }
  226. vector<string> outputs((int)detections.size());
  227. vector< vector<Rect> > boxes((int)detections.size());
  228. vector< vector<string> > words((int)detections.size());
  229. vector< vector<float> > confidences((int)detections.size());
  230. // parallel process detections in batches of ocrs.size() (== num_ocrs)
  231. for (int i=0; i<(int)detections.size(); i=i+(int)num_ocrs)
  232. {
  233. Range r;
  234. if (i+(int)num_ocrs <= (int)detections.size())
  235. r = Range(i,i+(int)num_ocrs);
  236. else
  237. r = Range(i,(int)detections.size());
  238. switch(RECOGNITION)
  239. {
  240. case 0: // Tesseract
  241. parallel_for_(r, Parallel_OCR<OCRTesseract>(detections, outputs, boxes, words, confidences, ocrs));
  242. break;
  243. case 1: // NM_chain_features + KNN
  244. parallel_for_(r, Parallel_OCR<OCRHMMDecoder>(detections, outputs, boxes, words, confidences, decoders));
  245. break;
  246. }
  247. }
  248. for (int i=0; i<(int)detections.size(); i++)
  249. {
  250. outputs[i].erase(remove(outputs[i].begin(), outputs[i].end(), '\n'), outputs[i].end());
  251. //cout << "OCR output = \"" << outputs[i] << "\" length = " << outputs[i].size() << endl;
  252. if (outputs[i].size() < 3)
  253. continue;
  254. for (int j=0; j<(int)boxes[i].size(); j++)
  255. {
  256. boxes[i][j].x += nm_boxes[i].x-15;
  257. boxes[i][j].y += nm_boxes[i].y-15;
  258. //cout << " word = " << words[j] << "\t confidence = " << confidences[j] << endl;
  259. if ((words[i][j].size() < 2) || (confidences[i][j] < min_confidence1) ||
  260. ((words[i][j].size()==2) && (words[i][j][0] == words[i][j][1])) ||
  261. ((words[i][j].size()< 4) && (confidences[i][j] < min_confidence2)) ||
  262. isRepetitive(words[i][j]))
  263. continue;
  264. words_detection.push_back(words[i][j]);
  265. rectangle(out_img, boxes[i][j].tl(), boxes[i][j].br(), Scalar(255,0,255),3);
  266. Size word_size = getTextSize(words[i][j], FONT_HERSHEY_SIMPLEX, (double)scale_font, (int)(3*scale_font), NULL);
  267. rectangle(out_img, boxes[i][j].tl()-Point(3,word_size.height+3), boxes[i][j].tl()+Point(word_size.width,0), Scalar(255,0,255),-1);
  268. putText(out_img, words[i][j], boxes[i][j].tl()-Point(1,1), FONT_HERSHEY_SIMPLEX, scale_font, Scalar(255,255,255),(int)(3*scale_font));
  269. }
  270. }
  271. t_all = ((double)getTickCount() - t_all)*1000/getTickFrequency();
  272. int text_thickness = 1+(out_img.rows/500);
  273. string fps_info = format("%2.1f Fps. %dx%d", (float)(1000 / t_all), frame.cols, frame.rows);
  274. putText(out_img, fps_info, Point( 10,out_img.rows-5 ), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0), text_thickness);
  275. putText(out_img, region_types_str[REGION_TYPE], Point((int)(out_img.cols*0.5), out_img.rows - (int)(bottom_bar_height / 1.5)), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0), text_thickness);
  276. putText(out_img, grouping_algorithms_str[GROUPING_ALGORITHM], Point((int)(out_img.cols*0.5),out_img.rows-((int)(bottom_bar_height /3)+4) ), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0), text_thickness);
  277. putText(out_img, recognitions_str[RECOGNITION], Point((int)(out_img.cols*0.5),out_img.rows-5 ), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0), text_thickness);
  278. imshow("recognition", out_img);
  279. if ((image_file_name == "") && !cap.read(frame))
  280. {
  281. cout << "Capturing ended! press any key to exit." << endl;
  282. waitKey();
  283. return 0;
  284. }
  285. int key = waitKey(30); //wait for a key press
  286. switch (key)
  287. {
  288. case 27: //ESC
  289. cout << "ESC key pressed and exited." << endl;
  290. return 0;
  291. case 32: //SPACE
  292. imwrite("recognition_alt.jpg", out_img);
  293. break;
  294. case 103: //'g'
  295. GROUPING_ALGORITHM = (GROUPING_ALGORITHM+1)%2;
  296. cout << "Grouping switched to " << grouping_algorithms_str[GROUPING_ALGORITHM] << endl;
  297. break;
  298. case 111: //'o'
  299. RECOGNITION = (RECOGNITION+1)%2;
  300. cout << "OCR switched to " << recognitions_str[RECOGNITION] << endl;
  301. break;
  302. case 114: //'r'
  303. REGION_TYPE = (REGION_TYPE+1)%2;
  304. cout << "Regions switched to " << region_types_str[REGION_TYPE] << endl;
  305. break;
  306. case 115: //'s'
  307. downsize = !downsize;
  308. if (!image.empty())
  309. {
  310. frame = image.clone();
  311. }
  312. break;
  313. default:
  314. break;
  315. }
  316. }
  317. return 0;
  318. }
  // Heuristic used to discard wrongly recognised strings.
  //
  // Returns true when any of the following holds:
  //   - s is empty;
  //   - more than (size+1)/2 characters are 'i', 'l' or 'I';
  //   - every character equals the first character;
  //   - more than (size*2)/3 characters equal the last character.
  // Otherwise returns false.
  bool isRepetitive(const std::string& s)
  {
      // Reading s[size-1] on an empty string is out of bounds, so handle it
      // up front. An empty string is reported as repetitive, which is also
      // what the "all characters equal the first" rule yields for size 0.
      if (s.empty())
          return true;

      const int len = static_cast<int>(s.size());
      const char first = s[0];
      const char last = s[len - 1];

      int thin_count = 0;   // occurrences of 'i', 'l', 'I'
      int first_count = 0;  // characters equal to the first one
      int last_count = 0;   // characters equal to the last one
      for (int i = 0; i < len; i++)
      {
          const char ch = s[i];
          if (ch == 'i' || ch == 'l' || ch == 'I')
              thin_count++;
          if (ch == first)
              first_count++;
          if (ch == last)
              last_count++;
      }

      return (thin_count > (len + 1) / 2) ||
             (first_count == len) ||
             (last_count > (len * 2) / 3);
  }
  343. void er_draw(vector<Mat> &channels, vector<vector<ERStat> > &regions, vector<Vec2i> group, Mat& segmentation)
  344. {
  345. for (int r=0; r<(int)group.size(); r++)
  346. {
  347. ERStat er = regions[group[r][0]][group[r][1]];
  348. if (er.parent != NULL) // deprecate the root region
  349. {
  350. int newMaskVal = 255;
  351. int flags = 4 + (newMaskVal << 8) + FLOODFILL_FIXED_RANGE + FLOODFILL_MASK_ONLY;
  352. floodFill(channels[group[r][0]],segmentation,Point(er.pixel%channels[group[r][0]].cols,er.pixel/channels[group[r][0]].cols),
  353. Scalar(255),0,Scalar(er.level),Scalar(0),flags);
  354. }
  355. }
  356. }