// end_to_end_recognition.cpp
  /*
   * end_to_end_recognition.cpp
   *
   * A demo program of End-to-end Scene Text Detection and Recognition:
   * Shows the use of the Tesseract OCR API with the Extremal Region Filter algorithm described in:
   * Neumann L., Matas J.: Real-Time Scene Text Localization and Recognition, CVPR 2012
   *
   * Created on: Jul 31, 2014
   * Author: Lluis Gomez i Bigorda <lgomez AT cvc.uab.es>
   */
  #include <algorithm>
  #include <climits>
  #include <iostream>
  #include <string>
  #include <vector>

  #include "opencv2/text.hpp"
  #include "opencv2/core/utility.hpp"
  #include "opencv2/highgui.hpp"
  #include "opencv2/imgproc.hpp"

  using namespace std;
  using namespace cv;
  using namespace cv::text;
  19. //Calculate edit distance between two words
  20. size_t edit_distance(const string& A, const string& B);
  21. size_t min(size_t x, size_t y, size_t z);
  22. bool isRepetitive(const string& s);
  23. bool sort_by_length(const string &a, const string &b);
  24. //Draw ER's in an image via floodFill
  25. void er_draw(vector<Mat> &channels, vector<vector<ERStat> > &regions, vector<Vec2i> group, Mat& segmentation);
  26. //Perform text detection and recognition and evaluate results using edit distance
  27. int main(int argc, char* argv[])
  28. {
  29. cout << endl << argv[0] << endl << endl;
  30. cout << "A demo program of End-to-end Scene Text Detection and Recognition: " << endl;
  31. cout << "Shows the use of the Tesseract OCR API with the Extremal Region Filter algorithm described in:" << endl;
  32. cout << "Neumann L., Matas J.: Real-Time Scene Text Localization and Recognition, CVPR 2012" << endl << endl;
  33. Mat image;
  34. if(argc>1)
  35. image = imread(argv[1]);
  36. else
  37. {
  38. cout << " Usage: " << argv[0] << " <input_image> [<gt_word1> ... <gt_wordN>]" << endl;
  39. return(0);
  40. }
  41. cout << "IMG_W=" << image.cols << endl;
  42. cout << "IMG_H=" << image.rows << endl;
  43. /*Text Detection*/
  44. // Extract channels to be processed individually
  45. vector<Mat> channels;
  46. Mat grey;
  47. cvtColor(image,grey,COLOR_RGB2GRAY);
  48. // Notice here we are only using grey channel, see textdetection.cpp for example with more channels
  49. channels.push_back(grey);
  50. channels.push_back(255-grey);
  51. double t_d = (double)getTickCount();
  52. // Create ERFilter objects with the 1st and 2nd stage default classifiers
  53. Ptr<ERFilter> er_filter1 = createERFilterNM1(loadClassifierNM1("trained_classifierNM1.xml"),8,0.00015f,0.13f,0.2f,true,0.1f);
  54. Ptr<ERFilter> er_filter2 = createERFilterNM2(loadClassifierNM2("trained_classifierNM2.xml"),0.5);
  55. vector<vector<ERStat> > regions(channels.size());
  56. // Apply the default cascade classifier to each independent channel (could be done in parallel)
  57. for (int c=0; c<(int)channels.size(); c++)
  58. {
  59. er_filter1->run(channels[c], regions[c]);
  60. er_filter2->run(channels[c], regions[c]);
  61. }
  62. cout << "TIME_REGION_DETECTION = " << ((double)getTickCount() - t_d)*1000/getTickFrequency() << endl;
  63. Mat out_img_decomposition= Mat::zeros(image.rows+2, image.cols+2, CV_8UC1);
  64. vector<Vec2i> tmp_group;
  65. for (int i=0; i<(int)regions.size(); i++)
  66. {
  67. for (int j=0; j<(int)regions[i].size();j++)
  68. {
  69. tmp_group.push_back(Vec2i(i,j));
  70. }
  71. Mat tmp= Mat::zeros(image.rows+2, image.cols+2, CV_8UC1);
  72. er_draw(channels, regions, tmp_group, tmp);
  73. if (i > 0)
  74. tmp = tmp / 2;
  75. out_img_decomposition = out_img_decomposition | tmp;
  76. tmp_group.clear();
  77. }
  78. double t_g = (double)getTickCount();
  79. // Detect character groups
  80. vector< vector<Vec2i> > nm_region_groups;
  81. vector<Rect> nm_boxes;
  82. erGrouping(image, channels, regions, nm_region_groups, nm_boxes,ERGROUPING_ORIENTATION_HORIZ);
  83. cout << "TIME_GROUPING = " << ((double)getTickCount() - t_g)*1000/getTickFrequency() << endl;
  84. /*Text Recognition (OCR)*/
  85. double t_r = (double)getTickCount();
  86. Ptr<OCRTesseract> ocr = OCRTesseract::create();
  87. cout << "TIME_OCR_INITIALIZATION = " << ((double)getTickCount() - t_r)*1000/getTickFrequency() << endl;
  88. string output;
  89. Mat out_img;
  90. Mat out_img_detection;
  91. Mat out_img_segmentation = Mat::zeros(image.rows+2, image.cols+2, CV_8UC1);
  92. image.copyTo(out_img);
  93. image.copyTo(out_img_detection);
  94. float scale_img = 600.f/image.rows;
  95. float scale_font = (float)(2-scale_img)/1.4f;
  96. vector<string> words_detection;
  97. t_r = (double)getTickCount();
  98. for (int i=0; i<(int)nm_boxes.size(); i++)
  99. {
  100. rectangle(out_img_detection, nm_boxes[i].tl(), nm_boxes[i].br(), Scalar(0,255,255), 3);
  101. Mat group_img = Mat::zeros(image.rows+2, image.cols+2, CV_8UC1);
  102. er_draw(channels, regions, nm_region_groups[i], group_img);
  103. Mat group_segmentation;
  104. group_img.copyTo(group_segmentation);
  105. //image(nm_boxes[i]).copyTo(group_img);
  106. group_img(nm_boxes[i]).copyTo(group_img);
  107. copyMakeBorder(group_img,group_img,15,15,15,15,BORDER_CONSTANT,Scalar(0));
  108. vector<Rect> boxes;
  109. vector<string> words;
  110. vector<float> confidences;
  111. ocr->run(group_img, output, &boxes, &words, &confidences, OCR_LEVEL_WORD);
  112. output.erase(remove(output.begin(), output.end(), '\n'), output.end());
  113. //cout << "OCR output = \"" << output << "\" length = " << output.size() << endl;
  114. if (output.size() < 3)
  115. continue;
  116. for (int j=0; j<(int)boxes.size(); j++)
  117. {
  118. boxes[j].x += nm_boxes[i].x-15;
  119. boxes[j].y += nm_boxes[i].y-15;
  120. //cout << " word = " << words[j] << "\t confidence = " << confidences[j] << endl;
  121. if ((words[j].size() < 2) || (confidences[j] < 51) ||
  122. ((words[j].size()==2) && (words[j][0] == words[j][1])) ||
  123. ((words[j].size()< 4) && (confidences[j] < 60)) ||
  124. isRepetitive(words[j]))
  125. continue;
  126. words_detection.push_back(words[j]);
  127. rectangle(out_img, boxes[j].tl(), boxes[j].br(), Scalar(255,0,255),3);
  128. Size word_size = getTextSize(words[j], FONT_HERSHEY_SIMPLEX, (double)scale_font, (int)(3*scale_font), NULL);
  129. rectangle(out_img, boxes[j].tl()-Point(3,word_size.height+3), boxes[j].tl()+Point(word_size.width,0), Scalar(255,0,255),-1);
  130. putText(out_img, words[j], boxes[j].tl()-Point(1,1), FONT_HERSHEY_SIMPLEX, scale_font, Scalar(255,255,255),(int)(3*scale_font));
  131. out_img_segmentation = out_img_segmentation | group_segmentation;
  132. }
  133. }
  134. cout << "TIME_OCR = " << ((double)getTickCount() - t_r)*1000/getTickFrequency() << endl;
  135. /* Recognition evaluation with (approximate) Hungarian matching and edit distances */
  136. if(argc>2)
  137. {
  138. int num_gt_characters = 0;
  139. vector<string> words_gt;
  140. for (int i=2; i<argc; i++)
  141. {
  142. string s = string(argv[i]);
  143. if (s.size() > 0)
  144. {
  145. words_gt.push_back(string(argv[i]));
  146. //cout << " GT word " << words_gt[words_gt.size()-1] << endl;
  147. num_gt_characters += (int)(words_gt[words_gt.size()-1].size());
  148. }
  149. }
  150. if (words_detection.empty())
  151. {
  152. //cout << endl << "number of characters in gt = " << num_gt_characters << endl;
  153. cout << "TOTAL_EDIT_DISTANCE = " << num_gt_characters << endl;
  154. cout << "EDIT_DISTANCE_RATIO = 1" << endl;
  155. }
  156. else
  157. {
  158. sort(words_gt.begin(),words_gt.end(),sort_by_length);
  159. int max_dist=0;
  160. vector< vector<int> > assignment_mat;
  161. for (int i=0; i<(int)words_gt.size(); i++)
  162. {
  163. vector<int> assignment_row(words_detection.size(),0);
  164. assignment_mat.push_back(assignment_row);
  165. for (int j=0; j<(int)words_detection.size(); j++)
  166. {
  167. assignment_mat[i][j] = (int)(edit_distance(words_gt[i],words_detection[j]));
  168. max_dist = max(max_dist,assignment_mat[i][j]);
  169. }
  170. }
  171. vector<int> words_detection_matched;
  172. int total_edit_distance = 0;
  173. int tp=0, fp=0, fn=0;
  174. for (int search_dist=0; search_dist<=max_dist; search_dist++)
  175. {
  176. for (int i=0; i<(int)assignment_mat.size(); i++)
  177. {
  178. int min_dist_idx = (int)distance(assignment_mat[i].begin(),
  179. min_element(assignment_mat[i].begin(),assignment_mat[i].end()));
  180. if (assignment_mat[i][min_dist_idx] == search_dist)
  181. {
  182. //cout << " GT word \"" << words_gt[i] << "\" best match \"" << words_detection[min_dist_idx] << "\" with dist " << assignment_mat[i][min_dist_idx] << endl;
  183. if(search_dist == 0)
  184. tp++;
  185. else { fp++; fn++; }
  186. total_edit_distance += assignment_mat[i][min_dist_idx];
  187. words_detection_matched.push_back(min_dist_idx);
  188. words_gt.erase(words_gt.begin()+i);
  189. assignment_mat.erase(assignment_mat.begin()+i);
  190. for (int j=0; j<(int)assignment_mat.size(); j++)
  191. {
  192. assignment_mat[j][min_dist_idx]=INT_MAX;
  193. }
  194. i--;
  195. }
  196. }
  197. }
  198. for (int j=0; j<(int)words_gt.size(); j++)
  199. {
  200. //cout << " GT word \"" << words_gt[j] << "\" no match found" << endl;
  201. fn++;
  202. total_edit_distance += (int)words_gt[j].size();
  203. }
  204. for (int j=0; j<(int)words_detection.size(); j++)
  205. {
  206. if (find(words_detection_matched.begin(),words_detection_matched.end(),j) == words_detection_matched.end())
  207. {
  208. //cout << " Detection word \"" << words_detection[j] << "\" no match found" << endl;
  209. fp++;
  210. total_edit_distance += (int)words_detection[j].size();
  211. }
  212. }
  213. //cout << endl << "number of characters in gt = " << num_gt_characters << endl;
  214. cout << "TOTAL_EDIT_DISTANCE = " << total_edit_distance << endl;
  215. cout << "EDIT_DISTANCE_RATIO = " << (float)total_edit_distance / num_gt_characters << endl;
  216. cout << "TP = " << tp << endl;
  217. cout << "FP = " << fp << endl;
  218. cout << "FN = " << fn << endl;
  219. }
  220. }
  221. //resize(out_img_detection,out_img_detection,Size(image.cols*scale_img,image.rows*scale_img),0,0,INTER_LINEAR_EXACT);
  222. //imshow("detection", out_img_detection);
  223. //imwrite("detection.jpg", out_img_detection);
  224. //resize(out_img,out_img,Size(image.cols*scale_img,image.rows*scale_img),0,0,INTER_LINEAR_EXACT);
  225. namedWindow("recognition",WINDOW_NORMAL);
  226. imshow("recognition", out_img);
  227. waitKey(0);
  228. //imwrite("recognition.jpg", out_img);
  229. //imwrite("segmentation.jpg", out_img_segmentation);
  230. //imwrite("decomposition.jpg", out_img_decomposition);
  231. return 0;
  232. }
  // Smallest of three values, built from the two-argument std::min.
  size_t min(size_t x, size_t y, size_t z)
  {
      const size_t smaller_of_xy = std::min(x, y);
      return std::min(smaller_of_xy, z);
  }
  // Calculate the edit distance between two words: the minimum number of
  // single-character insertions, deletions and substitutions that turn A into B.
  //
  // Only two rows of the dynamic-programming table are kept at a time:
  // `above` is the row for the previous prefix of A, `current` the one being filled.
  size_t edit_distance(const string& A, const string& B)
  {
      const size_t len_a = A.size();
      const size_t len_b = B.size();

      vector<size_t> above(len_b + 1);
      vector<size_t> current(len_b + 1);

      // Distance from the empty prefix of A to each prefix of B.
      for (size_t col = 0; col <= len_b; ++col)
          above[col] = col;

      for (size_t row = 1; row <= len_a; ++row)
      {
          // Distance from a prefix of A to the empty prefix of B.
          current[0] = row;
          for (size_t col = 1; col <= len_b; ++col)
          {
              const size_t via_deletion = above[col] + 1;
              const size_t via_insertion = current[col - 1] + 1;
              const size_t via_substitution = above[col - 1] + (A[row - 1] == B[col - 1] ? 0 : 1);
              current[col] = std::min(std::min(via_deletion, via_insertion), via_substitution);
          }
          above.swap(current);
      }

      // After the final swap the last computed row lives in `above`.
      return above[len_b];
  }
  // Returns true when the characters 'i', 'l' and 'I' together make up more
  // than (length + 1) / 2 (integer division) of the characters of s.
  // An empty string yields false.
  bool isRepetitive(const string& s)
  {
      size_t hits = 0;
      for (string::const_iterator ch = s.begin(); ch != s.end(); ++ch)
      {
          switch (*ch)
          {
              case 'i':
              case 'l':
              case 'I':
                  ++hits;
                  break;
              default:
                  break;
          }
      }
      return hits > (s.size() + 1) / 2;
  }
  272. void er_draw(vector<Mat> &channels, vector<vector<ERStat> > &regions, vector<Vec2i> group, Mat& segmentation)
  273. {
  274. for (int r=0; r<(int)group.size(); r++)
  275. {
  276. ERStat er = regions[group[r][0]][group[r][1]];
  277. if (er.parent != NULL) // deprecate the root region
  278. {
  279. int newMaskVal = 255;
  280. int flags = 4 + (newMaskVal << 8) + FLOODFILL_FIXED_RANGE + FLOODFILL_MASK_ONLY;
  281. floodFill(channels[group[r][0]],segmentation,Point(er.pixel%channels[group[r][0]].cols,er.pixel/channels[group[r][0]].cols),
  282. Scalar(255),0,Scalar(er.level),Scalar(0),flags);
  283. }
  284. }
  285. }
  // Ordering predicate for sort(): true when a is strictly longer than b,
  // which sorts strings from longest to shortest.
  bool sort_by_length(const string &a, const string &b)
  {
      return b.size() < a.size();
  }