tr_icdar_benchmark.cpp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522
  1. /*M///////////////////////////////////////////////////////////////////////////////////////
  2. //
  3. // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
  4. //
  5. // By downloading, copying, installing or using the software you agree to this license.
  6. // If you do not agree to this license, do not download, install,
  7. // copy or use the software.
  8. //
  9. //
  10. // License Agreement
  11. // For Open Source Computer Vision Library
  12. //
  13. // Copyright (C) 2014, Itseez Inc, all rights reserved.
  14. // Third party copyrights are property of their respective owners.
  15. //
  16. // Redistribution and use in source and binary forms, with or without modification,
  17. // are permitted provided that the following conditions are met:
  18. //
  19. // * Redistribution's of source code must retain the above copyright notice,
  20. // this list of conditions and the following disclaimer.
  21. //
  22. // * Redistribution's in binary form must reproduce the above copyright notice,
  23. // this list of conditions and the following disclaimer in the documentation
  24. // and/or other materials provided with the distribution.
  25. //
  26. // * The name of the copyright holders may not be used to endorse or promote products
  27. // derived from this software without specific prior written permission.
  28. //
  29. // This software is provided by the copyright holders and contributors "as is" and
  30. // any express or implied warranties, including, but not limited to, the implied
  31. // warranties of merchantability and fitness for a particular purpose are disclaimed.
  32. // In no event shall the Itseez Inc or contributors be liable for any direct,
  33. // indirect, incidental, special, exemplary, or consequential damages
  34. // (including, but not limited to, procurement of substitute goods or services;
  35. // loss of use, data, or profits; or business interruption) however caused
  36. // and on any theory of liability, whether in contract, strict liability,
  37. // or tort (including negligence or otherwise) arising in any way out of
  38. // the use of this software, even if advised of the possibility of such damage.
  39. //
  40. //M*/
  41. #include <iostream>
  42. #include <opencv2/opencv_modules.hpp>
  43. #ifdef HAVE_OPENCV_TEXT
  44. #include "opencv2/datasets/tr_icdar.hpp"
  45. #include <opencv2/core.hpp>
  46. #include "opencv2/text.hpp"
  47. #include "opencv2/imgproc.hpp"
  48. #include "opencv2/imgcodecs.hpp"
  #include <algorithm> // std::min, std::find, std::transform, std::remove
  #include <cctype>    // std::toupper, std::isalnum
  #include <cstdio>
  #include <cstdlib> // atoi
  #include <string>
  #include <vector>
  53. using namespace std;
  54. using namespace cv;
  55. using namespace cv::datasets;
  56. using namespace cv::text;
  57. // Edit distance between two words (unit-cost insertion, deletion and substitution)
  58. size_t edit_distance(const string& A, const string& B);
  59. size_t min(size_t x, size_t y, size_t z); // smallest of three values
  60. bool isRepetitive(const string& s); // true when 'i', 'l' and 'I' exceed (size+1)/2 of the characters
  61. bool sort_by_length(const string &a, const string &b); // true when a is longer than b
  62. // Draw the ERs of one region group into a mask image via floodFill
  63. void er_draw(vector<Mat> &channels, vector<vector<ERStat> > &regions, vector<Vec2i> group, Mat& segmentation);
  // Returns the smallest of three unsigned values.
  // std::min is spelled out explicitly so the two-argument calls never depend on a
  // using-directive being in effect, nor on overload resolution against this
  // three-argument function that shares the name.
  size_t min(size_t x, size_t y, size_t z)
  {
      return std::min(x, std::min(y, z));
  }
  // Computes the edit distance between A and B: the minimum number of single-character
  // insertions, deletions and substitutions (each costing 1) that turn A into B.
  //
  // Only the previous and the current row of the dynamic-programming table are kept,
  // so the memory used is proportional to B.size() instead of A.size() * B.size().
  size_t edit_distance(const std::string& A, const std::string& B)
  {
      const size_t NA = A.size();
      const size_t NB = B.size();

      // previous[b] holds the distance between the first (a-1) chars of A and the
      // first b chars of B; it starts as the row for the empty prefix of A.
      std::vector<size_t> previous(NB + 1);
      std::vector<size_t> current(NB + 1);
      for (size_t b = 0; b <= NB; ++b)
          previous[b] = b;

      for (size_t a = 1; a <= NA; ++a)
      {
          current[0] = a; // turning a chars into the empty string takes a deletions
          for (size_t b = 1; b <= NB; ++b)
          {
              const size_t deletion     = previous[b] + 1;
              const size_t insertion    = current[b - 1] + 1;
              const size_t substitution = previous[b - 1] + (A[a - 1] == B[b - 1] ? 0 : 1);
              current[b] = std::min(deletion, std::min(insertion, substitution));
          }
          previous.swap(current);
      }
      // After the final swap the last computed row lives in `previous`.
      return previous[NB];
  }
  // Ordering predicate: true when `a` has strictly more characters than `b`.
  bool sort_by_length(const std::string &a, const std::string &b)
  {
      const std::string::size_type length_a = a.size();
      const std::string::size_type length_b = b.size();
      return length_b < length_a;
  }
  // Reports whether the characters 'i', 'l' and 'I' together occur more than
  // (length + 1) / 2 times (integer division) in `s`.
  bool isRepetitive(const std::string& s)
  {
      const int length = (int)s.size();
      int hits = 0;
      for (std::string::const_iterator it = s.begin(); it != s.end(); ++it)
      {
          switch (*it)
          {
          case 'i':
          case 'l':
          case 'I':
              ++hits;
              break;
          default:
              break;
          }
      }
      return hits > (length + 1) / 2;
  }
  104. void er_draw(vector<Mat> &channels, vector<vector<ERStat> > &regions, vector<Vec2i> group, Mat& segmentation)
  105. {
  106. for (int r=0; r<(int)group.size(); r++)
  107. {
  108. ERStat er = regions[group[r][0]][group[r][1]];
  109. if (er.parent != NULL) // deprecate the root region
  110. {
  111. int newMaskVal = 255;
  112. int flags = 4 + (newMaskVal << 8) + FLOODFILL_FIXED_RANGE + FLOODFILL_MASK_ONLY;
  113. floodFill(channels[group[r][0]],segmentation,Point(er.pixel%channels[group[r][0]].cols,er.pixel/channels[group[r][0]].cols),
  114. Scalar(255),0,Scalar(er.level),Scalar(0),flags);
  115. }
  116. }
  117. }
  // Upper-cases a single character. std::toupper is int -> int, so this char -> char
  // adapter is what gets handed to std::transform over a std::string.
  // The argument goes through unsigned char first: handing std::toupper a negative
  // value other than EOF (possible where plain char is signed) is undefined behaviour.
  static char char_toupper(char ch)
  {
      return static_cast<char>(std::toupper(static_cast<unsigned char>(ch)));
  }
  123. int main(int argc, char *argv[])
  124. {
  125. const char *keys =
  126. "{ help h usage ? | | show this message }"
  127. "{ path p |true| path to dataset root folder }"
  128. "{ ws wordspotting| | evaluate \"word spotting\" results }"
  129. "{ lex lexicon |1 | 0:no-lexicon, 1:100-words, 2:full-lexicon }";
  130. CommandLineParser parser(argc, argv, keys);
  131. string path(parser.get<string>("path"));
  132. if (parser.has("help") || path=="true")
  133. {
  134. parser.printMessage();
  135. return -1;
  136. }
  137. bool is_word_spotting = parser.has("ws");
  138. int selected_lex = parser.get<int>("lex");
  139. if ((selected_lex < 0) || (selected_lex > 2))
  140. {
  141. parser.printMessage();
  142. printf("Unsupported lex value.\n");
  143. return -1;
  144. }
  145. // loading train & test images description
  146. Ptr<TR_icdar> dataset = TR_icdar::create();
  147. dataset->load(path);
  148. vector<double> f1Each;
  149. unsigned int correctNum = 0;
  150. unsigned int returnedNum = 0;
  151. unsigned int returnedCorrectNum = 0;
  152. vector< Ptr<Object> >& test = dataset->getTest();
  153. unsigned int num = 0;
  154. for (vector< Ptr<Object> >::iterator itT=test.begin(); itT!=test.end(); ++itT)
  155. {
  156. TR_icdarObj *example = static_cast<TR_icdarObj *>((*itT).get());
  157. num++;
  158. printf("processed image: %u, name: %s\n", num, example->fileName.c_str());
  159. vector<string> empty_lexicon;
  160. vector<string> *lex;
  161. switch (selected_lex)
  162. {
  163. case 0:
  164. lex = &empty_lexicon;
  165. break;
  166. case 2:
  167. lex = &example->lexFull;
  168. break;
  169. default:
  170. lex = &example->lex100;
  171. break;
  172. }
  173. correctNum += example->words.size();
  174. unsigned int correctNumEach = example->words.size();
  175. // Take care of dontcare regions t.value == "###"
  176. for (size_t w=0; w<example->words.size(); w++)
  177. {
  178. string w_upper = example->words[w].value;
  179. transform(w_upper.begin(), w_upper.end(), w_upper.begin(), char_toupper);
  180. if ((find (lex->begin(), lex->end(), w_upper) == lex->end()) &&
  181. (is_word_spotting) && (selected_lex != 0))
  182. example->words[w].value = "###";
  183. if ( (example->words[w].value == "###") || (example->words[w].value.size()<3) )
  184. {
  185. correctNum --;
  186. correctNumEach --;
  187. }
  188. }
  189. unsigned int returnedNumEach = 0;
  190. unsigned int returnedCorrectNumEach = 0;
  191. Mat image = imread((path+"/test/"+example->fileName).c_str());
  192. /*Text Detection*/
  193. // Extract channels to be processed individually
  194. vector<Mat> channels;
  195. Mat grey;
  196. cvtColor(image,grey,COLOR_RGB2GRAY);
  197. // Notice here we are only using grey channel, see textdetection.cpp for example with more channels
  198. channels.push_back(grey);
  199. channels.push_back(255-grey);
  200. // Create ERFilter objects with the 1st and 2nd sworde default classifiers
  201. Ptr<ERFilter> er_filter1 = createERFilterNM1(loadClassifierNM1("trained_classifierNM1.xml"),8,0.00015f,0.13f,0.2f,true,0.1f);
  202. Ptr<ERFilter> er_filter2 = createERFilterNM2(loadClassifierNM2("trained_classifierNM2.xml"),0.5);
  203. vector<vector<ERStat> > regions(channels.size());
  204. // Apply the default cascade classifier to each independent channel (could be done in parallel)
  205. for (int c=0; c<(int)channels.size(); c++)
  206. {
  207. er_filter1->run(channels[c], regions[c]);
  208. er_filter2->run(channels[c], regions[c]);
  209. }
  210. // Detect character groups
  211. vector< vector<Vec2i> > nm_region_groups;
  212. vector<Rect> nm_boxes;
  213. erGrouping(image, channels, regions, nm_region_groups, nm_boxes, ERGROUPING_ORIENTATION_HORIZ);
  214. /*Text Recognition (OCR)*/
  215. Ptr<OCRTesseract> ocr = OCRTesseract::create();
  216. bool ocr_is_tesseract = true;
  217. vector<string> final_words;
  218. vector<Rect> final_boxes;
  219. vector<float> final_confs;
  220. for (int i=0; i<(int)nm_boxes.size(); i++)
  221. {
  222. Mat group_img = Mat::zeros(image.rows+2, image.cols+2, CV_8UC1);
  223. er_draw(channels, regions, nm_region_groups[i], group_img);
  224. if (ocr_is_tesseract)
  225. {
  226. group_img(nm_boxes[i]).copyTo(group_img);
  227. copyMakeBorder(group_img,group_img,15,15,15,15,BORDER_CONSTANT,Scalar(0));
  228. } else {
  229. group_img(Rect(1,1,image.cols,image.rows)).copyTo(group_img);
  230. }
  231. string output;
  232. vector<Rect> boxes;
  233. vector<string> words;
  234. vector<float> confidences;
  235. ocr->run(grey, group_img, output, &boxes, &words, &confidences, OCR_LEVEL_WORD);
  236. output.erase(remove(output.begin(), output.end(), '\n'), output.end());
  237. //cout << "OCR output = \"" << output << "\" length = " << output.size() << endl;
  238. if (output.size() < 3)
  239. continue;
  240. for (int j=0; j<(int)boxes.size(); j++)
  241. {
  242. if (ocr_is_tesseract)
  243. {
  244. boxes[j].x += nm_boxes[i].x-15;
  245. boxes[j].y += nm_boxes[i].y-15;
  246. }
  247. float min_confidence = (ocr_is_tesseract)? (float)51. : (float)0.;
  248. float min_confidence4 = (ocr_is_tesseract)? (float)60. : (float)0.;
  249. //cout << " word = " << words[j] << "\t confidence = " << confidences[j] << endl;
  250. if ((words[j].size() < 2) || (confidences[j] < min_confidence) ||
  251. ((words[j].size()==2) && (words[j][0] == words[j][1])) ||
  252. ((words[j].size()< 4) && (confidences[j] < min_confidence4)) ||
  253. isRepetitive(words[j]))
  254. {
  255. continue;
  256. }
  257. std::transform(words[j].begin(), words[j].end(), words[j].begin(), char_toupper);
  258. /* Increase confidence of predicted words matching a word in the lexicon */
  259. if (lex->size() > 0)
  260. {
  261. if (find(lex->begin(), lex->end(), words[j]) == lex->end())
  262. confidences[j] = 200;
  263. }
  264. final_words.push_back(words[j]);
  265. final_boxes.push_back(boxes[j]);
  266. final_confs.push_back(confidences[j]);
  267. }
  268. }
  269. /* Non Maximal Suppression using OCR confidence */
  270. float thr = 0.5;
  271. for (size_t i=0; i<final_words.size(); )
  272. {
  273. int to_delete = -1;
  274. for (size_t j=i+1; j<final_words.size(); )
  275. {
  276. to_delete = -1;
  277. Rect intersection = final_boxes[i] & final_boxes[j];
  278. float IoU = (float)intersection.area() / (final_boxes[i].area() + final_boxes[j].area() - intersection.area());
  279. if ((IoU > thr) || (intersection.area() > 0.8*final_boxes[i].area()) || (intersection.area() > 0.8*final_boxes[j].area()))
  280. {
  281. // if regions overlap more than thr delete the one with lower confidence
  282. to_delete = (final_confs[i] < final_confs[j]) ? i : j;
  283. if (to_delete == (int)j )
  284. {
  285. final_words.erase(final_words.begin()+j);
  286. final_boxes.erase(final_boxes.begin()+j);
  287. final_confs.erase(final_confs.begin()+j);
  288. continue;
  289. } else {
  290. break;
  291. }
  292. }
  293. j++;
  294. }
  295. if (to_delete == (int)i )
  296. {
  297. final_words.erase(final_words.begin()+i);
  298. final_boxes.erase(final_boxes.begin()+i);
  299. final_confs.erase(final_confs.begin()+i);
  300. continue;
  301. }
  302. i++;
  303. }
  304. /* Predicted words which are not in the lexicon are filtered
  305. or changed to match one (when edit distance ratio < 0.34)*/
  306. float max_edit_distance_ratio = (float)0.34;
  307. for (size_t j=0; j<final_boxes.size(); j++)
  308. {
  309. if (lex->size() > 0)
  310. {
  311. if (find(lex->begin(), lex->end(), final_words[j]) == lex->end())
  312. {
  313. int best_match = -1;
  314. int best_dist = final_words[j].size();
  315. for (size_t l=0; l<lex->size(); l++)
  316. {
  317. int dist = edit_distance(lex->at(l),final_words[j]);
  318. if (dist < best_dist)
  319. {
  320. best_match = l;
  321. best_dist = dist;
  322. }
  323. }
  324. if (best_dist/final_words[j].size() < max_edit_distance_ratio)
  325. final_words[j] = lex->at(best_match);
  326. else
  327. continue;
  328. }
  329. }
  330. if ((find (lex->begin(), lex->end(), final_words[j])
  331. == lex->end()) && (is_word_spotting) && (selected_lex != 0))
  332. continue;
  333. // Output final recognition in csv format compatible with the ICDAR Competition
  334. /*cout << final_boxes[j].tl().x << ","
  335. << final_boxes[j].tl().y << ","
  336. << min(final_boxes[j].br().x,image.cols-2)
  337. << "," << final_boxes[j].tl().y << ","
  338. << min(final_boxes[j].br().x,image.cols-2) << ","
  339. << min(final_boxes[j].br().y,image.rows-2) << ","
  340. << final_boxes[j].tl().x << ","
  341. << min(final_boxes[j].br().y,image.rows-2) << ","
  342. << final_words[j] << endl ;*/
  343. returnedNum++;
  344. returnedNumEach++;
  345. bool matched = false;
  346. for (vector<word>::iterator it=example->words.begin(); it!=example->words.end(); ++it)
  347. {
  348. word &t = (*it);
  349. // ICDAR protocol accepts recognition up to the first non alphanumeric char
  350. string alnum_value = t.value;
  351. for (size_t c=0; c<alnum_value.size(); c++)
  352. {
  353. if (!isalnum(alnum_value[c]))
  354. {
  355. alnum_value = alnum_value.substr(0,c);
  356. break;
  357. }
  358. }
  359. std::transform(t.value.begin(), t.value.end(), t.value.begin(), char_toupper);
  360. if (((t.value==final_words[j]) || (alnum_value==final_words[j])) &&
  361. !(final_boxes[j].tl().x > t.x+t.width || final_boxes[j].br().x < t.x ||
  362. final_boxes[j].tl().y > t.y+t.height || final_boxes[j].br().y < t.y))
  363. {
  364. matched = true;
  365. returnedCorrectNum++;
  366. returnedCorrectNumEach++;
  367. //cout << "OK!" << endl;
  368. break;
  369. }
  370. }
  371. if (!matched) // Take care of dontcare regions t.value == "###"
  372. for (vector<word>::iterator it=example->words.begin(); it!=example->words.end(); ++it)
  373. {
  374. word &t = (*it);
  375. std::transform(t.value.begin(), t.value.end(), t.value.begin(), char_toupper);
  376. if ((t.value == "###") &&
  377. !(final_boxes[j].tl().x > t.x+t.width || final_boxes[j].br().x < t.x ||
  378. final_boxes[j].tl().y > t.y+t.height || final_boxes[j].br().y < t.y))
  379. {
  380. matched = true;
  381. returnedNum--;
  382. returnedNumEach--;
  383. //cout << "DontCare!" << endl;
  384. break;
  385. }
  386. }
  387. //if (!matched) cout << "FAIL." << endl;
  388. }
  389. double p = 0.0;
  390. if (0 != returnedNumEach)
  391. {
  392. p = 1.0*returnedCorrectNumEach/returnedNumEach;
  393. }
  394. double r = 0.0;
  395. if (0 != correctNumEach)
  396. {
  397. r = 1.0*returnedCorrectNumEach/correctNumEach;
  398. }
  399. double f1 = 0.0;
  400. if (0 != p+r)
  401. {
  402. f1 = 2*(p*r)/(p+r);
  403. }
  404. if ( (correctNumEach == 0) && (returnedNumEach == 0) )
  405. {
  406. p = 1.;
  407. r = 1.;
  408. f1 = 1.;
  409. }
  410. //printf("|%f|%f|%f|\n",r,p,f1);
  411. f1Each.push_back(f1);
  412. }
  413. double p = 1.0*returnedCorrectNum/returnedNum;
  414. double r = 1.0*returnedCorrectNum/correctNum;
  415. double f1 = 2*(p*r)/(p+r);
  416. printf("\n-------------------------------------------------------------------------\n");
  417. printf("ICDAR2015 -- Challenge 2: \"Focused Scene Text\" -- Task 4 \"End-to-End\"\n");
  418. if (is_word_spotting) printf(" Word spotting results -- ");
  419. else printf(" End-to-End recognition results -- ");
  420. switch (selected_lex)
  421. {
  422. case 0:
  423. printf("generic recognition (no given lexicon)\n");
  424. break;
  425. case 2:
  426. printf("weakly contextualized lexicon (624 words)\n");
  427. break;
  428. default:
  429. printf("strongly contextualized lexicon (100 words)\n");
  430. break;
  431. }
  432. printf(" Recall: %f | Precision: %f | F-score: %f\n", r, p, f1);
  433. printf("-------------------------------------------------------------------------\n\n");
  434. /*double mf1 = 0.0;
  435. for (vector<double>::iterator it=f1Each.begin(); it!=f1Each.end(); ++it)
  436. {
  437. mf1 += *it;
  438. }
  439. mf1 /= f1Each.size();
  440. printf("mean f1: %f\n", mf1);*/
  441. return 0;
  442. }
  443. #else
  // Fallback entry point used when OpenCV was built without the text module:
  // reports that on stderr and exits with status 0.
  int main()
  {
      const char *const message = "OpenCV was built without text module";
      std::cerr << message;
      std::cerr << std::endl;
      return 0;
  }
  449. #endif // HAVE_OPENCV_TEXT