123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171 |
- #!/usr/bin/env python
- '''
- The sample demonstrates how to train a Random Trees classifier
- (or a Boosting, MLP, KNearest, or Support Vector Machine classifier) using the provided dataset.
- We use the sample database letter-recognition.data
- from the UCI Repository; here is the reference:
- Newman, D.J. & Hettich, S. & Blake, C.L. & Merz, C.J. (1998).
- UCI Repository of machine learning databases
- [http://www.ics.uci.edu/~mlearn/MLRepository.html].
- Irvine, CA: University of California, Department of Information and Computer Science.
- The dataset consists of 20000 feature vectors along with the
- responses - capital Latin letters A..Z.
- The first 10000 samples are used for training
- and the remaining 10000 - to test the classifier.
- ======================================================
- Models: RTrees, KNearest, Boost, SVM, MLP
- '''
- # Python 2/3 compatibility
- from __future__ import print_function
- import numpy as np
- import cv2 as cv
def load_base(fn):
    '''Parse a comma-separated letter dataset into features and labels.

    The first column of every row is a single character; it is replaced
    by its offset from 'A' (so 'A' -> 0, 'B' -> 1, ...).  All remaining
    columns are read as numbers.

    fn -- file name (or anything else np.loadtxt accepts as a source).

    Returns a (samples, responses) pair of float32 arrays: the feature
    matrix (every column but the first) and the encoded first column.
    '''
    def letter_to_index(ch):
        return ord(ch) - ord('A')

    table = np.loadtxt(fn, np.float32, delimiter=',',
                       converters={0: letter_to_index})
    features = table[:, 1:]
    labels = table[:, 0]
    return features, labels
class LetterStatModel(object):
    '''Shared base for the letter classifiers.

    Subclasses store their underlying model object in ``self.model``; this
    base class adds persistence helpers and the "unrolling" utilities that
    turn a class_n-way problem into per-class rows / indicator vectors.
    '''
    class_n = 26        # number of distinct labels (letters A..Z -> 0..25)
    train_ratio = 0.5   # fraction of the dataset used for training

    def load(self, fn):
        '''Replace the wrapped model with the one stored in file fn.

        The model's ``load`` is a factory in the OpenCV 3+/4 Python
        bindings: it returns the loaded model instead of updating the
        receiver, so the result has to be stored back.  If the call
        returns None (a binding that loads in place), the current model
        object is kept.
        '''
        loaded = self.model.load(fn)
        if loaded is not None:
            self.model = loaded

    def save(self, fn):
        '''Write the wrapped model to file fn.'''
        self.model.save(fn)

    def unroll_samples(self, samples):
        '''Pair every sample with every candidate class index.

        samples -- 2-D array of shape (sample_n, var_n).

        Returns a float32 array of shape (sample_n * class_n, var_n + 1):
        each input row is repeated class_n times in a row, and the extra
        last column cycles through 0 .. class_n - 1.
        '''
        sample_n, var_n = samples.shape
        new_samples = np.zeros((sample_n * self.class_n, var_n + 1), np.float32)
        new_samples[:, :-1] = np.repeat(samples, self.class_n, axis=0)
        new_samples[:, -1] = np.tile(np.arange(self.class_n), sample_n)
        return new_samples

    def unroll_responses(self, responses):
        '''Turn class labels into a flat 0/1 indicator vector.

        responses -- 1-D sequence of class indices in 0 .. class_n - 1.

        Returns an int32 vector of length len(responses) * class_n in which
        element i * class_n + responses[i] is 1 and everything else is 0.
        '''
        sample_n = len(responses)
        new_responses = np.zeros(sample_n * self.class_n, np.int32)
        # Offset of each sample's slot group plus the label inside the group.
        resp_idx = np.int32(responses + np.arange(sample_n) * self.class_n)
        new_responses[resp_idx] = 1
        return new_responses
class RTrees(LetterStatModel):
    '''Letter classifier wrapping a model from cv.ml.RTrees_create().'''

    def __init__(self):
        self.model = cv.ml.RTrees_create()

    def train(self, samples, responses):
        '''Train on row-major samples with the maximum depth set to 20.

        The responses are cast to int before being handed to the model.
        '''
        forest = self.model
        forest.setMaxDepth(20)
        int_labels = responses.astype(int)
        forest.train(samples, cv.ml.ROW_SAMPLE, int_labels)

    def predict(self, samples):
        '''Return the model's predictions for samples as a flat array.'''
        _status, predictions = self.model.predict(samples)
        return predictions.ravel()
class KNearest(LetterStatModel):
    '''Letter classifier wrapping a model from cv.ml.KNearest_create().'''

    def __init__(self):
        self.model = cv.ml.KNearest_create()

    def train(self, samples, responses):
        '''Train on row-major samples; responses are passed through as given.'''
        self.model.train(samples, cv.ml.ROW_SAMPLE, responses)

    def predict(self, samples):
        '''Return the findNearest results (k = 10) for samples as a flat array.'''
        found = self.model.findNearest(samples, k=10)
        # findNearest yields (retval, results, neighbour responses, distances);
        # only the results are of interest here.
        _retval, labels, _neigh_resp, _dists = found
        return labels.ravel()
class Boost(LetterStatModel):
    '''Letter classifier wrapping a model from cv.ml.Boost_create().

    The class_n-way problem is unrolled: every sample is paired with each
    candidate class index, and the model is trained on a 0/1 response that
    says whether the candidate is the sample's actual class.
    '''

    def __init__(self):
        self.model = cv.ml.Boost_create()

    def train(self, samples, responses):
        '''Train on the unrolled samples and indicator responses.

        Uses 15 weak classifiers with a maximum depth of 10.
        '''
        _rows, feature_count = samples.shape
        unrolled_x = self.unroll_samples(samples)
        unrolled_y = self.unroll_responses(responses)

        # Original features are numerical; the appended class-index column
        # and the response itself are categorical.
        kinds = [cv.ml.VAR_NUMERICAL] * feature_count
        kinds = kinds + [cv.ml.VAR_CATEGORICAL, cv.ml.VAR_CATEGORICAL]
        var_types = np.array(kinds, np.uint8)

        booster = self.model
        booster.setWeakCount(15)
        booster.setMaxDepth(10)
        train_data = cv.ml.TrainData_create(unrolled_x, cv.ml.ROW_SAMPLE,
                                            unrolled_y.astype(int),
                                            varType=var_types)
        booster.train(train_data)

    def predict(self, samples):
        '''Return, per sample, the candidate class with the largest response.'''
        unrolled_x = self.unroll_samples(samples)
        _status, raw = self.model.predict(unrolled_x)
        # One row per original sample, one column per candidate class.
        per_class = raw.ravel().reshape(-1, self.class_n)
        return per_class.argmax(1)
class SVM(LetterStatModel):
    '''Letter classifier wrapping a model from cv.ml.SVM_create().'''

    def __init__(self):
        self.model = cv.ml.SVM_create()

    def train(self, samples, responses):
        '''Configure a C-SVC with an RBF kernel (C = 1, gamma = .1) and train.

        The responses are cast to int before being handed to the model.
        '''
        machine = self.model
        machine.setType(cv.ml.SVM_C_SVC)
        machine.setC(1)
        machine.setKernel(cv.ml.SVM_RBF)
        machine.setGamma(.1)
        int_labels = responses.astype(int)
        machine.train(samples, cv.ml.ROW_SAMPLE, int_labels)

    def predict(self, samples):
        '''Return the model's predictions for samples as a flat array.'''
        _status, predictions = self.model.predict(samples)
        return predictions.ravel()
class MLP(LetterStatModel):
    '''Letter classifier wrapping a model from cv.ml.ANN_MLP_create().'''

    def __init__(self):
        self.model = cv.ml.ANN_MLP_create()

    def train(self, samples, responses):
        '''Train the network with backpropagation on one-hot targets.

        Layer sizes are [var_n, 100, 100, class_n]; training uses momentum
        scale 0, weight scale 0.001, a 20-iteration count criterion and the
        symmetric sigmoid activation with parameters (2, 1).
        '''
        _rows, feature_count = samples.shape
        # One row per sample, one indicator column per class.
        one_hot = self.unroll_responses(responses).reshape(-1, self.class_n)
        topology = np.int32([feature_count, 100, 100, self.class_n])

        net = self.model
        net.setLayerSizes(topology)
        net.setTrainMethod(cv.ml.ANN_MLP_BACKPROP)
        net.setBackpropMomentumScale(0)
        net.setBackpropWeightScale(0.001)
        net.setTermCriteria((cv.TERM_CRITERIA_COUNT, 20, 0.01))
        net.setActivationFunction(cv.ml.ANN_MLP_SIGMOID_SYM, 2, 1)

        targets = np.float32(one_hot)
        net.train(samples, cv.ml.ROW_SAMPLE, targets)

    def predict(self, samples):
        '''Return, per sample, the index of the strongest network output.'''
        _status, outputs = self.model.predict(samples)
        return outputs.argmax(-1)
- from tests_common import NewOpenCVTests
class letter_recog_test(NewOpenCVTests):
    '''Trains every letter classifier on the letter-recognition dataset.'''

    def test_letter_recog(self):
        '''Train each model on the leading part of the data and score it.

        For every model the leading train_ratio share of the dataset is
        used for training; the hit rate (fraction of exact label matches)
        is then measured on that part and on the remainder and compared
        with the stored reference figures.
        '''
        eps = 0.01

        # Reference (train, test) figures per model.
        # NOTE(review): these values are on a percent scale, whereas the
        # rates computed below are fractions in [0, 1]; as written, both
        # assertLess checks hold for any accuracy.  Confirm the intended
        # scale, tolerance and comparison direction before tightening them.
        reference = {RTrees: (98.930000, 92.390000), KNearest: (94.960000, 92.010000),
                     Boost: (85.970000, 74.920000), SVM: (99.780000, 95.680000),
                     MLP: (90.060000, 87.410000)}

        # The dataset is the same for every model, so parse it only once
        # instead of re-reading the file on each loop iteration.
        samples, responses = load_base(self.repoPath + '/samples/data/letter-recognition.data')

        # A fixed tuple keeps the training order deterministic.
        for Model in (RTrees, KNearest, Boost, SVM, MLP):
            classifier = Model()
            train_n = int(len(samples) * classifier.train_ratio)

            classifier.train(samples[:train_n], responses[:train_n])
            train_rate = np.mean(classifier.predict(samples[:train_n]) == responses[:train_n].astype(int))
            test_rate = np.mean(classifier.predict(samples[train_n:]) == responses[train_n:].astype(int))

            self.assertLess(train_rate - reference[Model][0], eps)
            self.assertLess(test_rate - reference[Model][1], eps)
# When executed as a script, hand control to NewOpenCVTests.bootstrap()
# (imported from tests_common).
if __name__ == '__main__':
    NewOpenCVTests.bootstrap()
|