Renesas / opencv-lib

Dependents:   RZ_A2M_Mbed_samples


kmeans_index.h

/***********************************************************************
 * Software License Agreement (BSD License)
 *
 * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
 * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
 *
 * THE BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *************************************************************************/

#ifndef OPENCV_FLANN_KMEANS_INDEX_H_
#define OPENCV_FLANN_KMEANS_INDEX_H_

#include <algorithm>
#include <map>
#include <cassert>
#include <limits>
#include <cmath>

#include "general.h"
#include "nn_index.h"
#include "dist.h"
#include "matrix.h"
#include "result_set.h"
#include "heap.h"
#include "allocator.h"
#include "random.h"
#include "saving.h"
#include "logger.h"


namespace cvflann
{

struct KMeansIndexParams : public IndexParams
{
    KMeansIndexParams(int branching = 32, int iterations = 11,
                      flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM, float cb_index = 0.2 )
    {
        (*this)["algorithm"] = FLANN_INDEX_KMEANS;
        // branching factor
        (*this)["branching"] = branching;
        // max iterations to perform in one kmeans clustering (kmeans tree)
        (*this)["iterations"] = iterations;
        // algorithm used for picking the initial cluster centers for kmeans tree
        (*this)["centers_init"] = centers_init;
        // cluster boundary index. Used when searching the kmeans tree
        (*this)["cb_index"] = cb_index;
    }
};
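// Illustrative usage sketch: building a hierarchical k-means index over a float dataset
// and running a k-NN query with these parameters. It assumes the Matrix, L2 and
// SearchParams types from the companion headers matrix.h, dist.h and params.h, and the
// knnSearch() helper from nn_index.h; the data/query buffers and the dimensions n, d, m
// are the caller's responsibility.
//
//     float* data;       // n x d feature matrix, row-major
//     float* queries;    // m x d query matrix, row-major
//     const int knn = 5;
//     cvflann::Matrix<float> features(data, n, d);
//     cvflann::Matrix<float> query_set(queries, m, d);
//     cvflann::KMeansIndex< cvflann::L2<float> > index(
//         features, cvflann::KMeansIndexParams(32, 11, cvflann::FLANN_CENTERS_KMEANSPP, 0.2f));
//     index.buildIndex();
//     cvflann::Matrix<int> knn_indices(new int[m*knn], m, knn);
//     cvflann::Matrix<float> knn_dists(new float[m*knn], m, knn);
//     index.knnSearch(query_set, knn_indices, knn_dists, knn, cvflann::SearchParams(32));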


/**
 * Hierarchical kmeans index
 *
 * Contains a tree constructed through a hierarchical kmeans clustering
 * and other information for indexing a set of points for nearest-neighbour matching.
 */
template <typename Distance>
class KMeansIndex : public NNIndex<Distance>
{
public:
    typedef typename Distance::ElementType ElementType;
    typedef typename Distance::ResultType DistanceType;



    typedef void (KMeansIndex::* centersAlgFunction)(int, int*, int, int*, int&);

    /**
     * The function used for choosing the cluster centers.
     */
    centersAlgFunction chooseCenters;



    /**
     * Chooses the initial centers in the k-means clustering in a random manner.
     *
     * Params:
     *     k = number of centers
     *     indices = indices of the points in the dataset to choose from
     *     indices_length = length of the indices vector
     *     centers = array in which the indices of the chosen centers are stored
     *     centers_length = number of centers actually chosen
     */
    void chooseCentersRandom(int k, int* indices, int indices_length, int* centers, int& centers_length)
    {
        UniqueRandom r(indices_length);

        int index;
        for (index=0; index<k; ++index) {
            bool duplicate = true;
            int rnd;
            while (duplicate) {
                duplicate = false;
                rnd = r.next();
                if (rnd<0) {
                    centers_length = index;
                    return;
                }

                centers[index] = indices[rnd];

                for (int j=0; j<index; ++j) {
                    DistanceType sq = distance_(dataset_[centers[index]], dataset_[centers[j]], dataset_.cols);
                    if (sq<1e-16) {
                        duplicate = true;
                    }
                }
            }
        }

        centers_length = index;
    }


    /**
     * Chooses the initial centers in the k-means using Gonzales' algorithm
     * so that the centers are spaced apart from each other.
     *
     * Params:
     *     k = number of centers
     *     indices = indices of the points in the dataset to choose from
     *     indices_length = length of the indices vector
     *     centers = array in which the indices of the chosen centers are stored
     *     centers_length = number of centers actually chosen
     */
    void chooseCentersGonzales(int k, int* indices, int indices_length, int* centers, int& centers_length)
    {
        int n = indices_length;

        int rnd = rand_int(n);
        assert(rnd >=0 && rnd < n);

        centers[0] = indices[rnd];

        int index;
        for (index=1; index<k; ++index) {

            int best_index = -1;
            DistanceType best_val = 0;
            for (int j=0; j<n; ++j) {
                DistanceType dist = distance_(dataset_[centers[0]],dataset_[indices[j]],dataset_.cols);
                for (int i=1; i<index; ++i) {
                    DistanceType tmp_dist = distance_(dataset_[centers[i]],dataset_[indices[j]],dataset_.cols);
                    if (tmp_dist<dist) {
                        dist = tmp_dist;
                    }
                }
                if (dist>best_val) {
                    best_val = dist;
                    best_index = j;
                }
            }
            if (best_index!=-1) {
                centers[index] = indices[best_index];
            }
            else {
                break;
            }
        }
        centers_length = index;
    }


    /**
     * Chooses the initial centers in the k-means using the algorithm
     * proposed in the KMeans++ paper:
     * Arthur, David; Vassilvitskii, Sergei - k-means++: The Advantages of Careful Seeding
     *
     * Implementation of this function was converted from the one provided in Arthur's code.
     *
     * Params:
     *     k = number of centers
     *     indices = indices of the points in the dataset to choose from
     *     indices_length = length of the indices vector
     *     centers = array in which the indices of the chosen centers are stored
     *     centers_length = number of centers actually chosen
     */
    void chooseCentersKMeanspp(int k, int* indices, int indices_length, int* centers, int& centers_length)
    {
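        // k-means++ seeding in a nutshell: each new center is drawn with probability
        // proportional to D(x)^2, where D(x) is the distance from point x to its closest
        // already-chosen center. closestDistSq[] caches those squared distances,
        // currentPot is their sum, rand_double(currentPot) performs the proportional
        // draw, and numLocalTries repeats the draw keeping the candidate that yields
        // the lowest resulting potential.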
        int n = indices_length;

        double currentPot = 0;
        DistanceType* closestDistSq = new DistanceType[n];

        // Choose one random center and set the closestDistSq values
        int index = rand_int(n);
        assert(index >=0 && index < n);
        centers[0] = indices[index];

        for (int i = 0; i < n; i++) {
            closestDistSq[i] = distance_(dataset_[indices[i]], dataset_[indices[index]], dataset_.cols);
            closestDistSq[i] = ensureSquareDistance<Distance>( closestDistSq[i] );
            currentPot += closestDistSq[i];
        }


        const int numLocalTries = 1;

        // Choose each center
        int centerCount;
        for (centerCount = 1; centerCount < k; centerCount++) {

            // Repeat several trials
            double bestNewPot = -1;
            int bestNewIndex = -1;
            for (int localTrial = 0; localTrial < numLocalTries; localTrial++) {

                // Choose our center - have to be slightly careful to return a valid answer even accounting
                // for possible rounding errors
                double randVal = rand_double(currentPot);
                for (index = 0; index < n-1; index++) {
                    if (randVal <= closestDistSq[index]) break;
                    else randVal -= closestDistSq[index];
                }

                // Compute the new potential
                double newPot = 0;
                for (int i = 0; i < n; i++) {
                    DistanceType dist = distance_(dataset_[indices[i]], dataset_[indices[index]], dataset_.cols);
                    newPot += std::min( ensureSquareDistance<Distance>(dist), closestDistSq[i] );
                }

                // Store the best result
                if ((bestNewPot < 0)||(newPot < bestNewPot)) {
                    bestNewPot = newPot;
                    bestNewIndex = index;
                }
            }

            // Add the appropriate center
            centers[centerCount] = indices[bestNewIndex];
            currentPot = bestNewPot;
            for (int i = 0; i < n; i++) {
                DistanceType dist = distance_(dataset_[indices[i]], dataset_[indices[bestNewIndex]], dataset_.cols);
                closestDistSq[i] = std::min( ensureSquareDistance<Distance>(dist), closestDistSq[i] );
            }
        }

        centers_length = centerCount;

        delete[] closestDistSq;
    }



public:

    flann_algorithm_t getType () const
    {
        return FLANN_INDEX_KMEANS;
    }

    class KMeansDistanceComputer : public cv::ParallelLoopBody
    {
    public:
        KMeansDistanceComputer(Distance _distance, const Matrix<ElementType> & _dataset,
            const int _branching, const int* _indices, const Matrix<double> & _dcenters, const size_t _veclen,
            int* _count, int* _belongs_to, std::vector<DistanceType>& _radiuses, bool& _converged, cv::Mutex& _mtx)
            : distance(_distance)
            , dataset(_dataset)
            , branching(_branching)
            , indices(_indices)
            , dcenters(_dcenters)
            , veclen(_veclen)
            , count(_count)
            , belongs_to(_belongs_to)
            , radiuses(_radiuses)
            , converged(_converged)
            , mtx(_mtx)
        {
        }

        void operator()(const cv::Range& range) const
        {
            const int begin = range.start;
            const int end = range.end;

            for( int i = begin; i<end; ++i)
            {
                DistanceType sq_dist = distance(dataset[indices[i]], dcenters[0], veclen);
                int new_centroid = 0;
                for (int j=1; j<branching; ++j) {
                    DistanceType new_sq_dist = distance(dataset[indices[i]], dcenters[j], veclen);
                    if (sq_dist>new_sq_dist) {
                        new_centroid = j;
                        sq_dist = new_sq_dist;
                    }
                }
                if (sq_dist > radiuses[new_centroid]) {
                    radiuses[new_centroid] = sq_dist;
                }
                if (new_centroid != belongs_to[i]) {
                    count[belongs_to[i]]--;
                    count[new_centroid]++;
                    belongs_to[i] = new_centroid;
                    mtx.lock();
                    converged = false;
                    mtx.unlock();
                }
            }
        }

    private:
        Distance distance;
        const Matrix<ElementType>& dataset;
        const int branching;
        const int* indices;
        const Matrix<double>& dcenters;
        const size_t veclen;
        int* count;
        int* belongs_to;
        std::vector<DistanceType>& radiuses;
        bool& converged;
        cv::Mutex& mtx;
        KMeansDistanceComputer& operator=( const KMeansDistanceComputer & ) { return *this; }
    };
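    // KMeansDistanceComputer is the cv::parallel_for_ body used by computeClustering()
    // below: each worker reassigns its range of points to the closest of the `branching`
    // candidate centers, updates the per-cluster counts and radii, and clears the shared
    // `converged` flag (under the mutex) whenever a point changes cluster.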

    /**
     * Index constructor
     *
     * Params:
     *          inputData = dataset with the input features
     *          params = parameters passed to the hierarchical k-means algorithm
     */
    KMeansIndex(const Matrix<ElementType> & inputData, const IndexParams& params = KMeansIndexParams(),
                Distance d = Distance())
        : dataset_(inputData), index_params_(params), root_(NULL), indices_(NULL), distance_(d)
    {
        memoryCounter_ = 0;

        size_ = dataset_.rows;
        veclen_ = dataset_.cols;

        branching_ = get_param(params,"branching",32);
        iterations_ = get_param(params,"iterations",11);
        if (iterations_<0) {
            iterations_ = (std::numeric_limits<int>::max)();
        }
        centers_init_  = get_param(params,"centers_init",FLANN_CENTERS_RANDOM);

        if (centers_init_==FLANN_CENTERS_RANDOM) {
            chooseCenters = &KMeansIndex::chooseCentersRandom;
        }
        else if (centers_init_==FLANN_CENTERS_GONZALES) {
            chooseCenters = &KMeansIndex::chooseCentersGonzales;
        }
        else if (centers_init_==FLANN_CENTERS_KMEANSPP) {
            chooseCenters = &KMeansIndex::chooseCentersKMeanspp;
        }
        else {
            throw FLANNException("Unknown algorithm for choosing initial centers.");
        }
        cb_index_ = 0.4f;

    }


    KMeansIndex(const KMeansIndex&);
    KMeansIndex& operator=(const KMeansIndex&);


    /**
     * Index destructor.
     *
     * Release the memory used by the index.
     */
    virtual ~KMeansIndex()
    {
        if (root_ != NULL) {
            free_centers(root_);
        }
        if (indices_!=NULL) {
            delete[] indices_;
        }
    }

    /**
     *  Returns size of index.
     */
    size_t size() const
    {
        return size_;
    }

    /**
     * Returns the length of an index feature.
     */
    size_t veclen() const
    {
        return veclen_;
    }


    void set_cb_index( float index)
    {
        cb_index_ = index;
    }

    /**
     * Computes the index memory usage
     * Returns: memory used by the index
     */
    int usedMemory() const
    {
        return pool_.usedMemory+pool_.wastedMemory+memoryCounter_;
    }

    /**
     * Builds the index
     */
    void buildIndex()
    {
        if (branching_<2) {
            throw FLANNException("Branching factor must be at least 2");
        }

        indices_ = new int[size_];
        for (size_t i=0; i<size_; ++i) {
            indices_[i] = int(i);
        }

        root_ = pool_.allocate<KMeansNode>();
        std::memset(root_, 0, sizeof(KMeansNode));

        computeNodeStatistics(root_, indices_, (int)size_);
        computeClustering(root_, indices_, (int)size_, branching_,0);
    }


    void saveIndex(FILE* stream)
    {
        save_value(stream, branching_);
        save_value(stream, iterations_);
        save_value(stream, memoryCounter_);
        save_value(stream, cb_index_);
        save_value(stream, *indices_, (int)size_);

        save_tree(stream, root_);
    }


    void loadIndex(FILE* stream)
    {
        load_value(stream, branching_);
        load_value(stream, iterations_);
        load_value(stream, memoryCounter_);
        load_value(stream, cb_index_);
        if (indices_!=NULL) {
            delete[] indices_;
        }
        indices_ = new int[size_];
        load_value(stream, *indices_, size_);

        if (root_!=NULL) {
            free_centers(root_);
        }
        load_tree(stream, root_);

        index_params_["algorithm"] = getType ();
        index_params_["branching"] = branching_;
        index_params_["iterations"] = iterations_;
        index_params_["centers_init"] = centers_init_;
        index_params_["cb_index"] = cb_index_;

    }


    /**
     * Finds the set of nearest neighbors to vec. Their indices are stored inside
     * the result object.
     *
     * Params:
     *     result = the result object in which the indices of the nearest-neighbors are stored
     *     vec = the vector for which to search the nearest neighbors
     *     searchParams = parameters that influence the search algorithm (checks, cb_index)
     */
    void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams)
    {

        int maxChecks = get_param(searchParams,"checks",32);

        if (maxChecks==FLANN_CHECKS_UNLIMITED) {
            findExactNN(root_, result, vec);
        }
        else {
            // Priority queue storing intermediate branches in the best-bin-first search
            Heap<BranchSt>* heap = new Heap<BranchSt>((int)size_);

            int checks = 0;
            findNN(root_, result, vec, checks, maxChecks, heap);

            BranchSt branch;
            while (heap->popMin(branch) && (checks<maxChecks || !result.full())) {
                KMeansNodePtr node = branch.node;
                findNN(node, result, vec, checks, maxChecks, heap);
            }
            assert(result.full());

            delete heap;
        }

    }
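    // Illustrative sketch of calling findNeighbors() directly for a single query point
    // (the knnSearch() wrapper in nn_index.h follows the same pattern). It assumes the
    // KNNResultSet type from result_set.h with the capacity/init interface used by the
    // base class; passing FLANN_CHECKS_UNLIMITED as the checks value selects the exact
    // search path above.
    //
    //     int knn_indices[5];
    //     float knn_dists[5];
    //     cvflann::KNNResultSet<float> resultSet(5);
    //     resultSet.init(knn_indices, knn_dists);
    //     index.findNeighbors(resultSet, query_vec, cvflann::SearchParams(64));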

    /**
     * Clustering function that takes a cut in the hierarchical k-means
     * tree and returns the cluster centers of that clustering.
     * Params:
     *     centers = matrix that receives the cluster centers; its number of rows
     *               gives the number of clusters requested
     * Returns: number of cluster centers
     */
    int getClusterCenters(Matrix<DistanceType>& centers)
    {
        int numClusters = centers.rows;
        if (numClusters<1) {
            throw FLANNException("Number of clusters must be at least 1");
        }

        DistanceType variance;
        KMeansNodePtr* clusters = new KMeansNodePtr[numClusters];

        int clusterCount = getMinVarianceClusters(root_, clusters, numClusters, variance);

        Logger::info("Clusters requested: %d, returning %d\n",numClusters, clusterCount);

        for (int i=0; i<clusterCount; ++i) {
            DistanceType* center = clusters[i]->pivot;
            for (size_t j=0; j<veclen_; ++j) {
                centers[i][j] = center[j];
            }
        }
        delete[] clusters;

        return clusterCount;
    }
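    // Illustrative sketch (assumes the Matrix wrapper from matrix.h, float distances and
    // a feature length d equal to veclen()): extracting up to 100 cluster centers from a
    // cut through the built tree.
    //
    //     cvflann::Matrix<float> centers(new float[100*d], 100, d);
    //     int found = index.getClusterCenters(centers);   // found may be less than 100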

    IndexParams getParameters () const
    {
        return index_params_;
    }


private:
    /**
     * Structure representing a node in the hierarchical k-means tree.
     */
    struct KMeansNode
    {
        /**
         * The cluster center.
         */
        DistanceType* pivot;
        /**
         * The cluster radius.
         */
        DistanceType radius;
        /**
         * The cluster mean radius.
         */
        DistanceType mean_radius;
        /**
         * The cluster variance.
         */
        DistanceType variance;
        /**
         * The cluster size (number of points in the cluster)
         */
        int size;
        /**
         * Child nodes (only for non-terminal nodes)
         */
        KMeansNode** childs;
        /**
         * Node points (only for terminal nodes)
         */
        int* indices;
        /**
         * Level
         */
        int level;
    };
    typedef KMeansNode* KMeansNodePtr;

    /**
     * Alias definition for a nicer syntax.
     */
    typedef BranchStruct<KMeansNodePtr, DistanceType> BranchSt;




    void save_tree(FILE* stream, KMeansNodePtr node)
    {
        save_value(stream, *node);
        save_value(stream, *(node->pivot), (int)veclen_);
        if (node->childs==NULL) {
            int indices_offset = (int)(node->indices - indices_);
            save_value(stream, indices_offset);
        }
        else {
            for(int i=0; i<branching_; ++i) {
                save_tree(stream, node->childs[i]);
            }
        }
    }


    void load_tree(FILE* stream, KMeansNodePtr& node)
    {
        node = pool_.allocate<KMeansNode>();
        load_value(stream, *node);
        node->pivot = new DistanceType[veclen_];
        load_value(stream, *(node->pivot), (int)veclen_);
        if (node->childs==NULL) {
            int indices_offset;
            load_value(stream, indices_offset);
            node->indices = indices_ + indices_offset;
        }
        else {
            node->childs = pool_.allocate<KMeansNodePtr>(branching_);
            for(int i=0; i<branching_; ++i) {
                load_tree(stream, node->childs[i]);
            }
        }
    }


    /**
     * Helper function
     */
    void free_centers(KMeansNodePtr node)
    {
        delete[] node->pivot;
        if (node->childs!=NULL) {
            for (int k=0; k<branching_; ++k) {
                free_centers(node->childs[k]);
            }
        }
    }

    /**
     * Computes the statistics of a node (mean, radius, variance).
     *
     * Params:
     *     node = the node to use
     *     indices = the indices of the points belonging to the node
     */
    void computeNodeStatistics(KMeansNodePtr node, int* indices, int indices_length)
    {

        DistanceType radius = 0;
        DistanceType variance = 0;
        DistanceType* mean = new DistanceType[veclen_];
        memoryCounter_ += int(veclen_*sizeof(DistanceType));

        memset(mean,0,veclen_*sizeof(DistanceType));

        for (size_t i=0; i<size_; ++i) {
            ElementType* vec = dataset_[indices[i]];
            for (size_t j=0; j<veclen_; ++j) {
                mean[j] += vec[j];
            }
            variance += distance_(vec, ZeroIterator<ElementType>(), veclen_);
        }
        for (size_t j=0; j<veclen_; ++j) {
            mean[j] /= size_;
        }
        variance /= size_;
        variance -= distance_(mean, ZeroIterator<ElementType>(), veclen_);
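        // At this point `variance` holds mean(d(x, 0)) - d(mean, 0). For a squared
        // Euclidean distance functor this is the usual identity
        //     (1/n) * sum ||x||^2 - ||mean||^2 = (1/n) * sum ||x - mean||^2,
        // i.e. the mean squared deviation of the node's points from their centroid.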

        DistanceType tmp = 0;
        for (int i=0; i<indices_length; ++i) {
            tmp = distance_(mean, dataset_[indices[i]], veclen_);
            if (tmp>radius) {
                radius = tmp;
            }
        }

        node->variance = variance;
        node->radius = radius;
        node->pivot = mean;
    }


    /**
     * The method responsible for actually doing the recursive hierarchical
     * clustering
     *
     * Params:
     *     node = the node to cluster
     *     indices = indices of the points belonging to the current node
     *     branching = the branching factor to use in the clustering
     *
     * TODO: for 1-sized clusters don't store a cluster center (it's the same as the single cluster point)
     */
    void computeClustering(KMeansNodePtr node, int* indices, int indices_length, int branching, int level)
    {
        node->size = indices_length;
        node->level = level;

        if (indices_length < branching) {
            node->indices = indices;
            std::sort(node->indices,node->indices+indices_length);
            node->childs = NULL;
            return;
        }

        cv::AutoBuffer<int> centers_idx_buf(branching);
        int* centers_idx = (int*)centers_idx_buf;
        int centers_length;
        (this->*chooseCenters)(branching, indices, indices_length, centers_idx, centers_length);

        if (centers_length<branching) {
            node->indices = indices;
            std::sort(node->indices,node->indices+indices_length);
            node->childs = NULL;
            return;
        }


        cv::AutoBuffer<double> dcenters_buf(branching*veclen_);
        Matrix<double> dcenters((double*)dcenters_buf,branching,veclen_);
        for (int i=0; i<centers_length; ++i) {
            ElementType* vec = dataset_[centers_idx[i]];
            for (size_t k=0; k<veclen_; ++k) {
                dcenters[i][k] = double(vec[k]);
            }
        }

        std::vector<DistanceType> radiuses(branching);
        cv::AutoBuffer<int> count_buf(branching);
        int* count = (int*)count_buf;
        for (int i=0; i<branching; ++i) {
            radiuses[i] = 0;
            count[i] = 0;
        }

        //  assign points to clusters
        cv::AutoBuffer<int> belongs_to_buf(indices_length);
        int* belongs_to = (int*)belongs_to_buf;
        for (int i=0; i<indices_length; ++i) {

            DistanceType sq_dist = distance_(dataset_[indices[i]], dcenters[0], veclen_);
            belongs_to[i] = 0;
            for (int j=1; j<branching; ++j) {
                DistanceType new_sq_dist = distance_(dataset_[indices[i]], dcenters[j], veclen_);
                if (sq_dist>new_sq_dist) {
                    belongs_to[i] = j;
                    sq_dist = new_sq_dist;
                }
            }
            if (sq_dist>radiuses[belongs_to[i]]) {
                radiuses[belongs_to[i]] = sq_dist;
            }
            count[belongs_to[i]]++;
        }

        bool converged = false;
        int iteration = 0;
        while (!converged && iteration<iterations_) {
            converged = true;
            iteration++;

            // compute the new cluster centers
            for (int i=0; i<branching; ++i) {
                memset(dcenters[i],0,sizeof(double)*veclen_);
                radiuses[i] = 0;
            }
            for (int i=0; i<indices_length; ++i) {
                ElementType* vec = dataset_[indices[i]];
                double* center = dcenters[belongs_to[i]];
                for (size_t k=0; k<veclen_; ++k) {
                    center[k] += vec[k];
                }
            }
            for (int i=0; i<branching; ++i) {
                int cnt = count[i];
                for (size_t k=0; k<veclen_; ++k) {
                    dcenters[i][k] /= cnt;
                }
            }

            // reassign points to clusters
            cv::Mutex mtx;
            KMeansDistanceComputer invoker(distance_, dataset_, branching, indices, dcenters, veclen_, count, belongs_to, radiuses, converged, mtx);
            parallel_for_(cv::Range(0, (int)indices_length), invoker);

            for (int i=0; i<branching; ++i) {
                // if one cluster converges to an empty cluster,
                // move an element into that cluster
                if (count[i]==0) {
                    int j = (i+1)%branching;
                    while (count[j]<=1) {
                        j = (j+1)%branching;
                    }

                    for (int k=0; k<indices_length; ++k) {
                        if (belongs_to[k]==j) {
                            // for cluster j, we move the furthest element from the center to the empty cluster i
                            if ( distance_(dataset_[indices[k]], dcenters[j], veclen_) == radiuses[j] ) {
                                belongs_to[k] = i;
                                count[j]--;
                                count[i]++;
                                break;
                            }
                        }
                    }
                    converged = false;
                }
            }

        }

        DistanceType** centers = new DistanceType*[branching];

        for (int i=0; i<branching; ++i) {
            centers[i] = new DistanceType[veclen_];
            memoryCounter_ += (int)(veclen_*sizeof(DistanceType));
            for (size_t k=0; k<veclen_; ++k) {
                centers[i][k] = (DistanceType)dcenters[i][k];
            }
        }


        // compute kmeans clustering for each of the resulting clusters
        node->childs = pool_.allocate<KMeansNodePtr>(branching);
        int start = 0;
        int end = start;
        for (int c=0; c<branching; ++c) {
            int s = count[c];

            DistanceType variance = 0;
            DistanceType mean_radius =0;
            for (int i=0; i<indices_length; ++i) {
                if (belongs_to[i]==c) {
                    DistanceType d = distance_(dataset_[indices[i]], ZeroIterator<ElementType>(), veclen_);
                    variance += d;
                    mean_radius += sqrt(d);
                    std::swap(indices[i],indices[end]);
                    std::swap(belongs_to[i],belongs_to[end]);
                    end++;
                }
            }
            variance /= s;
            mean_radius /= s;
            variance -= distance_(centers[c], ZeroIterator<ElementType>(), veclen_);

            node->childs[c] = pool_.allocate<KMeansNode>();
            std::memset(node->childs[c], 0, sizeof(KMeansNode));
            node->childs[c]->radius = radiuses[c];
            node->childs[c]->pivot = centers[c];
            node->childs[c]->variance = variance;
            node->childs[c]->mean_radius = mean_radius;
            computeClustering(node->childs[c],indices+start, end-start, branching, level+1);
            start=end;
        }

        delete[] centers;
    }



    /**
     * Performs one descent in the hierarchical k-means tree. The branches not
     * visited are stored in a priority queue.
     *
     * Params:
     *      node = node to explore
     *      result = container for the k-nearest neighbors found
     *      vec = the query point
     *      checks = how many points in the dataset have been checked so far
     *      maxChecks = maximum dataset points to check
     */


    void findNN(KMeansNodePtr node, ResultSet<DistanceType>& result, const ElementType* vec, int& checks, int maxChecks,
                Heap<BranchSt>* heap)
    {
        // Ignore those clusters that are too far away
        {
            DistanceType bsq = distance_(vec, node->pivot, veclen_);
            DistanceType rsq = node->radius;
            DistanceType wsq = result.worstDist();

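            // bsq, rsq and wsq are squared quantities (query-to-pivot distance, cluster
            // radius, current worst result). The test below is an algebraic rearrangement
            // of sqrt(bsq) > sqrt(rsq) + sqrt(wsq) -- the triangle-inequality condition
            // under which no point of this cluster can improve on the current worst
            // neighbour -- written to avoid square roots: it holds iff
            // val = bsq - rsq - wsq > 0 and val^2 > 4*rsq*wsq.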
            DistanceType val = bsq-rsq-wsq;
            DistanceType val2 = val*val-4*rsq*wsq;

            //if (val>0) {
            if ((val>0)&&(val2>0)) {
                return;
            }
        }

        if (node->childs==NULL) {
            if (checks>=maxChecks) {
                if (result.full()) return;
            }
            checks += node->size;
            for (int i=0; i<node->size; ++i) {
                int index = node->indices[i];
                DistanceType dist = distance_(dataset_[index], vec, veclen_);
                result.addPoint(dist, index);
            }
        }
        else {
            DistanceType* domain_distances = new DistanceType[branching_];
            int closest_center = exploreNodeBranches(node, vec, domain_distances, heap);
            delete[] domain_distances;
            findNN(node->childs[closest_center],result,vec, checks, maxChecks, heap);
        }
    }

    /**
     * Helper function that computes the nearest child nodes of a node to a given query point.
     * Params:
     *     node = the node
     *     q = the query point
     *     distances = array with the distances to each child node.
     * Returns:
     */
    int exploreNodeBranches(KMeansNodePtr node, const ElementType* q, DistanceType* domain_distances, Heap<BranchSt>* heap)
    {

        int best_index = 0;
        domain_distances[best_index] = distance_(q, node->childs[best_index]->pivot, veclen_);
        for (int i=1; i<branching_; ++i) {
            domain_distances[i] = distance_(q, node->childs[i]->pivot, veclen_);
            if (domain_distances[i]<domain_distances[best_index]) {
                best_index = i;
            }
        }

        //      float* best_center = node->childs[best_index]->pivot;
        for (int i=0; i<branching_; ++i) {
            if (i != best_index) {
                domain_distances[i] -= cb_index_*node->childs[i]->variance;

                //              float dist_to_border = getDistanceToBorder(node.childs[i].pivot,best_center,q);
                //              if (domain_distances[i]<dist_to_border) {
                //                  domain_distances[i] = dist_to_border;
                //              }
                heap->insert(BranchSt(node->childs[i],domain_distances[i]));
            }
        }

        return best_index;
    }


    /**
     * Function that performs exact nearest neighbor search by traversing the entire tree.
     */
    void findExactNN(KMeansNodePtr node, ResultSet<DistanceType>& result, const ElementType* vec)
    {
        // Ignore those clusters that are too far away
        {
            DistanceType bsq = distance_(vec, node->pivot, veclen_);
            DistanceType rsq = node->radius;
            DistanceType wsq = result.worstDist();

            DistanceType val = bsq-rsq-wsq;
            DistanceType val2 = val*val-4*rsq*wsq;

            //                  if (val>0) {
            if ((val>0)&&(val2>0)) {
                return;
            }
        }


        if (node->childs==NULL) {
            for (int i=0; i<node->size; ++i) {
                int index = node->indices[i];
                DistanceType dist = distance_(dataset_[index], vec, veclen_);
                result.addPoint(dist, index);
            }
        }
        else {
            int* sort_indices = new int[branching_];

            getCenterOrdering(node, vec, sort_indices);

            for (int i=0; i<branching_; ++i) {
                findExactNN(node->childs[sort_indices[i]],result,vec);
            }

            delete[] sort_indices;
        }
    }


    /**
     * Helper function.
     *
     * It computes the order in which to traverse the child nodes of a particular node.
     */
    void getCenterOrdering(KMeansNodePtr node, const ElementType* q, int* sort_indices)
    {
        DistanceType* domain_distances = new DistanceType[branching_];
        for (int i=0; i<branching_; ++i) {
            DistanceType dist = distance_(q, node->childs[i]->pivot, veclen_);

            int j=0;
            while (domain_distances[j]<dist && j<i) j++;
            for (int k=i; k>j; --k) {
                domain_distances[k] = domain_distances[k-1];
                sort_indices[k] = sort_indices[k-1];
            }
            domain_distances[j] = dist;
            sort_indices[j] = i;
        }
        delete[] domain_distances;
    }

    /**
     * Method that computes the squared distance from the query point q,
     * which lies inside the region with center c, to the border between this
     * region and the region with center p.
     */
    DistanceType getDistanceToBorder(DistanceType* p, DistanceType* c, DistanceType* q)
    {
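        // The border between the two regions is the perpendicular bisector hyperplane of
        // the segment joining c and p, i.e. the points x with (c-p).(x - (c+p)/2) = 0.
        // The squared distance from q to that hyperplane is
        //     ((c-p).(q - (c+p)/2))^2 / ||c-p||^2,
        // which is exactly sum*sum/sum2 as accumulated below.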
        DistanceType sum = 0;
        DistanceType sum2 = 0;

        for (int i=0; i<veclen_; ++i) {
            DistanceType t = c[i]-p[i];
            sum += t*(q[i]-(c[i]+p[i])/2);
            sum2 += t*t;
        }

        return sum*sum/sum2;
    }


    /**
     * Helper function that descends in the hierarchical k-means tree by splitting those clusters that minimize
     * the overall variance of the clustering.
     * Params:
     *     root = root node
     *     clusters = array with cluster centers (return value)
     *     varianceValue = variance of the clustering (return value)
     * Returns:
     */
    int getMinVarianceClusters(KMeansNodePtr root, KMeansNodePtr* clusters, int clusters_length, DistanceType& varianceValue)
    {
        int clusterCount = 1;
        clusters[0] = root;

        DistanceType meanVariance = root->variance*root->size;

        while (clusterCount<clusters_length) {
            DistanceType minVariance = (std::numeric_limits<DistanceType>::max)();
            int splitIndex = -1;

            for (int i=0; i<clusterCount; ++i) {
                if (clusters[i]->childs != NULL) {

                    DistanceType variance = meanVariance - clusters[i]->variance*clusters[i]->size;

                    for (int j=0; j<branching_; ++j) {
                        variance += clusters[i]->childs[j]->variance*clusters[i]->childs[j]->size;
                    }
                    if (variance<minVariance) {
                        minVariance = variance;
                        splitIndex = i;
                    }
                }
            }

            if (splitIndex==-1) break;
            if ( (branching_+clusterCount-1) > clusters_length) break;

            meanVariance = minVariance;

            // split node
            KMeansNodePtr toSplit = clusters[splitIndex];
            clusters[splitIndex] = toSplit->childs[0];
            for (int i=1; i<branching_; ++i) {
                clusters[clusterCount++] = toSplit->childs[i];
            }
        }

        varianceValue = meanVariance/root->size;
        return clusterCount;
    }

private:
    /** The branching factor used in the hierarchical k-means clustering */
    int branching_;

    /** Maximum number of iterations to use when performing k-means clustering */
    int iterations_;

    /** Algorithm for choosing the cluster centers */
    flann_centers_init_t centers_init_;

    /**
     * Cluster border index. This is used in the tree search phase when determining
     * the closest cluster to explore next. A zero value takes into account only
     * the cluster centres, a value greater than zero also takes into account the size
     * of the cluster.
     */
    float cb_index_;

    /**
     * The dataset used by this index
     */
    const Matrix<ElementType> dataset_;

    /** Index parameters */
    IndexParams index_params_;

    /**
     * Number of features in the dataset.
     */
    size_t size_;

    /**
     * Length of each feature.
     */
    size_t veclen_;

    /**
     * The root node in the tree.
     */
    KMeansNodePtr root_;

    /**
     *  Array of indices to vectors in the dataset.
     */
    int* indices_;

    /**
     * The distance
     */
    Distance distance_;

    /**
     * Pooled memory allocator.
     */
    PooledAllocator pool_;

    /**
     * Memory occupied by the index.
     */
    int memoryCounter_;
};

}

#endif //OPENCV_FLANN_KMEANS_INDEX_H_