Joe Verbout
/
main
opencv on mbed
Embed:
(wiki syntax)
Show/hide line numbers
lsh_table.h
00001 /*********************************************************************** 00002 * Software License Agreement (BSD License) 00003 * 00004 * Copyright 2008-2009 Marius Muja (mariusm@cs.ubc.ca). All rights reserved. 00005 * Copyright 2008-2009 David G. Lowe (lowe@cs.ubc.ca). All rights reserved. 00006 * 00007 * THE BSD LICENSE 00008 * 00009 * Redistribution and use in source and binary forms, with or without 00010 * modification, are permitted provided that the following conditions 00011 * are met: 00012 * 00013 * 1. Redistributions of source code must retain the above copyright 00014 * notice, this list of conditions and the following disclaimer. 00015 * 2. Redistributions in binary form must reproduce the above copyright 00016 * notice, this list of conditions and the following disclaimer in the 00017 * documentation and/or other materials provided with the distribution. 00018 * 00019 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 00020 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00021 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 00022 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 00023 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 00024 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00025 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00026 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 00028 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00029 *************************************************************************/ 00030 00031 /*********************************************************************** 00032 * Author: Vincent Rabaud 00033 *************************************************************************/ 00034 00035 #ifndef OPENCV_FLANN_LSH_TABLE_H_ 00036 #define OPENCV_FLANN_LSH_TABLE_H_ 00037 00038 #include <algorithm> 00039 #include <iostream> 00040 #include <iomanip> 00041 #include <limits.h> 00042 // TODO as soon as we use C++0x, use the code in USE_UNORDERED_MAP 00043 #ifdef __GXX_EXPERIMENTAL_CXX0X__ 00044 # define USE_UNORDERED_MAP 1 00045 #else 00046 # define USE_UNORDERED_MAP 0 00047 #endif 00048 #if USE_UNORDERED_MAP 00049 #include <unordered_map> 00050 #else 00051 #include <map> 00052 #endif 00053 #include <math.h> 00054 #include <stddef.h> 00055 00056 #include "dynamic_bitset.h" 00057 #include "matrix.h" 00058 00059 namespace cvflann 00060 { 00061 00062 namespace lsh 00063 { 00064 00065 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 00066 00067 /** What is stored in an LSH bucket 00068 */ 00069 typedef uint32_t FeatureIndex; 00070 /** The id from which we can get a bucket back in an LSH table 00071 */ 00072 typedef unsigned int BucketKey; 00073 00074 /** A bucket in an LSH table 00075 */ 00076 typedef std::vector<FeatureIndex> Bucket; 00077 00078 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 00079 00080 /** POD for stats about an LSH table 00081 */ 00082 struct LshStats 00083 { 00084 std::vector<unsigned int> bucket_sizes_; 00085 size_t n_buckets_; 00086 size_t bucket_size_mean_; 00087 size_t bucket_size_median_; 00088 size_t bucket_size_min_; 00089 size_t bucket_size_max_; 00090 size_t bucket_size_std_dev; 00091 /** Each contained vector contains three value: beginning/end for interval, number of elements in the bin 00092 */ 00093 std::vector<std::vector<unsigned int> > size_histogram_; 00094 }; 00095 00096 /** Overload the << operator for LshStats 00097 * @param out the streams 00098 * @param stats the stats to display 00099 * @return the streams 00100 */ 00101 inline std::ostream& operator <<(std::ostream& out, const LshStats& stats) 00102 { 00103 int w = 20; 00104 out << "Lsh Table Stats:\n" << std::setw(w) << std::setiosflags(std::ios::right) << "N buckets : " 00105 << stats.n_buckets_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "mean size : " 00106 << std::setiosflags(std::ios::left) << stats.bucket_size_mean_ << "\n" << std::setw(w) 00107 << std::setiosflags(std::ios::right) << "median size : " << stats.bucket_size_median_ << "\n" << std::setw(w) 00108 << std::setiosflags(std::ios::right) << "min size : " << std::setiosflags(std::ios::left) 00109 << stats.bucket_size_min_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "max size : " 00110 << std::setiosflags(std::ios::left) << stats.bucket_size_max_; 00111 00112 // Display the histogram 00113 out << std::endl << std::setw(w) << std::setiosflags(std::ios::right) << "histogram : " 00114 << std::setiosflags(std::ios::left); 00115 for (std::vector<std::vector<unsigned int> >::const_iterator iterator = stats.size_histogram_.begin(), end = 00116 stats.size_histogram_.end(); iterator != end; ++iterator) out << (*iterator)[0] << "-" << (*iterator)[1] << ": " << (*iterator)[2] << ", "; 00117 00118 return out; 00119 } 00120 00121 00122 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 00123 00124 /** Lsh hash table. As its key is a sub-feature, and as usually 00125 * the size of it is pretty small, we keep it as a continuous memory array. 00126 * The value is an index in the corpus of features (we keep it as an unsigned 00127 * int for pure memory reasons, it could be a size_t) 00128 */ 00129 template<typename ElementType> 00130 class LshTable 00131 { 00132 public: 00133 /** A container of all the feature indices. Optimized for space 00134 */ 00135 #if USE_UNORDERED_MAP 00136 typedef std::unordered_map<BucketKey, Bucket> BucketsSpace; 00137 #else 00138 typedef std::map<BucketKey, Bucket> BucketsSpace; 00139 #endif 00140 00141 /** A container of all the feature indices. Optimized for speed 00142 */ 00143 typedef std::vector<Bucket> BucketsSpeed; 00144 00145 /** Default constructor 00146 */ 00147 LshTable() 00148 { 00149 } 00150 00151 /** Default constructor 00152 * Create the mask and allocate the memory 00153 * @param feature_size is the size of the feature (considered as a ElementType[]) 00154 * @param key_size is the number of bits that are turned on in the feature 00155 */ 00156 LshTable(unsigned int feature_size, unsigned int key_size) 00157 { 00158 (void)feature_size; 00159 (void)key_size; 00160 std::cerr << "LSH is not implemented for that type" << std::endl; 00161 assert(0); 00162 } 00163 00164 /** Add a feature to the table 00165 * @param value the value to store for that feature 00166 * @param feature the feature itself 00167 */ 00168 void add(unsigned int value, const ElementType* feature) 00169 { 00170 // Add the value to the corresponding bucket 00171 BucketKey key = (lsh::BucketKey)getKey(feature); 00172 00173 switch (speed_level_) { 00174 case kArray: 00175 // That means we get the buckets from an array 00176 buckets_speed_[key].push_back(value); 00177 break; 00178 case kBitsetHash: 00179 // That means we can check the bitset for the presence of a key 00180 key_bitset_.set(key); 00181 buckets_space_[key].push_back(value); 00182 break; 00183 case kHash: 00184 { 00185 // That means we have to check for the hash table for the presence of a key 00186 buckets_space_[key].push_back(value); 00187 break; 00188 } 00189 } 00190 } 00191 00192 /** Add a set of features to the table 00193 * @param dataset the values to store 00194 */ 00195 void add(Matrix<ElementType> dataset) 00196 { 00197 #if USE_UNORDERED_MAP 00198 buckets_space_.rehash((buckets_space_.size() + dataset.rows) * 1.2); 00199 #endif 00200 // Add the features to the table 00201 for (unsigned int i = 0; i < dataset.rows; ++i) add(i, dataset[i]); 00202 // Now that the table is full, optimize it for speed/space 00203 optimize(); 00204 } 00205 00206 /** Get a bucket given the key 00207 * @param key 00208 * @return 00209 */ 00210 inline const Bucket* getBucketFromKey(BucketKey key) const 00211 { 00212 // Generate other buckets 00213 switch (speed_level_) { 00214 case kArray: 00215 // That means we get the buckets from an array 00216 return &buckets_speed_[key]; 00217 break; 00218 case kBitsetHash: 00219 // That means we can check the bitset for the presence of a key 00220 if (key_bitset_.test(key)) return &buckets_space_.find(key)->second; 00221 else return 0; 00222 break; 00223 case kHash: 00224 { 00225 // That means we have to check for the hash table for the presence of a key 00226 BucketsSpace::const_iterator bucket_it, bucket_end = buckets_space_.end(); 00227 bucket_it = buckets_space_.find(key); 00228 // Stop here if that bucket does not exist 00229 if (bucket_it == bucket_end) return 0; 00230 else return &bucket_it->second; 00231 break; 00232 } 00233 } 00234 return 0; 00235 } 00236 00237 /** Compute the sub-signature of a feature 00238 */ 00239 size_t getKey(const ElementType* /*feature*/) const 00240 { 00241 std::cerr << "LSH is not implemented for that type" << std::endl; 00242 assert(0); 00243 return 1; 00244 } 00245 00246 /** Get statistics about the table 00247 * @return 00248 */ 00249 LshStats getStats() const; 00250 00251 private: 00252 /** defines the speed fo the implementation 00253 * kArray uses a vector for storing data 00254 * kBitsetHash uses a hash map but checks for the validity of a key with a bitset 00255 * kHash uses a hash map only 00256 */ 00257 enum SpeedLevel 00258 { 00259 kArray, kBitsetHash, kHash 00260 }; 00261 00262 /** Initialize some variables 00263 */ 00264 void initialize(size_t key_size) 00265 { 00266 const size_t key_size_lower_bound = 1; 00267 //a value (size_t(1) << key_size) must fit the size_t type so key_size has to be strictly less than size of size_t 00268 const size_t key_size_upper_bound = std::min(sizeof(BucketKey) * CHAR_BIT + 1, sizeof(size_t) * CHAR_BIT); 00269 if (key_size < key_size_lower_bound || key_size >= key_size_upper_bound) 00270 { 00271 CV_Error(cv::Error::StsBadArg, cv::format("Invalid key_size (=%d). Valid values for your system are %d <= key_size < %d.", (int)key_size, (int)key_size_lower_bound, (int)key_size_upper_bound)); 00272 } 00273 00274 speed_level_ = kHash; 00275 key_size_ = (unsigned)key_size; 00276 } 00277 00278 /** Optimize the table for speed/space 00279 */ 00280 void optimize() 00281 { 00282 // If we are already using the fast storage, no need to do anything 00283 if (speed_level_ == kArray) return; 00284 00285 // Use an array if it will be more than half full 00286 if (buckets_space_.size() > ((size_t(1) << key_size_) / 2)) { 00287 speed_level_ = kArray; 00288 // Fill the array version of it 00289 buckets_speed_.resize(size_t(1) << key_size_); 00290 for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) buckets_speed_[key_bucket->first] = key_bucket->second; 00291 00292 // Empty the hash table 00293 buckets_space_.clear(); 00294 return; 00295 } 00296 00297 // If the bitset is going to use less than 10% of the RAM of the hash map (at least 1 size_t for the key and two 00298 // for the vector) or less than 512MB (key_size_ <= 30) 00299 if (((std::max(buckets_space_.size(), buckets_speed_.size()) * CHAR_BIT * 3 * sizeof(BucketKey)) / 10 00300 >= (size_t(1) << key_size_)) || (key_size_ <= 32)) { 00301 speed_level_ = kBitsetHash; 00302 key_bitset_.resize(size_t(1) << key_size_); 00303 key_bitset_.reset(); 00304 // Try with the BucketsSpace 00305 for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) key_bitset_.set(key_bucket->first); 00306 } 00307 else { 00308 speed_level_ = kHash; 00309 key_bitset_.clear(); 00310 } 00311 } 00312 00313 /** The vector of all the buckets if they are held for speed 00314 */ 00315 BucketsSpeed buckets_speed_; 00316 00317 /** The hash table of all the buckets in case we cannot use the speed version 00318 */ 00319 BucketsSpace buckets_space_; 00320 00321 /** What is used to store the data */ 00322 SpeedLevel speed_level_; 00323 00324 /** If the subkey is small enough, it will keep track of which subkeys are set through that bitset 00325 * That is just a speedup so that we don't look in the hash table (which can be mush slower that checking a bitset) 00326 */ 00327 DynamicBitset key_bitset_; 00328 00329 /** The size of the sub-signature in bits 00330 */ 00331 unsigned int key_size_; 00332 00333 // Members only used for the unsigned char specialization 00334 /** The mask to apply to a feature to get the hash key 00335 * Only used in the unsigned char case 00336 */ 00337 std::vector<size_t> mask_; 00338 }; 00339 00340 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 00341 // Specialization for unsigned char 00342 00343 template<> 00344 inline LshTable<unsigned char>::LshTable(unsigned int feature_size, unsigned int subsignature_size) 00345 { 00346 initialize(subsignature_size); 00347 // Allocate the mask 00348 mask_ = std::vector<size_t>((size_t)ceil((float)(feature_size * sizeof(char)) / (float)sizeof(size_t)), 0); 00349 00350 // A bit brutal but fast to code 00351 std::vector<size_t> indices(feature_size * CHAR_BIT); 00352 for (size_t i = 0; i < feature_size * CHAR_BIT; ++i) indices[i] = i; 00353 std::random_shuffle(indices.begin(), indices.end()); 00354 00355 // Generate a random set of order of subsignature_size_ bits 00356 for (unsigned int i = 0; i < key_size_; ++i) { 00357 size_t index = indices[i]; 00358 00359 // Set that bit in the mask 00360 size_t divisor = CHAR_BIT * sizeof(size_t); 00361 size_t idx = index / divisor; //pick the right size_t index 00362 mask_[idx] |= size_t(1) << (index % divisor); //use modulo to find the bit offset 00363 } 00364 00365 // Set to 1 if you want to display the mask for debug 00366 #if 0 00367 { 00368 size_t bcount = 0; 00369 BOOST_FOREACH(size_t mask_block, mask_){ 00370 out << std::setw(sizeof(size_t) * CHAR_BIT / 4) << std::setfill('0') << std::hex << mask_block 00371 << std::endl; 00372 bcount += __builtin_popcountll(mask_block); 00373 } 00374 out << "bit count : " << std::dec << bcount << std::endl; 00375 out << "mask size : " << mask_.size() << std::endl; 00376 return out; 00377 } 00378 #endif 00379 } 00380 00381 /** Return the Subsignature of a feature 00382 * @param feature the feature to analyze 00383 */ 00384 template<> 00385 inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) const 00386 { 00387 // no need to check if T is dividable by sizeof(size_t) like in the Hamming 00388 // distance computation as we have a mask 00389 const size_t* feature_block_ptr = reinterpret_cast<const size_t*> ((const void*)feature); 00390 00391 // Figure out the subsignature of the feature 00392 // Given the feature ABCDEF, and the mask 001011, the output will be 00393 // 000CEF 00394 size_t subsignature = 0; 00395 size_t bit_index = 1; 00396 00397 for (std::vector<size_t>::const_iterator pmask_block = mask_.begin(); pmask_block != mask_.end(); ++pmask_block) { 00398 // get the mask and signature blocks 00399 size_t feature_block = *feature_block_ptr; 00400 size_t mask_block = *pmask_block; 00401 while (mask_block) { 00402 // Get the lowest set bit in the mask block 00403 size_t lowest_bit = mask_block & (-(ptrdiff_t)mask_block); 00404 // Add it to the current subsignature if necessary 00405 subsignature += (feature_block & lowest_bit) ? bit_index : 0; 00406 // Reset the bit in the mask block 00407 mask_block ^= lowest_bit; 00408 // increment the bit index for the subsignature 00409 bit_index <<= 1; 00410 } 00411 // Check the next feature block 00412 ++feature_block_ptr; 00413 } 00414 return subsignature; 00415 } 00416 00417 template<> 00418 inline LshStats LshTable<unsigned char>::getStats() const 00419 { 00420 LshStats stats; 00421 stats.bucket_size_mean_ = 0; 00422 if ((buckets_speed_.empty()) && (buckets_space_.empty())) { 00423 stats.n_buckets_ = 0; 00424 stats.bucket_size_median_ = 0; 00425 stats.bucket_size_min_ = 0; 00426 stats.bucket_size_max_ = 0; 00427 return stats; 00428 } 00429 00430 if (!buckets_speed_.empty()) { 00431 for (BucketsSpeed::const_iterator pbucket = buckets_speed_.begin(); pbucket != buckets_speed_.end(); ++pbucket) { 00432 stats.bucket_sizes_.push_back((lsh::FeatureIndex)pbucket->size()); 00433 stats.bucket_size_mean_ += pbucket->size(); 00434 } 00435 stats.bucket_size_mean_ /= buckets_speed_.size(); 00436 stats.n_buckets_ = buckets_speed_.size(); 00437 } 00438 else { 00439 for (BucketsSpace::const_iterator x = buckets_space_.begin(); x != buckets_space_.end(); ++x) { 00440 stats.bucket_sizes_.push_back((lsh::FeatureIndex)x->second.size()); 00441 stats.bucket_size_mean_ += x->second.size(); 00442 } 00443 stats.bucket_size_mean_ /= buckets_space_.size(); 00444 stats.n_buckets_ = buckets_space_.size(); 00445 } 00446 00447 std::sort(stats.bucket_sizes_.begin(), stats.bucket_sizes_.end()); 00448 00449 // BOOST_FOREACH(int size, stats.bucket_sizes_) 00450 // std::cout << size << " "; 00451 // std::cout << std::endl; 00452 stats.bucket_size_median_ = stats.bucket_sizes_[stats.bucket_sizes_.size() / 2]; 00453 stats.bucket_size_min_ = stats.bucket_sizes_.front(); 00454 stats.bucket_size_max_ = stats.bucket_sizes_.back(); 00455 00456 // TODO compute mean and std 00457 /*float mean, stddev; 00458 stats.bucket_size_mean_ = mean; 00459 stats.bucket_size_std_dev = stddev;*/ 00460 00461 // Include a histogram of the buckets 00462 unsigned int bin_start = 0; 00463 unsigned int bin_end = 20; 00464 bool is_new_bin = true; 00465 for (std::vector<unsigned int>::iterator iterator = stats.bucket_sizes_.begin(), end = stats.bucket_sizes_.end(); iterator 00466 != end; ) 00467 if (*iterator < bin_end) { 00468 if (is_new_bin) { 00469 stats.size_histogram_.push_back(std::vector<unsigned int>(3, 0)); 00470 stats.size_histogram_.back()[0] = bin_start; 00471 stats.size_histogram_.back()[1] = bin_end - 1; 00472 is_new_bin = false; 00473 } 00474 ++stats.size_histogram_.back()[2]; 00475 ++iterator; 00476 } 00477 else { 00478 bin_start += 20; 00479 bin_end += 20; 00480 is_new_bin = true; 00481 } 00482 00483 return stats; 00484 } 00485 00486 // End the two namespaces 00487 } 00488 } 00489 00490 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 00491 00492 #endif /* OPENCV_FLANN_LSH_TABLE_H_ */ 00493
Generated on Tue Jul 12 2022 16:42:38 by 1.7.2