// include/bloom_filter.hh // This file is part of libpbe; see http://svn.chezphil.org/libpbe/ // (C) 2008 Philip Endecott // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. #ifndef bloom_filter_hh #define bloom_filter_hh #include // This class implements an approximation to set using a Bloom Filter. // It's approximate in the senses that: // - It sometimes returns false positives. (It never returns false negatives.) // - It provides only a subset of the features of a real set, in particular it does // not support iteration or erasure of elements. Only insertion and testing are // provided. // // Insertion and testing are both O(1) in the number of keys stored. // // See e.g. the Wikipedia Bloom Filter page for the theory. // // There are a few template parameters that you need to set: // Choose n_bits based on the acceptable probability of false positives (f) and the // number of stored keys (n_keys) to satisfy: // f = 0.618 ^ (n_bits/n_keys) // => n_bits = n_keys ln(f)/-0.48 // It's best to round n_bits to a power of 2; if you don't, mod is needed // at run-time. In any case it must be a multiple of 32. // Space is O(n_bits). // // Choose n_hashes using // n_hashes = 0.7 n_bits / n_keys // Insertion and testing are O(n_hashes). Using fewer hashes will make these operations // faster at the expense of more false positives; this can be offset by increasing // n_bits i.e. trading off time vs. space. Using more hashes will NOT reduce the rate of // false positivies. // // (Hmm, I now wonder if the better approach is to choose n_hashes based on f, and then // to choose n_bits.) // // So for example, to store the 56840 words in /usr/share/dict/words with a probability // of a false positive of 1% (i.e. 1% of your mis-spellings are identified as correct), // we get n_bits = 545329 which rounds down to 512*1024 or up to 1024*1024. n_hashes // is 9 or 18 respectively. // // These values are theoretical optimums; in practice you may need to experiment a bit // to find good values if you're fussy about the space required and the false positive rate. // // You need to supply a hash function. It takes the key as an argument and returns a tr1::array // of n_hashes hash values, which much each have at least log2(n_bits) bits. // Possible sources of hash functions are: // - The standard library includes them, returning size_t, for integer types and std::string. // - Boost.CRC. // // In cases where you want to do the same insert or lookup on multiple filters, it makes // sense to do the hash calculation only once. To allow this the insert and check methods // are overloaded with versions that take hash results. template < typename KEY_TYPE, int n_bits, int n_hashes, typename HASH > class bloom_filter { public: typedef KEY_TYPE key_type; typedef std::tr1::array< unsigned int, n_hashes > hash_return_type; bloom_filter() { bits_t b_ = {{0}}; bits = b_; } void insert(const hash_return_type& h) { for (int k=0; k bits_t; bits_t bits; void set_bit(int n) { bits[n>>5] |= 1<<(n&31); } bool test_bit(int n) const { return bits[n>>5] & (1<<(n&31)); } }; #endif