Main Page | Modules | Alphabetical List | Data Structures | File List | Data Fields | Globals | Related Pages

hashlib.c

Go to the documentation of this file.
00001 
00016 /* {{{ Initial headers */
00017 /*
00018  * $LastChangedDate: 2003-11-27 01:55:17 +0100 (Thu, 27 Nov 2003) $
00019  * $LastChangedRevision: 5 $
00020  * $LastChangedBy: ckruse $
00021  *
00022  */
00023 /* }}} */
00024 
00025 /* {{{ Includes */
00026 #include "config.h"
00027 #include "defines.h"
00028 
00029 #include <stdio.h>
00030 #include <stdlib.h>
00031 #include <string.h>
00032 #include <sys/types.h>
00033 #include <stddef.h>
00034 
00035 #include "hashlib.h"
00036 /* }}} */
00037 
00038 #ifndef DOXYGEN
00039 /******************************* HASHING ALGORITHM, by Bob Jenkins *******************************/
00040 
00041 /*
00042  * This code has been developed by Bob Jenkins.
00043  */
00044 
00045 /*
00046  * mix -- mix 3 32-bit values reversibly.
00047  * For every delta with one or two bit set, and the deltas of all three
00048  * high bits or all three low bits, whether the original value of a,b,c
00049  * is almost all zero or is uniformly distributed,
00050  *
00051  * - If mix() is run forward or backward, at least 32 bits in a,b,c
00052  *   have at least 1/4 probability of changing.
00053  * - If mix() is run forward, every bit of c will change between 1/3 and
00054  *   2/3 of the time.  (Well, 22/100 and 78/100 for some 2-bit deltas.)
00055  *
00056  * mix() was built out of 36 single-cycle latency instructions in a 
00057  * structure that could supported 2x parallelism, like so:
00058  *     a -= b; 
00059  *     a -= c; x = (c>>13);
00060  *     b -= c; a ^= x;
00061  *     b -= a; x = (a<<8);
00062  *     c -= a; b ^= x;
00063  *     c -= b; x = (b>>13);
00064  *     ...
00065  * Unfortunately, superscalar Pentiums and Sparcs can't take advantage 
00066  * of that parallelism.  They've also turned some of those single-cycle
00067  * latency instructions into multi-cycle latency instructions.  Still,
00068  * this is the fastest good hash I could find.  There were about 2^^68
00069  * to choose from.  I only looked at a billion or so.
00070  */
00071 #define mix(a,b,c) \
00072 { \
00073   a -= b; a -= c; a ^= (c>>13); \
00074   b -= c; b -= a; b ^= (a<<8); \
00075   c -= a; c -= b; c ^= (b>>13); \
00076   a -= b; a -= c; a ^= (c>>12);  \
00077   b -= c; b -= a; b ^= (a<<16); \
00078   c -= a; c -= b; c ^= (b>>5); \
00079   a -= b; a -= c; a ^= (c>>3);  \
00080   b -= c; b -= a; b ^= (a<<10); \
00081   c -= a; c -= b; c ^= (b>>15); \
00082 }
00083 
00084 #endif
00085 
00086 /*
00087  * lookup() -- hash a variable-length key into a 32-bit value
00088  *
00089  * k     : the key (the unaligned variable-length array of bytes)
00090  * len   : the length of the key, counting by bytes
00091  * level : can be any 4-byte value
00092  *
00093  * Returns a 32-bit value.  Every bit of the key affects every bit of
00094  * the return value.  Every 1-bit and 2-bit delta achieves avalanche.
00095  * About 6len+35 instructions.
00096 
00097  * The best hashtable sizes are powers of 2.  There is no need to do
00098  * mod a prime (mod is sooo slow!).  If you need less than 32 bits,
00099  * use a bitmask.  For example, if you need only 10 bits, do
00100  *
00101  * h = (h & hashmask(10));
00102  *
00103  * In which case, the hash table should have hashsize(10) elements.
00104  *
00105  * If you are hashing n strings (ub1 **)k, do it like this:
00106  *  for (i=0, h=0; i<n; ++i) h = lookup( k[i], len[i], h);
00107  *
00108  * By Bob Jenkins, 1996.  bob_jenkins@burtleburtle.net.
00109  *
00110  * See http://burtleburtle.net/bob/hash/evahash.html
00111  * Use for hash table lookup, or anything where one collision in 2^32 is
00112  * acceptable.  Do NOT use for cryptographic purposes.
00113  */
00114 ub4 lookup(register ub1 *k,register ub4 length,register ub4 level) {
00115   register ub4 a,b,c,len;
00116 
00117   /* Set up the internal state */
00118   len = length;
00119   a = b = 0x9e3779b9;  /* the golden ratio; an arbitrary value */
00120   c = level;           /* the previous hash value */
00121 
00122   /*---------------------------------------- handle most of the key */
00123   while (len >= 12) {
00124     a += (k[0] +((ub4)k[1]<<8) +((ub4)k[2]<<16) +((ub4)k[3]<<24));
00125     b += (k[4] +((ub4)k[5]<<8) +((ub4)k[6]<<16) +((ub4)k[7]<<24));
00126     c += (k[8] +((ub4)k[9]<<8) +((ub4)k[10]<<16)+((ub4)k[11]<<24));
00127     mix(a,b,c);
00128     k += 12; len -= 12;
00129   }
00130 
00131   /*------------------------------------- handle the last 11 bytes */
00132   c += length;
00133   switch(len) {            /* all the case statements fall through */
00134     case 11: c+=((ub4)k[10]<<24);
00135     case 10: c+=((ub4)k[9]<<16);
00136     case 9 : c+=((ub4)k[8]<<8);
00137       /* the first byte of c is reserved for the length */
00138     case 8 : b+=((ub4)k[7]<<24);
00139     case 7 : b+=((ub4)k[6]<<16);
00140     case 6 : b+=((ub4)k[5]<<8);
00141     case 5 : b+=k[4];
00142     case 4 : a+=((ub4)k[3]<<24);
00143     case 3 : a+=((ub4)k[2]<<16);
00144     case 2 : a+=((ub4)k[1]<<8);
00145     case 1 : a+=k[0];
00146       /* case 0: nothing left to add */
00147   }
00148   mix(a,b,c);
00149   /*-------------------------------------------- report the result */
00150   return c;
00151 }
00152 
00153 /******************************* HASHING ALGORITHM END *************************************/
00154 
00155 /*
00156  * at this point the hashlib code begins ((c) by CK)
00157  */
00158 
00159 /*
00160  * Returns:                the new hashtable object
00161  * Parameters:
00162  *   - t_cf_hash_cleanup   a destructor function for the entry data
00163  *
00164  * This function constructs a new hash. It expects a pointer to a
00165  * destructor function. This function will be called when a hash
00166  * entry will be destroyed. This can happen if you call
00167  * 'cf_hash_delete' or if you call 'cf_hash_destroy'. You MUST NOT
00168  * free the data object itself! This will be done internaly. You
00169  * only should cleanup the structure if it is a complex data structure
00170  * like a struct or an array of arrays.
00171  *
00172  * If the function pointer is NULL, no cleanup function will be called.
00173  *
00174  */
00175 t_cf_hash *cf_hash_new(t_cf_hash_cleanup cl) {
00176   t_cf_hash *hsh = malloc(sizeof(t_cf_hash));
00177   ub4 elems = hashsize(CF_HASH_SIZE);
00178 
00179   if(!hsh) return NULL;
00180 
00181   /*
00182    * the tablesize is 2^10 (1024 entries) since we don't blame on double
00183    * values. This should be moderate for normal use. If not, increase
00184    * this value in the header file. The higher the tablesize, the lower the
00185    * possibility of a double hash value. But caution! This value will be powered
00186    * with the base of 2. So if you increase it to 11, you get a size of 2048.
00187    * If you increase it to 15, you get a size of 32768 and so on. We use 2 as
00188    * the base, because this hashing function works fastest with with exponents
00189    * of two. Modulo seems to be very slow...
00190    */
00191   hsh->tablesize = CF_HASH_SIZE;
00192 
00193   /*
00194    * this *sucks*. It costs a lot of time, but we need it because
00195    * we have to know if an element is set or not
00196    */
00197   hsh->table     = calloc(elems,sizeof(t_cf_hash));
00198 
00199   hsh->destroy   = cl;
00200 
00201   return hsh;
00202 }
00203 
00204 #ifndef DOXYGEN
00205 /*
00206  * Returns:                the data of the hash entry if found or NULL
00207  * Parameters:
00208  *   - t_cf_hash *hsh      the hash object
00209  *   - unsigned char *key  the key
00210  *   - size_t keylen       the length of the key
00211  *
00212  * This function looks up a hash entry in a hash table. If an entry with
00213  * the key has not been found NULL is returned.
00214  *
00215  * This function is private!
00216  */
00217 t_cf_hash_entry *_cf_hash_save(unsigned char *key,size_t keylen,void *data,size_t datalen,ub4 hashval) {
00218   t_cf_hash_entry *ent = malloc(sizeof(t_cf_hash_entry));
00219 
00220   if(!ent) return NULL;
00221 
00222   /*
00223    * we save the complete hashval because with this value
00224    * we can examine faster if a key exists in this hash
00225    * table
00226    */
00227   ent->hashval     = hashval;
00228 
00229   /*
00230    * the key of the entry has to be a 0 terminated unsigned char *
00231    */
00232   ent->key         = malloc((keylen + 1) * sizeof(unsigned char));
00233   if(!ent->key) return NULL;
00234 
00235   memcpy(ent->key,key,keylen);
00236   ent->key[keylen] = 0;
00237   ent->keylen      = keylen;
00238 
00239   /*
00240    * save the data
00241    */
00242   if(datalen) {
00243     ent->data        = malloc(datalen);
00244     ent->stat        = 0;
00245     if(!ent->data) return NULL;
00246 
00247     memcpy(ent->data,data,datalen);
00248   }
00249   else {
00250     ent->stat = 1;
00251     ent->data = data;
00252   }
00253 
00254   ent->prev        = NULL;
00255   ent->next        = NULL;
00256 
00257   return ent;
00258 }
00259 
00260 /*
00261  * Returns:                nothing
00262  * Parameters:
00263  *   - t_cf_hash *hsh      the hash object
00264  *   - unsigned char *key  the key
00265  *   - size_t keylen       the length of the key
00266  *   - void *data          the data object to save
00267  *   - size_t datalen      the size of the data object to save
00268  *   - ub4 hval            the hash value of the key
00269  *
00270  * This function reallocates and restructures a hashtable. Very
00271  * expensive, but not called very often.
00272  *
00273  * This function is private!
00274  *
00275  */
00276 void _cf_hash_split(t_cf_hash *hsh,unsigned char *key,size_t keylen,void *data,size_t datalen,ub4 hval) {
00277   ub4 elems,oelems,i,hval_short;
00278   t_cf_hash_entry *elem,*elem1;
00279 
00280   oelems = hashsize(hsh->tablesize);
00281   elems  = hashsize(++(hsh->tablesize));
00282 
00283   /*
00284    * first, we have to reallocate the hashtable. This is expensive enough...
00285    */
00286   hsh->table = realloc(hsh->table,elems * sizeof(t_cf_hash_entry **));
00287 
00288   /*
00289    * But this is not enough. We also have to set the second
00290    * half to NULL *grr* this is *very* expensive.
00291    */
00292   memset(&hsh->table[oelems],0,oelems * sizeof(t_cf_hash_entry **));
00293 
00294   /*
00295    * But as if this is not enough, we have to re-structure the complete
00296    * hashtable...
00297    */
00298   for(i=0;i<oelems;i++) {
00299     /*
00300      * empty elements are uninteresting
00301      */
00302     if(!hsh->table[i]) continue;
00303 
00304     /*
00305      * We need to check each element in an hash entry...
00306      */
00307     for(elem=hsh->table[i];elem;elem=elem1) {
00308       elem1 = elem->next;
00309 
00310       /*
00311        * Is the element at the right position? If yes, we can
00312        * continue with the next element.
00313        */
00314       if((elem->hashval & (elems - 1)) != i) {
00315         /*
00316          * Is this element the first entry? If no, we have to
00317          * knock out the actual element and if yes, we have to
00318          * set it to the next element (may even be NULL (aka end of list))
00319          */
00320         if(elem->prev) {
00321           elem->prev->next = elem->next;
00322         }
00323         else {
00324           hsh->table[i] = elem->next;
00325         }
00326 
00327         /*
00328          * has this element a next value? If yes, we have
00329          * to set the prev-pointer of the next element to
00330          * the prev-pointer of the actual element
00331          */
00332         if(elem->next) {
00333           elem->next->prev = elem->prev;
00334         }
00335 
00336         /*
00337          * And now: prepend the actual hash entry
00338          */
00339         elem->next = hsh->table[elem->hashval & (elems - 1)];
00340         elem->prev = NULL;
00341         hsh->table[elem->hashval & (elems - 1)] = elem;
00342 
00343         if(elem->next) elem->next->prev = elem;
00344       }
00345     }
00346   }
00347 
00348   /*
00349    * last, save the given element to the hash table. This time, we
00350    * don't check the CF_HASH_MAX_DOUBLES, because it would take a
00351    * *lot* of time to restructure the hash table
00352    */
00353   hval_short = hval & (elems - 1);
00354   if(hsh->table[hval_short]) {
00355     for(elem1=hsh->table[hval_short];elem1->next;elem1=elem1->next);
00356     elem1->next        = _cf_hash_save(key,keylen,data,datalen,hval);
00357     elem1->next->prev  = elem1;
00358   }
00359   else {
00360     hsh->table[hval_short] = _cf_hash_save(key,keylen,data,datalen,hval);
00361   }
00362 
00363 }
00364 #endif
00365 
00366 /*
00367  * Returns:                0 if unsuccessful, 1 if successful
00368  * Parameters:
00369  *   - t_cf_hash *hsh      the hash object
00370  *   - unsigned char *key  the key
00371  *   - size_t keylen       the length of the key
00372  *   - void *data          the data
00373  *   - size_t datalen      the size of the data object
00374  *
00375  * This function saves a hash entry with the given key in the hash
00376  * table. It can happen that the table has to be resized, but this
00377  * should not happen very often (there's one double hashvalue in a
00378  * 32 bit keylen and we accept 5 double hashvalues per entry). But
00379  * *when* it happens, this call is very expensive...
00380  *
00381  */
00382 int cf_hash_set(t_cf_hash *hsh,unsigned char *key,size_t keylen,void *data,size_t datalen) {
00383   ub4 hval,hval_short;
00384   t_cf_hash_entry *ent,*prev;
00385   int doubles;
00386 
00387   /*
00388    * generate the hash value
00389    */
00390   hval       = lookup(key,keylen,0);
00391 
00392   /*
00393    * because we need no 32 bit hash values (a hashtable of
00394    * a size of 32 bit is definitely a *very* to big!)
00395    */
00396   hval_short = hval & hashmask(hsh->tablesize);
00397 
00398   ent = hsh->table[hval_short];
00399 
00400   if(ent) {
00401     for(doubles=0,prev=NULL;ent;doubles++,ent=ent->next) {
00402       /*
00403        * We got a double value, so we have to free the actual value
00404        */
00405       if(ent->hashval == hval && strcmp(ent->key,key) == 0) {
00406         if(ent->stat == 0) {
00407           if(hsh->destroy) hsh->destroy(ent->data);
00408           free(ent->data);
00409         }
00410 
00411         ent->data = malloc(datalen);
00412         if(!ent->data) return 0;
00413         memcpy(ent->data,data,datalen);
00414 
00415         return 1;
00416       }
00417 
00418       prev = ent;
00419     }
00420 
00421     /*
00422      * phew... I *really* hope this case happens not very often.
00423      */
00424     if(doubles > CF_HASH_MAX_DOUBLES) {
00425       _cf_hash_split(hsh,key,keylen,data,datalen,hval);
00426     }
00427     else {
00428       prev->next       = _cf_hash_save(key,keylen,data,datalen,hval);
00429       prev->next->prev = prev;
00430     }
00431 
00432     return 1;
00433   }
00434   else {
00435     hsh->table[hval_short] = _cf_hash_save(key,keylen,data,datalen,hval);
00436     return 1;
00437   }
00438 
00439   return 0;
00440 }
00441 
00442 /*
00443  * Returns:                0 if unsuccessful, 1 if successful
00444  * Parameters:
00445  *   - t_cf_hash *hsh      the hash object
00446  *   - unsigned char *key  the key
00447  *   - size_t keylen       the length of the key
00448  *   - void *data          the data
00449  *
00450  * This function saves a hash entry with the given key in the hash
00451  * table. It can happen that the table has to be resized, but this
00452  * should not happen very often (there's one double hashvalue in a
00453  * 32 bit keylen and we accept 5 double hashvalues per entry). But
00454  * *when* it happens, this call is very expensive...
00455  *
00456  */
00457 int cf_hash_set_static(t_cf_hash *hsh,unsigned char *key,size_t keylen,void *data) {
00458   ub4 hval,hval_short;
00459   t_cf_hash_entry *ent,*prev;
00460   int doubles;
00461 
00462   /*
00463    * generate the hash value
00464    */
00465   hval       = lookup(key,keylen,0);
00466 
00467   /*
00468    * because we need no 32 bit hash values (a hashtable of
00469    * a size of 32 bit is definitely a *very* to big!)
00470    */
00471   hval_short = hval & hashmask(hsh->tablesize);
00472 
00473   ent = hsh->table[hval_short];
00474 
00475   if(ent) {
00476     for(doubles=0,prev=NULL;ent;doubles++,ent=ent->next) {
00477       /*
00478        * We got a double value, so we have to free the actual value
00479        */
00480       if(ent->hashval == hval && strcmp(ent->key,key) == 0) {
00481         if(ent->stat == 0) {
00482           if(hsh->destroy) hsh->destroy(ent->data);
00483           free(ent->data);
00484         }
00485 
00486         ent->data = data;
00487         ent->stat = 1;
00488 
00489         return 1;
00490       }
00491 
00492       prev = ent;
00493     }
00494 
00495     /*
00496      * phew... I *really* hope this case happens not very often.
00497      */
00498     if(doubles > CF_HASH_MAX_DOUBLES) {
00499       _cf_hash_split(hsh,key,keylen,data,0,hval);
00500     }
00501     else {
00502       prev->next       = _cf_hash_save(key,keylen,data,0,hval);
00503       prev->next->prev = prev;
00504     }
00505 
00506     return 1;
00507   }
00508   else {
00509     hsh->table[hval_short] = _cf_hash_save(key,keylen,data,0,hval);
00510     return 1;
00511   }
00512 
00513   return 0;
00514 }
00515 
00516 /*
00517  * Returns:                the data of the hash entry if found or NULL
00518  * Parameters:
00519  *   - t_cf_hash *hsh      the hash object
00520  *   - unsigned char *key  the key
00521  *   - size_t keylen       the length of the key
00522  *
00523  * This function looks up a hash entry in a hash table. If an entry with
00524  * the key has not been found NULL is returned.
00525  *
00526  */
00527 void *cf_hash_get(t_cf_hash *hsh,unsigned char *key,size_t keylen) {
00528   ub4 hval,hval_short;
00529   t_cf_hash_entry *ent;
00530 
00531   hval       = lookup(key,keylen,0);
00532   hval_short = hval & hashmask(hsh->tablesize);
00533 
00534   if(hsh->table[hval_short]) {
00535     for(ent = hsh->table[hval_short];ent && (ent->hashval != hval || strcmp(ent->key,key) != 0);ent=ent->next);
00536 
00537     if(ent) return ent->data;
00538   }
00539 
00540   return NULL;
00541 }
00542 
00543 /*
00544  * Returns:                0 or 1
00545  * Parameters:
00546  *   - t_cf_hash *hsh      the hash object
00547  *   - unsigned char *key  the key
00548  *   - size_t keylen       the length of the key
00549  *
00550  * This function deletes a hash entry in a hashtable. Returns
00551  * 0 if an entry with this key could not be found, returns 1 if
00552  * the hash entry has successfully been deleted
00553  *
00554  */
00555 int cf_hash_entry_delete(t_cf_hash *hsh,unsigned char *key,size_t keylen) {
00556   ub4 hval,hval_short;
00557   t_cf_hash_entry *ent;
00558 
00559   hval       = lookup(key,keylen,0);
00560   hval_short = hval & hashmask(hsh->tablesize);
00561 
00562   if(hsh->table[hval_short]) {
00563     for(ent = hsh->table[hval_short];ent && (ent->hashval != hval || strcmp(ent->key,key) != 0);ent=ent->next);
00564 
00565     if(ent) {
00566       if(ent->stat == 0) {
00567         if(hsh->destroy) hsh->destroy(ent->data);
00568         free(ent->data);
00569       }
00570       free(ent->key);
00571 
00572       if(ent->prev) {
00573         ent->prev->next = ent->next;
00574       }
00575       else {
00576         hsh->table[hval_short] = ent->next;
00577       }
00578       if(ent->next) {
00579         ent->next->prev = ent->prev;
00580       }
00581 
00582       free(ent);
00583 
00584       return 1;
00585     }
00586   }
00587 
00588   return 0;
00589 }
00590 
00591 /*
00592  * Returns:                nothing
00593  * Parameters:
00594  *   - t_cf_hash *hsh      the hash object
00595  *
00596  * This function destroys a hash and frees all of its values.
00597  * After this function call you have to create a new hash with
00598  * cf_hash_new()!
00599  *
00600  */
00601 void cf_hash_destroy(t_cf_hash *hsh) {
00602   ub4 elems = hashsize(hsh->tablesize);
00603   ub4 i;
00604   t_cf_hash_entry *ent,*ent1;
00605 
00606   for(i=0;i<elems;i++) {
00607     if(hsh->table[i]) {
00608       for(ent = hsh->table[i];ent;ent=ent1) {
00609         if(ent->stat == 0) {
00610           if(hsh->destroy) hsh->destroy(ent->data);
00611           free(ent->data);
00612         }
00613 
00614         ent1 = ent->next;
00615 
00616         free(ent->key);
00617         free(ent);
00618       }
00619     }
00620   }
00621 
00622   free(hsh->table);
00623   free(hsh);
00624 }
00625 
00626 /*
00627  * this is a small example
00628  *
00629  *
00630 
00631 int main(void) {
00632   t_cf_hash *hsh = cf_hash_new(NULL);
00633   int i = 0,len;
00634   unsigned char buff[50];
00635 
00636   for(i=0;i<34000;i++) {
00637     len = snprintf(buff,50,"key%d",i);
00638     cf_hash_set(hsh,buff,len,buff,len+1);
00639   }
00640 
00641   cf_hash_entry_delete(hsh,"key50",5);
00642   cf_hash_entry_delete(hsh,"key51",5);
00643 
00644   for(i=0;i<34000;i++) {
00645     len = snprintf(buff,50,"key%d",i);
00646     cf_hash_get(hsh,buff,len);
00647   }
00648 
00649   cf_hash_destroy(hsh);
00650 
00651   return 0;
00652 }
00653 */
00654 
00655 /* eof */

Generated on Sun Apr 25 16:37:39 2004 for Classic Forum by doxygen 1.3.5