diff options
author | Andy Wingo <wingo@pobox.com> | 2011-01-05 18:43:28 -0800 |
---|---|---|
committer | Andy Wingo <wingo@pobox.com> | 2011-01-07 09:18:36 -0800 |
commit | 622415380cb4b24d1d12641781f705f99b9e720e (patch) | |
tree | a9e461af606baa7af82209690733a2be1bec7038 | |
parent | f756cd30764c76d270cc34969d7156ebcfcfb214 (diff) |
add hash functions for locale, latin1, and utf8 strings
* libguile/hash.c (scm_i_locale_string_hash)
(scm_i_latin1_string_hash, scm_i_utf8_string_hash): New functions.
-rw-r--r-- | libguile/hash.c | 114 | ||||
-rw-r--r-- | libguile/hash.h | 9 |
2 files changed, 122 insertions, 1 deletions
diff --git a/libguile/hash.c b/libguile/hash.c
index e1591d298..0dcd1c29e 100644
--- a/libguile/hash.c
+++ b/libguile/hash.c
@@ -22,6 +22,12 @@
 # include <config.h>
 #endif
 
+#ifdef HAVE_WCHAR_H
+#include <wchar.h>
+#endif
+
+#include <unistr.h>
+
 #include "libguile/_scm.h"
 #include "libguile/chars.h"
 #include "libguile/ports.h"
@@ -64,6 +70,114 @@ scm_i_string_hash (SCM str)
   return h;
 }
 
+/* Hash the LEN bytes at STR, decoded as multibyte characters in the
+   current locale, folding each character C into H as `C + H * 37'.
+   If LEN is (size_t) -1, STR must be NUL-terminated and its length is
+   taken with strlen.  Hashing stops early at a decoded NUL character.
+   If the bytes cannot be decoded, or <wchar.h> is unavailable, return
+   the hash of the Scheme string made by scm_from_locale_stringn.
+
+   NOTE(review): valid input now returns the value computed here rather
+   than the fallback's; presumably the two must agree, which needs
+   wchar_t values to be the code points scm_i_string_hash sees --
+   confirm.  */
+unsigned long
+scm_i_locale_string_hash (const char *str, size_t len)
+{
+#ifdef HAVE_WCHAR_H
+  mbstate_t state;
+  wchar_t c;
+  size_t byte_idx = 0, nbytes;
+  unsigned long h = 0;
+
+  if (len == (size_t) -1)
+    len = strlen (str);
+
+  /* mbrtowc reads the conversion state, so it has to start out as the
+     initial state; an uninitialized mbstate_t is undefined behavior.  */
+  memset (&state, 0, sizeof (state));
+
+  /* Bound the loop by the byte count.  Asked to decode zero bytes,
+     mbrtowc returns (size_t) -2, which the check below treats as bad
+     input, so a fully consumed string must not reach it.  */
+  while (byte_idx < len)
+    {
+      nbytes = mbrtowc (&c, str + byte_idx, len - byte_idx, &state);
+      if (nbytes == 0)
+        /* Decoded a NUL character; stop here.  */
+        break;
+      else if (nbytes >= (size_t) -2)
+        /* Invalid or truncated input string; punt.  */
+        return scm_i_string_hash (scm_from_locale_stringn (str, len));
+
+      h = (unsigned long) c + h * 37;
+      byte_idx += nbytes;
+    }
+
+  return h;
+#else
+  return scm_i_string_hash (scm_from_locale_stringn (str, len));
+#endif
+}
+
+/* Hash the LEN bytes at STR, taking each byte as one character code
+   (Latin-1) and folding each byte B into H as `B + H * 37'.  If LEN is
+   (size_t) -1, STR must be NUL-terminated and its length is taken with
+   strlen.  */
+unsigned long
+scm_i_latin1_string_hash (const char *str, size_t len)
+{
+  const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
+  size_t i = 0;
+  unsigned long h = 0;
+
+  if (len == (size_t) -1)
+    len = strlen (str);
+
+  for (; i < len; i++)
+    h = (unsigned long) ustr[i] + h * 37;
+
+  return h;
+}
+
+/* Hash the LEN bytes at STR, decoded as UTF-8, folding each code point
+   C into H as `C + H * 37'.  If LEN is (size_t) -1, STR must be
+   NUL-terminated and its length is taken with strlen.  If the bytes
+   are not well-formed UTF-8, return the hash of the Scheme string made
+   by scm_from_utf8_stringn instead.  */
+unsigned long
+scm_i_utf8_string_hash (const char *str, size_t len)
+{
+  const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
+  size_t byte_idx = 0;
+  unsigned long h = 0;
+
+  if (len == (size_t) -1)
+    len = strlen (str);
+
+  while (byte_idx < len)
+    {
+      ucs4_t c;
+      int nbytes;
+
+      /* Use u8_mbtoucr, not u8_mbtouc: only the former reports invalid
+         (-1) or incomplete (-2) sequences through its return value; the
+         latter substitutes U+FFFD and returns a positive length, which
+         would leave the punt below unreachable.  */
+      nbytes = u8_mbtoucr (&c, ustr + byte_idx, len - byte_idx);
+      if (nbytes == 0)
+        break;
+      else if (nbytes < 0)
+        /* Bad UTF-8; punt.  */
+        return scm_i_string_hash (scm_from_utf8_stringn (str, len));
+
+      h = (unsigned long) c + h * 37;
+      byte_idx += nbytes;
+    }
+
+  return h;
+}
+
 /* Dirk:FIXME:: why downcase for characters? (2x: scm_hasher, scm_ihashv) */
 /* Dirk:FIXME:: scm_hasher could be made static. */
 
diff --git a/libguile/hash.h b/libguile/hash.h
index 2ebc05352..307748617 100644
--- a/libguile/hash.h
+++ b/libguile/hash.h
@@ -3,7 +3,7 @@
 #ifndef SCM_HASH_H
 #define SCM_HASH_H
 
-/* Copyright (C) 1995,1996,2000, 2006, 2008 Free Software Foundation, Inc.
+/* Copyright (C) 1995,1996,2000, 2006, 2008, 2011 Free Software Foundation, Inc.
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public License
@@ -28,6 +28,13 @@
 
 
 SCM_API unsigned long scm_string_hash (const unsigned char *str, size_t len);
+SCM_INTERNAL unsigned long scm_i_locale_string_hash (const char *str,
+                                                     size_t len);
+SCM_INTERNAL unsigned long scm_i_latin1_string_hash (const char *str,
+                                                     size_t len);
+SCM_INTERNAL unsigned long scm_i_utf8_string_hash (const char *str,
+                                                   size_t len);
+
 SCM_INTERNAL unsigned long scm_i_string_hash (SCM str);
 SCM_API unsigned long scm_hasher (SCM obj, unsigned long n, size_t d);
 SCM_API unsigned long scm_ihashq (SCM obj, unsigned long n);