diff options
author | Michael Gran <spk121@yahoo.com> | 2009-08-21 08:57:35 -0700 |
---|---|---|
committer | Michael Gran <spk121@yahoo.com> | 2009-08-21 08:57:35 -0700 |
commit | e23106d53eb03d7cb4962282396269176ea7482e (patch) | |
tree | d7736a8f7f16af0c73365f688f75561aac81c5eb | |
parent | 90305ce9e429f0381ff79427e71287fdafd4d201 (diff) |
Add initial support for wide symbols
* libguile/hash.c (scm_i_string_hash): new function
(scm_hasher): don't unpack string: use scm_i_string_hash
* libguile/hash.h: new declaration for scm_i_string_hash
* libguile/print.c (quote_keywordish_symbol): use symbol accessors
(scm_i_print_symbol_name): new function
(scm_print_symbol_name): call scm_i_print_symbol_name
(iprin1): use scm_i_print_symbol_name to print symbols
* libguile/print.h: new declaration for scm_i_print_symbol_name
* libguile/symbols.c (lookup_interned_symbol): now takes scheme string
instead of c string; callers changed
(lookup_interned_symbol): add wide symbol support
(scm_i_c_mem2symbol): removed
(scm_i_mem2symbol): removed and replaced with scm_i_str2symbol
(scm_i_str2symbol): new function
(scm_i_mem2uninterned_symbol): removed and replaced with
scm_i_str2uninterned_symbol
(scm_i_str2uninterned_symbol): new function
(scm_make_symbol, scm_string_to_symbol, scm_from_locale_symbol)
(scm_from_locale_symboln): use scm_i_str2symbol
* test-suite/tests/symbols.test: new tests
-rw-r--r-- | libguile/hash.c | 17 | ||||
-rw-r--r-- | libguile/hash.h | 1 | ||||
-rw-r--r-- | libguile/print.c | 59 | ||||
-rw-r--r-- | libguile/print.h | 1 | ||||
-rw-r--r-- | libguile/symbols.c | 108 | ||||
-rw-r--r-- | test-suite/tests/symbols.test | 39 |
6 files changed, 113 insertions, 112 deletions
diff --git a/libguile/hash.c b/libguile/hash.c index d2fe17706..e6e38ba50 100644 --- a/libguile/hash.c +++ b/libguile/hash.c @@ -50,6 +50,20 @@ scm_string_hash (const unsigned char *str, size_t len) return h; } +unsigned long +scm_i_string_hash (SCM str) +{ + size_t len = scm_i_string_length (str); + size_t i = 0; + + unsigned long h = 0; + while (len-- > 0) + h = (unsigned long) scm_i_string_ref (str, i++) + h * 37; + + scm_remember_upto_here_1 (str); + return h; +} + /* Dirk:FIXME:: why downcase for characters? (2x: scm_hasher, scm_ihashv) */ /* Dirk:FIXME:: scm_hasher could be made static. */ @@ -115,8 +129,7 @@ scm_hasher(SCM obj, unsigned long n, size_t d) case scm_tc7_string: { unsigned long hash = - scm_string_hash ((const unsigned char *) scm_i_string_chars (obj), - scm_i_string_length (obj)) % n; + scm_i_string_hash (obj) % n; scm_remember_upto_here_1 (obj); return hash; } diff --git a/libguile/hash.h b/libguile/hash.h index 789595b42..2ebc05352 100644 --- a/libguile/hash.h +++ b/libguile/hash.h @@ -28,6 +28,7 @@ SCM_API unsigned long scm_string_hash (const unsigned char *str, size_t len); +SCM_INTERNAL unsigned long scm_i_string_hash (SCM str); SCM_API unsigned long scm_hasher (SCM obj, unsigned long n, size_t d); SCM_API unsigned long scm_ihashq (SCM obj, unsigned long n); SCM_API SCM scm_hashq (SCM obj, SCM n); diff --git a/libguile/print.c b/libguile/print.c index 74f7d8db6..07bff47fd 100644 --- a/libguile/print.c +++ b/libguile/print.c @@ -295,13 +295,12 @@ print_circref (SCM port, scm_print_state *pstate, SCM ref) /* Print the name of a symbol. */ static int -quote_keywordish_symbol (const char *str, size_t len) +quote_keywordish_symbol (SCM symbol) { SCM option; - /* LEN is guaranteed to be > 0. - */ - if (str[0] != ':' && str[len-1] != ':') + if (scm_i_symbol_ref (symbol, 0) != ':' + && scm_i_symbol_ref (symbol, scm_i_symbol_length (symbol) - 1) != ':') return 0; option = SCM_PRINT_KEYWORD_STYLE; @@ -313,7 +312,7 @@ quote_keywordish_symbol (const char *str, size_t len) } void -scm_print_symbol_name (const char *str, size_t len, SCM port) +scm_i_print_symbol_name (SCM str, SCM port) { /* This points to the first character that has not yet been written to the * port. */ @@ -334,18 +333,20 @@ scm_print_symbol_name (const char *str, size_t len, SCM port) * simpler and faster. */ int maybe_weird = 0; size_t mw_pos = 0; + size_t len = scm_i_symbol_length (str); + scm_t_wchar str0 = scm_i_symbol_ref (str, 0); - if (len == 0 || str[0] == '\'' || str[0] == '`' || str[0] == ',' - || quote_keywordish_symbol (str, len) - || (str[0] == '.' && len == 1) - || scm_is_true (scm_c_locale_stringn_to_number (str, len, 10))) + if (len == 0 || str0 == '\'' || str0 == '`' || str0 == ',' + || quote_keywordish_symbol (str) + || (str0 == '.' && len == 1) + || scm_is_true (scm_i_string_to_number (scm_symbol_to_string (str), 10))) { scm_lfwrite ("#{", 2, port); weird = 1; } for (end = pos; end < len; ++end) - switch (str[end]) + switch (scm_i_symbol_ref (str, end)) { #ifdef BRACKETS_AS_PARENS case '[': @@ -370,11 +371,11 @@ scm_print_symbol_name (const char *str, size_t len, SCM port) weird = 1; } if (pos < end) - scm_lfwrite (str + pos, end - pos, port); + scm_lfwrite_substr (scm_symbol_to_string (str), pos, end, port); { char buf[2]; buf[0] = '\\'; - buf[1] = str[end]; + buf[1] = (char) (unsigned char) scm_i_symbol_ref (str, end); scm_lfwrite (buf, 2, port); } pos = end + 1; @@ -392,11 +393,18 @@ scm_print_symbol_name (const char *str, size_t len, SCM port) break; } if (pos < end) - scm_lfwrite (str + pos, end - pos, port); + scm_lfwrite_substr (scm_symbol_to_string (str), pos, end, port); if (weird) scm_lfwrite ("}#", 2, port); } +void +scm_print_symbol_name (const char *str, size_t len, SCM port) +{ + SCM symbol = scm_from_locale_symboln (str, len); + return scm_i_print_symbol_name (symbol, port); +} + /* Print generally. Handles both write and display according to PSTATE. */ SCM_GPROC(s_write, "write", 1, 1, 0, scm_write, g_write); @@ -665,16 +673,13 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate) case scm_tc7_symbol: if (scm_i_symbol_is_interned (exp)) { - scm_print_symbol_name (scm_i_symbol_chars (exp), - scm_i_symbol_length (exp), port); + scm_i_print_symbol_name (exp, port); scm_remember_upto_here_1 (exp); } else { scm_puts ("#<uninterned-symbol ", port); - scm_print_symbol_name (scm_i_symbol_chars (exp), - scm_i_symbol_length (exp), - port); + scm_i_print_symbol_name (exp, port); scm_putc (' ', port); scm_uintprint (SCM_UNPACK (exp), 16, port); scm_putc ('>', port); @@ -726,14 +731,16 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate) EXIT_NESTED_DATA (pstate); break; case scm_tcs_subrs: - scm_puts (SCM_SUBR_GENERIC (exp) - ? "#<primitive-generic " - : "#<primitive-procedure ", - port); - scm_puts (scm_i_symbol_chars (SCM_SUBR_NAME (exp)), port); - scm_putc ('>', port); - break; - + { + SCM name = scm_symbol_to_string (SCM_SUBR_NAME (exp)); + scm_puts (SCM_SUBR_GENERIC (exp) + ? "#<primitive-generic " + : "#<primitive-procedure ", + port); + scm_lfwrite_str (name, port); + scm_putc ('>', port); + break; + } case scm_tc7_pws: scm_puts ("#<procedure-with-setter", port); { diff --git a/libguile/print.h b/libguile/print.h index 00648efc1..3e2333ddd 100644 --- a/libguile/print.h +++ b/libguile/print.h @@ -82,6 +82,7 @@ SCM_API void scm_intprint (scm_t_intmax n, int radix, SCM port); SCM_API void scm_uintprint (scm_t_uintmax n, int radix, SCM port); SCM_API void scm_ipruk (char *hdr, SCM ptr, SCM port); SCM_API void scm_iprlist (char *hdr, SCM exp, int tlr, SCM port, scm_print_state *pstate); +SCM_INTERNAL void scm_i_print_symbol_name (SCM sym, SCM port); SCM_API void scm_print_symbol_name (const char *str, size_t len, SCM port); SCM_API void scm_prin1 (SCM exp, SCM port, int writingp); SCM_API void scm_iprin1 (SCM exp, SCM port, scm_print_state *pstate); diff --git a/libguile/symbols.c b/libguile/symbols.c index c0ba2a8b4..a9320163a 100644 --- a/libguile/symbols.c +++ b/libguile/symbols.c @@ -89,11 +89,11 @@ scm_i_hash_symbol (SCM obj, unsigned long n, void *closure) } static SCM -lookup_interned_symbol (const char *name, size_t len, - unsigned long raw_hash) +lookup_interned_symbol (SCM name, unsigned long raw_hash) { /* Try to find the symbol in the symbols table */ SCM l; + size_t len = scm_i_string_length (name); unsigned long hash = raw_hash % SCM_HASHTABLE_N_BUCKETS (symbols); for (l = SCM_HASHTABLE_BUCKET (symbols, hash); @@ -104,15 +104,32 @@ lookup_interned_symbol (const char *name, size_t len, if (scm_i_symbol_hash (sym) == raw_hash && scm_i_symbol_length (sym) == len) { - const char *chrs = scm_i_symbol_chars (sym); - size_t i = len; - - while (i != 0) - { - --i; - if (name[i] != chrs[i]) - goto next_symbol; - } + size_t i = len; + + /* Slightly faster path for comparing narrow to narrow. */ + if (scm_i_is_narrow_string (name) && scm_i_is_narrow_symbol (sym)) + { + const char *chrs = scm_i_symbol_chars (sym); + const char *str = scm_i_string_chars (name); + + while (i != 0) + { + --i; + if (str[i] != chrs[i]) + goto next_symbol; + } + } + else + { + /* Somewhat slower path for comparing narrow to wide or + wide to wide. */ + while (i != 0) + { + --i; + if (scm_i_string_ref (name, i) != scm_i_symbol_ref (sym, i)) + goto next_symbol; + } + } return sym; } @@ -142,32 +159,12 @@ intern_symbol (SCM symbol) } static SCM -scm_i_c_mem2symbol (const char *name, size_t len) -{ - SCM symbol; - size_t raw_hash = scm_string_hash ((const unsigned char *) name, len); - - symbol = lookup_interned_symbol (name, len, raw_hash); - if (scm_is_false (symbol)) - { - /* The symbol was not found, create it. */ - symbol = scm_i_c_make_symbol (name, len, 0, raw_hash, - scm_cons (SCM_BOOL_F, SCM_EOL)); - intern_symbol (symbol); - } - - return symbol; -} - -static SCM -scm_i_mem2symbol (SCM str) +scm_i_str2symbol (SCM str) { SCM symbol; - const char *name = scm_i_string_chars (str); - size_t len = scm_i_string_length (str); - size_t raw_hash = scm_string_hash ((const unsigned char *) name, len); + size_t raw_hash = scm_i_string_hash (str); - symbol = lookup_interned_symbol (name, len, raw_hash); + symbol = lookup_interned_symbol (str, raw_hash); if (scm_is_false (symbol)) { /* The symbol was not found, create it. */ @@ -181,11 +178,9 @@ scm_i_mem2symbol (SCM str) static SCM -scm_i_mem2uninterned_symbol (SCM str) +scm_i_str2uninterned_symbol (SCM str) { - const char *name = scm_i_string_chars (str); - size_t len = scm_i_string_length (str); - size_t raw_hash = scm_string_hash ((const unsigned char *) name, len); + size_t raw_hash = scm_i_string_hash (str); return scm_i_make_symbol (str, SCM_I_F_SYMBOL_UNINTERNED, raw_hash, scm_cons (SCM_BOOL_F, SCM_EOL)); @@ -220,7 +215,7 @@ SCM_DEFINE (scm_make_symbol, "make-symbol", 1, 0, 0, #define FUNC_NAME s_scm_make_symbol { SCM_VALIDATE_STRING (1, name); - return scm_i_mem2uninterned_symbol (name); + return scm_i_str2uninterned_symbol (name); } #undef FUNC_NAME @@ -282,7 +277,7 @@ SCM_DEFINE (scm_string_to_symbol, "string->symbol", 1, 0, 0, #define FUNC_NAME s_scm_string_to_symbol { SCM_VALIDATE_STRING (1, string); - return scm_i_mem2symbol (string); + return scm_i_str2symbol (string); } #undef FUNC_NAME @@ -389,44 +384,23 @@ SCM_DEFINE (scm_symbol_pset_x, "symbol-pset!", 2, 0, 0, SCM scm_from_locale_symbol (const char *sym) { - return scm_i_c_mem2symbol (sym, strlen (sym)); + return scm_from_locale_symboln (sym, -1); } SCM scm_from_locale_symboln (const char *sym, size_t len) { - return scm_i_c_mem2symbol (sym, len); + SCM str = scm_from_locale_stringn (sym, len); + return scm_i_str2symbol (str); } SCM scm_take_locale_symboln (char *sym, size_t len) { - SCM res; - unsigned long raw_hash; - - if (len == (size_t)-1) - len = strlen (sym); - else - { - /* Ensure STR is null terminated. A realloc for 1 extra byte should - often be satisfied from the alignment padding after the block, with - no actual data movement. */ - sym = scm_realloc (sym, len+1); - sym[len] = '\0'; - } - - raw_hash = scm_string_hash ((unsigned char *)sym, len); - res = lookup_interned_symbol (sym, len, raw_hash); - if (scm_is_false (res)) - { - res = scm_i_c_take_symbol (sym, len, 0, raw_hash, - scm_cons (SCM_BOOL_F, SCM_EOL)); - intern_symbol (res); - } - else - free (sym); + SCM str; - return res; + str = scm_take_locale_stringn (sym, len); + return scm_i_str2symbol (str); } SCM diff --git a/test-suite/tests/symbols.test b/test-suite/tests/symbols.test index 3b1abe1e9..b6dbb9d59 100644 --- a/test-suite/tests/symbols.test +++ b/test-suite/tests/symbols.test @@ -61,15 +61,13 @@ (let ((s 'x0123456789012345678901234567890123456789)) (not (assq-ref (%symbol-dump s) 'stringbuf-inline)))) - ;; symbol->string isn't ready for UCS-4 yet - - ;;(pass-if "short UCS-4-encoded symbols are not inlined" - ;; (let ((s (string->symbol "\u0100"))) - ;; (not (assq-ref (%symbol-dump s) 'stringbuf-inline)))) + (pass-if "short UCS-4-encoded symbols are not inlined" + (let ((s (string->symbol "\u0100"))) + (not (assq-ref (%symbol-dump s) 'stringbuf-inline)))) - ;;(pass-if "long UCS-4-encoded symbols are not inlined" - ;; (let ((s (string->symbol "\u010012345678901234567890123456789"))) - ;; (not (assq-ref (%symbol-dump s) 'stringbuf-inline)))) + (pass-if "long UCS-4-encoded symbols are not inlined" + (let ((s (string->symbol "\u010012345678901234567890123456789"))) + (not (assq-ref (%symbol-dump s) 'stringbuf-inline)))) (with-test-prefix "hashes" @@ -99,16 +97,13 @@ (let ((s (string->symbol "\xC0\xC1\xC2"))) (not (assq-ref (%symbol-dump s) 'stringbuf-wide)))) - ;; symbol->string isn't ready for UCS-4 yet - - ;;(pass-if "BMP symbols are UCS-4 encoded" - ;; (let ((s (string->symbol "\u0100\u0101\x0102"))) - ;; (assq-ref (%symbol-dump s) 'stringbuf-wide))) + (pass-if "BMP symbols are UCS-4 encoded" + (let ((s (string->symbol "\u0100\u0101\x0102"))) + (assq-ref (%symbol-dump s) 'stringbuf-wide))) - ;;(pass-if "SMP symbols are UCS-4 encoded" - ;; (let ((s (string->symbol "\U010300\u010301\x010302"))) - ;; (assq-ref (%symbol-dump s) 'stringbuf-wide))) - )) + (pass-if "SMP symbols are UCS-4 encoded" + (let ((s (string->symbol "\U010300\u010301\x010302"))) + (assq-ref (%symbol-dump s) 'stringbuf-wide))))) ;;; ;;; symbol? @@ -125,6 +120,16 @@ (pass-if "symbol" (symbol? 'foo))) +;;; +;;; wide symbols +;;; + +(with-test-prefix "BMP symbols" + + (pass-if "BMP symbol's string" + (and (= 4 (string-length "abc\u0100")) + (string=? "abc\u0100" + (symbol->string (string->symbol "abc\u0100")))))) ;;; ;;; symbol->string |