diff options
author | Andy Wingo <wingo@pobox.com> | 2017-02-16 12:57:46 +0100 |
---|---|---|
committer | Andy Wingo <wingo@pobox.com> | 2017-02-16 13:11:29 +0100 |
commit | d0934df1f2f0e5d3fa9a1a1f15e6f2dec1d15698 (patch) | |
tree | 4f15abdb93948be9df6510b8dd9cb658a56fb3ae /libguile | |
parent | c38b9625c88f4d1760068926273b6d89ffbd4527 (diff) |
Stringbufs immutable by default
* libguile/snarf.h (SCM_IMMUTABLE_STRINGBUF): Remove shared flag.
Stringbufs are immutable by default.
* libguile/strings.c: Rewrite blurb. Change to have stringbufs be
immutable by default and mutable only when marked as such. Going
mutable means making a private copy.
(STRINGBUF_MUTABLE, STRINGBUF_F_MUTABLE): New definitions.
(SET_STRINGBUF_SHARED): Remove.
(scm_i_print_stringbuf): Simplify to just alias the stringbuf as-is.
(substring_with_immutable_stringbuf): New helper.
(scm_i_substring, scm_i_substring_read_only, scm_i_substring_copy):
use new helper.
(scm_i_string_ensure_mutable_x): New helper.
(scm_i_substring_shared): Use scm_i_string_ensure_mutable_x.
(stringbuf_write_mutex): Remove; yaaaaaaaay.
(scm_i_string_start_writing): Use scm_i_string_ensure_mutable_x. No
more mutex.
(scm_i_string_stop_writing): Now a no-op.
(scm_i_make_symbol): Use substring/copy.
(scm_sys_string_dump, scm_sys_symbol_dump): Update.
* libguile/strings.h (SCM_I_STRINGBUF_F_SHARED): Remove.
(SCM_I_STRINGBUF_F_MUTABLE): Add.
* module/system/vm/assembler.scm (link-data): Don't add shared flag any
more. Existing compiled flags are harmless tho.
* test-suite/tests/strings.test ("string internals"): Update.
Diffstat (limited to 'libguile')
-rw-r--r-- | libguile/snarf.h | 2 | ||||
-rw-r--r-- | libguile/strings.c | 325 | ||||
-rw-r--r-- | libguile/strings.h | 2 |
3 files changed, 129 insertions, 200 deletions
diff --git a/libguile/snarf.h b/libguile/snarf.h index d0b683308..aafd5bd13 100644 --- a/libguile/snarf.h +++ b/libguile/snarf.h @@ -308,7 +308,7 @@ SCM_SNARF_INIT(scm_set_smob_apply((tag), (c_name), (req), (opt), (rest));) } \ c_name = \ { \ - scm_tc7_stringbuf | SCM_I_STRINGBUF_F_SHARED, \ + scm_tc7_stringbuf, \ sizeof (contents) - 1, \ contents \ } diff --git a/libguile/strings.c b/libguile/strings.c index a153d29ab..e460a938d 100644 --- a/libguile/strings.c +++ b/libguile/strings.c @@ -54,40 +54,34 @@ SCM_SYMBOL (sym_UTF_8, "UTF-8"); SCM_SYMBOL (sym_ISO_8859_1, "ISO-8859-1"); SCM_SYMBOL (sym_error, "error"); -/* Stringbufs - * - * XXX - keeping an accurate refcount during GC seems to be quite - * tricky, so we just keep score of whether a stringbuf might be - * shared, not whether it definitely is. - * - * The scheme I (mvo) tried to keep an accurate reference count would - * recount all strings that point to a stringbuf during the mark-phase - * of the GC. This was done since one cannot access the stringbuf of - * a string when that string is freed (in order to decrease the - * reference count). The memory of the stringbuf might have been - * reused already for something completely different. - * - * This recounted worked for a small number of threads beating on - * cow-strings, but it failed randomly with more than 10 threads, say. - * I couldn't figure out what went wrong, so I used the conservative - * approach implemented below. - * - * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit - * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4) - * strings. - */ +/* A stringbuf is a linear buffer of characters. Every string has a + stringbuf. Strings may reference just a slice of a stringbuf; that's + often the case for strings made by the "substring" function. + + Stringbufs may hold either 8-bit characters or 32-bit characters. In + either case the characters are Unicode codepoints. "Narrow" + stringbufs thus have the ISO-8859-1 (Latin-1) encoding, and "wide" + stringbufs have the UTF-32 (UCS-4) encoding. + + By default, stringbufs are immutable. This enables an O(1) + "substring" operation with no synchronization. A string-set! will + first ensure that the string's stringbuf is mutable, copying the + stringbuf if necessary. This is therefore a copy-on-write + representation. However, taking a substring of a mutable stringbuf + is an O(n) operation as it has to create a new immutable stringbuf. + There are also mutation-sharing substrings as well. */ /* The size in words of the stringbuf header (type tag + size). */ #define STRINGBUF_HEADER_SIZE 2U #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM)) -#define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE +#define STRINGBUF_F_MUTABLE SCM_I_STRINGBUF_F_MUTABLE #define STRINGBUF_TAG scm_tc7_stringbuf -#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED) #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE) +#define STRINGBUF_MUTABLE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_MUTABLE) #define STRINGBUF_CONTENTS(buf) ((void *) \ SCM_CELL_OBJECT_LOC (buf, \ @@ -97,16 +91,6 @@ SCM_SYMBOL (sym_error, "error"); #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf)) -#define SET_STRINGBUF_SHARED(buf) \ - do \ - { \ - /* Don't modify BUF if it's already marked as shared since it might be \ - a read-only, statically allocated stringbuf. */ \ - if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \ - SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \ - } \ - while (0) - #ifdef SCM_STRING_LENGTH_HISTOGRAM static size_t lenhist[1001]; #endif @@ -231,8 +215,6 @@ narrow_stringbuf (SCM buf) return new_buf; } -scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER; - /* Copy-on-write strings. */ @@ -267,15 +249,8 @@ scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER; void scm_i_print_stringbuf (SCM exp, SCM port, scm_print_state *pstate) { - SCM str; - - scm_i_pthread_mutex_lock (&stringbuf_write_mutex); - SET_STRINGBUF_SHARED (exp); - scm_i_pthread_mutex_unlock (&stringbuf_write_mutex); - - str = scm_double_cell (RO_STRING_TAG, SCM_UNPACK(exp), - 0, STRINGBUF_LENGTH (exp)); - + SCM str = scm_double_cell (STRING_TAG, SCM_UNPACK(exp), 0, + STRINGBUF_LENGTH (exp)); scm_puts ("#<stringbuf ", port); scm_iprin1 (str, port, pstate); scm_puts (">", port); @@ -289,7 +264,6 @@ static void init_null_stringbuf (void) { null_stringbuf = make_stringbuf (0); - SET_STRINGBUF_SHARED (null_stringbuf); } /* Create a scheme string with space for LEN 8-bit Latin-1-encoded @@ -359,77 +333,110 @@ get_str_buf_start (SCM *str, SCM *buf, size_t *start) *buf = STRING_STRINGBUF (*str); } -SCM -scm_i_substring (SCM str, size_t start, size_t end) +static SCM +substring_with_immutable_stringbuf (SCM str, size_t start, size_t end, + int force_copy_p, int read_only_p) { - if (start == end) - return scm_i_make_string (0, NULL, 0); + SCM buf; + size_t str_start, len; + scm_t_bits tag = read_only_p ? RO_STRING_TAG : STRING_TAG; + + get_str_buf_start (&str, &buf, &str_start); + len = end - start; + start += str_start; + + if (len == 0) + return scm_i_make_string (0, NULL, read_only_p); + else if (!force_copy_p && SCM_LIKELY (!STRINGBUF_MUTABLE (buf))) + return scm_double_cell (tag, SCM_UNPACK (buf), start, len); else { - SCM buf; - size_t str_start; - get_str_buf_start (&str, &buf, &str_start); - scm_i_pthread_mutex_lock (&stringbuf_write_mutex); - SET_STRINGBUF_SHARED (buf); - scm_i_pthread_mutex_unlock (&stringbuf_write_mutex); - return scm_double_cell (STRING_TAG, SCM_UNPACK(buf), - (scm_t_bits)str_start + start, - (scm_t_bits) end - start); + SCM new_buf, new_str; + + if (STRINGBUF_WIDE (buf)) + { + new_buf = make_wide_stringbuf (len); + u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf), + (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + start), len); + new_str = scm_double_cell (tag, SCM_UNPACK (new_buf), 0, len); + scm_i_try_narrow_string (new_str); + } + else + { + new_buf = make_stringbuf (len); + memcpy (STRINGBUF_CHARS (new_buf), + STRINGBUF_CHARS (buf) + start, len); + new_str = scm_double_cell (tag, SCM_UNPACK (new_buf), 0, len); + } + + return new_str; } } SCM +scm_i_substring (SCM str, size_t start, size_t end) +{ + return substring_with_immutable_stringbuf (str, start, end, 0, 0); +} + +SCM scm_i_substring_read_only (SCM str, size_t start, size_t end) { - if (start == end) - return scm_i_make_string (0, NULL, 1); - else - { - SCM buf; - size_t str_start; - get_str_buf_start (&str, &buf, &str_start); - scm_i_pthread_mutex_lock (&stringbuf_write_mutex); - SET_STRINGBUF_SHARED (buf); - scm_i_pthread_mutex_unlock (&stringbuf_write_mutex); - return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf), - (scm_t_bits)str_start + start, - (scm_t_bits) end - start); - } + return substring_with_immutable_stringbuf (str, start, end, 0, 1); } SCM scm_i_substring_copy (SCM str, size_t start, size_t end) { - if (start == end) - return scm_i_make_string (0, NULL, 0); - else + return substring_with_immutable_stringbuf (str, start, end, 1, 0); +} + +static void +scm_i_string_ensure_mutable_x (SCM str) +{ + SCM buf; + + if (IS_SH_STRING (str)) { - size_t len = end - start; - SCM buf, my_buf, substr; - size_t str_start; - int wide = 0; - get_str_buf_start (&str, &buf, &str_start); - if (scm_i_is_narrow_string (str)) - { - my_buf = make_stringbuf (len); - memcpy (STRINGBUF_CHARS (my_buf), - STRINGBUF_CHARS (buf) + str_start + start, len); - } - else - { - my_buf = make_wide_stringbuf (len); - u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf), - (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start - + start), len); - wide = 1; - } - scm_remember_upto_here_1 (buf); - substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf), - (scm_t_bits) 0, (scm_t_bits) len); - if (wide) - scm_i_try_narrow_string (substr); - return substr; + /* Shared-mutation strings always have mutable stringbufs. */ + buf = STRING_STRINGBUF (SH_STRING_STRING (str)); + if (!STRINGBUF_MUTABLE (buf)) + abort (); + return; } + + if (IS_RO_STRING (str)) + scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (str)); + + buf = STRING_STRINGBUF (str); + + if (STRINGBUF_MUTABLE (buf)) + return; + + /* Otherwise copy and mark the fresh stringbuf as mutable. Note that + we copy the whole stringbuf so that the start/len offsets from the + original string keep working, so that concurrent accessors on this + string don't see things in an inconsistent state. */ + { + SCM new_buf; + size_t len = STRINGBUF_LENGTH (buf); + + if (STRINGBUF_WIDE (buf)) + { + new_buf = make_wide_stringbuf (len); + u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf), + (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len); + } + else + { + new_buf = make_stringbuf (len); + memcpy (STRINGBUF_CHARS (new_buf), STRINGBUF_CHARS (buf), len); + } + + SCM_SET_CELL_WORD_0 (new_buf, + SCM_CELL_WORD_0 (new_buf) | STRINGBUF_F_MUTABLE); + SET_STRING_STRINGBUF (str, new_buf); + } } SCM @@ -439,6 +446,8 @@ scm_i_substring_shared (SCM str, size_t start, size_t end) return str; else if (start == end) return scm_i_make_string (0, NULL, 0); + else if (IS_RO_STRING (str)) + return scm_i_substring_read_only (str, start, end); else { size_t len = end - start; @@ -447,6 +456,9 @@ scm_i_substring_shared (SCM str, size_t start, size_t end) start += STRING_START (str); str = SH_STRING_STRING (str); } + + scm_i_string_ensure_mutable_x (str); + return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str), (scm_t_bits)start, (scm_t_bits) len); } @@ -568,60 +580,13 @@ scm_i_string_wide_chars (SCM str) scm_list_1 (str)); } -/* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to +/* If the buffer in ORIG_STR is immutable, copy ORIG_STR's characters to a new string buffer, so that it can be modified without modifying - other strings. Also, lock the string mutex. Later, one must call - scm_i_string_stop_writing to unlock the mutex. */ + other strings. */ SCM scm_i_string_start_writing (SCM orig_str) { - SCM buf, str = orig_str; - size_t start; - - get_str_buf_start (&str, &buf, &start); - if (IS_RO_STRING (str)) - scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str)); - - scm_i_pthread_mutex_lock (&stringbuf_write_mutex); - if (STRINGBUF_SHARED (buf)) - { - /* Clone the stringbuf. */ - size_t len = STRING_LENGTH (str); - SCM new_buf; - - scm_i_pthread_mutex_unlock (&stringbuf_write_mutex); - - if (scm_i_is_narrow_string (str)) - { - new_buf = make_stringbuf (len); - memcpy (STRINGBUF_CHARS (new_buf), - STRINGBUF_CHARS (buf) + STRING_START (str), len); - - } - else - { - new_buf = make_wide_stringbuf (len); - u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf), - (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) - + STRING_START (str)), len); - } - - SET_STRING_STRINGBUF (str, new_buf); - start -= STRING_START (str); - - /* FIXME: The following operations are not atomic, so other threads - looking at STR may see an inconsistent state. Nevertheless it can't - hurt much since (i) accessing STR while it is being mutated can't - yield a crash, and (ii) concurrent accesses to STR should be - protected by a mutex at the application level. The latter may not - apply when STR != ORIG_STR, though. */ - SET_STRING_START (str, 0); - SET_STRING_STRINGBUF (str, new_buf); - - buf = new_buf; - - scm_i_pthread_mutex_lock (&stringbuf_write_mutex); - } + scm_i_string_ensure_mutable_x (orig_str); return orig_str; } @@ -661,7 +626,6 @@ scm_i_string_writable_wide_chars (SCM str) void scm_i_string_stop_writing (void) { - scm_i_pthread_mutex_unlock (&stringbuf_write_mutex); } /* Return the Xth character of STR as a UCS-4 codepoint. */ @@ -768,42 +732,10 @@ scm_i_make_symbol (SCM name, scm_t_bits flags, unsigned long hash, SCM props) { SCM buf; - size_t start = STRING_START (name); size_t length = STRING_LENGTH (name); - if (IS_SH_STRING (name)) - { - name = SH_STRING_STRING (name); - start += STRING_START (name); - } + name = scm_i_substring_copy (name, 0, length); buf = STRING_STRINGBUF (name); - - if (start == 0 && length == STRINGBUF_LENGTH (buf)) - { - /* reuse buf. */ - scm_i_pthread_mutex_lock (&stringbuf_write_mutex); - SET_STRINGBUF_SHARED (buf); - scm_i_pthread_mutex_unlock (&stringbuf_write_mutex); - } - else - { - /* make new buf. */ - if (scm_i_is_narrow_string (name)) - { - SCM new_buf = make_stringbuf (length); - memcpy (STRINGBUF_CHARS (new_buf), - STRINGBUF_CHARS (buf) + start, length); - buf = new_buf; - } - else - { - SCM new_buf = make_wide_stringbuf (length); - u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf), - (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start, - length); - buf = new_buf; - } - } return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf), (scm_t_bits) hash, SCM_UNPACK (props)); } @@ -882,9 +814,6 @@ SCM scm_i_symbol_substring (SCM sym, size_t start, size_t end) { SCM buf = SYMBOL_STRINGBUF (sym); - scm_i_pthread_mutex_lock (&stringbuf_write_mutex); - SET_STRINGBUF_SHARED (buf); - scm_i_pthread_mutex_unlock (&stringbuf_write_mutex); return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf), (scm_t_bits)start, (scm_t_bits) end - start); } @@ -921,8 +850,8 @@ SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str), "A new string containing this string's stringbuf's characters\n" "@item stringbuf-length\n" "The number of characters in this stringbuf\n" - "@item stringbuf-shared\n" - "@code{#t} if this stringbuf is shared\n" + "@item stringbuf-mutable\n" + "@code{#t} if this stringbuf is mutable\n" "@item stringbuf-wide\n" "@code{#t} if this stringbuf's characters are stored in a\n" "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n" @@ -984,11 +913,11 @@ SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str), } e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"), scm_from_size_t (STRINGBUF_LENGTH (buf))); - if (STRINGBUF_SHARED (buf)) - e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"), + if (STRINGBUF_MUTABLE (buf)) + e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-mutable"), SCM_BOOL_T); else - e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"), + e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-mutable"), SCM_BOOL_F); if (STRINGBUF_WIDE (buf)) e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"), @@ -1015,8 +944,8 @@ SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym), "A new string containing this symbols's stringbuf's characters\n" "@item stringbuf-length\n" "The number of characters in this stringbuf\n" - "@item stringbuf-shared\n" - "@code{#t} if this stringbuf is shared\n" + "@item stringbuf-mutable\n" + "@code{#t} if this stringbuf is mutable\n" "@item stringbuf-wide\n" "@code{#t} if this stringbuf's characters are stored in a\n" "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n" @@ -1057,11 +986,11 @@ SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym), } e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"), scm_from_size_t (STRINGBUF_LENGTH (buf))); - if (STRINGBUF_SHARED (buf)) - e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"), + if (STRINGBUF_MUTABLE (buf)) + e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-mutable"), SCM_BOOL_T); else - e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"), + e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-mutable"), SCM_BOOL_F); if (STRINGBUF_WIDE (buf)) e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"), diff --git a/libguile/strings.h b/libguile/strings.h index 882e7ce64..77690ce67 100644 --- a/libguile/strings.h +++ b/libguile/strings.h @@ -182,8 +182,8 @@ SCM_API SCM scm_makfromstrs (int argc, char **argv); #define scm_tc7_ro_string (scm_tc7_string + 0x200) /* Flags for shared and wide strings. */ -#define SCM_I_STRINGBUF_F_SHARED 0x100 #define SCM_I_STRINGBUF_F_WIDE 0x400 +#define SCM_I_STRINGBUF_F_MUTABLE 0x800 SCM_INTERNAL void scm_i_print_stringbuf (SCM exp, SCM port, scm_print_state *pstate); |