summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Gran <spk121@yahoo.com>2009-08-08 02:35:00 -0700
committerMichael Gran <spk121@yahoo.com>2009-08-08 02:35:00 -0700
commit9c44cd4559a5d04ba70bbd9ff47f41bfdfebd09d (patch)
treeea93002217c56ad32f123836dab5d6fd1ef02930
parenta876e7dcea78e770bedba40017fbb225cf88bff5 (diff)
Add Unicode strings and symbols
This adds full Unicode strings as a datatype, and it adds some minimal functionality. The terminal and port encoding is assumed to be ISO-8859-1. Non-ISO-8859-1 characters are written or input as string character escapes. The string character escapes now have 3 forms: \xXX \uXXXX and \UXXXXXX, for unprintable characters that have 2, 4 or 6 hex digits. The process for writing to strings has been modified. There is now a function scm_i_string_start_writing that does the copy-on-write conversion if necessary. To compile strings that may be wide, the VM storage of strings and string-likes has changed. Most string-using functions have not yet been updated and may break when used with wide strings. * module/language/assembly/compile-bytecode.scm (write-bytecode): use variable width string bytecode format * module/language/assembly.scm (byte-length): use variable width bytecode format * libguile/vm-i-loader.c (load-string, load-symbol): (load-keyword, define): use variable-width bytecode format * libguile/vm-engine.h (FETCH_WIDTH): new macro * libguile/strings.h: new declarations * libguile/strings.c (make_wide_stringbuf): new function (widen_stringbuf): new function (scm_i_make_wide_string): new function (scm_i_is_narrow_string): new function (scm_i_string_wide_chars): new function (scm_i_string_start_writing): new function (scm_i_string_ref): new function (scm_i_string_set_x): new function (scm_i_is_narrow_symbol): new function (scm_i_symbol_wide_chars, scm_i_symbol_ref): new function (scm_string_width): new function (unistring_escapes_to_guile_escapes): new function (scm_to_stringn): new function (scm_i_stringbuf_free): modify for wide strings (scm_i_substring_copy): modify for wide strings (scm_i_string_chars, scm_string_append): modify for wide strings (scm_i_make_symbol, scm_to_locale_stringn): modify for wide strings (scm_string_dump, scm_symbol_dump, scm_to_locale_stringbuf): (scm_string, scm_i_deprecated_string_chars): modify for wide strings (scm_from_locale_string, scm_from_locale_stringn): add null test * libguile/srfi-13.c: add calls for scm_i_string_start_writing for each call of scm_i_string_stop_writing (scm_string_for_each): modify for wide strings * libguile/socket.c: add calls for scm_i_string_start_writing for each call of scm_i_string_stop_writing * libguile/rw.c: add calls for scm_i_string_start_writing for each call of scm_i_string_stop_writing * libguile/read.c (scm_read_string): allow reading of wide strings * libguile/print.h: add declaration for scm_charprint * libguile/print.c (iprin1): print wide strings and add new string escapes (scm_charprint): new function * libguile/ports.h: new declarations for scm_lfwrite_substr and scm_lfwrite_str * libguile/ports.c (update_port_lf): new function (scm_lfwrite): use update_port_lf (scm_lfwrite_substr): new function (scm_lfwrite_str): new function * test-suite/tests/asm-to-bytecode.test ("compiler"): add string width byte to sting-like asm tests
-rw-r--r--libguile/ports.c90
-rw-r--r--libguile/ports.h3
-rw-r--r--libguile/print.c157
-rw-r--r--libguile/print.h1
-rw-r--r--libguile/read.c233
-rw-r--r--libguile/rw.c2
-rw-r--r--libguile/socket.c3
-rw-r--r--libguile/srfi-13.c23
-rw-r--r--libguile/strings.c649
-rw-r--r--libguile/strings.h59
-rw-r--r--libguile/vm-engine.h1
-rw-r--r--libguile/vm-i-loader.c87
-rw-r--r--module/language/assembly.scm12
-rw-r--r--module/language/assembly/compile-bytecode.scm26
-rw-r--r--test-suite/tests/asm-to-bytecode.test6
15 files changed, 1046 insertions, 306 deletions
diff --git a/libguile/ports.c b/libguile/ports.c
index 627fd3f00..2c1a3898f 100644
--- a/libguile/ports.c
+++ b/libguile/ports.c
@@ -969,7 +969,35 @@ scm_fill_input (SCM port)
* This function differs from scm_c_write; it updates port line and
* column. */
-void
+static void
+update_port_lf (scm_t_wchar c, SCM port)
+{
+ if (c == '\a')
+ {
+ }
+ else if (c == '\b')
+ {
+ SCM_DECCOL (port);
+ }
+ else if (c == '\n')
+ {
+ SCM_INCLINE (port);
+ }
+ else if (c == '\r')
+ {
+ SCM_ZEROCOL (port);
+ }
+ else if (c == '\t')
+ {
+ SCM_TABCOL (port);
+ }
+ else
+ {
+ SCM_INCCOL (port);
+ }
+}
+
+void
scm_lfwrite (const char *ptr, size_t size, SCM port)
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
@@ -980,30 +1008,54 @@ scm_lfwrite (const char *ptr, size_t size, SCM port)
ptob->write (port, ptr, size);
- for (; size; ptr++, size--) {
- if (*ptr == '\a') {
- }
- else if (*ptr == '\b') {
- SCM_DECCOL(port);
- }
- else if (*ptr == '\n') {
- SCM_INCLINE(port);
- }
- else if (*ptr == '\r') {
- SCM_ZEROCOL(port);
- }
- else if (*ptr == '\t') {
- SCM_TABCOL(port);
- }
- else {
- SCM_INCCOL(port);
+ for (; size; ptr++, size--)
+ update_port_lf ((scm_t_wchar) (unsigned char) *ptr, port);
+
+ if (pt->rw_random)
+ pt->rw_active = SCM_PORT_WRITE;
+}
+
+/* Write a scheme string STR to PORT from START inclusive to END
+ exclusive. */
+void
+scm_lfwrite_substr (SCM str, size_t start, size_t end, SCM port)
+{
+ size_t i, size = scm_i_string_length (str);
+ scm_t_port *pt = SCM_PTAB_ENTRY (port);
+ scm_t_ptob_descriptor *ptob = &scm_ptobs[SCM_PTOBNUM (port)];
+ scm_t_wchar p;
+ char *buf;
+ size_t len;
+
+ if (pt->rw_active == SCM_PORT_READ)
+ scm_end_input (port);
+
+ if (end == -1)
+ end = size;
+ size = end - start;
+
+ buf = scm_to_stringn (scm_c_substring (str, start, end), &len,
+ NULL, iconveh_escape_sequence);
+ ptob->write (port, buf, len);
+ free (buf);
+
+ for (i = 0; i < size; i++)
+ {
+ p = scm_i_string_ref (str, i + start);
+ update_port_lf (p, port);
}
- }
if (pt->rw_random)
pt->rw_active = SCM_PORT_WRITE;
}
+/* Write a scheme string STR to PORT. */
+void
+scm_lfwrite_str (SCM str, SCM port)
+{
+ scm_lfwrite_substr (str, 0, -1, port);
+}
+
/* scm_c_read
*
* Used by an application to read arbitrary number of bytes from an
diff --git a/libguile/ports.h b/libguile/ports.h
index 8a21b09f9..d427fecb1 100644
--- a/libguile/ports.h
+++ b/libguile/ports.h
@@ -269,6 +269,9 @@ SCM_API SCM scm_read_char (SCM port);
SCM_API size_t scm_c_read (SCM port, void *buffer, size_t size);
SCM_API void scm_c_write (SCM port, const void *buffer, size_t size);
SCM_API void scm_lfwrite (const char *ptr, size_t size, SCM port);
+SCM_INTERNAL void scm_lfwrite_str (SCM str, SCM port);
+SCM_INTERNAL void scm_lfwrite_substr (SCM str, size_t start, size_t end,
+ SCM port);
SCM_API void scm_flush (SCM port);
SCM_API void scm_end_input (SCM port);
SCM_API int scm_fill_input (SCM port);
diff --git a/libguile/print.c b/libguile/print.c
index f43856bbe..6f31fcf4a 100644
--- a/libguile/print.c
+++ b/libguile/print.c
@@ -559,55 +559,113 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
break;
}
break;
- case scm_tc7_string:
- if (SCM_WRITINGP (pstate))
- {
- size_t i, j, len;
- const char *data;
+ case scm_tc7_string:
+ if (SCM_WRITINGP (pstate))
+ {
+ size_t i, j, len;
+ static char const hex[] = "0123456789abcdef";
+ char buf[8];
- scm_putc ('"', port);
- len = scm_i_string_length (exp);
- data = scm_i_string_chars (exp);
- for (i = 0, j = 0; i < len; ++i)
- {
- unsigned char ch = data[i];
- if ((ch < 32 && ch != '\n') || (127 <= ch && ch < 148))
- {
- static char const hex[]="0123456789abcdef";
- char buf[4];
-
- scm_lfwrite (data+j, i-j, port);
- buf[0] = '\\';
- buf[1] = 'x';
- buf[2] = hex [ch / 16];
- buf[3] = hex [ch % 16];
- scm_lfwrite (buf, 4, port);
- data = scm_i_string_chars (exp);
- j = i+1;
- }
- else if (ch == '"' || ch == '\\')
- {
- scm_lfwrite (data+j, i-j, port);
- scm_putc ('\\', port);
- data = scm_i_string_chars (exp);
- j = i;
- }
- }
- scm_lfwrite (data+j, i-j, port);
- scm_putc ('"', port);
- scm_remember_upto_here_1 (exp);
- }
- else
- scm_lfwrite (scm_i_string_chars (exp), scm_i_string_length (exp),
- port);
- scm_remember_upto_here_1 (exp);
- break;
+
+ scm_putc ('"', port);
+ len = scm_i_string_length (exp);
+ for (i = 0; i < len; ++i)
+ {
+ scm_t_wchar ch = scm_i_string_ref (exp, i);
+ int printed = 0;
+
+ if (ch == ' ' || ch == '\n')
+ {
+ scm_putc (ch, port);
+ printed = 1;
+ }
+ else if (ch == '"' || ch == '\\')
+ {
+ scm_putc ('\\', port);
+ scm_charprint (ch, port);
+ printed = 1;
+ }
+ else
+ if (uc_is_general_category_withtable
+ (ch,
+ UC_CATEGORY_MASK_L | UC_CATEGORY_MASK_M |
+ UC_CATEGORY_MASK_N | UC_CATEGORY_MASK_P |
+ UC_CATEGORY_MASK_S))
+ {
+ /* Print the character since it is a graphic
+ character. */
+ scm_t_wchar *wbuf;
+ SCM wstr = scm_i_make_wide_string (1, &wbuf);
+ char *buf;
+ size_t len;
+
+ wbuf[0] = ch;
+
+ buf = u32_conv_to_encoding ("ISO-8859-1",
+ iconveh_error,
+ (scm_t_uint32 *) wbuf,
+ 1, NULL, NULL, &len);
+ if (buf != NULL)
+ {
+ /* Character is graphic and representable in
+ this encoding. Print it. */
+ scm_lfwrite_str (wstr, port);
+ free (buf);
+ printed = 1;
+ }
+ }
+
+ if (!printed)
+ {
+ /* Character is graphic but unrepresentable in
+ this port's encoding or is not graphic. */
+ if (ch <= 0xFF)
+ {
+ buf[0] = '\\';
+ buf[1] = 'x';
+ buf[2] = hex[ch / 16];
+ buf[3] = hex[ch % 16];
+ scm_lfwrite (buf, 4, port);
+ }
+ else if (ch <= 0xFFFF)
+ {
+ buf[0] = '\\';
+ buf[1] = 'u';
+ buf[2] = hex[(ch & 0xF000) >> 12];
+ buf[3] = hex[(ch & 0xF00) >> 8];
+ buf[4] = hex[(ch & 0xF0) >> 4];
+ buf[5] = hex[(ch & 0xF)];
+ scm_lfwrite (buf, 6, port);
+ j = i + 1;
+ }
+ else if (ch > 0xFFFF)
+ {
+ buf[0] = '\\';
+ buf[1] = 'U';
+ buf[2] = hex[(ch & 0xF00000) >> 20];
+ buf[3] = hex[(ch & 0xF0000) >> 16];
+ buf[4] = hex[(ch & 0xF000) >> 12];
+ buf[5] = hex[(ch & 0xF00) >> 8];
+ buf[6] = hex[(ch & 0xF0) >> 4];
+ buf[7] = hex[(ch & 0xF)];
+ scm_lfwrite (buf, 8, port);
+ j = i + 1;
+ }
+ }
+ }
+ scm_putc ('"', port);
+ scm_remember_upto_here_1 (exp);
+ }
+ else
+ scm_lfwrite (scm_i_string_chars (exp), scm_i_string_length (exp),
+ port);
+ scm_remember_upto_here_1 (exp);
+ break;
case scm_tc7_symbol:
if (scm_i_symbol_is_interned (exp))
{
scm_print_symbol_name (scm_i_symbol_chars (exp),
- scm_i_symbol_length (exp),
- port);
+ scm_i_symbol_length (exp), port);
scm_remember_upto_here_1 (exp);
}
else
@@ -763,6 +821,17 @@ scm_prin1 (SCM exp, SCM port, int writingp)
}
}
+/* Print a character.
+ */
+void
+scm_charprint (scm_t_uint32 ch, SCM port)
+{
+ scm_t_wchar *wbuf;
+ SCM wstr = scm_i_make_wide_string (1, &wbuf);
+
+ wbuf[0] = ch;
+ scm_lfwrite_str (wstr, port);
+}
/* Print an integer.
*/
diff --git a/libguile/print.h b/libguile/print.h
index d817a6fc3..1df29522c 100644
--- a/libguile/print.h
+++ b/libguile/print.h
@@ -77,6 +77,7 @@ SCM_API SCM scm_print_options (SCM setting);
SCM_API SCM scm_make_print_state (void);
SCM_API void scm_free_print_state (SCM print_state);
SCM_INTERNAL SCM scm_i_port_with_print_state (SCM port, SCM print_state);
+SCM_API void scm_charprint (scm_t_uint32 c, SCM port);
SCM_API void scm_intprint (scm_t_intmax n, int radix, SCM port);
SCM_API void scm_uintprint (scm_t_uintmax n, int radix, SCM port);
SCM_API void scm_ipruk (char *hdr, SCM ptr, SCM port);
diff --git a/libguile/read.c b/libguile/read.c
index 2140fed25..577a73e58 100644
--- a/libguile/read.c
+++ b/libguile/read.c
@@ -387,110 +387,167 @@ scm_read_string (int chr, SCM port)
object (the string returned). */
SCM str = SCM_BOOL_F;
- char c_str[READER_STRING_BUFFER_SIZE];
unsigned c_str_len = 0;
- int c;
+ scm_t_wchar c;
+ str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
while ('"' != (c = scm_getc (port)))
{
if (c == EOF)
- str_eof: scm_i_input_error (FUNC_NAME, port,
- "end of file in string constant",
- SCM_EOL);
-
- if (c_str_len + 1 >= sizeof (c_str))
- {
- /* Flush the C buffer onto a Scheme string. */
- SCM addy;
+ {
+ str_eof:
+ scm_i_input_error (FUNC_NAME, port,
+ "end of file in string constant", SCM_EOL);
+ }
- if (str == SCM_BOOL_F)
- str = scm_c_make_string (0, SCM_MAKE_CHAR ('X'));
+ if (c_str_len + 1 >= scm_i_string_length (str))
+ {
+ SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
- addy = scm_from_locale_stringn (c_str, c_str_len);
- str = scm_string_append_shared (scm_list_2 (str, addy));
-
- c_str_len = 0;
- }
+ str = scm_string_append (scm_list_2 (str, addy));
+ }
if (c == '\\')
- switch (c = scm_getc (port))
- {
- case EOF:
- goto str_eof;
- case '"':
- case '\\':
- break;
+ {
+ switch (c = scm_getc (port))
+ {
+ case EOF:
+ goto str_eof;
+ case '"':
+ case '\\':
+ break;
#if SCM_ENABLE_ELISP
- case '(':
- case ')':
- if (SCM_ESCAPED_PARENS_P)
- break;
- goto bad_escaped;
+ case '(':
+ case ')':
+ if (SCM_ESCAPED_PARENS_P)
+ break;
+ goto bad_escaped;
#endif
- case '\n':
- continue;
- case '0':
- c = '\0';
- break;
- case 'f':
- c = '\f';
- break;
- case 'n':
- c = '\n';
- break;
- case 'r':
- c = '\r';
- break;
- case 't':
- c = '\t';
- break;
- case 'a':
- c = '\007';
- break;
- case 'v':
- c = '\v';
- break;
- case 'x':
- {
- int a, b;
- a = scm_getc (port);
- if (a == EOF) goto str_eof;
- b = scm_getc (port);
- if (b == EOF) goto str_eof;
- if ('0' <= a && a <= '9') a -= '0';
- else if ('A' <= a && a <= 'F') a = a - 'A' + 10;
- else if ('a' <= a && a <= 'f') a = a - 'a' + 10;
- else goto bad_escaped;
- if ('0' <= b && b <= '9') b -= '0';
- else if ('A' <= b && b <= 'F') b = b - 'A' + 10;
- else if ('a' <= b && b <= 'f') b = b - 'a' + 10;
- else goto bad_escaped;
- c = a * 16 + b;
- break;
- }
- default:
- bad_escaped:
- scm_i_input_error (FUNC_NAME, port,
- "illegal character in escape sequence: ~S",
- scm_list_1 (SCM_MAKE_CHAR (c)));
- }
- c_str[c_str_len++] = c;
+ case '\n':
+ continue;
+ case '0':
+ c = '\0';
+ break;
+ case 'f':
+ c = '\f';
+ break;
+ case 'n':
+ c = '\n';
+ break;
+ case 'r':
+ c = '\r';
+ break;
+ case 't':
+ c = '\t';
+ break;
+ case 'a':
+ c = '\007';
+ break;
+ case 'v':
+ c = '\v';
+ break;
+ case 'x':
+ {
+ scm_t_wchar a, b;
+ a = scm_getc (port);
+ if (a == EOF)
+ goto str_eof;
+ b = scm_getc (port);
+ if (b == EOF)
+ goto str_eof;
+ if ('0' <= a && a <= '9')
+ a -= '0';
+ else if ('A' <= a && a <= 'F')
+ a = a - 'A' + 10;
+ else if ('a' <= a && a <= 'f')
+ a = a - 'a' + 10;
+ else
+ {
+ c = a;
+ goto bad_escaped;
+ }
+ if ('0' <= b && b <= '9')
+ b -= '0';
+ else if ('A' <= b && b <= 'F')
+ b = b - 'A' + 10;
+ else if ('a' <= b && b <= 'f')
+ b = b - 'a' + 10;
+ else
+ {
+ c = b;
+ goto bad_escaped;
+ }
+ c = a * 16 + b;
+ break;
+ }
+ case 'u':
+ {
+ scm_t_wchar a;
+ int i;
+ c = 0;
+ for (i = 0; i < 4; i++)
+ {
+ a = scm_getc (port);
+ if (a == EOF)
+ goto str_eof;
+ if ('0' <= a && a <= '9')
+ a -= '0';
+ else if ('A' <= a && a <= 'F')
+ a = a - 'A' + 10;
+ else if ('a' <= a && a <= 'f')
+ a = a - 'a' + 10;
+ else
+ {
+ c = a;
+ goto bad_escaped;
+ }
+ c = c * 16 + a;
+ }
+ break;
+ }
+ case 'U':
+ {
+ scm_t_wchar a;
+ int i;
+ c = 0;
+ for (i = 0; i < 6; i++)
+ {
+ a = scm_getc (port);
+ if (a == EOF)
+ goto str_eof;
+ if ('0' <= a && a <= '9')
+ a -= '0';
+ else if ('A' <= a && a <= 'F')
+ a = a - 'A' + 10;
+ else if ('a' <= a && a <= 'f')
+ a = a - 'a' + 10;
+ else
+ {
+ c = a;
+ goto bad_escaped;
+ }
+ c = c * 16 + a;
+ }
+ break;
+ }
+ default:
+ bad_escaped:
+ scm_i_input_error (FUNC_NAME, port,
+ "illegal character in escape sequence: ~S",
+ scm_list_1 (SCM_MAKE_CHAR (c)));
+ }
+ }
+ str = scm_i_string_start_writing (str);
+ scm_i_string_set_x (str, c_str_len++, c);
+ scm_i_string_stop_writing ();
}
if (c_str_len > 0)
{
- SCM addy;
-
- addy = scm_from_locale_stringn (c_str, c_str_len);
- if (str == SCM_BOOL_F)
- str = addy;
- else
- str = scm_string_append_shared (scm_list_2 (str, addy));
+ return scm_i_substring_copy (str, 0, c_str_len);
}
- else
- str = (str == SCM_BOOL_F) ? scm_nullstr : str;
-
- return str;
+
+ return scm_nullstr;
}
#undef FUNC_NAME
diff --git a/libguile/rw.c b/libguile/rw.c
index cb62b79b9..a9b4a329a 100644
--- a/libguile/rw.c
+++ b/libguile/rw.c
@@ -131,6 +131,7 @@ SCM_DEFINE (scm_read_string_x_partial, "read-string!/partial", 1, 3, 0,
don't touch the file descriptor. otherwise the
"return immediately if something is available" rule may
be violated. */
+ str = scm_i_string_start_writing (str);
dest = scm_i_string_writable_chars (str) + offset;
chars_read = scm_take_from_input_buffers (port, dest, read_len);
scm_i_string_stop_writing ();
@@ -140,6 +141,7 @@ SCM_DEFINE (scm_read_string_x_partial, "read-string!/partial", 1, 3, 0,
if (chars_read == 0 && read_len > 0) /* don't confuse read_len == 0 with
EOF. */
{
+ str = scm_i_string_start_writing (str);
dest = scm_i_string_writable_chars (str) + offset;
SCM_SYSCALL (chars_read = read (fdes, dest, read_len));
scm_i_string_stop_writing ();
diff --git a/libguile/socket.c b/libguile/socket.c
index 553a1a185..2e02e9082 100644
--- a/libguile/socket.c
+++ b/libguile/socket.c
@@ -1438,6 +1438,7 @@ SCM_DEFINE (scm_recv, "recv!", 2, 1, 0,
fd = SCM_FPORT_FDES (sock);
len = scm_i_string_length (buf);
+ buf = scm_i_string_start_writing (buf);
dest = scm_i_string_writable_chars (buf);
SCM_SYSCALL (rv = recv (fd, dest, len, flg));
scm_i_string_stop_writing ();
@@ -1482,6 +1483,7 @@ SCM_DEFINE (scm_send, "send", 2, 1, 0,
fd = SCM_FPORT_FDES (sock);
len = scm_i_string_length (message);
+ message = scm_i_string_start_writing (message);
src = scm_i_string_writable_chars (message);
SCM_SYSCALL (rv = send (fd, src, len, flg));
scm_i_string_stop_writing ();
@@ -1550,6 +1552,7 @@ SCM_DEFINE (scm_recvfrom, "recvfrom!", 2, 3, 0,
/* recvfrom will not necessarily return an address. usually nothing
is returned for stream sockets. */
+ str = scm_i_string_start_writing (str);
buf = scm_i_string_writable_chars (str);
((struct sockaddr *) &addr)->sa_family = AF_UNSPEC;
SCM_SYSCALL (rv = recvfrom (fd, buf + offset,
diff --git a/libguile/srfi-13.c b/libguile/srfi-13.c
index f3863d355..781fe6893 100644
--- a/libguile/srfi-13.c
+++ b/libguile/srfi-13.c
@@ -549,6 +549,7 @@ SCM_DEFINE (scm_string_copy_x, "string-copy!", 3, 2, 0,
len = cend - cstart;
SCM_ASSERT_RANGE (3, s, len <= scm_i_string_length (target) - ctstart);
+ target = scm_i_string_start_writing (target);
ctarget = scm_i_string_writable_chars (target);
memmove (ctarget + ctstart, cstr + cstart, len);
scm_i_string_stop_writing ();
@@ -985,6 +986,7 @@ SCM_DEFINE (scm_substring_fill_x, "string-fill!", 2, 2, 0,
4, end, cend);
SCM_VALIDATE_CHAR_COPY (2, chr, c);
+ str = scm_i_string_start_writing (str);
cstr = scm_i_string_writable_chars (str);
for (k = cstart; k < cend; k++)
cstr[k] = c;
@@ -2376,6 +2378,7 @@ string_upcase_x (SCM v, size_t start, size_t end)
size_t k;
char *dst;
+ v = scm_i_string_start_writing (v);
dst = scm_i_string_writable_chars (v);
for (k = start; k < end; ++k)
dst[k] = scm_c_upcase (dst[k]);
@@ -2442,6 +2445,7 @@ string_downcase_x (SCM v, size_t start, size_t end)
size_t k;
char *dst;
+ v = scm_i_string_start_writing (v);
dst = scm_i_string_writable_chars (v);
for (k = start; k < end; ++k)
dst[k] = scm_c_downcase (dst[k]);
@@ -2511,6 +2515,7 @@ string_titlecase_x (SCM str, size_t start, size_t end)
size_t i;
int in_word = 0;
+ str = scm_i_string_start_writing (str);
sz = (unsigned char *) scm_i_string_writable_chars (str);
for(i = start; i < end; i++)
{
@@ -2635,6 +2640,7 @@ SCM_DEFINE (scm_string_reverse, "string-reverse", 1, 2, 0,
2, start, cstart,
3, end, cend);
result = scm_string_copy (str);
+ result = scm_i_string_start_writing (result);
ctarget = scm_i_string_writable_chars (result);
string_reverse_x (ctarget, cstart, cend);
scm_i_string_stop_writing ();
@@ -2658,6 +2664,7 @@ SCM_DEFINE (scm_string_reverse_x, "string-reverse!", 1, 2, 0,
2, start, cstart,
3, end, cend);
+ str = scm_i_string_start_writing (str);
cstr = scm_i_string_writable_chars (str);
string_reverse_x (cstr, cstart, cend);
scm_i_string_stop_writing ();
@@ -3018,19 +3025,16 @@ SCM_DEFINE (scm_string_for_each, "string-for-each", 2, 2, 0,
"return value is not specified.")
#define FUNC_NAME s_scm_string_for_each
{
- const char *cstr;
size_t cstart, cend;
scm_t_trampoline_1 proc_tramp = scm_trampoline_1 (proc);
SCM_ASSERT (proc_tramp, proc, SCM_ARG1, FUNC_NAME);
- MY_VALIDATE_SUBSTRING_SPEC_COPY (2, s, cstr,
- 3, start, cstart,
- 4, end, cend);
+ MY_VALIDATE_SUBSTRING_SPEC (2, s,
+ 3, start, cstart,
+ 4, end, cend);
while (cstart < cend)
{
- unsigned int c = (unsigned char) cstr[cstart];
- proc_tramp (proc, SCM_MAKE_CHAR (c));
- cstr = scm_i_string_chars (s);
+ proc_tramp (proc, SCM_MAKE_CHAR (scm_i_string_ref (s, cstart)));
cstart++;
}
@@ -3162,6 +3166,7 @@ SCM_DEFINE (scm_string_xcopy_x, "string-xcopy!", 4, 3, 0,
SCM_ASSERT_RANGE (1, tstart,
ctstart + (csto - csfrom) <= scm_i_string_length (target));
+ target = scm_i_string_start_writing (target);
p = scm_i_string_writable_chars (target) + ctstart;
cs = scm_i_string_chars (s);
while (csfrom < csto)
@@ -3200,8 +3205,8 @@ SCM_DEFINE (scm_string_replace, "string-replace", 2, 4, 0,
MY_VALIDATE_SUBSTRING_SPEC (2, s2,
5, start2, cstart2,
6, end2, cend2);
- result = scm_i_make_string (cstart1 + (cend2 - cstart2) +
- scm_i_string_length (s1) - cend1, &p);
+ result = scm_i_make_string ((cstart1 + cend2 - cstart2
+ + scm_i_string_length (s1) - cend1), &p);
cstr1 = scm_i_string_chars (s1);
cstr2 = scm_i_string_chars (s2);
memmove (p, cstr1, cstart1 * sizeof (char));
diff --git a/libguile/strings.c b/libguile/strings.c
index 4e21f3e28..fc92fd233 100644
--- a/libguile/strings.c
+++ b/libguile/strings.c
@@ -24,6 +24,8 @@
#include <string.h>
#include <stdio.h>
+#include <ctype.h>
+#include <unistr.h>
#include "libguile/_scm.h"
#include "libguile/chars.h"
@@ -69,10 +71,12 @@
#define STRINGBUF_F_SHARED 0x100
#define STRINGBUF_F_INLINE 0x200
+#define STRINGBUF_F_WIDE 0x400
#define STRINGBUF_TAG scm_tc7_stringbuf
#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
#define STRINGBUF_INLINE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_INLINE)
+#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
#define STRINGBUF_OUTLINE_CHARS(buf) ((char *)SCM_CELL_WORD_1(buf))
#define STRINGBUF_OUTLINE_LENGTH(buf) (SCM_CELL_WORD_2(buf))
@@ -82,6 +86,7 @@
#define STRINGBUF_CHARS(buf) (STRINGBUF_INLINE (buf) \
? STRINGBUF_INLINE_CHARS (buf) \
: STRINGBUF_OUTLINE_CHARS (buf))
+#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *)SCM_CELL_WORD_1(buf))
#define STRINGBUF_LENGTH(buf) (STRINGBUF_INLINE (buf) \
? STRINGBUF_INLINE_LENGTH (buf) \
: STRINGBUF_OUTLINE_LENGTH (buf))
@@ -126,6 +131,23 @@ make_stringbuf (size_t len)
}
}
+static SCM
+make_wide_stringbuf (size_t len)
+{
+ scm_t_wchar *mem;
+#if SCM_DEBUG
+ if (len < 1000)
+ lenhist[len]++;
+ else
+ lenhist[1000]++;
+#endif
+
+ mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
+ mem[len] = 0;
+ return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_WIDE, (scm_t_bits) mem,
+ (scm_t_bits) len, (scm_t_bits) 0);
+}
+
/* Return a new stringbuf whose underlying storage consists of the LEN+1
octets pointed to by STR (the last octet is zero). */
SCM
@@ -147,8 +169,58 @@ void
scm_i_stringbuf_free (SCM buf)
{
if (!STRINGBUF_INLINE (buf))
- scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf),
- STRINGBUF_OUTLINE_LENGTH (buf) + 1, "string");
+ {
+ if (!STRINGBUF_WIDE (buf))
+ scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf),
+ STRINGBUF_OUTLINE_LENGTH (buf) + 1, "string");
+ else
+ scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf),
+ sizeof (scm_t_wchar) * (STRINGBUF_OUTLINE_LENGTH (buf)
+ + 1), "string");
+ }
+
+}
+
+static void
+widen_stringbuf (SCM buf)
+{
+ size_t i, len;
+ scm_t_wchar *mem;
+
+ if (STRINGBUF_WIDE (buf))
+ return;
+
+ if (STRINGBUF_INLINE (buf))
+ {
+ len = STRINGBUF_INLINE_LENGTH (buf);
+
+ mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
+ for (i = 0; i < len; i++)
+ mem[i] =
+ (scm_t_wchar) (unsigned char) STRINGBUF_INLINE_CHARS (buf)[i];
+ mem[len] = 0;
+
+ SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) ^ STRINGBUF_F_INLINE);
+ SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
+ SCM_SET_CELL_WORD_1 (buf, mem);
+ SCM_SET_CELL_WORD_2 (buf, len);
+ }
+ else
+ {
+ len = STRINGBUF_OUTLINE_LENGTH (buf);
+
+ mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
+ for (i = 0; i < len; i++)
+ mem[i] =
+ (scm_t_wchar) (unsigned char) STRINGBUF_OUTLINE_CHARS (buf)[i];
+ mem[len] = 0;
+
+ scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf), len + 1, "string");
+
+ SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
+ SCM_SET_CELL_WORD_1 (buf, mem);
+ SCM_SET_CELL_WORD_2 (buf, len);
+ }
}
scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
@@ -195,6 +267,18 @@ scm_i_make_string (size_t len, char **charsp)
return res;
}
+SCM
+scm_i_make_wide_string (size_t len, scm_t_wchar ** charsp)
+{
+ SCM buf = make_wide_stringbuf (len);
+ SCM res;
+ if (charsp)
+ *charsp = STRINGBUF_WIDE_CHARS (buf);
+ res = scm_double_cell (STRING_TAG, SCM_UNPACK (buf),
+ (scm_t_bits) 0, (scm_t_bits) len);
+ return res;
+}
+
static void
validate_substring_args (SCM str, size_t start, size_t end)
{
@@ -253,12 +337,24 @@ scm_i_substring_copy (SCM str, size_t start, size_t end)
SCM buf, my_buf;
size_t str_start;
get_str_buf_start (&str, &buf, &str_start);
- my_buf = make_stringbuf (len);
- memcpy (STRINGBUF_CHARS (my_buf),
- STRINGBUF_CHARS (buf) + str_start + start, len);
+ if (scm_i_is_narrow_string (str))
+ {
+ my_buf = make_stringbuf (len);
+ memcpy (STRINGBUF_CHARS (my_buf),
+ STRINGBUF_CHARS (buf) + str_start + start, len);
+ }
+ else
+ {
+ my_buf = make_wide_stringbuf (len);
+ u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
+ (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
+ + start), len);
+ /* Even though this string is wide, the substring may be narrow.
+ Consider adding code to narrow string. */
+ }
scm_remember_upto_here_1 (buf);
- return scm_double_cell (STRING_TAG, SCM_UNPACK(my_buf),
- (scm_t_bits)0, (scm_t_bits) len);
+ return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
+ (scm_t_bits) 0, (scm_t_bits) len);
}
SCM
@@ -330,17 +426,45 @@ scm_i_string_length (SCM str)
return STRING_LENGTH (str);
}
+int
+scm_i_is_narrow_string (SCM str)
+{
+ return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
+}
+
const char *
scm_i_string_chars (SCM str)
{
SCM buf;
size_t start;
get_str_buf_start (&str, &buf, &start);
- return STRINGBUF_CHARS (buf) + start;
+ if (scm_i_is_narrow_string (str))
+ return STRINGBUF_CHARS (buf) + start;
+ else
+ scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
+ scm_list_1 (str));
+ return NULL;
}
-char *
-scm_i_string_writable_chars (SCM orig_str)
+const scm_t_wchar *
+scm_i_string_wide_chars (SCM str)
+{
+ SCM buf;
+ size_t start;
+
+ get_str_buf_start (&str, &buf, &start);
+ if (!scm_i_is_narrow_string (str))
+ return STRINGBUF_WIDE_CHARS (buf) + start;
+ else
+ scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
+ scm_list_1 (str));
+}
+
+/* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
+ a new string buffer, so that it can be modified without modifying
+ other strings. */
+SCM
+scm_i_string_start_writing (SCM orig_str)
{
SCM buf, str = orig_str;
size_t start;
@@ -352,18 +476,26 @@ scm_i_string_writable_chars (SCM orig_str)
scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
if (STRINGBUF_SHARED (buf))
{
- /* Clone stringbuf. For this, we put all threads to sleep.
- */
-
+ /* Clone the stringbuf. */
size_t len = STRING_LENGTH (str);
SCM new_buf;
scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
- new_buf = make_stringbuf (len);
- memcpy (STRINGBUF_CHARS (new_buf),
- STRINGBUF_CHARS (buf) + STRING_START (str), len);
-
+ if (scm_i_is_narrow_string (str))
+ {
+ new_buf = make_stringbuf (len);
+ memcpy (STRINGBUF_CHARS (new_buf),
+ STRINGBUF_CHARS (buf) + STRING_START (str), len);
+
+ }
+ else
+ {
+ new_buf = make_wide_stringbuf (len);
+ u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
+ (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
+ + STRING_START (str)), len);
+ }
scm_i_thread_put_to_sleep ();
SET_STRING_STRINGBUF (str, new_buf);
start -= STRING_START (str);
@@ -374,8 +506,39 @@ scm_i_string_writable_chars (SCM orig_str)
scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
}
+ return orig_str;
+}
+
+/* Return a pointer to the chars of a string that fits in a Latin-1
+ encoding. */
+char *
+scm_i_string_writable_chars (SCM str)
+{
+ SCM buf;
+ size_t start;
- return STRINGBUF_CHARS (buf) + start;
+ get_str_buf_start (&str, &buf, &start);
+ if (scm_i_is_narrow_string (str))
+ return STRINGBUF_CHARS (buf) + start;
+ else
+ scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
+ scm_list_1 (str));
+ return NULL;
+}
+
+/* Return a pointer to the Unicode codepoints of a string. */
+static scm_t_wchar *
+scm_i_string_writable_wide_chars (SCM str)
+{
+ SCM buf;
+ size_t start;
+
+ get_str_buf_start (&str, &buf, &start);
+ if (!scm_i_is_narrow_string (str))
+ return STRINGBUF_WIDE_CHARS (buf) + start;
+ else
+ scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
+ scm_list_1 (str));
}
void
@@ -384,6 +547,34 @@ scm_i_string_stop_writing (void)
scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
}
+/* Return the Xth character is C. */
+scm_t_wchar
+scm_i_string_ref (SCM str, size_t x)
+{
+ if (scm_i_is_narrow_string (str))
+ return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
+ else
+ return scm_i_string_wide_chars (str)[x];
+}
+
+void
+scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
+{
+ if (chr > 0xFF && scm_i_is_narrow_string (str))
+ widen_stringbuf (STRING_STRINGBUF (str));
+
+ if (scm_i_is_narrow_string (str))
+ {
+ char *dst = scm_i_string_writable_chars (str);
+ dst[p] = (char) (unsigned char) chr;
+ }
+ else
+ {
+ scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
+ dst[p] = chr;
+ }
+}
+
/* Symbols.
Basic symbol creation and accessing is done here, the rest is in
@@ -418,10 +609,21 @@ scm_i_make_symbol (SCM name, scm_t_bits flags,
else
{
/* make new buf. */
- SCM new_buf = make_stringbuf (length);
- memcpy (STRINGBUF_CHARS (new_buf),
- STRINGBUF_CHARS (buf) + start, length);
- buf = new_buf;
+ if (scm_i_is_narrow_string (name))
+ {
+ SCM new_buf = make_stringbuf (length);
+ memcpy (STRINGBUF_CHARS (new_buf),
+ STRINGBUF_CHARS (buf) + start, length);
+ buf = new_buf;
+ }
+ else
+ {
+ SCM new_buf = make_wide_stringbuf (length);
+ u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
+ (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
+ length);
+ buf = new_buf;
+ }
}
return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
(scm_t_bits) hash, SCM_UNPACK (props));
@@ -466,11 +668,40 @@ scm_c_symbol_length (SCM sym)
}
#undef FUNC_NAME
+int
+scm_i_is_narrow_symbol (SCM sym)
+{
+ SCM buf;
+
+ buf = SYMBOL_STRINGBUF (sym);
+ return !STRINGBUF_WIDE (buf);
+}
+
const char *
scm_i_symbol_chars (SCM sym)
{
- SCM buf = SYMBOL_STRINGBUF (sym);
- return STRINGBUF_CHARS (buf);
+ SCM buf;
+
+ buf = SYMBOL_STRINGBUF (sym);
+ if (!STRINGBUF_WIDE (buf))
+ return STRINGBUF_CHARS (buf);
+ else
+ scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
+ scm_list_1 (sym));
+}
+
+/* Return a pointer to the Unicode codepoints of a symbol's name. */
+const scm_t_wchar *
+scm_i_symbol_wide_chars (SCM sym)
+{
+ SCM buf;
+
+ buf = SYMBOL_STRINGBUF (sym);
+ if (STRINGBUF_WIDE (buf))
+ return STRINGBUF_WIDE_CHARS (buf);
+ else
+ scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
+ scm_list_1 (sym));
}
SCM
@@ -496,6 +727,15 @@ scm_i_symbol_substring (SCM sym, size_t start, size_t end)
(scm_t_bits)start, (scm_t_bits) end - start);
}
+scm_t_wchar
+scm_i_symbol_ref (SCM sym, size_t x)
+{
+ if (scm_i_is_narrow_symbol (sym))
+ return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
+ else
+ return scm_i_symbol_wide_chars (sym)[x];
+}
+
/* Debugging
*/
@@ -505,15 +745,17 @@ SCM scm_sys_string_dump (SCM);
SCM scm_sys_symbol_dump (SCM);
SCM scm_sys_stringbuf_hist (void);
-SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0,
- (SCM str),
- "")
+SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str), "")
#define FUNC_NAME s_scm_sys_string_dump
{
SCM_VALIDATE_STRING (1, str);
fprintf (stderr, "%p:\n", str);
fprintf (stderr, " start: %u\n", STRING_START (str));
fprintf (stderr, " len: %u\n", STRING_LENGTH (str));
+ if (scm_i_is_narrow_string (str))
+ fprintf (stderr, " format: narrow\n");
+ else
+ fprintf (stderr, " format: wide\n");
if (IS_SH_STRING (str))
{
fprintf (stderr, " string: %p\n", SH_STRING_STRING (str));
@@ -524,36 +766,54 @@ SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0,
{
SCM buf = STRING_STRINGBUF (str);
fprintf (stderr, " buf: %p\n", buf);
- fprintf (stderr, " chars: %p\n", STRINGBUF_CHARS (buf));
+ if (scm_i_is_narrow_string (str))
+ fprintf (stderr, " chars: %p\n", STRINGBUF_CHARS (buf));
+ else
+ fprintf (stderr, " chars: %p\n", STRINGBUF_WIDE_CHARS (buf));
fprintf (stderr, " length: %u\n", STRINGBUF_LENGTH (buf));
- fprintf (stderr, " flags: %x\n", (SCM_CELL_WORD_0 (buf) & 0x300));
+ if (STRINGBUF_SHARED (buf))
+ fprintf (stderr, " shared: true\n");
+ else
+ fprintf (stderr, " shared: false\n");
+ if (STRINGBUF_INLINE (buf))
+ fprintf (stderr, " inline: true\n");
+ else
+ fprintf (stderr, " inline: false\n");
+
}
return SCM_UNSPECIFIED;
}
#undef FUNC_NAME
-SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0,
- (SCM sym),
- "")
+SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym), "")
#define FUNC_NAME s_scm_sys_symbol_dump
{
SCM_VALIDATE_SYMBOL (1, sym);
fprintf (stderr, "%p:\n", sym);
fprintf (stderr, " hash: %lu\n", scm_i_symbol_hash (sym));
+ if (scm_i_is_narrow_symbol (sym))
+ fprintf (stderr, " format: narrow\n");
+ else
+ fprintf (stderr, " format: wide\n");
{
SCM buf = SYMBOL_STRINGBUF (sym);
fprintf (stderr, " buf: %p\n", buf);
- fprintf (stderr, " chars: %p\n", STRINGBUF_CHARS (buf));
+ if (scm_i_is_narrow_symbol (sym))
+ fprintf (stderr, " chars: %p\n", STRINGBUF_CHARS (buf));
+ else
+ fprintf (stderr, " chars: %p\n", STRINGBUF_WIDE_CHARS (buf));
fprintf (stderr, " length: %u\n", STRINGBUF_LENGTH (buf));
- fprintf (stderr, " shared: %u\n", STRINGBUF_SHARED (buf));
+ if (STRINGBUF_SHARED (buf))
+ fprintf (stderr, " shared: true\n");
+ else
+ fprintf (stderr, " shared: false\n");
+
}
return SCM_UNSPECIFIED;
}
#undef FUNC_NAME
-SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0,
- (void),
- "")
+SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
#define FUNC_NAME s_scm_sys_stringbuf_hist
{
int i;
@@ -589,29 +849,46 @@ SCM_DEFINE (scm_string, "string", 0, 0, 1,
#define FUNC_NAME s_scm_string
{
SCM result;
+ SCM rest;
size_t len;
- char *data;
-
- {
- long i = scm_ilength (chrs);
+ size_t p = 0;
+ long i;
- SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
- len = i;
- }
+ /* Verify that this is a list of chars. */
+ i = scm_ilength (chrs);
+ len = (size_t) i;
+ rest = chrs;
- result = scm_i_make_string (len, &data);
- while (len > 0 && scm_is_pair (chrs))
+ SCM_ASSERT (len >= 0, chrs, SCM_ARG1, FUNC_NAME);
+ while (len > 0 && scm_is_pair (rest))
{
- SCM elt = SCM_CAR (chrs);
-
+ SCM elt = SCM_CAR (rest);
SCM_VALIDATE_CHAR (SCM_ARGn, elt);
- *data++ = SCM_CHAR (elt);
- chrs = SCM_CDR (chrs);
+ rest = SCM_CDR (rest);
+ len--;
+ scm_remember_upto_here_1 (elt);
+ }
+
+ /* Construct a string containing this list of chars. */
+ len = (size_t) i;
+ rest = chrs;
+
+ result = scm_i_make_string (len, NULL);
+ result = scm_i_string_start_writing (result);
+ while (len > 0 && scm_is_pair (rest))
+ {
+ SCM elt = SCM_CAR (rest);
+ scm_i_string_set_x (result, p, SCM_CHAR (elt));
+ p++;
+ rest = SCM_CDR (rest);
len--;
+ scm_remember_upto_here_1 (elt);
}
+ scm_i_string_stop_writing ();
+
if (len > 0)
scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
- if (!scm_is_null (chrs))
+ if (!scm_is_null (rest))
scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
return result;
@@ -634,13 +911,16 @@ SCM
scm_c_make_string (size_t len, SCM chr)
#define FUNC_NAME NULL
{
- char *dst;
- SCM res = scm_i_make_string (len, &dst);
+ size_t p;
+ SCM res = scm_i_make_string (len, NULL);
if (!SCM_UNBNDP (chr))
{
SCM_VALIDATE_CHAR (0, chr);
- memset (dst, SCM_CHAR (chr), len);
+ res = scm_i_string_start_writing (res);
+ for (p = 0; p < len; p++)
+ scm_i_string_set_x (res, p, SCM_CHAR (chr));
+ scm_i_string_stop_writing ();
}
return res;
@@ -657,6 +937,20 @@ SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
}
#undef FUNC_NAME
+SCM_DEFINE (scm_string_width, "string-width", 1, 0, 0,
+ (SCM string),
+ "Return the bytes used to represent a character in @var{string}."
+ "This will return 1 or 4.")
+#define FUNC_NAME s_scm_string_width
+{
+ SCM_VALIDATE_STRING (1, string);
+ if (!scm_i_is_narrow_string (string))
+ return scm_from_int (4);
+
+ return scm_from_int (1);
+}
+#undef FUNC_NAME
+
size_t
scm_c_string_length (SCM string)
{
@@ -667,8 +961,8 @@ scm_c_string_length (SCM string)
SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
(SCM str, SCM k),
- "Return character @var{k} of @var{str} using zero-origin\n"
- "indexing. @var{k} must be a valid index of @var{str}.")
+ "Return character @var{k} of @var{str} using zero-origin\n"
+ "indexing. @var{k} must be a valid index of @var{str}.")
#define FUNC_NAME s_scm_string_ref
{
size_t len;
@@ -682,7 +976,10 @@ SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
else
scm_out_of_range (NULL, k);
- return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
+ if (scm_i_is_narrow_string (str))
+ return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
+ else
+ return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
}
#undef FUNC_NAME
@@ -691,14 +988,18 @@ scm_c_string_ref (SCM str, size_t p)
{
if (p >= scm_i_string_length (str))
scm_out_of_range (NULL, scm_from_size_t (p));
- return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
+ if (scm_i_is_narrow_string (str))
+ return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
+ else
+ return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
+
}
SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
(SCM str, SCM k, SCM chr),
- "Store @var{chr} in element @var{k} of @var{str} and return\n"
- "an unspecified value. @var{k} must be a valid index of\n"
- "@var{str}.")
+ "Store @var{chr} in element @var{k} of @var{str} and return\n"
+ "an unspecified value. @var{k} must be a valid index of\n"
+ "@var{str}.")
#define FUNC_NAME s_scm_string_set_x
{
size_t len;
@@ -713,11 +1014,10 @@ SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
scm_out_of_range (NULL, k);
SCM_VALIDATE_CHAR (3, chr);
- {
- char *dst = scm_i_string_writable_chars (str);
- dst[idx] = SCM_CHAR (chr);
- scm_i_string_stop_writing ();
- }
+ str = scm_i_string_start_writing (str);
+ scm_i_string_set_x (str, idx, SCM_CHAR (chr));
+ scm_i_string_stop_writing ();
+
return SCM_UNSPECIFIED;
}
#undef FUNC_NAME
@@ -727,11 +1027,9 @@ scm_c_string_set_x (SCM str, size_t p, SCM chr)
{
if (p >= scm_i_string_length (str))
scm_out_of_range (NULL, scm_from_size_t (p));
- {
- char *dst = scm_i_string_writable_chars (str);
- dst[p] = SCM_CHAR (chr);
- scm_i_string_stop_writing ();
- }
+ str = scm_i_string_start_writing (str);
+ scm_i_string_set_x (str, p, SCM_CHAR (chr));
+ scm_i_string_stop_writing ();
}
SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
@@ -832,31 +1130,55 @@ SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
(SCM args),
- "Return a newly allocated string whose characters form the\n"
+ "Return a newly allocated string whose characters form the\n"
"concatenation of the given strings, @var{args}.")
#define FUNC_NAME s_scm_string_append
{
SCM res;
- size_t i = 0;
+ size_t len = 0;
+ int wide = 0;
SCM l, s;
char *data;
+ scm_t_wchar *wdata;
+ int i;
SCM_VALIDATE_REST_ARGUMENT (args);
- for (l = args; !scm_is_null (l); l = SCM_CDR (l))
+ for (l = args; !scm_is_null (l); l = SCM_CDR (l))
{
s = SCM_CAR (l);
SCM_VALIDATE_STRING (SCM_ARGn, s);
- i += scm_i_string_length (s);
+ len += scm_i_string_length (s);
+ if (!scm_i_is_narrow_string (s))
+ wide = 1;
}
- res = scm_i_make_string (i, &data);
- for (l = args; !scm_is_null (l); l = SCM_CDR (l))
+ if (!wide)
+ res = scm_i_make_string (len, &data);
+ else
+ res = scm_i_make_wide_string (len, &wdata);
+
+ for (l = args; !scm_is_null (l); l = SCM_CDR (l))
{
size_t len;
s = SCM_CAR (l);
SCM_VALIDATE_STRING (SCM_ARGn, s);
len = scm_i_string_length (s);
- memcpy (data, scm_i_string_chars (s), len);
- data += len;
+ if (!wide)
+ {
+ memcpy (data, scm_i_string_chars (s), len);
+ data += len;
+ }
+ else
+ {
+ if (scm_i_is_narrow_string (s))
+ {
+ for (i = 0; i < scm_i_string_length (s); i++)
+ wdata[i] = (unsigned char) scm_i_string_chars (s)[i];
+ }
+ else
+ u32_cpy ((scm_t_uint32 *) wdata,
+ (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
+ wdata += len;
+ }
scm_remember_upto_here_1 (s);
}
return res;
@@ -875,8 +1197,11 @@ scm_from_locale_stringn (const char *str, size_t len)
SCM res;
char *dst;
- if (len == (size_t)-1)
+ if (len == (size_t) -1)
len = strlen (str);
+ if (len == 0)
+ return scm_nullstr;
+
res = scm_i_make_string (len, &dst);
memcpy (dst, str, len);
return res;
@@ -885,6 +1210,9 @@ scm_from_locale_stringn (const char *str, size_t len)
SCM
scm_from_locale_string (const char *str)
{
+ if (str == NULL)
+ return scm_nullstr;
+
return scm_from_locale_stringn (str, -1);
}
@@ -893,21 +1221,20 @@ scm_take_locale_stringn (char *str, size_t len)
{
SCM buf, res;
- if (len == (size_t)-1)
+ if (len == (size_t) -1)
len = strlen (str);
else
{
/* Ensure STR is null terminated. A realloc for 1 extra byte should
often be satisfied from the alignment padding after the block, with
no actual data movement. */
- str = scm_realloc (str, len+1);
+ str = scm_realloc (str, len + 1);
str[len] = '\0';
}
buf = scm_i_take_stringbufn (str, len);
res = scm_double_cell (STRING_TAG,
- SCM_UNPACK (buf),
- (scm_t_bits) 0, (scm_t_bits) len);
+ SCM_UNPACK (buf), (scm_t_bits) 0, (scm_t_bits) len);
return res;
}
@@ -917,33 +1244,143 @@ scm_take_locale_string (char *str)
return scm_take_locale_stringn (str, -1);
}
+/* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXX \uXXXX
+ and \UXXXXXX. */
+static void
+unistring_escapes_to_guile_escapes (char **bufp, size_t *lenp)
+{
+ char *before, *after;
+ size_t i, j;
+
+ before = *bufp;
+ after = *bufp;
+ i = 0;
+ j = 0;
+ while (i < *lenp)
+ {
+ if ((i <= *lenp - 6)
+ && before[i] == '\\'
+ && before[i + 1] == 'u'
+ && before[i + 2] == '0' && before[i + 3] == '0')
+ {
+ /* Convert \u00NN to \xNN */
+ after[j] = '\\';
+ after[j + 1] = 'x';
+ after[j + 2] = tolower (before[i + 4]);
+ after[j + 3] = tolower (before[i + 5]);
+ i += 6;
+ j += 4;
+ }
+ else if ((i <= *lenp - 10)
+ && before[i] == '\\'
+ && before[i + 1] == 'U'
+ && before[i + 2] == '0' && before[i + 3] == '0')
+ {
+ /* Convert \U00NNNNNN to \UNNNNNN */
+ after[j] = '\\';
+ after[j + 1] = 'U';
+ after[j + 2] = tolower (before[i + 4]);
+ after[j + 3] = tolower (before[i + 5]);
+ after[j + 4] = tolower (before[i + 6]);
+ after[j + 5] = tolower (before[i + 7]);
+ after[j + 6] = tolower (before[i + 8]);
+ after[j + 7] = tolower (before[i + 9]);
+ i += 10;
+ j += 8;
+ }
+ else
+ {
+ after[j] = before[i];
+ i++;
+ j++;
+ }
+ }
+ *lenp = j;
+ after = scm_realloc (after, j);
+}
+
char *
-scm_to_locale_stringn (SCM str, size_t *lenp)
+scm_to_locale_stringn (SCM str, size_t * lenp)
{
- char *res;
- size_t len;
+ const char *enc;
+
+ /* In the future, enc will hold the port's encoding. */
+ enc = NULL;
+
+ return scm_to_stringn (str, lenp, enc, iconveh_escape_sequence);
+}
+
+/* Low-level scheme to C string conversion function. */
+char *
+scm_to_stringn (SCM str, size_t * lenp, const char *encoding,
+ enum iconv_ilseq_handler handler)
+{
+ static const char iso[11] = "ISO-8859-1";
+ char *buf;
+ size_t ilen, len, i;
if (!scm_is_string (str))
scm_wrong_type_arg_msg (NULL, 0, str, "string");
- len = scm_i_string_length (str);
- res = scm_malloc (len + ((lenp==NULL)? 1 : 0));
- memcpy (res, scm_i_string_chars (str), len);
+ ilen = scm_i_string_length (str);
+
+ if (ilen == 0)
+ {
+ buf = scm_malloc (1);
+ buf[0] = '\0';
+ if (lenp)
+ *lenp = 0;
+ return buf;
+ }
+
if (lenp == NULL)
+ for (i = 0; i < ilen; i++)
+ if (scm_i_string_ref (str, i) == '\0')
+ scm_misc_error (NULL,
+ "string contains #\\nul character: ~S",
+ scm_list_1 (str));
+
+ if (scm_i_is_narrow_string (str))
{
- res[len] = '\0';
- if (strlen (res) != len)
- {
- free (res);
- scm_misc_error (NULL,
- "string contains #\\nul character: ~S",
- scm_list_1 (str));
- }
+ if (lenp)
+ {
+ buf = scm_malloc (ilen);
+ memcpy (buf, scm_i_string_chars (str), ilen);
+ *lenp = ilen;
+ return buf;
+ }
+ else
+ {
+ buf = scm_malloc (ilen + 1);
+ memcpy (buf, scm_i_string_chars (str), ilen);
+ buf[ilen] = '\0';
+ return buf;
+ }
}
- else
+
+
+ buf = NULL;
+ len = 0;
+ buf = u32_conv_to_encoding (iso,
+ handler,
+ (scm_t_uint32 *) scm_i_string_wide_chars (str),
+ ilen, NULL, NULL, &len);
+ if (buf == NULL)
+ scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
+ scm_list_2 (scm_from_locale_string (iso), str));
+
+ if (handler == iconveh_escape_sequence)
+ unistring_escapes_to_guile_escapes (&buf, &len);
+
+ if (lenp)
*lenp = len;
+ else
+ {
+ buf = scm_realloc (buf, len + 1);
+ buf[len] = '\0';
+ }
scm_remember_upto_here_1 (str);
- return res;
+ return buf;
}
char *
@@ -956,18 +1393,21 @@ size_t
scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
{
size_t len;
-
+ char *result = NULL;
if (!scm_is_string (str))
scm_wrong_type_arg_msg (NULL, 0, str, "string");
- len = scm_i_string_length (str);
- memcpy (buf, scm_i_string_chars (str), (len > max_len)? max_len : len);
+ result = scm_to_locale_stringn (str, &len);
+
+ memcpy (buf, result, (len > max_len) ? max_len : len);
+ free (result);
+
scm_remember_upto_here_1 (str);
return len;
}
/* converts C scm_array of strings to SCM scm_list of strings. */
/* If argc < 0, a null terminated scm_array is assumed. */
-SCM
+SCM
scm_makfromstrs (int argc, char **argv)
{
int i = argc;
@@ -1081,6 +1521,7 @@ scm_i_deprecated_string_chars (SCM str)
/* The following is still wrong, of course...
*/
+ str = scm_i_string_start_writing (str);
chars = scm_i_string_writable_chars (str);
scm_i_string_stop_writing ();
return chars;
diff --git a/libguile/strings.h b/libguile/strings.h
index 9e028d82e..5c09d587a 100644
--- a/libguile/strings.h
+++ b/libguile/strings.h
@@ -23,6 +23,7 @@
+#include <uniconv.h>
#include "libguile/__scm.h"
@@ -46,26 +47,37 @@
Internal, low level interface to the character arrays
- - Use scm_i_string_chars to get a pointer to the byte array of a
- string for reading. Use scm_i_string_length to get the number of
- bytes in that array. The array is not null-terminated.
+ - Use scm_is_narrow_string to determine is the string is narrow or
+ wide.
+
+ - Use scm_i_string_chars or scm_i_string_wide_chars to get a
+ pointer to the byte or scm_t_wchar array of a string for reading.
+ Use scm_i_string_length to get the number of characters in that
+ array. The array is not null-terminated.
- The array is valid as long as the corresponding SCM object is
protected but only until the next SCM_TICK. During such a 'safe
point', strings might change their representation.
- - Use scm_i_string_writable_chars to get the same pointer as with
- scm_i_string_chars, but for reading and writing. This is a
- potentially costly operation since it implements the
- copy-on-write behavior. When done with the writing, call
- scm_i_string_stop_writing. You must do this before the next
- SCM_TICK. (This means, before calling almost any other scm_
- function and you can't allow throws, of course.)
-
- - New strings can be created with scm_i_make_string. This gives
- access to a writable pointer that remains valid as long as nobody
- else makes a copy-on-write substring of the string. Do not call
- scm_i_string_stop_writing for this pointer.
+ - Use scm_i_string_start_writing to get a version of the string
+ ready for reading and writing. This is a potentially costly
+ operation since it implements the copy-on-write behavior. When
+ done with the writing, call scm_i_string_stop_writing. You must
+ do this before the next SCM_TICK. (This means, before calling
+ almost any other scm_ function and you can't allow throws, of
+ course.)
+
+ - New strings can be created with scm_i_make_string or
+ scm_i_make_wide_string. This gives access to a writable pointer
+ that remains valid as long as nobody else makes a copy-on-write
+ substring of the string. Do not call scm_i_string_stop_writing
+ for this pointer.
+
+ - Alternately, scm_i_string_ref and scm_i_string_set_x can be used
+ to read and write strings without worrying about whether the
+ string is narrow or wide. scm_i_string_set_x still needs to be
+ bracketed by scm_i_string_start_writing and
+ scm_i_string_stop_writing.
Legacy interface
@@ -74,13 +86,15 @@
- SCM_STRING_CHARS uses scm_i_string_writable_chars and immediately
calls scm_i_stop_writing, hoping for the best. SCM_STRING_LENGTH
is the same as scm_i_string_length. SCM_STRING_CHARS will throw
- an error for for strings that are not null-terminated.
+ an error for for strings that are not null-terminated. There is
+ no wide version of this interface.
*/
SCM_API SCM scm_string_p (SCM x);
SCM_API SCM scm_string (SCM chrs);
SCM_API SCM scm_make_string (SCM k, SCM chr);
SCM_API SCM scm_string_length (SCM str);
+SCM_API SCM scm_string_width (SCM str);
SCM_API SCM scm_string_ref (SCM str, SCM k);
SCM_API SCM scm_string_set_x (SCM str, SCM k, SCM chr);
SCM_API SCM scm_substring (SCM str, SCM start, SCM end);
@@ -106,6 +120,9 @@ SCM_API SCM scm_take_locale_string (char *str);
SCM_API SCM scm_take_locale_stringn (char *str, size_t len);
SCM_API char *scm_to_locale_string (SCM str);
SCM_API char *scm_to_locale_stringn (SCM str, size_t *lenp);
+SCM_INTERNAL char *scm_to_stringn (SCM str, size_t *lenp,
+ const char *encoding,
+ enum iconv_ilseq_handler handler);
SCM_API size_t scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len);
SCM_API SCM scm_makfromstrs (int argc, char **argv);
@@ -113,15 +130,20 @@ SCM_API SCM scm_makfromstrs (int argc, char **argv);
/* internal accessor functions. Arguments must be valid. */
SCM_INTERNAL SCM scm_i_make_string (size_t len, char **datap);
+SCM_INTERNAL SCM scm_i_make_wide_string (size_t len, scm_t_wchar **datap);
SCM_INTERNAL SCM scm_i_substring (SCM str, size_t start, size_t end);
SCM_INTERNAL SCM scm_i_substring_read_only (SCM str, size_t start, size_t end);
SCM_INTERNAL SCM scm_i_substring_shared (SCM str, size_t start, size_t end);
SCM_INTERNAL SCM scm_i_substring_copy (SCM str, size_t start, size_t end);
SCM_INTERNAL size_t scm_i_string_length (SCM str);
SCM_API /* FIXME: not internal */ const char *scm_i_string_chars (SCM str);
+SCM_API const scm_t_wchar *scm_i_string_wide_chars (SCM str);
SCM_API /* FIXME: not internal */ char *scm_i_string_writable_chars (SCM str);
+SCM_INTERNAL SCM scm_i_string_start_writing (SCM str);
SCM_INTERNAL void scm_i_string_stop_writing (void);
-
+SCM_INTERNAL int scm_i_is_narrow_string (SCM str);
+SCM_INTERNAL scm_t_wchar scm_i_string_ref (SCM str, size_t x);
+SCM_INTERNAL void scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr);
/* internal functions related to symbols. */
SCM_INTERNAL SCM scm_i_make_symbol (SCM name, scm_t_bits flags,
@@ -133,8 +155,11 @@ SCM_INTERNAL SCM
scm_i_c_take_symbol (char *name, size_t len,
scm_t_bits flags, unsigned long hash, SCM props);
SCM_INTERNAL const char *scm_i_symbol_chars (SCM sym);
+SCM_INTERNAL const scm_t_wchar *scm_i_symbol_wide_chars (SCM sym);
SCM_INTERNAL size_t scm_i_symbol_length (SCM sym);
+SCM_INTERNAL int scm_i_is_narrow_symbol (SCM str);
SCM_INTERNAL SCM scm_i_symbol_substring (SCM sym, size_t start, size_t end);
+SCM_INTERNAL scm_t_wchar scm_i_symbol_ref (SCM sym, size_t x);
/* internal GC functions. */
diff --git a/libguile/vm-engine.h b/libguile/vm-engine.h
index c0f772fb8..240969c37 100644
--- a/libguile/vm-engine.h
+++ b/libguile/vm-engine.h
@@ -336,6 +336,7 @@ do { \
#define FETCH() (*ip++)
#define FETCH_LENGTH(len) do { len=*ip++; len<<=8; len+=*ip++; len<<=8; len+=*ip++; } while (0)
+#define FETCH_WIDTH(width) do { width=*ip++; } while (0)
#undef CLOCK
#if VM_USE_CLOCK
diff --git a/libguile/vm-i-loader.c b/libguile/vm-i-loader.c
index 9ae49ed65..8de7f0036 100644
--- a/libguile/vm-i-loader.c
+++ b/libguile/vm-i-loader.c
@@ -72,31 +72,82 @@ VM_DEFINE_LOADER (82, load_number, "load-number")
VM_DEFINE_LOADER (83, load_string, "load-string")
{
size_t len;
+ int width;
+ SCM str;
+
FETCH_LENGTH (len);
+ FETCH_WIDTH (width);
SYNC_REGISTER ();
- PUSH (scm_from_locale_stringn ((char *)ip, len));
- /* Was: scm_makfromstr (ip, len, 0) */
- ip += len;
+ if (width == 1)
+ {
+ char *buf;
+ str = scm_i_make_string (len, &buf);
+ memcpy (buf, (char *) ip, len);
+ }
+ else if (width == 4)
+ {
+ scm_t_wchar *wbuf;
+ str = scm_i_make_wide_string (len, &wbuf);
+ memcpy ((char *) wbuf, (char *) ip, len * width);
+ }
+ else
+ SCM_MISC_ERROR ("load-string: invalid character width", SCM_EOL);
+ PUSH (str);
+ ip += len * width;
NEXT;
}
VM_DEFINE_LOADER (84, load_symbol, "load-symbol")
{
size_t len;
+ int width;
+ SCM str;
FETCH_LENGTH (len);
+ FETCH_WIDTH (width);
SYNC_REGISTER ();
- PUSH (scm_from_locale_symboln ((char *)ip, len));
- ip += len;
+ if (width == 1)
+ {
+ char *buf;
+ str = scm_i_make_string (len, &buf);
+ memcpy (buf, (char *) ip, len);
+ }
+ else if (width == 4)
+ {
+ scm_t_wchar *wbuf;
+ str = scm_i_make_wide_string (len, &wbuf);
+ memcpy ((char *) wbuf, (char *) ip, len * width);
+ }
+ else
+ SCM_MISC_ERROR ("load-symbol: invalid character width", SCM_EOL);
+ PUSH (scm_string_to_symbol (str));
+ ip += len * width;
NEXT;
}
VM_DEFINE_LOADER (85, load_keyword, "load-keyword")
{
size_t len;
+ int width;
+ SCM str;
FETCH_LENGTH (len);
+ FETCH_WIDTH (width);
SYNC_REGISTER ();
- PUSH (scm_from_locale_keywordn ((char *)ip, len));
- ip += len;
+ if (width == 1)
+ {
+ char *buf;
+ str = scm_i_make_string (len, &buf);
+ memcpy (buf, (char *) ip, len);
+ }
+ else if (width == 4)
+ {
+ scm_t_wchar *wbuf;
+ str = scm_i_make_wide_string (len, &wbuf);
+ memcpy ((char *) wbuf, (char *) ip, len * width);
+ }
+ else
+ SCM_MISC_ERROR ("load-keyword: invalid character width", SCM_EOL);
+ PUSH (scm_symbol_to_keyword (scm_string_to_symbol (str)));
+ ip += len * width;
NEXT;
}
@@ -132,13 +183,29 @@ VM_DEFINE_INSTRUCTION (87, link_now, "link-now", 0, 1, 1)
VM_DEFINE_LOADER (88, define, "define")
{
- SCM sym;
+ SCM str, sym;
size_t len;
+ int width;
FETCH_LENGTH (len);
+ FETCH_WIDTH (width);
SYNC_REGISTER ();
- sym = scm_from_locale_symboln ((char *)ip, len);
- ip += len;
+ if (width == 1)
+ {
+ char *buf;
+ str = scm_i_make_string (len, &buf);
+ memcpy (buf, (char *) ip, len);
+ }
+ else if (width == 4)
+ {
+ scm_t_wchar *wbuf;
+ str = scm_i_make_wide_string (len, &wbuf);
+ memcpy ((char *) wbuf, (char *) ip, len * width);
+ }
+ else
+ SCM_MISC_ERROR ("load define: invalid character width", SCM_EOL);
+ sym = scm_string_to_symbol (str);
+ ip += len * width;
SYNC_REGISTER ();
PUSH (scm_sym2var (sym, scm_current_module_lookup_closure (), SCM_BOOL_T));
diff --git a/module/language/assembly.scm b/module/language/assembly.scm
index 3a1da4fe3..5571bee61 100644
--- a/module/language/assembly.scm
+++ b/module/language/assembly.scm
@@ -34,6 +34,10 @@
;; lengths are encoded in 3 bytes
(define *len-len* 3)
+;; the number of bytes per string character is encoded in 1 byte
+(define *width-len* 1)
+
+
(define (byte-length assembly)
(pmatch assembly
(,label (guard (not (pair? label)))
@@ -45,15 +49,15 @@
((load-number ,str)
(+ 1 *len-len* (string-length str)))
((load-string ,str)
- (+ 1 *len-len* (string-length str)))
+ (+ 1 *len-len* *width-len* (* (string-width str) (string-length str))))
((load-symbol ,str)
- (+ 1 *len-len* (string-length str)))
+ (+ 1 *len-len* *width-len* (* (string-width str) (string-length str))))
((load-keyword ,str)
- (+ 1 *len-len* (string-length str)))
+ (+ 1 *len-len* *width-len* (* (string-width str) (string-length str))))
((load-array ,bv)
(+ 1 *len-len* (bytevector-length bv)))
((define ,str)
- (+ 1 *len-len* (string-length str)))
+ (+ 1 *len-len* *width-len* (* (string-width str) (string-length str))))
((load-program ,nargs ,nrest ,nlocs ,labels ,len ,meta . ,code)
(+ 1 *program-header-len* len (if meta (1- (byte-length meta)) 0)))
((,inst . _) (guard (>= (instruction-length inst) 0))
diff --git a/module/language/assembly/compile-bytecode.scm b/module/language/assembly/compile-bytecode.scm
index bed0fb2dc..840c73b3a 100644
--- a/module/language/assembly/compile-bytecode.scm
+++ b/module/language/assembly/compile-bytecode.scm
@@ -65,6 +65,12 @@
(write-byte (logand (ash x -8) 255))
(write-byte (logand (ash x -16) 255))
(write-byte (logand (ash x -24) 255)))
+ (define (write-uint32 x) (case byte-order
+ ((1234) (write-uint32-le x))
+ ((4321) (write-uint32-be x))
+ (else (error "unknown endianness" byte-order))))
+ (define (write-wide-string s)
+ (string-for-each (lambda (c) (write-uint32 (char->integer c))) s))
(define (write-loader-len len)
(write-byte (ash len -16))
(write-byte (logand (ash len -8) 255))
@@ -72,6 +78,14 @@
(define (write-loader str)
(write-loader-len (string-length str))
(write-string str))
+ (define (write-sized-loader str)
+ (let ((len (string-length str))
+ (wid (string-width str)))
+ (write-loader-len len)
+ (write-byte wid)
+ (if (= wid 4)
+ (write-wide-string str)
+ (write-string str))))
(define (write-bytevector bv)
(write-loader-len (bytevector-length bv))
;; Ew!
@@ -89,10 +103,6 @@
(write-uint16 (case byte-order
((1234) write-uint16-le)
((4321) write-uint16-be)
- (else (error "unknown endianness" byte-order))))
- (write-uint32 (case byte-order
- ((1234) write-uint32-le)
- ((4321) write-uint32-be)
(else (error "unknown endianness" byte-order)))))
(let ((opcode (instruction->opcode inst))
(len (instruction-length inst)))
@@ -126,11 +136,11 @@
((load-unsigned-integer ,str) (write-loader str))
((load-integer ,str) (write-loader str))
((load-number ,str) (write-loader str))
- ((load-string ,str) (write-loader str))
- ((load-symbol ,str) (write-loader str))
- ((load-keyword ,str) (write-loader str))
+ ((load-string ,str) (write-sized-loader str))
+ ((load-symbol ,str) (write-sized-loader str))
+ ((load-keyword ,str) (write-sized-loader str))
((load-array ,bv) (write-bytevector bv))
- ((define ,str) (write-loader str))
+ ((define ,str) (write-sized-loader str))
((br ,l) (write-break l))
((br-if ,l) (write-break l))
((br-if-not ,l) (write-break l))
diff --git a/test-suite/tests/asm-to-bytecode.test b/test-suite/tests/asm-to-bytecode.test
index 33a2a45f0..d01e93c43 100644
--- a/test-suite/tests/asm-to-bytecode.test
+++ b/test-suite/tests/asm-to-bytecode.test
@@ -79,15 +79,15 @@
(char->integer #\1) (char->integer #\4)))
(comp-test '(load-string "foo")
- (vector 'load-string 0 0 3 (char->integer #\f) (char->integer #\o)
+ (vector 'load-string 0 0 3 1 (char->integer #\f) (char->integer #\o)
(char->integer #\o)))
(comp-test '(load-symbol "foo")
- (vector 'load-symbol 0 0 3 (char->integer #\f) (char->integer #\o)
+ (vector 'load-symbol 0 0 3 1 (char->integer #\f) (char->integer #\o)
(char->integer #\o)))
(comp-test '(load-keyword "qux")
- (vector 'load-keyword 0 0 3 (char->integer #\q) (char->integer #\u)
+ (vector 'load-keyword 0 0 3 1 (char->integer #\q) (char->integer #\u)
(char->integer #\x)))
(comp-test '(load-program 3 2 1 () 3 #f (make-int8 3) (return))