diff options
author | Ludovic Courtès <ludo@gnu.org> | 2010-01-07 11:00:37 +0100 |
---|---|---|
committer | Ludovic Courtès <ludo@gnu.org> | 2010-01-07 11:10:35 +0100 |
commit | 7b0419128bce68f48a158292430ed4a7202aa1b1 (patch) | |
tree | da697b566d45d144d2c23310ac34de7c29e60666 | |
parent | 29bcdbb05948a5f12d2d8cb36a0c3c582e738be3 (diff) |
Have string ports honor `%default-port-encoding'.
* libguile/strports.c (scm_i_mkstrport): Remove.
(scm_mkstrport): Don't change the port's encoding to UTF-8; convert
STR to the default port encoding.
(scm_strport_to_string): Fix documentation & indentation.
* libguile/strports.h (scm_i_mkstrport): Remove.
* test-suite/lib.scm (exception:encoding-error): New variable.
(format-test-name): Set `%default-port-encoding' to "UTF-8".
* test-suite/tests/ports.test ("string ports")["%default-port-encoding
is honored", "suitable encoding [latin-1]", "suitable encoding
[latin-3]", "wrong encoding"]: New tests.
* test-suite/tests/r6rs-ports.test ("7.2.11 Binary
Output")["put-bytevector with UTF-16 string port", "put-bytevector
with wrong-encoding string port"]: New tests.
* test-suite/tests/reader.test (read-string): Set
`%default-port-encoding' to `#f'.
("reading")["unprintable symbol"]: Use a string that doesn't contain
zeros.
* doc/ref/api-io.texi (String Ports): Document encoding issues with
`call-with-output-string' and `with-output-to-string'.
-rw-r--r-- | doc/ref/api-io.texi | 26 | ||||
-rw-r--r-- | libguile/strports.c | 84 | ||||
-rw-r--r-- | libguile/strports.h | 4 | ||||
-rw-r--r-- | test-suite/lib.scm | 26 | ||||
-rw-r--r-- | test-suite/tests/ports.test | 41 | ||||
-rw-r--r-- | test-suite/tests/r6rs-ports.test | 26 | ||||
-rw-r--r-- | test-suite/tests/reader.test | 11 |
7 files changed, 137 insertions, 81 deletions
diff --git a/doc/ref/api-io.texi b/doc/ref/api-io.texi index eb5338c0d..a1f4221e2 100644 --- a/doc/ref/api-io.texi +++ b/doc/ref/api-io.texi @@ -1,6 +1,6 @@ @c -*-texinfo-*- @c This is part of the GNU Guile Reference Manual. -@c Copyright (C) 1996, 1997, 2000, 2001, 2002, 2003, 2004, 2007, 2009 +@c Copyright (C) 1996, 1997, 2000, 2001, 2002, 2003, 2004, 2007, 2009, 2010 @c Free Software Foundation, Inc. @c See the file guile.texi for copying conditions. @@ -972,6 +972,28 @@ away from its default. Calls the one-argument procedure @var{proc} with a newly created output port. When the function returns, the string composed of the characters written into the port is returned. @var{proc} should not close the port. + +Note that which characters can be written to a string port depend on the port's +encoding. The default encoding of string ports is specified by the +@code{%default-port-encoding} fluid (@pxref{Ports, +@code{%default-port-encoding}}). For instance, it is an error to write Greek +letter alpha to an ISO-8859-1-encoded string port since this character cannot be +represented with ISO-8859-1: + +@example +(define alpha (integer->char #x03b1)) ; GREEK SMALL LETTER ALPHA + +(with-fluids ((%default-port-encoding "ISO-8859-1")) + (call-with-output-string + (lambda (p) + (display alpha p)))) + +@result{} +Throw to key `encoding-error' +@end example + +Changing the string port's encoding to a Unicode-capable encoding such as UTF-8 +solves the problem. @end deffn @deffn {Scheme Procedure} call-with-input-string string proc @@ -985,6 +1007,8 @@ read. The value yielded by the @var{proc} is returned. Calls the zero-argument procedure @var{thunk} with the current output port set temporarily to a new string port. It returns a string composed of the characters written to the current output. + +See @code{call-with-output-string} above for character encoding considerations. @end deffn @deffn {Scheme Procedure} with-input-from-string string thunk diff --git a/libguile/strports.c b/libguile/strports.c index 95e93c9ef..625b75308 100644 --- a/libguile/strports.c +++ b/libguile/strports.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1995,1996,1998,1999,2000,2001,2002, 2003, 2005, 2006, 2009 Free Software Foundation, Inc. +/* Copyright (C) 1995,1996,1998,1999,2000,2001,2002, 2003, 2005, 2006, 2009, 2010 Free Software Foundation, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License @@ -289,84 +289,60 @@ st_truncate (SCM port, scm_t_off length) pt->write_pos = pt->read_end; } -SCM -scm_i_mkstrport (SCM pos, const char *utf8_str, size_t str_len, long modes, const char *caller) +SCM +scm_mkstrport (SCM pos, SCM str, long modes, const char *caller) { - SCM z, str; + SCM z; scm_t_port *pt; - size_t c_pos; - char *buf; - - /* Because ports are inherently 8-bit, strings need to be converted - to a locale representation for storage. But, since string ports - rely on string functionality for their memory management, we need - to create a new string that has the 8-bit locale representation - of the underlying string. + size_t str_len, c_pos; + char *buf, *c_str; - locale_str is already in the locale of the port. */ - str = scm_i_make_string (str_len, &buf); - memcpy (buf, utf8_str, str_len); - - c_pos = scm_to_unsigned_integer (pos, 0, str_len); + SCM_ASSERT (scm_is_string (str), str, SCM_ARG1, caller); + c_pos = scm_to_unsigned_integer (pos, 0, scm_i_string_length (str)); if (!((modes & SCM_WRTNG) || (modes & SCM_RDNG))) scm_misc_error ("scm_mkstrport", "port must read or write", SCM_EOL); - scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex); + scm_dynwind_begin (0); + scm_i_dynwind_pthread_mutex_lock (&scm_i_port_table_mutex); + z = scm_new_port_table_entry (scm_tc16_strport); pt = SCM_PTAB_ENTRY(z); SCM_SETSTREAM (z, SCM_UNPACK (str)); - SCM_SET_CELL_TYPE(z, scm_tc16_strport|modes); - pt->write_buf = pt->read_buf = (unsigned char *) scm_i_string_chars (str); + SCM_SET_CELL_TYPE (z, scm_tc16_strport | modes); + + /* Create a copy of STR in the encoding of Z. */ + buf = scm_to_stringn (str, &str_len, pt->encoding, + SCM_FAILED_CONVERSION_ERROR); + c_str = scm_gc_malloc (str_len, "strport"); + memcpy (c_str, buf, str_len); + free (buf); + + pt->write_buf = pt->read_buf = (unsigned char *) c_str; pt->read_pos = pt->write_pos = pt->read_buf + c_pos; pt->write_buf_size = pt->read_buf_size = str_len; pt->write_end = pt->read_end = pt->read_buf + pt->read_buf_size; pt->rw_random = 1; - scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex); - /* ensure write_pos is writable. */ + scm_dynwind_end (); + + /* Ensure WRITE_POS is writable. */ if ((modes & SCM_WRTNG) && pt->write_pos == pt->write_end) st_flush (z); - scm_i_set_port_encoding_x (z, "UTF-8"); scm_i_set_conversion_strategy_x (z, SCM_FAILED_CONVERSION_ERROR); return z; } -SCM -scm_mkstrport (SCM pos, SCM str, long modes, const char *caller) -{ - SCM z; - size_t str_len; - char *buf; - - SCM_ASSERT (scm_is_string (str), str, SCM_ARG1, caller); - - /* Because ports are inherently 8-bit, strings need to be converted - to a locale representation for storage. But, since string ports - rely on string functionality for their memory management, we need - to create a new string that has the 8-bit locale representation - of the underlying string. This violates the guideline that the - internal encoding of characters in strings is in unicode - codepoints. */ - - /* String ports are are always initialized with "UTF-8" as their - encoding. */ - buf = scm_to_stringn (str, &str_len, "UTF-8", SCM_FAILED_CONVERSION_ERROR); - z = scm_i_mkstrport (pos, buf, str_len, modes, caller); - free (buf); - return z; -} - -/* Create a new string from a string port's buffer, converting from - the port's 8-bit locale-specific representation to the standard - string representation. */ -SCM scm_strport_to_string (SCM port) +/* Create a new string from the buffer of PORT, a string port, converting from + PORT's encoding to the standard string representation. */ +SCM +scm_strport_to_string (SCM port) { - scm_t_port *pt = SCM_PTAB_ENTRY (port); SCM str; - + scm_t_port *pt = SCM_PTAB_ENTRY (port); + if (pt->rw_active == SCM_PORT_WRITE) st_flush (port); diff --git a/libguile/strports.h b/libguile/strports.h index d93266a12..3a9c3ec01 100644 --- a/libguile/strports.h +++ b/libguile/strports.h @@ -3,7 +3,7 @@ #ifndef SCM_STRPORTS_H #define SCM_STRPORTS_H -/* Copyright (C) 1995,1996,2000,2001,2002, 2006, 2008 Free Software Foundation, Inc. +/* Copyright (C) 1995,1996,2000,2001,2002, 2006, 2008, 2010 Free Software Foundation, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License @@ -44,8 +44,6 @@ SCM_API scm_t_bits scm_tc16_strport; SCM_API SCM scm_mkstrport (SCM pos, SCM str, long modes, const char * caller); -SCM_INTERNAL SCM scm_i_mkstrport (SCM pos, const char *locale_str, size_t str_len, - long modes, const char *caller); SCM_API SCM scm_strport_to_string (SCM port); SCM_API SCM scm_object_to_string (SCM obj, SCM printer); SCM_API SCM scm_call_with_output_string (SCM proc); diff --git a/test-suite/lib.scm b/test-suite/lib.scm index e5b7a0813..a2390da67 100644 --- a/test-suite/lib.scm +++ b/test-suite/lib.scm @@ -1,5 +1,5 @@ ;;;; test-suite/lib.scm --- generic support for testing -;;;; Copyright (C) 1999, 2000, 2001, 2004, 2006, 2007, 2009 Free Software Foundation, Inc. +;;;; Copyright (C) 1999, 2000, 2001, 2004, 2006, 2007, 2009, 2010 Free Software Foundation, Inc. ;;;; ;;;; This program is free software; you can redistribute it and/or ;;;; modify it under the terms of the GNU Lesser General Public @@ -30,6 +30,7 @@ exception:numerical-overflow exception:struct-set!-denied exception:system-error + exception:encoding-error exception:miscellaneous-error exception:string-contains-nul exception:read-error @@ -267,6 +268,8 @@ with-locale with-locale* (cons 'misc-error "^set! denied for field")) (define exception:system-error (cons 'system-error ".*")) +(define exception:encoding-error + (cons 'misc-error "(cannot convert to output locale|input locale conversion error)")) (define exception:miscellaneous-error (cons 'misc-error "^.*")) (define exception:read-error @@ -389,15 +392,18 @@ with-locale with-locale* ;;;; Turn a test name into a nice human-readable string. (define (format-test-name name) - (call-with-output-string - (lambda (port) - (let loop ((name name) - (separator "")) - (if (pair? name) - (begin - (display separator port) - (display (car name) port) - (loop (cdr name) ": "))))))) + ;; Choose a Unicode-capable encoding so that the string port can contain any + ;; valid Unicode character. + (with-fluids ((%default-port-encoding "UTF-8")) + (call-with-output-string + (lambda (port) + (let loop ((name name) + (separator "")) + (if (pair? name) + (begin + (display separator port) + (display (car name) port) + (loop (cdr name) ": ")))))))) ;;;; For a given test-name, deliver the full name including all prefixes. (define (full-name name) diff --git a/test-suite/tests/ports.test b/test-suite/tests/ports.test index 312467d5d..72dcb6379 100644 --- a/test-suite/tests/ports.test +++ b/test-suite/tests/ports.test @@ -1,7 +1,7 @@ -;;;; ports.test --- test suite for Guile I/O ports -*- scheme -*- +;;;; ports.test --- Guile I/O ports. -*- coding: utf-8; mode: scheme; -*- ;;;; Jim Blandy <jimb@red-bean.com> --- May 1999 ;;;; -;;;; Copyright (C) 1999, 2001, 2004, 2006, 2007, 2009 Free Software Foundation, Inc. +;;;; Copyright (C) 1999, 2001, 2004, 2006, 2007, 2009, 2010 Free Software Foundation, Inc. ;;;; ;;;; This library is free software; you can redistribute it and/or ;;;; modify it under the terms of the GNU Lesser General Public @@ -307,7 +307,42 @@ (string-set! text 0 #\a) (string-set! text (- len 1) #\b) (pass-if "output check" - (string=? text result)))) + (string=? text result))) + + (pass-if "%default-port-encoding is honored" + (let ((encodings '("UTF-8" "UTF-16" "ISO-8859-1" "ISO-8859-3"))) + (equal? (map (lambda (e) + (with-fluids ((%default-port-encoding e)) + (call-with-output-string + (lambda (p) + (display (port-encoding p) p))))) + encodings) + encodings))) + + (pass-if "suitable encoding [latin-1]" + (let ((str "hello, world")) + (with-fluids ((%default-port-encoding "ISO-8859-1")) + (equal? str + (with-output-to-string + (lambda () + (display str))))))) + + (pass-if "suitable encoding [latin-3]" + (let ((str "ĉu bone?")) + (with-fluids ((%default-port-encoding "ISO-8859-3")) + (equal? str + (with-output-to-string + (lambda () + (display str))))))) + + (pass-if-exception "wrong encoding" + exception:encoding-error + (let ((str "ĉu bone?")) + ;; Latin-1 cannot represent ‘ĉ’. + (with-fluids ((%default-port-encoding "ISO-8859-1")) + (with-output-to-string + (lambda () + (display str))))))) (with-test-prefix "call-with-output-string" diff --git a/test-suite/tests/r6rs-ports.test b/test-suite/tests/r6rs-ports.test index eb60cf3ff..1d60991e9 100644 --- a/test-suite/tests/r6rs-ports.test +++ b/test-suite/tests/r6rs-ports.test @@ -1,6 +1,6 @@ -;;;; r6rs-ports.test --- Exercise the R6RS I/O port API. +;;;; r6rs-ports.test --- R6RS I/O port tests. -*- coding: iso-8859-1; -*- ;;;; -;;;; Copyright (C) 2009 Free Software Foundation, Inc. +;;;; Copyright (C) 2009, 2010 Free Software Foundation, Inc. ;;;; Ludovic Courtès ;;;; ;;;; This library is free software; you can redistribute it and/or @@ -219,7 +219,25 @@ (port (%make-void-port "w"))) (close-port port) - (put-bytevector port bv)))) + (put-bytevector port bv))) + + (pass-if "put-bytevector with UTF-16 string port" + (let* ((str "hello, world") + (bv (string->utf16 str))) + (equal? str + (with-fluids ((%default-port-encoding "UTF-16BE")) + (call-with-output-string + (lambda (port) + (put-bytevector port bv))))))) + + (pass-if-exception "put-bytevector with wrong-encoding string port" + exception:encoding-error + (let* ((str "hello, world") + (bv (string->utf16 str))) + (with-fluids ((%default-port-encoding "UTF-32")) + (call-with-output-string + (lambda (port) + (put-bytevector port bv))))))) (with-test-prefix "7.2.7 Input Ports" @@ -452,8 +470,6 @@ (not eof?) (bytevector=? sink source))))) - ;;; Local Variables: -;;; coding: latin-1 ;;; mode: scheme ;;; End: diff --git a/test-suite/tests/reader.test b/test-suite/tests/reader.test index 2ee21c158..b819e63fb 100644 --- a/test-suite/tests/reader.test +++ b/test-suite/tests/reader.test @@ -1,6 +1,6 @@ -;;;; reader.test --- Exercise the reader. -*- Scheme -*- +;;;; reader.test --- Reader test. -*- coding: iso-8859-1; mode: scheme -*- ;;;; -;;;; Copyright (C) 1999, 2001, 2002, 2003, 2007, 2008, 2009 Free Software Foundation, Inc. +;;;; Copyright (C) 1999, 2001, 2002, 2003, 2007, 2008, 2009, 2010 Free Software Foundation, Inc. ;;;; Jim Blandy <jimb@red-bean.com> ;;;; ;;;; This library is free software; you can redistribute it and/or @@ -41,7 +41,8 @@ (define (read-string s) - (with-input-from-string s (lambda () (read)))) + (with-fluids ((%default-port-encoding #f)) + (with-input-from-string s (lambda () (read))))) (define (with-read-options opts thunk) (let ((saved-options (read-options))) @@ -110,8 +111,8 @@ (pass-if "unprintable symbol" ;; The reader tolerates unprintable characters for symbols. - (equal? (string->symbol "\001\002\003") - (read-string "\001\002\003"))) + (equal? (string->symbol "\x01\x02\x03") + (read-string "\x01\x02\x03"))) (pass-if "CR recognized as a token delimiter" ;; In 1.8.3, character 0x0d was not recognized as a delimiter. |