summaryrefslogtreecommitdiff
path: root/parser/lexer.scm
diff options
context:
space:
mode:
Diffstat (limited to 'parser/lexer.scm')
-rw-r--r--parser/lexer.scm651
1 files changed, 651 insertions, 0 deletions
diff --git a/parser/lexer.scm b/parser/lexer.scm
new file mode 100644
index 0000000..7230613
--- /dev/null
+++ b/parser/lexer.scm
@@ -0,0 +1,651 @@
+;;; File: parser/lexer Author: John
+
+;;; token data structure: a list with the token type in the
+;;; car and other information in the rest of the list. Symbols
+;;; designate the token type.
+
+;;; Reserved tokens use the name as the type and have no args.
+;;; Reserved tokens:
+;;; case class data default deriving else hiding if import in infix
+;;; infixl infixr instance interface let module of renaming then to
+;;; type where .. :: => = @ \ | ~ <- -> `
+;;; Other tokens:
+;;; (file string)
+;;; (newline line indent-column)
+;;; (conid string)
+;;; (varid string)
+;;; (consym string)
+;;; (varsym string)
+;;; (comment string) ;;; not used at the moment
+;;; (integer integer)
+;;; (float integer fraction exponent)
+;;; (string string)
+;;; (eof)
+
+
+;;; *** All of the stuff for lexing character and string literals is
+;;; *** broken because it assumes that the host Lisp uses the ASCII
+;;; *** encoding for characters and supports at least 255 characters.
+;;; *** I have marked the specific places in the code where these
+;;; *** assumptions are made, but fixing the problem will probably
+;;; *** require more drastic changes anyway -- such as using integers
+;;; *** instead of characters and vectors of integers instead of characters
+;;; *** throughout the compiler.
+
+(define *max-char* 255) ; highest char-code allowed.
+
+;;; This defines the long names of the control chars. Note that some of
+;;; this duplicates the table above & the reader.
+
+(define *control-char-names* '(
+ ("NUL" . 0) ("SOH" . 1) ("STX" . 2) ("ETX" . 3)
+ ("EOT" . 4) ("ENQ" . 5) ("ACK" . 6) ("BEL" . 7)
+ ("BS" . 8) ("HT" . 9) ("LF" . 10) ("VT" . 11)
+ ("FF" . 12) ("CR" . 13) ("SO" . 14) ("SI" . 15)
+ ("DLE" . 16) ("DC1" . 17) ("DC2" . 18) ("DC3" . 19)
+ ("DC4" . 20) ("NAK" . 21) ("SYN" . 22) ("ETB" . 23)
+ ("CAN" . 24) ("EM" . 25) ("SUB" . 26) ("ESC" . 27)
+ ("FS" . 28) ("GS" . 29) ("RS" . 30) ("US" . 31)
+ ("SP" . 32) ("DEL" . 127)))
+
+;;; This defines the short names for a few control chars. This
+;;; is keyed off the previous table
+
+(define *short-control-char-names* '(
+ (#\a . "BEL") (#\b . "BS") (#\f . "FF") (#\n . "LF")
+ (#\r . "CR") (#\t . "HT") (#\v . "VT")))
+
+;;; This is used in the ^X construct. Assume that ^X = code for ^A + X-A
+;;; *** This is an invalid assumption.
+
+(define *control-A* 1)
+
+;;; This function is the interface between the lexer and the rest
+;;; of the system. Note that the `file' reported in error messages
+;;; must be bound in an outer context.
+
+
+;;; *** I think this function should be binding these variables and not
+;;; *** just assigning them.
+
+(define (lex-port port literate?)
+ (setf *lex-literate?* literate?)
+ (setf *current-line* 1)
+ (setf *current-col* 0)
+ (setf *on-new-line?* '#t)
+ (setf *save-col?* '#f)
+ (setf *port* port)
+ (setf *tokens* '())
+ (setf *char* (read-char *port*))
+ (setf *peek-char* (read-char *port*))
+ (when (eof-object? *char*)
+ (setf *char* '#\space))
+ (when (eof-object? *peek-char*)
+ (setf *peek-char* '#\space))
+ (setf *at-eof/p?* '#f)
+ (setf *at-eof?* '#f)
+ (when *lex-literate?*
+ (process-literate-comments '#t))
+ (parse-till-eof)
+ (nreverse *tokens*))
+
+(define (parse-till-eof)
+ (cond (*at-eof?*
+ (emit-token 'eof)
+ '())
+ (else
+ (lex-one-token)
+ (parse-till-eof))))
+
+;;; There is an assumption that the scanner never peeks beyond a newline.
+;;; In literate mode, this may reveal the wrong thing.
+
+(define (advance-char)
+ (if (and *lex-literate?* (eqv? *char* #\newline))
+ (process-literate-comments '#f)
+ (advance-char-1)))
+
+(define (advance-char-1)
+ (cond ((eqv? *char* #\newline)
+ (setf *on-new-line?* '#t)
+ (incf (the fixnum *current-line*))
+ (setf *current-col* 0))
+ ((eqv? *char* #\tab)
+ (incf (the fixnum *current-col*) (- 8 (modulo *current-col* 8))))
+ (else
+ (incf (the fixnum *current-col*))))
+ (setf *char* *peek-char*)
+ (setf *at-eof?* *at-eof/p?*)
+ (setf *peek-char* (read-char *port*))
+ (when (eof-object? *peek-char*)
+ (setf *at-eof/p?* '#t)
+ (setf *peek-char* '#\space))
+ *char*)
+
+(define (peek-char-2)
+ (let ((ch (peek-char *port*)))
+ (if (eof-object? ch)
+ '#\space
+ ch)))
+
+(define (lex-one-token)
+ (setf *start-line* *current-line*) ; capture the loc at the start of the token
+ (setf *start-col* *current-col*)
+ (unless *at-eof?*
+ (char-case *char*
+ (whitechar
+ (advance-char)
+ (lex-one-token))
+ (#\- (char-case *peek-char*
+ (#\- (lex-comment))
+ (#\> (advance-char)
+ (advance-char)
+ (emit-token '\-\>))
+ (#\} (signal-missing-begin-comment)
+ (advance-char)
+ (advance-char)
+ (lex-one-token))
+ (else
+ (lex-varsym))))
+ (#\{ (cond ((char=? *peek-char* '#\-)
+ (advance-char)
+ (advance-char)
+ (cond ((char=? *char* '#\#)
+ (advance-char)
+ (emit-token 'begin-annotation))
+ (else
+ (lex-ncomment)
+ (lex-one-token))))
+ (else
+ (advance-char)
+ (emit-token '\{ ))))
+ (small (lex-varid))
+ (large (lex-conid))
+ (#\( (advance-char)
+ (emit-token '\())
+ (#\: (lex-consym))
+ (#\` (advance-char)
+ (emit-token '\`))
+ ((symbol presymbol) (lex-varsym))
+ (digit (lex-numeric))
+ (#\' (lex-char))
+ (#\" (lex-string))
+ (#\) (advance-char)
+ (emit-token '\)))
+ (#\, (advance-char)
+ (emit-token '\,))
+ (#\; (advance-char)
+ (emit-token '\;))
+ (#\[ (advance-char)
+ (emit-token '\[))
+ (#\] (advance-char)
+ (emit-token '\]))
+ (#\_ (advance-char)
+ (emit-token '\_))
+ (#\} (advance-char)
+ (emit-token '\}))
+ (else
+ (signal-invalid-character *char*)
+ (advance-char)
+ (lex-one-token)))))
+
+(define (signal-missing-begin-comment)
+ (lexer-error 'missing-begin-comment
+ "`-}' appears outside of a nested comment."))
+
+(define (signal-invalid-character ch)
+ (lexer-error 'invalid-character
+ "Invalid character `~a' appears in source program." ch))
+
+(define (advance-past-white)
+ (unless *at-eof?*
+ (char-case *char*
+ (whitechar
+ (advance-char)
+ (advance-past-white))
+ (else
+ '()))))
+
+(define (process-literate-comments at-start?)
+ (unless at-start? (advance-char-1))
+ (let ((l (classify-line)))
+ (cond ((or *at-eof?* (eq? l 'program))
+ '())
+ ((eq? l 'blank)
+ (skip-literate-comment '#t))
+ (else
+ (when (not at-start?)
+ (lexer-error 'blank-line-needed
+ "Literate comments must be preceeded by a blank line"))
+ (skip-literate-comment '#f)))))
+
+(define (skip-literate-comment prev-blank)
+ (skip-past-line)
+ (let ((l (classify-line)))
+ (cond (*at-eof?*
+ '())
+ ((eq? l 'comment)
+ (skip-literate-comment '#f))
+ ((eq? l 'blank)
+ (skip-literate-comment '#t))
+ (else
+ (when (not prev-blank)
+ (lexer-error 'blank-line-needed
+ "Literate comments must be followed by a blank line"))))))
+
+(define (classify-line)
+ (if *at-eof?*
+ 'blank
+ (char-case *char*
+ (#\>
+ (advance-char-1)
+ 'program)
+ (#\newline 'blank)
+ (whitechar
+ (classify-line-1))
+ (else 'comment))))
+
+(define (classify-line-1)
+ (advance-char-1)
+ (char-case *char*
+ (#\newline 'blank)
+ (whitechar (classify-line-1))
+ (else 'comment)))
+
+(define (skip-past-line)
+ (when (not *at-eof?*)
+ (char-case *char*
+ (#\newline
+ (advance-char-1))
+ (else
+ (advance-char-1)
+ (skip-past-line)))))
+
+(define (lex-comment) ;; a -- style comment
+ (advance-char)
+ (cond (*at-eof?* (lexer-eof-in-comment *current-line*))
+ ((char=? *char* #\newline)
+ (lex-one-token))
+ (else
+ (lex-comment))))
+
+(define (lexer-eof-in-comment start-line)
+ (signal-eof-in-comment start-line)
+ (lex-one-token)) ; will return the eof token
+
+(define (signal-eof-in-comment start-line)
+ (lexer-error 'eof-in-comment
+ "End of file in comment starting at line ~A." start-line))
+
+;;; Here *char* and *peek-char* are the first two chars on a line.
+
+(define (scan-symbol)
+ (scan-list-of (symbol #\:)))
+
+(define (scan-var-con)
+ (scan-list-of (large small digit #\' #\_)))
+
+(define (lex-ncomment)
+ (lex-ncomment-1 *current-line*))
+
+(define (lex-ncomment-1 start-line)
+ (if *at-eof?*
+ (lexer-eof-in-comment start-line)
+ (char-case *char*
+ (#\- (cond ((char=? *peek-char* #\})
+ (advance-char)
+ (advance-char))
+ (else
+ (advance-char)
+ (lex-ncomment-1 start-line))))
+ (#\{ (cond ((char=? *peek-char* #\-)
+ (advance-char)
+ (advance-char)
+ (lex-ncomment)
+ (lex-ncomment-1 start-line))
+ (else
+ (advance-char)
+ (lex-ncomment-1 start-line))))
+ (else
+ (advance-char)
+ (lex-ncomment-1 start-line)))))
+
+(define (lex-varid)
+ (let ((sym (scan-var-con)))
+ (parse-reserved sym varid
+ "case" "class"
+ "data" "default" "deriving"
+ "else"
+ "hiding"
+ "if" "import" "in" "infix" "infixl" "infixr" "instance" "interface"
+ "let"
+ "module"
+ "of"
+ "renaming"
+ "then" "to" "type"
+ "where")))
+
+(define (lex-conid)
+ (let ((sym (scan-var-con)))
+ (emit-token/string 'conid sym)))
+
+(define (lex-consym)
+ (let ((sym (scan-symbol)))
+ (cond ((string=/list? (cdr sym) ":")
+ (emit-token '\:\:))
+ (else
+ (emit-token/string 'consym sym)))))
+
+(define (lex-varsym)
+ (let ((sym (scan-symbol)))
+ (cond ((and (string=/list? sym "<") (char=? *char* #\-))
+ (advance-char)
+ (emit-token '\<\-))
+ ((and (string=/list? sym "#")
+ (char=? *char* #\-)
+ (char=? *peek-char* #\}))
+ (advance-char)
+ (advance-char)
+ (emit-token 'end-annotation))
+ (else
+ (parse-reserved sym varsym
+ ".."
+ "=>" "="
+ "@"
+ "\\"
+ "|"
+ "~")))))
+
+(define (lex-integer radix)
+ (lex-integer-1 radix 0))
+
+(define (lex-integer-1 radix psum)
+ (declare (type fixnum radix)
+ (type integer psum))
+ (let ((d (char->digit *char* radix)))
+ (if d
+ (begin
+ (advance-char)
+ (lex-integer-1 radix (+ (* psum radix) (the fixnum d))))
+ psum)))
+
+(define (lex-fraction int-part denominator)
+ (declare (type integer int-part denominator))
+ (let ((d (char->digit *char* 10)))
+ (if d
+ (begin
+ (advance-char)
+ (lex-fraction
+ (+ (* int-part 10) (the fixnum d)) (* denominator 10)))
+ (values int-part denominator))))
+
+(define (lex-numeric)
+ (let ((int-part (lex-integer 10)))
+ (if (and (char=? *char* #\.)
+ (char->digit *peek-char* 10))
+ (lex-float int-part)
+ (emit-token 'integer int-part))))
+
+(define (lex-float int-part)
+ (advance-char)
+ (multiple-value-bind (numerator denominator) (lex-fraction int-part 1)
+ (let ((no-exponent
+ (lambda () (emit-token 'float numerator denominator 0))))
+ (char-case *char*
+ (exponent
+ (char-case *peek-char*
+ (digit
+ (advance-char)
+ (lex-float/exp numerator denominator 1))
+ ((#\+ #\-)
+ (cond ((char->digit (peek-char-2) 10)
+ (let ((sign (if (char=? *peek-char* '#\+) 1 -1)))
+ (advance-char)
+ (advance-char)
+ (lex-float/exp numerator denominator sign)))
+ (else
+ (funcall no-exponent))))
+ (else
+ (funcall no-exponent))))
+ (else
+ (emit-token 'float numerator denominator 0))))))
+
+(define (lex-float/exp numerator denominator sign)
+ (let ((exponent (lex-integer 10)))
+ (emit-token 'float numerator denominator (* sign exponent))))
+
+(define (lex-char)
+ (advance-char)
+ (let ((c
+ (char-case *char*
+ (#\' (signal-null-character)
+ '#\?)
+ (#\\ (lex-escaped-char '#f))
+ ((#\space graphic)
+ (let ((ch *char*))
+ (advance-char)
+ ch))
+ (else
+ (signal-bad-character-constant *char*)
+ (advance-char)
+ `#\?))))
+ (cond ((char=? *char* '#\')
+ (advance-char)
+ (emit-token 'char c))
+ (else
+ (signal-missing-char-quote)
+ (skip-to-quote-or-eol)))))
+
+(define (signal-null-character)
+ (lexer-error 'null-character
+ "Null character '' is illegal - use '\\'' for a quote."))
+
+(define (signal-bad-character-constant ch)
+ (lexer-error 'bad-character-constant
+ "The character `~a' may not appear in a character literal." ch))
+
+(define (signal-missing-char-quote)
+ (lexer-error 'missing-char-quote
+ "Character constant has more than one character."))
+
+
+(define (skip-to-quote-or-eol)
+ (if *at-eof?*
+ (lex-one-token)
+ (char-case *char*
+ (#\' (advance-char)
+ (lex-one-token))
+ (#\newline (advance-char)
+ (lex-one-token))
+ (else
+ (advance-char)
+ (skip-to-quote-or-eol)))))
+
+(define (lex-string)
+ (advance-char)
+ (emit-token 'string (list->string (gather-string-chars))))
+
+(define (gather-string-chars)
+ (char-case *char*
+ (#\\
+ (let ((ch (lex-escaped-char '#t)))
+ (if (eq? ch 'null)
+ (gather-string-chars)
+ (cons ch (gather-string-chars)))))
+ (#\"
+ (advance-char)
+ '())
+ ((graphic #\space)
+ (let ((ch *char*))
+ (advance-char)
+ (cons ch (gather-string-chars))))
+ (#\newline
+ (signal-missing-string-quote)
+ '())
+ (else
+ (signal-bad-string-constant *char*)
+ (advance-char)
+ (gather-string-chars))))
+
+(define (signal-missing-string-quote)
+ (lexer-error 'missing-string-quote
+ "String continued over end of line."))
+
+(define (signal-bad-string-constant ch)
+ (lexer-error 'bad-string-constant
+ "The character `~a' may not appear in a string literal." ch))
+
+
+(define (convert-stupid-control-character-names)
+ (let ((c1 *char*)
+ (c2 *peek-char*))
+ (advance-char)
+ (advance-char)
+ (let ((s2 (string c1 c2))
+ (s3 (string c1 c2 *char*)))
+ (let ((srch3 (assoc s3 *control-char-names*)))
+ (cond (srch3
+ (advance-char)
+ (integer->char (cdr srch3)))
+ (else
+ (let ((srch2 (assoc s2 *control-char-names*)))
+ (cond (srch2
+ (integer->char (cdr srch2)))
+ (else
+ (signal-bad-control-char s3)
+ `#\?)))))))))
+
+(define (signal-bad-control-char name)
+ (lexer-error 'invalid-control-char
+ "`~a' is not a recognized control character name." name))
+
+
+(define (lex-escaped-char in-string?)
+ (advance-char)
+ (char-case *char*
+ ((#\a #\b #\f #\n #\r #\t #\v)
+ (let* ((ccode (cdr (assoc *char* *short-control-char-names*)))
+ (ccode1 (cdr (assoc ccode *control-char-names*))))
+ (advance-char)
+ (integer->char ccode1)))
+ ((#\\ #\' #\")
+ (let ((ch *char*))
+ (advance-char)
+ ch))
+ (#\&
+ (advance-char)
+ (cond (in-string? 'null)
+ (else
+ (signal-bad-&-escape)
+ '#\?)))
+ (#\^
+ ;; *** This code is problematic because it assumes
+ ;; *** (1) that you can do the arithmetic on the character codes
+ ;; *** (2) that the resulting integer can actually be coerced to
+ ;; *** the right character object in the host Lisp.
+ (advance-char)
+ (char-case *char*
+ ((large #\@ #\[ #\\ #\] #\^ #\_)
+ (let ((code (+ (- (char->integer *char*)
+ (char->integer '#\A))
+ *control-A*)))
+ (advance-char)
+ (integer->char code)))
+ (else
+ (signal-bad-^-escape *char*)
+ '#\?)))
+ (large
+ (convert-stupid-control-character-names))
+ (digit
+ (convert-num-to-char (lex-integer 10)))
+ (#\o
+ (advance-char)
+ (cond ((char->digit *char* 8)
+ (convert-num-to-char (lex-integer 8)))
+ (else
+ (signal-missing-octal-digits)
+ '#\?)))
+ (#\x
+ (advance-char)
+ (cond ((char->digit *char* 16)
+ (convert-num-to-char (lex-integer 16)))
+ (else
+ (signal-missing-hex-digits)
+ `#\?)))
+ (whitechar
+ (cond (in-string?
+ (lex-gap))
+ (else
+ (signal-bad-gap)
+ `#\?)))
+ (else
+ (signal-bad-escape *char*)
+ `#\?)))
+
+(define (signal-bad-&-escape)
+ (lexer-error 'bad-&-escape
+ "The escape `\\&' is not allowed inside a character literal."))
+
+(define (signal-bad-^-escape ch)
+ (lexer-error 'bad-^-escape
+ "The escape `\\^~a' is not recognized." ch))
+
+(define (signal-missing-octal-digits)
+ (lexer-error 'missing-octal-digits
+ "No digits provided for `\\o' escape."))
+
+(define (signal-missing-hex-digits)
+ (lexer-error 'missing-hex-digits
+ "No digits provided for `\\x' escape."))
+
+(define (signal-bad-gap)
+ (lexer-error 'invalid-gap
+ "Gaps are not allowed inside character literals."))
+
+(define (signal-bad-escape ch)
+ (lexer-error 'bad-escape
+ "The escape `\\~a' is not recognized." ch))
+
+
+
+;;; *** This code is problematic because it assumes that integers
+;;; *** between 0 and 255 map on to characters with the corresponding
+;;; *** ASCII encoding in the host Lisp, and that the host Lisp actually
+;;; *** supports 255 characters.
+
+(define (convert-num-to-char num)
+ (cond ((and (>= num 0) (>= *max-char* num))
+ (integer->char num))
+ (else
+ (signal-char-out-of-range num)
+ '#\?)))
+
+(define (signal-char-out-of-range num)
+ (lexer-error 'char-out-of-range
+ "There is no character corresponding to code ~s." num))
+
+
+(define (lex-gap)
+ (cond (*at-eof?*
+ (signal-eof-in-gap)
+ 'null)
+ (else
+ (char-case *char*
+ (whitechar
+ (advance-char)
+ (lex-gap))
+ (#\\
+ (advance-char)
+ 'null)
+ (else
+ (signal-missing-gap)
+ 'null)))))
+
+
+(define (signal-eof-in-gap)
+ (lexer-error 'eof-in-gap
+ "End of file encountered inside gap."))
+
+(define (signal-missing-gap)
+ (lexer-error 'missing-gap
+ "Missing gap delimiter, or junk inside gap."))