From 4e987026148fe65c323afbc93cd560c07bf06b3f Mon Sep 17 00:00:00 2001 From: Yale AI Dept Date: Wed, 14 Jul 1993 13:08:00 -0500 Subject: Import to github. --- parser/lexer.scm | 651 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 651 insertions(+) create mode 100644 parser/lexer.scm (limited to 'parser/lexer.scm') diff --git a/parser/lexer.scm b/parser/lexer.scm new file mode 100644 index 0000000..7230613 --- /dev/null +++ b/parser/lexer.scm @@ -0,0 +1,651 @@ +;;; File: parser/lexer Author: John + +;;; token data structure: a list with the token type in the +;;; car and other information in the rest of the list. Symbols +;;; designate the token type. + +;;; Reserved tokens use the name as the type and have no args. +;;; Reserved tokens: +;;; case class data default deriving else hiding if import in infix +;;; infixl infixr instance interface let module of renaming then to +;;; type where .. :: => = @ \ | ~ <- -> ` +;;; Other tokens: +;;; (file string) +;;; (newline line indent-column) +;;; (conid string) +;;; (varid string) +;;; (consym string) +;;; (varsym string) +;;; (comment string) ;;; not used at the moment +;;; (integer integer) +;;; (float integer fraction exponent) +;;; (string string) +;;; (eof) + + +;;; *** All of the stuff for lexing character and string literals is +;;; *** broken because it assumes that the host Lisp uses the ASCII +;;; *** encoding for characters and supports at least 255 characters. +;;; *** I have marked the specific places in the code where these +;;; *** assumptions are made, but fixing the problem will probably +;;; *** require more drastic changes anyway -- such as using integers +;;; *** instead of characters and vectors of integers instead of characters +;;; *** throughout the compiler. + +(define *max-char* 255) ; highest char-code allowed. + +;;; This defines the long names of the control chars. Note that some of +;;; this duplicates the table above & the reader. + +(define *control-char-names* '( + ("NUL" . 0) ("SOH" . 1) ("STX" . 2) ("ETX" . 3) + ("EOT" . 4) ("ENQ" . 5) ("ACK" . 6) ("BEL" . 7) + ("BS" . 8) ("HT" . 9) ("LF" . 10) ("VT" . 11) + ("FF" . 12) ("CR" . 13) ("SO" . 14) ("SI" . 15) + ("DLE" . 16) ("DC1" . 17) ("DC2" . 18) ("DC3" . 19) + ("DC4" . 20) ("NAK" . 21) ("SYN" . 22) ("ETB" . 23) + ("CAN" . 24) ("EM" . 25) ("SUB" . 26) ("ESC" . 27) + ("FS" . 28) ("GS" . 29) ("RS" . 30) ("US" . 31) + ("SP" . 32) ("DEL" . 127))) + +;;; This defines the short names for a few control chars. This +;;; is keyed off the previous table + +(define *short-control-char-names* '( + (#\a . "BEL") (#\b . "BS") (#\f . "FF") (#\n . "LF") + (#\r . "CR") (#\t . "HT") (#\v . "VT"))) + +;;; This is used in the ^X construct. Assume that ^X = code for ^A + X-A +;;; *** This is an invalid assumption. + +(define *control-A* 1) + +;;; This function is the interface between the lexer and the rest +;;; of the system. Note that the `file' reported in error messages +;;; must be bound in an outer context. + + +;;; *** I think this function should be binding these variables and not +;;; *** just assigning them. + +(define (lex-port port literate?) + (setf *lex-literate?* literate?) + (setf *current-line* 1) + (setf *current-col* 0) + (setf *on-new-line?* '#t) + (setf *save-col?* '#f) + (setf *port* port) + (setf *tokens* '()) + (setf *char* (read-char *port*)) + (setf *peek-char* (read-char *port*)) + (when (eof-object? *char*) + (setf *char* '#\space)) + (when (eof-object? *peek-char*) + (setf *peek-char* '#\space)) + (setf *at-eof/p?* '#f) + (setf *at-eof?* '#f) + (when *lex-literate?* + (process-literate-comments '#t)) + (parse-till-eof) + (nreverse *tokens*)) + +(define (parse-till-eof) + (cond (*at-eof?* + (emit-token 'eof) + '()) + (else + (lex-one-token) + (parse-till-eof)))) + +;;; There is an assumption that the scanner never peeks beyond a newline. +;;; In literate mode, this may reveal the wrong thing. + +(define (advance-char) + (if (and *lex-literate?* (eqv? *char* #\newline)) + (process-literate-comments '#f) + (advance-char-1))) + +(define (advance-char-1) + (cond ((eqv? *char* #\newline) + (setf *on-new-line?* '#t) + (incf (the fixnum *current-line*)) + (setf *current-col* 0)) + ((eqv? *char* #\tab) + (incf (the fixnum *current-col*) (- 8 (modulo *current-col* 8)))) + (else + (incf (the fixnum *current-col*)))) + (setf *char* *peek-char*) + (setf *at-eof?* *at-eof/p?*) + (setf *peek-char* (read-char *port*)) + (when (eof-object? *peek-char*) + (setf *at-eof/p?* '#t) + (setf *peek-char* '#\space)) + *char*) + +(define (peek-char-2) + (let ((ch (peek-char *port*))) + (if (eof-object? ch) + '#\space + ch))) + +(define (lex-one-token) + (setf *start-line* *current-line*) ; capture the loc at the start of the token + (setf *start-col* *current-col*) + (unless *at-eof?* + (char-case *char* + (whitechar + (advance-char) + (lex-one-token)) + (#\- (char-case *peek-char* + (#\- (lex-comment)) + (#\> (advance-char) + (advance-char) + (emit-token '\-\>)) + (#\} (signal-missing-begin-comment) + (advance-char) + (advance-char) + (lex-one-token)) + (else + (lex-varsym)))) + (#\{ (cond ((char=? *peek-char* '#\-) + (advance-char) + (advance-char) + (cond ((char=? *char* '#\#) + (advance-char) + (emit-token 'begin-annotation)) + (else + (lex-ncomment) + (lex-one-token)))) + (else + (advance-char) + (emit-token '\{ )))) + (small (lex-varid)) + (large (lex-conid)) + (#\( (advance-char) + (emit-token '\()) + (#\: (lex-consym)) + (#\` (advance-char) + (emit-token '\`)) + ((symbol presymbol) (lex-varsym)) + (digit (lex-numeric)) + (#\' (lex-char)) + (#\" (lex-string)) + (#\) (advance-char) + (emit-token '\))) + (#\, (advance-char) + (emit-token '\,)) + (#\; (advance-char) + (emit-token '\;)) + (#\[ (advance-char) + (emit-token '\[)) + (#\] (advance-char) + (emit-token '\])) + (#\_ (advance-char) + (emit-token '\_)) + (#\} (advance-char) + (emit-token '\})) + (else + (signal-invalid-character *char*) + (advance-char) + (lex-one-token))))) + +(define (signal-missing-begin-comment) + (lexer-error 'missing-begin-comment + "`-}' appears outside of a nested comment.")) + +(define (signal-invalid-character ch) + (lexer-error 'invalid-character + "Invalid character `~a' appears in source program." ch)) + +(define (advance-past-white) + (unless *at-eof?* + (char-case *char* + (whitechar + (advance-char) + (advance-past-white)) + (else + '())))) + +(define (process-literate-comments at-start?) + (unless at-start? (advance-char-1)) + (let ((l (classify-line))) + (cond ((or *at-eof?* (eq? l 'program)) + '()) + ((eq? l 'blank) + (skip-literate-comment '#t)) + (else + (when (not at-start?) + (lexer-error 'blank-line-needed + "Literate comments must be preceeded by a blank line")) + (skip-literate-comment '#f))))) + +(define (skip-literate-comment prev-blank) + (skip-past-line) + (let ((l (classify-line))) + (cond (*at-eof?* + '()) + ((eq? l 'comment) + (skip-literate-comment '#f)) + ((eq? l 'blank) + (skip-literate-comment '#t)) + (else + (when (not prev-blank) + (lexer-error 'blank-line-needed + "Literate comments must be followed by a blank line")))))) + +(define (classify-line) + (if *at-eof?* + 'blank + (char-case *char* + (#\> + (advance-char-1) + 'program) + (#\newline 'blank) + (whitechar + (classify-line-1)) + (else 'comment)))) + +(define (classify-line-1) + (advance-char-1) + (char-case *char* + (#\newline 'blank) + (whitechar (classify-line-1)) + (else 'comment))) + +(define (skip-past-line) + (when (not *at-eof?*) + (char-case *char* + (#\newline + (advance-char-1)) + (else + (advance-char-1) + (skip-past-line))))) + +(define (lex-comment) ;; a -- style comment + (advance-char) + (cond (*at-eof?* (lexer-eof-in-comment *current-line*)) + ((char=? *char* #\newline) + (lex-one-token)) + (else + (lex-comment)))) + +(define (lexer-eof-in-comment start-line) + (signal-eof-in-comment start-line) + (lex-one-token)) ; will return the eof token + +(define (signal-eof-in-comment start-line) + (lexer-error 'eof-in-comment + "End of file in comment starting at line ~A." start-line)) + +;;; Here *char* and *peek-char* are the first two chars on a line. + +(define (scan-symbol) + (scan-list-of (symbol #\:))) + +(define (scan-var-con) + (scan-list-of (large small digit #\' #\_))) + +(define (lex-ncomment) + (lex-ncomment-1 *current-line*)) + +(define (lex-ncomment-1 start-line) + (if *at-eof?* + (lexer-eof-in-comment start-line) + (char-case *char* + (#\- (cond ((char=? *peek-char* #\}) + (advance-char) + (advance-char)) + (else + (advance-char) + (lex-ncomment-1 start-line)))) + (#\{ (cond ((char=? *peek-char* #\-) + (advance-char) + (advance-char) + (lex-ncomment) + (lex-ncomment-1 start-line)) + (else + (advance-char) + (lex-ncomment-1 start-line)))) + (else + (advance-char) + (lex-ncomment-1 start-line))))) + +(define (lex-varid) + (let ((sym (scan-var-con))) + (parse-reserved sym varid + "case" "class" + "data" "default" "deriving" + "else" + "hiding" + "if" "import" "in" "infix" "infixl" "infixr" "instance" "interface" + "let" + "module" + "of" + "renaming" + "then" "to" "type" + "where"))) + +(define (lex-conid) + (let ((sym (scan-var-con))) + (emit-token/string 'conid sym))) + +(define (lex-consym) + (let ((sym (scan-symbol))) + (cond ((string=/list? (cdr sym) ":") + (emit-token '\:\:)) + (else + (emit-token/string 'consym sym))))) + +(define (lex-varsym) + (let ((sym (scan-symbol))) + (cond ((and (string=/list? sym "<") (char=? *char* #\-)) + (advance-char) + (emit-token '\<\-)) + ((and (string=/list? sym "#") + (char=? *char* #\-) + (char=? *peek-char* #\})) + (advance-char) + (advance-char) + (emit-token 'end-annotation)) + (else + (parse-reserved sym varsym + ".." + "=>" "=" + "@" + "\\" + "|" + "~"))))) + +(define (lex-integer radix) + (lex-integer-1 radix 0)) + +(define (lex-integer-1 radix psum) + (declare (type fixnum radix) + (type integer psum)) + (let ((d (char->digit *char* radix))) + (if d + (begin + (advance-char) + (lex-integer-1 radix (+ (* psum radix) (the fixnum d)))) + psum))) + +(define (lex-fraction int-part denominator) + (declare (type integer int-part denominator)) + (let ((d (char->digit *char* 10))) + (if d + (begin + (advance-char) + (lex-fraction + (+ (* int-part 10) (the fixnum d)) (* denominator 10))) + (values int-part denominator)))) + +(define (lex-numeric) + (let ((int-part (lex-integer 10))) + (if (and (char=? *char* #\.) + (char->digit *peek-char* 10)) + (lex-float int-part) + (emit-token 'integer int-part)))) + +(define (lex-float int-part) + (advance-char) + (multiple-value-bind (numerator denominator) (lex-fraction int-part 1) + (let ((no-exponent + (lambda () (emit-token 'float numerator denominator 0)))) + (char-case *char* + (exponent + (char-case *peek-char* + (digit + (advance-char) + (lex-float/exp numerator denominator 1)) + ((#\+ #\-) + (cond ((char->digit (peek-char-2) 10) + (let ((sign (if (char=? *peek-char* '#\+) 1 -1))) + (advance-char) + (advance-char) + (lex-float/exp numerator denominator sign))) + (else + (funcall no-exponent)))) + (else + (funcall no-exponent)))) + (else + (emit-token 'float numerator denominator 0)))))) + +(define (lex-float/exp numerator denominator sign) + (let ((exponent (lex-integer 10))) + (emit-token 'float numerator denominator (* sign exponent)))) + +(define (lex-char) + (advance-char) + (let ((c + (char-case *char* + (#\' (signal-null-character) + '#\?) + (#\\ (lex-escaped-char '#f)) + ((#\space graphic) + (let ((ch *char*)) + (advance-char) + ch)) + (else + (signal-bad-character-constant *char*) + (advance-char) + `#\?)))) + (cond ((char=? *char* '#\') + (advance-char) + (emit-token 'char c)) + (else + (signal-missing-char-quote) + (skip-to-quote-or-eol))))) + +(define (signal-null-character) + (lexer-error 'null-character + "Null character '' is illegal - use '\\'' for a quote.")) + +(define (signal-bad-character-constant ch) + (lexer-error 'bad-character-constant + "The character `~a' may not appear in a character literal." ch)) + +(define (signal-missing-char-quote) + (lexer-error 'missing-char-quote + "Character constant has more than one character.")) + + +(define (skip-to-quote-or-eol) + (if *at-eof?* + (lex-one-token) + (char-case *char* + (#\' (advance-char) + (lex-one-token)) + (#\newline (advance-char) + (lex-one-token)) + (else + (advance-char) + (skip-to-quote-or-eol))))) + +(define (lex-string) + (advance-char) + (emit-token 'string (list->string (gather-string-chars)))) + +(define (gather-string-chars) + (char-case *char* + (#\\ + (let ((ch (lex-escaped-char '#t))) + (if (eq? ch 'null) + (gather-string-chars) + (cons ch (gather-string-chars))))) + (#\" + (advance-char) + '()) + ((graphic #\space) + (let ((ch *char*)) + (advance-char) + (cons ch (gather-string-chars)))) + (#\newline + (signal-missing-string-quote) + '()) + (else + (signal-bad-string-constant *char*) + (advance-char) + (gather-string-chars)))) + +(define (signal-missing-string-quote) + (lexer-error 'missing-string-quote + "String continued over end of line.")) + +(define (signal-bad-string-constant ch) + (lexer-error 'bad-string-constant + "The character `~a' may not appear in a string literal." ch)) + + +(define (convert-stupid-control-character-names) + (let ((c1 *char*) + (c2 *peek-char*)) + (advance-char) + (advance-char) + (let ((s2 (string c1 c2)) + (s3 (string c1 c2 *char*))) + (let ((srch3 (assoc s3 *control-char-names*))) + (cond (srch3 + (advance-char) + (integer->char (cdr srch3))) + (else + (let ((srch2 (assoc s2 *control-char-names*))) + (cond (srch2 + (integer->char (cdr srch2))) + (else + (signal-bad-control-char s3) + `#\?))))))))) + +(define (signal-bad-control-char name) + (lexer-error 'invalid-control-char + "`~a' is not a recognized control character name." name)) + + +(define (lex-escaped-char in-string?) + (advance-char) + (char-case *char* + ((#\a #\b #\f #\n #\r #\t #\v) + (let* ((ccode (cdr (assoc *char* *short-control-char-names*))) + (ccode1 (cdr (assoc ccode *control-char-names*)))) + (advance-char) + (integer->char ccode1))) + ((#\\ #\' #\") + (let ((ch *char*)) + (advance-char) + ch)) + (#\& + (advance-char) + (cond (in-string? 'null) + (else + (signal-bad-&-escape) + '#\?))) + (#\^ + ;; *** This code is problematic because it assumes + ;; *** (1) that you can do the arithmetic on the character codes + ;; *** (2) that the resulting integer can actually be coerced to + ;; *** the right character object in the host Lisp. + (advance-char) + (char-case *char* + ((large #\@ #\[ #\\ #\] #\^ #\_) + (let ((code (+ (- (char->integer *char*) + (char->integer '#\A)) + *control-A*))) + (advance-char) + (integer->char code))) + (else + (signal-bad-^-escape *char*) + '#\?))) + (large + (convert-stupid-control-character-names)) + (digit + (convert-num-to-char (lex-integer 10))) + (#\o + (advance-char) + (cond ((char->digit *char* 8) + (convert-num-to-char (lex-integer 8))) + (else + (signal-missing-octal-digits) + '#\?))) + (#\x + (advance-char) + (cond ((char->digit *char* 16) + (convert-num-to-char (lex-integer 16))) + (else + (signal-missing-hex-digits) + `#\?))) + (whitechar + (cond (in-string? + (lex-gap)) + (else + (signal-bad-gap) + `#\?))) + (else + (signal-bad-escape *char*) + `#\?))) + +(define (signal-bad-&-escape) + (lexer-error 'bad-&-escape + "The escape `\\&' is not allowed inside a character literal.")) + +(define (signal-bad-^-escape ch) + (lexer-error 'bad-^-escape + "The escape `\\^~a' is not recognized." ch)) + +(define (signal-missing-octal-digits) + (lexer-error 'missing-octal-digits + "No digits provided for `\\o' escape.")) + +(define (signal-missing-hex-digits) + (lexer-error 'missing-hex-digits + "No digits provided for `\\x' escape.")) + +(define (signal-bad-gap) + (lexer-error 'invalid-gap + "Gaps are not allowed inside character literals.")) + +(define (signal-bad-escape ch) + (lexer-error 'bad-escape + "The escape `\\~a' is not recognized." ch)) + + + +;;; *** This code is problematic because it assumes that integers +;;; *** between 0 and 255 map on to characters with the corresponding +;;; *** ASCII encoding in the host Lisp, and that the host Lisp actually +;;; *** supports 255 characters. + +(define (convert-num-to-char num) + (cond ((and (>= num 0) (>= *max-char* num)) + (integer->char num)) + (else + (signal-char-out-of-range num) + '#\?))) + +(define (signal-char-out-of-range num) + (lexer-error 'char-out-of-range + "There is no character corresponding to code ~s." num)) + + +(define (lex-gap) + (cond (*at-eof?* + (signal-eof-in-gap) + 'null) + (else + (char-case *char* + (whitechar + (advance-char) + (lex-gap)) + (#\\ + (advance-char) + 'null) + (else + (signal-missing-gap) + 'null))))) + + +(define (signal-eof-in-gap) + (lexer-error 'eof-in-gap + "End of file encountered inside gap.")) + +(define (signal-missing-gap) + (lexer-error 'missing-gap + "Missing gap delimiter, or junk inside gap.")) -- cgit v1.2.3