;;; Scanning ;;; **************************************************************** ;;; make-lazy-stream : (() -> val * stream) -> stream ;;; stream-get : stream * (value -> (stream -> answer)) -> answer (define make-lazy-stream (lambda (th) th)) (define stream-get (lambda (stream rcvr) (let ((the-pair (stream))) (rcvr (car the-pair) (cdr the-pair))))) ;;; **************************************************************** (define string->stream (lambda (str) (let ((length (string-length str))) (letrec ((loop (lambda (i) (make-lazy-stream (lambda () (cons (if (>= i length) #\nul (string-ref str i)) (loop (+ i 1)))))))) (loop 0))))) ;; a better version of stream->list, parameterized on end-of-stream? (define stream->list (lambda (end-of-stream? str) (stream-get str (lambda (val newstr) (if (end-of-stream? val) '() (cons val (stream->list end-of-stream? newstr))))))) ;;; **************************************************************** ;;; **************************************************************** ;;; Scanner architecture ;;; Scanner = char-stream -> token-stream ;;; State = buf * char * char-stream -> token * char * char-stream (define-record scanner-result (item char stream)) (define apply-automaton (lambda (automaton state buf c str) (letrec ((apply-state (lambda (state buf c str) (let ((opcode (car state)) (next-state (cadr state))) '(printf "apply-state: opcode = ~s c = ~s~%" opcode c) (case opcode ((shift) (stream-get str (lambda (c1 str1) (apply-state next-state (cons c buf) c1 str1)))) ((drop) (stream-get str (lambda (c1 str1) (apply-state next-state buf c1 str1)))) ((emit) (let ((cooker (cadr state))) (let ((item (apply-cooker cooker (reverse buf)))) (printf "emitting item ~s~n" item) (make-scanner-result item c str)))) ((cond) (apply-state (apply-scanner-cond (cdr state) c) buf c str)) ((goto) (apply-state (scanner-label->state next-state automaton) buf c str)) ((fail) (let ((msg (cadr state))) (error 'apply-automaton "scanner failed in state ~s on input ~s" msg c))) ((debug-state) (let ((msg (cadr state)) (next-state (caddr state))) (printf "~s ~s ~s~%" msg buf c) (apply-state next-state buf c str)))))))) (apply-state state buf c str)))) (define apply-scanner-cond (lambda (alternatives c) (if (null? alternatives) (error 'apply-state "couldn't match character ~s in state ~%~s" c alternatives) (let ((alternative1 (car alternatives))) '(printf "apply-scanner-cond: c = ~s alternative = ~s~%" c alternative1) (if (apply-tester (car alternative1) c) (cadr alternative1) (apply-scanner-cond (cdr alternatives) c)))))) (define scanner-label->state (lambda (label automaton) (cadr (assq label automaton)))) (define automaton->start-label caar) (define automaton->scanner (lambda (automaton) (letrec ((loop (lambda (char stream) (make-lazy-stream (lambda () (record-case (apply-automaton automaton (scanner-label->state (automaton->start-label automaton) automaton) '() char stream) (scanner-result (token char stream) (cons token (loop char stream))))))))) (lambda (stream) (stream-get stream loop))))) ;;; driver-1: from string to list of tokens (define driver-1 (lambda (automaton string) (stream->list (lambda (item) (record-case item (token (class data) (eq? class 'end-marker)))) ((automaton->scanner automaton) (string->stream string))))) ;;; **************************************************************** ;;; Standard cookers and testers.... ;;; Record definitions and cookers (define-record token (class data)) (define cook-identifier (lambda (buffer) (let ((sym (string->symbol (list->string buffer)))) (if (memq sym **keywords-list**) (make-token sym #t) (make-token 'identifier sym))))) (define cook-number (lambda (buffer) (make-token 'number (string->number (list->string buffer))))) (define apply-cooker (lambda (cooker char-list) (case cooker ((cook-identifier) (cook-identifier char-list)) ((cook-number) (cook-number char-list)) (else (if (symbol? cooker) (make-token cooker '()) (else (error 'apply-cooker "unknown cooker ~s" cooker))))))) (define apply-tester (lambda (tester ch) (cond ((char? tester) (eq? tester ch)) ((eq? tester 'else) #t) (else (case tester ((whitespace) (char-whitespace? ch)) ((alphabetic) (char-alphabetic? ch)) ((numeric) (char-numeric? ch)) (else (error 'apply-tester "unknown tester ~s" tester))))))) ;;; **************************************************************** ;;; OK, let's build one now.... (define automaton-1 '((start-state (cond (whitespace (drop (goto start-state))) (alphabetic (shift (goto identifier-state))) (numeric (shift (goto number-state))) (#\+ (drop (emit plus-sym))) (#\: (shift (goto assign-sym-state))) (#\% (drop (goto comment-state))) (#\; (drop (emit semicolon))) (#\( (drop (emit lparen))) (#\) (drop (emit rparen))) (#\^ (emit end-marker)) ; alternate end marker (#\nul (emit end-marker)) (else (fail start-state)))) (identifier-state (cond (alphabetic (shift (goto identifier-state))) (numeric (shift (goto identifier-state))) (else (emit cook-identifier)))) (number-state (cond (numeric (shift (goto number-state))) (else (emit cook-number)))) (assign-sym-state (cond (#\= (shift (emit assign-sym))) (else (shift (goto identifier-state))))) (comment-state (cond (#\newline (drop (goto start-state))) (#\^ (goto start-state)) (#\nul (goto start-state)) (else (drop (goto comment-state))))))) (define **keywords-list** '(begin end if then else)) ; > (driver-1 automaton-1 "abc ; def ; % comment ; xyz 13") ; emitting item (token identifier abc) ; emitting item (token identifier def) ; emitting item (token identifier xyz) ; emitting item (token number 13) ; emitting item (token end-marker ()) ; ((token identifier abc) ; (token identifier def) ; (token identifier xyz) ; (token number 13)) ;; and here's what the answer should give: ; > (driver-2 automaton-1) ; driver-2> abc ; emitting item (token identifier abc) ; def ; emitting item (token identifier def) ; % comment ; xyz 13^ ; emitting item (token identifier xyz) ; emitting item (token number 13) ; emitting item (token end-marker ()) ; ((token identifier abc) ; (token identifier def) ; (token identifier xyz) ; (token number 13)) ; >