;;; Common Lisp implementation of "Quickprop", a variation on back-propagation. ;;; Scheme version courtesy of Song Koh (swk@mlb.semi.harris.com). ;;; chez version macros: sigmoid, sigmoid-prime, errfun. ;;; chez performance: 0.92 seconds per epoch on a Sun 3/60 (SunOS 3.5) ;;; For a description of the Quickprop algorithm, see "Faster-Learning ;;; Variations on Back-Propagation: An Empirical Study" by Scott E. Fahlman ;;; in Proceedings of the 1988 Connectionist Models Summer School, ;;; Morgan-Kaufmann, 1988. ;;; This code was written and placed in the public domain by Scott E. ;;; Fahlman. I would like to be hear about anyone using this code and what ;;; kind of results are achieved. I can be contacted on Arpanet as ;;; "fahlman@cs.cmu.edu" or by physical mail: ;;; Scott E. Fahlman, ;;; School of Computer Science ;;; Carnegie-Mellon University ;;; Pittsburgh, PA 15213 ;;; *************************************************************************** ;;; This proclamation buys a certain amount of overall speed at the expense ;;; of runtime checking. Comment it out when debugging new, bug-infested code. ;;; Portability note: This file is mostly portable Common Lisp. A few CMU ;;; extensions are used: ;;; SYSTEM:SERVER with no argument reads one pending external event (from ;;; X, for example) and passes it off to a handler function. If no event ;;; is pending, it proceeds immediately. With an argument of N, it waits N ;;; seconds for an event to arrive, then proceeds. SYSTEM:SERVE-ALL is ;;; similar, but does not return until all pending events have been served. ;;; These calls can be removed if your Lisp handles external events in some ;;; other way, or not at all. (define beep (lambda () #t)) (define cmu-server (lambda () #t)) (define cmu-serve-all (lambda () #t)) ;;; The EXTENSIONS:*IGNORE-FLOATING-POINT-UNDERFLOW* switch, if non-null, ;;; says that floating point underflows should quietly return zero rather ;;; than signalling an error. If your Lisp does not have such a switch, ;;; you will either have to define an error handler for floating underflows ;;; or check for tiny values at various critical points of the code. ;;;; Assorted Parameters. ;;; These parameters and switches control the learning algorithm. (define *weight-range* 2.0) ; Initial weights in the network get random values between plus and ; minus *weight-range*.") (define *sigmoid-prime-offset* 0.1) ; "Add this to the sigmoid prime value to eliminate the flat spots where ; derivative goes to zero.") (define *epsilon* 1.0) ; "Controls the amount of linear gradient descent to use.") (define *mu* 1.75) ; "Do not let quadratic method take a step greater than this value times ; the previous step. If this is too large, the learning becomes chaotic.") (define *decay* -0.0001) ; "This factor times the current weight is added to the slope at the ; start of each epoch. Keeps weights from growing too big.") (define *hyper-err* #t) ; "If non-nil, use hyperbolic arctan error function.") (define *split-epsilon* #t) ; "If non-nil, divide epsilon by unit fan-in before using it.") (define *symmetric* #f) ; "If non-nil, use sigmoid activation function ranging from -0.5 to +0.5. ; If nil, range is 0.0 to 1.0.") ;;; These variables and switches control the simulation and display. (define *epoch* 0) ; "Count of the number of times the entire training set has been presented.") (define *restart* #t) ; "If set, restart whenever the maximum epoch limit is exceeded. ; Else, just quit.") (define *graphics* #f) ; "If nil, skip all routine display updating.") (define *single-pass* #f) ; "When on, pause after forward/backward cycle.") (define *single-epoch* #f) ; "When on, pause after each training epoch.") (define *step* #f) ; "Turned briefly to T in order to continue after a pause.") (define *layout* #f) ; "The layout structure for displaying the current network.") (define *debug-displays* #f) ; "If set when creating displays, create the debugging displays as well.") ;;; The real values to be used for logical one and zero values on input ;;; and output. The routines that build various networks (e.g. BUILD-ENCODER) ;;; look at these. (define *input-zero-value* 0.0) ; "Value representing logical zero on inputs.") (define *input-one-value* 1.0) ; "Value representing logical one on inputs.") (define *output-zero-value* 0.0) ; "Value representing logical zero on outputs.") (define *output-one-value* 1.0) ; "Value representing logical one on outputs.") ;;; The sets of training inputs and outputs are stored in parallel vectors. ;;; Each set is a vector of short-float values. (define *training-inputs* '()) ; "Vector of input patterns for training the net.") (define *training-outputs* '()) ; "Vector of output patterns for training the net.") ;;; For some benchmarks, there is a separate set of values used for testing ;;; the network's ability to generalize. These values are not used during ;;; training. (define *test-inputs* '()) ; "Vector of input patterns for testing the net.") (define *test-outputs* '()) ; "Vector of output patterns for testing the net.") ;;;; Fundamental data structures. ;;; Unit outputs and weights are short flonums. ;;; Instead of representing each unit by a structure, we represent the ;;; unit by a fixnum. This is used to index into various vectors that hold ;;; per-unit information, such as the output state of each unit. ;;; The set of connections COMING INTO each unit is represented by a vector ;;; that is stored with the unit. Per-connection info is stored in similar ;;; vectors. The only constraint on network topology is that a unit's ;;; index must be greater than the index of any unit from which it receives ;;; an input. Regular layers are not required. ;;; Unit 0 is always at a maximum-on value, and has a connection to every ;;; other unit. The weight on this connection acts as a threshold. ;;; Next come some input units, then some hidden units, and finally some ;;; output units. ;;; The following parameters must be set up by the network-building routines. (define *nunits* 0) ;"Total number of units in the network.") (define *ninputs* 0) ; "Number of input units.") (define *first-hidden* 0) ; "Index of first hidden unit.") (define *nhidden* 0 ) ;"Number of hidden units.") (define *first-output* 0) ; "Index of first output unit.") (define *noutputs* 0) ; "Number of output units.") (define *outputs* '()) ; "Vector holding the final output value of each unit.") (define *error-sums* '()) ; "Vector holding the total error activation for each unit.") (define *errors* '()) ; "Vector holding the final error value for each unit.") (define *nconnections* '()) ; "Vector holding the number of incoming connections for each unit.") (define *connections* '()) ; "Vector that holds a connection vector for each unit I. ; Each entry in the connection vector holds a unit index J, ; indicating that this connection is from J to I.") (define *weights* '()) ; "Vector of vectors, with each entry giving the weight associated ; with connection IJ.") (define *delta-weights* '()) ; "Vector of vectors, with each entry giving the change between the previous ; weight and the current one.") (define *slopes* '()) ; "Vector of vectors, with each entry giving the accumulated slope value ; at the current position.") (define *prev-slopes* '()) ; "Vector of vectors, with each entry giving the slope value for the previous ; position.") ;;;; Network-building utilities. ; "Create the network data structures, given the number of input, hidden ; and output units." (define build-data-structures (lambda (ninputs nhidden noutputs) (set! *nunits* (+ 1 ninputs nhidden noutputs)) (set! *ninputs* ninputs) (set! *first-hidden* (+ 1 ninputs)) (set! *nhidden* nhidden) (set! *first-output* (+ 1 ninputs nhidden)) (set! *noutputs* noutputs) (set! *outputs* (make-vector *nunits* 0.0)) (set! *error-sums* (make-vector *nunits* 0.0)) (set! *errors* (make-vector *nunits* 0.0)) (set! *nconnections* (make-vector *nunits* 0)) (set! *connections* (make-vector *nunits*)) (set! *weights* (make-vector *nunits*)) (set! *delta-weights* (make-vector *nunits* )) (set! *slopes* (make-vector *nunits*)) (set! *prev-slopes* (make-vector *nunits*)) (vector-set! *outputs* 0 *input-one-value*))) ; "Select a random weight, an integer uniformly distributed over the ; interval from minus RANGE to plus RANGE, inclusive." (define random-weight (lambda (range) (- (random (truncate (* 2.0 range))) range))) ; "Build connections from every unit in range 1 to every unit in the range 2. ; Also add a connection from unit 0 to every unit in range 2. ; For each connection, select a random initial weight between RANDOM-RANGE ; and its negative." (define connect-layers (lambda (start1 end1 start2 end2 . random-range-list) (define random-range (if (null? random-range-list) 0 (car random-range-list))) (set! *epoch* 0) (let ((n (1+ (- end1 start1)))) (do ((i start2 (1+ i))) ((>= i end2)) (let ((c (make-vector n)) (w (make-vector n)) (d (make-vector n)) (cs (make-vector n)) (ps (make-vector n))) (vector-set! *nconnections* i n) (vector-set! *connections* i c) (vector-set! *weights* i w) (vector-set! *delta-weights* i d) (vector-set! *slopes* i cs) (vector-set! *prev-slopes* i ps) (vector-set! c 0 0) (vector-set! w 0 (random-weight random-range)) (vector-set! d 0 0.0) (vector-set! cs 0 0.0) (vector-set! ps 0 0.0) (do ((j start1 (1+ j)) (k 1 (1+ k))) ((>= j end1)) (vector-set! c k j) (vector-set! w k (random-weight random-range)) (vector-set! d k 0.0) (vector-set! cs k 0.0) (vector-set! ps k 0.0))))))) ; "For each connection, select a random initial weight between RANDOM-RANGE ; and its negative. Clear delta and previous delta values." (define init-weights (lambda rr (define random-range (if (null? rr) *weight-range* (car rr))) (do ((i 0 (1+ i))) ((>= i *nunits*)) (let ((w (vector-ref *weights* i)) (d (vector-ref *delta-weights* i)) (cs (vector-ref *slopes* i)) (ps (vector-ref *prev-slopes* i))) (do ((j 0 (1+ j))) ((>= j (vector-ref *nconnections* i))) (vector-set! w j (random-weight random-range)) (vector-set! d j 0.0) (vector-set! cs j 0.0) (vector-set! ps j 0.0)))))) ; Save the current slope values as prev-slopes, and clear all the slopes. (define clear-slopes (lambda () (do ((i *first-hidden* (1+ i))) ((= i *nunits*)) (let ((cs (vector-ref *slopes* i)) (ps (vector-ref *prev-slopes* i)) (w (vector-ref *weights* i))) (do ((j 0 (1+ j))) ((>= j (vector-ref *nconnections* i))) (vector-set! ps j (vector-ref cs j)) (vector-set! cs j (* *decay* (vector-ref w j)))))))) ;;;; Learning machinery. ;;; Some key utilities. ;;; Sigmoid and sigmoid prime live in the tightest inner loops, so we make ;;; them macros to save a lot of function calls. ; "The basic sigmoid computation. Maps sum of input activation into ; a unit output value in the range from 0.0 to 1.0." (define-macro! sigmoid (activation) `(cond ((< ,activation -15.0) 0.0) ((> ,activation 15.0) 1.0) (else (/ (+ 1.0 (exp (- ,activation))))))) ; "Compute the derivative of the output with respect to activation at ; the current output value. Add a small constant to keep the derivative ; from going to zero when error is close to 1.0." (define-macro! sigmoid-prime (output) `(+ *sigmoid-prime-offset* (* ,output (- 1.0 ,output)))) (define *total-error* 0.0) ; "Accumulate the total output error for one epoch.") (define *score-threshold* .4) ;"To count as correct, a bit's output must be this close to the desired value." (define *total-error-bits* 0) ; "Count number of bits in epoch that are wrong by more than ; *SCORE-THRESHOLD*") ; "Compute the error for one output unit. ; If *hyper-err* is on, use hyperbolic arctan error function. ; Record the squared error." (define-macro! errfun (desired actual) `(let* ((dif (- ,desired ,actual))) (set! *total-error* (+ *total-error* (* dif dif))) (if (not (< (abs dif) *score-threshold*)) (set! *total-error-bits* (1+ *total-error-bits*))) (cond ((not *hyper-err*) (if (and (< -0.1 dif) (< dif 0.1)) 0.0 dif)) ((< dif -.9999999) -17.0) ((> dif .9999999) 17.0) (else (log (/ (+ 1.0 dif) (- 1.0 dif))))))) ;;; The inner loops... ; "Input is a vector of values that become the outputs of the input units. ; Then propagate the values forward through the network." ; Set up all the inputs. (define forward-pass (lambda (input) (let ((symmetric-offset (if *symmetric* -0.5 0.0))) (do ((i 0 (1+ i))) ((>= i *ninputs*)) (vector-set! *outputs* (1+ i) (vector-ref input i))) ;; For each unit J, add up the incoming activation from all units I, ;; Then run it through the sigmoid to produce an output. (do ((j *first-hidden* (1+ j))) ((= j *nunits*)) (let ((c (vector-ref *connections* j)) (w (vector-ref *weights* j)) (sum 0.0)) (define loop (lambda (i imax p-sum) (if (>= i imax) p-sum (loop (1+ i) imax (+ p-sum (* (vector-ref *outputs* (vector-ref c i)) (vector-ref w i))))))) (set! sum (loop 0 (vector-ref *nconnections* j) 0.0)) (vector-set! *outputs* j (+ symmetric-offset (sigmoid sum)))))))) ; "Goal is a vector of desired values for the output units. Propagate the ; error back through the network, accumulating weight deltas." ; Compare outputs to goal and determine error values. (define backward-pass (lambda (goal) (do ((i *first-output* (1+ i)) (n 0 (1+ n))) ((>= i *nunits*)) (vector-set! *error-sums* i (errfun (vector-ref goal n) (vector-ref *outputs* i)))) ;; Zero the error sums for non-output units. (do ((i 0 (1+ i))) ((>= i *first-output*)) (vector-set! *error-sums* i 0.0)) ;; Now propagate error back through net. When this loop reaches unit J, ;; all error from later units has been collected. Do the sigmoid-prime ;; calcuation, and pass error back to earlier weights and units. (do ((j (-1+ *nunits*) (-1+ j)) (symmetric-offset (if *symmetric* 0.5 0.0))) ((< j *first-hidden*)) (let* ((c (vector-ref *connections* j)) (w (vector-ref *weights* j)) (cs (vector-ref *slopes* j)) (nc (vector-ref *nconnections* j)) (o (vector-ref *outputs* j)) (err-j (* (sigmoid-prime (+ symmetric-offset o)) (vector-ref *error-sums* j)))) (vector-set! *errors* j err-j) (do ((i 0 (1+ i))) ((>= i nc)) (let ((i-index (vector-ref c i))) (vector-set! *error-sums* i-index (+ (vector-ref *error-sums* i-index) (* err-j (vector-ref w i)))) (vector-set! cs i (+ (vector-ref cs i) (* err-j (vector-ref *outputs* i-index)))))))))) ; "Update all the weights in the network as a function of each weight's current ; slope. previous slope, and the distance of the last move." (define update-weights (lambda () (let ((shrink-factor (/ *mu* (+ 1.0 *mu*)))) (do ((j *first-hidden* (1+ j))) ((= j *nunits*)) (let ((w (vector-ref *weights* j)) (nc (vector-ref *nconnections* j)) (d (vector-ref *delta-weights* j)) (cs (vector-ref *slopes* j)) (ps (vector-ref *prev-slopes* j))) (do ((i 0 (1+ i))) ((>= i nc)) (let* ((ps-i (vector-ref ps i)) (cs-i (vector-ref cs i)) (d-i (vector-ref d i)) (next-step 0.0)) (cond ((positive? d-i) ;; If last step was positive... ;; Add in epsilon if current slope is positive. (if (positive? cs-i) (set! next-step (+ next-step (if *split-epsilon* (/ (* *epsilon* cs-i) nc) (* *epsilon* cs-i))))) ;; If current slope is close to or larger than ;; prev slope,take maximum size positive step. (cond ((> cs-i (* shrink-factor ps-i)) (set! next-step (+ next-step (* *mu* d-i)))) ;; Else, use quadratic estimate. (else (set! next-step (+ next-step (* (/ cs-i (- ps-i cs-i)) d-i)))))) ;; If last step was significantly negative... ((negative? d-i) ;; Add in epsilon if current slope is negative. (if (negative? cs-i) (set! next-step (+ next-step (if *split-epsilon* (/ (* *epsilon* cs-i) nc) (* *epsilon* cs-i))))) ;; If current slope is close to or more neg than ;; prev slope, take maximum size negative step. (cond ((< cs-i (* shrink-factor ps-i)) (set! next-step (+ next-step (* *mu* d-i)))) ;; Else, use quadratic estimate. (else (set! next-step (+ next-step (* (/ cs-i (- ps-i cs-i)) d-i)))))) (else (set! next-step (+ next-step (if *split-epsilon* (/ (* *epsilon* cs-i) nc) (* *epsilon* cs-i)))))) (vector-set! d i next-step) (vector-set! w i (+ (vector-ref w i) next-step))))))))) ;"Perform forward and back propagation once for each set of weights in the ; training vectors, collecting deltas. Then burn in the weights." (define train-one-epoch (lambda () (define pause1 (lambda () (cond ((or (not *single-pass*) *step) (set! *step* '())) (else (cmu-server 1) (pause1))))) (define pause2 (lambda () (cond ((or (not *single-epoch*) *step) (set! *step* '())) (else (cmu-server 1) (pause2))))) (clear-slopes) (do ((i 0 (1+ i))) ((>= i (vector-length *training-inputs*))) (forward-pass (vector-ref *training-inputs* i)) (backward-pass (vector-ref *training-outputs* i)) (if *single-pass* (pause1)) (cond (*graphics* (update-pass-displays)))) (update-weights) (set! *epoch* (1+ *epoch*)) (cond (*graphics* (update-epoch-displays))) (if (and *single-epoch* (not *single-pass*)) (pause2)))) ; "Train the network until there are 0 bits wrong, then print a message. ; If any given test reaches MAX epochs, restart or abort, depending on ; *RESTART* switch. Repeat all this for the specified number of TIMES." (define train-test (lambda (times max-epochs . rpt ) (define report (if (null? rpt) '() (car rpt))) (let ((total-epochs 0) (total-restarts 0) (esquared 0) (maxepochs 0) (minepochs max-epochs) (newmax '())) (do ((i 0 (1+ i))) ((>= i times)) (set! *epoch* 0) (init-weights) (set! newmax max-epochs) (call-with-current-continuation (lambda (return) (let loop () (cond ((>= *epoch* newmax) (cond (*restart* (set! newmax (+ newmax max-epochs)) (init-weights) (format "Trial " i ": Restart after " *epoch* " epochs.") (set! total-restarts (1+ total-restarts))) (else (format "Trial " i ": Abort after " *epoch* " epochs." ) (set! total-restarts (1+ total-restarts)) (set! total-epochs (+ total-epochs newmax)) (set! esquared (+ esquared (* newmax newmax))) (set! maxepochs newmax) (return '()))))) (set! *total-error* 0.0) (set! *total-error-bits* 0) (train-one-epoch) (cond ((and report (zero? (modulo (-1+ *epoch*) report))) (format "Trained " (-1+ *epoch*) " epochs, " *total-error-bits* " bits wrong, error = " *total-error*))) (cond ((zero? *total-error-bits*) (set! *epoch* (-1+ *epoch*)) (set! total-epochs (+ total-epochs *epoch*)) (set! esquared (+ esquared (* *epoch* *epoch*))) (set! maxepochs (max *epoch* maxepochs)) (set! minepochs (min *epoch* minepochs)) (format "Trial " i ": Learned after " *epoch* " epochs. Running Avg: " (/ total-epochs (1+ i))) (return '()))) (loop)))) (format "Eps " *epsilon* (if *split-epsilon* "*" "") ", Mu " *mu* ", WtRng " *weight-range* ", Decay " *decay* ", SigOff " *sigmoid-prime-offset* ", Hyper " *hyper-err* ", Sym " *symmetric* ) (cond ((> times 1) (format "ReStrt " total-restarts ", Max " maxepochs ", Min " minepochs ", Avg " (/ total-epochs times) ", SD " (sqrt (/ (- (* times esquared) (* total-epochs total-epochs)) (* times (-1+ times))))))))))) (define format (lambda d-list (define loop (lambda (d-list) (cond ( d-list (display (car d-list)) (loop (cdr d-list)))))) (newline) (loop d-list))) ;;;; Setup modification utilities. ;;; In order to convert from the normal assymmetric activation function to ;;; a symmetric one, several values have to be altered and the network has ;;; to be rebuilt. Use these functions so that you don't forget any of ;;; these things and get spurious results. ; "Convert the network to use a symmetric activation function ranging ; from -0.5 to +0.5 instead of 0.0 to 1.0." (define make-symmetric (lambda () (cond (*symmetric* (set! *symmetric* #t) (set! *input-zero-value* -0.5) (set! *input-one-value* 0.5) (set! *output-zero-value* -0.5) (set! *output-one-value* 0.5) "Remember to rebuild the current network.")) (else (beep)))) ; "Convert the network to use an asymmetric activation function ranging ; from 0.0 to 1.0 instead of -0.5 to +0.5." (define make-asymmetric (lambda () (cond (*symmetric* (beep)) (else (set! *symmetric* '()) (set! *input-zero-value* 0.0) (set! *input-one-value* 1.0) (set! *output-zero-value* 0.0) (set! *output-one-value* 1.0) "Remember to rebuild the current network.")))) ;;; Use this to complement all the input and output values for the current ;;; training and testing patterns. Flip each value around the midpoint ;;; between logical one and logical zero. ; "For all the training and testing patterns, exchange logical one values ; and logical zero values. Other values reflect around the midpoint." (define complement-patterns (lambda () (let ((ival (+ *input-zero-value* *input-one-value*)) (oval (+ *output-zero-value* *output-one-value*))) (do ((i 0 (1+ i))) ((>= i (vector-length *training-inputs*))) (do ((j 0 (1+ j))) ((>= j *ninputs*)) (vector-set! (vector-ref *training-inputs* i) j (- ival (vector-ref (vector-ref *training-inputs* i) j))))) (do ((i 0 (1+ i))) ((>= i (vector-length *training-outputs*))) (do ((j 0 (1+ j))) ((>= j *noutputs*)) (vector-set! (vector-ref *training-outputs* i) j (- oval (vector-ref (vector-ref *training-outputs* i) j))))) (do ((i 0 (1+ i))) ((>= i (vector-length *test-inputs*))) (do ((j 0 (1+ j))) ((>= j *ninputs*)) (vector-set! (vector-ref *test-inputs* i) j (- ival (vector-ref (vector-ref *test-inputs* i) j))))) (do ((i 0 (1+ i))) ((>= i (vector-length *test-outputs*))) (do ((j 0 (1+ j))) ((>= j *noutputs*)) (vector-set! (vector-ref *test-outputs* i) j (- oval (vector-ref (vector-ref *test-outputs* i) j)))))))) ;;;; Example ;;; The code to build an X-Y-X encoder looks like this. ;;; Display code has been omitted. ;;; "Build an ecoder with X input units, X output units, and Y units in the ;;; layer connecting them." (define build-encoder (lambda (x y) (build-data-structures x y x) (connect-layers 1 (+ x 1) (+ x 1) (+ x y 1) *weight-range*) (connect-layers (+ x 1) (+ x y 1) (+ x y 1) (+ x y x 1) *weight-range*) (set! *training-inputs* (make-vector x)) (set! *training-outputs* (make-vector x)) (do ((i 0 (1+ i))) ((>= i x)) (let ((v (make-vector x *input-zero-value*))) (vector-set! v i *input-one-value*) (vector-set! *training-inputs* i v)) (let ((v (make-vector x *output-zero-value*))) (vector-set! v i *output-one-value*) (vector-set! *training-outputs* i v))))) ;;; To run this, do something like (train-test 10 200). ;;; some of my stuff :Fri Oct 12 11:11:59 EDT 1990 - swk (define (roundf val n-places) (let ((mfactor (expt 10 n-places))) (/ (round (* val mfactor)) mfactor))) (define (test-example inputs) (forward-pass inputs) (map (lambda (n) (roundf n 2)) (vector->list (subvector *outputs* (+ *first-hidden* *nhidden*) *nunits*)))) (define (output-results) (let((input-list (vector->list *training-inputs*))) (for-each (lambda (in expected out) (write-line (append (vector->list in) (vector->list expected) out))) input-list (vector->list *training-outputs*) (map test-example input-list)))) ;;; (build-encoder 8 4) ;;; (time (train-test 1 100))