;;;#| ;;; ;;;This file contains Decision tree learning code to accompany the ;;;textbook "Machine Learning," Tom M. Mitchell, McGraw Hill, 1997. ;;; ;;;Copyright 1998 Tom M. Mitchell. This code may be freely distributed ;;;and used for any non-commericial purpose, as long as this copyright ;;;notice is retained. The author assumes absolutely no responsibility ;;;for any harm caused by bugs in the code. ;;; ;;; ;;; 1. OVERVIEW ;;; ;;;This file contains a Lisp implementation of the ID3 program for ;;;learning decision trees, as described in Table 3.1 of the textbook ;;;"Machine Learning," Tom Mitchell, McGraw Hill, 1997. It also contains ;;;the set of training examples given in Table 3.2. ;;; ;;;The code is simple enough that you should be able to inspect and ;;;modify it for your own use. ;;; ;;;The six key functions are: ;;; ;;;1. ID3(examples target.attribute attributes) ;;; learns a decision tree, using the algorithm from Table 3.1 in the book ;;; ;;;2. Classify(instance tree) ;;; Given a new instance and a previously learned tree, returns the ;;; classification of that instance produced by the tree. ;;; ;;;3. Print.tree(tree) ;;; Prints a tree in human readable form ;;; ;;;4. Print.entity(instance) ;;; Prints an instance (e.g., one of your training examples) ;;; ;;;5. Get.value(attribute instance) ;;; Get the value of some ATTRIBUTE of INSTANCE ;;; ;;;6. Put.value(attribute instance value) ;;; Assign VALUE to ATTRIBUTE of INSTANCE ;;; ;;; ;;;2. EXAMPLE TRACE: ;;; ;;;After loading this file into CommonLisp, you should be able to ;;;duplicate the following trace: ;;; ;;; ;;;;; FIRST LOOK AT THE PROVIDED TRAINING EXAMPLES ;;;>> *training.examples* ;;;(D14 D13 D12 D11 D10 D9 D8 D7 D6 D5 ...) ;;; ;;;;; USE THE FUNCTION PRINT.ENTITY TO EXAMINE JUST ONE OF THEM ;;;>> (print.entity 'd6) ;;; ;;;(PLAY.TENNIS? NO WIND STRONG HUMIDITY NORMAL TEMPERATURE COOL OUTLOOK RAIN) ;;; ;;;;; NOW USE ID3 TO LEARN A TREE ;;;>> (setq tree (id3 *training.examples* ;;; 'play.tennis? ;;; '(outlook temperature humidity wind))) ;;; ;;;(OUTLOOK (SUNNY (HUMIDITY (NORMAL YES) (HIGH NO))) ;;; (OVERCAST YES) ;;; (RAIN (WIND (STRONG NO) (WEAK YES)))) ;;; ;;; ;;;;; LOOK AT THE TREE IN A MORE HUMAN-FRIENDLY FORM ;;;>> (print.tree tree) ;;;OUTLOOK ;;; = SUNNY ;;; HUMIDITY ;;; = NORMAL => YES ;;; = HIGH => NO ;;; = OVERCAST => YES ;;; = RAIN ;;; WIND ;;; = STRONG => NO ;;; = WEAK => YES ;;; ;;; ;;;;; FINALLY, USE THIS LEARNED TREE TO CLASSIFY AN INSTANCE ;;;>> (classify 'd6 tree) ;;;NO ;;; ;;; ;;;|# ;;; ;;;;;; General utility functions ;;;;; use (print.entity instance) to see the definition of entity (defun put.value (attribute instance val) "assign a value to an attribute of an instance" (setf (get instance attribute) val)) (defun get.value (attribute instance) "retrieve the value of attribute of instance" (get instance attribute)) (defun print.entity (instance) "print the description of instance to the screen" (print (symbol-plist instance))) (setf *data* ;;; studentNum timeLab timeSport timeDrama timeVideo year sex race "chem physics bio" ncaa.bracket? '((s1 avg high none low senior male white yes); n y n yes) (s2 avg none none low senior male white no); n y n no) (s3 avg low none none senior female white no); y y n no) (s4 high avg none low senior male white no); y y n no) (s5 high avg none none senior male white yes); n y n yes) (s6 low low average high senior male white yes); n y n yes) (s7 none low none low senior male white yes); y n n yes) (s8 none low none none senior male white yes); n n n yes) (s9 none high none none senior female white yes); n n n yes) (s10 none low high low senior female white no); y y n no) (s11 none low none high senior female white no); n n n no) (s12 none high none low senior male white no); n n n no) (s13 none low low high senior male white yes); y n y yes) ;;?? (s14 none high none low senior male white yes); n y n yes) (s14 none high none high senior male white yes); y n y yes) (s15 none avg none none senior female white yes); n y y yes) (s16 none none none none senior female asian yes); n n y yes) (s17 none high avg low senior male white yes); n y n yes) (s18 none high avg low senior female white yes); n n n yes) (s19 none high none avg senior male other yes); n n n yes) (s20 none avg avg low senior male white no); y y y no) (s21 avg high none none senior male white yes); n y n yes) (s22 high avg none none senior male asian no); n n n no) (s23 high none none none senior male white no); n y n no) (s24 avg none high none senior male white no); n n n no) (s25 avg none none high senior male other no); n y n no) (s26 avg high none none junior male white yes); n n n yes) (s27 low avg none none junior male white yes); n y n yes) (s28 avg high none none junior male asian yes); y y n yes) (s29 avg none none avg senior male white no); n y n no) (s30 high low none none senior male asian no); y y n no) (s31 avg avg none none senior male white no); n y n no) (s32 avg none none none senior male asian no); y y n no) (s33 low low low low senior male asian yes); y y n yes) (s34 avg high none none senior male other no); n y n no) (s35 avg high none none senior male white yes); n y n yes) (s36 avg high low avg senior male asian no); n y n no) ;; (s37 low avg none none junor male white yes); y y y yes) (s38 avg none none low senior male white no); n n n no) (s39 high low low none junior male white yes); n y y yes) (s40 avg high none low junior female white no); n n n no) (s41 low avg high none junior male white no); n y n no) (s42 high high none none junior male white no); n n y no) (s43 low avg low high senior male white no); n y n no) (s44 none none none high senior male white no); n n y no) (s45 none avg high low sophmore male asian yes); y y n yes) (s46 none low none none sophmore female asian no); n n n no) (s47 none avg low avg junior female asian no); n n n no) (s48 low high low none sophmore male white yes); y n y yes) (s49 none low low low junior male white no); n y y no) (s50 none none low high junior female white no); n n n no) ;; (s51 none avg avg none none sophmore male white yes); n n n yes) (s52 none none none none sophmore male asian no); n n n no) (s53 none high none none sophmore male asian yes); y n y yes) (s54 none none low high junior female asian no); y n n no) (s55 none high none low sophmore female white no); y y y no) (s56 none avg none high sophmore female asian yes); y y n yes) (s57 none high none none junior male white yes); y y n yes) (s58 none avg none avg sophmore male asian no); y y n no) (s59 none high none none junior female other no); y y y no) (s60 none high none none junior male white yes); n y y yes) (s61 none none none none senior male asian no); y y y no) (s62 none high none avg sophmore male asian yes); n y n yes) (s63 none avg none none junior female other no); n n y no) (s64 low low none none senior male asian no); y y n no) (s65 none high none none sophmore female other no); y y n no) (s66 none avg none low senior male asian no); y n n no) (s67 none high none none senior male white no); n n y no) (s68 none high non avg junior male white yes); n n n yes) (s69 none high avg high senior male white yes); y y y yes) (s70 none avg none none junior female asian no); y y n no) ;;?? (s71 high avg none none senior male white no); y y n no) (s71 high avg none none senior male asian no); y y n no) ;; (s72 none none none senior male white no); n y n no) (s73 none high none none junior female white no); y y y no) (s74 high low none none senior male white no); y y n no) (s75 none avg none none junior male asian no); n y n no) (s76 low high low low junior male white yes); n y n yes) (s77 none high none avg junior male white yes); y y n yes) (s78 none low none none junior male asian yes); y y n yes) (s79 low avg avg none junior male white no); y n n no) ;;?? (s80 none high none none junior female white yes); n n n yes) (s80 none high avg none junior female white yes); y n n no) (s81 none high none none senior female asian no); n n n no) (s82 none avg none none junior female asian no); y y n no) (s83 low none none none senior female asian no); n n n no) (s84 none none none none junior female asian no); y y y no) (s85 none none none none senior female white yes); n y y yes) (s86 high avg none none junior male white no); y y n no) (s87 none high none none senior female white yes))); n n n yes))) ;;s1 = leeR ;;s2 = erickT ;;s3 = AlexK ;;s4 = OlexP ;;s5 = BarnettT (defvar *training.examples* nil) ;; set up each training example so that GET.VALUE(attribute example) can be used ;; to retrieve the value of ATTRIBUTE for EXAMPLE ;; chem physcis bio (loop for d in *data* do (setf *training.examples* (cons (first d) *training.examples*)) (loop for attribute in '(timeLab timeSport timeDrama timeVideo year sex race ncaa.bracket?) ;; (loop for attribute in '(timeLab timeSport timeDrama timeVideo year sex race chem physics bio ncaa.bracket?) as value in (cdr d) do (put.value attribute (first d) value))) (put.value 'legal.values 'ncaa.bracket? '(yes no)) ;;; Top level ID3 Decision Tree learning algorithm ; ; Tree Representation: each non-terminal tree node is a list of the form ; (attribute (value1 subtree1)(value2 subtree2)...) ; where subtree-n is either a non-terminal node, or a value signifying the ; target value associated with that terminal node (defun id3 (examples target.attribute attributes) "TARGET.ATTRIBUTE is the attribute to be predicted by the learned tree. EXAMPLES are training examples. ATTRIBUTES is a list of attributes (not including TARGET.ATTRIBUTE) that may be included in the learned decision tree. Returns: a decision tree that predicts the TARGET.ATTRIBUTE over EXAMPLES" (let (firstvalue a partitions) (setq firstvalue (get.value target.attribute (first examples))) (cond ;; if every example has the same target attribute value, return it as ;; a leaf node ((every #'(lambda(e)(eq firstvalue (get.value target.attribute e))) examples) firstvalue) ;; if no attributes, return the most common target attribute value ((null attributes) (most.common.value target.attribute examples)) ;; otherwise, pick the best attribute, partition training data, and call ;; ID3 recursively to grow subtrees below this node (t (setq partitions (loop for a in attributes collect (partition a examples))) (setq a (choose.best.partition target.attribute partitions)) (cons (first a) (loop for branch in (cdr a) collect (list (first branch) (id3 (cdr branch) target.attribute (remove (first a) attributes))))))))) (defun partition (attribute instances) "returns a partion of INSTANCES according to their values for ATTRIBUTE. Returns a list (attribute (value1 e11 e12 ...)(value2 e21 e22 ...)...)" (let (result vlist v) (loop for e in instances do (setq v (get.value attribute e)) (if (setq vlist (assoc v result)) (rplacd vlist (cons e (cdr vlist))) (setq result (cons (list v e) result)))) (cons attribute result))) (defun choose.best.partition (target.attribute partitions) "return the partition with the highest information gain. PARTITIONS is of the form ((attribute1 (val1 e11 e12 ...)(val2 e21 e22 ...)...) (attribute2 (... ...........)(... ... )...)). Note for efficiency, we compute only the expected value of the entropy of the partition, because this is the only term in information gain that varies from one attribute to another" (let ((lowest.exp.entropy 9999) exp.entropy best.partition) (loop for att.partition in partitions do (when (< (setq exp.entropy (expected.entropy target.attribute (cdr att.partition))) lowest.exp.entropy) (setq lowest.exp.entropy exp.entropy) (setq best.partition att.partition))) best.partition)) (defun expected.entropy (att partition) "returns the sum over possible values of att of the quantity number.of.instances.with.this.value x sample.entropy.of.this.partition" (loop for p in partition sum (* (length (cdr p)) (loop for v in (get.value 'legal.values att) sum (let ((vcount (loop for e in (cdr p) count (eq v (get.value att e)))) proportion) (setq proportion (/ vcount (length (cdr p)))) ;; (format t "p: ~S, vcount: ~d, proportion: ~S~%" ;; p vcount proportion) (* -1.0 proportion (if (zerop proportion) 0 (log proportion 2)))))))) (defun most.common.value (attribute instances) (let ((length 0) longest) (loop for p in (partition attribute instances) do (when (> (length p) length) (setq length (length p)) (setq longest p))) (car longest))) (defun print.tree (tree &optional (depth 0)) (tab depth) (format t "~A~%" (first tree)) (loop for subtree in (cdr tree) do (tab (+ depth 1)) (format t "= ~A" (first subtree)) (if (atom (second subtree)) (format t " => ~A~%" (second subtree)) (progn (terpri)(print.tree (second subtree) (+ depth 5)))))) (defun tab (n) (loop for i from 1 to n do (format t " "))) ;;;****** ;; Code to classify a new instance given a tree output by ID3 ;; (defun classify (instance tree) (let (val branch) (if (atom tree) (return-from classify tree)) (setq val (get.value (first tree) instance)) (setq branch (second (assoc val (cdr tree)))) (classify instance branch))) (defun entropy (p) (+ (* -1.0 p (log p 2)) (* -1.0 (- 1 p) (log (- 1 p) 2)))) (setq tree (id3 *training.examples* 'ncaa.bracket? '(timeLab timeSport timeDrama timeVideo year sex race))) #| (defvar *test.examples* nil) (setf *testdata* ;; studentNum timeLab timeSport timeDrama timeVideo year sex race chem physics bio program.java? '((s1 avg high none low senior male white nine ya nine) (s2 avg none none low senior male white nine ya nine) (s3 avg low none none senior female white ya ya nine) (s4 high avg none low senior male white ya ya nine) (s5 high avg none none senior male white nine ya nine))) ;; set up each training example so taht GET.VALUE(attribute example) can be used to retrieve the value of ATTRIVUTE for EXAMPLE (loop for d in *testdata* do (setf *test.examples* (cons (first d) *test.examples*)) (loop for attribute in '(timeLab timeSport timeDrama timeVideo year sex race chem physics bio program.java?) as value in (cdr d) do (put.value attribute (first d) value))) (put.value 'legal.values 'program.java? '(yes no)) ;; [2]> tree ;; (outlook (sunny (humidity (normal yes) (high no))) (overcast yes) ;; (rain (wind (strong no) (weak yes)))) ;; ;; See *testdata* to be used after the tree has been trained. ;; in *testdata* there are no ""yes/no"" training correct answers. ;; ;; [4]> (classify 't1 tree) ;; NO ;; Break 1 [4]> (classify 't2 tree) ;; NO ;; Break 1 [4]> classify 't3 tree) ;; YES |#