;;;#| ;;; ;;;This file contains Decision tree learning code to accompany the ;;;textbook "Machine Learning," Tom M. Mitchell, McGraw Hill, 1997. ;;; ;;;Copyright 1998 Tom M. Mitchell. This code may be freely distributed ;;;and used for any non-commericial purpose, as long as this copyright ;;;notice is retained. The author assumes absolutely no responsibility ;;;for any harm caused by bugs in the code. ;;; ;;; ;;; 1. OVERVIEW ;;; ;;;This file contains a Lisp implementation of the ID3 program for ;;;learning decision trees, as described in Table 3.1 of the textbook ;;;"Machine Learning," Tom Mitchell, McGraw Hill, 1997. It also contains ;;;the set of training examples given in Table 3.2. ;;; ;;;The code is simple enough that you should be able to inspect and ;;;modify it for your own use. ;;; ;;;The six key functions are: ;;; ;;;1. ID3(examples target.attribute attributes) ;;; learns a decision tree, using the algorithm from Table 3.1 in the book ;;; ;;;2. Classify(instance tree) ;;; Given a new instance and a previously learned tree, returns the ;;; classification of that instance produced by the tree. ;;; ;;;3. Print.tree(tree) ;;; Prints a tree in human readable form ;;; ;;;4. Print.entity(instance) ;;; Prints an instance (e.g., one of your training examples) ;;; ;;;5. Get.value(attribute instance) ;;; Get the value of some ATTRIBUTE of INSTANCE ;;; ;;;6. Put.value(attribute instance value) ;;; Assign VALUE to ATTRIBUTE of INSTANCE ;;; ;;; ;;;2. EXAMPLE TRACE: ;;; ;;;After loading this file into CommonLisp, you should be able to ;;;duplicate the following trace: ;;; ;;; ;;;;; FIRST LOOK AT THE PROVIDED TRAINING EXAMPLES ;;;>> *training.examples* ;;;(D14 D13 D12 D11 D10 D9 D8 D7 D6 D5 ...) ;;; ;;;;; USE THE FUNCTION PRINT.ENTITY TO EXAMINE JUST ONE OF THEM ;;;>> (print.entity 'd6) ;;; ;;;(PLAY.TENNIS? NO WIND STRONG HUMIDITY NORMAL TEMPERATURE COOL OUTLOOK RAIN) ;;; ;;;;; NOW USE ID3 TO LEARN A TREE ;;;>> (setq tree (id3 *training.examples* ;;; 'play.tennis? ;;; '(outlook temperature humidity wind))) ;;; ;;;(OUTLOOK (SUNNY (HUMIDITY (NORMAL YES) (HIGH NO))) ;;; (OVERCAST YES) ;;; (RAIN (WIND (STRONG NO) (WEAK YES)))) ;;; ;;; ;;;;; LOOK AT THE TREE IN A MORE HUMAN-FRIENDLY FORM ;;;>> (print.tree tree) ;;;OUTLOOK ;;; = SUNNY ;;; HUMIDITY ;;; = NORMAL => YES ;;; = HIGH => NO ;;; = OVERCAST => YES ;;; = RAIN ;;; WIND ;;; = STRONG => NO ;;; = WEAK => YES ;;; ;;; ;;;;; FINALLY, USE THIS LEARNED TREE TO CLASSIFY AN INSTANCE ;;;>> (classify 'd6 tree) ;;;NO ;;; ;;; ;;;|# ;;; ;;;;;; General utility functions ;;;;; use (print.entity instance) to see the definition of entity (defun put.value (attribute instance val) "assign a value to an attribute of an instance" (setf (get instance attribute) val)) (defun get.value (attribute instance) "retrieve the value of attribute of instance" (get instance attribute)) (defun print.entity (instance) "print the description of instance to the screen" (print (symbol-plist instance))) (setf *data* ;;; studentNum timeLab timeSport timeDrama timeVideo year sex race "chem physics bio" ncaa.bracket? ;; '((s1 avg high none low senior male white n y n yes) ;; (s2 avg none none low senior male white n y n no) ;; (s3 avg low none none senior female white y y n no) ;; (s4 high avg none low senior male white y y n no) ;; (s5 high avg none none senior male white n y n yes) ;; (s6 low low average high senior male white n y n yes) ;; (s7 none low none low senior male white y n n yes) ;; (s8 none low none none senior male white n n n yes) ;; (s9 none high none none senior female white n n n yes) ;; (s10 none low high low senior female white y y n no) ;; (s11 none low none high senior female white n n n no) ;; (s12 none high none low senior male white n n n no) ;; (s13 none low low high senior male white y n y yes) ;; (s14 none high none low senior male white n y n yes) ;; (s15 none avg none none senior female white n y y yes) ;; (s16 none none none none senior female asian n n y yes) ;; (s17 none high avg low senior male white n y n yes) ;; (s18 none high avg low senior female white n n n yes) ;; (s19 none high none avg senior male other n n n yes) ;; (s20 none avg avg low senior male white y y y no) ;; (s21 avg high none none senior male white n y n yes) ;; (s22 high avg none none senior male asian n n n no) ;; (s23 high none none none senior male white n y n no) ;; (s24 avg none high none senior male white n n n no) ;; (s25 avg none none high senior male other n y n no) ;; (s26 avg high none none junior male white n n n yes) ;; (s27 low avg none none junior male white n y n yes) ;; (s28 avg high none none junior male asian y y n yes) ;; (s29 avg none none avg senior male white n y n no) ;; (s30 high low none none senior male asian y y n no) ;; (s31 avg avg none none senior male white n y n no) ;; (s32 avg none none none senior male asian y y n no) ;; (s33 low low low low senior male asian y y n yes) ;; (s34 avg high none none senior male other n y n no) ;; (s35 avg high none none senior male white n y n yes) ;; (s36 avg high low avg senior male asian n y n no) ;; (s37 low avg none none junor male white y y y yes) ;; (s38 avg none none low senior male white n n n no) ;; (s39 high low low none junior male white n y y yes) ;; (s40 avg high none low junior female white n n n no) ;; (s41 low avg high none junior male white n y n no) ;; (s42 high high none none junior male white n n y no) ;; (s43 low avg low high senior male white n y n no) ;; (s44 none none none high senior male white n n y no) ;; (s45 none avg high low sophmore male asian y y n yes) ;; (s46 none low none none sophmore female asian n n n no) ;; (s47 none avg low avg junior female asian n n n no) ;; (s48 low high low none sophmore male white y n y yes) ;; (s49 none low low low junior male white n y y no) ;; (s50 none none low high junior female white n n n no) ;; (s51 none avg avg none none sophmore male white n n n yes) ;; (s52 none none none none sophmore male asian n n n no) ;; (s53 none high none none sophmore male asian y n y yes) ;; (s54 none none low high junior female asian y n n no) ;; (s55 none high none low sophmore female white y y y no) ;; (s56 none avg none high sophmore female asian y y n yes) ;; (s57 none high none none junior male white y y n yes) ;; (s58 none avg none avg sophmore male asian y y n no) ;; (s59 none high none none junior female other y y y no) ;; (s60 none high none none junior male white n y y yes) ;; (s61 none none none none senior male asian y y y no) ;; (s62 none high none avg sophmore male asian n y n yes) ;; (s63 none avg none none junior female other n n y no) ;; (s64 low low none none senior male asian y y n no) ;; (s65 none high none none sophmore female other y y n no) ;; (s66 none avg none low senior male asian y n n no) ;; (s67 none high none none senior male white n n y no) ;; (s68 none high non avg junior male white n n n yes) ;; (s69 none high avg high senior male white y y y yes) ;; (s70 none avg none none junior female asian y y n no) ;; (s71 high avg none none senior male white y y n no) ;; (s72 none none none senior male white n y n no) ;; (s73 none high none none junior female white y y y no) ;; (s74 high low none none senior male white y y n no) ;; (s75 none avg none none junior male asian n y n no) ;; (s76 low high low low junior male white n y n yes) ;; (s77 none high none avg junior male white y y n yes) ;; (s78 none low none none junior male asian y y n yes) ;; (s79 low avg avg none junior male white y n n no) ;; (s80 none high none none junior female white n n n yes) ;; (s81 none high none none senior female asian n n n no) ;; (s82 none avg none none junior female asian y y n no) ;; (s83 low none none none senior female asian n n n no) ;; (s84 none none none none junior female asian y y y no) ;; (s85 none none none none senior female white n y y yes) ;; (s86 high avg none none junior male white y y n no) ;; (s87 none high none none senior female white n n n yes))) '((s1 avg high none low senior male white yes); n y n yes) (s2 avg none none low senior male white no); n y n no) (s3 avg low none none senior female white no); y y n no) (s4 high avg none low senior male white no); y y n no) (s5 high avg none none senior male white yes); n y n yes) (s6 low low average high senior male white yes); n y n yes) (s7 none low none low senior male white yes); y n n yes) (s8 none low none none senior male white yes); n n n yes) (s9 none high none none senior female white yes); n n n yes) (s10 none low high low senior female white no); y y n no) (s11 none low none high senior female white no); n n n no) (s12 none high none low senior male white no); n n n no) (s13 none low low high senior male white yes); y n y yes) (s14 none high none low senior male white yes); n y n yes) (s15 none avg none none senior female white yes); n y y yes) (s16 none none none none senior female asian yes); n n y yes) (s17 none high avg low senior male white yes); n y n yes) (s18 none high avg low senior female white yes); n n n yes) (s19 none high none avg senior male other yes); n n n yes) (s20 none avg avg low senior male white no); y y y no) (s21 avg high none none senior male white yes); n y n yes) (s22 high avg none none senior male asian no); n n n no) (s23 high none none none senior male white no); n y n no) (s24 avg none high none senior male white no); n n n no) (s25 avg none none high senior male other no); n y n no) (s26 avg high none none junior male white yes); n n n yes) (s27 low avg none none junior male white yes); n y n yes) (s28 avg high none none junior male asian yes); y y n yes) (s29 avg none none avg senior male white no); n y n no) (s30 high low none none senior male asian no); y y n no) (s31 avg avg none none senior male white no); n y n no) (s32 avg none none none senior male asian no); y y n no) (s33 low low low low senior male asian yes); y y n yes) (s34 avg high none none senior male other no); n y n no) (s35 avg high none none senior male white yes); n y n yes) (s36 avg high low avg senior male asian no); n y n no) (s37 low avg none none junor male white yes); y y y yes) (s38 avg none none low senior male white no); n n n no) (s39 high low low none junior male white yes); n y y yes) (s40 avg high none low junior female white no); n n n no) (s41 low avg high none junior male white no); n y n no) (s42 high high none none junior male white no); n n y no) (s43 low avg low high senior male white no); n y n no) (s44 none none none high senior male white no); n n y no) (s45 none avg high low sophmore male asian yes); y y n yes) (s46 none low none none sophmore female asian no); n n n no) (s47 none avg low avg junior female asian no); n n n no) (s48 low high low none sophmore male white yes); y n y yes) (s49 none low low low junior male white no); n y y no) (s50 none none low high junior female white no); n n n no) (s51 none avg avg none none sophmore male white yes); n n n yes) (s52 none none none none sophmore male asian no); n n n no) (s53 none high none none sophmore male asian yes); y n y yes) (s54 none none low high junior female asian no); y n n no) (s55 none high none low sophmore female white no); y y y no) (s56 none avg none high sophmore female asian yes); y y n yes) (s57 none high none none junior male white yes); y y n yes) (s58 none avg none avg sophmore male asian no); y y n no) (s59 none high none none junior female other no); y y y no) (s60 none high none none junior male white yes); n y y yes) (s61 none none none none senior male asian no); y y y no) (s62 none high none avg sophmore male asian yes); n y n yes) (s63 none avg none none junior female other no); n n y no) (s64 low low none none senior male asian no); y y n no) (s65 none high none none sophmore female other no); y y n no) (s66 none avg none low senior male asian no); y n n no) (s67 none high none none senior male white no); n n y no) (s68 none high non avg junior male white yes); n n n yes) (s69 none high avg high senior male white yes); y y y yes) (s70 none avg none none junior female asian no); y y n no) (s71 high avg none none senior male white no); y y n no) (s72 none none none senior male white no); n y n no) (s73 none high none none junior female white no); y y y no) (s74 high low none none senior male white no); y y n no) (s75 none avg none none junior male asian no); n y n no) (s76 low high low low junior male white yes); n y n yes) (s77 none high none avg junior male white yes); y y n yes) (s78 none low none none junior male asian yes); y y n yes) (s79 low avg avg none junior male white no); y n n no) (s80 none high none none junior female white yes); n n n yes) (s81 none high none none senior female asian no); n n n no) (s82 none avg none none junior female asian no); y y n no) (s83 low none none none senior female asian no); n n n no) (s84 none none none none junior female asian no); y y y no) (s85 none none none none senior female white yes); n y y yes) (s86 high avg none none junior male white no); y y n no) (s87 none high none none senior female white yes))); n n n yes))) ;;s1 = leeR ;;s2 = erickT ;;s3 = AlexK ;;s4 = OlexP ;;s5 = BarnettT (defvar *training.examples* nil) ;; set up each training example so that GET.VALUE(attribute example) can be used ;; to retrieve the value of ATTRIBUTE for EXAMPLE ;; chem physcis bio (loop for d in *data* do (setf *training.examples* (cons (first d) *training.examples*)) (loop for attribute in '(timeLab timeSport timeDrama timeVideo year sex race ncaa.bracket?) ;; (loop for attribute in '(timeLab timeSport timeDrama timeVideo year sex race chem physics bio ncaa.bracket?) as value in (cdr d) do (put.value attribute (first d) value))) (put.value 'legal.values 'ncaa.bracket? '(yes no)) ;;; Top level ID3 Decision Tree learning algorithm ; ; Tree Representation: each non-terminal tree node is a list of the form ; (attribute (value1 subtree1)(value2 subtree2)...) ; where subtree-n is either a non-terminal node, or a value signifying the ; target value associated with that terminal node (defun id3 (examples target.attribute attributes) "TARGET.ATTRIBUTE is the attribute to be predicted by the learned tree. EXAMPLES are training examples. ATTRIBUTES is a list of attributes (not including TARGET.ATTRIBUTE) that may be included in the learned decision tree. Returns: a decision tree that predicts the TARGET.ATTRIBUTE over EXAMPLES" (let (firstvalue a partitions) (setq firstvalue (get.value target.attribute (first examples))) (cond ;; if every example has the same target attribute value, return it as ;; a leaf node ((every #'(lambda(e)(eq firstvalue (get.value target.attribute e))) examples) firstvalue) ;; if no attributes, return the most common target attribute value ((null attributes) (most.common.value target.attribute examples)) ;; otherwise, pick the best attribute, partition training data, and call ;; ID3 recursively to grow subtrees below this node (t (setq partitions (loop for a in attributes collect (partition a examples))) (setq a (choose.best.partition target.attribute partitions)) (cons (first a) (loop for branch in (cdr a) collect (list (first branch) (id3 (cdr branch) target.attribute (remove (first a) attributes))))))))) (defun partition (attribute instances) "returns a partion of INSTANCES according to their values for ATTRIBUTE. Returns a list (attribute (value1 e11 e12 ...)(value2 e21 e22 ...)...)" (let (result vlist v) (loop for e in instances do (setq v (get.value attribute e)) (if (setq vlist (assoc v result)) (rplacd vlist (cons e (cdr vlist))) (setq result (cons (list v e) result)))) (cons attribute result))) (defun choose.best.partition (target.attribute partitions) "return the partition with the highest information gain. PARTITIONS is of the form ((attribute1 (val1 e11 e12 ...)(val2 e21 e22 ...)...) (attribute2 (... ...........)(... ... )...)). Note for efficiency, we compute only the expected value of the entropy of the partition, because this is the only term in information gain that varies from one attribute to another" (let ((lowest.exp.entropy 9999) exp.entropy best.partition) (loop for att.partition in partitions do (when (< (setq exp.entropy (expected.entropy target.attribute (cdr att.partition))) lowest.exp.entropy) (setq lowest.exp.entropy exp.entropy) (setq best.partition att.partition))) best.partition)) (defun expected.entropy (att partition) "returns the sum over possible values of att of the quantity number.of.instances.with.this.value x sample.entropy.of.this.partition" (loop for p in partition sum (* (length (cdr p)) (loop for v in (get.value 'legal.values att) sum (let ((vcount (loop for e in (cdr p) count (eq v (get.value att e)))) proportion) (setq proportion (/ vcount (length (cdr p)))) ;; (format t "p: ~S, vcount: ~d, proportion: ~S~%" ;; p vcount proportion) (* -1.0 proportion (if (zerop proportion) 0 (log proportion 2)))))))) (defun most.common.value (attribute instances) (let ((length 0) longest) (loop for p in (partition attribute instances) do (when (> (length p) length) (setq length (length p)) (setq longest p))) (car longest))) (defun print.tree (tree &optional (depth 0)) (tab depth) (format t "~A~%" (first tree)) (loop for subtree in (cdr tree) do (tab (+ depth 1)) (format t "= ~A" (first subtree)) (if (atom (second subtree)) (format t " => ~A~%" (second subtree)) (progn (terpri)(print.tree (second subtree) (+ depth 5)))))) (defun tab (n) (loop for i from 1 to n do (format t " "))) ;;;****** ;; Code to classify a new instance given a tree output by ID3 ;; (defun classify (instance tree) (let (val branch) (if (atom tree) (return-from classify tree)) (setq val (get.value (first tree) instance)) (setq branch (second (assoc val (cdr tree)))) (classify instance branch))) (defun entropy (p) (+ (* -1.0 p (log p 2)) (* -1.0 (- 1 p) (log (- 1 p) 2)))) (setq tree (id3 *training.examples* 'ncaa.bracket? '(timeLab timeSport timeDrama timeVideo year sex race))) " (defvar *test.examples* nil) (setf *testdata* ;; studentNum timeLab timeSport timeDrama timeVideo year sex race chem physics bio program.java? '((s1 avg high none low senior male white nine ya nine) (s2 avg none none low senior male white nine ya nine) (s3 avg low none none senior female white ya ya nine) (s4 high avg none low senior male white ya ya nine) (s5 high avg none none senior male white nine ya nine))) ;; set up each training example so taht GET.VALUE(attribute example) can be used to retrieve the value of ATTRIVUTE for EXAMPLE (loop for d in *testdata* do (setf *test.examples* (cons (first d) *test.examples*)) (loop for attribute in '(timeLab timeSport timeDrama timeVideo year sex race chem physics bio program.java?) as value in (cdr d) do (put.value attribute (first d) value))) (put.value 'legal.values 'program.java? '(yes no)) ;; [2]> tree ;; (outlook (sunny (humidity (normal yes) (high no))) (overcast yes) ;; (rain (wind (strong no) (weak yes)))) ;; ;; See *testdata* to be used after the tree has been trained. ;; in *testdata* there are no ""yes/no"" training correct answers. ;; ;; [4]> (classify 't1 tree) ;; NO ;; Break 1 [4]> (classify 't2 tree) ;; NO ;; Break 1 [4]> classify 't3 tree) ;; YES "