update random forest for faster, clean callbacks

652ba4ad · Caroline DE POURTALES · ce8b1765 · 652ba4ad · 652ba4ad · 652ba4ad
Commit 652ba4ad authored 3 years ago by Caroline DE POURTALES
--- a/pages/RFxp/Classifiers/RF2001/pima/pima_nbestim_50_maxdepth_3.mod.pkl
+++ b/pages/RFxp/Classifiers/RF2001/pima/pima_nbestim_50_maxdepth_3.mod.pkl
--- a/pages/RFxp/RFxp.py
+++ b/pages/RFxp/RFxp.py
+#!/usr/bin/env python3
+#-*- coding:utf-8 -*-
+##
+## xprf.py
+##
+##  Created on: Oct 08, 2020
+##      Author: Yacine Izza
+##      E-mail: yacine.izza@univ-toulouse.fr
+##
+#
+#==============================================================================
+from __future__ import print_function
+from data import Data
+from options import Options
+import os
+import sys
+import pickle
+import resource
+from xrf import XRF, RF2001, Dataset
+import numpy as np
+#
+#==============================================================================
+def show_info():
+    """
+        Print info message.
+    """
+    print("c RFxp: Random Forest explainer.")
+    print('c')
+#
+#==============================================================================
+def pickle_save_file(filename, data):
+    try:
+        f =  open(filename, "wb")
+        pickle.dump(data, f)
+        f.close()
+    except:
+        print("Cannot save to file", filename)
+        exit()
+def pickle_load_file(filename):
+    try:
+        f =  open(filename, "rb")
+        data = pickle.load(f)
+        f.close()
+        return data
+    except Exception as e:
+        print(e)
+        print("Cannot load from file", filename)
+        exit()    
+#
+#==============================================================================
+if __name__ == '__main__':
+    # parsing command-line options
+    options = Options(sys.argv)
+    # making output unbuffered
+    if sys.version_info.major == 2:
+        sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
+    # showing head
+    show_info()
+    if options.files:
+        cls = None
+        xrf = None
+        print("loading data ...")
+        data = Dataset(filename=options.files[0], 
+                    separator=options.separator, use_categorical = options.use_categorical)
+        if options.train:
+            '''
+            data = Dataset(filename=options.files[0], mapfile=options.mapfile,
+                    separator=options.separator,
+                    use_categorical = options.use_categorical)
+            '''        
+            params = {'n_trees': options.n_estimators,
+                        'depth': options.maxdepth}
+            cls = RF2001(**params)
+            train_accuracy, test_accuracy = cls.train(data)
+            if options.verb == 1:
+                print("----------------------")
+                print("Train accuracy: {0:.2f}".format(100. * train_accuracy))
+                print("Test accuracy: {0:.2f}".format(100. * test_accuracy))
+                print("----------------------")           
+            xrf = XRF(cls, data.feature_names, data.target_name, options.verb)
+            #xrf.test_tree_ensemble()          
+            bench_name = os.path.basename(options.files[0])
+            assert (bench_name.endswith('.csv'))
+            bench_name = os.path.splitext(bench_name)[0]
+            bench_dir_name = options.output + "/RF2001/" + bench_name
+            try:
+                os.stat(bench_dir_name)
+            except:
+                os.makedirs(bench_dir_name)
+            basename = (os.path.join(bench_dir_name, bench_name +
+                            "_nbestim_" + str(options.n_estimators) +
+                            "_maxdepth_" + str(options.maxdepth)))
+            modfile =  basename + '.mod.pkl'
+            print("saving  model to ", modfile)
+            pickle_save_file(modfile, cls)        
+        # read a sample from options.explain
+        if options.explain:
+            options.explain = [float(v.strip()) for v in options.explain.split(',')]
+            if not xrf:
+                print("loading model ...")
+                cls = pickle_load_file(options.files[1])
+                #print()
+                #print("class skl:",cls.forest.classes_)
+                #print("feat names:",data.feature_names)
+                #print("extended name:",data.extended_feature_names_as_array_strings)
+                #print("target:",data.target_name)
+                #print()
+                xrf = XRF(cls, data.feature_names, data.target_name, options.verb)
+                if options.verb:
+                    # print test accuracy of the RF model
+                    _, X_test, _, y_test = data.train_test_split()
+                    X_test = data.transform(X_test) 
+                    cls.print_accuracy(X_test, y_test) 
+            expl = xrf.explain(options.explain, options.xtype)
+            print(f"expl len: {len(expl)}")
+            del xrf.enc
+            del xrf.x            
\ No newline at end of file
--- a/pages/RFxp/data.py
+++ b/pages/RFxp/data.py
+#!/usr/bin/env python
+#-*- coding:utf-8 -*-
+##
+## data.py
+##
+##  Created on: Sep 20, 2017
+##      Author: Alexey Ignatiev, Nina Narodytska
+##      E-mail: aignatiev@ciencias.ulisboa.pt, narodytska@vmware.com
+##
+#
+#==============================================================================
+from __future__ import print_function
+import collections
+import itertools
+import os, pickle
+import six
+from six.moves import range
+import numpy as np
+#
+#==============================================================================
+class Data(object):
+    """
+        Class for representing data (transactions).
+    """
+    def __init__(self, filename=None, fpointer=None, mapfile=None,
+            separator=',', use_categorical = False):
+        """
+            Constructor and parser.
+        """
+        self.names = None
+        self.nm2id = None
+        self.samps = None
+        self.wghts = None
+        self.feats = None
+        self.fvmap = None
+        self.ovmap = {}
+        self.fvars = None
+        self.fname = filename
+        self.mname = mapfile
+        self.deleted = set([])
+        if filename:
+            with open(filename, 'r') as fp:
+                self.parse(fp, separator)
+        elif fpointer:
+            self.parse(fpointer, separator)
+        if self.mname:
+            self.read_orig_values()
+        # check if we have extra info about categorical_features
+        if (use_categorical):
+            extra_file = filename+".pkl"
+            try:
+                f =  open(extra_file, "rb")
+                print("Attempt: loading extra data from ", extra_file)
+                extra_info = pickle.load(f)
+                print("loaded")
+                f.close()
+                self.categorical_features = extra_info["categorical_features"]
+                self.categorical_names = extra_info["categorical_names"]
+                self.class_names = extra_info["class_names"]
+                self.categorical_onehot_names  = extra_info["categorical_names"].copy()
+                for i, name in enumerate(self.class_names):
+                    self.class_names[i] = str(name).replace("b'","'")
+                for c in self.categorical_names.items():
+                    clean_feature_names = []
+                    for i, name in enumerate(c[1]):
+                        name = str(name).replace("b'","'")
+                        clean_feature_names.append(name)
+                    self.categorical_names[c[0]] = clean_feature_names
+            except Exception as e:
+                f.close()
+                print("Please provide info about categorical features or omit option -c", e)
+                exit()
+    def parse(self, fp, separator):
+        """
+            Parse input file.
+        """
+        # reading data set from file
+        lines = fp.readlines()
+        # reading preamble
+        self.names = lines[0].strip().split(separator)
+        self.feats = [set([]) for n in self.names]
+        del(lines[0])
+        # filling name to id mapping
+        self.nm2id = {name: i for i, name in enumerate(self.names)}
+        self.nonbin2bin = {}
+        for name in self.nm2id:
+            spl = name.rsplit(':',1)
+            if (spl[0] not in self.nonbin2bin):
+                self.nonbin2bin[spl[0]] = [name]
+            else:
+                self.nonbin2bin[spl[0]].append(name)
+        # reading training samples
+        self.samps, self.wghts = [], []
+        for line, w in six.iteritems(collections.Counter(lines)):
+            sample = line.strip().split(separator)
+            for i, f in enumerate(sample):
+                if f:
+                    self.feats[i].add(f)
+            self.samps.append(sample)
+            self.wghts.append(w)
+        # direct and opposite mappings for items
+        idpool = itertools.count(start=0)
+        FVMap = collections.namedtuple('FVMap', ['dir', 'opp'])
+        self.fvmap = FVMap(dir={}, opp={})
+        # mapping features to ids
+        for i in range(len(self.names) - 1):
+            feats = sorted(list(self.feats[i]), reverse=True)
+            if len(feats) > 2:
+                for l in feats:
+                    self.fvmap.dir[(self.names[i], l)] = l
+            else:
+                self.fvmap.dir[(self.names[i], feats[0])] = 1
+                if len(feats) == 2:
+                    self.fvmap.dir[(self.names[i], feats[1])] = 0
+        # opposite mapping
+        for key, val in six.iteritems(self.fvmap.dir):
+            self.fvmap.opp[val] = key
+        # determining feature variables (excluding class variables)
+        for v, pair in six.iteritems(self.fvmap.opp):
+            if pair[0] == self.names[-1]:
+                self.fvars = v - 1
+                break
+    def read_orig_values(self):
+        """
+            Read original values for all the features.
+            (from a separate CSV file)
+        """
+        self.ovmap = {}
+        for line in open(self.mname, 'r'):
+            featval, bits = line.strip().split(',')
+            feat, val = featval.split(':')
+            for i, b in enumerate(bits):
+                f = '{0}:b{1}'.format(feat, i + 1)
+                v = self.fvmap.dir[(f, '1')]
+                if v not in self.ovmap:
+                    self.ovmap[v] = [feat]
+                if -v not in self.ovmap:
+                    self.ovmap[-v] = [feat]
+                self.ovmap[v if b == '1' else -v].append(val)
--- a/pages/RFxp/options.py
+++ b/pages/RFxp/options.py
+#!/usr/bin/env python
+#-*- coding:utf-8 -*-
+##
+## options.py
+##
+##  Created on: Dec 7, 2018
+##      Author: Alexey Ignatiev, Nina Narodytska
+##      E-mail: aignatiev@ciencias.ulisboa.pt, narodytska@vmware.com
+##
+#
+#==============================================================================
+from __future__ import print_function
+import getopt
+import math
+import os
+import sys
+#
+#==============================================================================
+class Options(object):
+    """
+        Class for representing command-line options.
+    """
+    def __init__(self, command):
+        """
+            Constructor.
+        """
+        # actions
+        self.train = False
+        self.encode = 'none'
+        self.explain = ''
+        self.xtype = 'abd'
+        self.use_categorical = False
+        # training options
+        self.accmin = 0.95
+        self.n_estimators = 100
+        self.maxdepth = 3
+        self.testsplit = 0.2
+        self.seed = 7
+        # other options
+        self.files = None
+        self.output = 'Classifiers'
+        self.mapfile = None
+        self.separator = ','
+        self.smallest = False
+        self.solver = 'g3'
+        self.verb = 0
+        if command:
+            self.parse(command)
+    def parse(self, command):
+        """
+            Parser.
+        """
+        self.command = command
+        try:
+            opts, args = getopt.getopt(command[1:],
+                                    'e:hc:d:Mn:o:s:tvx:X:',
+                                    ['encode=', 'help', 'use-categorical=',
+                                     'maxdepth=', 'minimum', 'nbestims=',
+                                     'output=', 'seed=', 'solver=', 'testsplit=',
+                                     'train', 'verbose', 'explain=', 'xtype=' ])
+        except getopt.GetoptError as err:
+            sys.stderr.write(str(err).capitalize())
+            self.usage()
+            sys.exit(1)
+        for opt, arg in opts:
+            if opt in ('-a', '--accmin'):
+                self.accmin = float(arg)
+            elif opt in ('-c', '--use-categorical'):
+                self.use_categorical = True
+            elif opt in ('-d', '--maxdepth'):
+                self.maxdepth = int(arg)
+            elif opt in ('-e', '--encode'):
+                self.encode = str(arg)
+            elif opt in ('-h', '--help'):
+                self.usage()
+                sys.exit(0)
+            elif opt in ('-M', '--minimum'):
+                self.smallest = True
+            elif opt in ('-n', '--nbestims'):
+                self.n_estimators = int(arg)
+            elif opt in ('-o', '--output'):
+                self.output = str(arg)
+            elif opt == '--seed':
+                self.seed = int(arg)
+            elif opt == '--sep':
+                self.separator = str(arg)
+            elif opt in ('-s', '--solver'):
+                self.solver = str(arg)
+            elif opt == '--testsplit':
+                self.testsplit = float(arg)
+            elif opt in ('-t', '--train'):
+                self.train = True
+            elif opt in ('-v', '--verbose'):
+                self.verb += 1
+            elif opt in ('-x', '--explain'):
+                self.explain = str(arg)
+            elif opt in ('-X', '--xtype'):
+                self.xtype = str(arg)
+            else:
+                assert False, 'Unhandled option: {0} {1}'.format(opt, arg)
+        if self.encode == 'none':
+            self.encode = None
+        self.files = args
+    def usage(self):
+        """
+            Print usage message.
+        """
+        print('Usage: ' + os.path.basename(self.command[0]) + ' [options] input-file')
+        print('Options:')
+        #print('        -a, --accmin=<float>       Minimal accuracy')
+        #print('                                   Available values: [0.0, 1.0] (default = 0.95)')
+        #print('        -c, --use-categorical      Treat categorical features as categorical (with categorical features info if available)')
+        print('        -d, --maxdepth=<int>       Maximal depth of a tree')
+        print('                                   Available values: [1, INT_MAX] (default = 3)')
+        #print('        -e, --encode=<smt>         Encode a previously trained model')
+        #print('                                   Available values: sat, maxsat, none (default = none)')
+        print('        -h, --help                 Show this message')
+        #print('        -m, --map-file=<string>    Path to a file containing a mapping to original feature values. (default: none)')
+        #print('        -M, --minimum              Compute a smallest size explanation (instead of a subset-minimal one)')
+        print('        -n, --nbestims=<int>       Number of trees in the ensemble')
+        print('                                   Available values: [1, INT_MAX] (default = 100)')
+        print('        -o, --output=<string>      Directory where output files will be stored (default: \'temp\')')
+        print('        --seed=<int>               Seed for random splitting')
+        print('                                   Available values: [1, INT_MAX] (default = 7)')
+        print('        --sep=<string>             Field separator used in input file (default = \',\')')
+        print('        -s, --solver=<string>      A SAT oracle to use')
+        print('                                   Available values: glucose3, minisat (default = g3)')
+        print('        -t, --train                Train a model of a given dataset')
+        print('        --testsplit=<float>        Training and test sets split')
+        print('                                   Available values: [0.0, 1.0] (default = 0.2)')
+        print('        -v, --verbose              Increase verbosity level')
+        print('        -x, --explain=<string>     Explain a decision for a given comma-separated sample (default: none)')
+        print('        -X, --xtype=<string>       Type of explanation to compute: abductive or contrastive')
--- a/pages/RFxp/pima.csv
+++ b/pages/RFxp/pima.csv
--- a/pages/RFxp/xrf/__init__.py
+++ b/pages/RFxp/xrf/__init__.py
+#from .tree import *
+from .rndmforest import *
+from .xforest import *
\ No newline at end of file
--- a/pages/RFxp/xrf/rndmforest.py
+++ b/pages/RFxp/xrf/rndmforest.py
+from sklearn.ensemble._voting import VotingClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.preprocessing import OneHotEncoder, LabelEncoder
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score
+import numpy as np
+import sys
+import os
+import resource
+import collections
+from itertools import combinations
+from six.moves import range
+import six
+import math
+#
+#==============================================================================
+class VotingRF(VotingClassifier):
+    """
+        Majority rule classifier
+    """
+    def fit(self, X, y, sample_weight=None):
+        self.estimators_ = []
+        for _, est in self.estimators:
+            self.estimators_.append(est)
+        self.le_ = LabelEncoder().fit(y)
+        self.classes_ = self.le_.classes_   
+    def predict(self, X):
+        """Predict class labels for X.
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The input samples.
+        Returns
+        -------
+        maj : array-like of shape (n_samples,)
+            Predicted class labels.
+        """
+        #check_is_fitted(self)
+        # 'hard' voting
+        predictions = self._predict(X)
+        predictions =  np.asarray(predictions, np.int64) #NEED TO BE CHECKED
+        maj = np.apply_along_axis(
+            lambda x: np.argmax(
+                np.bincount(x, weights=self._weights_not_none)),
+            axis=1, arr=predictions)
+        maj = self.le_.inverse_transform(maj)
+        return maj
+#
+#==============================================================================
+class RF2001(object):
+    """
+        The main class to train Random Forest Classifier (RFC).
+    """
+    def __init__(self, **options):
+        """
+            Constructor.
+        """    
+        self.forest = None
+        self.voting = None
+        param_dist = {'n_estimators':options['n_trees'],
+                      'max_depth':options['depth'],
+                      'criterion':'entropy',
+                      'random_state':324089}
+        self.forest = RandomForestClassifier(**param_dist)
+    def fit(self, X_train, y_train):
+        """
+            building Breiman'01 Random Forest 
+            (similar to train(dataset) fnc) 
+        """
+        self.forest.fit(X_train,y_train)
+        rtrees = [ ('dt', dt) for i, dt in enumerate(self.forest.estimators_)]
+        self.voting = VotingRF(estimators=rtrees)
+        self.voting.fit(X_train,y_train)
+        return self
+    def train(self, dataset, verb=0):
+        """
+            Train a random forest.
+        """
+        X_train, X_test, y_train, y_test = dataset.train_test_split()
+        X_train = dataset.transform(X_train)
+        X_test = dataset.transform(X_test)
+        print("Build a random forest.")
+        self.forest.fit(X_train,y_train)
+        rtrees = [ ('dt', dt) for i, dt in enumerate(self.forest.estimators_)]
+        self.voting = VotingRF(estimators=rtrees)
+        self.voting.fit(X_train,y_train)
+        train_acc = accuracy_score(self.predict(X_train), y_train)
+        test_acc = accuracy_score(self.predict(X_test), y_test)
+        if verb > 1:
+            self.print_acc_vote(X_train, X_test, y_train, y_test)
+            self.print_acc_prob(X_train, X_test, y_train, y_test)
+        return train_acc, test_acc
+    def predict(self, X):
+        return self.voting.predict(X)
+    def predict_prob(self, X):
+        self.forest.predict(X)
+    def estimators(self):
+        assert(self.forest.estimators_ is not None)
+        return self.forest.estimators_
+    def n_estimators(self):
+        return self.forest.n_estimators
+    def print_accuracy(self, X_test, y_test):  
+        test_acc = accuracy_score(self.predict(X_test), y_test)
+        print("c Model accuracy: {0:.2f}".format(100. * test_acc))
+        #print("----------------------")  
\ No newline at end of file
--- a/pages/RFxp/xrf/tree.py
+++ b/pages/RFxp/xrf/tree.py
+#
+#==============================================================================
+from anytree import Node, RenderTree,AsciiStyle
+import json
+import numpy as np
+import math
+import os
+#
+#==============================================================================
+class dt_node(Node):
+    def __init__(self, id, parent = None):
+        Node.__init__(self, id, parent)
+        self.id = id  # The node value
+        self.name = None
+        self.left_node_id = -1   #  Left child
+        self.right_node_id = -1  # Right child
+        self.feature = -1
+        self.threshold = None
+        self.values = -1 
+        #iai
+        #self.split = None
+    def __str__(self):
+        pref = ' ' * self.depth
+        if (len(self.children) == 0):
+            return (pref+ "leaf: {}  {}".format(self.id, self.values))
+        else:
+            if(self.name is None):
+                return (pref+ "{} f{}<{}".format(self.id, self.feature, self.threshold))
+            else:
+                return (pref+ "{} \"{}\"<{}".format(self.id, self.name, self.threshold))
+#==============================================================================
+def build_tree(tree_, feature_names = None):
+    ##  
+    feature = tree_.feature
+    threshold = tree_.threshold
+    values = tree_.value
+    n_nodes = tree_.node_count
+    children_left = tree_.children_left
+    children_right = tree_.children_right
+    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
+    is_leaf = np.zeros(shape=n_nodes, dtype=bool)
+    stack = [(0, -1)]  # seed is the root node id and its parent depth
+    while len(stack) > 0:
+        node_id, parent_depth = stack.pop()
+        node_depth[node_id] = parent_depth + 1
+        # If we have a test node
+        if (children_left[node_id] != children_right[node_id]):
+            stack.append((children_left[node_id], parent_depth + 1))
+            stack.append((children_right[node_id], parent_depth + 1))
+        else:
+            is_leaf[node_id] = True    
+    ##        
+    m = tree_.node_count  
+    assert (m > 0), "Empty tree"
+    def extract_data(idx, root = None, feature_names = None):
+        i = idx
+        assert (i < m), "Error index node"
+        if (root is None):
+            node = dt_node(i)
+        else:
+            node = dt_node(i, parent = root)
+        #node.cover = json_node["cover"]
+        if is_leaf[i]:
+            node.values = np.argmax(values[i])
+            #if(inverse):
+            #    node.values = -node.values
+        else:
+            node.feature = feature[i]
+            if (feature_names is not None):
+                node.name = feature_names[feature[i]]
+            node.threshold = threshold[i]
+            node.left_node_id = children_left[i]
+            node.right_node_id = children_right[i]
+            extract_data(node.left_node_id, node, feature_names) #feat < threshold ( < 0.5 False)
+            extract_data(node.right_node_id, node, feature_names) #feat >= threshold ( >= 0.5 True)            
+        return node
+    root = extract_data(0, None, feature_names)
+    return root
+#==============================================================================
+def walk_tree(node):
+    if (len(node.children) == 0):
+        # leaf
+        print(node)
+    else:
+        print(node)
+        walk_tree(node.children[0])
+        walk_tree(node.children[1])
+def count_nodes(root):
+    def count(node):
+        if len(node.children):
+            return sum([1+count(n) for n in node.children])
+        else:
+            return 0
+    m = count(root) + 1
+    return m
+#
+#==============================================================================
+def predict_tree(node, sample):
+    if (len(node.children) == 0):
+        # leaf
+        return node.values
+    else:
+        feature_branch = node.feature
+        sample_value = sample[feature_branch]
+        assert(sample_value is not None)
+        if(sample_value < node.threshold):
+            return predict_tree(node.children[0], sample)
+        else:
+            return predict_tree(node.children[1], sample)
+#
+#==============================================================================
+class Forest:
+    """ An ensemble of decision trees.
+    This object provides a common interface to many different types of models.
+    """
+    def __init__(self, rf, feature_names = None):
+        #self.rf = rf
+        self.trees = [ build_tree(dt.tree_, feature_names) for dt in rf.estimators()]
+        self.sz = sum([dt.tree_.node_count for dt in rf.estimators()])
+        self.md = max([dt.tree_.max_depth for dt in rf.estimators()])
+        ####
+        nb_nodes = [dt.tree_.node_count for dt in rf.estimators()]
+        print("min: {0} | max: {1}".format(min(nb_nodes), max(nb_nodes)))
+        assert([dt.tree_.node_count for dt in rf.estimators()] == [count_nodes(dt) for dt in self.trees])
+        #self.print_trees()
+    def print_trees(self):
+        for i,t in enumerate(self.trees):
+            print("tree number: ", i)
+            walk_tree(t)
+    def predict_inst(self, inst):
+        scores = [predict_tree(dt, inst) for dt in self.trees]
+        scores = np.asarray(scores)
+        maj = np.argmax(np.bincount(scores))
+        return maj
+    def predict(self, samples):       
+        predictions = []
+        print("#Trees: ", len(self.trees))
+        for sample in np.asarray(samples):
+            scores = []
+            for i,t in enumerate(self.trees):
+                s = predict_tree(t, sample)
+                scores.append((s))
+            scores = np.asarray(scores)
+            predictions.append(scores)
+        predictions = np.asarray(predictions)    
+        #print(predictions)    
+        #np.bincount(x, weights=self._weights_not_none)
+        maj = np.apply_along_axis(lambda x: np.argmax(np.bincount(x)), axis=1, arr=predictions)
+        return maj   
--- a/pages/RFxp/xrf/xforest.py
+++ b/pages/RFxp/xrf/xforest.py