Skip to content
Snippets Groups Projects
Commit 652ba4ad authored by Caroline DE POURTALES's avatar Caroline DE POURTALES
Browse files

update random forest for faster, clean callbacks

parent ce8b1765
No related branches found
No related tags found
No related merge requests found
File added
#!/usr/bin/env python3
#-*- coding:utf-8 -*-
##
## xprf.py
##
## Created on: Oct 08, 2020
## Author: Yacine Izza
## E-mail: yacine.izza@univ-toulouse.fr
##
#
#==============================================================================
from __future__ import print_function
from data import Data
from options import Options
import os
import sys
import pickle
import resource
from xrf import XRF, RF2001, Dataset
import numpy as np
#
#==============================================================================
def show_info():
    """
        Write the two-line banner that identifies the tool to stdout.
    """

    banner_lines = ("c RFxp: Random Forest explainer.", 'c')
    for banner_line in banner_lines:
        print(banner_line)
#
#==============================================================================
def pickle_save_file(filename, data):
    """
        Serialize `data` into `filename` using pickle.

        On any failure (file cannot be opened, object cannot be pickled)
        the error and a short message are printed to stdout and the process
        is terminated through SystemExit with status 1.
    """

    try:
        # the context manager closes the handle even when dump() raises
        with open(filename, "wb") as f:
            pickle.dump(data, f)
    except Exception as e:
        # Exception rather than a bare except: Ctrl-C must not be reported
        # as a save failure
        print(e)
        print("Cannot save to file", filename)
        sys.exit(1)
def pickle_load_file(filename):
    """
        Load and return the object pickled in `filename`.

        On any failure the error and a short message are printed to stdout
        and the process is terminated through SystemExit with status 1.
    """

    try:
        # SECURITY: pickle.load executes arbitrary code embedded in the
        # file; only load model files that come from a trusted source.
        with open(filename, "rb") as f:
            return pickle.load(f)
    except Exception as e:
        print(e)
        print("Cannot load from file", filename)
        sys.exit(1)
#
#==============================================================================
if __name__ == '__main__':
    # Command-line driver: loads a dataset, optionally trains and saves a
    # random forest (-t), and optionally explains one sample (-x) using
    # either the freshly trained model or one loaded from a second file.

    # parsing command-line options
    options = Options(sys.argv)

    # making output unbuffered
    if sys.version_info.major == 2:
        sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)

    # showing head
    show_info()

    if options.files:
        cls = None
        xrf = None

        print("loading data ...")
        data = Dataset(filename=options.files[0],
                       separator=options.separator,
                       use_categorical=options.use_categorical)

        if options.train:
            params = {'n_trees': options.n_estimators,
                      'depth': options.maxdepth}
            cls = RF2001(**params)
            train_accuracy, test_accuracy = cls.train(data)

            if options.verb == 1:
                print("----------------------")
                print("Train accuracy: {0:.2f}".format(100. * train_accuracy))
                print("Test accuracy: {0:.2f}".format(100. * test_accuracy))
                print("----------------------")

            xrf = XRF(cls, data.feature_names, data.target_name, options.verb)

            bench_name = os.path.basename(options.files[0])
            # explicit check instead of an assert, which is stripped by -O
            if not bench_name.endswith('.csv'):
                print("Expected a .csv dataset file, got", bench_name)
                sys.exit(1)
            bench_name = os.path.splitext(bench_name)[0]
            bench_dir_name = options.output + "/RF2001/" + bench_name
            # creates the directory only if needed; replaces a stat() probe
            # guarded by a bare except
            os.makedirs(bench_dir_name, exist_ok=True)

            basename = (os.path.join(bench_dir_name, bench_name +
                                     "_nbestim_" + str(options.n_estimators) +
                                     "_maxdepth_" + str(options.maxdepth)))

            modfile = basename + '.mod.pkl'
            print("saving model to ", modfile)
            pickle_save_file(modfile, cls)

        # read a sample from options.explain
        if options.explain:
            options.explain = [float(v.strip()) for v in options.explain.split(',')]

            if not xrf:
                # no model was trained in this run: the second positional
                # argument must name a pickled model
                if len(options.files) < 2:
                    print("No model to explain: train one with -t or pass a model file as second argument")
                    sys.exit(1)

                print("loading model ...")
                cls = pickle_load_file(options.files[1])
                xrf = XRF(cls, data.feature_names, data.target_name, options.verb)

                # NOTE(review): this accuracy report is assumed to belong to
                # the model-loading branch only -- confirm against the
                # original layout.
                if options.verb:
                    # print test accuracy of the RF model
                    _, X_test, _, y_test = data.train_test_split()
                    X_test = data.transform(X_test)
                    cls.print_accuracy(X_test, y_test)

            expl = xrf.explain(options.explain, options.xtype)

            print(f"expl len: {len(expl)}")

            # drop the encoder and explainer state attached to xrf
            del xrf.enc
            del xrf.x
\ No newline at end of file
#!/usr/bin/env python
#-*- coding:utf-8 -*-
##
## data.py
##
## Created on: Sep 20, 2017
## Author: Alexey Ignatiev, Nina Narodytska
## E-mail: aignatiev@ciencias.ulisboa.pt, narodytska@vmware.com
##
#
#==============================================================================
from __future__ import print_function
import collections
import itertools
import os, pickle
import six
from six.moves import range
import numpy as np
#
#==============================================================================
class Data(object):
    """
        Class for representing data (transactions).

        A data set is read from a separator-delimited text source whose
        first line holds the column names; the last column is treated as
        the class column and gets no feature-value mapping.
    """

    def __init__(self, filename=None, fpointer=None, mapfile=None,
            separator=',', use_categorical = False):
        """
            Constructor and parser.

            `filename` takes precedence over `fpointer` (an already opened
            text stream). `mapfile`, when given, is read by
            read_orig_values(). With `use_categorical`, extra information is
            loaded from the pickle file `filename + ".pkl"`; if that fails a
            message is printed and SystemExit(1) is raised.
        """

        self.names = None
        self.nm2id = None
        self.samps = None
        self.wghts = None
        self.feats = None
        self.fvmap = None
        self.ovmap = {}
        self.fvars = None
        self.fname = filename
        self.mname = mapfile
        self.deleted = set([])

        if filename:
            with open(filename, 'r') as fp:
                self.parse(fp, separator)
        elif fpointer:
            self.parse(fpointer, separator)

        if self.mname:
            self.read_orig_values()

        # check if we have extra info about categorical_features
        if use_categorical:
            self._load_categorical_info(filename + ".pkl")

    def _load_categorical_info(self, extra_file):
        """
            Load categorical feature/class names from the pickle `extra_file`.

            Sets categorical_features, categorical_names, class_names and
            categorical_onehot_names; every name is converted to str with
            the "b'" prefix of bytes reprs replaced by "'".
        """

        try:
            # SECURITY: pickle.load runs code embedded in the file; the
            # side-car file must come from a trusted source.
            # The context manager closes the handle on every path; the
            # previous handler called close() on a name that is unbound
            # when open() itself fails.
            with open(extra_file, "rb") as f:
                print("Attempt: loading extra data from ", extra_file)
                extra_info = pickle.load(f)
                print("loaded")

            self.categorical_features = extra_info["categorical_features"]
            self.categorical_names = extra_info["categorical_names"]
            self.class_names = extra_info["class_names"]
            # shallow copy taken before the names are cleaned below
            self.categorical_onehot_names = extra_info["categorical_names"].copy()

            for i, name in enumerate(self.class_names):
                self.class_names[i] = str(name).replace("b'", "'")

            # only existing keys are reassigned, so iterating items() is safe
            for key, raw_names in self.categorical_names.items():
                self.categorical_names[key] = [
                    str(name).replace("b'", "'") for name in raw_names]
        except Exception as e:
            print("Please provide info about categorical features or omit option -c", e)
            raise SystemExit(1)

    def parse(self, fp, separator):
        """
            Parse input file.

            Fills names, nm2id, nonbin2bin, feats, samps, wghts and fvmap
            from the text stream `fp`, splitting each line on `separator`.
        """

        # reading data set from file
        lines = fp.readlines()

        # reading preamble
        self.names = lines[0].strip().split(separator)
        self.feats = [set([]) for n in self.names]
        del(lines[0])

        # filling name to id mapping
        self.nm2id = {name: i for i, name in enumerate(self.names)}

        # group column names by the part before their last ':'
        self.nonbin2bin = {}
        for name in self.nm2id:
            spl = name.rsplit(':', 1)
            self.nonbin2bin.setdefault(spl[0], []).append(name)

        # reading training samples; identical lines are merged and their
        # multiplicity is kept as the sample weight
        self.samps, self.wghts = [], []
        for line, w in collections.Counter(lines).items():
            sample = line.strip().split(separator)
            for i, f in enumerate(sample):
                if f:
                    self.feats[i].add(f)
            self.samps.append(sample)
            self.wghts.append(w)

        # direct and opposite mappings for items
        FVMap = collections.namedtuple('FVMap', ['dir', 'opp'])
        self.fvmap = FVMap(dir={}, opp={})

        # mapping features to ids (the last column is the class: skipped)
        for i in range(len(self.names) - 1):
            feats = sorted(list(self.feats[i]), reverse=True)
            if not feats:
                # column without any observed value: nothing to map
                continue
            if len(feats) > 2:
                # more than two values: each value maps to itself
                for l in feats:
                    self.fvmap.dir[(self.names[i], l)] = l
            else:
                # at most two values: the larger maps to 1, the other to 0
                self.fvmap.dir[(self.names[i], feats[0])] = 1
                if len(feats) == 2:
                    self.fvmap.dir[(self.names[i], feats[1])] = 0

        # opposite mapping (later keys overwrite earlier ones that share
        # the same mapped value)
        for key, val in self.fvmap.dir.items():
            self.fvmap.opp[val] = key

        # determining feature variables (excluding class variables)
        for v, pair in self.fvmap.opp.items():
            if pair[0] == self.names[-1]:
                self.fvars = v - 1
                break

    def read_orig_values(self):
        """
            Read original values for all the features.
            (from a separate CSV file)

            Each line of the map file has the form "feat:val,bits"; bit i
            (1-based) refers to the column named "feat:b<i>".
        """

        self.ovmap = {}

        # the context manager closes the map file once it has been read
        with open(self.mname, 'r') as mapfp:
            for line in mapfp:
                featval, bits = line.strip().split(',')
                feat, val = featval.split(':')

                for i, b in enumerate(bits):
                    f = '{0}:b{1}'.format(feat, i + 1)
                    v = self.fvmap.dir[(f, '1')]

                    if v not in self.ovmap:
                        self.ovmap[v] = [feat]

                    if -v not in self.ovmap:
                        self.ovmap[-v] = [feat]

                    self.ovmap[v if b == '1' else -v].append(val)
#!/usr/bin/env python
#-*- coding:utf-8 -*-
##
## options.py
##
## Created on: Dec 7, 2018
## Author: Alexey Ignatiev, Nina Narodytska
## E-mail: aignatiev@ciencias.ulisboa.pt, narodytska@vmware.com
##
#
#==============================================================================
from __future__ import print_function
import getopt
import math
import os
import sys
#
#==============================================================================
class Options(object):
    """
        Class for representing command-line options.
    """

    def __init__(self, command):
        """
            Constructor.

            Sets every option to its default and, when `command` (an
            argv-style list whose first item is the program name) is
            non-empty, parses it.
        """

        # kept so that usage() works even when parse() was never called
        self.command = command

        # actions
        self.train = False
        self.encode = 'none'
        self.explain = ''
        self.xtype = 'abd'
        self.use_categorical = False

        # training options
        self.accmin = 0.95
        self.n_estimators = 100
        self.maxdepth = 3
        self.testsplit = 0.2
        self.seed = 7

        # other options
        self.files = None
        self.output = 'Classifiers'
        self.mapfile = None
        self.separator = ','
        self.smallest = False
        self.solver = 'g3'
        self.verb = 0

        if command:
            self.parse(command)

    def parse(self, command):
        """
            Parser.

            Unknown options print the error and the usage text, then exit
            with status 1. Remaining positional arguments are stored in
            self.files.
        """

        self.command = command

        # NOTE(review): '-c' / '--use-categorical' are declared as taking an
        # argument although the handler ignores it; left unchanged so that
        # existing command lines keep working. '-a' / '--accmin' is handled
        # below but not declared here, so it cannot currently be passed.
        try:
            opts, args = getopt.getopt(command[1:],
                                       'e:hc:d:Mn:o:s:tvx:X:',
                                       ['encode=', 'help', 'use-categorical=',
                                        'maxdepth=', 'minimum', 'nbestims=',
                                        'output=', 'seed=', 'sep=', 'solver=',
                                        'testsplit=', 'train', 'verbose',
                                        'explain=', 'xtype='])
        except getopt.GetoptError as err:
            sys.stderr.write(str(err).capitalize() + '\n')
            self.usage()
            sys.exit(1)

        for opt, arg in opts:
            if opt in ('-a', '--accmin'):
                self.accmin = float(arg)
            elif opt in ('-c', '--use-categorical'):
                self.use_categorical = True
            elif opt in ('-d', '--maxdepth'):
                self.maxdepth = int(arg)
            elif opt in ('-e', '--encode'):
                self.encode = str(arg)
            elif opt in ('-h', '--help'):
                self.usage()
                sys.exit(0)
            elif opt in ('-M', '--minimum'):
                self.smallest = True
            elif opt in ('-n', '--nbestims'):
                self.n_estimators = int(arg)
            elif opt in ('-o', '--output'):
                self.output = str(arg)
            elif opt == '--seed':
                self.seed = int(arg)
            elif opt == '--sep':
                self.separator = str(arg)
            elif opt in ('-s', '--solver'):
                self.solver = str(arg)
            elif opt == '--testsplit':
                self.testsplit = float(arg)
            elif opt in ('-t', '--train'):
                self.train = True
            elif opt in ('-v', '--verbose'):
                # repeatable: each occurrence raises the verbosity by one
                self.verb += 1
            elif opt in ('-x', '--explain'):
                self.explain = str(arg)
            elif opt in ('-X', '--xtype'):
                self.xtype = str(arg)
            else:
                assert False, 'Unhandled option: {0} {1}'.format(opt, arg)

        if self.encode == 'none':
            self.encode = None

        self.files = args

    def usage(self):
        """
            Print usage message.
        """

        # fall back to the interpreter's argv when no command was parsed
        prog = self.command[0] if self.command else sys.argv[0]

        print('Usage: ' + os.path.basename(prog) + ' [options] input-file')
        print('Options:')
        #print(' -a, --accmin=<float> Minimal accuracy')
        #print(' Available values: [0.0, 1.0] (default = 0.95)')
        #print(' -c, --use-categorical Treat categorical features as categorical (with categorical features info if available)')
        print(' -d, --maxdepth=<int> Maximal depth of a tree')
        print(' Available values: [1, INT_MAX] (default = 3)')
        #print(' -e, --encode=<smt> Encode a previously trained model')
        #print(' Available values: sat, maxsat, none (default = none)')
        print(' -h, --help Show this message')
        #print(' -m, --map-file=<string> Path to a file containing a mapping to original feature values. (default: none)')
        #print(' -M, --minimum Compute a smallest size explanation (instead of a subset-minimal one)')
        print(' -n, --nbestims=<int> Number of trees in the ensemble')
        print(' Available values: [1, INT_MAX] (default = 100)')
        # the documented default now matches self.output set in __init__
        print(' -o, --output=<string> Directory where output files will be stored (default: \'Classifiers\')')
        print(' --seed=<int> Seed for random splitting')
        print(' Available values: [1, INT_MAX] (default = 7)')
        print(' --sep=<string> Field separator used in input file (default = \',\')')
        print(' -s, --solver=<string> A SAT oracle to use')
        print(' Available values: glucose3, minisat (default = g3)')
        print(' -t, --train Train a model of a given dataset')
        print(' --testsplit=<float> Training and test sets split')
        print(' Available values: [0.0, 1.0] (default = 0.2)')
        print(' -v, --verbose Increase verbosity level')
        print(' -x, --explain=<string> Explain a decision for a given comma-separated sample (default: none)')
        print(' -X, --xtype=<string> Type of explanation to compute: abductive or contrastive')
This diff is collapsed.
#from .tree import *
from .rndmforest import *
from .xforest import *
\ No newline at end of file
from sklearn.ensemble._voting import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import sys
import os
import resource
import collections
from itertools import combinations
from six.moves import range
import six
import math
#
#==============================================================================
class VotingRF(VotingClassifier):
    """
        Majority rule classifier
    """

    def fit(self, X, y, sample_weight=None):
        """
            Register the estimators passed to the constructor without
            refitting them, and fit the label encoder on `y`.

            `X` and `sample_weight` are accepted for signature compatibility
            only; they are not used. Returns self, following the
            scikit-learn estimator convention.
        """

        # the (name, estimator) pairs are used as they are
        self.estimators_ = [est for _, est in self.estimators]

        self.le_ = LabelEncoder().fit(y)
        self.classes_ = self.le_.classes_

        return self

    def predict(self, X):
        """Predict class labels for X.
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.
        Returns
        -------
        maj : array-like of shape (n_samples,)
            Predicted class labels.
        """

        #check_is_fitted(self)

        # 'hard' voting
        # _predict is inherited; presumably it yields one row per sample
        # and one column per estimator -- confirm against the installed
        # scikit-learn version
        predictions = self._predict(X)
        # bincount needs integers, hence the cast
        predictions = np.asarray(predictions, np.int64) #NEED TO BE CHECKED
        maj = np.apply_along_axis(
            lambda x: np.argmax(
                np.bincount(x, weights=self._weights_not_none)),
            axis=1, arr=predictions)

        # map the winning indices back to the original labels
        maj = self.le_.inverse_transform(maj)

        return maj
#
#==============================================================================
class RF2001(object):
    """
        The main class to train Random Forest Classifier (RFC).
    """

    def __init__(self, **options):
        """
            Constructor.

            Expects the keyword options 'n_trees' (number of trees) and
            'depth' (maximal tree depth). The forest uses the entropy
            criterion and a fixed random state.
        """

        self.forest = None
        self.voting = None

        param_dist = {'n_estimators': options['n_trees'],
                      'max_depth': options['depth'],
                      'criterion': 'entropy',
                      'random_state': 324089}

        self.forest = RandomForestClassifier(**param_dist)

    def fit(self, X_train, y_train):
        """
            building Breiman'01 Random Forest
            (similar to train(dataset) fnc)

            Fits the forest, then wraps its trees in a VotingRF used by
            predict(). Returns self.
        """

        self.forest.fit(X_train, y_train)
        rtrees = [('dt', dt) for dt in self.forest.estimators_]
        self.voting = VotingRF(estimators=rtrees)
        self.voting.fit(X_train, y_train)

        return self

    def train(self, dataset, verb=0):
        """
            Train a random forest.

            The train/test split and the feature transformation are
            delegated to `dataset`. Returns the pair
            (train accuracy, test accuracy).
        """

        X_train, X_test, y_train, y_test = dataset.train_test_split()

        X_train = dataset.transform(X_train)
        X_test = dataset.transform(X_test)

        print("Build a random forest.")
        # same fitting steps as fit(); shared to avoid duplicating them
        self.fit(X_train, y_train)

        train_acc = accuracy_score(self.predict(X_train), y_train)
        test_acc = accuracy_score(self.predict(X_test), y_test)

        if verb > 1:
            # NOTE(review): print_acc_vote / print_acc_prob are not defined
            # in this class as shown; calling train() with verb > 1 raises
            # AttributeError unless a subclass provides them.
            self.print_acc_vote(X_train, X_test, y_train, y_test)
            self.print_acc_prob(X_train, X_test, y_train, y_test)

        return train_acc, test_acc

    def predict(self, X):
        """
            Predict labels for X by majority vote over the trees.
        """

        return self.voting.predict(X)

    def predict_prob(self, X):
        """
            Return the class probabilities the forest assigns to X.

            The result used to be computed with predict() and then dropped,
            so callers always received None.
        """

        return self.forest.predict_proba(X)

    def estimators(self):
        """
            Return the fitted trees of the forest.
        """

        assert(self.forest.estimators_ is not None)
        return self.forest.estimators_

    def n_estimators(self):
        """
            Return the number of trees configured for the forest.
        """

        return self.forest.n_estimators

    def print_accuracy(self, X_test, y_test):
        """
            Print the accuracy of predict() on the given test data.
        """

        test_acc = accuracy_score(self.predict(X_test), y_test)
        print("c Model accuracy: {0:.2f}".format(100. * test_acc))
\ No newline at end of file
#
#==============================================================================
from anytree import Node, RenderTree,AsciiStyle
import json
import numpy as np
import math
import os
#
#==============================================================================
class dt_node(Node):
    """
        Tree node carrying the data of one decision-tree node: its id, the
        ids of its two children, the tested feature and threshold, and
        (for leaves) the stored value.
    """

    def __init__(self, id, parent = None):
        super(dt_node, self).__init__(id, parent)

        self.id = id  # The node value
        # replaces the name set by the base constructor
        self.name = None

        self.feature = -1
        self.threshold = None
        self.values = -1

        self.left_node_id = -1  # Left child
        self.right_node_id = -1  # Right child

    def __str__(self):
        # one indentation unit per level of depth
        indent = ' ' * self.depth

        if not len(self.children):
            return indent + "leaf: {} {}".format(self.id, self.values)

        if self.name is not None:
            return indent + "{} \"{}\"<{}".format(self.id, self.name, self.threshold)

        return indent + "{} f{}<{}".format(self.id, self.feature, self.threshold)
#==============================================================================
def build_tree(tree_, feature_names = None):
    """
        Convert an array-based tree into a linked tree of dt_node objects.

        `tree_` must expose node_count, feature, threshold, value,
        children_left and children_right, all indexed by node id, with
        node 0 as the root. A node is a leaf when its left and right child
        ids are equal. `feature_names`, when given, is indexed by feature
        id to name the internal nodes.

        Returns the root dt_node. Raises AssertionError on an empty tree
        or on a child id outside the node range.
    """

    m = tree_.node_count
    # checked first: everything below indexes node 0
    assert (m > 0), "Empty tree"

    feature = tree_.feature
    threshold = tree_.threshold
    values = tree_.value
    children_left = tree_.children_left
    children_right = tree_.children_right

    def extract_data(idx, root = None):
        # build the node `idx` under `root`, then recurse into its children
        assert (idx < m), "Error index node"

        node = dt_node(idx, parent = root)

        left_id, right_id = children_left[idx], children_right[idx]
        if left_id == right_id:
            # leaf: keep the index of the largest entry of its value array
            node.values = np.argmax(values[idx])
        else:
            node.feature = feature[idx]
            if feature_names is not None:
                node.name = feature_names[feature[idx]]
            node.threshold = threshold[idx]
            node.left_node_id = left_id
            node.right_node_id = right_id
            extract_data(left_id, node)   # feat < threshold ( < 0.5 False)
            extract_data(right_id, node)  # feat >= threshold ( >= 0.5 True)

        return node

    return extract_data(0)
#==============================================================================
def walk_tree(node):
    """
        Print `node`, then its first and second child subtrees, in
        pre-order.
    """

    print(node)

    if not len(node.children):
        # leaf
        return

    for side in (0, 1):
        walk_tree(node.children[side])
def count_nodes(root):
    """
        Return the number of nodes in the tree rooted at `root`, the root
        included.
    """

    total = 0
    pending = [root]
    while pending:
        current = pending.pop()
        total += 1
        pending.extend(current.children)
    return total
#
#==============================================================================
def predict_tree(node, sample):
    """
        Route `sample` from `node` down to a leaf and return the value
        stored at that leaf.

        At each internal node the sample entry at index node.feature is
        compared with node.threshold: strictly smaller goes to the first
        child, anything else to the second.
    """

    current = node
    while len(current.children) != 0:
        observed = sample[current.feature]
        assert(observed is not None)
        goes_left = observed < current.threshold
        current = current.children[0] if goes_left else current.children[1]

    # leaf
    return current.values
#
#==============================================================================
class Forest:
    """ An ensemble of decision trees.
    This object provides a common interface to many different types of models.
    """

    def __init__(self, rf, feature_names = None):
        """
            Build one linked tree per estimator returned by rf.estimators().

            Sets trees (the converted trees), sz (total node count) and
            md (largest max_depth among the estimators), and prints the
            smallest and largest per-tree node counts.
        """

        # fetched once; every statistic below is derived from this list
        estimators = rf.estimators()

        self.trees = [build_tree(dt.tree_, feature_names) for dt in estimators]

        nb_nodes = [dt.tree_.node_count for dt in estimators]
        self.sz = sum(nb_nodes)
        self.md = max([dt.tree_.max_depth for dt in estimators])

        print("min: {0} | max: {1}".format(min(nb_nodes), max(nb_nodes)))
        # sanity check: the conversion must preserve every node
        assert(nb_nodes == [count_nodes(dt) for dt in self.trees])

    def print_trees(self):
        """
            Print every tree of the ensemble, preceded by its index.
        """

        for i, t in enumerate(self.trees):
            print("tree number: ", i)
            walk_tree(t)

    def predict_inst(self, inst):
        """
            Return the majority vote of the trees for one instance.
        """

        scores = np.asarray([predict_tree(dt, inst) for dt in self.trees])
        return np.argmax(np.bincount(scores))

    def predict(self, samples):
        """
            Return the majority vote of the trees for each row of `samples`.

            An empty batch yields an empty integer array.
        """

        print("#Trees: ", len(self.trees))

        votes = [[predict_tree(t, sample) for t in self.trees]
                 for sample in np.asarray(samples)]
        if not votes:
            # apply_along_axis(axis=1) cannot handle a 1-D empty array
            return np.asarray([], dtype=np.intp)

        predictions = np.asarray(votes)
        maj = np.apply_along_axis(lambda x: np.argmax(np.bincount(x)), axis=1, arr=predictions)

        return maj
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment