diff --git a/data_retriever.json b/data_retriever.json
index 5ef4f56851cd35ba4dfc85f6aea5860c14b5d05e..c85d663bd1b15502c50c4ebe766f77282b3770b5 100644
--- a/data_retriever.json
+++ b/data_retriever.json
@@ -22,7 +22,7 @@
         "ml_type" : "RandomForest",
         "component" : "RandomForestComponent",
         "solvers" : ["LIME", "ANCHOR", "SHAP"],
-        "xtypes" : {"H": "Heuristic", "HV": "Heuristic and validation", "G": "Global"}
+        "xtypes" : {"S": "Smallest", "NS": "Not smallest"}
     }
 ]
diff --git a/pages/application/DecisionTree/DecisionTreeComponent.py b/pages/application/DecisionTree/DecisionTreeComponent.py
index edb89a4d4c0a863ee9cfe80c6a35b28745038eca..23dc3172ae0156c80d1268bf37513f9c666a3608 100644
--- a/pages/application/DecisionTree/DecisionTreeComponent.py
+++ b/pages/application/DecisionTree/DecisionTreeComponent.py
@@ -11,16 +11,16 @@ from pages.application.DecisionTree.utils.dtviz import (visualize,
 from pages.application.DecisionTree.utils.upload_tree import UploadedDecisionTree
 
 
-class DecisionTreeComponent():
+class DecisionTreeComponent:
 
-    def __init__(self, tree, type_tree='SKL', info=None, type_info=''):
+    def __init__(self, tree, info=None, type_info=''):
 
         if info is not None and '.csv' in type_info:
             self.categorical = True
             data = Data(info)
             fvmap = data.mapping_features()
             feature_names = data.names[:-1]
-            self.uploaded_dt = UploadedDecisionTree(tree, type_tree, maxdepth=tree.get_depth(),
+            self.uploaded_dt = UploadedDecisionTree(tree, "SKL", maxdepth=tree.get_depth(),
                                                     feature_names=feature_names, nb_classes=tree.n_classes_)
             self.dt_format, self.map, features_names_mapping = self.uploaded_dt.dump(fvmap, feat_names=feature_names)
@@ -38,7 +38,7 @@ class DecisionTreeComponent():
             dom = sorted(dom)
             for j, v in enumerate(dom):
                 fvmap[f'f{i}'][j] = (fid, True, v)
-            self.uploaded_dt = UploadedDecisionTree(tree, type_tree, maxdepth=tree.get_depth(),
+            self.uploaded_dt = UploadedDecisionTree(tree, "SKL", maxdepth=tree.get_depth(),
                                                     feature_names=feature_names, nb_classes=tree.n_classes_)
             self.dt_format, self.map, features_names_mapping = self.uploaded_dt.dump(fvmap, feat_names=feature_names)
@@ -48,7 +48,7 @@ class DecisionTreeComponent():
                 feature_names = tree.feature_names_in_
             except:
                 feature_names = [f'f{i}' for i in range(tree.n_features_in_)]
-            self.uploaded_dt = UploadedDecisionTree(tree, type_tree, maxdepth=tree.get_depth(),
+            self.uploaded_dt = UploadedDecisionTree(tree, "SKL", maxdepth=tree.get_depth(),
                                                     feature_names=feature_names, nb_classes=tree.n_classes_)
             self.dt_format, self.map, features_names_mapping = self.uploaded_dt.convert_dt(feat_names=feature_names)
diff --git a/pages/application/NaiveBayes/NaiveBayesComponent.py b/pages/application/NaiveBayes/NaiveBayesComponent.py
index 98c7848a12a2e33bc270c8d30a0caab823f10c05..1d0d2efbbb1e6657e1fc40920e2371e557ffbd86 100644
--- a/pages/application/NaiveBayes/NaiveBayesComponent.py
+++ b/pages/application/NaiveBayes/NaiveBayesComponent.py
@@ -11,7 +11,7 @@ import shlex
 
 class NaiveBayesComponent():
 
-    def __init__(self, model, type_model='SKL', info=None, type_info=''):
+    def __init__(self, model, info=None, type_info=''):
 
         #Conversion model
         p=subprocess.Popen(['perl','pages/application/NaiveBayes/utils/cnbc2xlc.pl', model],stdout=subprocess.PIPE)
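Note on the xtypes change above: the UI now offers a smallest ("S") versus not-smallest ("NS") abductive explanation instead of the old heuristic/validation/global modes, and the RandomForestComponent below maps that code onto the explainer's boolean flag. A minimal sketch of the mapping — the dictionary mirrors data_retriever.json, the helper name is illustrative:

    # Sketch: mapping the UI's xtype code onto the explainer's boolean flag.
    # XTYPES mirrors data_retriever.json; xtype_to_smallest is illustrative glue.
    XTYPES = {"S": "Smallest", "NS": "Not smallest"}

    def xtype_to_smallest(xtype):
        # "S" asks for a cardinality-minimal explanation (compute_smallest);
        # anything else falls back to a subset-minimal one (compute_minimal).
        return xtype == "S"

    assert xtype_to_smallest("S") and not xtype_to_smallest("NS")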
diff --git a/pages/application/RandomForest/RandomForestComponent.py b/pages/application/RandomForest/RandomForestComponent.py
index 50dd5d332e9c3b3e1e66ed57007f10415152d221..25f27d5e666a38a711f32ce18a3ff8ec5d83ae3e 100644
--- a/pages/application/RandomForest/RandomForestComponent.py
+++ b/pages/application/RandomForest/RandomForestComponent.py
@@ -1,83 +1,83 @@
-import base64
+from dash import html
+from io import StringIO
+import pandas as pd
-import dash_bootstrap_components as dbc
-import numpy as np
-from dash import dcc, html
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble._voting import VotingClassifier
+from pages.application.RandomForest.utils import xrf
+from pages.application.RandomForest.utils.xrf.rndmforest import XRF, RF2001, VotingRF, Dataset
+from xgboost import XGBClassifier, XGBRFClassifier
-from pages.application.RandomForest.utils.data import Data
 from pages.application.RandomForest.utils.anchor_wrap import anchor_call
 from pages.application.RandomForest.utils.lime_wrap import lime_call
 from pages.application.RandomForest.utils.shap_wrap import shap_call
-from pages.application.RandomForest.utils.xgbooster import XGBooster, preprocess_dataset
-from pages.application.RandomForest.utils.xgbrf import XGBRandomForest
+from pages.application.RandomForest.utils.xgbooster import XGBooster
+from pages.application.RandomForest.utils.xgbrf import XGBRandomForest
 
 
-class RandomForestComponent:
-    def __init__(self, model, type_model='SKL', info=None, type_info=''):
+#################################################################
+##################### Questions #################################
+##### Can upload sklearn model or need to translate to xrf ######
+# separate XGBRandomForest and XGBooster                        #
+# data format: there is an odd data file to deal with here     #
+# not well suited; get_dump? many classes, a class diagram is needed #
 
-        if info is not None and '.csv' in type_info:
-            self.data = Data(info)
-            # Conversion model
-            if type_model == "RF":
-                self.random_forest = XGBRandomForest(info, from_model=model)
-            else:
-                self.random_forest = XGBooster(info, from_model=model)
+class RandomForestComponent:
 
-            # self.random_forest.encode(test_on=info)
+    def __init__(self, model, info=None, type_info=''):
 
-        self.map_file = ""
+        # Conversion model
+        options = {}
+        if info is not None and '.csv' in type_info:
+            self.data = Dataset(info)
+            self.data.mapping_features()
+            options["n_classes"] = self.data.num_class
+            options["feature_names"] = self.data.feature_names
+            options["n_features"] = self.data.nb_features
+        if isinstance(model, RandomForestClassifier) or isinstance(model, VotingClassifier) or isinstance(model,
+                                                                                                          xrf.rndmforest.RF2001):
+            self.random_forest = XRF(model, self.data)
+        elif isinstance(model, XGBRFClassifier):
+            self.random_forest = XGBRandomForest(options, from_model=model)
+        elif isinstance(model, XGBClassifier):
+            self.random_forest = XGBooster(options, from_model=model)
 
         self.network = html.Div([])
         self.explanation = []
 
-    def update_with_explicability(self, instance, enum_feats=None, validation=None, xtype=None, solver=None, ):
+    def update_with_explicability(self, instance, enum_feats=None, xtype=None, solver=None):
 
         # Call explanation
         if not enum_feats and self.data is not None:
-            enum_feats = len(self.data.names) - 1
+            enum_feats = self.data.nb_features - 1
 
-        expl = self.random_forest.explain(instance,
-                                          use_lime=lime_call if solver == "lime" else None,
-                                          use_anchor=anchor_call if solver == "anchor" else None,
-                                          use_shap=shap_call if solver == "shap" else None,
-                                          nof_feats=enum_feats)
-
-        if validation:
-            coex = self.random_forest.validate(instance, expl)
-            if coex:
-                # repairing the local explanation
-                gexpl = self.random_forest.explain(instance, expl_ext=expl, prefer_ext=True)
-            else:
-                # an attempt to refine the local explanation further
-                gexpl = self.random_forest.explain(instance, expl_ext=expl)
-
-        print(expl)
+        smallest = xtype == "S"
+        if isinstance(self.random_forest, XRF):
+            explanation_result = self.random_forest.explain(instance)
+        else:
+            explanation_result = self.random_forest.explain(instance, smallest, solver,
+                                                            use_lime=lime_call if solver == "LIME" else None,
+                                                            use_anchor=anchor_call if solver == "ANCHOR" else None,
+                                                            use_shap=shap_call if solver == "SHAP" else None,
+                                                            nof_feats=enum_feats)
 
         self.explanation = []
         list_explanations_path = []
-        explanation = {}
         self.network = html.Div([])
 
         # Creating a clean and nice text component
-        # instance plotting
-        self.explanation.append(html.H4("Instance : \n"))
-        self.explanation.append(html.P(str([str(instance[i]) for i in range(len(instance))])))
-        for k in explanation.keys():
-            if k != "List of path explanation(s)" and k != "List of path contrastive explanation(s)":
-                if k in ["List of abductive explanation(s)", "List of contrastive explanation(s)"]:
-                    self.explanation.append(html.H4(k))
-                    for expl in explanation[k]:
-                        self.explanation.append(html.Hr())
-                        self.explanation.append(html.P(expl))
-                    self.explanation.append(html.Hr())
-                else:
-                    self.explanation.append(html.P(k + explanation[k]))
-            else:
-                list_explanations_path = explanation["List of path explanation(s)"]
-                list_contrastive_explanations_path = explanation["List of path contrastive explanation(s)"]
-
-        return list_explanations_path, list_contrastive_explanations_path
+        compt = 0
+        for sample_expl in explanation_result:
+            compt += 1
+            self.explanation.append(html.H4("Sample {0}:\n".format(compt)))
+            for k in sample_expl.keys():
+                self.explanation.append(html.H5(k))
+                self.explanation.append(html.Hr())
+                self.explanation.append(html.P(sample_expl[k]))
+                self.explanation.append(html.Hr())
+
+        return list_explanations_path, []
diff --git a/pages/application/RandomForest/utils/data.py b/pages/application/RandomForest/utils/data.py
index 6c94e3da69d365d8270afa00f5f8fa7db1506ab7..d86e8ecbf6360b8959a17b3282cb8f7303b0dc51 100644
--- a/pages/application/RandomForest/utils/data.py
+++ b/pages/application/RandomForest/utils/data.py
@@ -26,8 +26,7 @@ class Data(object):
         Class for representing data (transactions).
     """
 
-    def __init__(self, filename=None, fpointer=None, mapfile=None,
-                 separator=' ', use_categorical = False):
+    def __init__(self, file=None, mapfile=None, separator=',', use_categorical = False):
         """
             Constructor and parser.
         """
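With the constructor change above, Data no longer opens files itself: callers pass the raw text of the dataset, and the default separator becomes a comma (parse() splits the string on newlines, as the next hunk shows). A usage sketch — the file name and its columns are hypothetical:

    # Sketch: feeding Data with raw CSV text (hypothetical file and columns).
    with open("adult.csv") as f:
        content = f.read()

    data = Data(file=content, separator=',')
    feature_names = data.names[:-1]  # the last column holds the class label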
""" @@ -40,55 +39,21 @@ class Data(object): self.fvmap = None self.ovmap = {} self.fvars = None - self.fname = filename self.mname = mapfile self.deleted = set([]) - if filename: - with open(filename, 'r') as fp: - self.parse(fp, separator) - elif fpointer: - self.parse(fpointer, separator) + self.parse(file, separator) if self.mname: self.read_orig_values() - # check if we have extra info about categorical_features - - if (use_categorical): - extra_file = filename+".pkl" - try: - f = open(extra_file, "rb") - print("Attempt: loading extra data from ", extra_file) - extra_info = pickle.load(f) - print("loaded") - f.close() - self.categorical_features = extra_info["categorical_features"] - self.categorical_names = extra_info["categorical_names"] - self.class_names = extra_info["class_names"] - self.categorical_onehot_names = extra_info["categorical_names"].copy() - - for i, name in enumerate(self.class_names): - self.class_names[i] = str(name).replace("b'","'") - for c in self.categorical_names.items(): - clean_feature_names = [] - for i, name in enumerate(c[1]): - name = str(name).replace("b'","'") - clean_feature_names.append(name) - self.categorical_names[c[0]] = clean_feature_names - - except Exception as e: - f.close() - print("Please provide info about categorical features or omit option -c", e) - exit() - def parse(self, fp, separator): """ Parse input file. """ # reading data set from file - lines = fp.readlines() + lines = fp.split('\n') # reading preamble self.names = lines[0].strip().split(separator) diff --git a/pages/application/RandomForest/utils/xgbooster/encode.py b/pages/application/RandomForest/utils/xgbooster/encode.py index 6a77fb3afb792f0cd15276c11e03b4b4005f5109..bf98a6b94477d7cad42b84716783ed364ef83ae7 100644 --- a/pages/application/RandomForest/utils/xgbooster/encode.py +++ b/pages/application/RandomForest/utils/xgbooster/encode.py @@ -42,7 +42,6 @@ class SMTEncoder(object): self.feats = {f: i for i, f in enumerate(feats)} self.nofcl = nof_classes self.idmgr = IDPool() - self.optns = xgb.options # xgbooster will also be needed self.xgb = xgb @@ -165,8 +164,8 @@ class SMTEncoder(object): # if targeting interval-based encoding, # traverse all trees and extract all possible intervals # for each feature - if self.optns.encode == 'smtbool': - self.compute_intervals() + # if self.optns.encode == 'smtbool': + self.compute_intervals() # traversing and encoding each tree for i, tree in enumerate(self.ensemble.trees): @@ -203,13 +202,9 @@ class SMTEncoder(object): # number of variables nof_vars = len(self.enc.get_free_variables()) - if self.optns.verb: - print('encoding vars:', nof_vars) - print('encoding asserts:', nof_asserts) - return self.enc, self.intvs, self.imaps, self.ivars - def test_sample(self, sample): + def test_sample(self, sample, solver=None): """ Check whether or not the encoding "predicts" the same class as the classifier given an input sample. 
@@ -221,9 +216,6 @@
         # score arrays computed for each class
         csum = [[] for c in range(self.nofcl)]
 
-        if self.optns.verb:
-            print('testing sample:', list(sample))
-
         sample_internal = list(self.xgb.transform(sample)[0])
 
         # traversing all trees
@@ -274,7 +266,7 @@
 
         # now, getting the model
         escores = []
-        model = get_model(And(self.enc, *hypos), solver_name=self.optns.solver)
+        model = get_model(And(self.enc, *hypos), solver_name=solver)
         for c in range(self.nofcl):
             v = Symbol('class{0}_score'.format(c), typename=REAL)
             escores.append(float(model.get_py_value(v)))
@@ -282,10 +274,6 @@
         assert all(map(lambda c, e: abs(c - e) <= 0.001, cscores, escores)), \
                 'wrong prediction: {0} vs {1}'.format(cscores, escores)
 
-        if self.optns.verb:
-            print('xgb scores:', cscores)
-            print('enc scores:', escores)
-
     def save_to(self, outfile):
         """
             Save the encoding into a file with a given name.
diff --git a/pages/application/RandomForest/utils/xgbooster/explain.py b/pages/application/RandomForest/utils/xgbooster/explain.py
index ca078749949fa2ad3fa3d01cb080653965effdf5..a4e5f8b1e107d51263a616ce5c84588807bdce61 100644
--- a/pages/application/RandomForest/utils/xgbooster/explain.py
+++ b/pages/application/RandomForest/utils/xgbooster/explain.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-#-*- coding:utf-8 -*-
+# -*- coding:utf-8 -*-
 ##
 ## explain.py
 ##
@@ -9,7 +9,7 @@
 ##
 
 #
-#==============================================================================
+# ==============================================================================
 from __future__ import print_function
 import numpy as np
 import os
@@ -23,6 +23,7 @@ from six.moves import range
 import sys
 
 
+
 #
 #==============================================================================
 class SMTExplainer(object):
@@ -31,7 +32,7 @@ class SMTExplainer(object):
     """
 
     def __init__(self, formula, intvs, imaps, ivars, feats, nof_classes,
-                 options, xgb):
+                 solver, xgb):
         """
             Constructor.
         """
@@ -41,14 +42,12 @@ class SMTExplainer(object):
         self.imaps = imaps
         self.ivars = ivars
         self.nofcl = nof_classes
-        self.optns = options
         self.idmgr = IDPool()
 
         # saving XGBooster
         self.xgb = xgb
 
-        self.verbose = self.optns.verb
-        self.oracle = Solver(name=options.solver)
+        self.oracle = Solver(name=solver)
 
         self.inps = []  # input (feature value) variables
         for f in self.xgb.extended_feature_names_as_array_strings:
@@ -150,28 +149,26 @@ class SMTExplainer(object):
                 disj.append(GT(self.outs[i], self.outs[self.out_id]))
             self.oracle.add_assertion(Implies(self.selv, Or(disj)))
 
-        if self.verbose:
-            inpvals = self.xgb.readable_sample(sample)
+        inpvals = self.xgb.readable_sample(sample)
 
-            self.preamble = []
-            for f, v in zip(self.xgb.feature_names, inpvals):
-                if f not in str(v):
-                    self.preamble.append('{0} = {1}'.format(f, v))
-                else:
-                    self.preamble.append(v)
+        self.preamble = []
+        for f, v in zip(self.xgb.feature_names, inpvals):
+            if f not in str(v):
+                self.preamble.append('{0} = {1}'.format(f, v))
+            else:
+                self.preamble.append(v)
 
-            print('  explaining: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.output))
+        explanation_dic = {}
+        explanation_dic["explaining instance"] = ' explaining: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.output)
+        return explanation_dic
 
     def explain(self, sample, smallest, expl_ext=None, prefer_ext=False):
         """
             Hypotheses minimization.
         """
 
-        self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
-                resource.getrusage(resource.RUSAGE_SELF).ru_utime
-
         # adapt the solver to deal with the current sample
-        self.prepare(sample)
+        explanation_dic = self.prepare(sample)
 
         # saving external explanation to be minimized further
         if expl_ext == None or prefer_ext:
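prepare() and explain() now collect their former print output into a dictionary that the Dash component renders key by key. A sketch of the shape explain() hands back — the keys are the ones introduced in this patch, the values are invented for illustration:

    # Sketch: shape of the dict SMTExplainer.explain() now returns
    # (keys from this patch, values invented for illustration).
    explanation_dic = {
        "explaining instance": ' explaining: "IF f0 = 1 AND f1 = 0 THEN class1"',
        "raw explanation": [0],
        "explanation": ' explanation: "IF f0 = 1 THEN class1"',
        "hypotheses left": ' # hypos left: 1',
    }
    for k, v in explanation_dic.items():
        print(k, "->", v)

On the unsatisfiable branch (see the next hunk) the dict instead carries a "no implication" entry holding the oracle's model.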
""" - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - # adapt the solver to deal with the current sample - self.prepare(sample) + explanation_dic = self.prepare(sample) # saving external explanation to be minimized further if expl_ext == None or prefer_ext: @@ -182,27 +179,20 @@ class SMTExplainer(object): # if satisfiable, then the observation is not implied by the hypotheses if self.oracle.solve([self.selv] + [h for h, c in zip(self.rhypos, self.to_consider) if c]): - print(' no implication!') - print(self.oracle.get_model()) - sys.exit(1) - - if not smallest: - self.compute_minimal(prefer_ext=prefer_ext) - else: - self.compute_smallest() - - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.time - - expl = sorted([self.sel2fid[h] for h in self.rhypos]) + explanation_dic["no implication"] = self.oracle.get_model() + else : + if not smallest: + self.compute_minimal(prefer_ext=prefer_ext) + else: + self.compute_smallest() - if self.verbose: + expl = sorted([self.sel2fid[h] for h in self.rhypos]) + explanation_dic["explanation brute "] = expl self.preamble = [self.preamble[i] for i in expl] - print(' explanation: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.xgb.target_name[self.out_id])) - print(' # hypos left:', len(self.rhypos)) - print(' time: {0:.2f}'.format(self.time)) + explanation_dic["explanation"] = ' explanation: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.xgb.target_name[self.out_id]) + explanation_dic["Hyphothesis left"] = ' # hypos left:' + str(len(self.rhypos)) - return expl + return explanation_dic def compute_minimal(self, prefer_ext=False): """ @@ -258,10 +248,6 @@ class SMTExplainer(object): hset = hitman.get() iters += 1 - if self.verbose > 1: - print('iter:', iters) - print('cand:', hset) - if self.oracle.solve([self.selv] + [self.rhypos[i] for i in hset]): to_hit = [] satisfied, unsatisfied = [], [] @@ -302,10 +288,7 @@ class SMTExplainer(object): else: to_hit.append(h) - if self.verbose > 1: - print('coex:', to_hit) - hitman.hit(to_hit) else: self.rhypos = [self.rhypos[i] for i in hset] - break + break \ No newline at end of file diff --git a/pages/application/RandomForest/utils/xgbooster/xgbooster.py b/pages/application/RandomForest/utils/xgbooster/xgbooster.py index 25cd86ce6de6ec5e5f0836344354afa7f7b87d26..b7e62b449357fccc267d08442bd6e2fcc7388f3e 100644 --- a/pages/application/RandomForest/utils/xgbooster/xgbooster.py +++ b/pages/application/RandomForest/utils/xgbooster/xgbooster.py @@ -11,24 +11,16 @@ # # ============================================================================== from __future__ import print_function -from .validate import SMTValidator -from .encode import SMTEncoder -from .explain import SMTExplainer -import numpy as np -import os -import resource -from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score -import sklearn -# print('The scikit-learn version is {}.'.format(sklearn.__version__)) -from sklearn.preprocessing import OneHotEncoder, LabelEncoder -import sys +import random + +import numpy as np from six.moves import range -from .tree import TreeEnsemble -import xgboost as xgb -from xgboost import XGBClassifier, Booster -import pickle +from xgboost import XGBClassifier + +from .encode import SMTEncoder +from .explain import SMTExplainer +from .validate import SMTValidator # @@ -38,164 +30,20 @@ class 
@@ -38,164 +30,20 @@ class XGBooster(object):
         The main class to train/encode/explain XGBoost models.
     """
 
-    def __init__(self, options, from_data=None, from_model=None,
-                 from_encoding=None):
+    def __init__(self, options, from_model=None):
         """
             Constructor.
         """
+        np.random.seed(random.randint(1, 100))
 
-        assert from_data or from_model or from_encoding, \
-                'At least one input file should be specified'
-
-        self.init_stime = resource.getrusage(resource.RUSAGE_SELF).ru_utime
-        self.init_ctime = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime
-
-        # saving command-line options
-        self.options = options
-        self.seed = self.options.seed
-        np.random.seed(self.seed)
-
-        if from_data:
-            self.use_categorical = self.options.use_categorical
-            # saving data
-            self.data = from_data
-            ##
-            samps = np.asarray(self.data.samps)
-            if not all(c.isnumeric() for c in samps[:, -1]):
-                le = LabelEncoder()
-                le.fit(samps[:, -1])
-                samps[:, -1] = le.transform(samps[:, -1])
-                # self.class_names = le.classes_
-                # print(le.classes_)
-            ##
-            dataset = np.asarray(samps, dtype=np.float32)
-            # dataset = np.asarray(self.data.samps, dtype=np.float32)
-
-            # split data into X and y
-            self.feature_names = self.data.names[:-1]
-            self.nb_features = len(self.feature_names)
-
-            self.X = dataset[:, 0:self.nb_features]
-            self.Y = dataset[:, self.nb_features]
-            self.num_class = len(set(self.Y))
-            self.target_name = list(range(self.num_class))
-
-            param_dist = {'n_estimators': self.options.n_estimators,
-                          'max_depth': self.options.maxdepth}
-
-            if (self.num_class == 2):
-                param_dist['objective'] = 'binary:logistic'
-
-            self.model = XGBClassifier(**param_dist)
-
-            # split data into train and test sets
-            self.test_size = self.options.testsplit
-            if (self.test_size > 0):
-                self.X_train, self.X_test, self.Y_train, self.Y_test = \
-                        train_test_split(self.X, self.Y, test_size=self.test_size,
-                                         random_state=self.seed)
-            else:
-                self.X_train = self.X
-                self.X_test = []  # need a fix
-                self.Y_train = self.Y
-                self.Y_test = []  # need a fix
-
-            # check if we have info about categorical features
-            if (self.use_categorical):
-                self.categorical_features = from_data.categorical_features
-                self.categorical_names = from_data.categorical_names
-                self.target_name = from_data.class_names
-
-                ####################################
-                # this is a set of checks to make sure that we use the same as anchor encoding
-                cat_names = sorted(self.categorical_names.keys())
-                assert (cat_names == self.categorical_features)
-                self.encoder = {}
-                for i in self.categorical_features:
-                    self.encoder.update({i: OneHotEncoder(categories='auto', sparse=False)})  # ,
-                    self.encoder[i].fit(self.X[:, [i]])
-
-            else:
-                self.categorical_features = []
-                self.categorical_names = []
-                self.encoder = []
-
-            fname = from_data
-
-        elif from_model:
-            fname = from_model
-            self.load_datainfo(from_model)
-            if (self.use_categorical is False) and (self.options.use_categorical is True):
-                print(
-                    "Error: Note that the model is trained without categorical features info. Please do not use -c option for predictions")
-                exit()
-            # load model
-
-        elif from_encoding:
-            fname = from_encoding
-
-            # encoding, feature names, and number of classes
-            # are read from an input file
-            enc = SMTEncoder(None, None, None, self, from_encoding)
-            self.enc, self.intvs, self.imaps, self.ivars, self.feature_names, \
-                    self.num_class = enc.access()
-
-        # create extra file names
-        try:
-            os.stat(options.output)
-        except:
-            os.mkdir(options.output)
-
-        self.mapping_features()
-        #################
-        self.test_encoding_transformes()
-
-        bench_name = os.path.splitext(os.path.basename(options.files[0]))[0]
-        bench_dir_name = options.output + "/bt/" + bench_name
-        try:
-            os.stat(bench_dir_name)
-        except:
-            os.mkdir(bench_dir_name)
-
-        self.basename = (os.path.join(bench_dir_name, bench_name +
-                        "_nbestim_" + str(options.n_estimators) +
-                        "_maxdepth_" + str(options.maxdepth) +
-                        "_testsplit_" + str(options.testsplit)))
-
-        data_suffix = '.splitdata.pkl'
-        self.modfile = self.basename + '.mod.pkl'
-
-        self.mod_plainfile = self.basename + '.mod.txt'
-
-        self.resfile = self.basename + '.res.txt'
-        self.encfile = self.basename + '.enc.txt'
-        self.expfile = self.basename + '.exp.txt'
-
-    def load_datainfo(self, model_from_pkl, data_from_pkl):
         self.model = XGBClassifier()
-        self.model = model_from_pkl
-        loaded_data = data_from_pkl
-        self.X = loaded_data["X"]
-        self.Y = loaded_data["Y"]
-        self.X_train = loaded_data["X_train"]
-        self.X_test = loaded_data["X_test"]
-        self.Y_train = loaded_data["Y_train"]
-        self.Y_test = loaded_data["Y_test"]
-        self.feature_names = loaded_data["feature_names"]
-        self.target_name = loaded_data["target_name"]
-        self.num_class = loaded_data["num_class"]
-        self.nb_features = len(self.feature_names)
-        self.categorical_features = loaded_data["categorical_features"]
-        self.categorical_names = loaded_data["categorical_names"]
-        self.encoder = loaded_data["encoder"]
-        self.use_categorical = loaded_data["use_categorical"]
+        self.model = from_model
 
-    def train(self, outfile=None):
-        """
-            Train a tree ensemble using XGBoost.
- """ + self.feature_names = options["feature_names"] + self.num_class = options["n_classes"] + self.nb_features = options["n_features"] - return self.build_xgbtree(outfile) + self.mapping_features() def encode(self, test_on=None): """ @@ -208,9 +56,7 @@ class XGBooster(object): if test_on: encoder.test_sample(np.array(test_on)) - # encoder.save_to(self.encfile) - - def explain(self, sample, use_lime=None, use_anchor=None, use_shap=None, + def explain(self, sample, smallest, solver, use_lime=None, use_anchor=None, use_shap=None, expl_ext=None, prefer_ext=False, nof_feats=5): """ Explain a prediction made for a given sample with a previously @@ -229,12 +75,10 @@ class XGBooster(object): if 'x' not in dir(self): self.x = SMTExplainer(self.enc, self.intvs, self.imaps, self.ivars, self.feature_names, self.num_class, - self.options, self) + solver, self) - expl = self.x.explain(np.array(sample), self.options.smallest, - expl_ext, prefer_ext) + expl = self.x.explain(np.array(sample), smallest, expl_ext, prefer_ext) - # returning the explanation return expl def validate(self, sample, expl): @@ -255,191 +99,12 @@ class XGBooster(object): # try to compute a counterexample return self.v.validate(np.array(sample), expl) - def transform(self, x): - if (len(x) == 0): - return x - if (len(x.shape) == 1): - x = np.expand_dims(x, axis=0) - if (self.use_categorical): - assert (self.encoder != []) - tx = [] - for i in range(self.nb_features): - self.encoder[i].drop = None - if (i in self.categorical_features): - tx_aux = self.encoder[i].transform(x[:, [i]]) - tx_aux = np.vstack(tx_aux) - tx.append(tx_aux) - else: - tx.append(x[:, [i]]) - tx = np.hstack(tx) - return tx - else: - return x - - def transform_inverse(self, x): - if (len(x) == 0): - return x - if (len(x.shape) == 1): - x = np.expand_dims(x, axis=0) - if (self.use_categorical): - assert (self.encoder != []) - inverse_x = [] - for i, xi in enumerate(x): - inverse_xi = np.zeros(self.nb_features) - for f in range(self.nb_features): - if f in self.categorical_features: - nb_values = len(self.categorical_names[f]) - v = xi[:nb_values] - v = np.expand_dims(v, axis=0) - iv = self.encoder[f].inverse_transform(v) - inverse_xi[f] = iv - xi = xi[nb_values:] - - else: - inverse_xi[f] = xi[0] - xi = xi[1:] - inverse_x.append(inverse_xi) - return inverse_x - else: - return x - - def transform_inverse_by_index(self, idx): - if (idx in self.extended_feature_names): - return self.extended_feature_names[idx] - else: - print("Warning there is no feature {} in the internal mapping".format(idx)) - return None - - def transform_by_value(self, feat_value_pair): - if (feat_value_pair in self.extended_feature_names.values()): - keys = ( - list(self.extended_feature_names.keys())[list(self.extended_feature_names.values()).index(feat_value_pair)]) - return keys - else: - print("Warning there is no value {} in the internal mapping".format(feat_value_pair)) - return None - def mapping_features(self): self.extended_feature_names = {} self.extended_feature_names_as_array_strings = [] counter = 0 - if (self.use_categorical): - for i in range(self.nb_features): - if (i in self.categorical_features): - for j, _ in enumerate(self.encoder[i].categories_[0]): - self.extended_feature_names.update({counter: (self.feature_names[i], j)}) - self.extended_feature_names_as_array_strings.append( - "f{}_{}".format(i, j)) # str(self.feature_names[i]), j)) - counter = counter + 1 - else: - self.extended_feature_names.update({counter: (self.feature_names[i], None)}) - 
-                    self.extended_feature_names_as_array_strings.append("f{}".format(i))  # (self.feature_names[i])
-                    counter = counter + 1
-        else:
-            for i in range(self.nb_features):
-                self.extended_feature_names.update({counter: (self.feature_names[i], None)})
-                self.extended_feature_names_as_array_strings.append("f{}".format(i))  # (self.feature_names[i])
-                counter = counter + 1
-
-    def readable_sample(self, x):
-        readable_x = []
-        for i, v in enumerate(x):
-            if (i in self.categorical_features):
-                readable_x.append(self.categorical_names[i][int(v)])
-            else:
-                readable_x.append(v)
-        return np.asarray(readable_x)
-
-    def test_encoding_transformes(self):
-        # test encoding
-
-        X = self.X_train[[0], :]
-
-        print("Sample of length", len(X[0]), " : ", X)
-        enc_X = self.transform(X)
-        print("Encoded sample of length", len(enc_X[0]), " : ", enc_X)
-        inv_X = self.transform_inverse(enc_X)
-        print("Back to sample", inv_X)
-        print("Readable sample", self.readable_sample(inv_X[0]))
-        assert ((inv_X == X).all())
-
-        if (self.options.verb > 1):
-            for i in range(len(self.extended_feature_names)):
-                print(i, self.transform_inverse_by_index(i))
-            for key, value in self.extended_feature_names.items():
-                print(value, self.transform_by_value(value))
-
-    def transfomed_sample_info(self, i):
-        print(enc.categories_)
-
-    def build_xgbtree(self, outfile=None):
-        """
-            Build an ensemble of trees.
-        """
-
-        if (outfile is None):
-            outfile = self.modfile
-        else:
-            self.datafile = sefl.form_datefile_name(outfile)
-
-        # fit model no training data
-
-        if (len(self.X_test) > 0):
-            eval_set = [(self.transform(self.X_train), self.Y_train), (self.transform(self.X_test), self.Y_test)]
-        else:
-            eval_set = [(self.transform(self.X_train), self.Y_train)]
-
-        print("start xgb")
-        self.model.fit(self.transform(self.X_train), self.Y_train,
-                       eval_set=eval_set,
-                       verbose=self.options.verb)  # eval_set=[(X_test, Y_test)],
-        print("end xgb")
-
-        evals_result = self.model.evals_result()
-        ########## saving model
-        self.save_datainfo(outfile)
-        print("saving plain model to ", self.mod_plainfile)
-        self.model._Booster.dump_model(self.mod_plainfile)
-
-        ensemble = TreeEnsemble(self.model, self.extended_feature_names_as_array_strings, nb_classes=self.num_class)
-
-        y_pred_prob = self.model.predict_proba(self.transform(self.X_train[:10]))
-        y_pred_prob_compute = ensemble.predict(self.transform(self.X_train[:10]), self.num_class)
-
-        assert (np.absolute(y_pred_prob_compute - y_pred_prob).sum() < 0.01 * len(y_pred_prob))
-
-        ### accuracy
-        try:
-            train_accuracy = round(1 - evals_result['validation_0']['merror'][-1], 2)
-        except:
-            try:
-                train_accuracy = round(1 - evals_result['validation_0']['error'][-1], 2)
-            except:
-                assert (False)
-
-        try:
-            test_accuracy = round(1 - evals_result['validation_1']['merror'][-1], 2)
-        except:
-            try:
-                test_accuracy = round(1 - evals_result['validation_1']['error'][-1], 2)
-            except:
-                print("no results test data")
-                test_accuracy = 0
-
-        #### saving
-        print("saving results to ", self.resfile)
-        with open(self.resfile, 'w') as f:
-            f.write("{} & {} & {} &{} &{} & {} \\\\ \n \hline \n".format(
-                os.path.basename(self.options.files[0]).replace("_", "-"),
-                train_accuracy,
-                test_accuracy,
-                self.options.n_estimators,
-                self.options.maxdepth,
-                self.options.testsplit))
-            f.close()
-
-        print("c BT sz:", ensemble.sz)
-        print("Train accuracy: %.2f%%" % (train_accuracy * 100.0))
-        print("Test accuracy: %.2f%%" % (test_accuracy * 100.0))
-
-        return train_accuracy, test_accuracy, self.model
+        for i in range(self.nb_features):
+            self.extended_feature_names.update({counter: (self.feature_names[i], None)})
+            self.extended_feature_names_as_array_strings.append("f{}".format(i))  # (self.feature_names[i])
+            counter = counter + 1
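XGBooster (and XGBRandomForest below) no longer read command-line options: they expect a plain dict holding the three keys that RandomForestComponent fills in. A hedged construction sketch — the toy data, the solver name, and the explicit sample are placeholders, not part of the patch:

    # Sketch: the options dict handed to XGBooster / XGBRandomForest.
    # Toy data; "z3" assumes a z3 installation visible to pysmt.
    import numpy as np
    from xgboost import XGBClassifier

    X = np.array([[0, 1, 0], [1, 0, 1], [0, 0, 1], [1, 1, 0]], dtype=np.float32)
    y = np.array([0, 1, 0, 1])
    model = XGBClassifier(n_estimators=4, max_depth=3).fit(X, y)

    options = {"n_classes": 2,
               "feature_names": ["f0", "f1", "f2"],
               "n_features": 3}
    booster = XGBooster(options, from_model=model)
    booster.encode()  # build the SMT encoding before asking for explanations
    expl = booster.explain(X[0], smallest=True, solver="z3")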
diff --git a/pages/application/RandomForest/utils/xgbrf/encode.py b/pages/application/RandomForest/utils/xgbrf/encode.py
index 6a77fb3afb792f0cd15276c11e03b4b4005f5109..78816a8e5e5723ff99979cd918a9b4b4b27f113c 100644
--- a/pages/application/RandomForest/utils/xgbrf/encode.py
+++ b/pages/application/RandomForest/utils/xgbrf/encode.py
@@ -39,10 +39,10 @@ class SMTEncoder(object):
         """
 
         self.model = model
+        self.features_names = feats
         self.feats = {f: i for i, f in enumerate(feats)}
         self.nofcl = nof_classes
         self.idmgr = IDPool()
-        self.optns = xgb.options
 
         # xgbooster will also be needed
         self.xgb = xgb
@@ -165,8 +165,8 @@ class SMTEncoder(object):
         # if targeting interval-based encoding,
         # traverse all trees and extract all possible intervals
         # for each feature
-        if self.optns.encode == 'smtbool':
-            self.compute_intervals()
+        # if self.optns.encode == 'smtbool':
+        self.compute_intervals()
 
         # traversing and encoding each tree
         for i, tree in enumerate(self.ensemble.trees):
@@ -203,13 +203,9 @@ class SMTEncoder(object):
         # number of variables
         nof_vars = len(self.enc.get_free_variables())
 
-        if self.optns.verb:
-            print('encoding vars:', nof_vars)
-            print('encoding asserts:', nof_asserts)
-
         return self.enc, self.intvs, self.imaps, self.ivars
 
-    def test_sample(self, sample):
+    def test_sample(self, sample, solver):
         """
             Check whether or not the encoding "predicts" the same class
             as the classifier given an input sample.
@@ -221,9 +217,6 @@ class SMTEncoder(object):
         # score arrays computed for each class
         csum = [[] for c in range(self.nofcl)]
 
-        if self.optns.verb:
-            print('testing sample:', list(sample))
-
         sample_internal = list(self.xgb.transform(sample)[0])
 
         # traversing all trees
@@ -274,7 +267,7 @@ class SMTEncoder(object):
 
         # now, getting the model
         escores = []
-        model = get_model(And(self.enc, *hypos), solver_name=self.optns.solver)
+        model = get_model(And(self.enc, *hypos), solver_name=solver)
         for c in range(self.nofcl):
             v = Symbol('class{0}_score'.format(c), typename=REAL)
             escores.append(float(model.get_py_value(v)))
@@ -282,10 +275,6 @@ class SMTEncoder(object):
         assert all(map(lambda c, e: abs(c - e) <= 0.001, cscores, escores)), \
                 'wrong prediction: {0} vs {1}'.format(cscores, escores)
 
-        if self.optns.verb:
-            print('xgb scores:', cscores)
-            print('enc scores:', escores)
-
     def save_to(self, outfile):
         """
             Save the encoding into a file with a given name.
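tree.py below follows from the same de-optioning: TreeEnsemble now passes the caller's feature names down to get_xgboost_json instead of reading them off the booster. A hedged sketch of the construction — `booster` stands for any trained xgboost model exposing get_dump(), and the names/sample are placeholders:

    # Sketch: TreeEnsemble with explicit feature names (placeholders).
    ensemble = TreeEnsemble(booster, feature_names=["f0", "f1"], nb_classes=2)
    probs = ensemble.predict(np.array([[0.5, 1.0]]), nb_classes=2)  # shape (1, 2)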
diff --git a/pages/application/RandomForest/utils/xgbrf/tree.py b/pages/application/RandomForest/utils/xgbrf/tree.py
index afe34d97e331057f9a5b7c0ccef63e85801a572d..6c7f7538508ac5c08d695d5caaab8ee12b9ce245 100644
--- a/pages/application/RandomForest/utils/xgbrf/tree.py
+++ b/pages/application/RandomForest/utils/xgbrf/tree.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-#-*- coding:utf-8 -*-
+# -*- coding:utf-8 -*-
 ##
 ## tree.py (reuses parts of the code of SHAP)
 ##
@@ -9,8 +9,8 @@
 ##
 
 #
-#==============================================================================
-from anytree import Node, RenderTree,AsciiStyle
+# ==============================================================================
+from anytree import Node, RenderTree, AsciiStyle
 import json
 import numpy as np
 import xgboost as xgb
@@ -18,13 +18,13 @@ import math
 
 
 #
-#==============================================================================
+# ==============================================================================
 class xgnode(Node):
-    def __init__(self, id, parent = None):
+    def __init__(self, id, parent=None):
         Node.__init__(self, id, parent)
         self.id = id  # The node value
         self.name = None
-        self.left_node_id = -1  # Left child
+        self.left_node_id = -1  # Left child
         self.right_node_id = -1  # Right child
 
         self.missing_node_id = -1
@@ -37,29 +37,31 @@ class xgnode(Node):
     def __str__(self):
         pref = ' ' * self.depth
         if (len(self.children) == 0):
-            return (pref+ "leaf: {}  {}".format(self.id, self.values))
+            return (pref + "leaf: {}  {}".format(self.id, self.values))
         else:
-            if(self.name is None):
-                return (pref+ "{} f{}<{}".format(self.id, self.feature, self.threshold))
+            if (self.name is None):
+                return (pref + "{} f{}<{}".format(self.id, self.feature, self.threshold))
             else:
-                return (pref+ "{} \"{}\"<{}".format(self.id, self.name, self.threshold))
+                return (pref + "{} \"{}\"<{}".format(self.id, self.name, self.threshold))
 
 
 #
-#==============================================================================
-def build_tree(json_tree, node = None, feature_names = None, inverse = False):
+# ==============================================================================
+def build_tree(json_tree, node=None, feature_names=None, inverse=False):
     def max_id(node):
         if "children" in node:
             return max(node["nodeid"], *[max_id(n) for n in node["children"]])
         else:
            return node["nodeid"]
+
     m = max_id(json_tree) + 1
-    def extract_data(json_node, root = None, feature_names = None):
+
+    def extract_data(json_node, root=None, feature_names=None):
         i = json_node["nodeid"]
         if (root is None):
             node = xgnode(i)
         else:
-            node = xgnode(i, parent = root)
+            node = xgnode(i, parent=root)
         node.cover = json_node["cover"]
         if "children" in json_node:
@@ -73,8 +75,8 @@ def build_tree(json_tree, node = None, feature_names = None, inverse = False):
             for c, n in enumerate(json_node["children"]):
                 child = extract_data(n, node, feature_names)
         elif "leaf" in json_node:
-            node.values = json_node["leaf"]
-            if(inverse):
+            node.values = json_node["leaf"]
+            if (inverse):
                 node.values = -node.values
         return node
 
@@ -83,7 +85,7 @@ def build_tree(json_tree, node = None, feature_names = None, inverse = False):
 
 
 #
-#==============================================================================
+# ==============================================================================
 def walk_tree(node):
     if (len(node.children) == 0):
         # leaf
@@ -95,7 +97,7 @@ def walk_tree(node):
 
 
 #
-#==============================================================================
+# ==============================================================================
 def scores_tree(node, sample):
     if (len(node.children) == 0):
         # leaf
@@ -103,41 +105,43 @@ def scores_tree(node, sample):
     else:
         feature_branch = node.feature
         sample_value = sample[feature_branch]
-        assert(sample_value is not None)
-        if(sample_value < node.threshold):
+        assert (sample_value is not None)
+        if (sample_value < node.threshold):
             return scores_tree(node.children[0], sample)
         else:
             return scores_tree(node.children[1], sample)
 
 
 #
-#==============================================================================
+# ==============================================================================
 class TreeEnsemble:
     """ An ensemble of decision trees.
 
     This object provides a common interface to many different types of models.
     """
-    def __init__(self, model, feature_names = None, nb_classes = 0):
+
+    def __init__(self, model, feature_names=None, nb_classes=0):
         self.model_type = "xgboost"
-        #self.original_model = model.get_booster()
+        # self.original_model = model.get_booster()
         self.original_model = model
         ####
         self.base_offset = None
-        json_trees = get_xgboost_json(self.original_model)
+        json_trees = get_xgboost_json(self.original_model, feature_names)
         self.trees = [build_tree(json.loads(t), None, feature_names) for t in json_trees]
-        if(nb_classes == 2):
+        if (nb_classes == 2):
             # NASTY trick for binary
             # We change signs of values in leaves so that we can just sum all the values in leaves for class X
             # and take max to get the right class
-            self.otrees = [build_tree(json.loads(t), None, feature_names, inverse = True) for t in json_trees]
+            self.otrees = [build_tree(json.loads(t), None, feature_names, inverse=True) for t in json_trees]
             self.itrees = [build_tree(json.loads(t), None, feature_names) for t in json_trees]
             self.trees = []
-            for i,_ in enumerate(self.otrees):
+            for i, _ in enumerate(self.otrees):
                 self.trees.append(self.otrees[i])
                 self.trees.append(self.itrees[i])
         self.feature_names = feature_names
+
     def print_tree(self):
-        for i,t in enumerate(self.trees):
+        for i, t in enumerate(self.trees):
             print("tree number: ", i)
             walk_tree(t)
 
@@ -149,13 +153,14 @@ class TreeEnsemble:
             self.invert_tree_prob(node.children[0])
             self.invert_tree_prob(node.children[1])
         return node
+
     def predict(self, samples, nb_classes):
         # https://github.com/dmlc/xgboost/issues/1746#issuecomment-290130695
         prob = []
-        nb_estimators = int(len(self.trees)/nb_classes)
+        nb_estimators = int(len(self.trees) / nb_classes)
         for sample in np.asarray(samples):
             scores = []
-            for i,t in enumerate(self.trees):
+            for i, t in enumerate(self.trees):
                 s = scores_tree(t, sample)
                 scores.append((s))
             scores = np.asarray(scores)
@@ -163,30 +168,31 @@ class TreeEnsemble:
 
             if (nb_classes == 2):
                 for i in range(nb_classes):
-                    class_scores.append(math.exp(-(scores[i::nb_classes]).sum())) # swap signs back as we had to use this trick in the contractor
-                s0 = class_scores[0]
-                s1 = class_scores[1]
-                v0 = 1/(1 + s0)
-                v1 = 1/(1 + s1)
+                    class_scores.append(math.exp(-(
+                        scores[i::nb_classes]).sum()))  # swap signs back as we had to use this trick in the contractor
+                s0 = class_scores[0]
+                s1 = class_scores[1]
+                v0 = 1 / (1 + s0)
+                v1 = 1 / (1 + s1)
                 class_scores[0] = v0
                 class_scores[1] = v1
             else:
-                for i in range(0,nb_classes*nb_estimators,nb_estimators):
-                    class_scores.append(math.exp((scores[i:i+nb_estimators]).sum()))
-                #for i in range(nb_classes):
+                for i in range(0, nb_classes * nb_estimators, nb_estimators):
+                    class_scores.append(math.exp((scores[i:i +
+                                                          nb_estimators]).sum()))
+                # for i in range(nb_classes):
                 #    class_scores.append(math.exp((scores[i::nb_classes]).sum()))
             class_scores = np.asarray(class_scores)
-            prob.append(class_scores/class_scores.sum())
+            prob.append(class_scores / class_scores.sum())
 
         return np.asarray(prob).reshape((-1, nb_classes))
 
 
 #
-#==============================================================================
-def get_xgboost_json(model):
+# ==============================================================================
+def get_xgboost_json(model, feature_names):
     """ REUSED FROM SHAP
         This gets a JSON dump of an XGBoost model while ensuring the feature names are their indexes.
     """
-    fnames = model.feature_names
+    fnames = feature_names
     model.feature_names = None
     json_trees = model.get_dump(with_stats=True, dump_format="json")
     model.feature_names = fnames
diff --git a/pages/application/RandomForest/utils/xgbrf/xgb_rf.py b/pages/application/RandomForest/utils/xgbrf/xgb_rf.py
index 024225fe8cf140a3eaaeef5f8dfc653c85db8d0f..840def741bb12263a980354ad1f6449ff18a9a97 100644
--- a/pages/application/RandomForest/utils/xgbrf/xgb_rf.py
+++ b/pages/application/RandomForest/utils/xgbrf/xgb_rf.py
@@ -10,6 +10,10 @@
 #
 #==============================================================================
+from __future__ import print_function
+
+import random
+
 from .validate import SMTValidator
 from .pi_checker import SMTChecker
 from .encode import SMTEncoder
@@ -39,222 +43,22 @@ class XGBRandomForest(object):
         The main class to train/encode/explain Random Forest models.
     """
 
-    def __init__(self, options, from_data=None, from_model=None,
-                 from_encoding=None):
+    def __init__(self, options, from_model=None):
         """
             Constructor.
         """
+        self.extended_feature_names = None
+        self.extended_feature_names_as_array_strings = None
+        np.random.seed(random.randint(1, 100))
 
-        assert from_data or from_model or from_encoding, \
-                'At least one input file should be specified'
-
-        self.init_stime = resource.getrusage(resource.RUSAGE_SELF).ru_utime
-        self.init_ctime = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime
-
-        # saving command-line options
-        self.options = options
-        self.seed = 42
-        np.random.seed(self.seed)
-
-        if from_data:
-            self.use_categorical = self.options.use_categorical
-            # saving data
-            self.data = from_data
-            dataset = np.asarray(self.data.samps, dtype=np.float32)
-
-
-            # split data into X and y
-            self.feature_names = self.data.names[:-1]
-            self.nb_features = len(self.feature_names)
-
-            self.X = dataset[:, 0:self.nb_features]
-            self.Y = dataset[:, self.nb_features]
-            self.num_class = len(set(self.Y))
-            self.target_name = list(range(self.num_class))
-
-            param_dist = {'n_estimators':self.options.n_estimators,
-                          'max_depth':self.options.maxdepth}
-
-            self.params = { 'num_parallel_tree': self.options.n_estimators,
-                        'max_depth': self.options.maxdepth,
-                        'colsample_bynode': 0.8, 'subsample': 0.8,
-                        'learning_rate': 1, 'random_state': self.seed,
-                        'verbosity' : self.options.verb
-                        }
-
-            if(self.num_class == 2):
-                self.params['eval_metric'] = 'error'
-                self.params['objective'] = 'binary:logistic'
-            else:
-                self.params['eval_metric'] = 'merror'
-                self.params['num_class'] = self.num_class
-                self.params['objective'] = 'multi:softprob'
-
-            if(self.num_class == 2):
-                param_dist['objective'] = 'binary:logistic'
-
-            #self.model = XGBRFClassifier(**param_dist)
-            self.model = None
-
-            # split data into train and test sets
-            self.test_size = self.options.testsplit
-            if (self.test_size > 0):
-                self.X_train, self.X_test, self.Y_train, self.Y_test = \
-                        train_test_split(self.X, self.Y, test_size=self.test_size,
-                                         random_state=self.seed)
-            else:
-                self.X_train = self.X
-                self.X_test = []  # need a fix
-                self.Y_train = self.Y
-                self.Y_test = []  # need a fix
-
-            # check if we have info about categorical features
-            if (self.use_categorical):
-                self.categorical_features = from_data.categorical_features
-                self.categorical_names = from_data.categorical_names
-                self.target_name = from_data.class_names
-
-                ####################################
-                # this is a set of checks to make sure that we use the same as anchor encoding
-                cat_names = sorted(self.categorical_names.keys())
-                assert(cat_names == self.categorical_features)
-                self.encoder = {}
-                for i in self.categorical_features:
-                    self.encoder.update({i: OneHotEncoder(categories='auto', sparse=False)})#,
-                    self.encoder[i].fit(self.X[:,[i]])
-
-            else:
-                self.categorical_features = []
-                self.categorical_names = []
-                self.encoder = []
-
-            fname = from_data
-
-        elif from_model:
-            fname = from_model
-            self.load_datainfo(from_model)
-            if (self.use_categorical is False) and (self.options.use_categorical is True):
-                print("Error: Note that the model is trained without categorical features info. Please do not use -c option for predictions")
-                exit()
-            # load model
-
-        elif from_encoding:
-            fname = from_encoding
-
-            # encoding, feature names, and number of classes
-            # are read from an input file
-            enc = SMTEncoder(None, None, None, self, from_encoding)
-            self.enc, self.intvs, self.imaps, self.ivars, self.feature_names, \
-                    self.num_class = enc.access()
-
-        # create extra file names
-        try:
-            os.stat(options.output)
-        except:
-            os.mkdir(options.output)
-
-        self.mapping_features()
-        #################
-        self.test_encoding_transformes()
-
-        bench_name = os.path.splitext(os.path.basename(options.files[0]))[0]
-        bench_dir_name = options.output + "/" + bench_name
-        try:
-            os.stat(bench_dir_name)
-        except:
-            os.mkdir(bench_dir_name)
-
-        self.basename = (os.path.join(bench_dir_name, bench_name +
-                        "_nbestim_" + str(options.n_estimators) +
-                        "_maxdepth_" + str(options.maxdepth) +
-                        "_testsplit_" + str(options.testsplit)))
-
-        data_suffix = '.splitdata.pkl'
-        self.modfile = self.basename + '.mod.pkl'
-
-        self.mod_plainfile = self.basename + '.mod.txt'
-
-        self.resfile = self.basename + '.res.txt'
-        self.encfile = self.basename + '.enc.txt'
-        self.expfile = self.basename + '.exp.txt'
-
-    def form_datefile_name(self, modfile):
-        data_suffix = '.splitdata.pkl'
-        return modfile + data_suffix
-
-    def pickle_save_file(self, filename, data):
-        try:
-            f = open(filename, "wb")
-            pickle.dump(data, f)
-            f.close()
-        except:
-            print("Cannot save to file", filename)
-            exit()
-
-    def pickle_load_file(self, filename):
-        try:
-            f = open(filename, "rb")
-            data = pickle.load(f)
-            f.close()
-            return data
-        except:
-            print("Cannot load from file", filename)
-            exit()
-
-    def save_datainfo(self, filename):
-
-        print("saving model to ", filename)
-        self.pickle_save_file(filename, self.model)
-
-        filename_data = self.form_datefile_name(filename)
-        print("saving data to ", filename_data)
-        samples = {}
-        samples["X"] = self.X
-        samples["Y"] = self.Y
-        samples["X_train"] = self.X_train
-        samples["Y_train"] = self.Y_train
-        samples["X_test"] = self.X_test
-        samples["Y_test"] = self.Y_test
-        samples["feature_names"] = self.feature_names
-        samples["target_name"] = self.target_name
-        samples["num_class"] = self.num_class
-        samples["categorical_features"] = self.categorical_features
-        samples["categorical_names"] = self.categorical_names
samples["encoder"] = self.encoder - samples["use_categorical"] = self.use_categorical - - - self.pickle_save_file(filename_data, samples) - - def load_datainfo(self, filename): - print("loading model from ", filename) self.model = XGBRFClassifier() - self.model = self.pickle_load_file(filename) - - datafile = self.form_datefile_name(filename) - print("loading data from ", datafile) - loaded_data = self.pickle_load_file(datafile) - self.X = loaded_data["X"] - self.Y = loaded_data["Y"] - self.X_train = loaded_data["X_train"] - self.X_test = loaded_data["X_test"] - self.Y_train = loaded_data["Y_train"] - self.Y_test = loaded_data["Y_test"] - self.feature_names = loaded_data["feature_names"] - self.target_name = loaded_data["target_name"] - self.num_class = loaded_data["num_class"] - self.nb_features = len(self.feature_names) - self.categorical_features = loaded_data["categorical_features"] - self.categorical_names = loaded_data["categorical_names"] - self.encoder = loaded_data["encoder"] - self.use_categorical = loaded_data["use_categorical"] + self.model = from_model - def train(self, outfile=None): - """ - Train a random forest using XGBoost. - """ + self.feature_names = options["feature_names"] + self.num_class = options["n_classes"] + self.nb_features = options["n_features"] - return self.build_xgbtree(outfile) + self.mapping_features() def encode(self, test_on=None): """ @@ -266,32 +70,27 @@ class XGBRandomForest(object): if test_on: encoder.test_sample(np.array(test_on)) - encoder.save_to(self.encfile) - - def explain(self, sample, use_lime=None, use_anchor=None, use_shap=None, - expl_ext=None, prefer_ext=False, nof_feats=5): + def explain(self, sample, smallest, solver, use_lime=None, use_anchor=None, use_shap=None, + expl_ext=None, prefer_ext=False, nof_feats=5): """ Explain a prediction made for a given sample with a previously trained tree ensemble. """ if use_lime: - expl = use_lime(self, sample=sample, nb_samples=5, nb_features_in_exp=nof_feats) + expl = use_lime(self, sample=sample, nb_samples=5, + nb_features_in_exp=nof_feats) elif use_anchor: expl = use_anchor(self, sample=sample, nb_samples=5, - nb_features_in_exp=nof_feats, threshold=0.95) + nb_features_in_exp=nof_feats, threshold=0.95) elif use_shap: expl = use_shap(self, sample=sample, nb_features_in_exp=nof_feats) else: - if 'x' not in dir(self): - self.x = SMTExplainer(self.enc, self.intvs, self.imaps, - self.ivars, self.feature_names, self.num_class, - self.options, self) - - expl = self.x.explain(np.array(sample), self.options.smallest, - expl_ext, prefer_ext) + self.x = SMTExplainer(self.enc, self.intvs, self.imaps, + self.ivars, self.feature_names, self.num_class, + solver, self) + expl = self.x.explain(np.array(sample), smallest, expl_ext, prefer_ext) - # returning the explanation return expl def validate(self, sample, expl): @@ -312,289 +111,12 @@ class XGBRandomForest(object): # try to compute a counterexample return self.v.validate(np.array(sample), expl) - - def isPrimeImplicant(self, sample, expl): - """ - Check the explnation if it is a prime implicant. 
- """ - - # there must exist an encoding - if 'enc' not in dir(self): - encoder = SMTEncoder(self.model, self.feature_names, self.num_class, - self) - self.enc, _, _, _ = encoder.encode() - - if 'checker' not in dir(self): - self.checker = SMTChecker(self.enc, self.feature_names, self.num_class, - self) - - # check the explanation - return self.checker.check(np.array(sample), expl) - - def repair(self, sample, expl): - """ - Make an attempt to repair that a given pessimistic (incorrect) explanation. - """ - #encode without sample - self.encode() - gexpl = self.explain(sample, expl_ext=expl, prefer_ext=True) - - #global explanation - return gexpl - - def refine(self, sample, expl): - """ - Make an attempt to refine that a given optimistic explanation. - """ - #encode without sample - self.encode() - gexpl = self.explain(sample, expl_ext=expl) - - #global explanation - return gexpl - - def transform(self, x): - if(len(x) == 0): - return x - if (len(x.shape) == 1): - x = np.expand_dims(x, axis=0) - if (self.use_categorical): - assert(self.encoder != []) - tx = [] - for i in range(self.nb_features): - #self.encoder[i].drop = None - if (i in self.categorical_features): - self.encoder[i].drop = None - tx_aux = self.encoder[i].transform(x[:,[i]]) - tx_aux = np.vstack(tx_aux) - tx.append(tx_aux) - else: - tx.append(x[:,[i]]) - tx = np.hstack(tx) - return tx - else: - return x - - def transform_inverse(self, x): - if(len(x) == 0): - return x - if (len(x.shape) == 1): - x = np.expand_dims(x, axis=0) - if (self.use_categorical): - assert(self.encoder != []) - inverse_x = [] - for i, xi in enumerate(x): - inverse_xi = np.zeros(self.nb_features) - for f in range(self.nb_features): - if f in self.categorical_features: - nb_values = len(self.categorical_names[f]) - v = xi[:nb_values] - v = np.expand_dims(v, axis=0) - iv = self.encoder[f].inverse_transform(v) - inverse_xi[f] =iv - xi = xi[nb_values:] - - else: - inverse_xi[f] = xi[0] - xi = xi[1:] - inverse_x.append(inverse_xi) - return inverse_x - else: - return x - - def transform_inverse_by_index(self, idx): - if (idx in self.extended_feature_names): - return self.extended_feature_names[idx] - else: - print("Warning there is no feature {} in the internal mapping".format(idx)) - return None - - def transform_by_value(self, feat_value_pair): - if (feat_value_pair in self.extended_feature_names.values()): - keys = (list(self.extended_feature_names.keys())[list( self.extended_feature_names.values()).index(feat_value_pair)]) - return keys - else: - print("Warning there is no value {} in the internal mapping".format(feat_value_pair)) - return None - def mapping_features(self): self.extended_feature_names = {} self.extended_feature_names_as_array_strings = [] counter = 0 - if (self.use_categorical): - for i in range(self.nb_features): - if (i in self.categorical_features): - for j, _ in enumerate(self.encoder[i].categories_[0]): - self.extended_feature_names.update({counter: (self.feature_names[i], j)}) - self.extended_feature_names_as_array_strings.append("f{}_{}".format(i,j)) # str(self.feature_names[i]), j)) - counter = counter + 1 - else: - self.extended_feature_names.update({counter: (self.feature_names[i], None)}) - self.extended_feature_names_as_array_strings.append("f{}".format(i)) #(self.feature_names[i]) - counter = counter + 1 - else: - for i in range(self.nb_features): - self.extended_feature_names.update({counter: (self.feature_names[i], None)}) - self.extended_feature_names_as_array_strings.append("f{}".format(i))#(self.feature_names[i]) - 
-                counter = counter + 1
-
-    def readable_sample(self, x):
-        readable_x = []
-        for i, v in enumerate(x):
-            if (i in self.categorical_features):
-                readable_x.append(self.categorical_names[i][int(v)])
-            else:
-                #readable_x.append(v)
-                readable_x.append(str(v))
-        return np.asarray(readable_x)
-
-    def test_encoding_transformes(self):
-        # test encoding
-
-        X = self.X_train[[0],:]
-
-        print("Sample of length", len(X[0])," : ", X)
-        enc_X = self.transform(X)
-        print("Encoded sample of length", len(enc_X[0])," : ", enc_X)
-        inv_X = self.transform_inverse(enc_X)
-        print("Back to sample", inv_X)
-        print("Readable sample", self.readable_sample(inv_X[0]))
-        assert((inv_X == X).all())
-
-        if (self.options.verb > 1):
-            for i in range(len(self.extended_feature_names)):
-                print(i, self.transform_inverse_by_index(i))
-            for key, value in self.extended_feature_names.items():
-                print(value, self.transform_by_value(value))
-
-    def transfomed_sample_info(self, i):
-        print(enc.categories_)
-
-    def build_xgbtree(self, outfile=None):
-        """
-            Build an ensemble of trees (forest).
-        """
-
-        if (outfile is None):
-            outfile = self.modfile
-        else:
-            self.datafile = sefl.form_datefile_name(outfile)
-
-        # fit model no training data
-
-        if (len(self.X_test) > 0):
-            eval_set=[(self.transform(self.X_train), self.Y_train), (self.transform(self.X_test), self.Y_test)]
-        else:
-            eval_set=[(self.transform(self.X_train), self.Y_train)]
-
-        print("start xgb")
-        '''
-        self.model.fit(self.transform(self.X_train), self.Y_train,
-                  eval_set=eval_set,
-                  verbose=self.options.verb) # eval_set=[(X_test, Y_test)],
-        '''
-        dtrain = xgb.DMatrix(self.transform(self.X_train), label=self.Y_train)
-        dtest = xgb.DMatrix(self.transform(self.X_test), label=self.Y_test)
-        eval_set = [(dtrain, 'train'), (dtest, 'eval')]
-        evals_result = {}
-        self.model = xgb.train(self.params, dtrain, num_boost_round=1,
-                   evals=eval_set, evals_result=evals_result)
-        print("end xgb")
-        print(self.model.get_score())
-        print(len(self.model.get_score()))
-        #for i in range(5):
-        #    xgb.plot_tree(self.model, num_trees=i)
-        #    plt.show()
-
-
-        try:
-            train_accuracy = round(1 - evals_result['train']['merror'][-1],2)
-        except:
-            try:
-                train_accuracy = round(1 - evals_result['train']['error'][-1],2)
-            except:
-                assert(False)
-        try:
-            test_accuracy = round(1 - evals_result['eval']['merror'][-1],2)
-        except:
-            try:
-                test_accuracy = round(1 - evals_result['eval']['error'][-1],2)
-            except:
-                assert(False)
-        #print('Train accuracy_xgb: ',train_accuracy)
-        #print('Test accuracy_xgb: ', test_accuracy)
-
-
-        #evals_result = self.model.evals_result()
-        ########## saving model
-        self.save_datainfo(outfile)
-        print("saving plain model to ", self.mod_plainfile)
-        #self.model._Booster.dump_model(self.mod_plainfile)
-        self.model.dump_model(self.mod_plainfile)
-
-        ensemble = TreeEnsemble(self.model, self.extended_feature_names_as_array_strings, nb_classes = self.num_class)
-        #ensemble.print_tree()
-
-        #y_pred_prob = self.model.predict_proba(self.transform(self.X_train[:10]))
-        classone_probs = self.model.predict(xgb.DMatrix(self.transform(self.X_train[:10])))
-        if self.num_class == 2:
-            classzero_probs = 1.0 - classone_probs
-            y_pred_prob = np.vstack((classzero_probs, classone_probs)).transpose()
-        else:
-            y_pred_prob = classone_probs
-
-        y_pred_prob_compute = ensemble.predict(self.transform(self.X_train[:10]), self.num_class)
-        #print('y_pred_prob: \n', y_pred_prob)
-        #print('y_pred_prob_compute: \n',y_pred_prob_compute)
-        #print('\n\n')
-
-        assert(np.absolute(y_pred_prob_compute-y_pred_prob).sum() < 0.01*len(y_pred_prob))
y_pred_prob).sum() < 0.01*len(y_pred_prob)) - - ### accuracy - ''' - try: - train_accuracy = round(1 - evals_result['validation_0']['merror'][-1],2) - except: - try: - train_accuracy = round(1 - evals_result['validation_0']['error'][-1],2) - except: - assert(False) - - try: - test_accuracy = round(1 - evals_result['validation_1']['merror'][-1],2) - except: - try: - test_accuracy = round(1 - evals_result['validation_1']['error'][-1],2) - except: - print("no results test data") - test_accuracy = 0 - ''' - - #### saving - - print("saving results to ", self.resfile) - with open(self.resfile, 'w') as f: - f.write("{} & {} & {} &{} &{} & {} \\\\ \n \hline \n".format( - os.path.basename(self.options.files[0]).replace("_","-"), - train_accuracy, - test_accuracy, - self.options.n_estimators, - self.options.maxdepth, - self.options.testsplit)) - f.close() - - print("Train accuracy: %.2f%%" % (train_accuracy * 100.0)) - print("Test accuracy: %.2f%%" % (test_accuracy * 100.0)) - - - return train_accuracy, test_accuracy, self.model - - def predict(self, X): - classone_probs = self.model.predict(xgb.DMatrix(self.transform(X))) - if self.num_class == 2: - classzero_probs = 1.0 - classone_probs - y_pred_prob = np.vstack((classzero_probs, classone_probs)).transpose() - else: - y_pred_prob = classone_probs - return y_pred_prob + for i in range(self.nb_features): + self.extended_feature_names.update({counter: (self.feature_names[i], None)}) + self.extended_feature_names_as_array_strings.append("f{}".format(i)) + counter = counter + 1 diff --git a/pages/application/RandomForest/utils/xprf.py b/pages/application/RandomForest/utils/xprf.py index acd3fd81c9201a52d6460d5d8397c7074e7f2e8d..c22b6185f32da5e8f09777ec8689b4dcd22854d9 100755 --- a/pages/application/RandomForest/utils/xprf.py +++ b/pages/application/RandomForest/utils/xprf.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -#-*- coding:utf-8 -*- +# -*- coding:utf-8 -*- ## ## xrf.py ## @@ -9,32 +9,34 @@ ## # -#============================================================================== +# ============================================================================== from __future__ import print_function -from pages.application.RandomForest.utils.data import Data +from pages.application.RandomForest.utils.data import Data import os import sys import pickle import resource -from pages.application.RandomForest.utils.xgbooster import preprocess_dataset +from pages.application.RandomForest.utils.xgbooster import preprocess_dataset -from pages.application.RandomForest.utils.xrf import XRF, RF2001, Dataset, Checker +from pages.application.RandomForest.utils.xrf import XRF, RF2001, Dataset, Checker import numpy as np ################## -from pages.application.RandomForest.utils.xpLime import lime_call +from pages.application.RandomForest.utils.xpLime import lime_call import math import lime import lime.lime_tabular ### -from pages.application.RandomForest.utils.xpAnchor import anchor_call -#from anchor import utils +from pages.application.RandomForest.utils.xpAnchor import anchor_call +# from anchor import utils from anchor import anchor_tabular + + ################ # -#============================================================================== +# ============================================================================== def show_info(): """ Print info message. 
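For reference, the simplified mapping_features kept by this diff reduces, once categorical handling is dropped, to a one-to-one map from column index to feature name. A minimal sketch, with hypothetical feature names:

feature_names = ['age', 'income']
extended = {i: (name, None) for i, name in enumerate(feature_names)}
as_strings = ['f{}'.format(i) for i in range(len(feature_names))]
assert extended == {0: ('age', None), 1: ('income', None)}
assert as_strings == ['f0', 'f1']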
@@ -42,35 +44,37 @@ def show_info(): print("c XRF: eXplaining Random Forest.") print('c') - + # -#============================================================================== +# ============================================================================== def pickle_save_file(filename, data): try: - f = open(filename, "wb") + f = open(filename, "wb") pickle.dump(data, f) f.close() except: print("Cannot save to file", filename) exit() + def pickle_load_file(filename): try: - f = open(filename, "rb") + f = open(filename, "rb") data = pickle.load(f) f.close() return data except: print("Cannot load from file", filename) - exit() - - -# -#============================================================================== + exit() + + # + + +# ============================================================================== if __name__ == '__main__': # parsing command-line options options = Options(sys.argv) - + # making output unbuffered if sys.version_info.major == 2: sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) @@ -81,35 +85,34 @@ if __name__ == '__main__': if (options.preprocess_categorical): preprocess_dataset(options.files[0], options.preprocess_categorical_files, options.use_categorical) exit() - - + if options.files: cls = None xrf = None - + print("loading data ...") data = Dataset(filename=options.files[0], mapfile=options.mapfile, - separator=options.separator, use_categorical = options.use_categorical) - + separator=options.separator, use_categorical=options.use_categorical) + if options.train: ''' data = Dataset(filename=options.files[0], mapfile=options.mapfile, separator=options.separator, use_categorical = options.use_categorical) - ''' + ''' cls = RF2001(options) train_accuracy, test_accuracy = cls.train(data) - + if options.verb == 1: print("----------------------") print("Train accuracy: {0:.2f}".format(100. * train_accuracy)) print("Test accuracy: {0:.2f}".format(100. 
* test_accuracy)) - print("----------------------") - + print("----------------------") + xrf = XRF(options, cls, data) - #xrf.test_tree_ensemble() - + # xrf.test_tree_ensemble() + bench_name = os.path.splitext(os.path.basename(options.files[0]))[0] bench_dir_name = options.output + "/RF/" + bench_name try: @@ -118,23 +121,22 @@ if __name__ == '__main__': os.mkdir(bench_dir_name) basename = (os.path.join(bench_dir_name, bench_name + - "_nbestim_" + str(options.n_estimators) + - "_maxdepth_" + str(options.maxdepth))) + "_nbestim_" + str(options.n_estimators) + + "_maxdepth_" + str(options.maxdepth))) - modfile = basename + '.mod.pkl' + modfile = basename + '.mod.pkl' print("saving model to ", modfile) pickle_save_file(modfile, cls) - - #data_suffix = '.splitdata.pkl' - #filename_data = basename + data_suffix - #print("saving data to ", filename_data) - #pickle_save_file(filename_data, data) + # data_suffix = '.splitdata.pkl' + # filename_data = basename + data_suffix + # print("saving data to ", filename_data) + # pickle_save_file(filename_data, data) # read a sample from options.explain - #if options.explain: + # if options.explain: # options.explain = [float(v.strip()) for v in options.explain.split(',')] - + ''' if options.encode: # encode it and save the encoding to another file @@ -143,25 +145,24 @@ if __name__ == '__main__': ''' if options.explain: mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \ - resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss - + resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss + if not xrf: print("loading model ...") - cls = pickle_load_file(options.files[1]) + cls = pickle_load_file(options.files[1]) xrf = XRF(options, cls, data) - - - #expl = xrf.explain(options.explain) - - #expl_checker = Checker(xrf.f, data.num_class, data.extended_feature_names_as_array_strings) - - cls.print_accuracy(data) # print test accuracy of the RF model - + + # expl = xrf.explain(options.explain) + + # expl_checker = Checker(xrf.f, data.num_class, data.extended_feature_names_as_array_strings) + + cls.print_accuracy(data) # print test accuracy of the RF model + samps_file = options.explain.strip() print(samps_file) with open(samps_file, 'r') as fp: lines = fp.readlines() - + # timers atimes = [] lengths = [] @@ -171,46 +172,46 @@ if __name__ == '__main__': utimes = [] nSatCalls = [] nUnsCalls = [] - + ltimes = [] ctimes = [] wins = 0 - + for i, s in enumerate(lines): sample = [float(v.strip()) for v in s.split(',')] if tuple(sample) in tested: continue - - #print("inst#{0}".format(i+1)) - + + # print("inst#{0}".format(i+1)) + tested.add(tuple(sample)) - #print('sample {0}: {1}'.format(i, ','.join(s.strip().split(',')))) - + # print('sample {0}: {1}'.format(i, ','.join(s.strip().split(',')))) + xrf.encode(sample) expl = xrf.explain(sample) - atimes.append(xrf.x.time) + atimes.append(xrf.x.time) lengths.append(len(expl)) nvars = xrf.enc.cnf.nv nclauses = len(xrf.enc.cnf.clauses) - - #mSAT = max(xrf.x.stimes+[mSAT]) - #mUNSAT = max(xrf.x.utimes+[mUNSAT]) + + # mSAT = max(xrf.x.stimes+[mSAT]) + # mUNSAT = max(xrf.x.utimes+[mUNSAT]) if len(xrf.x.stimes): stimes.append(max(xrf.x.stimes)) if len(xrf.x.utimes): utimes.append(max(xrf.x.utimes)) nSatCalls.append(xrf.x.nsat) nUnsCalls.append(xrf.x.nunsat) - - #inst = data.transform(np.array(sample))[0] - #expl_checker.check(np.array(inst), expl) + + # inst = data.transform(np.array(sample))[0] + # expl_checker.check(np.array(inst), expl) #####check_expl(np.array(inst), expl, xrf.enc.forest, xrf.enc.intvs) - + del xrf.enc 
del xrf.x - + #####################LIME########### ''' _, ltime = lime_call(cls, data, sample, verbose=options.verb) # call lime @@ -218,20 +219,19 @@ if __name__ == '__main__': #wins += 1 if atimes[-1] < ltime: wins += 1 - ''' - - _, ctime = anchor_call(cls, data, sample, verbose=options.verb) # call lime + ''' + + _, ctime = anchor_call(cls, data, sample, verbose=options.verb) # call anchor ctimes.append(ctime) if atimes[-1] < ctime: - wins += 1 - - #if i == 1: + wins += 1 + + # if i == 1: # break - + mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \ - resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss - mem - - + resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss - mem + # reporting the time spent print('') print('tot time: {0:.2f}'.format(sum(atimes))) @@ -240,22 +240,21 @@ print('avg time: {0:.2f}'.format(sum(atimes) / len(atimes))) print('') #### - print('avg length: {0:.0f}'.format(round(sum(lengths) / len(lengths))*100/len(sample))) - #print('max SAT: {0:.2f}'.format(mSAT)) - #print('max UNSAT: {0:.2f}'.format(mUNSAT)) + print('avg length: {0:.0f}'.format(round(sum(lengths) / len(lengths)) * 100 / len(sample))) + # print('max SAT: {0:.2f}'.format(mSAT)) + # print('max UNSAT: {0:.2f}'.format(mUNSAT)) print('max SAT: {0:.2f}'.format(max(stimes))) - print('max UNSAT: {0:.2f}'.format(max(utimes))) + print('max UNSAT: {0:.2f}'.format(max(utimes))) print('avg #SAT: {0:.0f}'.format(sum(nSatCalls) / len(nSatCalls))) print('avg #UNSAT: {0:.0f}'.format(sum(nUnsCalls) / len(nUnsCalls))) print('') - #reporting nof_vars and nof_clauses + # reporting nof_vars and nof_clauses print('c nof vars: {0}'.format(nvars)) print('c nof clauses: {0}'.format(nclauses)) # print('c nof instances: {0}'.format(len(tested))) - print("c mem used: {0:.2f} Mb".format(mem/(1024*1024))) - - + print("c mem used: {0:.2f} Mb".format(mem / (1024 * 1024))) + # LIME runtimes ''' print('') @@ -263,12 +262,11 @@ print('avg time for Lime: {0:.2f}'.format(sum(ltimes) / len(ltimes))) print('#wins {0} out of {1}'.format(wins, len(tested)) ) ''' - + # Anchor runtimes print('') print('tot time for Anchor: {0:.2f}'.format(sum(ctimes))) print('max time for Anchor: {0:.2f}'.format(max(ctimes))) print('min time for Anchor: {0:.2f}'.format(min(ctimes))) print('avg time for Anchor: {0:.2f}'.format(sum(ctimes) / len(ctimes))) - print('#wins {0} out of {1}'.format(wins, len(tested)) ) - \ No newline at end of file + print('#wins {0} out of {1}'.format(wins, len(tested))) diff --git a/pages/application/RandomForest/utils/xrf/archive/build/lib.macosx-10.9-x86_64-3.8/pysortn.cpython-38-darwin.so b/pages/application/RandomForest/utils/xrf/archive/build/lib.macosx-10.9-x86_64-3.8/pysortn.cpython-38-darwin.so deleted file mode 100755 index 8037905889ba350ca148832a64d0dfd51f823953..0000000000000000000000000000000000000000 Binary files a/pages/application/RandomForest/utils/xrf/archive/build/lib.macosx-10.9-x86_64-3.8/pysortn.cpython-38-darwin.so and /dev/null differ diff --git a/pages/application/RandomForest/utils/xrf/archive/build/lib.macosx-10.9-x86_64-3.8/pysortnetwrk.cpython-38-darwin.so b/pages/application/RandomForest/utils/xrf/archive/build/lib.macosx-10.9-x86_64-3.8/pysortnetwrk.cpython-38-darwin.so deleted file mode 100755 index 7aa11a7e970e9b6d0a19149ddf63f889f952f4a7..0000000000000000000000000000000000000000 Binary files a/pages/application/RandomForest/utils/xrf/archive/build/lib.macosx-10.9-x86_64-3.8/pysortnetwrk.cpython-38-darwin.so and /dev/null differ diff
--git a/pages/application/RandomForest/utils/xrf/archive/build/temp.macosx-10.9-x86_64-3.8/pysortn.o b/pages/application/RandomForest/utils/xrf/archive/build/temp.macosx-10.9-x86_64-3.8/pysortn.o deleted file mode 100644 index ba7953dbc2b8e701d652333c93f2d2aa33024d72..0000000000000000000000000000000000000000 Binary files a/pages/application/RandomForest/utils/xrf/archive/build/temp.macosx-10.9-x86_64-3.8/pysortn.o and /dev/null differ diff --git a/pages/application/RandomForest/utils/xrf/archive/build/temp.macosx-10.9-x86_64-3.8/pysortnetwrk.o b/pages/application/RandomForest/utils/xrf/archive/build/temp.macosx-10.9-x86_64-3.8/pysortnetwrk.o deleted file mode 100644 index 972595fc8d85029e1a21ad144b452cd34d8306b4..0000000000000000000000000000000000000000 Binary files a/pages/application/RandomForest/utils/xrf/archive/build/temp.macosx-10.9-x86_64-3.8/pysortnetwrk.o and /dev/null differ diff --git a/pages/application/RandomForest/utils/xrf/archive/encode.py b/pages/application/RandomForest/utils/xrf/archive/encode.py deleted file mode 100644 index 3478ebc15ec768b853dfbca892b966d2177bac58..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xrf/archive/encode.py +++ /dev/null @@ -1,276 +0,0 @@ - -from pysat.formula import CNF, IDPool -from pysat.solvers import Solver -from pysat.card import * -from itertools import combinations - -import collections -import six -from six.moves import range - -from .tree import Forest, predict_tree -from .sortnetwrk import HSorNetwrk - -# -#============================================================================== -class SATEncoder(object): - """ - Encoder of Random Forest classifier into SAT. - """ - - def __init__(self, forest, feats, nof_classes, extended_feature_names=None, from_file=None): - #self.model = model - self.forest = forest - self.feats = {f: i for i, f in enumerate(feats)} - self.num_class = nof_classes - self.vpool = IDPool() - #self.optns = xgb.options - self.extended_feature_names = extended_feature_names - - #encoding formula - self.cnf = None - - # for interval-based encoding - self.intvs, self.imaps, self.ivars = None, None, None - - #if from_file: - # self.load_from(from_file) - - def newVar(self, name): - assert(name) - - if name in self.vpool.obj2id: #var has been already created - return self.vpool.obj2id[name] - - var = self.vpool.id('{0}'.format(name)) - return var - - - def traverse(self, tree, k, clause): - """ - Traverse a tree and encode each node. - """ - - if tree.children: - var = self.newVar(tree.name) - #print("{0} => {1}".format(tree.name, var)) - pos, neg = var, -var - - self.traverse(tree.children[0], k, clause + [-neg]) # -var - self.traverse(tree.children[1], k, clause + [-pos]) # --var - else: # leaf node - cvar = self.newVar('class{0}_tr{1}'.format(tree.values,k)) - print('c: ', clause + [cvar]) - self.cnf.append(clause + [cvar]) - - - ''' - def encode_node(self, node): - """ - Encode a node of a tree. 
- """ - - if '_' not in node.name: - # continuous features => expecting an upper bound - # feature and its upper bound (value) - f, v = node.name, node.threshold - - existing = True if tuple([f, v]) in self.idmgr.obj2id else False - vid = self.idmgr.id(tuple([f, v])) - bv = Symbol('bvar{0}'.format(vid), typename=BOOL) - - if not existing: - if self.intvs: - d = self.imaps[f][v] + 1 - pos, neg = self.ivars[f][:d], self.ivars[f][d:] - self.enc.append(Iff(bv, Or(pos))) - self.enc.append(Iff(Not(bv), Or(neg))) - else: - fvar, fval = Symbol(f, typename=REAL), Real(v) - self.enc.append(Iff(bv, LT(fvar, fval))) - - return bv, Not(bv) - else: - # all features are expected to be categorical and - # encoded with one-hot encoding into Booleans - # each node is expected to be of the form: f_i < 0.5 - bv = Symbol(node.name, typename=BOOL) - - # left branch is positive, i.e. bv is true - # right branch is negative, i.e. bv is false - return Not(bv), bv - ''' - - - def compute_intervals(self): - """ - Traverse all trees in the ensemble and extract intervals for each - feature. - - At this point, the method only works for numerical datasets! - """ - - def traverse_intervals(tree): - """ - Auxiliary function. Recursive tree traversal. - """ - - if tree.children: - f = tree.name - v = tree.threshold - self.intvs[f].add(v) - - traverse_intervals(tree.children[0]) - traverse_intervals(tree.children[1]) - - # initializing the intervals - self.intvs = {'f{0}'.format(i): set([]) for i in range(len(self.feats))} - - for tree in self.forest.trees: - traverse_intervals(tree) - - # OK, we got all intervals; let's sort the values - self.intvs = {f: sorted(self.intvs[f]) + ['+'] for f in six.iterkeys(self.intvs)} - - self.imaps, self.ivars = {}, {} - for feat, intvs in six.iteritems(self.intvs): - self.imaps[feat] = {} - self.ivars[feat] = [] - for i, ub in enumerate(intvs): - self.imaps[feat][ub] = i - - ivar = Symbol(name='{0}_intv{1}'.format(feat, i), typename=BOOL) - self.ivars[feat].append(ivar) - - def encode(self, sample): - """ - Do the job. - """ - - self.cnf = CNF() - # getting a tree ensemble - #self.forest = Forest(self.model, self.extended_feature_names) - num_tree = len(self.forest.trees) - - # introducing class score variables - cvars = [[] for t in range(num_tree)] - for k in range(len(self.forest.trees)): - for j in range(self.num_class): - var = self.newVar('class{0}_tr{1}'.format(j,k)) - cvars[k].append(-var) - - # if targeting interval-based encoding, - # traverse all trees and extract all possible intervals - # for each feature - ''' - if self.optns.encode == 'smtbool': - self.compute_intervals() - ''' - - # traversing and encoding each tree - for k, tree in enumerate(self.forest.trees): - print("Encode tree#{0}".format(k)) - # encoding the tree - self.traverse(tree, k, []) - #exactly one class var is true this could could be squeezed - # more to reduce NB binary clauses!!!!!!! 
- enc = CardEnc.atmost(lits=[-v for v in cvars[k]], - vpool=self.vpool, - encoding=EncType.cardnetwrk) #AtMostOne constraint - self.cnf.extend(enc.clauses) - - - csum = [[] for c in range(self.num_class)] - for k, tree in enumerate(self.forest.trees): - c = predict_tree(tree, sample) - csum[c].append(k) - cvars[k][c] = - cvars[k][c] - - # encoding the majority - cmaj,_ = max(enumerate(csum), key=(lambda x: len(x[1]))) - sorted_lits = [[] for c in range(self.num_class)] - #sorting bits - for j in range(self.num_class): - tvars = [cvars[k][j] for k in range(num_tree)] - clauses, vout, _ = HSorNetwrk(lits=tvars, vpool = self.vpool) - self.cnf.extend(clauses) - sorted_lits[j] = vout - #print("tvars: {0} ==> {3} \nclauses: {1}\ntop: {2}".format(tvars, clauses, self.vpool.top, vout)) - #compare bits - for j in range(self.num_class): - if j == cmaj: - continue - for k in range(num_tree): - self.cnf.append([ -sorted_lits[j][k], sorted_lits[cmaj][k] ]) # (v1 => v2) - #print("-{0} => {1}".format(sorted_lits[j][k], sorted_lits[cmaj][k])) - - ''' - # enforce exactly one of the feature values to be chosen - # (for categorical features) - categories = collections.defaultdict(lambda: []) - for f in self.extended_feature_names: - if '_' in f: - categories[f.split('_')[0]].append(self.newVar(f)) - - for c, feats in six.iteritems(categories): - #ExactlyOne feat is True - self.cnf.append(feats) - enc = CardEnc.atmost(lits=feats, vpool=self.vpool, encoding=EncType.cardnetwrk) - self.cnf.extend(enc.clauses) - ''' - - #if self.optns.verb: - # number of variables - print('#vars:', self.cnf.nv) - # number of clauses - print('#clauses:', len(self.cnf.clauses)) - #print(self.cnf.clauses) - - return self.cnf, self.intvs, self.imaps, self.ivars - - ''' - def test_sample(self, sample): - """ - Check whether or not the encoding "predicts" the same class - as the classifier given an input sample. - """ - - # first, compute the scores for all classes as would be - # predicted by the classifier - - # score arrays computed for each class - csum = [[] for c in range(self.num_class)] - - #if self.optns.verb: - print('testing sample:', list(sample)) - - # traversing all trees - for i, tree in enumerate(self.forest.trees): - c = predict_tree(tree, sample) - csum[c].append(i) - - # encoding the majority - cmaj,_ = max(enumerate(csum), key=(lambda x: len(x[1]))) - - # second, get the scores computed with the use of the encoding - assert self.cnf, "There is no encoding." - - slv = Solver(name="minisat22") - slv.add_formula(self.cnf) - - - # asserting the sample - hypos = [] - - #for i, fval in enumerate(sample): - - ''' - - - def access(self): - """ - Get access to the encoding, features names, and the number of - classes. 
- """ - - return self.cnf, self.intvs, self.imaps, self.ivars, self.feats, self.num_class \ No newline at end of file diff --git a/pages/application/RandomForest/utils/xrf/archive/pysortnetwrk.cc b/pages/application/RandomForest/utils/xrf/archive/pysortnetwrk.cc deleted file mode 100644 index 8a44d9682946bf694983856d759abfaab3351f42..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xrf/archive/pysortnetwrk.cc +++ /dev/null @@ -1,248 +0,0 @@ - - -#define PY_SSIZE_T_CLEAN - -#include <setjmp.h> -#include <signal.h> -#include <stdio.h> -#include <Python.h> - -#include "sortcard.hh" - -using namespace std; - -// docstrings -//============================================================================= -static char module_docstring[] = "This module provides an interface for " - "encoding a few types of cardinality " - "constraints"; -//static char atmost_docstring[] = "Create an AtMost(k) constraint."; -//static char atleast_docstring[] = "Create an AtLeast(k) constraint."; -static char sortn_docstring[] = "Sort an array of bits."; - -static PyObject *CardError; -static jmp_buf env; - -// function declaration for functions available in module -//============================================================================= -extern "C" { - //static PyObject *py_encode_atmost (PyObject *, PyObject *); - //static PyObject *py_encode_atleast (PyObject *, PyObject *); - static PyObject *py_sortn (PyObject *, PyObject *); -} - -// module specification -//============================================================================= -static PyMethodDef module_methods[] = { - //{ "encode_atmost", py_encode_atmost, METH_VARARGS, atmost_docstring }, - //{ "encode_atleast", py_encode_atleast, METH_VARARGS, atleast_docstring }, - { "HSort", py_sortn, METH_VARARGS, sortn_docstring }, - - { NULL, NULL, 0, NULL } -}; - -extern "C" { - -// signal handler for SIGINT -//============================================================================= -static void sigint_handler(int signum) -{ - longjmp(env, -1); -} - -//#if PY_MAJOR_VERSION >= 3 // for Python3 -// PyInt_asLong() -//============================================================================= -static int pyint_to_cint(PyObject *i_obj) -{ - return PyLong_AsLong(i_obj); -} - -// PyInt_fromLong() -//============================================================================= -static PyObject *pyint_from_cint(int i) -{ - return PyLong_FromLong(i); -} - -// PyCapsule_New() -//============================================================================= -static PyObject *void_to_pyobj(void *ptr) -{ - return PyCapsule_New(ptr, NULL, NULL); -} - -// PyCapsule_GetPointer() -//============================================================================= -static void *pyobj_to_void(PyObject *obj) -{ - return PyCapsule_GetPointer(obj, NULL); -} - -// PyInt_Check() -//============================================================================= -static int pyint_check(PyObject *i_obj) -{ - return PyLong_Check(i_obj); -} - -// module initialization -//============================================================================= -static struct PyModuleDef module_def = { - PyModuleDef_HEAD_INIT, - "pysortnetwrk", /* m_name */ - module_docstring, /* m_doc */ - -1, /* m_size */ - module_methods, /* m_methods */ - NULL, /* m_reload */ - NULL, /* m_traverse */ - NULL, /* m_clear */ - NULL, /* m_free */ -}; - -/* -PyMODINIT_FUNC PyInit_pycard(void) -{ - PyObject *m = PyModule_Create(&module_def); - - if (m == NULL) - return NULL; - - 
CardError = PyErr_NewException((char *)"pycard.error", NULL, NULL); - Py_INCREF(CardError); - - if (PyModule_AddObject(m, "error", CardError) < 0) { - Py_DECREF(CardError); - return NULL; - } - - return m; -}*/ - -PyMODINIT_FUNC PyInit_pysortnetwrk(void) -{ - PyObject *m = PyModule_Create(&module_def); - - if (m == NULL) - return NULL; - - CardError = PyErr_NewException((char *)"pycard.error", NULL, NULL); - Py_INCREF(CardError); - - if (PyModule_AddObject(m, "error", CardError) < 0) { - Py_DECREF(CardError); - return NULL; - } - - return m; -} - - -// auxiliary function for translating an iterable to a vector<int> -//============================================================================= -static bool pyiter_to_vector(PyObject *obj, vector<int>& vect) -{ - PyObject *i_obj = PyObject_GetIter(obj); - - if (i_obj == NULL) { - PyErr_SetString(PyExc_RuntimeError, - "Object does not seem to be an iterable."); - return false; - } - - PyObject *l_obj; - while ((l_obj = PyIter_Next(i_obj)) != NULL) { - if (!pyint_check(l_obj)) { - Py_DECREF(l_obj); - Py_DECREF(i_obj); - PyErr_SetString(PyExc_TypeError, "integer expected"); - return false; - } - - int l = pyint_to_cint(l_obj); - Py_DECREF(l_obj); - - if (l == 0) { - Py_DECREF(i_obj); - PyErr_SetString(PyExc_ValueError, "non-zero integer expected"); - return false; - } - - vect.push_back(l); - } - - Py_DECREF(i_obj); - return true; -} - -// -//============================================================================= -static PyObject *py_sortn(PyObject *self, PyObject *args) -{ - - PyObject *av_obj; - //PyObject *cv_obj; - int top; - int zvar; - - //PyObject *lhs_obj; - //int rhs; - //int top; - //int enc; - int main_thread; - - if (!PyArg_ParseTuple(args, "Oiii", &av_obj, &top, &zvar, - &main_thread)) - return NULL; - - vector<int> av; - if (pyiter_to_vector(av_obj, av) == false) - return NULL; - - PyOS_sighandler_t sig_save; - if (main_thread) { - sig_save = PyOS_setsig(SIGINT, sigint_handler); - - if (setjmp(env) != 0) { - PyErr_SetString(CardError, "Caught keyboard interrupt"); - return NULL; - } - } - - // calling encoder - ClauseSet dest; - vector<int> cv; - sortn_half_sorter_recur(top, dest, av, cv, zvar); - //_encode_atmost(dest, lhs, rhs, top, enc); - - if (main_thread) - PyOS_setsig(SIGINT, sig_save); - - // creating the resulting clause set - PyObject *dest_obj = PyList_New(dest.size()); - for (size_t i = 0; i < dest.size(); ++i) { - PyObject *cl_obj = PyList_New(dest[i].size()); - - for (size_t j = 0; j < dest[i].size(); ++j) { - PyObject *lit_obj = pyint_from_cint(dest[i][j]); - PyList_SetItem(cl_obj, j, lit_obj); - } - - PyList_SetItem(dest_obj, i, cl_obj); - } - - PyObject *cv_obj = PyList_New(cv.size()); - for (size_t i = 0; i < cv.size(); ++i) { - PyObject *lit_obj = pyint_from_cint(cv[i]); - PyList_SetItem(cv_obj, i, lit_obj); - } - - PyObject *ret = Py_BuildValue("OOn", dest_obj, cv_obj, (Py_ssize_t)top); - Py_DECREF(dest_obj); - Py_DECREF(cv_obj); - - return ret; -} - - -} // extern "C" diff --git a/pages/application/RandomForest/utils/xrf/archive/pysortnetwrk.so b/pages/application/RandomForest/utils/xrf/archive/pysortnetwrk.so deleted file mode 100755 index 7aa11a7e970e9b6d0a19149ddf63f889f952f4a7..0000000000000000000000000000000000000000 Binary files a/pages/application/RandomForest/utils/xrf/archive/pysortnetwrk.so and /dev/null differ diff --git a/pages/application/RandomForest/utils/xrf/archive/rfc.py b/pages/application/RandomForest/utils/xrf/archive/rfc.py deleted file mode 100644 index 
411ad46c1e635664cd64695394bc2093710c2368..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xrf/archive/rfc.py +++ /dev/null @@ -1,636 +0,0 @@ -from sklearn.ensemble._voting import VotingClassifier -from sklearn.ensemble import RandomForestClassifier -from sklearn.preprocessing import OneHotEncoder, LabelEncoder -from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score -import numpy as np -import sys -import os -import resource - -import collections -from six.moves import range -import six - -from pages.application.RandomForest.utils.data import Data -from .tree import Forest, predict_tree -# from .encode import SATEncoder -from .sortnetwrk import HSorNetwrk -from pysat.formula import CNF, IDPool -from pysat.solvers import Solver -from pysat.card import CardEnc, EncType -from itertools import combinations - - -# -# ============================================================================== -class Dataset(Data): - """ - Class for representing dataset (transactions). - """ - - def __init__(self, filename=None, fpointer=None, mapfile=None, - separator=' ', use_categorical=False): - super().__init__(filename, fpointer, mapfile, separator, use_categorical) - - # split data into X and y - self.feature_names = self.names[:-1] - self.nb_features = len(self.feature_names) - self.use_categorical = use_categorical - samples = np.asarray(self.samps, dtype=np.float32) - self.X = samples[:, 0: self.nb_features] - self.y = samples[:, self.nb_features] - self.num_class = len(set(self.y)) - self.target_name = list(range(self.num_class)) - - print("c nof_features: {0}".format(self.nb_features)) - print("c nof_samples: {0}".format(len(self.samps))) - - # check if we have info about categorical features - if (self.use_categorical): - self.binarizer = {} - for i in self.categorical_features: - self.binarizer.update({i: OneHotEncoder(categories='auto', sparse=False)}) # , - self.binarizer[i].fit(self.X[:, [i]]) - else: - self.binarize = [] - # feat map - self.mapping_features() - - def train_test_split(self, test_size=0.2, seed=0): - return train_test_split(self.X, self.y, test_size=test_size, random_state=seed) - - def transform(self, x): - if (len(x) == 0): - return x - if (len(x.shape) == 1): - x = np.expand_dims(x, axis=0) - if (self.use_categorical): - assert (self.binarizer != []) - tx = [] - for i in range(self.nb_features): - self.binarizer[i].drop = None - if (i in self.categorical_features): - tx_aux = self.binarizer[i].transform(x[:, [i]]) - tx_aux = np.vstack(tx_aux) - tx.append(tx_aux) - else: - tx.append(x[:, [i]]) - tx = np.hstack(tx) - return tx - else: - return x - - def transform_inverse(self, x): - if (len(x) == 0): - return x - if (len(x.shape) == 1): - x = np.expand_dims(x, axis=0) - if (self.use_categorical): - assert (self.binarizer != []) - inverse_x = [] - for i, xi in enumerate(x): - inverse_xi = np.zeros(self.nb_features) - for f in range(self.nb_features): - if f in self.categorical_features: - nb_values = len(self.categorical_names[f]) - v = xi[:nb_values] - v = np.expand_dims(v, axis=0) - iv = self.binarizer[f].inverse_transform(v) - inverse_xi[f] = iv - xi = xi[nb_values:] - - else: - inverse_xi[f] = xi[0] - xi = xi[1:] - inverse_x.append(inverse_xi) - return inverse_x - else: - return x - - def transform_inverse_by_index(self, idx): - if (idx in self.extended_feature_names): - return self.extended_feature_names[idx] - else: - print("Warning there is no feature {} in the internal mapping".format(idx)) - 
return None - - def transform_by_value(self, feat_value_pair): - if (feat_value_pair in self.extended_feature_names.values()): - keys = ( - list(self.extended_feature_names.keys())[list(self.extended_feature_names.values()).index(feat_value_pair)]) - return keys - else: - print("Warning there is no value {} in the internal mapping".format(feat_value_pair)) - return None - - def mapping_features(self): - self.extended_feature_names = {} - self.extended_feature_names_as_array_strings = [] - counter = 0 - if (self.use_categorical): - for i in range(self.nb_features): - if (i in self.categorical_features): - for j, _ in enumerate(self.binarizer[i].categories_[0]): - self.extended_feature_names.update({counter: (self.feature_names[i], j)}) - self.extended_feature_names_as_array_strings.append( - "f{}_{}".format(i, j)) # str(self.feature_names[i]), j)) - counter = counter + 1 - else: - self.extended_feature_names.update({counter: (self.feature_names[i], None)}) - self.extended_feature_names_as_array_strings.append("f{}".format(i)) # (self.feature_names[i]) - counter = counter + 1 - else: - for i in range(self.nb_features): - self.extended_feature_names.update({counter: (self.feature_names[i], None)}) - self.extended_feature_names_as_array_strings.append("f{}".format(i)) # (self.feature_names[i]) - counter = counter + 1 - - def readable_sample(self, x): - readable_x = [] - for i, v in enumerate(x): - if (i in self.categorical_features): - readable_x.append(self.categorical_names[i][int(v)]) - else: - readable_x.append(v) - return np.asarray(readable_x) - - def test_encoding_transformes(self, X_train): - # test encoding - - X = X_train[[0], :] - - print("Sample of length", len(X[0]), " : ", X) - enc_X = self.transform(X) - print("Encoded sample of length", len(enc_X[0]), " : ", enc_X) - inv_X = self.transform_inverse(enc_X) - print("Back to sample", inv_X) - print("Readable sample", self.readable_sample(inv_X[0])) - assert ((inv_X == X).all()) - - ''' - for i in range(len(self.extended_feature_names)): - print(i, self.transform_inverse_by_index(i)) - for key, value in self.extended_feature_names.items(): - print(value, self.transform_by_value(value)) - ''' - - -# -# ============================================================================== -class VotingRF(VotingClassifier): - """ - Majority rule classifier - """ - - def fit(self, X, y, sample_weight=None): - self.estimators_ = [] - for _, est in self.estimators: - self.estimators_.append(est) - - self.le_ = LabelEncoder().fit(y) - self.classes_ = self.le_.classes_ - - def predict(self, X): - """Predict class labels for X. - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The input samples. - Returns - ------- - maj : array-like of shape (n_samples,) - Predicted class labels. - """ - # check_is_fitted(self) - - # 'hard' voting - predictions = self._predict(X) - predictions = np.asarray(predictions, np.int64) # NEED TO BE CHECKED - maj = np.apply_along_axis( - lambda x: np.argmax( - np.bincount(x, weights=self._weights_not_none)), - axis=1, arr=predictions) - - maj = self.le_.inverse_transform(maj) - - return maj - - -# -# ============================================================================== -class RF2001(object): - """ - The main class to train Random Forest Classifier (RFC). - """ - - def __init__(self, options, from_data=None, from_model=None): - """ - Constructor. 
- """ - self.forest = None - self.voting = None - self.opt = options - - param_dist = {'n_estimators': options.n_estimators, - 'max_depth': options.maxdepth} - - self.forest = RandomForestClassifier(**param_dist) - - def train(self, dataset, outfile=None): - """ - Train a random forest. - """ - - X_train, X_test, y_train, y_test = dataset.train_test_split() - - dataset.test_encoding_transformes(X_train) - X_train = dataset.transform(X_train) - X_test = dataset.transform(X_test) - - print("Build a random forest.") - self.forest.fit(X_train, y_train) - - rtrees = [('dt', dt) for i, dt in enumerate(self.forest.estimators_)] - self.voting = VotingRF(estimators=rtrees) - self.voting.fit(X_train, y_train) - - train_acc = accuracy_score(self.voting.predict(X_train), y_train) - ''' - print(X_test[[0],:]) - print("RF: ",np.asarray(self.voting.predict(X_test[[0],:]))) - for i,t in enumerate(self.forest.estimators_): - print("DT_{0}: {1}".format(i,np.asarray(t.predict(X_test[[0],:])))) - ''' - test_acc = accuracy_score(self.voting.predict(X_test), y_test) - print("----------------------") - print("RF2001:") - print("Train accuracy RF2001: {0:.2f}".format(100. * train_acc)) - print("Test accuracy RF2001: {0:.2f}".format(100. * test_acc)) - print("----------------------") - - train_acc = accuracy_score(self.forest.predict(X_train), y_train) - test_acc = accuracy_score(self.forest.predict(X_test), y_test) - print("RF-scikit:") - print("Train accuracy RF-scikit: {0:.2f}".format(100. * train_acc)) - print("Test accuracy RF-scikit: {0:.2f}".format(100. * test_acc)) - print("----------------------") - - return train_acc, test_acc - - def predict(self, X): - return self.voting.predict(X) - - def estimators(self): - assert (self.forest.estimators_ is not None) - return self.forest.estimators_ - - def n_estimators(self): - return self.forest.n_estimators - - -# -# ============================================================================== -class XRF(object): - """ - class to encode and explain Random Forest classifiers. - """ - - def __init__(self, options, model): - self.cls = model - self.f = Forest(model) - # self.f.print_tree() - self.verbose = options.verb - - def encode(self, data): - """ - Encode a tree ensemble trained previously. - """ - ########## - self.f = Forest(self.cls, data.extended_feature_names_as_array_strings) - self.f.print_tree() - ####### - self.sat_enc = SATEncoder(self.f, data.feature_names, data.num_class, - extended_feature_names=data.extended_feature_names_as_array_strings) - - _, X_test, _, y_test = data.train_test_split() - - inst = X_test[[1], :] - inst = data.transform(inst)[0] - self.sat_enc.encode(inst) - self.explain(inst, data) - - def explain(self, sample, data): - """ - Explain a prediction made for a given sample with a previously - trained RF. 
- """ - - preamble = None - if self.verbose: - inpvals = data.readable_sample(sample) - - preamble = [] - for f, v in zip(data.feature_names, inpvals): - if f not in v: - preamble.append('{0} = {1}'.format(f, v)) - else: - preamble.append(v) - - inps = data.extended_feature_names_as_array_strings # input (feature value) variables - # print("inps: {0}".format(inps)) - - if 'x' not in dir(self): - self.x = SATExplainer(self.sat_enc, inps, preamble, data.target_name) - - expl = self.x.explain(np.array(sample)) - - # returning the explanation - return expl - - def test_tree_ensemble(self, dataset): - _, X_test, _, y_test = dataset.train_test_split() - X_test = dataset.transform(X_test) - - y_pred_forest = self.f.predict(X_test) - acc = accuracy_score(y_pred_forest, y_test) - print("Test accuracy: {0:.2f}".format(100. * acc)) - - y_pred_cls = self.cls.predict(X_test) - # print(np.asarray(y_pred_cls, np.int64)) - # print(y_pred_forest) - - assert ((y_pred_cls == y_pred_forest).all()) - - -# -# ============================================================================== -class SATEncoder(object): - """ - Encoder of Random Forest classifier into SAT. - """ - - def __init__(self, forest, feats, nof_classes, extended_feature_names=None, from_file=None): - # self.model = model - self.forest = forest - self.feats = {f: i for i, f in enumerate(feats)} - self.num_class = nof_classes - self.vpool = IDPool() - # self.optns = xgb.options - self.extended_feature_names = extended_feature_names - - # encoding formula - self.cnf = None - - # for interval-based encoding - self.intvs, self.imaps, self.ivars = None, None, None - - # if from_file: - # self.load_from(from_file) - - def newVar(self, name): - assert (name) - - if name in self.vpool.obj2id: # var has been already created - return self.vpool.obj2id[name] - - var = self.vpool.id('{0}'.format(name)) - return var - - def traverse(self, tree, k, clause): - """ - Traverse a tree and encode each node. - """ - - if tree.children: - var = self.newVar(tree.name) - # print("{0} => {1}".format(tree.name, var)) - pos, neg = var, -var - - self.traverse(tree.children[0], k, clause + [-neg]) # -var - self.traverse(tree.children[1], k, clause + [-pos]) # --var - else: # leaf node - cvar = self.newVar('class{0}_tr{1}'.format(tree.values, k)) - # print('c: ', clause + [cvar]) - self.cnf.append(clause + [cvar]) - - def encode(self, sample): - """ - Do the job. - """ - - self.cnf = CNF() - # getting a tree ensemble - # self.forest = Forest(self.model, self.extended_feature_names) - num_tree = len(self.forest.trees) - - # introducing class variables - cvars = [self.newVar('class{0}'.format(i)) for i in range(self.num_tree)] - - # introducing class-tree variables - ctvars = [[] for t in range(num_tree)] - for k in range(num_tree): - for j in range(self.num_class): - var = self.newVar('class{0}_tr{1}'.format(j, k)) - ctvars[k].append(var) - - # if targeting interval-based encoding, - # traverse all trees and extract all possible intervals - # for each feature - ''' - if self.optns.encode == 'smtbool': - self.compute_intervals() - ''' - - # traversing and encoding each tree - for k, tree in enumerate(self.forest.trees): - # print("Encode tree#{0}".format(k)) - # encoding the tree - self.traverse(tree, k, []) - # exactly one class var is true this could could be squeezed - # more to reduce NB binary clauses!!!!!!! 
- enc = CardEnc.atmost(lits=ctvars[k], - vpool=self.vpool, - encoding=EncType.cardnetwrk) # AtMostOne constraint - self.cnf.extend(enc.clauses) - - csum = [[] for c in range(self.num_class)] - for k, tree in enumerate(self.forest.trees): - c = predict_tree(tree, sample) - csum[c].append(k) - - # encoding the majority - self.cmaj, _ = max(enumerate(csum), key=(lambda x: len(x[1]))) - sorted_lits = [[] for c in range(self.num_class)] - # sorting bits - for j in range(self.num_class): - lits = [ctvars[k][j] for k in range(num_tree)] - clauses, vout, _ = HSorNetwrk(lits=lits, vpool=self.vpool) - self.cnf.extend(clauses) - sorted_lits[j] = vout - print("{0}:{2} => {1}".format(j, vout, lits)) - # compare bits - for j in range(self.cmaj): - - for j in range(self.cmaj + 1, self.num_class): - for k in range(num_tree): - self.cnf.append([-sorted_lits[j][k], sorted_lits[self.cmaj][k]]) # (v1 => v2) - # print("-{0} => {1}".format(sorted_lits[j][k], sorted_lits[self.cmaj][k])) - - ''' - # enforce exactly one of the feature values to be chosen - # (for categorical features) - categories = collections.defaultdict(lambda: []) - for f in self.extended_feature_names: - if '_' in f: - categories[f.split('_')[0]].append(self.newVar(f)) - - for c, feats in six.iteritems(categories): - #ExactlyOne feat is True - self.cnf.append(feats) - enc = CardEnc.atmost(lits=feats, vpool=self.vpool, encoding=EncType.cardnetwrk) - self.cnf.extend(enc.clauses) - ''' - for cl in self.cnf: - print("{0} == {1}".format(cl, - [self.vpool.obj(abs(p)) if p > 0 else "!" + str(self.vpool.obj(abs(p))) for p in - cl])) - - # if self.optns.verb: - # number of variables - print('#vars:', self.cnf.nv) - # number of clauses - print('#clauses:', len(self.cnf.clauses)) - # print(self.cnf.clauses) - - return self.cnf, self.intvs, self.imaps, self.ivars - - -# -# ============================================================================== -class SATExplainer(object): - """ - An SAT-inspired minimal explanation extractor for Random Forest models. - """ - - def __init__(self, sat_enc, inps, preamble, target_name): - """ - Constructor. - """ - - self.enc = sat_enc - # self.optns = options - self.inps = inps # input (feature value) variables - self.target_name = target_name - self.preamble = preamble - - self.verbose = True # self.optns.verb - # self.slv = Solver(name=options.solver) - self.slv = Solver(name="minisat22") - - # CNF formula - self.slv.append_formula(self.enc.cnf) - - # current selector - # self.selv = None - - def explain(self, sample, smallest=False): - """ - Hypotheses minimization. 
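The explain() body that follows relies on the usual selector pattern: each hypothesis "feature i equals its observed value" is guarded by a fresh selector variable, so hypotheses are toggled purely through solver assumptions. A minimal sketch, assuming python-sat, with illustrative literals:

from pysat.formula import IDPool
from pysat.solvers import Solver

vpool = IDPool()
feat = vpool.id('f0')              # hypothetical feature literal
sel = vpool.id('selv_f0')          # its selector
with Solver(name='minisat22') as slv:
    slv.add_clause([-sel, feat])   # sel -> f0: hypothesis active only under sel
    slv.solve(assumptions=[sel])   # in the full encoding, UNSAT here means
                                   # the active hypotheses entail the prediction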
- """ - if self.verbose: - print( - ' explaining: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.target_name[self.enc.cmaj])) - - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - # adapt the solver to deal with the current sample - self.assums = [] # var selectors to be used as assumptions - self.sel2fid = {} # selectors to original feature ids - self.sel2vid = {} # selectors to categorical feature ids - - # preparing the selectors - for i, (inp, val) in enumerate(zip(self.inps, sample), 1): - feat = inp.split('_')[0] - selv = self.enc.newVar('selv_{0}'.format(feat)) - - self.assums.append(selv) - if selv not in self.sel2fid: - self.sel2fid[selv] = int(feat[1:]) - self.sel2vid[selv] = [i - 1] - else: - self.sel2vid[selv].append(i - 1) - - if not self.enc.intvs: - for inp, val, sel in zip(self.inps, sample, self.assums): - p = self.enc.newVar(inp) - hypo = [-sel, p if val else -p] - print("{0} => {1}".format(self.enc.vpool.obj(sel), inp if val else "!" + inp)) - self.slv.add_clause(hypo) - else: - raise NotImplementedError('Intervals are not supported.') - - self.assums = sorted(set(self.assums)) - # print("selctors: ", self.assums) - - self.slv.solve(assumptions=self.assums) - print("Model1:") - for p in self.slv.get_model(): - # if self.enc.vpool.obj(abs(p)) :and self.enc.vpool.obj(abs(p)) in self.inps: - if self.enc.vpool.obj(abs(p)) and "class" in self.enc.vpool.obj(abs(p)): - print((p, self.enc.vpool.obj(abs(p)))) - print(self.slv.get_model()) - - # forcing a misclassification, i.e. a wrong observation - for k in range(len(self.enc.forest.trees)): - cl = [] - for j in range(self.enc.num_class): - if j != self.enc.cmaj: - cl.append(self.enc.newVar('class{0}_tr{1}'.format(j, k))) - self.slv.add_clause(cl) - - # if satisfiable, then the observation is not implied by the hypotheses - if self.slv.solve(assumptions=self.assums): - print(' no implication!') - print(self.slv.get_model()) - # print("Model: {0}".format([ (p, self.enc.vpool.obj(abs(p))) for p in self.slv.get_model()])) - sys.exit(1) - - if not smallest: - self.compute_minimal() - else: - raise NotImplementedError('Smallest explanation is not yet implemented.') - # self.compute_smallest() - - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.time - - expl = sorted([self.sel2fid[h] for h in self.assums if h > 0]) - print("expl-selctors: ", expl) - - if self.verbose: - self.preamble = [self.preamble[i] for i in expl] - # print("cmaj: ", self.enc.cmaj) - print( - ' explanation: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.target_name[self.enc.cmaj])) - print(' # hypos left:', len(self.assums)) - print(' time: {0:.2f}'.format(self.time)) - - return expl - - def compute_minimal(self): - """ - Compute any subset-minimal explanation. 
- """ - i = 0 - # simple deletion-based linear search - for i, p in enumerate(self.assums): - to_test = self.assums[:i] + self.assums[(i + 1):] + [-p] - # print(to_test) - if self.slv.solve(assumptions=to_test): - self.assums[i] = -p - print("Model:") - for p in self.slv.get_model(): - if self.enc.vpool.obj(abs(p)) and self.enc.vpool.obj(abs(p)) in self.inps: - print((p, self.enc.vpool.obj(abs(p)))) diff --git a/pages/application/RandomForest/utils/xrf/archive/setup.py b/pages/application/RandomForest/utils/xrf/archive/setup.py deleted file mode 100644 index 15c642788ddbea871d9a2a51fa61101d62230808..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xrf/archive/setup.py +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env python3 - -from distutils.core import setup, Extension - -pysortn_ext = Extension('pysortnetwrk', - sources=['pysortnetwrk.cc'], - include_dirs=['sortcard'], - language='c++') - -setup(name='pysortnetwrk', - version='1.0', - description='This module provides a sorting network to sort a vector of bits', - py_modules=['pysortnetwrk'], - ext_modules=[pysortn_ext]) diff --git a/pages/application/RandomForest/utils/xrf/archive/sortcard.hh b/pages/application/RandomForest/utils/xrf/archive/sortcard.hh deleted file mode 100644 index e6410504144682d39d43fcd172ce1c0c10a58f53..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xrf/archive/sortcard.hh +++ /dev/null @@ -1,298 +0,0 @@ - -#ifndef SORTCARD_HH_ -#define SORTCARD_HH_ - -#include <vector> -#include <algorithm> -#include <vector> -#include <ostream> - - -#define NOPTCLS true - - -using namespace std; - -class ClauseSet { -public: - ClauseSet() : clauses(0) {} - ClauseSet(ClauseSet& orig) : clauses(orig.clauses) {} - - void clear() - { - clauses.clear(); - } - - size_t size() - { - return clauses.size(); - } - - void resize(size_t sz_new) - { - return clauses.resize(sz_new); - } - - vector<int>& operator[](size_t i) - { - return clauses[i]; - } - - void erase(vector<int>& cl) - { - clauses.erase(std::find(clauses.begin(), clauses.end(), cl)); - } - - void erase_subset(size_t start, ClauseSet& clset) - { - if (clset.size()) { - vector<int>& cl_first = clset[0]; - vector<vector<int> >::iterator begin = std::find(clauses.begin() + start, clauses.end(), cl_first); - clauses.erase(begin, begin + clset.size()); - } - } - - vector<vector<int> >& get_clauses() - { - return clauses; - } - - void add_clause(vector<int> cl) - { - clauses.push_back(cl); - } - - void add_clause_ref(vector<int>& cl) - { - clauses.push_back(cl); - } - - void create_clause(vector<int>& cl) - { - add_clause(cl); - } - - void create_unit_clause(int l) - { - vector<int> cl; cl.push_back(l); - clauses.push_back(cl); - } - - void create_binary_clause(int l1, int l2) - { - vector<int> cl; - cl.push_back(l1); - cl.push_back(l2); - - clauses.push_back(cl); - } - - void create_ternary_clause(int l1, int l2, int l3) - { - vector<int> cl; - cl.push_back(l1); - cl.push_back(l2); - cl.push_back(l3); - - clauses.push_back(cl); - } - - void dump(ostream& out) - { - for (size_t i = 0; i < clauses.size(); ++i) - dump_clause(out, clauses[i]); - } -private: - void dump_clause(ostream& out, vector<int>& cl) - { - for (size_t i = 0; i < cl.size(); ++i) - out << cl[i] << " "; - out << "0" << endl; - } -protected: - vector<vector<int> > clauses; -}; - - - -// -//============================================================================= -inline void create_vvect(int& top_id, vector<int>& ov, size_t nvars) -{ - 
assert(nvars > 0); - - size_t refnv = ov.size(); - size_t tvars = refnv + nvars; - ov.resize(tvars, 0); - - for (size_t k = refnv; k < tvars; ++k) - ov[k] = ++top_id; - - assert(ov.size() > 0); -} - - -// -//============================================================================= -inline void copy_vvect(int& top_id, vector<int>& ov, vector<int>& iv) -{ - size_t refnv = ov.size(); - ov.resize(refnv + iv.size(), 0); - - for (size_t k = 0; k < iv.size(); ++k) - ov[refnv + k] = iv[k]; - - assert(ov.size() > 0); -} - - - -// -//============================================================================= -inline void mk_half_vect(vector<int>& ov, vector<int>& iv, size_t offset) -{ - assert(iv.size() > 0); - - size_t ns = iv.size() / 2; - ov.resize(ns, 0); - - for (size_t k = 0; k < ns; ++k) - ov[k] = iv[offset + k]; -} - -// -//============================================================================= -inline void mk_odd_vect(vector<int>& ov, vector<int>& iv) -{ - assert(iv.size() > 0); - - size_t ns = iv.size() / 2; - ov.resize(ns, 0); - - for (size_t k = 0; k < ns; ++k) - ov[k] = iv[k * 2]; -} - -//============================================================================= -inline void mk_even_vect(vector<int>& ov, vector<int>& iv) -{ - assert(iv.size() > 0); - - size_t ns = iv.size() / 2; - ov.resize(ns, 0); - - for (size_t k = 0; k < ns; ++k) - ov[k] = iv[k * 2 + 1]; -} - -// sorting networks -//============================================================================= -inline void sortn_half_merge_recur( - int& top_id, - ClauseSet& clset, - vector<int>& av, - vector<int>& bv, - vector<int>& cv, - size_t zvar -) -{ - assert(bv.size() == av.size()); - - if (av.size() == 1) { // vectors of size 1 - assert(av[0] != 0); - if (NOPTCLS || (av[0] != zvar && bv[0] != zvar)) { - create_vvect(top_id, cv, 2); - clset.create_binary_clause (-av[0], cv[0]); - clset.create_binary_clause (-bv[0], cv[0]); - clset.create_ternary_clause(-av[0], -bv[0], cv[1]); - } - else { - if (av[0] == zvar) { - cv.push_back(bv[0]); - cv.push_back(av[0]); - } - else { - assert(bv[0] == zvar); - cv.push_back(av[0]); - cv.push_back(bv[0]); - } - } - } - else { - if (NOPTCLS || - ((av[0] != zvar || av[av.size() - 1] != zvar) && - (bv[0] != zvar || bv[av.size() - 1] != zvar))) { - vector<int> aodd, aeven, bodd, beven, dv, ev; - - mk_odd_vect(aodd, av); mk_even_vect(aeven, av); - mk_odd_vect(bodd, bv); mk_even_vect(beven, bv); - - sortn_half_merge_recur(top_id, clset, aodd, bodd, dv, zvar); - sortn_half_merge_recur(top_id, clset, aeven, beven, ev, zvar); - - assert(cv.size() == 0); - cv.push_back(dv[0]); - create_vvect(top_id, cv, 2 * av.size() - 2); - cv.push_back(ev[ev.size() - 1]); - - for (size_t i = 0; i < av.size() - 1; ++i) { - assert(i + 1 < dv.size()); - assert(i < ev.size()); - assert(2 * 1 + 1 < cv.size()); - - clset.create_binary_clause (-dv[i + 1], cv[2 * i + 1]); - clset.create_binary_clause (-ev[i ], cv[2 * i + 1]); - clset.create_ternary_clause(-dv[i + 1], -ev[i], cv[2 * i + 2]); - } - } - else { - if (av[0] == zvar && av[av.size() - 1] == zvar) { - copy_vvect(top_id, cv, bv); - copy_vvect(top_id, cv, av); - } - else { - assert(bv[0] == zvar && bv[av.size() - 1] == zvar); - copy_vvect(top_id, cv, av); - copy_vvect(top_id, cv, bv); - } - } - } - - assert(cv.size() > 0); -} - -// -//============================================================================= -inline vector<int>& sortn_half_sorter_recur( - int& top_id, - ClauseSet& clset, - vector<int>& av, - vector<int>& cv, - size_t zvar -) -{ - 
assert(av.size() > 1); - - if (av.size() == 2) { - assert(av[0] != 0 && av[1] != 0); - - vector<int> xav, xbv; xav.push_back(av[0]); xbv.push_back(av[1]); - sortn_half_merge_recur(top_id, clset, xav, xbv, cv, zvar); - } - else { - vector<int> dv1, dv2, lav, uav; - mk_half_vect(lav, av, 0); - mk_half_vect(uav, av, av.size() / 2); - - assert(lav.size() == uav.size()); - sortn_half_sorter_recur(top_id, clset, lav, dv1, zvar); assert(dv1.size() > 0); - sortn_half_sorter_recur(top_id, clset, uav, dv2, zvar); assert(dv2.size() > 0); - sortn_half_merge_recur (top_id, clset, dv1, dv2, cv, zvar); - } - - assert(cv.size() > 0); - return cv; -} - - -#endif // SORTCARD_HH_ diff --git a/pages/application/RandomForest/utils/xrf/archive/sortnetwrk.py b/pages/application/RandomForest/utils/xrf/archive/sortnetwrk.py deleted file mode 100644 index f18c26ece6ad9804c0d40f4e584df7e149b075c0..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xrf/archive/sortnetwrk.py +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env python3 -# -from math import exp, log, trunc - -from pysat._utils import MainThread -from pysat.formula import CNF, IDPool -import pysortnetwrk - - - -def HSorNetwrk(lits, top_id=None, vpool=None): - assert not top_id or not vpool, \ - 'Use either a top id or a pool of variables but not both.' - - - # we are going to return this formula - #ret = CNF() - - # if the list of literals is empty, return empty formula - if not lits: - return ret - - # obtaining the top id from the variable pool - if vpool: - top_id = vpool.top - - - # making sure we are dealing with a list of literals - lits = list(lits) - - # choosing the maximum id among the current top and the list of literals - top_id = max(map(lambda x: abs(x), lits + [top_id if top_id != None else 0])) - - - nvars = len(lits) - - #get smallest power of 2 larger than number of vars - exponent = trunc(log(nvars) / log(2)) # assume exponent - nvtmp = exp(log(2) * exponent) - - # check if number of vars already is power of 2; correct exponent if required - exponent = exponent if (nvars - nvtmp < 0.000001) else exponent + 1 - nnvars = trunc(exp(log(2) * exponent) + 0.1) - - cl = None - zvar = 0 - if (nnvars != nvars): - top_id += 1 - zvar = top_id - lits.extend([zvar] * (nnvars - nvars)) - cl = [-zvar] - - # generate odd-even sorting network - clset,slits,top = pysortnetwrk.HSort(lits, top_id, zvar, int(MainThread.check())) - - clset = clset +[cl] if (cl is not None) else clset - - - # updating vpool if necessary - if vpool: - vpool.top = top - 1 - vpool._next() - - - - return clset, slits, top - -if __name__ == '__main__': - print("Sorting Network:") - lits = [1, 2, 3] - top_id = 5 - clauses, slits, top = HSorNetwrk(lits, top_id) - print(clauses) - print(slits) \ No newline at end of file diff --git a/pages/application/RandomForest/utils/xrf/archive/tree.py b/pages/application/RandomForest/utils/xrf/archive/tree.py deleted file mode 100644 index 9e3794544ac8d0e2baded3a4271bab312f4aa86e..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xrf/archive/tree.py +++ /dev/null @@ -1,154 +0,0 @@ -# -#============================================================================== -from anytree import Node, RenderTree,AsciiStyle -import json -import numpy as np -import math -import os - - -# -#============================================================================== -class xgnode(Node): - def __init__(self, id, parent = None): - Node.__init__(self, id, parent) - self.id = id # The node value - 
self.name = None - self.left_node_id = -1 # Left child - self.right_node_id = -1 # Right child - - self.feature = -1 - self.threshold = None - self.values = -1 - #iai - #self.split = None - - def __str__(self): - pref = ' ' * self.depth - if (len(self.children) == 0): - return (pref+ "leaf: {} {}".format(self.id, self.values)) - else: - if(self.name is None): - return (pref+ "{} f{}<{}".format(self.id, self.feature, self.threshold)) - else: - return (pref+ "{} \"{}\"<{}".format(self.id, self.name, self.threshold)) - - -#============================================================================== -def build_tree(tree_, feature_names = None): - ## - feature = tree_.feature - threshold = tree_.threshold - values = tree_.value - n_nodes = tree_.node_count - children_left = tree_.children_left - children_right = tree_.children_right - node_depth = np.zeros(shape=n_nodes, dtype=np.int64) - is_leaf = np.zeros(shape=n_nodes, dtype=bool) - stack = [(0, -1)] # seed is the root node id and its parent depth - while len(stack) > 0: - node_id, parent_depth = stack.pop() - node_depth[node_id] = parent_depth + 1 - - # If we have a test node - if (children_left[node_id] != children_right[node_id]): - stack.append((children_left[node_id], parent_depth + 1)) - stack.append((children_right[node_id], parent_depth + 1)) - else: - is_leaf[node_id] = True - ## - - m = tree_.node_count - assert (m > 0), "Empty tree" - - def extract_data(idx, root = None, feature_names = None): - i = idx - assert (i < m), "Error index node" - if (root is None): - node = xgnode(i) - else: - node = xgnode(i, parent = root) - #node.cover = json_node["cover"] - if is_leaf[i]: - node.values = np.argmax(values[i]) - #if(inverse): - # node.values = -node.values - else: - node.feature = feature[i] - if (feature_names is not None): - node.name = feature_names[feature[i]] - node.threshold = threshold[i] - node.left_node_id = children_left[i] - node.right_node_id = children_right[i] - extract_data(node.left_node_id, node, feature_names) #feat < 0.5 (False) - extract_data(node.right_node_id, node, feature_names) #feat > 0.% (True) - - return node - - root = extract_data(0, None, feature_names) - - return root - - -#============================================================================== -def walk_tree(node): - if (len(node.children) == 0): - # leaf - print(node) - else: - print(node) - walk_tree(node.children[0]) - walk_tree(node.children[1]) - -# -#============================================================================== -def predict_tree(node, sample): - if (len(node.children) == 0): - # leaf - return node.values - else: - feature_branch = node.feature - sample_value = sample[feature_branch] - assert(sample_value is not None) - if(sample_value < node.threshold): - return predict_tree(node.children[0], sample) - else: - return predict_tree(node.children[1], sample) - - -# -#============================================================================== -class Forest: - """ An ensemble of decision trees. - - This object provides a common interface to many different types of models. 
- """ - def __init__(self, rf, feature_names = None): - self.rf = rf - ##self.feature_names = feature_names - self.trees = [ build_tree(dt.tree_, feature_names) for dt in self.rf.estimators()] - #self.print_trees() - - def print_trees(self): - for i,t in enumerate(self.trees): - print("tree number: ", i) - walk_tree(t) - - def predict(self, samples): - predictions = [] - n_estimators = self.rf.n_estimators() - print("#Trees: ", n_estimators) - for sample in np.asarray(samples): - scores = [] - for i,t in enumerate(self.trees): - s = predict_tree(t, sample) - scores.append((s)) - scores = np.asarray(scores) - predictions.append(scores) - predictions = np.asarray(predictions) - #print(predictions) - #np.bincount(x, weights=self._weights_not_none) - maj = np.apply_along_axis(lambda x: np.argmax(np.bincount(x)), axis=1, arr=predictions) - - return maj - diff --git a/pages/application/RandomForest/utils/xrf/rndmforest.py b/pages/application/RandomForest/utils/xrf/rndmforest.py index 583fe91b8dcd0fab92b9be65b30794636497eb7d..187dd280752231383f3c74ab70af9b2d6cf74019 100644 --- a/pages/application/RandomForest/utils/xrf/rndmforest.py +++ b/pages/application/RandomForest/utils/xrf/rndmforest.py @@ -1,4 +1,3 @@ - from sklearn.ensemble._voting import VotingClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.preprocessing import OneHotEncoder, LabelEncoder @@ -16,132 +15,54 @@ import math from pages.application.RandomForest.utils.data import Data from .tree import Forest, predict_tree -#from .encode import SATEncoder +# from .encode import SATEncoder from pysat.formula import CNF, IDPool from pysat.solvers import Solver from pysat.card import CardEnc, EncType from itertools import combinations - - # -#============================================================================== +# ============================================================================== class Dataset(Data): """ Class for representing dataset (transactions). 
""" - def __init__(self, filename=None, fpointer=None, mapfile=None, - separator=' ', use_categorical = False): - super().__init__(filename, fpointer, mapfile, separator, use_categorical) - + + def __init__(self, file=None, mapfile=None, separator=',', use_categorical=False): + super().__init__(file, mapfile, separator, use_categorical) + # split data into X and y self.feature_names = self.names[:-1] self.nb_features = len(self.feature_names) self.use_categorical = use_categorical - + samples = np.asarray(self.samps) - if not all(c.isnumeric() for c in samples[:, -1]): + if not all(c.isnumeric() for c in samples[:, -1]): le = LabelEncoder() le.fit(samples[:, -1]) - samples[:, -1]= le.transform(samples[:, -1]) - #self.class_names = le.classes_ - print(le.classes_) - print(samples[1:4, :]) - + samples[:, -1] = le.transform(samples[:, -1]) + # self.class_names = le.classes_ + samples = np.asarray(samples, dtype=np.float32) self.X = samples[:, 0: self.nb_features] self.y = samples[:, self.nb_features] self.num_class = len(set(self.y)) - self.target_name = list(range(self.num_class)) - - print("c nof features: {0}".format(self.nb_features)) - print("c nof classes: {0}".format(self.num_class)) - print("c nof samples: {0}".format(len(self.samps))) - + self.target_name = list(range(self.num_class)) + # check if we have info about categorical features if (self.use_categorical): - self.target_name = self.class_names - + self.target_name = self.class_names + self.binarizer = {} for i in self.categorical_features: - self.binarizer.update({i: OneHotEncoder(categories='auto', sparse=False)})#, - self.binarizer[i].fit(self.X[:,[i]]) + self.binarizer.update({i: OneHotEncoder(categories='auto', sparse=False)}) # , + self.binarizer[i].fit(self.X[:, [i]]) else: self.categorical_features = [] - self.categorical_names = [] - self.binarizer = [] - #feat map - self.mapping_features() - - - - def train_test_split(self, test_size=0.2, seed=0): - return train_test_split(self.X, self.y, test_size=test_size, random_state=seed) - - - def transform(self, x): - if(len(x) == 0): - return x - if (len(x.shape) == 1): - x = np.expand_dims(x, axis=0) - if (self.use_categorical): - assert(self.binarizer != []) - tx = [] - for i in range(self.nb_features): - #self.binarizer[i].drop = None - if (i in self.categorical_features): - self.binarizer[i].drop = None - tx_aux = self.binarizer[i].transform(x[:,[i]]) - tx_aux = np.vstack(tx_aux) - tx.append(tx_aux) - else: - tx.append(x[:,[i]]) - tx = np.hstack(tx) - return tx - else: - return x - - def transform_inverse(self, x): - if(len(x) == 0): - return x - if (len(x.shape) == 1): - x = np.expand_dims(x, axis=0) - if (self.use_categorical): - assert(self.binarizer != []) - inverse_x = [] - for i, xi in enumerate(x): - inverse_xi = np.zeros(self.nb_features) - for f in range(self.nb_features): - if f in self.categorical_features: - nb_values = len(self.categorical_names[f]) - v = xi[:nb_values] - v = np.expand_dims(v, axis=0) - iv = self.binarizer[f].inverse_transform(v) - inverse_xi[f] =iv - xi = xi[nb_values:] - - else: - inverse_xi[f] = xi[0] - xi = xi[1:] - inverse_x.append(inverse_xi) - return inverse_x - else: - return x - - def transform_inverse_by_index(self, idx): - if (idx in self.extended_feature_names): - return self.extended_feature_names[idx] - else: - print("Warning there is no feature {} in the internal mapping".format(idx)) - return None - - def transform_by_value(self, feat_value_pair): - if (feat_value_pair in self.extended_feature_names.values()): - keys = 
(list(self.extended_feature_names.keys())[list( self.extended_feature_names.values()).index(feat_value_pair)]) - return keys - else: - print("Warning there is no value {} in the internal mapping".format(feat_value_pair)) - return None + self.categorical_names = [] + self.binarizer = [] + # feat map + self.mapping_features() def mapping_features(self): self.extended_feature_names = {} @@ -151,17 +72,18 @@ class Dataset(Data): for i in range(self.nb_features): if (i in self.categorical_features): for j, _ in enumerate(self.binarizer[i].categories_[0]): - self.extended_feature_names.update({counter: (self.feature_names[i], j)}) - self.extended_feature_names_as_array_strings.append("f{}_{}".format(i,j)) # str(self.feature_names[i]), j)) + self.extended_feature_names.update({counter: (self.feature_names[i], j)}) + self.extended_feature_names_as_array_strings.append( + "f{}_{}".format(i, j)) # str(self.feature_names[i]), j)) counter = counter + 1 else: self.extended_feature_names.update({counter: (self.feature_names[i], None)}) - self.extended_feature_names_as_array_strings.append("f{}".format(i)) #(self.feature_names[i]) + self.extended_feature_names_as_array_strings.append("f{}".format(i)) # (self.feature_names[i]) counter = counter + 1 else: for i in range(self.nb_features): self.extended_feature_names.update({counter: (self.feature_names[i], None)}) - self.extended_feature_names_as_array_strings.append("f{}".format(i))#(self.feature_names[i]) + self.extended_feature_names_as_array_strings.append("f{}".format(i)) # (self.feature_names[i]) counter = counter + 1 def readable_sample(self, x): @@ -173,42 +95,43 @@ class Dataset(Data): readable_x.append(v) return np.asarray(readable_x) - - def test_encoding_transformes(self, X_train): - # test encoding - - X = X_train[[0],:] - - print("Sample of length", len(X[0])," : ", X) - enc_X = self.transform(X) - print("Encoded sample of length", len(enc_X[0])," : ", enc_X) - inv_X = self.transform_inverse(enc_X) - print("Back to sample", inv_X) - print("Readable sample", self.readable_sample(inv_X[0])) - assert((inv_X == X).all()) + def transform(self, x): + if(len(x) == 0): + return x + if (len(x.shape) == 1): + x = np.expand_dims(x, axis=0) + if (self.use_categorical): + assert(self.binarizer != []) + tx = [] + for i in range(self.nb_features): + #self.binarizer[i].drop = None + if (i in self.categorical_features): + self.binarizer[i].drop = None + tx_aux = self.binarizer[i].transform(x[:,[i]]) + tx_aux = np.vstack(tx_aux) + tx.append(tx_aux) + else: + tx.append(x[:,[i]]) + tx = np.hstack(tx) + return tx + else: + return x - ''' - for i in range(len(self.extended_feature_names)): - print(i, self.transform_inverse_by_index(i)) - for key, value in self.extended_feature_names.items(): - print(value, self.transform_by_value(value)) - ''' # -#============================================================================== +# ============================================================================== class VotingRF(VotingClassifier): """ Majority rule classifier """ - + def fit(self, X, y, sample_weight=None): self.estimators_ = [] for _, est in self.estimators: self.estimators_.append(est) - + self.le_ = LabelEncoder().fit(y) - self.classes_ = self.le_.classes_ - - + self.classes_ = self.le_.classes_ + def predict(self, X): """Predict class labels for X. Parameters @@ -220,23 +143,23 @@ class VotingRF(VotingClassifier): maj : array-like of shape (n_samples,) Predicted class labels. 
""" - #check_is_fitted(self) - + # check_is_fitted(self) + # 'hard' voting predictions = self._predict(X) - predictions = np.asarray(predictions, np.int64) #NEED TO BE CHECKED + predictions = np.asarray(predictions, np.int64) # NEED TO BE CHECKED maj = np.apply_along_axis( lambda x: np.argmax( np.bincount(x, weights=self._weights_not_none)), axis=1, arr=predictions) - + maj = self.le_.inverse_transform(maj) return maj - - + + # -#============================================================================== +# ============================================================================== class RF2001(object): """ The main class to train Random Forest Classifier (RFC). @@ -245,155 +168,68 @@ class RF2001(object): def __init__(self, options): """ Constructor. - """ + """ self.forest = None self.voting = None self.opt = options - - param_dist = {'n_estimators':options.n_estimators, - 'max_depth':options.maxdepth, - 'criterion':'entropy', - 'random_state':324089} - + + param_dist = {'n_estimators': options.n_estimators, + 'max_depth': options.maxdepth, + 'criterion': 'entropy', + 'random_state': 324089} + self.forest = RandomForestClassifier(**param_dist) - - - - def train(self, dataset, outfile=None): - """ - Train a random forest. - """ - - X_train, X_test, y_train, y_test = dataset.train_test_split() - - if self.opt.verb: - dataset.test_encoding_transformes(X_train) - - X_train = dataset.transform(X_train) - X_test = dataset.transform(X_test) - - print("Build a random forest.") - self.forest.fit(X_train,y_train) - - rtrees = [ ('dt', dt) for i, dt in enumerate(self.forest.estimators_)] - self.voting = VotingRF(estimators=rtrees) - self.voting.fit(X_train,y_train) - - ''' - print(X_test[[0],:]) - print("RF: ",np.asarray(self.voting.predict(X_test[[0],:]))) - for i,t in enumerate(self.forest.estimators_): - print("DT_{0}: {1}".format(i,np.asarray(t.predict(X_test[[0],:])))) - ''' - - train_acc = accuracy_score(self.predict(X_train), y_train) - test_acc = accuracy_score(self.predict(X_test), y_test) - - if self.opt.verb > 1: - self.print_acc_vote(X_train, X_test, y_train, y_test) - self.print_acc_prob(X_train, X_test, y_train, y_test) - - return train_acc, test_acc - + def predict(self, X): return self.voting.predict(X) - + def predict_prob(self, X): self.forest.predict(X) - + def estimators(self): - assert(self.forest.estimators_ is not None) + assert (self.forest.estimators_ is not None) return self.forest.estimators_ - + def n_estimators(self): return self.forest.n_estimators - - def print_acc_vote(self, X_train, X_test, y_train, y_test): - train_acc = accuracy_score(self.predict(X_train), y_train) - test_acc = accuracy_score(self.predict(X_test), y_test) - print("----------------------") - print("RF2001:") - print("Train accuracy RF2001: {0:.2f}".format(100. * train_acc)) - print("Test accuracy RF2001: {0:.2f}".format(100. * test_acc)) - print("----------------------") - - def print_acc_prob(self, X_train, X_test, y_train, y_test): - train_acc = accuracy_score(self.forest.predict(X_train), y_train) - test_acc = accuracy_score(self.forest.predict(X_test), y_test) - print("RF-scikit:") - print("Train accuracy RF-scikit: {0:.2f}".format(100. * train_acc)) - print("Test accuracy RF-scikit: {0:.2f}".format(100. 
* test_acc)) - print("----------------------") - - def print_accuracy(self, data): - _, X_test, _, y_test = data.train_test_split() - #X_train = dataset.transform(X_train) - X_test = data.transform(X_test) - test_acc = accuracy_score(self.predict(X_test), y_test) - #print("----------------------") - #print("Train accuracy : {0:.2f}".format(100. * train_acc)) - #print("Test accuracy : {0:.2f}".format(100. * test_acc)) - print("c Cross-Validation: {0:.2f}".format(100. * test_acc)) - #print("----------------------") + + # -#============================================================================== +# ============================================================================== class XRF(object): """ class to encode and explain Random Forest classifiers. """ - - def __init__(self, options, model, dataset): + + def __init__(self, model, dataset): self.cls = model self.data = dataset - self.verbose = options.verb - self.f = Forest(model, dataset.extended_feature_names_as_array_strings) - - if options.verb > 2: - self.f.print_trees() - print("c RF sz:", self.f.sz) - print('c max-depth:', self.f.md) - print('c nof DTs:', len(self.f.trees)) - - + self.forest = Forest(model, dataset.extended_feature_names_as_array_strings) + + self.enc = SATEncoder(self.forest, self.data.feature_names, self.data.num_class, + self.data.extended_feature_names_as_array_strings) + def encode(self, inst): """ Encode a tree ensemble trained previously. """ if 'f' not in dir(self): self.f = Forest(self.cls, self.data.extended_feature_names_as_array_strings) - #self.f.print_tree() - - time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - self.enc = SATEncoder(self.f, self.data.feature_names, self.data.num_class, \ - self.data.extended_feature_names_as_array_strings) - + + self.enc = SATEncoder(self.forest, self.data.feature_names, self.data.num_class, + self.data.extended_feature_names_as_array_strings) + inst = self.data.transform(np.array(inst))[0] formula, _, _, _ = self.enc.encode(inst) - - time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - time - - if self.verbose: - print('c nof vars:', formula.nv) # number of variables - print('c nof clauses:', len(formula.clauses)) # number of clauses - print('c encoding time: {0:.3f}'.format(time)) - - def explain(self, inst): + + def explain_sample(self, inst): """ Explain a prediction made for a given sample with a previously trained RF. 
""" - - time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - + if 'enc' not in dir(self): self.encode(inst) - - #if self.verbose: - # print("instance: {0}".format(np.array(inst)) ) inpvals = self.data.readable_sample(inst) preamble = [] @@ -402,73 +238,96 @@ class XRF(object): preamble.append('{0} = {1}'.format(f, v)) else: preamble.append(v) - - inps = self.data.extended_feature_names_as_array_strings # input (feature value) variables - #print("inps: {0}".format(inps)) - - self.x = SATExplainer(self.enc, inps, preamble, self.data.target_name, verb=self.verbose) - inst = self.data.transform(np.array(inst))[0] + + inps = self.data.extended_feature_names_as_array_strings # input (feature value) variables + # print("inps: {0}".format(inps)) + + self.x = SATExplainer(self.enc, inps, preamble, self.data.target_name) + inst = self.data.transform(np.array(inst))[0] expl = self.x.explain(np.array(inst)) - time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - time - - if self.verbose: - print("c Total time: {0:.3f}".format(time)) - - return expl + return expl - - def test_tree_ensemble(self): - if 'f' not in dir(self): - self.f = Forest(self.cls) - - _, X_test, _, y_test = self.data.train_test_split() - X_test = self.data.transform(X_test) - - y_pred_forest = self.f.predict(X_test) - acc = accuracy_score(y_pred_forest, y_test) - print("Test accuracy: {0:.2f}".format(100. * acc)) - - y_pred_cls = self.cls.predict(X_test) - #print(np.asarray(y_pred_cls, np.int64)) - #print(y_pred_forest) - - assert((y_pred_cls == y_pred_forest).all()) - - -# -#============================================================================== + # copie de git + def explain(self, samples): + # timers + lengths = [] + tested = set() + mSAT, mUNSAT = 0.0, 0.0 + stimes = [] + utimes = [] + nSatCalls = [] + nUnsCalls = [] + + ltimes = [] + ctimes = [] + wins = 0 + + multiple_expls = [] + for i, s in enumerate(samples): + sample = [float(v.strip()) for v in s.split(',')] + + if tuple(sample) in tested: + continue + + # print("inst#{0}".format(i+1)) + + tested.add(tuple(sample)) + # print('sample {0}: {1}'.format(i, ','.join(s.strip().split(',')))) + + self.encode(sample) + expl = self.explain_sample(sample) + multiple_expls.append(expl) + lengths.append(len(expl)) + + nvars = self.enc.cnf.nv + nclauses = len(self.enc.cnf.clauses) + + # mSAT = max(xrf.x.stimes+[mSAT]) + # mUNSAT = max(xrf.x.utimes+[mUNSAT]) + if len(self.x.stimes): + stimes.append(max(self.x.stimes)) + if len(self.x.utimes): + utimes.append(max(self.x.utimes)) + nSatCalls.append(self.x.nsat) + nUnsCalls.append(self.x.nunsat) + + del self.enc + del self.x + + return multiple_expls + + + # ============================================================================== class SATEncoder(object): """ Encoder of Random Forest classifier into SAT. 
""" - - def __init__(self, forest, feats, nof_classes, extended_feature_names, from_file=None): + + def __init__(self, forest, feats, nof_classes, extended_feature_names, from_file=None): self.forest = forest - #self.feats = {f: i for i, f in enumerate(feats)} + # self.feats = {f: i for i, f in enumerate(feats)} self.num_class = nof_classes self.vpool = IDPool() self.extended_feature_names = extended_feature_names - - #encoding formula + + # encoding formula self.cnf = None # for interval-based encoding self.intvs, self.imaps, self.ivars, self.thvars = None, None, None, None - - + def newVar(self, name): - - if name in self.vpool.obj2id: #var has been already created + + if name in self.vpool.obj2id: # var has been already created return self.vpool.obj2id[name] - + var = self.vpool.id('{0}'.format(name)) return var - + def printLits(self, lits): - print(["{0}{1}".format("-" if p<0 else "",self.vpool.obj(abs(p))) for p in lits]) - + print(["{0}{1}".format("-" if p < 0 else "", self.vpool.obj(abs(p))) for p in lits]) + def traverse(self, tree, k, clause): """ Traverse a tree and encode each node. @@ -484,21 +343,20 @@ class SATEncoder(object): else: var = self.newVar(tree.name) pos, neg = var, -var - #print("{0} => {1}".format(tree.name, var)) - + # print("{0} => {1}".format(tree.name, var)) + assert (pos and neg) self.traverse(tree.children[0], k, clause + [-neg]) - self.traverse(tree.children[1], k, clause + [-pos]) + self.traverse(tree.children[1], k, clause + [-pos]) else: # leaf node - cvar = self.newVar('class{0}_tr{1}'.format(tree.values,k)) + cvar = self.newVar('class{0}_tr{1}'.format(tree.values, k)) self.cnf.append(clause + [cvar]) - #self.printLits(clause + [cvar]) + # self.printLits(clause + [cvar]) def compute_intervals(self): """ Traverse all trees in the ensemble and extract intervals for each feature. - At this point, the method only works for numerical datasets! """ @@ -521,9 +379,10 @@ class SATEncoder(object): for tree in self.forest.trees: traverse_intervals(tree) - + # OK, we got all intervals; let's sort the values - self.intvs = {f: sorted(self.intvs[f]) + ([math.inf] if len(self.intvs[f]) else []) for f in six.iterkeys(self.intvs)} + self.intvs = {f: sorted(self.intvs[f]) + ([math.inf] if len(self.intvs[f]) else []) for f in + six.iterkeys(self.intvs)} self.imaps, self.ivars = {}, {} self.thvars = {} @@ -536,195 +395,183 @@ class SATEncoder(object): ivar = self.newVar('{0}_intv{1}'.format(feat, i)) self.ivars[feat].append(ivar) - #print('{0}_intv{1}'.format(feat, i)) - + # print('{0}_intv{1}'.format(feat, i)) + if ub != math.inf: - #assert(i < len(intvs)-1) + # assert(i < len(intvs)-1) thvar = self.newVar('{0}_th{1}'.format(feat, i)) self.thvars[feat].append(thvar) - #print('{0}_th{1}'.format(feat, i)) - - + # print('{0}_th{1}'.format(feat, i)) def encode(self, sample): """ Do the job. 
""" - + ###print('Encode RF into SAT ...') self.cnf = CNF() # getting a tree ensemble - #self.forest = Forest(self.model, self.extended_feature_names) + # self.forest = Forest(self.model, self.extended_feature_names) num_tree = len(self.forest.trees) self.forest.predict_inst(sample) - #introducing class variables - #cvars = [self.newVar('class{0}'.format(i)) for i in range(self.num_class)] - + # introducing class variables + # cvars = [self.newVar('class{0}'.format(i)) for i in range(self.num_class)] + # define Tautology var vtaut = self.newVar('Tautology') - + # introducing class-tree variables ctvars = [[] for t in range(num_tree)] for k in range(num_tree): for j in range(self.num_class): - var = self.newVar('class{0}_tr{1}'.format(j,k)) - ctvars[k].append(var) + var = self.newVar('class{0}_tr{1}'.format(j, k)) + ctvars[k].append(var) - # traverse all trees and extract all possible intervals + # traverse all trees and extract all possible intervals # for each feature ###print("compute intervarls ...") self.compute_intervals() - - #print(self.intvs) - #print([len(self.intvs[f]) for f in self.intvs]) - #print(self.imaps) - #print(self.ivars) - #print(self.thvars) - #print(ctvars) - - + + # print(self.intvs) + # print([len(self.intvs[f]) for f in self.intvs]) + # print(self.imaps) + # print(self.ivars) + # print(self.thvars) + # print(ctvars) + ##print("encode trees ...") # traversing and encoding each tree for k, tree in enumerate(self.forest.trees): - #print("Encode tree#{0}".format(k)) - # encoding the tree + # print("Encode tree#{0}".format(k)) + # encoding the tree self.traverse(tree, k, []) # exactly one class var is true - #self.printLits(ctvars[k]) - card = CardEnc.atmost(lits=ctvars[k], vpool=self.vpool,encoding=EncType.cardnetwrk) + # self.printLits(ctvars[k]) + card = CardEnc.atmost(lits=ctvars[k], vpool=self.vpool, encoding=EncType.cardnetwrk) self.cnf.extend(card.clauses) - - - - # calculate the majority class - self.cmaj = self.forest.predict_inst(sample) - - ##print("encode majority class ...") - #Cardinality constraint AtMostK to capture a j_th class - - if(self.num_class == 2): + + # calculate the majority class + self.cmaj = self.forest.predict_inst(sample) + + ##print("encode majority class ...") + # Cardinality constraint AtMostK to capture a j_th class + + if (self.num_class == 2): rhs = math.floor(num_tree / 2) + 1 - if(self.cmaj==1 and not num_tree%2): - rhs = math.floor(num_tree / 2) + if (self.cmaj == 1 and not num_tree % 2): + rhs = math.floor(num_tree / 2) lhs = [ctvars[k][1 - self.cmaj] for k in range(num_tree)] - atls = CardEnc.atleast(lits = lhs, bound = rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) + atls = CardEnc.atleast(lits=lhs, bound=rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) self.cnf.extend(atls) - else: + else: zvars = [] - zvars.append([self.newVar('z_0_{0}'.format(k)) for k in range (num_tree) ]) - zvars.append([self.newVar('z_1_{0}'.format(k)) for k in range (num_tree) ]) + zvars.append([self.newVar('z_0_{0}'.format(k)) for k in range(num_tree)]) + zvars.append([self.newVar('z_1_{0}'.format(k)) for k in range(num_tree)]) ## rhs = num_tree - lhs0 = zvars[0] + [ - ctvars[k][self.cmaj] for k in range(num_tree)] + lhs0 = zvars[0] + [- ctvars[k][self.cmaj] for k in range(num_tree)] ##self.printLits(lhs0) - atls = CardEnc.atleast(lits = lhs0, bound = rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) + atls = CardEnc.atleast(lits=lhs0, bound=rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) self.cnf.extend(atls) ## - #rhs = num_tree - 1 + 
# rhs = num_tree - 1 rhs = num_tree + 1 ########### - lhs1 = zvars[1] + [ - ctvars[k][self.cmaj] for k in range(num_tree)] + lhs1 = zvars[1] + [- ctvars[k][self.cmaj] for k in range(num_tree)] ##self.printLits(lhs1) - atls = CardEnc.atleast(lits = lhs1, bound = rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) - self.cnf.extend(atls) + atls = CardEnc.atleast(lits=lhs1, bound=rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) + self.cnf.extend(atls) # pvars = [self.newVar('p_{0}'.format(k)) for k in range(self.num_class + 1)] ##self.printLits(pvars) - for k,p in enumerate(pvars): + for k, p in enumerate(pvars): for i in range(num_tree): if k == 0: z = zvars[0][i] - #self.cnf.append([-p, -z, vtaut]) - self.cnf.append([-p, z, -vtaut]) - #self.printLits([-p, z, -vtaut]) - #print() - elif k == self.cmaj+1: + # self.cnf.append([-p, -z, vtaut]) + self.cnf.append([-p, z, -vtaut]) + # self.printLits([-p, z, -vtaut]) + # print() + elif k == self.cmaj + 1: z = zvars[1][i] - self.cnf.append([-p, z, -vtaut]) - - #self.printLits([-p, z, -vtaut]) - #print() - + self.cnf.append([-p, z, -vtaut]) + + # self.printLits([-p, z, -vtaut]) + # print() + else: - z = zvars[0][i] if (k<self.cmaj+1) else zvars[1][i] - self.cnf.append([-p, -z, ctvars[i][k-1] ]) - self.cnf.append([-p, z, -ctvars[i][k-1] ]) - - #self.printLits([-p, -z, ctvars[i][k-1] ]) - #self.printLits([-p, z, -ctvars[i][k-1] ]) - #print() - + z = zvars[0][i] if (k < self.cmaj + 1) else zvars[1][i] + self.cnf.append([-p, -z, ctvars[i][k - 1]]) + self.cnf.append([-p, z, -ctvars[i][k - 1]]) + + # self.printLits([-p, -z, ctvars[i][k-1] ]) + # self.printLits([-p, z, -ctvars[i][k-1] ]) + # print() + # - self.cnf.append([-pvars[0], -pvars[self.cmaj+1]]) + self.cnf.append([-pvars[0], -pvars[self.cmaj + 1]]) ## - lhs1 = pvars[:(self.cmaj+1)] + lhs1 = pvars[:(self.cmaj + 1)] ##self.printLits(lhs1) - eqls = CardEnc.equals(lits = lhs1, bound = 1, vpool=self.vpool, encoding=EncType.cardnetwrk) + eqls = CardEnc.equals(lits=lhs1, bound=1, vpool=self.vpool, encoding=EncType.cardnetwrk) self.cnf.extend(eqls) - - + lhs2 = pvars[(self.cmaj + 1):] ##self.printLits(lhs2) - eqls = CardEnc.equals(lits = lhs2, bound = 1, vpool=self.vpool, encoding=EncType.cardnetwrk) + eqls = CardEnc.equals(lits=lhs2, bound=1, vpool=self.vpool, encoding=EncType.cardnetwrk) self.cnf.extend(eqls) - - - + ##print("exactly-one feat const ...") # enforce exactly one of the feature values to be chosen # (for categorical features) categories = collections.defaultdict(lambda: []) for f in self.extended_feature_names: if '_' in f: - categories[f.split('_')[0]].append(self.newVar(f)) + categories[f.split('_')[0]].append(self.newVar(f)) for c, feats in six.iteritems(categories): # exactly-one feat is True self.cnf.append(feats) card = CardEnc.atmost(lits=feats, vpool=self.vpool, encoding=EncType.cardnetwrk) self.cnf.extend(card.clauses) - # lits of intervals + # lits of intervals for f, intvs in six.iteritems(self.ivars): if not len(intvs): continue - self.cnf.append(intvs) + self.cnf.append(intvs) card = CardEnc.atmost(lits=intvs, vpool=self.vpool, encoding=EncType.cardnetwrk) self.cnf.extend(card.clauses) - #self.printLits(intvs) - - - + # self.printLits(intvs) + for f, threshold in six.iteritems(self.thvars): for j, thvar in enumerate(threshold): - d = j+1 - pos, neg = self.ivars[f][d:], self.ivars[f][:d] - + d = j + 1 + pos, neg = self.ivars[f][d:], self.ivars[f][:d] + if j == 0: - assert(len(neg) == 1) + assert (len(neg) == 1) self.cnf.append([thvar, neg[-1]]) self.cnf.append([-thvar, 
-neg[-1]]) else: - self.cnf.append([thvar, neg[-1], -threshold[j-1]]) - self.cnf.append([-thvar, threshold[j-1]]) + self.cnf.append([thvar, neg[-1], -threshold[j - 1]]) + self.cnf.append([-thvar, threshold[j - 1]]) self.cnf.append([-thvar, -neg[-1]]) - + if j == len(threshold) - 1: - assert(len(pos) == 1) + assert (len(pos) == 1) self.cnf.append([-thvar, pos[0]]) self.cnf.append([thvar, -pos[0]]) else: - self.cnf.append([-thvar, pos[0], threshold[j+1]]) + self.cnf.append([-thvar, pos[0], threshold[j + 1]]) self.cnf.append([thvar, -pos[0]]) - self.cnf.append([thvar, -threshold[j+1]]) - + self.cnf.append([thvar, -threshold[j + 1]]) - return self.cnf, self.intvs, self.imaps, self.ivars # -#============================================================================== +# ============================================================================== class SATExplainer(object): """ An SAT-inspired minimal explanation extractor for Random Forest models. @@ -734,125 +581,121 @@ class SATExplainer(object): """ Constructor. """ - + self.enc = sat_enc self.inps = inps # input (feature value) variables self.target_name = target_name self.preamble = preamble self.verbose = verb - - self.slv = None + + self.slv = None ##self.slv = Solver(name=options.solver) ##self.slv = Solver(name="minisat22") - #self.slv = Solver(name="glucose3") + # self.slv = Solver(name="glucose3") # CNF formula - #self.slv.append_formula(self.enc.cnf) + # self.slv.append_formula(self.enc.cnf) - def explain(self, sample, smallest=False): """ Hypotheses minimization. """ - if self.verbose: - print(' explaining: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.target_name[self.enc.cmaj])) - - #create a SAT solver - self.slv = Solver(name="glucose3") - - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime + expl = {} + expl["Instance"] = '"IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.target_name[self.enc.cmaj]) + + # create a SAT solver + self.slv = Solver(name="glucose3") # adapt the solver to deal with the current sample - #self.csel = [] + # self.csel = [] self.assums = [] # var selectors to be used as assumptions self.sel2fid = {} # selectors to original feature ids self.sel2vid = {} # selectors to categorical feature ids - self.sel2v = {} # selectors to (categorical/interval) values - - #for i in range(self.enc.num_class): + self.sel2v = {} # selectors to (categorical/interval) values + + # for i in range(self.enc.num_class): # self.csel.append(self.enc.newVar('class{0}'.format(i))) - #self.csel = self.enc.newVar('class{0}'.format(self.enc.cmaj)) - + # self.csel = self.enc.newVar('class{0}'.format(self.enc.cmaj)) + # preparing the selectors for i, (inp, val) in enumerate(zip(self.inps, sample), 1): if '_' in inp: - + assert (inp not in self.enc.intvs) - + feat = inp.split('_')[0] selv = self.enc.newVar('selv_{0}'.format(feat)) - - self.assums.append(selv) + + self.assums.append(selv) if selv not in self.sel2fid: self.sel2fid[selv] = int(feat[1:]) self.sel2vid[selv] = [i - 1] else: self.sel2vid[selv].append(i - 1) - - p = self.enc.newVar(inp) + + p = self.enc.newVar(inp) if not val: p = -p else: self.sel2v[selv] = p - + self.enc.cnf.append([-selv, p]) - - #self.enc.printLits([-selv, p]) - + + # self.enc.printLits([-selv, p]) + elif len(self.enc.intvs[inp]): - #v = None - #for intv in self.enc.intvs[inp]: + # v = None + # for intv in self.enc.intvs[inp]: # if intv > val: # v = intv - # break - v = next((intv for intv in 
self.enc.intvs[inp] if intv > val), None) - assert(v is not None) - - selv = self.enc.newVar('selv_{0}'.format(inp)) - self.assums.append(selv) - + # break + v = next((intv for intv in self.enc.intvs[inp] if intv > val), None) + assert (v is not None) + + selv = self.enc.newVar('selv_{0}'.format(inp)) + self.assums.append(selv) + assert (selv not in self.sel2fid) self.sel2fid[selv] = int(inp[1:]) self.sel2vid[selv] = [i - 1] - - for j,p in enumerate(self.enc.ivars[inp]): + + for j, p in enumerate(self.enc.ivars[inp]): cl = [-selv] if j == self.enc.imaps[inp][v]: cl += [p] self.sel2v[selv] = p else: cl += [-p] - + self.enc.cnf.append(cl) - #self.enc.printLits(cl) + # self.enc.printLits(cl) ''' with open("/tmp/pendigits.cnf", 'w') as fp: fp.write('p cnf {0} {1}\n'.format(self.enc.cnf.nv, len(self.enc.cnf.clauses))) for p in self.assums + [-self.csel]: fp.write('{0} 0\n'.format(str(p))) - + for cl in self.enc.cnf.clauses: fp.write(' '.join([str(p) for p in cl+[0]])) fp.write('\n') fp.close() print(self.assums + [self.csel]) ''' - + self.assums = sorted(set(self.assums)) if self.verbose: - print(' # hypos:', len(self.assums)) - - # pass a CNF formula - self.slv.append_formula(self.enc.cnf) - + expl['hypos:'] = len(self.assums) + + # pass a CNF formula + self.slv.append_formula(self.enc.cnf) + ''' # if unsat, then the observation is not implied by the assumptions if not self.slv.solve(assumptions=self.assums+[self.csel]): print(' no implication!') print(self.slv.get_core()) sys.exit(1) - + if self.verbose > 1: self.enc.printLits(self.assums+[self.csel]) self.print_sat_model() @@ -862,26 +705,21 @@ class SATExplainer(object): self.compute_minimal() else: raise NotImplementedError('Smallest explanation is not yet implemented.') - #self.compute_smallest() + # self.compute_smallest() - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.time + explanation = sorted([self.sel2fid[h] for h in self.assums if h > 0]) + assert len(explanation), 'PI-explanation cannot be an empty-set! otherwise the RF predicts only one class' - expl = sorted([self.sel2fid[h] for h in self.assums if h>0 ]) - assert len(expl), 'PI-explanation cannot be an empty-set! 
otherwise the RF would always predict a single class'
-
        # delete sat solver
        self.slv.delete()
        self.slv = None
-        if self.verbose:
-            print("expl-selctors: ", expl)
-            self.preamble = [self.preamble[i] for i in expl]
-            print('  explanation: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.target_name[self.enc.cmaj]))
-            print('  # hypos left:', len(expl))
-            print('  time: {0:.3f}'.format(self.time))
+        expl["expl selectors:"] = explanation
+        self.preamble = [self.preamble[i] for i in explanation]
+        expl["explanation:"] = "IF {0} THEN {1}".format(' AND '.join(self.preamble), self.target_name[self.enc.cmaj])
+        expl["hypos left:"] = len(explanation)

-        return expl
+        return expl

    def compute_minimal(self):
        """
@@ -889,27 +727,27 @@ class SATExplainer(object):
        """
        nsat, nunsat = 0, 0
        stimes, utimes = [], []
-
+
        vtaut = self.enc.newVar('Tautology')
-
+
        # simple deletion-based linear search
        for i, p in enumerate(self.assums):
            to_test = [vtaut] + self.assums[:i] + self.assums[(i + 1):] + [-p, -self.sel2v[p]]
-
+
            t0 = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
-                resource.getrusage(resource.RUSAGE_SELF).ru_utime
-
+                resource.getrusage(resource.RUSAGE_SELF).ru_utime
+
            sat = self.slv.solve(assumptions=to_test)
-
+
            if not sat:
-                self.assums[i] = -p
+                self.assums[i] = -p
            elif self.verbose > 1:
                self.print_sat_model()
-
+
            t = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
                resource.getrusage(resource.RUSAGE_SELF).ru_utime - t0
-            #print("{0} {1:.2f}s".format("SAT" if sat else "UNSAT", t))
-
+            # print("{0} {1:.2f}s".format("SAT" if sat else "UNSAT", t))
+
            if sat:
                nsat += 1
                stimes.append(t)
@@ -917,60 +755,58 @@ class SATExplainer(object):
                    self.enc.printLits(to_test)
                    print("SAT")
            else:
-                #print("Core: ",self.slv.get_core())
+                # print("Core: ",self.slv.get_core())
                nunsat += 1
                utimes.append(t)
                if self.verbose > 1:
                    self.enc.printLits(to_test)
-                    print("UNSAT")
+                    print("UNSAT")
        if self.verbose:
            print('')
-            print('#SAT: {0} | #UNSAT: {1}'.format(len(stimes), len(utimes)))
-            if(nsat):
+            print('#SAT: {0} | #UNSAT: {1}'.format(len(stimes), len(utimes)))
+            if (nsat):
                print('SAT: tot: {0:.2f} | m: {1:.2f} | M: {2:.2f} | avg: {3:.2f}'.format(
-                    sum(stimes), min(stimes), max(stimes), sum(stimes) / len(stimes)))
-            if(nunsat):
+                    sum(stimes), min(stimes), max(stimes), sum(stimes) / len(stimes)))
+            if (nunsat):
                print('UNSAT: tot: {0:.2f} | m: {1:.2f} | M: {2:.2f} | avg: {3:.2f}'.format(
-                    sum(utimes), min(utimes), max(utimes), sum(utimes) / len(utimes)))
-            print('')
-
+                    sum(utimes), min(utimes), max(utimes), sum(utimes) / len(utimes)))
+            print('')
+
        self.stimes, self.utimes = stimes, utimes
        self.nsat, self.nunsat = nsat, nunsat
-
-
+
    def print_sat_model(self):
-        assert(self.slv.get_model())
-        model = [ p for p in self.slv.get_model() if self.enc.vpool.obj(abs(p)) ]
+        assert (self.slv.get_model())
+        model = [p for p in self.slv.get_model() if self.enc.vpool.obj(abs(p))]
        str_model = []
        lits = []
        for p in model:
            if self.enc.vpool.obj(abs(p)) in self.inps:
                str_model.append((p, self.enc.vpool.obj(abs(p))))
-
+
            elif ("class" in self.enc.vpool.obj(abs(p))):
-                str_model.append((p, self.enc.vpool.obj(abs(p))))
-
-            #elif ("intv" in self.enc.vpool.obj(abs(p))) :
            #    str_model.append((p, self.enc.vpool.obj(abs(p))))
-
-            if ("_tr" in self.enc.vpool.obj(abs(p))) :
-                lits.append(p)
-
-            if ("p_" in self.enc.vpool.obj(abs(p))) :
-                str_model.append((p, self.enc.vpool.obj(abs(p))))
-            if ("z_" in self.enc.vpool.obj(abs(p))) :
-
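# ---------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the diff above): the
# compute_minimal method is a deletion-based linear search over assumption
# selectors. The same loop on a toy formula, with made-up variable numbers
# (1 and 2 are hypothesis selectors, 3 is the predicted class), assuming
# PySAT is installed:
from pysat.formula import CNF
from pysat.solvers import Solver

cnf = CNF(from_clauses=[[-1, 3], [-2, 3]])  # h1 -> y and h2 -> y
assums = [1, 2]                             # both hypotheses enabled at first
with Solver(name="glucose3", bootstrap_with=cnf) as slv:
    for i, p in enumerate(assums):
        # negate hypothesis p and ask for a different prediction (-3);
        # UNSAT means p is redundant, so it stays dropped
        if not slv.solve(assumptions=assums[:i] + assums[i + 1:] + [-p, -3]):
            assums[i] = -p
print([p for p in assums if p > 0])  # -> [2], a subset-minimal explanation
# ---------------------------------------------------------------------------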
str_model.append((p, self.enc.vpool.obj(abs(p)))) - + + if ("_tr" in self.enc.vpool.obj(abs(p))): + lits.append(p) + + if ("p_" in self.enc.vpool.obj(abs(p))): + str_model.append((p, self.enc.vpool.obj(abs(p)))) + if ("z_" in self.enc.vpool.obj(abs(p))): + str_model.append((p, self.enc.vpool.obj(abs(p)))) + print("Model:", str_model) - ###print(self.slv.get_model()) - + ###print(self.slv.get_model()) + num_tree = len(self.enc.forest.trees) num_class = self.enc.num_class - occ = [0]*num_class - + occ = [0] * num_class + for p in lits: if p > 0: j = int(self.enc.vpool.obj(abs(p))[5]) - occ[j] +=1 - print(occ) - + occ[j] += 1 + print(occ) \ No newline at end of file diff --git a/pages/application/RandomForest/utils/xrf/tree.py b/pages/application/RandomForest/utils/xrf/tree.py index da81c9820d69d96061446e9d1eafbcb265bf1351..bcac5b2721f9640cbbca33699855168ed172e325 100644 --- a/pages/application/RandomForest/utils/xrf/tree.py +++ b/pages/application/RandomForest/utils/xrf/tree.py @@ -1,6 +1,6 @@ # -#============================================================================== -from anytree import Node, RenderTree,AsciiStyle +# ============================================================================== +from anytree import Node, RenderTree, AsciiStyle import json import numpy as np import math @@ -8,34 +8,34 @@ import os # -#============================================================================== +# ============================================================================== class dt_node(Node): - def __init__(self, id, parent = None): + def __init__(self, id, parent=None): Node.__init__(self, id, parent) self.id = id # The node value self.name = None - self.left_node_id = -1 # Left child + self.left_node_id = -1 # Left child self.right_node_id = -1 # Right child self.feature = -1 self.threshold = None - self.values = -1 - #iai - #self.split = None + self.values = -1 + # iai + # self.split = None def __str__(self): pref = ' ' * self.depth if len(self.children) == 0: - return (pref+ "leaf: {} {}".format(self.id, self.values)) + return (pref + "leaf: {} {}".format(self.id, self.values)) else: - if(self.name is None): - return (pref+ "{} f{}<{}".format(self.id, self.feature, self.threshold)) + if (self.name is None): + return (pref + "{} f{}<{}".format(self.id, self.feature, self.threshold)) else: - return (pref+ "{} \"{}\"<{}".format(self.id, self.name, self.threshold)) + return (pref + "{} \"{}\"<{}".format(self.id, self.name, self.threshold)) -#============================================================================== -def build_tree(tree_, feature_names = None): +# ============================================================================== +def build_tree(tree_, feature_names=None): ## feature = tree_.feature threshold = tree_.threshold @@ -49,29 +49,29 @@ def build_tree(tree_, feature_names = None): while len(stack) > 0: node_id, parent_depth = stack.pop() node_depth[node_id] = parent_depth + 1 - + # If we have a test node if (children_left[node_id] != children_right[node_id]): stack.append((children_left[node_id], parent_depth + 1)) stack.append((children_right[node_id], parent_depth + 1)) else: - is_leaf[node_id] = True - ## - - m = tree_.node_count + is_leaf[node_id] = True + ## + + m = tree_.node_count assert (m > 0), "Empty tree" - - def extract_data(idx, root = None, feature_names = None): + + def extract_data(idx, root=None, feature_names=None): i = idx assert (i < m), "Error index node" if (root is None): node = dt_node(i) else: - node = dt_node(i, parent = root) - 
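# ---------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the diff above): build_tree
# walks scikit-learn's flat tree_ arrays, where node i is a leaf exactly when
# children_left[i] == children_right[i] (== -1). A minimal look at that
# layout, on made-up data:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])
t = DecisionTreeClassifier(random_state=0).fit(X, y).tree_
# one split node (feature 0, threshold 1.5) and two leaves
print(t.node_count, t.children_left, t.children_right, t.feature, t.threshold)
# ---------------------------------------------------------------------------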
#node.cover = json_node["cover"] + node = dt_node(i, parent=root) + # node.cover = json_node["cover"] if is_leaf[i]: node.values = np.argmax(values[i]) - #if(inverse): + # if(inverse): # node.values = -node.values else: node.feature = feature[i] @@ -80,17 +80,17 @@ def build_tree(tree_, feature_names = None): node.threshold = threshold[i] node.left_node_id = children_left[i] node.right_node_id = children_right[i] - extract_data(node.left_node_id, node, feature_names) #feat < threshold ( < 0.5 False) - extract_data(node.right_node_id, node, feature_names) #feat >= threshold ( >= 0.5 True) + extract_data(node.left_node_id, node, feature_names) # feat < threshold ( < 0.5 False) + extract_data(node.right_node_id, node, feature_names) # feat >= threshold ( >= 0.5 True) return node - + root = extract_data(0, None, feature_names) - + return root -#============================================================================== +# ============================================================================== def walk_tree(node): if (len(node.children) == 0): # leaf @@ -100,17 +100,20 @@ def walk_tree(node): walk_tree(node.children[0]) walk_tree(node.children[1]) + def count_nodes(root): def count(node): if len(node.children): - return sum([1+count(n) for n in node.children]) + return sum([1 + count(n) for n in node.children]) else: return 0 + m = count(root) + 1 return m + # -#============================================================================== +# ============================================================================== def predict_tree(node, sample): if (len(node.children) == 0): # leaf @@ -118,33 +121,33 @@ def predict_tree(node, sample): else: feature_branch = node.feature sample_value = sample[feature_branch] - assert(sample_value is not None) - if(sample_value < node.threshold): + assert (sample_value is not None) + if (sample_value < node.threshold): return predict_tree(node.children[0], sample) else: return predict_tree(node.children[1], sample) - + # -#============================================================================== +# ============================================================================== class Forest: """ An ensemble of decision trees. This object provides a common interface to many different types of models. 
""" - def __init__(self, rf, feature_names = None): - #self.rf = rf - self.trees = [ build_tree(dt.tree_, feature_names) for dt in rf.estimators()] + + def __init__(self, rf, feature_names=None): + # self.rf = rf + + self.trees = [build_tree(dt.tree_, feature_names) for dt in rf.estimators()] self.sz = sum([dt.tree_.node_count for dt in rf.estimators()]) self.md = max([dt.tree_.max_depth for dt in rf.estimators()]) #### nb_nodes = [dt.tree_.node_count for dt in rf.estimators()] - print("min: {0} | max: {1}".format(min(nb_nodes), max(nb_nodes))) - assert([dt.tree_.node_count for dt in rf.estimators()] == [count_nodes(dt) for dt in self.trees]) - #self.print_trees() - + assert ([dt.tree_.node_count for dt in rf.estimators()] == [count_nodes(dt) for dt in self.trees]) + def print_trees(self): - for i,t in enumerate(self.trees): + for i, t in enumerate(self.trees): print("tree number: ", i) walk_tree(t) @@ -153,22 +156,20 @@ class Forest: scores = np.asarray(scores) maj = np.argmax(np.bincount(scores)) return maj - - - def predict(self, samples): + + def predict(self, samples): predictions = [] print("#Trees: ", len(self.trees)) for sample in np.asarray(samples): scores = [] - for i,t in enumerate(self.trees): + for i, t in enumerate(self.trees): s = predict_tree(t, sample) scores.append((s)) scores = np.asarray(scores) predictions.append(scores) - predictions = np.asarray(predictions) - #print(predictions) - #np.bincount(x, weights=self._weights_not_none) + predictions = np.asarray(predictions) + # print(predictions) + # np.bincount(x, weights=self._weights_not_none) maj = np.apply_along_axis(lambda x: np.argmax(np.bincount(x)), axis=1, arr=predictions) - - return maj + return maj diff --git a/utils.py b/utils.py index 87f4fa3915fff6f3e7fc443f85f0d483cf0726b2..4b76ea3834d9fc79d7147e5a0a245e4e4a50858d 100644 --- a/utils.py +++ b/utils.py @@ -20,10 +20,11 @@ def parse_contents_graph(contents, filename): content_type, content_string = contents.split(',') decoded = base64.b64decode(content_string) try: - if 'mod.pkl' in filename: - data = pickle.load(io.BytesIO(decoded)) - elif '.pkl' in filename: - data = joblib.load(io.BytesIO(decoded)) + if '.pkl' in filename: + try: + data = joblib.load(io.BytesIO(decoded)) + except : + data = pickle.load(io.BytesIO(decoded)) elif '.txt' in filename: data = decoded.decode('utf-8').strip() except Exception as e: @@ -58,8 +59,11 @@ def parse_contents_instance(contents, filename): try: if '.csv' in filename: data = decoded.decode('utf-8') + features_names, data = str(data).strip().split('\n')[:2] + features_names = str(features_names).strip().split(',') data = str(data).strip().split(',') - data = list(map(lambda i: tuple([i[0], np.float32(i[1])]), [i.split('=') for i in data])) + data = list(tuple([features_names[i], np.float32(data[i])]) for i in range(len(data))) + print(data) elif '.txt' in filename: data = decoded.decode('utf-8') data = str(data).strip().split(',') @@ -72,6 +76,9 @@ def parse_contents_instance(contents, filename): data = decoded.decode('utf-8').strip() data = json.loads(data) data = list(tuple(data.items())) + elif '.samples' in filename: + decoded = decoded.decode('utf-8').strip() + data = str(decoded).split('\n') except Exception as e: print(e) return html.Div([