diff --git a/callbacks.py b/callbacks.py
index d832b6eb999ccdd92111e317920562c6af9e2ae8..db26d680cc5aebd844896184f88ad7bc61a0c05d 100644
--- a/callbacks.py
+++ b/callbacks.py
@@ -4,6 +4,10 @@ from dash.exceptions import PreventUpdate
 
 from utils import parse_contents_graph, parse_contents_instance, parse_contents_data
 
+from pages.application.RandomForest.utils import xrf
+from pages.application.RandomForest.utils.xrf import *
+sys.modules['xrf'] = xrf
+
 
 def register_callbacks(page_home, page_course, page_application, app):
     page_list = ['home', 'course', 'application']
@@ -76,11 +80,12 @@ def register_callbacks(page_home, page_course, page_application, app):
         Input('solver_sat', 'value'),
         Input('expl_choice', 'value'),
         Input('cont_expl_choice', 'value'),
+        Input('choice_tree', 'value'),
         prevent_initial_call=True
     )
     def update_ml_type(value_ml_model, pretrained_model_contents, pretrained_model_filename, model_info,
-                       model_info_filename,
-                       instance_contents, instance_filename, enum, xtype, solver, expl_choice, cont_expl_choice):
+                       model_info_filename, instance_contents, instance_filename, enum, xtype, solver, expl_choice,
+                       cont_expl_choice, id_tree):
         ctx = dash.callback_context
         if ctx.triggered:
             ihm_id = ctx.triggered[0]['prop_id'].split('.')[0]
@@ -159,17 +164,27 @@ def register_callbacks(page_home, page_course, page_application, app):
             model_application.update_cont_expl(cont_expl_choice)
             return model_application.component.network, model_application.component.explanation
 
+        # In the case of RandomForest, the id of the tree to draw
+        elif ihm_id == 'choice_tree':
+            if model_application.ml_model is None or model_application.pretrained_model is None:
+                raise PreventUpdate
+            model_application.update_tree_to_plot(id_tree)
+            return model_application.component.network, model_application.component.explanation
+
     @app.callback(
         Output('explanation', 'hidden'),
         Output('interaction_graph', 'hidden'),
         Output('expl_choice', 'options'),
         Output('cont_expl_choice', 'options'),
+        Input('ml_model_choice', 'value'),
         Input('explanation', 'children'),
         Input('explanation_type', 'value'),
         prevent_initial_call=True
    )
-    def layout_buttons_navigate_expls(explanation, explanation_type):
-        if explanation is None or len(explanation_type) == 0:
+    def layout_buttons_navigate_expls(ml_type, explanation, explanation_type):
+        if ml_type != "DecisionTree":
+            return True, True, {}, {}
+        elif explanation is None or len(explanation_type) == 0:
             return True, True, {}, {}
         elif "AXp" not in explanation_type and "CXp" in explanation_type:
             return False, True, {}, {}
@@ -195,3 +210,14 @@ def register_callbacks(page_home, page_course, page_application, app):
             return False
         else:
             return True
+
+    @app.callback(
+        Output('choosing_tree', 'hidden'),
+        Input('graph', 'children'),
+        prevent_initial_call=True
+    )
+    def choose_tree_in_forest(graph):
+        if page_application.model.ml_model == "RandomForest" and graph is not None:
+            return False
+        else:
+            return True
diff --git a/data_retriever.json b/data_retriever.json
index 5a1a78bbb95fd8c141fa5c95ee4c998e6c7463ae..1607a1a1fcc43cca6c7025fa1ea37db6076e535f 100644
--- a/data_retriever.json
+++ b/data_retriever.json
@@ -20,8 +20,8 @@
   {
     "ml_type" : "RandomForest",
     "component" : "RandomForestComponent",
-    "solvers" : ["LIME", "ANCHOR", "SHAP"],
-    "xtypes" : {"S": "Smallest", "NS": "Not smallest"}
+    "solvers" : ["SAT"],
+    "xtypes" : {"M": "Minimal explanation"}
   }
 ]
 
diff --git a/pages/application/NaiveBayes/NaiveBayesComponent.py b/pages/application/NaiveBayes/NaiveBayesComponent.py
index 1d0d2efbbb1e6657e1fc40920e2371e557ffbd86..a34cb38acf75184d623287568da207a49bef5e59 100644
--- a/pages/application/NaiveBayes/NaiveBayesComponent.py
+++ b/pages/application/NaiveBayes/NaiveBayesComponent.py
@@ -14,7 +14,7 @@ class NaiveBayesComponent():
     def __init__(self, model, info=None, type_info=''):
 
         #Conversion model
-        p=subprocess.Popen(['perl','pages/application/NaiveBayes/utils/cnbc2xlc.pl', model],stdout=subprocess.PIPE)
+        p=subprocess.Popen(['perl','pages/application/NaiveBayes/utils_old/cnbc2xlc.pl', model],stdout=subprocess.PIPE)
         print(p.stdout.read())
 
         self.naive_bayes = model
@@ -27,7 +27,7 @@ class NaiveBayesComponent():
     def update_with_explicability(self, instance, enum, xtype, solver) :
 
         # Call explanation
-        p=subprocess.Popen(['perl','pages/application/NaiveBayes/utils/xpxlc.pl', self.naive_bayes, instance, self.map_file],stdout=subprocess.PIPE)
+        p=subprocess.Popen(['perl','pages/application/NaiveBayes/utils_old/xpxlc.pl', self.naive_bayes, instance, self.map_file],stdout=subprocess.PIPE)
         print(p.stdout.read())
 
         self.explanation = []
diff --git a/pages/application/RandomForest/RandomForestComponent.py b/pages/application/RandomForest/RandomForestComponent.py
index 97deaeec675ef07998d98e35e46b8e386b711df5..ca8071f45d41519f6db1fac163930122b335c692 100644
--- a/pages/application/RandomForest/RandomForestComponent.py
+++ b/pages/application/RandomForest/RandomForestComponent.py
@@ -1,27 +1,8 @@
 from dash import html
-from io import StringIO
-import pandas as pd
-
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.ensemble._voting import VotingClassifier
+import dash_interactive_graphviz
+from sklearn import tree
 
 from pages.application.RandomForest.utils import xrf
-from pages.application.RandomForest.utils.xrf.rndmforest import XRF, RF2001, VotingRF, Dataset
-from xgboost import XGBClassifier, XGBRFClassifier
-
-from pages.application.RandomForest.utils.anchor_wrap import anchor_call
-from pages.application.RandomForest.utils.lime_wrap import lime_call
-from pages.application.RandomForest.utils.shap_wrap import shap_call
-
-from pages.application.RandomForest.utils.xgbooster import XGBooster
-from pages.application.RandomForest.utils.xgbrf import XGBRandomForest
-
-
-#################################################################
-##################### Questions #################################
-##### Can upload sklearn model or need to translate to xrf ######
-# separate XGBRandomForest and XGBooster                        #
-# data format: there's a weird data file there                  #
-# not suitable, get_dump? lots of classes, need a class diagram #
+from pages.application.RandomForest.utils.xrf.xforest import XRF, Dataset
 
 
 class RandomForestComponent:
@@ -29,52 +10,58 @@ class RandomForestComponent:
 
     def __init__(self, model, info=None, type_info=''):
 
         # Conversion model
-        options = {}
-        if info is not None and '.csv' in type_info:
-            if isinstance(model, RandomForestClassifier) or isinstance(model, VotingClassifier) \
-                    or isinstance(model, xrf.rndmforest.RF2001):
-                self.data = Dataset(info)
-                self.data.mapping_features()
-                self.random_forest = XRF(model, self.data)
-            elif isinstance(model, XGBRFClassifier):
-                self.random_forest = XGBRandomForest(options, from_model=model)
-            elif isinstance(model, XGBClassifier):
-                self.random_forest = XGBooster(options, from_model=model)
-
-        self.network = html.Div([])
+        self.data = Dataset(info)
+        self.data.mapping_features()
+
+        if info is not None and 'csv' in type_info:
+            if isinstance(model, xrf.rndmforest.RF2001):
+                self.random_forest = XRF(model, self.data.feature_names, self.data.class_names)
+            else:
+                raise NotImplementedError('No explainer for this model')
+
+        self.tree_to_plot = 0
+        dot_source = tree.export_graphviz(self.random_forest.cls.estimators()[self.tree_to_plot])
+        self.network = html.Div([dash_interactive_graphviz.DashInteractiveGraphviz(
+            dot_source=dot_source, style={"width": "50%",
+                                          "height": "80%",
+                                          "background-color": "transparent"}
+        )])
         self.explanation = []
 
-    def update_with_explicability(self, instance, enum_feats=None, xtype=None, solver=None, ):
-
-        # Call explanation
-
-        if not enum_feats and self.data is not None:
-            enum_feats = len(self.data.nb_features) - 1
+    def update_with_explicability(self, instance, enum_feats=None, xtype=None, solver=None):
 
-        smallest = True if xtype == "S" else False
-        if isinstance(self.random_forest, XRF):
-            explanation_result = self.random_forest.explain(instance)
+        instance = instance[0]
+        if "=" in instance:
+            splitted_instance = [float(v.split('=')[1].strip()) for v in instance.split(',')]
         else:
-            explanation_result = self.random_forest.explain(instance, smallest, solver,
-                                                            use_lime=lime_call if solver == "LIME" else None,
-                                                            use_anchor=anchor_call if solver == "ANCHOR" else None,
-                                                            use_shap=shap_call if solver == "SHAP" else None,
-                                                            nof_feats=enum_feats)
-
+            splitted_instance = [float(v.strip()) for v in instance.split(',')]
+        # Call explanation
         self.explanation = []
+        self.explanation.append(html.H5("Instance"))
+        self.explanation.append(html.Hr())
+        self.explanation.append(
+            html.P(str([tuple((self.data.feature_names[i], str(splitted_instance[i]))) for i in
+                        range(len(splitted_instance))])))
+        self.explanation.append(html.Hr())
+
+        explanation_result = None
+        if isinstance(self.random_forest, XRF):
+            explanation_result = self.random_forest.explain(splitted_instance)
 
         list_explanations_path = []
-
-        self.network = html.Div([])
-
         # Creating a clean and nice text component
-        compt = 0
-        for sample_expl in explanation_result:
-            compt += 1
-            self.explanation.append(html.H4("Sample{0} : \n".format(compt)))
-            for k in sample_expl.keys():
-                self.explanation.append(html.H5(k))
-                self.explanation.append(html.Hr())
-                self.explanation.append(html.P(sample_expl[k]))
-                self.explanation.append(html.Hr())
+        for k in explanation_result.keys():
+            self.explanation.append(html.H5(k))
+            self.explanation.append(html.Hr())
+            self.explanation.append(html.P(explanation_result[k]))
+            self.explanation.append(html.Hr())
 
         return list_explanations_path, []
+
+    def update_plotted_tree(self, tree_to_plot):
+        self.tree_to_plot = tree_to_plot
+        dot_source = tree.export_graphviz(self.random_forest.cls.estimators()[self.tree_to_plot])
+        self.network = html.Div([dash_interactive_graphviz.DashInteractiveGraphviz(
+            dot_source=dot_source, style={"width": "50%",
+                                          "height": "80%",
+                                          "background-color": "transparent"}
+        )])
diff --git a/pages/application/RandomForest/utils/__init__.py b/pages/application/RandomForest/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pages/application/RandomForest/utils/anchor_wrap/__init__.py b/pages/application/RandomForest/utils/anchor_wrap/__init__.py
deleted file mode 100644
index 3d4bcf2c92b14c5fdda52e9d291aa6995e713bce..0000000000000000000000000000000000000000
--- a/pages/application/RandomForest/utils/anchor_wrap/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .anchor_wrap import *
diff --git a/pages/application/RandomForest/utils/anchor_wrap/anchor_wrap.py b/pages/application/RandomForest/utils/anchor_wrap/anchor_wrap.py
deleted file mode 100644
index f66ec0a8fd48dc0e25b2760968d20e16b392968f..0000000000000000000000000000000000000000
--- a/pages/application/RandomForest/utils/anchor_wrap/anchor_wrap.py
+++ /dev/null
@@ -1,127 +0,0 @@
-#!/usr/bin/env python
-#-*- coding:utf-8 -*-
-##
-## anchor_wrap.py (reuses parts of the code of SHAP)
-##
-##  Created on: Jan 6, 2019
-##      Author: Nina Narodytska, Alexey Ignatiev
-##      E-mail: narodytska@vmware.com, aignatiev@ciencias.ulisboa.pt
-##
-
-#
-#==============================================================================
-from __future__ import print_function
-import json
-import numpy as np
-import xgboost as xgb
-import math
-import resource
-from anchor import utils
-from anchor import anchor_tabular
-import sklearn
-import sklearn.ensemble
-
-
-#
-#==============================================================================
-def anchor_call(xgb, sample=None, nb_samples=5, feats='all',
-        nb_features_in_exp=5, threshold=0.95):
-
-    timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
-            resource.getrusage(resource.RUSAGE_SELF).ru_utime
-
-    # we need a way to say that features are categorical ?
-    # we do not have this informations.
-    explainer = anchor_tabular.AnchorTabularExplainer(
-        class_names=xgb.target_name,
-        feature_names=xgb.feature_names,
-        train_data=xgb.X,
-        categorical_names=xgb.categorical_names if xgb.use_categorical else {})
-    # if (len(xgb.X_test) != 0):
-    #     explainer.fit(xgb.X_train, xgb.Y_train, xgb.X_test, xgb.Y_test)
-    # else:
-    #     explainer.fit(xgb.X_train, xgb.Y_train, xgb.X_train, xgb.Y_train)
-    predict_fn_xgb = lambda x: xgb.model.predict(xgb.transform(x)).astype(int)
-
-    f2imap = {}
-    for i, f in enumerate(xgb.feature_names):
-        f2imap[f.strip()] = i
-
-    if (sample is not None):
-        try:
-            feat_sample = np.asarray(sample, dtype=np.float32)
-        except Exception as inst:
-            print("Cannot parse input sample:", sample, inst)
-            exit()
-        print("\n\n\nStarting Anchor explainer... \nConsidering a sample with features:", feat_sample)
-        if not (len(feat_sample) == len(xgb.X_train[0])):
-            print("Unmatched features are not supported: The number of features in a sample {} is not equal to the number of features in this benchmark {}".format(len(feat_sample), len(xgb.X_train[0])))
-            exit()
-
-        # compute boost predictions
-        feat_sample_exp = np.expand_dims(feat_sample, axis=0)
-        feat_sample_exp = xgb.transform(feat_sample_exp)
-        y_pred = xgb.model.predict(feat_sample_exp)[0]
-        y_pred_prob = xgb.model.predict_proba(feat_sample_exp)[0]
-        #hack testiing that we use the same onehot encoding
-        # test_feat_sample_exp = explainer.encoder.transform(feat_sample_exp)
-        test_y_pred = xgb.model.predict(feat_sample_exp)[0]
-        test_y_pred_prob = xgb.model.predict_proba(feat_sample_exp)[0]
-        assert(np.allclose(y_pred_prob, test_y_pred_prob))
-        print('Prediction: ', explainer.class_names[predict_fn_xgb(feat_sample.reshape(1, -1))[0]])
-        # exp = explainer.explain_instance(feat_sample, xgb.model.predict, threshold=threshold)
-        print('sample ====== ', feat_sample)
-        exp = explainer.explain_instance(feat_sample, predict_fn_xgb, threshold=threshold)
-        print('Anchor: %s' % (' AND '.join(exp.names())))
-        print('Precision: %.2f' % exp.precision())
-        print('Coverage: %.2f' % exp.coverage())
-
-        # explanation
-        expl = []
-
-        if (xgb.use_categorical):
-            for k, v in enumerate(exp.features()):
-                expl.append(v)
-                print("Clause ", k, end=": ")
-                print("feature (", v, ",", explainer.feature_names[v], end="); ")
-                print("value (", feat_sample[v], ",", explainer.categorical_names[v][int(feat_sample[v])] , ")")
-        else:
-            print("We only support datasets with categorical features for Anchor. Please pre-process your data.")
-            exit()
-
-        timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
-                resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer
-        print('  time: {0:.2f}'.format(timer))
-
-        return sorted(expl)
-
-    ###################################### TESTING
-    max_sample = nb_samples
-    y_pred_prob = xgb.model.predict_proba(xgb.X_test)
-    y_pred = xgb.model.predict(xgb.X_test)
-
-    nb_tests = min(max_sample,len(xgb.Y_test))
-    top_labels = 1
-    for sample in range(nb_tests):
-        np.set_printoptions(precision=2)
-        feat_sample = xgb.X_test[sample]
-        print("Considering a sample with features:", feat_sample)
-        if (False):
-            feat_sample[4] = 3000
-            y_pred_prob_sample = xgb.model.predict_proba([feat_sample])
-            print(y_pred_prob_sample)
-        print("\t Predictions:", y_pred_prob[sample])
-        exp = explainer.explain_instance(feat_sample,
-                predict_fn_xgb,
-                num_features= xgb.num_class,
-                top_labels = 1,
-                labels = list(range(xgb.num_class)))
-        for i in range(xgb.num_class):
-            if (i != y_pred[sample]):
-                continue
-            print("\t \t Explanations for the winner class", i, " (xgboost confidence = ", y_pred_prob[sample][i], ")")
-            print("\t \t Features in explanations: ", exp.as_list(label=i))
-        timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
-                resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer
-        print('  time: {0:.2f}'.format(timer))
-    return
diff --git a/pages/application/RandomForest/utils/data.py b/pages/application/RandomForest/utils/data.py
index d86e8ecbf6360b8959a17b3282cb8f7303b0dc51..322c5e5bc0d9c532f2aa2038f6384ff28d2efe46 100644
--- a/pages/application/RandomForest/utils/data.py
+++ b/pages/application/RandomForest/utils/data.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-#-*- coding:utf-8 -*-
+# -*- coding:utf-8 -*-
 ##
 ## data.py
 ##
@@ -9,24 +9,24 @@
 ##
 
 #
-#==============================================================================
+# ==============================================================================
 from __future__ import print_function
 import collections
 import itertools
-import pickle
+import os, pickle
 import six
 from six.moves import range
 import numpy as np
 
 
 #
-#==============================================================================
+# ==============================================================================
 class Data(object):
     """
         Class for representing data (transactions).
     """
 
-    def __init__(self, file=None, mapfile=None, separator=',', use_categorical = False):
+    def __init__(self, file=None, mapfile=None, separator=',', use_categorical=False):
         """
             Constructor and parser.
         """
@@ -58,14 +58,14 @@ class Data(object):
         # reading preamble
         self.names = lines[0].strip().split(separator)
         self.feats = [set([]) for n in self.names]
-        del(lines[0])
+        del (lines[0])
 
         # filling name to id mapping
         self.nm2id = {name: i for i, name in enumerate(self.names)}
 
         self.nonbin2bin = {}
         for name in self.nm2id:
-            spl = name.rsplit(':',1)
+            spl = name.rsplit(':', 1)
             if (spl[0] not in self.nonbin2bin):
                 self.nonbin2bin[spl[0]] = [name]
             else:
diff --git a/pages/application/RandomForest/utils/lime_wrap/__init__.py b/pages/application/RandomForest/utils/lime_wrap/__init__.py
deleted file mode 100644
index 32487979bf8f14f3ab733fa02b82973843e3078b..0000000000000000000000000000000000000000
--- a/pages/application/RandomForest/utils/lime_wrap/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .lime_wrap import *
diff --git a/pages/application/RandomForest/utils/lime_wrap/lime_wrap.py b/pages/application/RandomForest/utils/lime_wrap/lime_wrap.py
deleted file mode 100644
index 0146ed1f3ae756ea8a0d7bf0f8d6a79449b951fb..0000000000000000000000000000000000000000
--- a/pages/application/RandomForest/utils/lime_wrap/lime_wrap.py
+++ /dev/null
@@ -1,162 +0,0 @@
-#!/usr/bin/env python
-#-*- coding:utf-8 -*-
-##
-## lime_wrap.py (reuses parts of the code of SHAP)
-##
-##  Created on: Dec 12, 2018
-##      Author: Nina Narodytska, Alexey Ignatiev
-##      E-mail: narodytska@vmware.com, aignatiev@ciencias.ulisboa.pt
-##
-
-#
-#==============================================================================
-import json
-import numpy as np
-import xgboost as xgb
-import math
-import lime
-import lime.lime_tabular
-import resource
-
-
-#
-#==============================================================================
-def lime_call(xgb, sample = None, nb_samples = 5, feats='all',
-        nb_features_in_exp=5):
-
-    timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
-            resource.getrusage(resource.RUSAGE_SELF).ru_utime
-
-    # we need a way to say that features are categorical ?
-    # we do not have this informations.
-    predict_fn_xgb = lambda x: xgb.model.predict_proba(xgb.transform(x)).astype(float)
-    explainer = lime.lime_tabular.LimeTabularExplainer(
-        xgb.X_train,
-        feature_names=xgb.feature_names,
-        categorical_features=xgb.categorical_features if xgb.use_categorical else None,
-        class_names=xgb.target_name,
-        discretize_continuous=True,
-    )
-
-    f2imap = {}
-    for i, f in enumerate(xgb.feature_names):
-        f2imap[f.strip()] = i
-
-    if (sample is not None):
-        try:
-            feat_sample = np.asarray(sample, dtype=np.float32)
-        except:
-            print("Cannot parse input sample:", sample)
-            exit()
-        print("\n\n\nStarting LIME explainer... \nConsidering a sample with features:", feat_sample)
-        if not (len(feat_sample) == len(xgb.X_train[0])):
-            print("Unmatched features are not supported: The number of features in a sample {} is not equal to the number of features in this benchmark {}".format(len(feat_sample), len(xgb.X_train[0])))
-            exit()
-
-        # compute boost predictions
-        feat_sample_exp = np.expand_dims(feat_sample, axis=0)
-        feat_sample_exp = xgb.transform(feat_sample_exp)
-        y_pred = xgb.model.predict(feat_sample_exp)[0]
-        y_pred_prob = xgb.model.predict_proba(feat_sample_exp)[0]
-
-        exp = explainer.explain_instance(feat_sample,
-                predict_fn_xgb,
-                num_features = nb_features_in_exp,
-                top_labels = 1)#,
-                #labels = list(range(xgb.num_class)))
-
-        expl = []
-
-        # choose which features in the explanation to focus on
-        if feats in ('p', 'pos', '+'):
-            feats = 1
-        elif feats in ('n', 'neg', '-'):
-            feats = -1
-        else:
-            feats = 0
-
-        for i in range(xgb.num_class):
-            if (i != y_pred):
-                continue
-            print("\t \t Explanations for the winner class", i, " (xgboost confidence = ", y_pred_prob[i], ")")
-            print("\t \t Features in explanations: ", exp.as_list(label=i))
-
-            s_human_readable = ""
-            for k, v in enumerate(exp.as_list(label=i)):
-                if (feats == 1 and v[1] < 0) or (feats == -1 and v[1] >= 0):
-                    continue
-
-                if not (('<' in v[0]) or ('>' in v[0])):
-                    a = v[0].split('=')
-                    f = a[0].strip()
-                    l = a[1].strip()
-                    u = l
-
-                    if (xgb.use_categorical):
-                        fid = f2imap[f]
-                        fvid = int(a[1])
-                        #s_human_readable = s_human_readable + f + " = [" + str(xgb.categorical_names[fid][fvid]) +"," + str(v[1])+ "] "
-                        s_human_readable = s_human_readable + "\t \t id = {}, name = {}, score = {}\n".format(fid, f, str(v[1]))
-
-
-                else:
-                    a = v[0].split('<')
-
-                    if len(a) == 1:
-                        a = v[0].split('>')
-
-                    if len(a) == 2:
-                        f = a[0].strip()
-
-                        if '>' in v[0]:
-                            l, u = float(a[1].strip(' =')), None
-                        else:
-                            l, u = None, float(a[1].strip(' ='))
-                    else:
-                        l = float(a[0].strip())
-                        f = a[1].strip(' =')
-                        u = float(a[2].strip(' ='))
-
-                # expl.append(tuple([f2imap[f], l, u, v[1] >= 0]))
-                expl.append(f2imap[f])
-
-            if (xgb.use_categorical):
-                if (len(s_human_readable) > 0):
-                    print("\t \t Features in explanations (with provided categorical labels): \n", s_human_readable)
-
-        timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
-                resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer
-        print('  time: {0:.2f}'.format(timer))
-
-        return sorted(expl)
-
-    ###################################### TESTING
-    max_sample = nb_samples
-    y_pred_prob = xgb.model.predict_proba(xgb.X_test)
-    y_pred = xgb.model.predict(xgb.X_test)
-
-    nb_tests = min(max_sample,len(xgb.Y_test))
-    top_labels = 1
-    for sample in range(nb_tests):
-        np.set_printoptions(precision=2)
-        feat_sample = xgb.X_test[sample]
-        print("Considering a sample with features:", feat_sample)
-        if (False):
-            feat_sample[4] = 3000
-            y_pred_prob_sample = xgb.model.predict_proba([feat_sample])
-            print(y_pred_prob_sample)
-        print("\t Predictions:", y_pred_prob[sample])
-        exp = explainer.explain_instance(feat_sample,
-                predict_fn_xgb,
-                num_features= xgb.num_class,
-                top_labels = 1,
-                labels = list(range(xgb.num_class)))
-        for i in range(xgb.num_class):
-            if (i != y_pred[sample]):
-                continue
-            print("\t \t Explanations for the winner class", i, " (xgboost confidence = ", y_pred_prob[sample][i], ")")
-            print("\t \t Features in explanations: ", exp.as_list(label=i))
-        timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
-                resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer
-        print('  time: {0:.2f}'.format(timer))
-    return
diff --git a/pages/application/RandomForest/utils/options.py b/pages/application/RandomForest/utils/options.py
index 8ba5a91707abf42368d67f65044806199cf4c8d4..446eb71702f91876d0f8bef64fd90d048dd95282 100644
--- a/pages/application/RandomForest/utils/options.py
+++ b/pages/application/RandomForest/utils/options.py
@@ -33,19 +33,12 @@ class Options(object):
         self.train = False
         self.encode = 'none'
         self.explain = ''
-        self.useanchor = False
-        self.uselime = False
-        self.useshap = False
-        self.limefeats = 5
-        self.validate = False
+        self.xtype = 'abd'
         self.use_categorical = False
-        self.preprocess_categorical = False
-        self.preprocess_categorical_files = ""
 
         # training options
         self.accmin = 0.95
         self.n_estimators = 100
-        self.num_boost_round = 10
         self.maxdepth = 3
         self.testsplit = 0.2
         self.seed = 7
@@ -56,14 +49,9 @@ class Options(object):
         self.mapfile = None
         self.separator = ','
         self.smallest = False
-        self.solver = 'z3'
+        self.solver = 'g3'
         self.verb = 0
 
-        # random forest
-        self.rf = False
-        self.pi_check = False
-        self.repair = False
-        self.refine = False
 
         if command:
             self.parse(command)
@@ -77,36 +65,11 @@ class Options(object):
 
        try:
            opts, args = getopt.getopt(command[1:],
-                    'a:ce:d:hL:lm:Mn:o:pPr:Rqs:tvVwx:',
-                    ['accmin=',
-                     'encode=',
-                     'help',
-                     'map-file=',
-                     'use-anchor=',
-                     'lime-feats=',
-                     'use-lime=',
-                     'use-shap=',
-                     'use-categorical=',
-                     'preprocess-categorical=',
-                     'pfiles=',
-                     'maxdepth=',
-                     'minimum',
-                     'nbestims=',
-                     'output=',
-                     'prime-implicant',
-                     'rounds=',
-                     'random-forest',
-                     'repair',
-                     'refine',
-                     'seed=',
-                     'sep=',
-                     'solver=',
-                     'testsplit=',
-                     'train',
-                     'validate',
-                     'verbose',
-                     'explain='
-                    ])
+                    'e:hc:d:Mn:o:s:tvx:X:',
+                    ['encode=', 'help', 'use-categorical=',
+                     'maxdepth=', 'minimum', 'nbestims=',
+                     'output=', 'seed=', 'solver=', 'testsplit=',
+                     'train', 'verbose', 'explain=', 'xtype=' ])
        except getopt.GetoptError as err:
            sys.stderr.write(str(err).capitalize())
            self.usage()
@@ -124,30 +87,14 @@ class Options(object):
            elif opt in ('-h', '--help'):
                self.usage()
                sys.exit(0)
-            elif opt in ('-l', '--use-lime'):
-                self.uselime = True
-            elif opt in ('-L', '--lime-feats'):
-                self.limefeats = 0 if arg == 'all' else int(arg)
-            elif opt in ('-m', '--map-file'):
-                self.mapfile = str(arg)
+
            elif opt in ('-M', '--minimum'):
                self.smallest = True
            elif opt in ('-n', '--nbestims'):
                self.n_estimators = int(arg)
            elif opt in ('-o', '--output'):
                self.output = str(arg)
-            elif opt in ('-q', '--use-anchor'):
-                self.useanchor = True
-            elif opt in ('-P', '--prime-implicant'):
-                self.pi_check = True
-            elif opt in ('-r', '--rounds'):
-                self.num_boost_round = int(arg)
-            elif opt in ('-R', '--random-forest'):
-                self.rf = True
-            elif opt == '--repair':
-                self.repair = True
-            elif opt == '--refine':
-                self.refine = True
+
            elif opt == '--seed':
                self.seed = int(arg)
            elif opt == '--sep':
@@ -158,18 +105,12 @@ class Options(object):
            elif opt in ('--testsplit'):
                self.testsplit = float(arg)
            elif opt in ('-t', '--train'):
                self.train = True
-            elif opt in ('-V', '--validate'):
-                self.validate = True
            elif opt in ('-v', '--verbose'):
                self.verb += 1
-            elif opt in ('-w', '--use-shap'):
-                self.useshap = True
            elif opt in ('-x', '--explain'):
                self.explain = str(arg)
-            elif opt in ('-p', '--preprocess-categorical'):
-                self.preprocess_categorical = True
-            elif opt in ('--pfiles'):
-                self.preprocess_categorical_files = str(arg) #train_file, test_file(or empty, resulting file
+            elif opt in ('-X', '--xtype'):
+                self.xtype = str(arg)
            else:
                assert False, 'Unhandled option: {0} {1}'.format(opt, arg)
@@ -185,40 +126,29 @@ class Options(object):
 
        print('Usage: ' + os.path.basename(self.command[0]) + ' [options] input-file')
        print('Options:')
-        print('        -a, --accmin=<float>       Minimal accuracy')
-        print('                                   Available values: [0.0, 1.0] (default = 0.95)')
-        print('        -c, --use-categorical      Treat categorical features as categorical (with categorical features info if available)')
+        #print('        -a, --accmin=<float>       Minimal accuracy')
+        #print('                                   Available values: [0.0, 1.0] (default = 0.95)')
+        #print('        -c, --use-categorical      Treat categorical features as categorical (with categorical features info if available)')
        print('        -d, --maxdepth=<int>       Maximal depth of a tree')
        print('                                   Available values: [1, INT_MAX] (default = 3)')
-        print('        -e, --encode=<smt>         Encode a previously trained model')
-        print('                                   Available values: smt, smtbool, none (default = none)')
+        #print('        -e, --encode=<smt>         Encode a previously trained model')
+        #print('                                   Available values: sat, maxsat, none (default = none)')
        print('        -h, --help                 Show this message')
-        print('        -l, --use-lime             Use LIME to compute an explanation')
-        print('        -L, --lime-feats           Instruct LIME to compute an explanation of this size')
-        print('                                   Available values: [1, INT_MAX], all (default = 5)')
-        print('        -m, --map-file=<string>    Path to a file containing a mapping to original feature values. (default: none)')
-        print('        -M, --minimum              Compute a smallest size explanation (instead of a subset-minimal one)')
-        print('        -n, --nbestims=<int>       Number of trees per class')
+
+        #print('        -m, --map-file=<string>    Path to a file containing a mapping to original feature values. (default: none)')
+        #print('        -M, --minimum              Compute a smallest size explanation (instead of a subset-minimal one)')
+        print('        -n, --nbestims=<int>       Number of trees in the ensemble')
        print('                                   Available values: [1, INT_MAX] (default = 100)')
        print('        -o, --output=<string>      Directory where output files will be stored (default: \'temp\')')
-        print('        -p,                        Preprocess categorical data')
-        print('        --pfiles                   Filenames to use when preprocessing')
-        print('        --prime-implicant          Check explanation if it is a prime implicant')
-        print('        -q, --use-anchor           Use Anchor to compute an explanation')
-        print('        -r, --rounds=<int>         Number of training rounds')
-        print('        -R, --random-forest        Use Random Forest model')
-        print('        --refine                   try to refine the (optimistic) local explanation')
-        print('        --repair                   try to repair the (pessimistic) local explanation')
-        print('                                   Available values: [1, INT_MAX] (default = 10)')
+
        print('        --seed=<int>               Seed for random splitting')
        print('                                   Available values: [1, INT_MAX] (default = 7)')
        print('        --sep=<string>             Field separator used in input file (default = \',\')')
-        print('        -s, --solver=<string>      An SMT reasoner to use')
-        print('                                   Available values: cvc4, mathsat, yices, z3 (default = z3)')
+        print('        -s, --solver=<string>      A SAT oracle to use')
+        print('                                   Available values: glucose3, minisat (default = g3)')
        print('        -t, --train                Train a model of a given dataset')
        print('        --testsplit=<float>        Training and test sets split')
        print('                                   Available values: [0.0, 1.0] (default = 0.2)')
        print('        -v, --verbose              Increase verbosity level')
-        print('        -V, --validate             Validate explanation (show that it is too optimistic)')
-        print('        -w, --use-shap             Use SHAP to compute an explanation')
        print('        -x, --explain=<string>     Explain a decision for a given comma-separated sample (default: none)')
+        print('        -X, --xtype=<string>       Type of explanation to compute: abductive or contrastive')
diff --git a/pages/application/RandomForest/utils/shap_wrap/__init__.py b/pages/application/RandomForest/utils/shap_wrap/__init__.py
deleted file mode 100644
index 845cbd2bb73191edd950fc9eb09a641ab8cf2088..0000000000000000000000000000000000000000
--- a/pages/application/RandomForest/utils/shap_wrap/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .shap_wrap import *
diff --git a/pages/application/RandomForest/utils/shap_wrap/shap_wrap.py b/pages/application/RandomForest/utils/shap_wrap/shap_wrap.py
deleted file mode 100644
index 4eadc21f76cc03f78c656fcf7d7cde1ed4ea2a3b..0000000000000000000000000000000000000000
--- a/pages/application/RandomForest/utils/shap_wrap/shap_wrap.py
+++ /dev/null
@@ -1,111 +0,0 @@
-#!/usr/bin/env python
-#-*- coding:utf-8 -*-
-##
-## shap_wrap.py (reuses parts of the code of SHAP)
-##
-##  Created on: Sep 25, 2019
-##      Author: Nina Narodytska
-##      E-mail: narodytska@vmware.com
-##
-
-#
-#==============================================================================
-import json
-import numpy as np
-import xgboost as xgb
-import math
-import shap
-import resource
-
-
-#
-#==============================================================================
-def shap_call(xgb, sample = None, feats='all', nb_features_in_exp = None):
-    timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
-            resource.getrusage(resource.RUSAGE_SELF).ru_utime
-
-    f2imap = {}
-    for i, f in enumerate(xgb.feature_names):
-        f2imap[f.strip()] = i
-
-    if (sample is not None):
-        if (nb_features_in_exp is None):
-            nb_features_in_exp = len(sample)
-
-        try:
-            feat_sample = np.asarray(sample, dtype=np.float32)
-        except:
-            print("Cannot parse input sample:", sample)
-            exit()
-        print("\n\nStarting SHAP explainer... \nConsidering a sample with features:", feat_sample)
-        if not (len(feat_sample) == len(xgb.X_train[0])):
-            print("Unmatched features are not supported: The number of features in a sample {} is not equal to the number of features in this benchmark {}".format(len(feat_sample), len(xgb.X_train[0])))
-            exit()
-
-        # compute boost predictions
-        feat_sample_exp = np.expand_dims(feat_sample, axis=0)
-        feat_sample_exp = xgb.transform(feat_sample_exp)
-        y_pred = xgb.model.predict(feat_sample_exp)[0]
-        y_pred_prob = xgb.model.predict_proba(feat_sample_exp)[0]
-
-        # No need to pass dataset as it is recored in model
-        # https://shap.readthedocs.io/en/latest/
-
-        explainer = shap.TreeExplainer(xgb.model)
-        shap_values = explainer.shap_values(feat_sample_exp)
-
-        shap_values_sample = shap_values[-1]
-        transformed_sample = feat_sample_exp[-1]
-
-
-
-
-        # we need to sum values per feature
-        # https://github.com/slundberg/shap/issues/397
-        sum_values = []
-        if (xgb.use_categorical):
-            p = 0
-            for f in xgb.categorical_features:
-                nb_values = len(xgb.categorical_names[f])
-                sum_v = 0
-                for i in range(nb_values):
-                    sum_v = sum_v + shap_values_sample[p+i]
-                p = p + nb_values
-                sum_values.append(sum_v)
-        else:
-            sum_values = shap_values_sample
-        expl = []
-
-        # choose which features in the explanation to focus on
-        if feats in ('p', 'pos', '+'):
-            feats = 1
-        elif feats in ('n', 'neg', '-'):
-            feats = -1
-        else:
-            feats = 0
-
-        print("\t \t Explanations for the winner class", y_pred, " (xgboost confidence = ", y_pred_prob[int(y_pred)], ")")
-        print("base_value = {}, predicted_value = {}".format(explainer.expected_value, np.sum(sum_values) + explainer.expected_value))
-
-        abs_sum_values = np.abs(sum_values)
-        sorted_by_abs_sum_values =np.argsort(-abs_sum_values)
-
-        for k1, v1 in enumerate(sorted_by_abs_sum_values):
-
-            k = v1
-            v = sum_values[v1]
-
-            if (feats == 1 and v < 0) or (feats == -1 and v >= 0):
-                continue
-
-            expl.append(f2imap[xgb.feature_names[k]])
-            print("id = {}, name = {}, score = {}".format(f2imap[xgb.feature_names[k]], xgb.feature_names[k], v))
-
-            if (len(expl) == nb_features_in_exp):
-                break
-
-        timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
-                resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer
-        print('  time: {0:.2f}'.format(timer))
-
-        return sorted(expl[:nb_features_in_exp])
diff --git a/pages/application/RandomForest/utils/xgbooster/__init__.py b/pages/application/RandomForest/utils/xgbooster/__init__.py
deleted file mode 100644
index 88bdad8d34e0f8a335ca167595de25c965e8b021..0000000000000000000000000000000000000000
--- a/pages/application/RandomForest/utils/xgbooster/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from .encode import *
-from .tree import *
-from .xgbooster import *
-from .preprocess import *
\ No newline at end of file
diff --git a/pages/application/RandomForest/utils/xgbooster/encode.py b/pages/application/RandomForest/utils/xgbooster/encode.py
deleted file mode 100644
index bf98a6b94477d7cad42b84716783ed364ef83ae7..0000000000000000000000000000000000000000
--- a/pages/application/RandomForest/utils/xgbooster/encode.py
+++ /dev/null
@@ -1,351 +0,0 @@
-#!/usr/bin/env python
-#-*- coding:utf-8 -*-
-##
-## encode.py
-##
-##  Created on: Dec 7, 2018
-##      Author: Alexey Ignatiev
-##      E-mail: aignatiev@ciencias.ulisboa.pt
-##
-
-#
-#==============================================================================
-from __future__ import print_function
-import collections
-from pysat.formula import IDPool
-from pysmt.smtlib.parser import SmtLibParser
-from pysmt.shortcuts import And, BOOL, Iff, Implies, Not, Or, Symbol, get_model
-from pysmt.shortcuts import Equals, ExactlyOne, LT, Plus, REAL, Real, write_smtlib
-from .tree import TreeEnsemble, scores_tree
-import six
-from six.moves import range
-
-try:  # for Python2
-    from cStringIO import StringIO
-except ImportError:  # for Python3
-    from io import StringIO
-
-
-#
-#==============================================================================
-class SMTEncoder(object):
-    """
-        Encoder of XGBoost tree ensembles into SMT.
-    """
-
-    def __init__(self, model, feats, nof_classes, xgb, from_file=None):
-        """
-            Constructor.
-        """
-
-        self.model = model
-        self.feats = {f: i for i, f in enumerate(feats)}
-        self.nofcl = nof_classes
-        self.idmgr = IDPool()
-
-        # xgbooster will also be needed
-        self.xgb = xgb
-
-        # for interval-based encoding
-        self.intvs, self.imaps, self.ivars = None, None, None
-
-        if from_file:
-            self.load_from(from_file)
-
-    def traverse(self, tree, tvar, prefix=[]):
-        """
-            Traverse a tree and encode each node.
-        """
-
-        if tree.children:
-            pos, neg = self.encode_node(tree)
-
-            self.traverse(tree.children[0], tvar, prefix + [pos])
-            self.traverse(tree.children[1], tvar, prefix + [neg])
-        else:  # leaf node
-            if prefix:
-                self.enc.append(Implies(And(prefix), Equals(tvar, Real(tree.values))))
-            else:
-                self.enc.append(Equals(tvar, Real(tree.values)))
-
-    def encode_node(self, node):
-        """
-            Encode a node of a tree.
- """ - - if '_' not in node.name: - # continuous features => expecting an upper bound - # feature and its upper bound (value) - f, v = node.name, node.threshold - - existing = True if tuple([f, v]) in self.idmgr.obj2id else False - vid = self.idmgr.id(tuple([f, v])) - bv = Symbol('bvar{0}'.format(vid), typename=BOOL) - - if not existing: - if self.intvs: - d = self.imaps[f][v] + 1 - pos, neg = self.ivars[f][:d], self.ivars[f][d:] - self.enc.append(Iff(bv, Or(pos))) - self.enc.append(Iff(Not(bv), Or(neg))) - else: - fvar, fval = Symbol(f, typename=REAL), Real(v) - self.enc.append(Iff(bv, LT(fvar, fval))) - - return bv, Not(bv) - else: - # all features are expected to be categorical and - # encoded with one-hot encoding into Booleans - # each node is expected to be of the form: f_i < 0.5 - bv = Symbol(node.name, typename=BOOL) - - # left branch is positive, i.e. bv is true - # right branch is negative, i.e. bv is false - return Not(bv), bv - - def compute_intervals(self): - """ - Traverse all trees in the ensemble and extract intervals for each - feature. - - At this point, the method only works for numerical datasets! - """ - - def traverse_intervals(tree): - """ - Auxiliary function. Recursive tree traversal. - """ - - if tree.children: - f = tree.name - v = tree.threshold - self.intvs[f].add(v) - - traverse_intervals(tree.children[0]) - traverse_intervals(tree.children[1]) - - # initializing the intervals - self.intvs = {'f{0}'.format(i): set([]) for i in range(len(self.feats))} - - for tree in self.ensemble.trees: - traverse_intervals(tree) - - # OK, we got all intervals; let's sort the values - self.intvs = {f: sorted(self.intvs[f]) + ['+'] for f in six.iterkeys(self.intvs)} - - self.imaps, self.ivars = {}, {} - for feat, intvs in six.iteritems(self.intvs): - self.imaps[feat] = {} - self.ivars[feat] = [] - for i, ub in enumerate(intvs): - self.imaps[feat][ub] = i - - ivar = Symbol(name='{0}_intv{1}'.format(feat, i), typename=BOOL) - self.ivars[feat].append(ivar) - - def encode(self): - """ - Do the job. 
- """ - - self.enc = [] - - # getting a tree ensemble - self.ensemble = TreeEnsemble(self.model, - self.xgb.extended_feature_names_as_array_strings, - nb_classes=self.nofcl) - - # introducing class score variables - csum = [] - for j in range(self.nofcl): - cvar = Symbol('class{0}_score'.format(j), typename=REAL) - csum.append(tuple([cvar, []])) - - # if targeting interval-based encoding, - # traverse all trees and extract all possible intervals - # for each feature - # if self.optns.encode == 'smtbool': - self.compute_intervals() - - # traversing and encoding each tree - for i, tree in enumerate(self.ensemble.trees): - # getting class id - clid = i % self.nofcl - - # encoding the tree - tvar = Symbol('tr{0}_score'.format(i + 1), typename=REAL) - self.traverse(tree, tvar, prefix=[]) - - # this tree contributes to class with clid - csum[clid][1].append(tvar) - - # encoding the sums - for pair in csum: - cvar, tvars = pair - self.enc.append(Equals(cvar, Plus(tvars))) - - # enforce exactly one of the feature values to be chosen - # (for categorical features) - categories = collections.defaultdict(lambda: []) - for f in self.xgb.extended_feature_names_as_array_strings: - if '_' in f: - categories[f.split('_')[0]].append(Symbol(name=f, typename=BOOL)) - for c, feats in six.iteritems(categories): - self.enc.append(ExactlyOne(feats)) - - # number of assertions - nof_asserts = len(self.enc) - - # making conjunction - self.enc = And(self.enc) - - # number of variables - nof_vars = len(self.enc.get_free_variables()) - - return self.enc, self.intvs, self.imaps, self.ivars - - def test_sample(self, sample, solver=None): - """ - Check whether or not the encoding "predicts" the same class - as the classifier given an input sample. - """ - - # first, compute the scores for all classes as would be - # predicted by the classifier - - # score arrays computed for each class - csum = [[] for c in range(self.nofcl)] - - sample_internal = list(self.xgb.transform(sample)[0]) - - # traversing all trees - for i, tree in enumerate(self.ensemble.trees): - # getting class id - clid = i % self.nofcl - - # a score computed by the current tree - score = scores_tree(tree, sample_internal) - - # this tree contributes to class with clid - csum[clid].append(score) - - # final scores for each class - cscores = [sum(scores) for scores in csum] - - # second, get the scores computed with the use of the encoding - - # asserting the sample - hypos = [] - - if not self.intvs: - for i, fval in enumerate(sample_internal): - feat, vid = self.xgb.transform_inverse_by_index(i) - fid = self.feats[feat] - - if vid == None: - fvar = Symbol('f{0}'.format(fid), typename=REAL) - hypos.append(Equals(fvar, Real(float(fval)))) - else: - fvar = Symbol('f{0}_{1}'.format(fid, vid), typename=BOOL) - if int(fval) == 1: - hypos.append(fvar) - else: - hypos.append(Not(fvar)) - else: - for i, fval in enumerate(sample_internal): - feat, _ = self.xgb.transform_inverse_by_index(i) - feat = 'f{0}'.format(self.feats[feat]) - - # determining the right interval and the corresponding variable - for ub, fvar in zip(self.intvs[feat], self.ivars[feat]): - if ub == '+' or fval < ub: - hypos.append(fvar) - break - else: - assert 0, 'No proper interval found for {0}'.format(feat) - - # now, getting the model - escores = [] - model = get_model(And(self.enc, *hypos), solver_name=solver) - for c in range(self.nofcl): - v = Symbol('class{0}_score'.format(c), typename=REAL) - escores.append(float(model.get_py_value(v))) - - assert all(map(lambda c, e: abs(c - e) <= 
0.001, cscores, escores)), \ - 'wrong prediction: {0} vs {1}'.format(cscores, escores) - - def save_to(self, outfile): - """ - Save the encoding into a file with a given name. - """ - - if outfile.endswith('.txt'): - outfile = outfile[:-3] + 'smt2' - - write_smtlib(self.enc, outfile) - - # appending additional information - with open(outfile, 'r') as fp: - contents = fp.readlines() - - # comments - comments = ['; features: {0}\n'.format(', '.join(self.feats)), - '; classes: {0}\n'.format(self.nofcl)] - - if self.intvs: - for f in self.xgb.extended_feature_names_as_array_strings: - c = '; i {0}: '.format(f) - c += ', '.join(['{0}<->{1}'.format(u, v) for u, v in zip(self.intvs[f], self.ivars[f])]) - comments.append(c + '\n') - - contents = comments + contents - with open(outfile, 'w') as fp: - fp.writelines(contents) - - def load_from(self, infile): - """ - Loads the encoding from an input file. - """ - - with open(infile, 'r') as fp: - file_content = fp.readlines() - - # empty intervals for the standard encoding - self.intvs, self.imaps, self.ivars = {}, {}, {} - - for line in file_content: - if line[0] != ';': - break - elif line.startswith('; i '): - f, arr = line[4:].strip().split(': ', 1) - f = f.replace('-', '_') - self.intvs[f], self.imaps[f], self.ivars[f] = [], {}, [] - - for i, pair in enumerate(arr.split(', ')): - ub, symb = pair.split('<->') - - if ub[0] != '+': - ub = float(ub) - symb = Symbol(symb, typename=BOOL) - - self.intvs[f].append(ub) - self.ivars[f].append(symb) - self.imaps[f][ub] = i - - elif line.startswith('; features:'): - self.feats = line[11:].strip().split(', ') - elif line.startswith('; classes:'): - self.nofcl = int(line[10:].strip()) - - parser = SmtLibParser() - script = parser.get_script(StringIO(''.join(file_content))) - - self.enc = script.get_last_formula() - - def access(self): - """ - Get access to the encoding, features names, and the number of - classes. - """ - - return self.enc, self.intvs, self.imaps, self.ivars, self.feats, self.nofcl diff --git a/pages/application/RandomForest/utils/xgbooster/explain.py b/pages/application/RandomForest/utils/xgbooster/explain.py deleted file mode 100644 index d522b38fe71c362a8cabd51390a8d00610c826f2..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbooster/explain.py +++ /dev/null @@ -1,292 +0,0 @@ -#!/usr/bin/env python -# -*- coding:utf-8 -*- -## -## explain.py -## -## Created on: Dec 14, 2018 -## Author: Alexey Ignatiev -## E-mail: aignatiev@ciencias.ulisboa.pt -## - -# -# ============================================================================== -from __future__ import print_function -import numpy as np -import os -from pysat.examples.hitman import Hitman -from pysat.formula import IDPool -from pysmt.shortcuts import Solver -from pysmt.shortcuts import And, BOOL, Implies, Not, Or, Symbol -from pysmt.shortcuts import Equals, GT, Int, Real, REAL -import resource -from six.moves import range -import sys - - - -# -#============================================================================== -class SMTExplainer(object): - """ - An SMT-inspired minimal explanation extractor for XGBoost models. - """ - - def __init__(self, formula, intvs, imaps, ivars, feats, nof_classes, - solver, xgb): - """ - Constructor. 
- """ - - self.feats = feats - self.intvs = intvs - self.imaps = imaps - self.ivars = ivars - self.nofcl = nof_classes - self.idmgr = IDPool() - - # saving XGBooster - self.xgb = xgb - - self.oracle = Solver(name=solver) - - self.inps = [] # input (feature value) variables - for f in self.xgb.extended_feature_names_as_array_strings: - if '_' not in f: - self.inps.append(Symbol(f, typename=REAL)) - else: - self.inps.append(Symbol(f, typename=BOOL)) - - self.outs = [] # output (class score) variables - for c in range(self.nofcl): - self.outs.append(Symbol('class{0}_score'.format(c), typename=REAL)) - - # theory - self.oracle.add_assertion(formula) - - # current selector - self.selv = None - - def prepare(self, sample): - """ - Prepare the oracle for computing an explanation. - """ - - if self.selv: - # disable the previous assumption if any - self.oracle.add_assertion(Not(self.selv)) - - # creating a fresh selector for a new sample - sname = ','.join([str(v).strip() for v in sample]) - - # the samples should not repeat; otherwise, they will be - # inconsistent with the previously introduced selectors - assert sname not in self.idmgr.obj2id, 'this sample has been considered before (sample {0})'.format(self.idmgr.id(sname)) - self.selv = Symbol('sample{0}_selv'.format(self.idmgr.id(sname)), typename=BOOL) - - self.rhypos = [] # relaxed hypotheses - - # transformed sample - self.sample = list(self.xgb.transform(sample)[0]) - - self.sel2fid = {} # selectors to original feature ids - self.sel2vid = {} # selectors to categorical feature ids - - # preparing the selectors - for i, (inp, val) in enumerate(zip(self.inps, self.sample), 1): - feat = inp.symbol_name().split('_')[0] - selv = Symbol('selv_{0}'.format(feat)) - val = float(val) - - self.rhypos.append(selv) - if selv not in self.sel2fid: - self.sel2fid[selv] = int(feat[1:]) - self.sel2vid[selv] = [i - 1] - else: - self.sel2vid[selv].append(i - 1) - - # adding relaxed hypotheses to the oracle - if not self.intvs: - for inp, val, sel in zip(self.inps, self.sample, self.rhypos): - if '_' not in inp.symbol_name(): - hypo = Implies(self.selv, Implies(sel, Equals(inp, Real(float(val))))) - else: - hypo = Implies(self.selv, Implies(sel, inp if val else Not(inp))) - - self.oracle.add_assertion(hypo) - else: - for inp, val, sel in zip(self.inps, self.sample, self.rhypos): - inp = inp.symbol_name() - # determining the right interval and the corresponding variable - for ub, fvar in zip(self.intvs[inp], self.ivars[inp]): - if ub == '+' or val < ub: - hypo = Implies(self.selv, Implies(sel, fvar)) - break - - self.oracle.add_assertion(hypo) - - # in case of categorical data, there are selector duplicates - # and we need to remove them - self.rhypos = sorted(set(self.rhypos), key=lambda x: int(x.symbol_name()[6:])) - - # propagating the true observation - if self.oracle.solve([self.selv] + self.rhypos): - model = self.oracle.get_model() - else: - assert 0, 'Formula is unsatisfiable under given assumptions' - - # choosing the maximum - outvals = [float(model.get_py_value(o)) for o in self.outs] - maxoval = max(zip(outvals, range(len(outvals)))) - - # correct class id (corresponds to the maximum computed) - self.out_id = maxoval[1] - self.output = self.xgb.target_name[self.out_id] - - # forcing a misclassification, i.e. 
a wrong observation - disj = [] - for i in range(len(self.outs)): - if i != self.out_id: - disj.append(GT(self.outs[i], self.outs[self.out_id])) - self.oracle.add_assertion(Implies(self.selv, Or(disj))) - - inpvals = self.xgb.readable_sample(sample) - - self.preamble = [] - for f, v in zip(self.xgb.feature_names, inpvals): - if f not in str(v): - self.preamble.append('{0} = {1}'.format(f, v)) - else: - self.preamble.append(v) - - return "IF {0} THEN {1}".format(' AND '.join(self.preamble), self.output) - - def explain(self, sample, smallest, expl_ext=None, prefer_ext=False): - """ - Hypotheses minimization. - """ - - # adapt the solver to deal with the current sample - explanation_dic = {} - explanation_dic["Instance :"] = self.prepare(sample) - - # saving external explanation to be minimized further - if expl_ext == None or prefer_ext: - self.to_consider = [True for h in self.rhypos] - else: - eexpl = set(expl_ext) - self.to_consider = [True if i in eexpl else False for i, h in enumerate(self.rhypos)] - - # if satisfiable, then the observation is not implied by the hypotheses - if self.oracle.solve([self.selv] + [h for h, c in zip(self.rhypos, self.to_consider) if c]): - explanation_dic["no implication"] = self.oracle.get_model() - else : - if not smallest: - self.compute_minimal(prefer_ext=prefer_ext) - else: - self.compute_smallest() - - explanation = sorted([self.sel2fid[h] for h in self.rhypos]) - self.preamble = [self.preamble[i] for i in explanation] - explanation_dic["explanation: "] = "IF {0} THEN {1}".format(' AND '.join(self.preamble), self.xgb.target_name[self.out_id]) - explanation_dic["Hyphothesis left"] = str(len(self.rhypos)) - - return explanation_dic - - def compute_minimal(self, prefer_ext=False): - """ - Compute any subset-minimal explanation. - """ - - i = 0 - - if not prefer_ext: - # here, we want to reduce external explanation - - # filtering out unnecessary features if external explanation is given - self.rhypos = [h for h, c in zip(self.rhypos, self.to_consider) if c] - else: - # here, we want to compute an explanation that is preferred - # to be similar to the given external one - # for that, we try to postpone removing features that are - # in the external explanation provided - - rhypos = [h for h, c in zip(self.rhypos, self.to_consider) if not c] - rhypos += [h for h, c in zip(self.rhypos, self.to_consider) if c] - self.rhypos = rhypos - - # simple deletion-based linear search - while i < len(self.rhypos): - to_test = self.rhypos[:i] + self.rhypos[(i + 1):] - - if self.oracle.solve([self.selv] + to_test): - i += 1 - else: - self.rhypos = to_test - - def compute_smallest(self): - """ - Compute a cardinality-minimal explanation. 
- """ - - # result - rhypos = [] - - with Hitman(bootstrap_with=[[i for i in range(len(self.rhypos)) if self.to_consider[i]]]) as hitman: - # computing unit-size MCSes - for i, hypo in enumerate(self.rhypos): - if self.to_consider[i] == False: - continue - - if self.oracle.solve([self.selv] + self.rhypos[:i] + self.rhypos[(i + 1):]): - hitman.hit([i]) - - # main loop - iters = 0 - while True: - hset = hitman.get() - iters += 1 - - if self.oracle.solve([self.selv] + [self.rhypos[i] for i in hset]): - to_hit = [] - satisfied, unsatisfied = [], [] - - removed = list(set(range(len(self.rhypos))).difference(set(hset))) - - model = self.oracle.get_model() - for h in removed: - i = self.sel2fid[self.rhypos[h]] - if '_' not in self.inps[i].symbol_name(): - # feature variable and its expected value - var, exp = self.inps[i], self.sample[i] - - # true value - true_val = float(model.get_py_value(var)) - - if not exp - 0.001 <= true_val <= exp + 0.001: - unsatisfied.append(h) - else: - hset.append(h) - else: - for vid in self.sel2vid[self.rhypos[h]]: - var, exp = self.inps[vid], int(self.sample[vid]) - - # true value - true_val = int(model.get_py_value(var)) - - if exp != true_val: - unsatisfied.append(h) - break - else: - hset.append(h) - - # computing an MCS (expensive) - for h in unsatisfied: - if self.oracle.solve([self.selv] + [self.rhypos[i] for i in hset] + [self.rhypos[h]]): - hset.append(h) - else: - to_hit.append(h) - - hitman.hit(to_hit) - else: - self.rhypos = [self.rhypos[i] for i in hset] - break \ No newline at end of file diff --git a/pages/application/RandomForest/utils/xgbooster/preprocess.py b/pages/application/RandomForest/utils/xgbooster/preprocess.py deleted file mode 100644 index d66cc338fe6b232fff0155aea5055e7fe9241dac..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbooster/preprocess.py +++ /dev/null @@ -1,128 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## preprocess.py -## -## Created on: Jan 10, 2019 -## Author: Nina Narodytska -## E-mail: narodytska@vmware.com -## - -# -#============================================================================== -import json -import numpy as np -import xgboost as xgb -import math -import pandas as pd -import numpy as np -import sklearn -import pickle - - -# -#============================================================================== -def preprocess_dataset(raw_data_path, files, use_categ=True): - print("preprocess dataset from ", raw_data_path) - files = files.split(",") - - data_file = files[0] - dataset_name = files[1] - - categorical_features = [] - if use_categ: - try: - catcols = pd.read_csv(raw_data_path + data_file + ".catcol", header = None) - categorical_features = np.concatenate(catcols.values).tolist() - - print(categorical_features) - except Exception as e: - print("Please provide info about categorical columns/original datasets or omit option -p", e) - exit() - - try: - data_raw = pd.read_csv(raw_data_path + data_file, sep=',', na_values= ['']) - #catcols = pd.read_csv(raw_data_path + data_file + ".catcol", header = None) - #categorical_features = np.concatenate(catcols.values).tolist() - - - for i in range(len(data_raw.values[0])): - if i in categorical_features: - data_raw.fillna('',inplace=True) - else: - data_raw.fillna(0,inplace=True) - dataset_all = data_raw - dataset = dataset_all.values.copy() - - print(categorical_features) - except Exception as e: - print("Please provide info about categorical columns/original datasets or omit option -p", e) - exit() - - # 
move categrorical columns forward - - feature_names = dataset_all.columns - print(feature_names) - - ############################## - extra_info = {} - categorical_names = {} - print(categorical_features) - dataset_new = dataset_all.values.copy() - for feature in categorical_features: - print("feature", feature) - print(dataset[:, feature]) - le = sklearn.preprocessing.LabelEncoder() - le.fit(dataset[:, feature]) - categorical_names[feature] = le.classes_ - dataset_new[:, feature] = le.transform(dataset[:, feature]) - - ###################################3 - # target as categorical - labels_new = [] - - le = sklearn.preprocessing.LabelEncoder() - le.fit(dataset[:, -1]) - dataset_new[:, -1]= le.transform(dataset[:, -1]) - class_names = le.classes_ - ######################################33 - - - if (False): - dataset_new = np.delete(dataset_new, -1, axis=1) - oneencoder = sklearn.preprocessing.OneHotEncoder() - oneencoder.fit(dataset_new[:, categorical_features]) - print(oneencoder.categories_) - n_transformed_features = sum([len(cats) for cats in oneencoder.categories_]) - print(n_transformed_features) - print(dataset_new.shape) - X = dataset_new[:,categorical_features][0] - print(X) - x = np.expand_dims(X, axis=0) - print("x", x, x.shape) - y = dataset_new[0].copy() - print(y.shape, oneencoder.transform(x).shape) - y[categorical_features] = oneencoder.transform(x).toarray() - - print("y", y, y.shape) - - z = oneencoder.inverse_transform(y) - print(z.shape) - exit() - - ###########################################################################3 - extra_info = {"categorical_features": categorical_features, - "categorical_names": categorical_names, - "feature_names": feature_names, - "class_names": class_names} - - new_file_train = raw_data_path + dataset_name + '_data.csv' - df = pd.DataFrame(data=dataset_new) - df.columns = list(feature_names) - df.to_csv(new_file_train, mode = 'w', index=False) - print("new dataset", new_file_train) - - - f = open(raw_data_path + dataset_name + '_data.csv.pkl', "wb") - pickle.dump(extra_info, f) - f.close() diff --git a/pages/application/RandomForest/utils/xgbooster/tree.py b/pages/application/RandomForest/utils/xgbooster/tree.py deleted file mode 100644 index ebaf24dbf4672e985943f242f238dad8b841e604..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbooster/tree.py +++ /dev/null @@ -1,196 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## tree.py (reuses parts of the code of SHAP) -## -## Created on: Dec 7, 2018 -## Author: Nina Narodytska -## E-mail: narodytska@vmware.com -## - -# -#============================================================================== -from anytree import Node, RenderTree,AsciiStyle -import json -import numpy as np -import xgboost as xgb -import math - - -# -#============================================================================== -class xgnode(Node): - def __init__(self, id, parent = None): - Node.__init__(self, id, parent) - self.id = id # The node value - self.name = None - self.left_node_id = -1 # Left child - self.right_node_id = -1 # Right child - self.missing_node_id = -1 - - self.feature = -1 - self.threshold = -1 - - self.cover = -1 - self.values = -1 - - def __str__(self): - pref = ' ' * self.depth - if (len(self.children) == 0): - return (pref+ "leaf: {} {}".format(self.id, self.values)) - else: - if(self.name is None): - return (pref+ "{} f{}<{}".format(self.id, self.feature, self.threshold)) - else: - return (pref+ "{} \"{}\"<{}".format(self.id, self.name, 
self.threshold)) - - -# -#============================================================================== -def build_tree(json_tree, node = None, feature_names = None, inverse = False): - def max_id(node): - if "children" in node: - return max(node["nodeid"], *[max_id(n) for n in node["children"]]) - else: - return node["nodeid"] - m = max_id(json_tree) + 1 - def extract_data(json_node, root = None, feature_names = None): - i = json_node["nodeid"] - if (root is None): - node = xgnode(i) - else: - node = xgnode(i, parent = root) - node.cover = json_node["cover"] - if "children" in json_node: - - node.left_node_id = json_node["yes"] - node.right_node_id = json_node["no"] - node.missing_node_id = json_node["missing"] - node.feature = json_node["split"] - if (feature_names is not None): - node.name = feature_names[node.feature] - node.threshold = json_node["split_condition"] - for c, n in enumerate(json_node["children"]): - child = extract_data(n, node, feature_names) - elif "leaf" in json_node: - node.values = json_node["leaf"] - if(inverse): - node.values = -node.values - return node - - root = extract_data(json_tree, None, feature_names) - return root - -# -#============================================================================== -def walk_tree(node): - if (len(node.children) == 0): - # leaf - print(node) - else: - print(node) - walk_tree(node.children[0]) - walk_tree(node.children[1]) - -def count_nodes(root): - def count(node): - if len(node.children): - return sum([1+count(n) for n in node.children]) - else: - return 0 - m = count(root) + 1 - return m - -# -#============================================================================== -def scores_tree(node, sample): - if (len(node.children) == 0): - # leaf - return node.values - else: - feature_branch = node.feature - sample_value = sample[feature_branch] - assert(sample_value is not None) - if(sample_value < node.threshold): - return scores_tree(node.children[0], sample) - else: - return scores_tree(node.children[1], sample) - - -# -#============================================================================== -class TreeEnsemble: - """ An ensemble of decision trees. - - This object provides a common interface to many different types of models. 
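
    A self-contained sketch of the per-class aggregation that the predict
    method below performs in the multiclass case (illustrative only: the
    scores array stands for the leaf values collected from the trees,
    interleaved by class as in the XGBoost JSON dump):

        import numpy as np

        scores = np.array([0.2, -0.1, 0.4, 0.1, 0.0, 0.3])  # 6 trees, 3 classes
        nb_classes = 3
        class_scores = np.array([np.exp(scores[i::nb_classes].sum())
                                 for i in range(nb_classes)])
        prob = class_scores / class_scores.sum()  # normalised class probabilities
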
- """ - def __init__(self, model, feature_names = None, nb_classes = 0): - self.model_type = "xgboost" - self.original_model = model.get_booster() - self.base_offset = None - json_trees = get_xgboost_json(self.original_model) - self.trees = [build_tree(json.loads(t), None, feature_names) for t in json_trees] - if(nb_classes == 2): - # NASTY trick for binary - # We change signs of values in leaves so that we can just sum all the values in leaves for class X - # and take max to get the right class - self.otrees = [build_tree(json.loads(t), None, feature_names, inverse = True) for t in json_trees] - self.itrees = [build_tree(json.loads(t), None, feature_names) for t in json_trees] - self.trees = [] - for i,_ in enumerate(self.otrees): - self.trees.append(self.otrees[i]) - self.trees.append(self.itrees[i]) - self.feature_names = feature_names - self.sz = sum([count_nodes(dt) for dt in self.trees]) - def print_tree(self): - for i,t in enumerate(self.trees): - print("tree number: ", i) - walk_tree(t) - - def invert_tree_prob(self, node): - if (len(node.children) == 0): - node.values = -node.values - return node - else: - self.invert_tree_prob(node.children[0]) - self.invert_tree_prob(node.children[1]) - return node - def predict(self, samples, nb_classes): - # https://github.com/dmlc/xgboost/issues/1746#issuecomment-290130695 - prob = [] - for sample in np.asarray(samples): - scores = [] - for i,t in enumerate(self.trees): - s = scores_tree(t, sample) - scores.append((s)) - scores = np.asarray(scores) - class_scores = [] - if (nb_classes == 2): - - for i in range(nb_classes): - class_scores.append(math.exp(-(scores[i::nb_classes]).sum())) # swap signs back as we had to use this trick in the contractor - s0 = class_scores[0] - s1 = class_scores[1] - v0 = 1/(1 + s0) - v1 = 1/(1 + s1) - class_scores[0] = v0 - class_scores[1] = v1 - else: - for i in range(nb_classes): - class_scores.append(math.exp((scores[i::nb_classes]).sum())) - class_scores = np.asarray(class_scores) - prob.append(class_scores/class_scores.sum()) - return np.asarray(prob).reshape((-1, nb_classes)) - - -# -#============================================================================== -def get_xgboost_json(model): - """ REUSED FROM SHAP - This gets a JSON dump of an XGBoost model while ensuring the feature names are their indexes. 
- """ - fnames = model.feature_names - model.feature_names = None - json_trees = model.get_dump(with_stats=True, dump_format="json") - model.feature_names = fnames - return json_trees diff --git a/pages/application/RandomForest/utils/xgbooster/validate.py b/pages/application/RandomForest/utils/xgbooster/validate.py deleted file mode 100644 index c3c6f82a6f241a7f158ead0694b1718893d89c5c..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbooster/validate.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## validate.py -## -## Created on: Jan 4, 2019 -## Author: Alexey Ignatiev -## E-mail: aignatiev@ciencias.ulisboa.pt -## - -# -#============================================================================== -from __future__ import print_function -import getopt -import numpy as np -import os -from pysat.formula import IDPool -from pysmt.shortcuts import Solver -from pysmt.shortcuts import And, BOOL, Implies, Not, Or, Symbol -from pysmt.shortcuts import Equals, GE, GT, LE, LT, Real, REAL -import resource -from six.moves import range -import sys - - -# -#============================================================================== -class SMTValidator(object): - """ - Validating Anchor's explanations using SMT solving. - """ - - def __init__(self, formula, feats, nof_classes, xgb): - """ - Constructor. - """ - - self.ftids = {f: i for i, f in enumerate(feats)} - self.nofcl = nof_classes - self.idmgr = IDPool() - self.optns = xgb.options - - # xgbooster will also be needed - self.xgb = xgb - - self.verbose = self.optns.verb - self.oracle = Solver(name=self.xgb.options.solver) - - self.inps = [] # input (feature value) variables - for f in self.xgb.extended_feature_names_as_array_strings: - if '_' not in f: - self.inps.append(Symbol(f, typename=REAL)) - else: - self.inps.append(Symbol(f, typename=BOOL)) - - self.outs = [] # output (class score) variables - for c in range(self.nofcl): - self.outs.append(Symbol('class{0}_score'.format(c), typename=REAL)) - - # theory - self.oracle.add_assertion(formula) - - # current selector - self.selv = None - - def prepare(self, sample, expl): - """ - Prepare the oracle for validating an explanation given a sample. 
- """ - - if self.selv: - # disable the previous assumption if any - self.oracle.add_assertion(Not(self.selv)) - - # creating a fresh selector for a new sample - sname = ','.join([str(v).strip() for v in sample]) - - # the samples should not repeat; otherwise, they will be - # inconsistent with the previously introduced selectors - assert sname not in self.idmgr.obj2id, 'this sample has been considered before (sample {0})'.format(self.idmgr.id(sname)) - self.selv = Symbol('sample{0}_selv'.format(self.idmgr.id(sname)), typename=BOOL) - - self.rhypos = [] # relaxed hypotheses - - # transformed sample - self.sample = list(self.xgb.transform(sample)[0]) - - # preparing the selectors - for i, (inp, val) in enumerate(zip(self.inps, self.sample), 1): - feat = inp.symbol_name().split('_')[0] - selv = Symbol('selv_{0}'.format(feat)) - val = float(val) - - self.rhypos.append(selv) - - # adding relaxed hypotheses to the oracle - for inp, val, sel in zip(self.inps, self.sample, self.rhypos): - if '_' not in inp.symbol_name(): - hypo = Implies(self.selv, Implies(sel, Equals(inp, Real(float(val))))) - else: - hypo = Implies(self.selv, Implies(sel, inp if val else Not(inp))) - - self.oracle.add_assertion(hypo) - - # propagating the true observation - if self.oracle.solve([self.selv] + self.rhypos): - model = self.oracle.get_model() - else: - assert 0, 'Formula is unsatisfiable under given assumptions' - - # choosing the maximum - outvals = [float(model.get_py_value(o)) for o in self.outs] - maxoval = max(zip(outvals, range(len(outvals)))) - - # correct class id (corresponds to the maximum computed) - true_output = maxoval[1] - - # forcing a misclassification, i.e. a wrong observation - disj = [] - for i in range(len(self.outs)): - if i != true_output: - disj.append(GT(self.outs[i], self.outs[true_output])) - self.oracle.add_assertion(Implies(self.selv, Or(disj))) - - # removing all hypotheses except for those in the explanation - hypos = [] - for i, hypo in enumerate(self.rhypos): - j = self.ftids[self.xgb.transform_inverse_by_index(i)[0]] - if j in expl: - hypos.append(hypo) - self.rhypos = hypos - - if self.verbose: - inpvals = self.xgb.readable_sample(sample) - - preamble = [] - for f, v in zip(self.xgb.feature_names, inpvals): - if f not in v: - preamble.append('{0} = {1}'.format(f, v)) - else: - preamble.append(v) - - print(' explanation for: "IF {0} THEN {1}"'.format(' AND '.join(preamble), self.xgb.target_name[true_output])) - - def validate(self, sample, expl): - """ - Make an effort to show that the explanation is too optimistic. 
- """ - - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - # adapt the solver to deal with the current sample - self.prepare(sample, expl) - - # if satisfiable, then there is a counterexample - if self.oracle.solve([self.selv] + self.rhypos): - model = self.oracle.get_model() - inpvals = [float(model.get_py_value(i)) for i in self.inps] - outvals = [float(model.get_py_value(o)) for o in self.outs] - maxoval = max(zip(outvals, range(len(outvals)))) - - inpvals = self.xgb.transform_inverse(np.array(inpvals))[0] - self.coex = tuple([inpvals, maxoval[1]]) - inpvals = self.xgb.readable_sample(inpvals) - - if self.verbose: - preamble = [] - for f, v in zip(self.xgb.feature_names, inpvals): - if f not in v: - preamble.append('{0} = {1}'.format(f, v)) - else: - preamble.append(v) - - print(' explanation is incorrect') - print(' counterexample: "IF {0} THEN {1}"'.format(' AND '.join(preamble), self.xgb.target_name[maxoval[1]])) - else: - self.coex = None - - if self.verbose: - print(' explanation is correct') - - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.time - - if self.verbose: - print(' time: {0:.2f}'.format(self.time)) - - return self.coex diff --git a/pages/application/RandomForest/utils/xgbooster/xgbooster.py b/pages/application/RandomForest/utils/xgbooster/xgbooster.py deleted file mode 100644 index 37e86c4f1f7f779a62b8e9ac3ad3a5e03121b575..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbooster/xgbooster.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/us/bin/env python -# -*- coding:utf-8 -*- -## -## xgbooster.py -## -## Created on: Dec 7, 2018 -## Author: Nina Narodytska, Alexey Ignatiev -## E-mail: narodytska@vmware.com, aignatiev@ciencias.ulisboa.pt -## - -# -# ============================================================================== -from __future__ import print_function - -import random - -import numpy as np -from six.moves import range -from xgboost import XGBClassifier - -from .encode import SMTEncoder -from .explain import SMTExplainer -from .validate import SMTValidator - - -# -# ============================================================================== -class XGBooster(object): - """ - The main class to train/encode/explain XGBoost models. - """ - - def __init__(self, options, from_model=None): - """ - Constructor. - """ - np.random.seed(random.randint(1, 100)) - - self.model = XGBClassifier() - self.model = from_model - - self.feature_names = options["feature_names"] - self.num_class = options["n_classes"] - self.nb_features = options["n_features"] - - self.mapping_features() - - def encode(self, test_on=None): - """ - Encode a tree ensemble trained previously. - """ - - encoder = SMTEncoder(self.model, self.feature_names, self.num_class, self) - self.enc, self.intvs, self.imaps, self.ivars = encoder.encode() - - if test_on: - encoder.test_sample(np.array(test_on)) - - def explain_sample(self, sample, smallest, solver, use_lime=None, use_anchor=None, use_shap=None, - expl_ext=None, prefer_ext=False, nof_feats=5): - """ - Explain a prediction made for a given sample with a previously - trained tree ensemble. 
- """ - - if use_lime: - expl = use_lime(self, sample=sample, nb_samples=5, - nb_features_in_exp=nof_feats) - elif use_anchor: - expl = use_anchor(self, sample=sample, nb_samples=5, - nb_features_in_exp=nof_feats, threshold=0.95) - elif use_shap: - expl = use_shap(self, sample=sample, nb_features_in_exp=nof_feats) - else: - if 'x' not in dir(self): - self.x = SMTExplainer(self.enc, self.intvs, self.imaps, - self.ivars, self.feature_names, self.num_class, - solver, self) - - expl = self.x.explain(np.array(sample), smallest, expl_ext, prefer_ext) - - return expl - - def explain(self, samples, smallest, solver, use_lime=None, use_anchor=None, use_shap=None, - expl_ext=None, prefer_ext=False, nof_feats=5): - explanations = [] - for sample in samples : - explanations.append(self.explain_sample(sample, smallest, solver, use_lime, use_anchor, use_shap, - expl_ext, prefer_ext, nof_feats)) - - return explanations - - def validate(self, sample, expl): - """ - Make an attempt to show that a given explanation is optimistic. - """ - - # there must exist an encoding - if 'enc' not in dir(self): - encoder = SMTEncoder(self.model, self.feature_names, self.num_class, - self) - self.enc, _, _, _ = encoder.encode() - - if 'v' not in dir(self): - self.v = SMTValidator(self.enc, self.feature_names, self.num_class, - self) - - # try to compute a counterexample - return self.v.validate(np.array(sample), expl) - - def mapping_features(self): - self.extended_feature_names = {} - self.extended_feature_names_as_array_strings = [] - counter = 0 - - for i in range(self.nb_features): - self.extended_feature_names.update({counter: (self.feature_names[i], None)}) - self.extended_feature_names_as_array_strings.append("f{}".format(i)) # (self.feature_names[i]) - counter = counter + 1 diff --git a/pages/application/RandomForest/utils/xgbrf/__init__.py b/pages/application/RandomForest/utils/xgbrf/__init__.py deleted file mode 100644 index 7cf92d3ca20939258ac326649b20fc5c0a79abb7..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbrf/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .encode import * -from .tree import * -from .xgb_rf import * -from .preprocess import * \ No newline at end of file diff --git a/pages/application/RandomForest/utils/xgbrf/encode.py b/pages/application/RandomForest/utils/xgbrf/encode.py deleted file mode 100644 index 78816a8e5e5723ff99979cd918a9b4b4b27f113c..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbrf/encode.py +++ /dev/null @@ -1,352 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## encode.py -## -## Created on: Dec 7, 2018 -## Author: Alexey Ignatiev -## E-mail: aignatiev@ciencias.ulisboa.pt -## - -# -#============================================================================== -from __future__ import print_function -import collections -from pysat.formula import IDPool -from pysmt.smtlib.parser import SmtLibParser -from pysmt.shortcuts import And, BOOL, Iff, Implies, Not, Or, Symbol, get_model -from pysmt.shortcuts import Equals, ExactlyOne, LT, Plus, REAL, Real, write_smtlib -from .tree import TreeEnsemble, scores_tree -import six -from six.moves import range - -try: # for Python2 - from cStringIO import StringIO -except ImportError: # for Python3 - from io import StringIO - - -# -#============================================================================== -class SMTEncoder(object): - """ - Encoder of XGBoost tree ensembles into SMT. 
- """ - - def __init__(self, model, feats, nof_classes, xgb, from_file=None): - """ - Constructor. - """ - - self.model = model - self.features_names = feats - self.feats = {f: i for i, f in enumerate(feats)} - self.nofcl = nof_classes - self.idmgr = IDPool() - - # xgbooster will also be needed - self.xgb = xgb - - # for interval-based encoding - self.intvs, self.imaps, self.ivars = None, None, None - - if from_file: - self.load_from(from_file) - - def traverse(self, tree, tvar, prefix=[]): - """ - Traverse a tree and encode each node. - """ - - if tree.children: - pos, neg = self.encode_node(tree) - - self.traverse(tree.children[0], tvar, prefix + [pos]) - self.traverse(tree.children[1], tvar, prefix + [neg]) - else: # leaf node - if prefix: - self.enc.append(Implies(And(prefix), Equals(tvar, Real(tree.values)))) - else: - self.enc.append(Equals(tvar, Real(tree.values))) - - def encode_node(self, node): - """ - Encode a node of a tree. - """ - - if '_' not in node.name: - # continuous features => expecting an upper bound - # feature and its upper bound (value) - f, v = node.name, node.threshold - - existing = True if tuple([f, v]) in self.idmgr.obj2id else False - vid = self.idmgr.id(tuple([f, v])) - bv = Symbol('bvar{0}'.format(vid), typename=BOOL) - - if not existing: - if self.intvs: - d = self.imaps[f][v] + 1 - pos, neg = self.ivars[f][:d], self.ivars[f][d:] - self.enc.append(Iff(bv, Or(pos))) - self.enc.append(Iff(Not(bv), Or(neg))) - else: - fvar, fval = Symbol(f, typename=REAL), Real(v) - self.enc.append(Iff(bv, LT(fvar, fval))) - - return bv, Not(bv) - else: - # all features are expected to be categorical and - # encoded with one-hot encoding into Booleans - # each node is expected to be of the form: f_i < 0.5 - bv = Symbol(node.name, typename=BOOL) - - # left branch is positive, i.e. bv is true - # right branch is negative, i.e. bv is false - return Not(bv), bv - - def compute_intervals(self): - """ - Traverse all trees in the ensemble and extract intervals for each - feature. - - At this point, the method only works for numerical datasets! - """ - - def traverse_intervals(tree): - """ - Auxiliary function. Recursive tree traversal. - """ - - if tree.children: - f = tree.name - v = tree.threshold - self.intvs[f].add(v) - - traverse_intervals(tree.children[0]) - traverse_intervals(tree.children[1]) - - # initializing the intervals - self.intvs = {'f{0}'.format(i): set([]) for i in range(len(self.feats))} - - for tree in self.ensemble.trees: - traverse_intervals(tree) - - # OK, we got all intervals; let's sort the values - self.intvs = {f: sorted(self.intvs[f]) + ['+'] for f in six.iterkeys(self.intvs)} - - self.imaps, self.ivars = {}, {} - for feat, intvs in six.iteritems(self.intvs): - self.imaps[feat] = {} - self.ivars[feat] = [] - for i, ub in enumerate(intvs): - self.imaps[feat][ub] = i - - ivar = Symbol(name='{0}_intv{1}'.format(feat, i), typename=BOOL) - self.ivars[feat].append(ivar) - - def encode(self): - """ - Do the job. 
- """ - - self.enc = [] - - # getting a tree ensemble - self.ensemble = TreeEnsemble(self.model, - self.xgb.extended_feature_names_as_array_strings, - nb_classes=self.nofcl) - - # introducing class score variables - csum = [] - for j in range(self.nofcl): - cvar = Symbol('class{0}_score'.format(j), typename=REAL) - csum.append(tuple([cvar, []])) - - # if targeting interval-based encoding, - # traverse all trees and extract all possible intervals - # for each feature - # if self.optns.encode == 'smtbool': - self.compute_intervals() - - # traversing and encoding each tree - for i, tree in enumerate(self.ensemble.trees): - # getting class id - clid = i % self.nofcl - - # encoding the tree - tvar = Symbol('tr{0}_score'.format(i + 1), typename=REAL) - self.traverse(tree, tvar, prefix=[]) - - # this tree contributes to class with clid - csum[clid][1].append(tvar) - - # encoding the sums - for pair in csum: - cvar, tvars = pair - self.enc.append(Equals(cvar, Plus(tvars))) - - # enforce exactly one of the feature values to be chosen - # (for categorical features) - categories = collections.defaultdict(lambda: []) - for f in self.xgb.extended_feature_names_as_array_strings: - if '_' in f: - categories[f.split('_')[0]].append(Symbol(name=f, typename=BOOL)) - for c, feats in six.iteritems(categories): - self.enc.append(ExactlyOne(feats)) - - # number of assertions - nof_asserts = len(self.enc) - - # making conjunction - self.enc = And(self.enc) - - # number of variables - nof_vars = len(self.enc.get_free_variables()) - - return self.enc, self.intvs, self.imaps, self.ivars - - def test_sample(self, sample, solver): - """ - Check whether or not the encoding "predicts" the same class - as the classifier given an input sample. - """ - - # first, compute the scores for all classes as would be - # predicted by the classifier - - # score arrays computed for each class - csum = [[] for c in range(self.nofcl)] - - sample_internal = list(self.xgb.transform(sample)[0]) - - # traversing all trees - for i, tree in enumerate(self.ensemble.trees): - # getting class id - clid = i % self.nofcl - - # a score computed by the current tree - score = scores_tree(tree, sample_internal) - - # this tree contributes to class with clid - csum[clid].append(score) - - # final scores for each class - cscores = [sum(scores) for scores in csum] - - # second, get the scores computed with the use of the encoding - - # asserting the sample - hypos = [] - - if not self.intvs: - for i, fval in enumerate(sample_internal): - feat, vid = self.xgb.transform_inverse_by_index(i) - fid = self.feats[feat] - - if vid == None: - fvar = Symbol('f{0}'.format(fid), typename=REAL) - hypos.append(Equals(fvar, Real(float(fval)))) - else: - fvar = Symbol('f{0}_{1}'.format(fid, vid), typename=BOOL) - if int(fval) == 1: - hypos.append(fvar) - else: - hypos.append(Not(fvar)) - else: - for i, fval in enumerate(sample_internal): - feat, _ = self.xgb.transform_inverse_by_index(i) - feat = 'f{0}'.format(self.feats[feat]) - - # determining the right interval and the corresponding variable - for ub, fvar in zip(self.intvs[feat], self.ivars[feat]): - if ub == '+' or fval < ub: - hypos.append(fvar) - break - else: - assert 0, 'No proper interval found for {0}'.format(feat) - - # now, getting the model - escores = [] - model = get_model(And(self.enc, *hypos), solver_name=solver) - for c in range(self.nofcl): - v = Symbol('class{0}_score'.format(c), typename=REAL) - escores.append(float(model.get_py_value(v))) - - assert all(map(lambda c, e: abs(c - e) <= 0.001, 
cscores, escores)), \ - 'wrong prediction: {0} vs {1}'.format(cscores, escores) - - def save_to(self, outfile): - """ - Save the encoding into a file with a given name. - """ - - if outfile.endswith('.txt'): - outfile = outfile[:-3] + 'smt2' - - write_smtlib(self.enc, outfile) - - # appending additional information - with open(outfile, 'r') as fp: - contents = fp.readlines() - - # comments - comments = ['; features: {0}\n'.format(', '.join(self.feats)), - '; classes: {0}\n'.format(self.nofcl)] - - if self.intvs: - for f in self.xgb.extended_feature_names_as_array_strings: - c = '; i {0}: '.format(f) - c += ', '.join(['{0}<->{1}'.format(u, v) for u, v in zip(self.intvs[f], self.ivars[f])]) - comments.append(c + '\n') - - contents = comments + contents - with open(outfile, 'w') as fp: - fp.writelines(contents) - - def load_from(self, infile): - """ - Loads the encoding from an input file. - """ - - with open(infile, 'r') as fp: - file_content = fp.readlines() - - # empty intervals for the standard encoding - self.intvs, self.imaps, self.ivars = {}, {}, {} - - for line in file_content: - if line[0] != ';': - break - elif line.startswith('; i '): - f, arr = line[4:].strip().split(': ', 1) - f = f.replace('-', '_') - self.intvs[f], self.imaps[f], self.ivars[f] = [], {}, [] - - for i, pair in enumerate(arr.split(', ')): - ub, symb = pair.split('<->') - - if ub[0] != '+': - ub = float(ub) - symb = Symbol(symb, typename=BOOL) - - self.intvs[f].append(ub) - self.ivars[f].append(symb) - self.imaps[f][ub] = i - - elif line.startswith('; features:'): - self.feats = line[11:].strip().split(', ') - elif line.startswith('; classes:'): - self.nofcl = int(line[10:].strip()) - - parser = SmtLibParser() - script = parser.get_script(StringIO(''.join(file_content))) - - self.enc = script.get_last_formula() - - def access(self): - """ - Get access to the encoding, features names, and the number of - classes. - """ - - return self.enc, self.intvs, self.imaps, self.ivars, self.feats, self.nofcl diff --git a/pages/application/RandomForest/utils/xgbrf/explain.py b/pages/application/RandomForest/utils/xgbrf/explain.py deleted file mode 100644 index dde8e46b6f312c42b47fd74f2f290eb827609c4a..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbrf/explain.py +++ /dev/null @@ -1,300 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## explain.py -## -## Created on: Dec 14, 2018 -## Author: Alexey Ignatiev -## E-mail: aignatiev@ciencias.ulisboa.pt -## - -# -#============================================================================== -from __future__ import print_function -import numpy as np -import os -from pysat.examples.hitman import Hitman -from pysat.formula import IDPool -from pysmt.shortcuts import Solver -from pysmt.shortcuts import And, BOOL, Implies, Not, Or, Symbol -from pysmt.shortcuts import Equals, GT, Int, Real, REAL -import resource -from six.moves import range -import sys - - -# -#============================================================================== -class SMTExplainer(object): - """ - An SMT-inspired minimal explanation extractor for XGBoost models. - """ - - def __init__(self, formula, intvs, imaps, ivars, feats, nof_classes, - options, xgb): - """ - Constructor. 
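
        The assumption-based pattern this class relies on throughout, in
        isolation (self-contained pysmt sketch; assumes the z3 backend of
        pysmt is installed):

            from pysmt.shortcuts import (BOOL, Equals, Implies, Not, Real,
                                         REAL, Solver, Symbol)

            x = Symbol('f0', typename=REAL)
            sel = Symbol('selv_f0', typename=BOOL)
            with Solver(name='z3') as oracle:
                oracle.add_assertion(Implies(sel, Equals(x, Real(1.0))))
                assert oracle.solve([sel])       # hypothesis enabled: SAT
                assert oracle.solve([Not(sel)])  # hypothesis relaxed: SAT
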
- """ - - self.feats = feats - self.intvs = intvs - self.imaps = imaps - self.ivars = ivars - self.nofcl = nof_classes - self.optns = options - self.idmgr = IDPool() - - # saving XGBooster - self.xgb = xgb - - self.verbose = self.optns.verb - self.oracle = Solver(name=options.solver) - - self.inps = [] # input (feature value) variables - for f in self.xgb.extended_feature_names_as_array_strings: - if '_' not in f: - self.inps.append(Symbol(f, typename=REAL)) - else: - self.inps.append(Symbol(f, typename=BOOL)) - - self.outs = [] # output (class score) variables - for c in range(self.nofcl): - self.outs.append(Symbol('class{0}_score'.format(c), typename=REAL)) - - # theory - self.oracle.add_assertion(formula) - - # current selector - self.selv = None - - def prepare(self, sample): - """ - Prepare the oracle for computing an explanation. - """ - - if self.selv: - # disable the previous assumption if any - self.oracle.add_assertion(Not(self.selv)) - - # creating a fresh selector for a new sample - sname = ','.join([str(v).strip() for v in sample]) - - # the samples should not repeat; otherwise, they will be - # inconsistent with the previously introduced selectors - assert sname not in self.idmgr.obj2id, 'this sample has been considered before (sample {0})'.format(self.idmgr.id(sname)) - self.selv = Symbol('sample{0}_selv'.format(self.idmgr.id(sname)), typename=BOOL) - - self.rhypos = [] # relaxed hypotheses - - # transformed sample - self.sample = list(self.xgb.transform(sample)[0]) - - self.sel2fid = {} # selectors to original feature ids - self.sel2vid = {} # selectors to categorical feature ids - - # preparing the selectors - for i, (inp, val) in enumerate(zip(self.inps, self.sample), 1): - feat = inp.symbol_name().split('_')[0] - selv = Symbol('selv_{0}'.format(feat)) - val = float(val) - - self.rhypos.append(selv) - if selv not in self.sel2fid: - self.sel2fid[selv] = int(feat[1:]) - self.sel2vid[selv] = [i - 1] - else: - self.sel2vid[selv].append(i - 1) - - # adding relaxed hypotheses to the oracle - if not self.intvs: - for inp, val, sel in zip(self.inps, self.sample, self.rhypos): - if '_' not in inp.symbol_name(): - hypo = Implies(self.selv, Implies(sel, Equals(inp, Real(float(val))))) - else: - hypo = Implies(self.selv, Implies(sel, inp if val else Not(inp))) - - self.oracle.add_assertion(hypo) - else: - for inp, val, sel in zip(self.inps, self.sample, self.rhypos): - inp = inp.symbol_name() - # determining the right interval and the corresponding variable - for ub, fvar in zip(self.intvs[inp], self.ivars[inp]): - if ub == '+' or val < ub: - hypo = Implies(self.selv, Implies(sel, fvar)) - break - - self.oracle.add_assertion(hypo) - - # in case of categorical data, there are selector duplicates - # and we need to remove them - self.rhypos = sorted(set(self.rhypos), key=lambda x: int(x.symbol_name()[6:])) - - # propagating the true observation - if self.oracle.solve([self.selv] + self.rhypos): - model = self.oracle.get_model() - else: - assert 0, 'Formula is unsatisfiable under given assumptions' - - # choosing the maximum - outvals = [float(model.get_py_value(o)) for o in self.outs] - maxoval = max(zip(outvals, range(len(outvals)))) - - # correct class id (corresponds to the maximum computed) - self.out_id = maxoval[1] - self.output = self.xgb.target_name[self.out_id] - - # forcing a misclassification, i.e. 
a wrong observation - disj = [] - for i in range(len(self.outs)): - if i != self.out_id: - disj.append(GT(self.outs[i], self.outs[self.out_id])) - self.oracle.add_assertion(Implies(self.selv, Or(disj))) - - inpvals = self.xgb.readable_sample(sample) - - self.preamble = [] - for f, v in zip(self.xgb.feature_names, inpvals): - if f not in v: - self.preamble.append('{0} = {1}'.format(f, v)) - else: - self.preamble.append(v) - - return "IF {0} THEN {1}".format(' AND '.join(self.preamble), self.output) - - def explain(self, sample, smallest, expl_ext=None, prefer_ext=False): - """ - Hypotheses minimization. - """ - - explanation_dic = {} - # adapt the solver to deal with the current sample - explanation_dic["Instance : "] = self.prepare(sample) - - # saving external explanation to be minimized further - if expl_ext == None or prefer_ext: - self.to_consider = [True for h in self.rhypos] - else: - eexpl = set(expl_ext) - self.to_consider = [True if i in eexpl else False for i, h in enumerate(self.rhypos)] - - # if satisfiable, then the observation is not implied by the hypotheses - if self.oracle.solve([self.selv] + [h for h, c in zip(self.rhypos, self.to_consider) if c]): - explanation_dic["no implication"] = self.oracle.get_model() - else: - if not smallest: - self.compute_minimal(prefer_ext=prefer_ext) - else: - self.compute_smallest() - - explanation = sorted([self.sel2fid[h] for h in self.rhypos]) - self.preamble = [self.preamble[i] for i in explanation] - explanation_dic['explanation: '] = "IF {0} THEN {1}".format(' AND '.join(self.preamble), self.xgb.target_name[self.out_id]) - explanation_dic['# hypos left:'] = str(len(self.rhypos)) - - return explanation_dic - - def compute_minimal(self, prefer_ext=False): - """ - Compute any subset-minimal explanation. - """ - - i = 0 - - if not prefer_ext: - # here, we want to reduce external explanation - - # filtering out unnecessary features if external explanation is given - self.rhypos = [h for h, c in zip(self.rhypos, self.to_consider) if c] - else: - # here, we want to compute an explanation that is preferred - # to be similar to the given external one - # for that, we try to postpone removing features that are - # in the external explanation provided - - rhypos = [h for h, c in zip(self.rhypos, self.to_consider) if not c] - rhypos += [h for h, c in zip(self.rhypos, self.to_consider) if c] - self.rhypos = rhypos - - # simple deletion-based linear search - while i < len(self.rhypos): - to_test = self.rhypos[:i] + self.rhypos[(i + 1):] - - if self.oracle.solve([self.selv] + to_test): - i += 1 - else: - self.rhypos = to_test - - def compute_smallest(self): - """ - Compute a cardinality-minimal explanation. 
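
        The loop below follows the implicit hitting-set pattern; stripped of
        the SMT oracle it reduces to this (self-contained pysat sketch with
        made-up correction sets):

            from pysat.examples.hitman import Hitman

            with Hitman(bootstrap_with=[[0, 1, 2]]) as hitman:
                hitman.hit([0, 1])   # every candidate must intersect {0, 1}
                hitman.hit([2])      # ... and {2}
                hset = hitman.get()  # a minimum hitting set, e.g. [0, 2]
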
- """ - - # result - rhypos = [] - - with Hitman(bootstrap_with=[[i for i in range(len(self.rhypos)) if self.to_consider[i]]]) as hitman: - # computing unit-size MCSes - for i, hypo in enumerate(self.rhypos): - if self.to_consider[i] == False: - continue - - if self.oracle.solve([self.selv] + self.rhypos[:i] + self.rhypos[(i + 1):]): - hitman.hit([i]) - - # main loop - iters = 0 - while True: - hset = hitman.get() - iters += 1 - - if self.verbose > 1: - print('iter:', iters) - print('cand:', hset) - - if self.oracle.solve([self.selv] + [self.rhypos[i] for i in hset]): - to_hit = [] - satisfied, unsatisfied = [], [] - - removed = list(set(range(len(self.rhypos))).difference(set(hset))) - - model = self.oracle.get_model() - for h in removed: - i = self.sel2fid[self.rhypos[h]] - if '_' not in self.inps[i].symbol_name(): - # feature variable and its expected value - var, exp = self.inps[i], self.sample[i] - - # true value - true_val = float(model.get_py_value(var)) - - if not exp - 0.001 <= true_val <= exp + 0.001: - unsatisfied.append(h) - else: - hset.append(h) - else: - for vid in self.sel2vid[self.rhypos[h]]: - var, exp = self.inps[vid], int(self.sample[vid]) - - # true value - true_val = int(model.get_py_value(var)) - - if exp != true_val: - unsatisfied.append(h) - break - else: - hset.append(h) - - # computing an MCS (expensive) - for h in unsatisfied: - if self.oracle.solve([self.selv] + [self.rhypos[i] for i in hset] + [self.rhypos[h]]): - hset.append(h) - else: - to_hit.append(h) - - if self.verbose > 1: - print('coex:', to_hit) - - hitman.hit(to_hit) - else: - self.rhypos = [self.rhypos[i] for i in hset] - break diff --git a/pages/application/RandomForest/utils/xgbrf/pi_checker.py b/pages/application/RandomForest/utils/xgbrf/pi_checker.py deleted file mode 100644 index f3f0d33cbd6118a1d746ded0a478f6b4ba4a0d90..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbrf/pi_checker.py +++ /dev/null @@ -1,197 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## pi_checker.py -## -## Created on: -## Author: -## E-mail: -## - -# -#============================================================================== -from __future__ import print_function -import getopt -import numpy as np -import os -from pysat.formula import IDPool -from pysmt.shortcuts import Solver -from pysmt.shortcuts import And, BOOL, Implies, Not, Or, Symbol -from pysmt.shortcuts import Equals, GE, GT, LE, LT, Real, REAL -import resource -from six.moves import range -import sys - - -# -#============================================================================== -class SMTChecker(object): - """ - checking explanation if is a Prime Implicant using SMT solving. - """ - - def __init__(self, formula, feats, nof_classes, xgb): - """ - Constructor. 
- """ - - self.ftids = {f: i for i, f in enumerate(feats)} - self.nofcl = nof_classes - self.idmgr = IDPool() - self.optns = xgb.options - - # xgbooster will also be needed - self.xgb = xgb - - self.verbose = self.optns.verb - self.oracle = Solver(name=self.xgb.options.solver) - - self.inps = [] # input (feature value) variables - for f in self.xgb.extended_feature_names_as_array_strings: - if '_' not in f: - self.inps.append(Symbol(f, typename=REAL)) - else: - self.inps.append(Symbol(f, typename=BOOL)) - - self.outs = [] # output (class score) variables - for c in range(self.nofcl): - self.outs.append(Symbol('class{0}_score'.format(c), typename=REAL)) - - # theory - self.oracle.add_assertion(formula) - #print('+++++ ',len(self.oracle._assertion_stack)) - - # current selector - self.selv = None - - def prepare(self, sample, expl): - """ - Prepare the oracle for validating an explanation given a sample. - """ - - if self.selv: - # disable the previous assumption if any - self.oracle.add_assertion(Not(self.selv)) - - # creating a fresh selector for a new sample - sname = ','.join([str(v).strip() for v in sample]) - - # the samples should not repeat; otherwise, they will be - # inconsistent with the previously introduced selectors - assert sname not in self.idmgr.obj2id, 'this sample has been considered before (sample {0})'.format(self.idmgr.id(sname)) - self.selv = Symbol('sample{0}_selv'.format(self.idmgr.id(sname)), typename=BOOL) - - self.rhypos = [] # relaxed hypotheses - - # transformed sample - self.sample = list(self.xgb.transform(sample)[0]) - - # preparing the selectors - for i, (inp, val) in enumerate(zip(self.inps, self.sample), 1): - feat = inp.symbol_name().split('_')[0] - selv = Symbol('selv_{0}'.format(feat)) - val = float(val) - - self.rhypos.append(selv) - - - # adding relaxed hypotheses to the oracle - for inp, val, sel in zip(self.inps, self.sample, self.rhypos): - if '_' not in inp.symbol_name(): - hypo = Implies(self.selv, Implies(sel, Equals(inp, Real(float(val))))) - else: - hypo = Implies(self.selv, Implies(sel, inp if val else Not(inp))) - - self.oracle.add_assertion(hypo) - - # propagating the true observation - if self.oracle.solve([self.selv] + self.rhypos): - model = self.oracle.get_model() - else: - assert 0, 'Formula is unsatisfiable under given assumptions' - - # choosing the maximum - outvals = [float(model.get_py_value(o)) for o in self.outs] - maxoval = max(zip(outvals, range(len(outvals)))) - - # correct class id (corresponds to the maximum computed) - true_output = maxoval[1] - - # forcing a misclassification, i.e. 
a wrong observation - disj = [] - for i in range(len(self.outs)): - if i != true_output: - disj.append(GT(self.outs[i], self.outs[true_output])) - self.oracle.add_assertion(Implies(self.selv, Or(disj))) - - # removing all hypotheses except for those in the explanation - hypos = [] - for i, hypo in enumerate(self.rhypos): - j = self.ftids[self.xgb.transform_inverse_by_index(i)[0]] - if j in expl: - hypos.append(hypo) - self.rhypos = hypos - #print('assumps: ', self.rhypos) - #print('expl: ', expl) - - ''' - if self.verbose: - inpvals = self.xgb.readable_sample(sample) - - preamble = [] - for f, v in zip(self.xgb.feature_names, inpvals): - if f not in v: - preamble.append('{0} = {1}'.format(f, v)) - else: - preamble.append(v) - - print(' explanation for: "IF {0} THEN {1}"'.format(' AND '.join(preamble), self.xgb.target_name[true_output])) - ''' - #print('+++++ ',self.oracle._assertion_stack[len(self.oracle._assertion_stack)-1 : ]) - - def check(self, sample, expl): - """ - Check the explanation. - """ - - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - # adapt the solver to deal with the current sample - self.prepare(sample, expl) - - # if satisfiable, then there is a counterexample - if self.oracle.solve([self.selv] + self.rhypos): - print('\n explanation is incorrect') - #print(self.oracle.get_model()) - return False - else: - if self.verbose: - print('\n explanation is correct') - - # in case of categorical data, there are selector duplicates - # and we need to remove them - self.rhypos = sorted(set(self.rhypos), key=lambda x: int(x.symbol_name()[6:])) - #print(self.rhypos) - - i = 0 - # simple deletion-based linear search - while i < len(self.rhypos): - to_test = self.rhypos[:i] + self.rhypos[(i + 1):] - #print(self.rhypos[i]) - - if self.oracle.solve([self.selv] + to_test): - i += 1 - else: - print(' explanation is not a prime implicant') - return False - - - print(' explanation is a prime implicant') - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.time - - if self.verbose: - print(' time: {0:.2f}'.format(self.time)) - - return True diff --git a/pages/application/RandomForest/utils/xgbrf/preprocess.py b/pages/application/RandomForest/utils/xgbrf/preprocess.py deleted file mode 100644 index cdcd2cb3976c8764a21b512d2a5c80a073b739f1..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbrf/preprocess.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## preprocess.py -## -## Created on: Jan 10, 2019 -## Author: Nina Narodytska -## E-mail: narodytska@vmware.com -## - -# -#============================================================================== -import json -import numpy as np -import xgboost as xgb -import math -import pandas as pd -import numpy as np -import sklearn -import pickle - - -# -#============================================================================== -def preprocess_dataset(raw_data_path, files): - print("preprocess dataset from ", raw_data_path) - files = files.split(",") - - data_file = files[0] - dataset_name = files[1] - - try: - data_raw = pd.read_csv(raw_data_path + data_file, sep=',', na_values= ['']) - catcols = pd.read_csv(raw_data_path + data_file + ".catcol", header = None) - categorical_features = np.concatenate(catcols.values).tolist() - - - for i in range(len(data_raw.values[0])): - if i in categorical_features: - 
data_raw.fillna('',inplace=True) - else: - data_raw.fillna(0,inplace=True) - dataset_all = data_raw - dataset = dataset_all.values.copy() - - print(categorical_features) - except Exception as e: - print("Please provide info about categorical columns/original datasets or omit option -p", e) - exit() - - # move categrorical columns forward - - feature_names = dataset_all.columns - print(feature_names) - - ############################## - extra_info = {} - categorical_names = {} - print(categorical_features) - dataset_new = dataset_all.values.copy() - for feature in categorical_features: - print("feature", feature) - print(dataset[:, feature]) - le = sklearn.preprocessing.LabelEncoder() - le.fit(dataset[:, feature]) - categorical_names[feature] = le.classes_ - dataset_new[:, feature] = le.transform(dataset[:, feature]) - - ###################################3 - # target as categorical - labels_new = [] - - le = sklearn.preprocessing.LabelEncoder() - le.fit(dataset[:, -1]) - dataset_new[:, -1]= le.transform(dataset[:, -1]) - class_names = le.classes_ - ######################################33 - - - if (False): - dataset_new = np.delete(dataset_new, -1, axis=1) - oneencoder = sklearn.preprocessing.OneHotEncoder() - oneencoder.fit(dataset_new[:, categorical_features]) - print(oneencoder.categories_) - n_transformed_features = sum([len(cats) for cats in oneencoder.categories_]) - print(n_transformed_features) - print(dataset_new.shape) - X = dataset_new[:,categorical_features][0] - print(X) - x = np.expand_dims(X, axis=0) - print("x", x, x.shape) - y = dataset_new[0].copy() - print(y.shape, oneencoder.transform(x).shape) - y[categorical_features] = oneencoder.transform(x).toarray() - - print("y", y, y.shape) - - z = oneencoder.inverse_transform(y) - print(z.shape) - exit() - - ###########################################################################3 - extra_info = {"categorical_features": categorical_features, - "categorical_names": categorical_names, - "feature_names": feature_names, - "class_names": class_names} - - new_file_train = raw_data_path + dataset_name + '_data.csv' - df = pd.DataFrame(data=dataset_new) - df.columns = list(feature_names) - df.to_csv(new_file_train, mode = 'w', index=False) - print("new dataset", new_file_train) - - - f = open(raw_data_path + dataset_name + '_data.csv.pkl', "wb") - pickle.dump(extra_info, f) - f.close() diff --git a/pages/application/RandomForest/utils/xgbrf/tree.py b/pages/application/RandomForest/utils/xgbrf/tree.py deleted file mode 100644 index 6c7f7538508ac5c08d695d5caaab8ee12b9ce245..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbrf/tree.py +++ /dev/null @@ -1,199 +0,0 @@ -#!/usr/bin/env python -# -*- coding:utf-8 -*- -## -## tree.py (reuses parts of the code of SHAP) -## -## Created on: Dec 7, 2018 -## Author: Nina Narodytska -## E-mail: narodytska@vmware.com -## - -# -# ============================================================================== -from anytree import Node, RenderTree, AsciiStyle -import json -import numpy as np -import xgboost as xgb -import math - - -# -# ============================================================================== -class xgnode(Node): - def __init__(self, id, parent=None): - Node.__init__(self, id, parent) - self.id = id # The node value - self.name = None - self.left_node_id = -1 # Left child - self.right_node_id = -1 # Right child - self.missing_node_id = -1 - - self.feature = -1 - self.threshold = -1 - - self.cover = -1 - self.values = -1 - - def __str__(self): - 
pref = ' ' * self.depth - if (len(self.children) == 0): - return (pref + "leaf: {} {}".format(self.id, self.values)) - else: - if (self.name is None): - return (pref + "{} f{}<{}".format(self.id, self.feature, self.threshold)) - else: - return (pref + "{} \"{}\"<{}".format(self.id, self.name, self.threshold)) - - -# -# ============================================================================== -def build_tree(json_tree, node=None, feature_names=None, inverse=False): - def max_id(node): - if "children" in node: - return max(node["nodeid"], *[max_id(n) for n in node["children"]]) - else: - return node["nodeid"] - - m = max_id(json_tree) + 1 - - def extract_data(json_node, root=None, feature_names=None): - i = json_node["nodeid"] - if (root is None): - node = xgnode(i) - else: - node = xgnode(i, parent=root) - node.cover = json_node["cover"] - if "children" in json_node: - - node.left_node_id = json_node["yes"] - node.right_node_id = json_node["no"] - node.missing_node_id = json_node["missing"] - node.feature = json_node["split"] - if (feature_names is not None): - node.name = feature_names[node.feature] - node.threshold = json_node["split_condition"] - for c, n in enumerate(json_node["children"]): - child = extract_data(n, node, feature_names) - elif "leaf" in json_node: - node.values = json_node["leaf"] - if (inverse): - node.values = -node.values - return node - - root = extract_data(json_tree, None, feature_names) - return root - - -# -# ============================================================================== -def walk_tree(node): - if (len(node.children) == 0): - # leaf - print(node) - else: - print(node) - walk_tree(node.children[0]) - walk_tree(node.children[1]) - - -# -# ============================================================================== -def scores_tree(node, sample): - if (len(node.children) == 0): - # leaf - return node.values - else: - feature_branch = node.feature - sample_value = sample[feature_branch] - assert (sample_value is not None) - if (sample_value < node.threshold): - return scores_tree(node.children[0], sample) - else: - return scores_tree(node.children[1], sample) - - -# -# ============================================================================== -class TreeEnsemble: - """ An ensemble of decision trees. - - This object provides a common interface to many different types of models. 
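
    Unlike the interleaved layout used in the boosted variant above, the
    random-forest predict below groups each class's trees contiguously; a
    minimal sketch of that slicing (illustrative values):

        import numpy as np

        scores = np.array([0.2, -0.1, 0.4, 0.1, 0.0, 0.3])  # 3 classes x 2 trees
        nb_classes, nb_estimators = 3, 2
        class_scores = np.array([np.exp(scores[i:i + nb_estimators].sum())
                                 for i in range(0, nb_classes * nb_estimators,
                                                nb_estimators)])
        prob = class_scores / class_scores.sum()
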
- """ - - def __init__(self, model, feature_names=None, nb_classes=0): - self.model_type = "xgboost" - # self.original_model = model.get_booster() - self.original_model = model - #### - self.base_offset = None - json_trees = get_xgboost_json(self.original_model, feature_names) - self.trees = [build_tree(json.loads(t), None, feature_names) for t in json_trees] - if (nb_classes == 2): - # NASTY trick for binary - # We change signs of values in leaves so that we can just sum all the values in leaves for class X - # and take max to get the right class - self.otrees = [build_tree(json.loads(t), None, feature_names, inverse=True) for t in json_trees] - self.itrees = [build_tree(json.loads(t), None, feature_names) for t in json_trees] - self.trees = [] - for i, _ in enumerate(self.otrees): - self.trees.append(self.otrees[i]) - self.trees.append(self.itrees[i]) - self.feature_names = feature_names - - def print_tree(self): - for i, t in enumerate(self.trees): - print("tree number: ", i) - walk_tree(t) - - def invert_tree_prob(self, node): - if (len(node.children) == 0): - node.values = -node.values - return node - else: - self.invert_tree_prob(node.children[0]) - self.invert_tree_prob(node.children[1]) - return node - - def predict(self, samples, nb_classes): - # https://github.com/dmlc/xgboost/issues/1746#issuecomment-290130695 - prob = [] - nb_estimators = int(len(self.trees) / nb_classes) - for sample in np.asarray(samples): - scores = [] - for i, t in enumerate(self.trees): - s = scores_tree(t, sample) - scores.append((s)) - scores = np.asarray(scores) - class_scores = [] - if (nb_classes == 2): - - for i in range(nb_classes): - class_scores.append(math.exp(-( - scores[i::nb_classes]).sum())) # swap signs back as we had to use this trick in the contractor - s0 = class_scores[0] - s1 = class_scores[1] - v0 = 1 / (1 + s0) - v1 = 1 / (1 + s1) - class_scores[0] = v0 - class_scores[1] = v1 - else: - for i in range(0, nb_classes * nb_estimators, nb_estimators): - class_scores.append(math.exp((scores[i:i + nb_estimators]).sum())) - # for i in range(nb_classes): - # class_scores.append(math.exp((scores[i::nb_classes]).sum())) - class_scores = np.asarray(class_scores) - prob.append(class_scores / class_scores.sum()) - return np.asarray(prob).reshape((-1, nb_classes)) - - -# -# ============================================================================== -def get_xgboost_json(model, feature_names): - """ REUSED FROM SHAP - This gets a JSON dump of an XGBoost model while ensuring the feature names are their indexes. 
- """ - fnames = feature_names - model.feature_names = None - json_trees = model.get_dump(with_stats=True, dump_format="json") - model.feature_names = fnames - return json_trees diff --git a/pages/application/RandomForest/utils/xgbrf/validate.py b/pages/application/RandomForest/utils/xgbrf/validate.py deleted file mode 100644 index 024a6800f454c7bae39104a0f17781c2755ce0ea..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbrf/validate.py +++ /dev/null @@ -1,190 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## validate.py -## -## Created on: Jan 4, 2019 -## Author: Alexey Ignatiev -## E-mail: aignatiev@ciencias.ulisboa.pt -## - -# -#============================================================================== -from __future__ import print_function -import getopt -import numpy as np -import os -from pysat.formula import IDPool -from pysmt.shortcuts import Solver -from pysmt.shortcuts import And, BOOL, Implies, Not, Or, Symbol -from pysmt.shortcuts import Equals, GE, GT, LE, LT, Real, REAL -import resource -from six.moves import range -import sys - - -# -#============================================================================== -class SMTValidator(object): - """ - Validating Anchor's explanations using SMT solving. - """ - - def __init__(self, formula, feats, nof_classes, xgb): - """ - Constructor. - """ - - self.ftids = {f: i for i, f in enumerate(feats)} - self.nofcl = nof_classes - self.idmgr = IDPool() - self.optns = xgb.options - - # xgbooster will also be needed - self.xgb = xgb - - self.verbose = self.optns.verb - self.oracle = Solver(name=self.xgb.options.solver) - - self.inps = [] # input (feature value) variables - for f in self.xgb.extended_feature_names_as_array_strings: - if '_' not in f: - self.inps.append(Symbol(f, typename=REAL)) - else: - self.inps.append(Symbol(f, typename=BOOL)) - - self.outs = [] # output (class score) variables - for c in range(self.nofcl): - self.outs.append(Symbol('class{0}_score'.format(c), typename=REAL)) - - # theory - self.oracle.add_assertion(formula) - - # current selector - self.selv = None - - def prepare(self, sample, expl): - """ - Prepare the oracle for validating an explanation given a sample. 
- """ - - if self.selv: - # disable the previous assumption if any - self.oracle.add_assertion(Not(self.selv)) - - # creating a fresh selector for a new sample - sname = ','.join([str(v).strip() for v in sample]) - - # the samples should not repeat; otherwise, they will be - # inconsistent with the previously introduced selectors - assert sname not in self.idmgr.obj2id, 'this sample has been considered before (sample {0})'.format(self.idmgr.id(sname)) - self.selv = Symbol('sample{0}_selv'.format(self.idmgr.id(sname)), typename=BOOL) - - self.rhypos = [] # relaxed hypotheses - - # transformed sample - self.sample = list(self.xgb.transform(sample)[0]) - - # preparing the selectors - for i, (inp, val) in enumerate(zip(self.inps, self.sample), 1): - feat = inp.symbol_name().split('_')[0] - selv = Symbol('selv_{0}'.format(feat)) - val = float(val) - - self.rhypos.append(selv) - - # adding relaxed hypotheses to the oracle - for inp, val, sel in zip(self.inps, self.sample, self.rhypos): - if '_' not in inp.symbol_name(): - hypo = Implies(self.selv, Implies(sel, Equals(inp, Real(float(val))))) - else: - hypo = Implies(self.selv, Implies(sel, inp if val else Not(inp))) - - self.oracle.add_assertion(hypo) - - # propagating the true observation - if self.oracle.solve([self.selv] + self.rhypos): - model = self.oracle.get_model() - else: - assert 0, 'Formula is unsatisfiable under given assumptions' - - # choosing the maximum - outvals = [float(model.get_py_value(o)) for o in self.outs] - maxoval = max(zip(outvals, range(len(outvals)))) - - # correct class id (corresponds to the maximum computed) - true_output = maxoval[1] - - # forcing a misclassification, i.e. a wrong observation - disj = [] - for i in range(len(self.outs)): - if i != true_output: - disj.append(GT(self.outs[i], self.outs[true_output])) - self.oracle.add_assertion(Implies(self.selv, Or(disj))) - - # removing all hypotheses except for those in the explanation - hypos = [] - for i, hypo in enumerate(self.rhypos): - j = self.ftids[self.xgb.transform_inverse_by_index(i)[0]] - if j in expl: - hypos.append(hypo) - self.rhypos = hypos - - if self.verbose: - inpvals = self.xgb.readable_sample(sample) - - preamble = [] - for f, v in zip(self.xgb.feature_names, inpvals): - if f not in v: - preamble.append('{0} = {1}'.format(f, v)) - else: - preamble.append(v) - - print(' explanation for: "IF {0} THEN {1}"'.format(' AND '.join(preamble), self.xgb.target_name[true_output])) - - def validate(self, sample, expl): - """ - Make an effort to show that the explanation is too optimistic. 
- """ - - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - # adapt the solver to deal with the current sample - self.prepare(sample, expl) - - # if satisfiable, then there is a counterexample - if self.oracle.solve([self.selv] + self.rhypos): - model = self.oracle.get_model() - inpvals = [float(model.get_py_value(i)) for i in self.inps] - outvals = [float(model.get_py_value(o)) for o in self.outs] - maxoval = max(zip(outvals, range(len(outvals)))) - - inpvals = self.xgb.transform_inverse(np.array(inpvals))[0] - self.coex = tuple([inpvals, maxoval[1]]) - inpvals = self.xgb.readable_sample(inpvals) - - if self.verbose: - preamble = [] - for f, v in zip(self.xgb.feature_names, inpvals): - if f not in v: - preamble.append('{0} = {1}'.format(f, v)) - else: - preamble.append(v) - - print(' explanation is incorrect') - print(' counterexample: "IF {0} THEN {1}"'.format(' AND '.join(preamble), self.xgb.target_name[maxoval[1]])) - else: - self.coex = None - - if self.verbose: - print(' explanation is correct') - - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.time - - if self.verbose: - print(' time: {0:.2f}'.format(self.time)) - - return self.coex - diff --git a/pages/application/RandomForest/utils/xgbrf/xgb_rf.py b/pages/application/RandomForest/utils/xgbrf/xgb_rf.py deleted file mode 100644 index 9c921742d744a8a0e819598cd129ff94ed0e1c01..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbrf/xgb_rf.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/us/bin/env python -#-*- coding:utf-8 -*- -## -## xgb_rf.py -## -## Created on: May 23, 2020 -## Author: Yacine Izza -## E-mail: yacine.izza@univ-toulouse.fr -## - -# -#============================================================================== -from __future__ import print_function - -import random - -from .validate import SMTValidator -from .pi_checker import SMTChecker -from .encode import SMTEncoder -from .explain import SMTExplainer -import numpy as np -import os -import resource -from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score -import sklearn -# print('The scikit-learn version is {}.'.format(sklearn.__version__)) - -from sklearn.preprocessing import OneHotEncoder -import sys -from six.moves import range -from .tree import TreeEnsemble -import xgboost as xgb -from xgboost import XGBRFClassifier, Booster, plot_tree -import matplotlib.pyplot as plt -import pickle - - -# -#============================================================================== -class XGBRandomForest(object): - """ - The main class to train/encode/explain Random Forest models. - """ - - def __init__(self, options, from_model=None): - """ - Constructor. - """ - self.extended_feature_names = None - self.extended_feature_names_as_array_strings = None - np.random.seed(random.randint(1, 100)) - - self.model = XGBRFClassifier() - self.model = from_model - - self.feature_names = options["feature_names"] - self.num_class = options["n_classes"] - self.nb_features = options["n_features"] - - self.mapping_features() - - def encode(self, test_on=None): - """ - Encode a random forest trained previously. 
- """ - encoder = SMTEncoder(self.model, self.feature_names, self.num_class, self) - self.enc, self.intvs, self.imaps, self.ivars = encoder.encode() - - if test_on: - encoder.test_sample(np.array(test_on)) - - def explain_sample(self, sample, smallest, solver, use_lime=None, use_anchor=None, use_shap=None, - expl_ext=None, prefer_ext=False, nof_feats=5): - """ - Explain a prediction made for a given sample with a previously - trained tree ensemble. - """ - - if use_lime: - expl = use_lime(self, sample=sample, nb_samples=5, - nb_features_in_exp=nof_feats) - elif use_anchor: - expl = use_anchor(self, sample=sample, nb_samples=5, - nb_features_in_exp=nof_feats, threshold=0.95) - elif use_shap: - expl = use_shap(self, sample=sample, nb_features_in_exp=nof_feats) - else: - self.x = SMTExplainer(self.enc, self.intvs, self.imaps, - self.ivars, self.feature_names, self.num_class, - solver, self) - expl = self.x.explain(np.array(sample), smallest, expl_ext, prefer_ext) - - return expl - - def explain(self, samples, smallest, solver, use_lime=None, use_anchor=None, use_shap=None, - expl_ext=None, prefer_ext=False, nof_feats=5): - explanations = [] - for sample in samples: - explanations.append(self.explain_sample(sample, smallest, solver, use_lime, use_anchor, use_shap, - expl_ext, prefer_ext, nof_feats)) - - return explanations - - def validate(self, sample, expl): - """ - Make an attempt to show that a given explanation is optimistic. - """ - - # there must exist an encoding - if 'enc' not in dir(self): - encoder = SMTEncoder(self.model, self.feature_names, self.num_class, - self) - self.enc, _, _, _ = encoder.encode() - - if 'v' not in dir(self): - self.v = SMTValidator(self.enc, self.feature_names, self.num_class, - self) - - # try to compute a counterexample - return self.v.validate(np.array(sample), expl) - - def mapping_features(self): - self.extended_feature_names = {} - self.extended_feature_names_as_array_strings = [] - counter = 0 - for i in range(self.nb_features): - self.extended_feature_names.update({counter: (self.feature_names[i], None)}) - self.extended_feature_names_as_array_strings.append("f{}".format(i)) - counter = counter + 1 - diff --git a/pages/application/RandomForest/utils/xpAnchor.py b/pages/application/RandomForest/utils/xpAnchor.py deleted file mode 100755 index 86958e2bfd42e79e43276b13124c9ffa0f3deed8..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xpAnchor.py +++ /dev/null @@ -1,164 +0,0 @@ -#!/usr/bin/env python3 -#-*- coding:utf-8 -*- -## -## lime_wrap.py (reuses parts of the code of SHAP) -## - - -# -#============================================================================== -from __future__ import print_function -#from data import Data -from pages.application.RandomForest.utils.xrf import Dataset -import os -import sys -import pickle - - -import json -import numpy as np -import math -from anchor import utils -from anchor import anchor_tabular -import resource - - -# -#============================================================================== -def anchor_call(model, data, sample, threshold=0.95, verbose=0): - - - classifier_fn = lambda x: model.forest.predict(data.transform(x)).astype(float) - X_train, _, _, _ = data.train_test_split() - - timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - explainer = anchor_tabular.AnchorTabularExplainer( - class_names=data.target_name, - feature_names=data.feature_names, - train_data=data.X) - 
#print(explainer.d_train) - - if (sample is not None): - try: - feat_sample = np.asarray(sample, dtype=np.float32) - except: - print("Cannot parse input sample:", sample) - exit() - if verbose: - print("\n\n\nStarting Anchor explainer... \nConsidering a sample with features:", feat_sample) - if not (len(feat_sample) == len(X_train[0])): - print("Unmatched features are not supported: The number of features in a sample {} is not equal to the number of features in this benchmark {}".format(len(feat_sample), len(X_train[0]))) - exit() - - # compute boost predictions - feat_sample_exp = np.expand_dims(feat_sample, axis=0) - feat_sample_exp = data.transform(feat_sample_exp) - ##y_pred = model.forest.predict(feat_sample_exp)[0] - y_pred_prob = model.forest.predict_proba(feat_sample_exp)[0] - y_pred = np.argmax(y_pred_prob) - - - exp = explainer.explain_instance(feat_sample, - classifier_fn, - threshold=threshold) - if verbose: - print('Anchor: %s' % (' AND '.join(exp.names()))) - print('Precision: %.2f' % exp.precision()) - print('Coverage: %.2f' % exp.coverage()) - - - timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer - if verbose: - print(' time: {0:.2f}'.format(timer)) - - expl = [] - return sorted(expl), timer - - -def pickle_load_file(filename): - try: - f = open(filename, "rb") - data = pickle.load(f) - f.close() - return data - except: - print("Cannot load from file", filename) - exit() - - -# -#============================================================================== -if __name__ == '__main__': - # parsing command-line options - options = Options(sys.argv) - - # making output unbuffered - if sys.version_info.major == 2: - sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) - - # showing head - #show_info() - print('Starting LIME explainer...') - - - if options.files: - cls = None - - print("loading data ...") - data = Dataset(filename=options.files[0], mapfile=options.mapfile, - separator=options.separator, use_categorical = False) - - - if options.explain: - mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \ - resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss - - if not cls: - print("loading model ...") - cls = pickle_load_file(options.files[1]) - cls.print_accuracy(data) # print test accuray - - samps_file = options.explain.strip() - print(samps_file) - with open(samps_file, 'r') as fp: - lines = fp.readlines() - - # timers - atimes = [] - tested = set() - - for i, s in enumerate(lines): - sample = [float(v.strip()) for v in s.split(',')] - - if tuple(sample) in tested: - continue - - #print("inst#{0}".format(i+1)) - - tested.add(tuple(sample)) - #print('sample {0}: {1}'.format(i, ','.join(s.strip().split(',')))) - - expl, time = anchor_call(cls, data, sample, verbose=options.verb) # call lime - - atimes.append(time) - - - #if i == 100: - # break - - mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \ - resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss - mem - - - # reporting the time spent - print('') - print('tot time: {0:.2f}'.format(sum(atimes))) - print('max time: {0:.2f}'.format(max(atimes))) - print('min time: {0:.2f}'.format(min(atimes))) - print('avg time: {0:.2f}'.format(sum(atimes) / len(atimes))) - print('') - print("c mem used: {0:.2f} Mb".format(mem/(1024*1024))) - \ No newline at end of file diff --git a/pages/application/RandomForest/utils/xpLime.py b/pages/application/RandomForest/utils/xpLime.py deleted file mode 100755 index 
9945bf7943e755633b688ff183efadea1d7a641a..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xpLime.py +++ /dev/null @@ -1,227 +0,0 @@ -#!/usr/bin/env python3 -#-*- coding:utf-8 -*- -## -## lime_wrap.py (reuses parts of the code of SHAP) -## - - -# -#============================================================================== -from __future__ import print_function -#from data import Data -from pages.application.RandomForest.utils.xrf import Dataset -import os -import sys -import pickle - - -import json -import numpy as np -import math -import lime -import lime.lime_tabular -import resource - - -# -#============================================================================== -def lime_call(model, data, sample, nb_samples = 50, feats='all', - nb_features_in_exp=10, verbose=0): - - # we need a way to say that features are categorical ? - # we do not have this informations. - predict_fn_rf = lambda x: model.forest.predict_proba(data.transform(x)).astype(float) - X_train, _, _, _ = data.train_test_split() - - timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - explainer = lime.lime_tabular.LimeTabularExplainer( - X_train, - feature_names=data.feature_names, - categorical_features= None, - class_names=data.target_name, - discretize_continuous=True, - ) - - f2imap = {} - for i, f in enumerate(data.feature_names): - f2imap[f.strip()] = i - - if (sample is not None): - try: - feat_sample = np.asarray(sample, dtype=np.float32) - except: - print("Cannot parse input sample:", sample) - exit() - if verbose: - print("\n\n\nStarting LIME explainer... \nConsidering a sample with features:", feat_sample) - if not (len(feat_sample) == len(X_train[0])): - print("Unmatched features are not supported: The number of features in a sample {} is not equal to the number of features in this benchmark {}".format(len(feat_sample), len(X_train[0]))) - exit() - - # compute boost predictions - feat_sample_exp = np.expand_dims(feat_sample, axis=0) - feat_sample_exp = data.transform(feat_sample_exp) - #y_pred = model.forest.predict(feat_sample_exp)[0] - y_pred_prob = model.forest.predict_proba(feat_sample_exp)[0] - y_pred = np.argmax(y_pred_prob) - - - exp = explainer.explain_instance(feat_sample, - predict_fn_rf, - num_features = nb_features_in_exp, - top_labels = 1)#, - #labels = list(range(xgb.num_class))) - - expl = [] - - # choose which features in the explanation to focus on - if feats in ('p', 'pos', '+'): - feats = 1 - elif feats in ('n', 'neg', '-'): - feats = -1 - else: - feats = 0 - - for i in range(data.num_class): - if (i != y_pred): - continue - if verbose: - print("\t \t Explanations for the winner class", i, " ( confidence = ", y_pred_prob[i], ")") - print("\t \t Features in explanations: ", exp.as_list(label=i)) - - s_human_readable = "" - for k, v in enumerate(exp.as_list(label=i)): - if (feats == 1 and v[1] < 0) or (feats == -1 and v[1] >= 0): - continue - - if not (('<' in v[0]) or ('>' in v[0])): - a = v[0].split('=') - f = a[0].strip() - l = a[1].strip() - u = l - -# if (xgb.use_categorical): -# fid = f2imap[f] -# fvid = int(a[1]) -# s_human_readable = s_human_readable + "\t \t id = {}, name = {}, score = {}\n".format(fid, f, str(v[1])) - - - else: - a = v[0].split('<') - - if len(a) == 1: - a = v[0].split('>') - - if len(a) == 2: - f = a[0].strip() - - if '>' in v[0]: - l, u = float(a[1].strip(' =')), None - else: - l, u = None, float(a[1].strip(' =')) - else: - l = float(a[0].strip()) - f = 
a[1].strip(' =') - u = float(a[2].strip(' =')) - - # expl.append(tuple([f2imap[f], l, u, v[1] >= 0])) - expl.append(f2imap[f]) - -# if (xgb.use_categorical): -# if (len(s_human_readable) > 0): -# print("\t \t Features in explanations (with provided categorical labels): \n", s_human_readable) - timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer - if verbose: - print(' time: {0:.2f}'.format(timer)) - - - return sorted(expl), timer - - -def pickle_load_file(filename): - try: - f = open(filename, "rb") - data = pickle.load(f) - f.close() - return data - except: - print("Cannot load from file", filename) - exit() - - -# -#============================================================================== -if __name__ == '__main__': - # parsing command-line options - options = Options(sys.argv) - - # making output unbuffered - if sys.version_info.major == 2: - sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) - - # showing head - #show_info() - print('Starting LIME explainer...') - - - if options.files: - cls = None - - print("loading data ...") - data = Dataset(filename=options.files[0], mapfile=options.mapfile, - separator=options.separator, use_categorical = False) - - - if options.explain: - mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \ - resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss - - if not cls: - print("loading model ...") - cls = pickle_load_file(options.files[1]) - cls.print_accuracy(data) # print test accuray - - samps_file = options.explain.strip() - print(samps_file) - with open(samps_file, 'r') as fp: - lines = fp.readlines() - - # timers - atimes = [] - tested = set() - - for i, s in enumerate(lines): - sample = [float(v.strip()) for v in s.split(',')] - - if tuple(sample) in tested: - continue - - #print("inst#{0}".format(i+1)) - - tested.add(tuple(sample)) - #print('sample {0}: {1}'.format(i, ','.join(s.strip().split(',')))) - - expl, time = lime_call(cls, data, sample, verbose=options.verb) # call lime - - atimes.append(time) - - - #if i == 3: - # break - - mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \ - resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss - mem - - - # reporting the time spent - print('') - print('tot time: {0:.2f}'.format(sum(atimes))) - print('max time: {0:.2f}'.format(max(atimes))) - print('min time: {0:.2f}'.format(min(atimes))) - print('avg time: {0:.2f}'.format(sum(atimes) / len(atimes))) - print('') - print("c mem used: {0:.2f} Mb".format(mem/(1024*1024))) - \ No newline at end of file diff --git a/pages/application/RandomForest/utils/xprf.py b/pages/application/RandomForest/utils/xprf.py deleted file mode 100755 index c22b6185f32da5e8f09777ec8689b4dcd22854d9..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xprf.py +++ /dev/null @@ -1,272 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding:utf-8 -*- -## -## xrf.py -## -## Created on: Oct 08, 2020 -## Author: Yacine Izza -## E-mail: yacine.izza@univ-toulouse.fr -## - -# -# ============================================================================== -from __future__ import print_function -from pages.application.RandomForest.utils.data import Data -import os -import sys -import pickle -import resource - -from pages.application.RandomForest.utils.xgbooster import preprocess_dataset - -from pages.application.RandomForest.utils.xrf import XRF, RF2001, Dataset, Checker -import numpy as np - -################## -from pages.application.RandomForest.utils.xpLime import 
lime_call -import math -import lime -import lime.lime_tabular -### -from pages.application.RandomForest.utils.xpAnchor import anchor_call -# from anchor import utils -from anchor import anchor_tabular - - -################ - -# -# ============================================================================== -def show_info(): - """ - Print info message. - """ - print("c XRF: eXplaining Random Forest.") - print('c') - - -# -# ============================================================================== -def pickle_save_file(filename, data): - try: - f = open(filename, "wb") - pickle.dump(data, f) - f.close() - except: - print("Cannot save to file", filename) - exit() - - -def pickle_load_file(filename): - try: - f = open(filename, "rb") - data = pickle.load(f) - f.close() - return data - except: - print("Cannot load from file", filename) - exit() - - # - - -# ============================================================================== -if __name__ == '__main__': - # parsing command-line options - options = Options(sys.argv) - - # making output unbuffered - if sys.version_info.major == 2: - sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) - - # showing head - show_info() - - if (options.preprocess_categorical): - preprocess_dataset(options.files[0], options.preprocess_categorical_files, options.use_categorical) - exit() - - if options.files: - cls = None - xrf = None - - print("loading data ...") - data = Dataset(filename=options.files[0], mapfile=options.mapfile, - separator=options.separator, use_categorical=options.use_categorical) - - if options.train: - ''' - data = Dataset(filename=options.files[0], mapfile=options.mapfile, - separator=options.separator, - use_categorical = options.use_categorical) - ''' - - cls = RF2001(options) - train_accuracy, test_accuracy = cls.train(data) - - if options.verb == 1: - print("----------------------") - print("Train accuracy: {0:.2f}".format(100. * train_accuracy)) - print("Test accuracy: {0:.2f}".format(100. 
* test_accuracy)) - print("----------------------") - - xrf = XRF(options, cls, data) - # xrf.test_tree_ensemble() - - bench_name = os.path.splitext(os.path.basename(options.files[0]))[0] - bench_dir_name = options.output + "/RF/" + bench_name - try: - os.stat(bench_dir_name) - except: - os.mkdir(bench_dir_name) - - basename = (os.path.join(bench_dir_name, bench_name + - "_nbestim_" + str(options.n_estimators) + - "_maxdepth_" + str(options.maxdepth))) - - modfile = basename + '.mod.pkl' - print("saving model to ", modfile) - pickle_save_file(modfile, cls) - - # data_suffix = '.splitdata.pkl' - # filename_data = basename + data_suffix - # print("saving data to ", filename_data) - # pickle_save_file(filename_data, data) - - # read a sample from options.explain - # if options.explain: - # options.explain = [float(v.strip()) for v in options.explain.split(',')] - - ''' - if options.encode: - # encode it and save the encoding to another file - #xrf.encode(test_on=options.explain) - xrf.encode(options.explain) - ''' - if options.explain: - mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \ - resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss - - if not xrf: - print("loading model ...") - cls = pickle_load_file(options.files[1]) - xrf = XRF(options, cls, data) - - # expl = xrf.explain(options.explain) - - # expl_checker = Checker(xrf.f, data.num_class, data.extended_feature_names_as_array_strings) - - cls.print_accuracy(data) # print test accuracy of the RF model - - samps_file = options.explain.strip() - print(samps_file) - with open(samps_file, 'r') as fp: - lines = fp.readlines() - - # timers - atimes = [] - lengths = [] - tested = set() - mSAT, mUNSAT = 0.0, 0.0 - stimes = [] - utimes = [] - nSatCalls = [] - nUnsCalls = [] - - ltimes = [] - ctimes = [] - wins = 0 - - for i, s in enumerate(lines): - sample = [float(v.strip()) for v in s.split(',')] - - if tuple(sample) in tested: - continue - - # print("inst#{0}".format(i+1)) - - tested.add(tuple(sample)) - # print('sample {0}: {1}'.format(i, ','.join(s.strip().split(',')))) - - xrf.encode(sample) - expl = xrf.explain(sample) - atimes.append(xrf.x.time) - lengths.append(len(expl)) - - nvars = xrf.enc.cnf.nv - nclauses = len(xrf.enc.cnf.clauses) - - # mSAT = max(xrf.x.stimes+[mSAT]) - # mUNSAT = max(xrf.x.utimes+[mUNSAT]) - if len(xrf.x.stimes): - stimes.append(max(xrf.x.stimes)) - if len(xrf.x.utimes): - utimes.append(max(xrf.x.utimes)) - nSatCalls.append(xrf.x.nsat) - nUnsCalls.append(xrf.x.nunsat) - - # inst = data.transform(np.array(sample))[0] - # expl_checker.check(np.array(inst), expl) - #####check_expl(np.array(inst), expl, xrf.enc.forest, xrf.enc.intvs) - - del xrf.enc - del xrf.x - - #####################LIME########### - ''' - _, ltime = lime_call(cls, data, sample, verbose=options.verb) # call lime - ltimes.append(ltime) - #wins += 1 - if atimes[-1] < ltime: - wins += 1 - ''' - - _, ctime = anchor_call(cls, data, sample, verbose=options.verb) # call lime - ctimes.append(ctime) - if atimes[-1] < ctime: - wins += 1 - - # if i == 1: - # break - - mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \ - resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss - mem - - # reporting the time spent - print('') - print('tot time: {0:.2f}'.format(sum(atimes))) - print('max time: {0:.2f}'.format(max(atimes))) - print('min time: {0:.2f}'.format(min(atimes))) - print('avg time: {0:.2f}'.format(sum(atimes) / len(atimes))) - print('') - #### - print('avg length: {0:.0f}'.format(round(sum(lengths) / len(lengths)) * 100 / 
len(sample))) - # print('max SAT: {0:.2f}'.format(mSAT)) - # print('max UNSAT: {0:.2f}'.format(mUNSAT)) - print('max SAT: {0:.2f}'.format(max(stimes))) - print('max UNSAT: {0:.2f}'.format(max(utimes))) - print('avg #SAT: {0:.0f}'.format(sum(nSatCalls) / len(nSatCalls))) - print('avg #UNSAT: {0:.0f}'.format(sum(nUnsCalls) / len(nUnsCalls))) - print('') - # reporting nof_vars and nof_clauses - print('c nof vars: {0}'.format(nvars)) - print('c nof clauses: {0}'.format(nclauses)) - # - print('c nof instances: {0}'.format(len(tested))) - print("c mem used: {0:.2f} Mb".format(mem / (1024 * 1024))) - - # LIME runtimes - ''' - print('') - print('min time for Lime: {0:.2f}'.format(min(ltimes))) - print('avg time for Lime: {0:.2f}'.format(sum(ltimes) / len(ltimes))) - print('#wins {0} out of {1}'.format(wins, len(tested)) ) - ''' - - # Anchor runtimes - print('') - print('tot time for Anchor: {0:.2f}'.format(sum(ctimes))) - print('max time for Anchor: {0:.2f}'.format(max(ctimes))) - print('min time for Anchor: {0:.2f}'.format(min(ctimes))) - print('avg time for Anchor: {0:.2f}'.format(sum(ctimes) / len(ctimes))) - print('#wins {0} out of {1}'.format(wins, len(tested))) diff --git a/pages/application/RandomForest/utils/xrf/__init__.py b/pages/application/RandomForest/utils/xrf/__init__.py index 7ae37dec55e919f83a87d18f7c9271aa7baab3c2..6ee5be877c0d58ab744fc6afd13695558a9245df 100644 --- a/pages/application/RandomForest/utils/xrf/__init__.py +++ b/pages/application/RandomForest/utils/xrf/__init__.py @@ -1,5 +1,4 @@ #from .encode import * -#from .tree import * +from .tree import * from .rndmforest import * -#from .checker import check_expl -from .checker import Checker \ No newline at end of file +from .xforest import * diff --git a/pages/application/RandomForest/utils/xrf/checker.py b/pages/application/RandomForest/utils/xrf/checker.py deleted file mode 100644 index 314e52e99061467c78f69974d80d4522b0e7ec48..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xrf/checker.py +++ /dev/null @@ -1,339 +0,0 @@ -# -# ============================================================================== -import numpy as np -import math - -from .tree import Forest, dt_node -import six -from pysat.formula import CNF, IDPool -from pysat.solvers import Solver -from pysat.card import CardEnc, EncType - - -# from itertools import combinations - -# -# ============================================================================== -def predict_tree(node, sample): - if (len(node.children) == 0): - # leaf - return node.values - else: - feat = node.feature - sample_value = sample[feat] - if sample_value is None: - return predict_tree(node.children[0], sample) - elif (sample_value < node.threshold): - return predict_tree(node.children[0], sample) - else: - return predict_tree(node.children[1], sample) - - -# -# ============================================================================== -class Checker: - - def __init__(self, forest, num_class, feature_names): - self.forest = forest - self.num_class = num_class - self.feature_names = feature_names - self.cnf = None - self.vpool = IDPool() - - self.intvs = None - self.intvs = {'{0}'.format(f): set([]) for f in feature_names if '_' not in f} - for tree in self.forest.trees: - self.traverse_intervals(tree) - self.intvs = {f: sorted(self.intvs[f]) + - ([math.inf] if len(self.intvs[f]) else []) - for f in six.iterkeys(self.intvs)} - self.imaps, self.ivars = {}, {} - self.thvars = {} - for feat, intvs in six.iteritems(self.intvs): - self.imaps[feat] = 
{} - self.ivars[feat] = [] - self.thvars[feat] = [] - for i, ub in enumerate(intvs): - self.imaps[feat][ub] = i - ivar = self.newVar('{0}_intv{1}'.format(feat, i)) - self.ivars[feat].append(ivar) - if ub != math.inf: - thvar = self.newVar('{0}_th{1}'.format(feat, i)) - self.thvars[feat].append(thvar) - - self.cnf = CNF() - #### - cvars = [self.newVar('class{0}'.format(i)) for i in range(num_class)] - num_tree = len(self.forest.trees) - ctvars = [[] for t in range(num_tree)] - for k in range(num_tree): - for j in range(self.num_class): - var = self.newVar('class{0}_tr{1}'.format(j, k)) - ctvars[k].append(var) - ##### - for k, tree in enumerate(self.forest.trees): - self.traverse(tree, k, []) - card = CardEnc.atmost(lits=ctvars[k], vpool=self.vpool, encoding=EncType.cardnetwrk) - self.cnf.extend(card.clauses) - ###### - for f, intvs in six.iteritems(self.ivars): - if not len(intvs): - continue - self.cnf.append(intvs) - card = CardEnc.atmost(lits=intvs, vpool=self.vpool, encoding=EncType.cardnetwrk) - self.cnf.extend(card.clauses) - for f, threshold in six.iteritems(self.thvars): - for j, thvar in enumerate(threshold): - d = j + 1 - pos, neg = self.ivars[f][d:], self.ivars[f][:d] - if j == 0: - self.cnf.append([thvar, neg[-1]]) - self.cnf.append([-thvar, -neg[-1]]) - else: - self.cnf.append([thvar, neg[-1], -threshold[j - 1]]) - self.cnf.append([-thvar, threshold[j - 1]]) - self.cnf.append([-thvar, -neg[-1]]) - - if j == len(threshold) - 1: - self.cnf.append([-thvar, pos[0]]) - self.cnf.append([thvar, -pos[0]]) - else: - self.cnf.append([-thvar, pos[0], threshold[j + 1]]) - self.cnf.append([thvar, -pos[0]]) - self.cnf.append([thvar, -threshold[j + 1]]) - - def newVar(self, name): - if name in self.vpool.obj2id: # var has been already created - return self.vpool.obj2id[name] - var = self.vpool.id('{0}'.format(name)) - return var - - def traverse(self, tree, k, clause): - if tree.children: - f = tree.name - v = tree.threshold - pos = neg = [] - if f in self.intvs: - d = self.imaps[f][v] - pos, neg = self.thvars[f][d], -self.thvars[f][d] - else: - var = self.newVar(tree.name) - pos, neg = var, -var - self.traverse(tree.children[0], k, clause + [-neg]) - self.traverse(tree.children[1], k, clause + [-pos]) - else: # leaf node - cvar = self.newVar('class{0}_tr{1}'.format(tree.values, k)) - self.cnf.append(clause + [cvar]) - # self.printLits(clause + [cvar]) - - def traverse_intervals(self, tree): - if tree.children: - f = tree.name - v = tree.threshold - if f in self.intvs: - self.intvs[f].add(v) - self.traverse_intervals(tree.children[0]) - self.traverse_intervals(tree.children[1]) - - def check(self, sample, expl): - print("check PI-expl") - slv = Solver(name="glucose3") - slv.append_formula(self.cnf) - - pred = self.forest.predict_inst(sample) - num_tree = len(self.forest.trees) - ##### - cvars = [self.newVar('class{0}'.format(i)) for i in range(self.num_class)] - ctvars = [[] for t in range(num_tree)] - for k in range(num_tree): - for j in range(self.num_class): - var = self.newVar('class{0}_tr{1}'.format(j, k)) - ctvars[k].append(var) - # - rhs = num_tree - 1 - for j in range(pred): - lhs = [ctvars[k][j] for k in range(num_tree)] + [- ctvars[k][pred] for k in range(num_tree)] - atms = CardEnc.atmost(lits=lhs, bound=rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) - # add maj class selector to activate/deactivate eq atmsk - # self.cnf.extend([cl + [-cvars[pred]] for cl in atms]) - slv.append_formula([cl + [-cvars[pred]] for cl in atms]) - rhs = num_tree - for j in range(pred + 1, 
self.num_class): - lhs = [ctvars[k][j] for k in range(num_tree)] + [- ctvars[k][pred] for k in range(num_tree)] - atms = CardEnc.atmost(lits=lhs, bound=rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) - # self.cnf.extend([cl + [-cvars[pred]] for cl in atms]) - slv.append_formula([cl + [-cvars[pred]] for cl in atms]) - ######## - ######## - rhs = num_tree - for j in range(pred): - lhs = [- ctvars[k][j] for k in range(num_tree)] + [ctvars[k][pred] for k in range(num_tree)] - atms = CardEnc.atmost(lits=lhs, bound=rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) - # self.cnf.extend([cl+[-cvars[j]] for cl in atms]) - slv.append_formula([cl + [-cvars[j]] for cl in atms]) - rhs = num_tree - 1 - for j in range(pred + 1, self.num_class): - lhs = [- ctvars[k][j] for k in range(num_tree)] + [ctvars[k][pred] for k in range(num_tree)] - atms = CardEnc.atmost(lits=lhs, bound=rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) - # self.cnf.extend([cl+[-cvars[j]] for cl in atms]) - slv.append_formula([cl + [-cvars[j]] for cl in atms]) - ############ - # self.cnf.append(cvars) - card = CardEnc.atmost(lits=cvars, vpool=self.vpool, encoding=EncType.cardnetwrk) # AtMostOne constraint - # self.cnf.extend(card.clauses) - slv.add_clause(cvars) - slv.append_formula(card.clauses) - - assums = [] # var selectors to be used as assumptions - # sel2fid = {} # selectors to original feature ids - # sel2vid = {} # selectors to categorical feature ids - # sel2v = {} # selectors to (categorical/interval) values - sel_expl = [] - - # inps = ['f{0}'.format(f) for f in range(len(sample))] # works only with pure continuous feats - inps = self.feature_names - - for i, (inp, val) in enumerate(zip(inps, sample)): - if len(self.intvs[inp]): - v = next((intv for intv in self.intvs[inp] if intv > val), None) - assert (v is not None) - selv = self.newVar('selv_{0}'.format(inp)) - assums.append(selv) - ## - if i in expl: - sel_expl.append(selv) - # print('{0}={1}'.format('selv_{0}'.format(inp), val)) - ## - for j, p in enumerate(self.ivars[inp]): - cl = [-selv] - if j == self.imaps[inp][v]: - cl += [p] - # self.sel2v[selv] = p - else: - cl += [-p] - # self.cnf.append(cl) - slv.add_clause(cl) - assums = sorted(set(assums)) - # print(sel_expl, assums) - sel_pred = cvars[pred] - - # slv = Solver(name="glucose3") - # slv.append_formula(self.cnf) - - assert (slv.solve(assumptions=sel_expl + [sel_pred])), '{0} is not an explanation.'.format(expl) - print('expl:{0} is valid'.format(expl)) - - for i, p in enumerate(sel_expl): - # print(i,p) - to_test = sel_expl[:i] + sel_expl[(i + 1):] + [-sel_pred] - print(to_test) - assert slv.solve(assumptions=to_test), '{0} is not minimal explanation.'.format(expl) - - # delete sat solver - slv.delete() - slv = None - - print('expl:{0} is minimal'.format(expl)) - print() - - -def check_expl(sample, expl, forest, intvs): - print("check PI-expl") - - pred = forest.predict_inst(sample) - - sample_expl = [None] * len(sample) - for p in expl: - sample_expl[p] = sample[p] - - # initializing the intervals - # intvs = {'f{0}'.format(f): set([]) for f in range(len(sample))} - # for tree in forest.trees: - # traverse_intervals(tree) - - # first, check if expl is an explanation - scores = [predict_tree(dt, sample_expl) for dt in forest.trees] - scores = np.asarray(scores) - maj = np.argmax(np.bincount(scores)) - - assert maj == pred, '{0} is not an explanation.'.format(expl) - - print('expl:{0} is valid'.format(expl)) - print("pred = ", pred) - - sample_expl = sample - - feats = ['f{0}'.format(f) for f in 
expl] - univ = [(i, f) for i, f in enumerate(intvs) if (len(intvs[f]) and (f not in feats))] - - # Now, check if expl is a minimal - for p, f in zip(expl, feats): - print("{0}={1}".format(f, sample_expl[p])) - print([-math.inf] + intvs[f]) - assert (len(intvs[f])) - - # calculate possible values for f - possible_val = [] - d = next((i for i, v in enumerate(intvs[f]) if v > sample_expl[p]), None) - assert (d is not None) - print("d=", d) - - if d: - # possible_val.append(intvs[f][0] - 1) - possible_val.append(-math.inf) - print(intvs[f][:d - 1]) - for i, v in enumerate(intvs[f][:d - 1]): - possible_val.append((v + intvs[f][i + 1]) * 0.5) - - for i, v in enumerate(intvs[f][d + 1:]): - # print('{0} + {1}'.format(v , intvs[f][d+i])) - possible_val.append((v + intvs[f][d + i]) * 0.5) - # if v == math.inf: - # assert(v == intvs[f][-1]) - # possible_val.append(v + 1) - # else: - # possible_val.append((v + intvs[f][i - 1]) * 0.5) - - ## print("{0} => {1} | {2} , {3}".format(f,sample_expl[p], [-math.inf]+intvs[f], possible_val)) - for v in possible_val: - sample_expl[p] = v - for uf in univ: - for x in ([-math.inf] + intvs[uf[1]]): - print('{0}={1}'.format(uf[1], x)) - sample_expl[uf[0]] = x - scores = [predict_tree(dt, sample_expl) for dt in forest.trees] - scores = np.asarray(scores) - maj = np.argmax(np.bincount(scores)) - # print("maj: {0} | {1}={2}".format( maj, f, v)) - if maj != pred: - break - sample_expl[uf[0]] = sample[p] - - print("maj: {0} | {1}={2}".format(maj, f, v)) - - else: - assert False, '{0} is not minimal explanation.'.format(expl) - - sample_expl[p] = sample[p] - - print('expl:{0} is minimal'.format(expl)) - print() - - return True - - -''' -def traverse_intervals(tree, intvs): - if tree.children: - f = tree.name - v = tree.threshold - if f in self.intvs: - intvs[p].add(v) - - l_intvs = traverse_intervals(tree.children[0]) - r_intvs = traverse_intervals(tree.children[1]) - return {**l_intvs, **r_intvs} - - else: - return intvs -''' diff --git a/pages/application/RandomForest/utils/xrf/rndmforest.py b/pages/application/RandomForest/utils/xrf/rndmforest.py index 187dd280752231383f3c74ab70af9b2d6cf74019..62dd80f373fc971f0414fbe825c0058d6f6149c2 100644 --- a/pages/application/RandomForest/utils/xrf/rndmforest.py +++ b/pages/application/RandomForest/utils/xrf/rndmforest.py @@ -9,129 +9,29 @@ import os import resource import collections +from itertools import combinations from six.moves import range import six import math -from pages.application.RandomForest.utils.data import Data -from .tree import Forest, predict_tree -# from .encode import SATEncoder -from pysat.formula import CNF, IDPool -from pysat.solvers import Solver -from pysat.card import CardEnc, EncType -from itertools import combinations - -# -# ============================================================================== -class Dataset(Data): - """ - Class for representing dataset (transactions). 
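# ---------------------------------------------------------------------------
# Editor's note: the validity half of check_expl() above boils down to a
# majority vote over per-tree predictions. The vote in isolation (the five
# toy per-tree scores below are made up):
import numpy as np

scores = np.asarray([0, 1, 1, 2, 1])  # one predicted class per tree
maj = np.argmax(np.bincount(scores))  # np.argmax breaks ties toward lower ids
assert maj == 1  # class 1 wins with 3 votes out of 5
# ---------------------------------------------------------------------------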
- """ - - def __init__(self, file=None, mapfile=None, separator=',', use_categorical=False): - super().__init__(file, mapfile, separator, use_categorical) - - # split data into X and y - self.feature_names = self.names[:-1] - self.nb_features = len(self.feature_names) - self.use_categorical = use_categorical - - samples = np.asarray(self.samps) - if not all(c.isnumeric() for c in samples[:, -1]): - le = LabelEncoder() - le.fit(samples[:, -1]) - samples[:, -1] = le.transform(samples[:, -1]) - # self.class_names = le.classes_ - - samples = np.asarray(samples, dtype=np.float32) - self.X = samples[:, 0: self.nb_features] - self.y = samples[:, self.nb_features] - self.num_class = len(set(self.y)) - self.target_name = list(range(self.num_class)) - - # check if we have info about categorical features - if (self.use_categorical): - self.target_name = self.class_names - self.binarizer = {} - for i in self.categorical_features: - self.binarizer.update({i: OneHotEncoder(categories='auto', sparse=False)}) # , - self.binarizer[i].fit(self.X[:, [i]]) - else: - self.categorical_features = [] - self.categorical_names = [] - self.binarizer = [] - # feat map - self.mapping_features() - - def mapping_features(self): - self.extended_feature_names = {} - self.extended_feature_names_as_array_strings = [] - counter = 0 - if (self.use_categorical): - for i in range(self.nb_features): - if (i in self.categorical_features): - for j, _ in enumerate(self.binarizer[i].categories_[0]): - self.extended_feature_names.update({counter: (self.feature_names[i], j)}) - self.extended_feature_names_as_array_strings.append( - "f{}_{}".format(i, j)) # str(self.feature_names[i]), j)) - counter = counter + 1 - else: - self.extended_feature_names.update({counter: (self.feature_names[i], None)}) - self.extended_feature_names_as_array_strings.append("f{}".format(i)) # (self.feature_names[i]) - counter = counter + 1 - else: - for i in range(self.nb_features): - self.extended_feature_names.update({counter: (self.feature_names[i], None)}) - self.extended_feature_names_as_array_strings.append("f{}".format(i)) # (self.feature_names[i]) - counter = counter + 1 - - def readable_sample(self, x): - readable_x = [] - for i, v in enumerate(x): - if (i in self.categorical_features): - readable_x.append(self.categorical_names[i][int(v)]) - else: - readable_x.append(v) - return np.asarray(readable_x) - - def transform(self, x): - if(len(x) == 0): - return x - if (len(x.shape) == 1): - x = np.expand_dims(x, axis=0) - if (self.use_categorical): - assert(self.binarizer != []) - tx = [] - for i in range(self.nb_features): - #self.binarizer[i].drop = None - if (i in self.categorical_features): - self.binarizer[i].drop = None - tx_aux = self.binarizer[i].transform(x[:,[i]]) - tx_aux = np.vstack(tx_aux) - tx.append(tx_aux) - else: - tx.append(x[:,[i]]) - tx = np.hstack(tx) - return tx - else: - return x # -# ============================================================================== +#============================================================================== class VotingRF(VotingClassifier): """ Majority rule classifier """ - + def fit(self, X, y, sample_weight=None): self.estimators_ = [] for _, est in self.estimators: self.estimators_.append(est) - + self.le_ = LabelEncoder().fit(y) - self.classes_ = self.le_.classes_ - + self.classes_ = self.le_.classes_ + + def predict(self, X): """Predict class labels for X. Parameters @@ -143,670 +43,95 @@ class VotingRF(VotingClassifier): maj : array-like of shape (n_samples,) Predicted class labels. 
""" - # check_is_fitted(self) - + #check_is_fitted(self) + # 'hard' voting predictions = self._predict(X) - predictions = np.asarray(predictions, np.int64) # NEED TO BE CHECKED + predictions = np.asarray(predictions, np.int64) #NEED TO BE CHECKED maj = np.apply_along_axis( lambda x: np.argmax( np.bincount(x, weights=self._weights_not_none)), axis=1, arr=predictions) - + maj = self.le_.inverse_transform(maj) return maj - - + + # -# ============================================================================== +#============================================================================== class RF2001(object): """ The main class to train Random Forest Classifier (RFC). """ - def __init__(self, options): + def __init__(self, **options): """ Constructor. - """ + """ self.forest = None self.voting = None - self.opt = options - - param_dist = {'n_estimators': options.n_estimators, - 'max_depth': options.maxdepth, - 'criterion': 'entropy', - 'random_state': 324089} - + + param_dist = {'n_estimators':options['n_trees'], + 'max_depth':options['depth'], + 'criterion':'entropy', + 'random_state':324089} + self.forest = RandomForestClassifier(**param_dist) - + + def fit(self, X_train, y_train): + """ + building Breiman'01 Random Forest + (similar to train(dataset) fnc) + """ + self.forest.fit(X_train,y_train) + rtrees = [ ('dt', dt) for i, dt in enumerate(self.forest.estimators_)] + self.voting = VotingRF(estimators=rtrees) + self.voting.fit(X_train,y_train) + + return self + + + def train(self, dataset, verb=0): + """ + Train a random forest. + """ + + X_train, X_test, y_train, y_test = dataset.train_test_split() + + X_train = dataset.transform(X_train) + X_test = dataset.transform(X_test) + + print("Build a random forest.") + self.forest.fit(X_train,y_train) + + rtrees = [ ('dt', dt) for i, dt in enumerate(self.forest.estimators_)] + self.voting = VotingRF(estimators=rtrees) + self.voting.fit(X_train,y_train) + + train_acc = accuracy_score(self.predict(X_train), y_train) + test_acc = accuracy_score(self.predict(X_test), y_test) + + if verb > 1: + self.print_acc_vote(X_train, X_test, y_train, y_test) + self.print_acc_prob(X_train, X_test, y_train, y_test) + + return train_acc, test_acc + def predict(self, X): return self.voting.predict(X) - + def predict_prob(self, X): self.forest.predict(X) - + def estimators(self): - assert (self.forest.estimators_ is not None) + assert(self.forest.estimators_ is not None) return self.forest.estimators_ - + def n_estimators(self): return self.forest.n_estimators - - -# -# ============================================================================== -class XRF(object): - """ - class to encode and explain Random Forest classifiers. - """ - - def __init__(self, model, dataset): - self.cls = model - self.data = dataset - self.forest = Forest(model, dataset.extended_feature_names_as_array_strings) - - self.enc = SATEncoder(self.forest, self.data.feature_names, self.data.num_class, - self.data.extended_feature_names_as_array_strings) - - def encode(self, inst): - """ - Encode a tree ensemble trained previously. - """ - if 'f' not in dir(self): - self.f = Forest(self.cls, self.data.extended_feature_names_as_array_strings) - - self.enc = SATEncoder(self.forest, self.data.feature_names, self.data.num_class, - self.data.extended_feature_names_as_array_strings) - - inst = self.data.transform(np.array(inst))[0] - formula, _, _, _ = self.enc.encode(inst) - - def explain_sample(self, inst): - """ - Explain a prediction made for a given sample with a previously - trained RF. 
- """ - - if 'enc' not in dir(self): - self.encode(inst) - - inpvals = self.data.readable_sample(inst) - preamble = [] - for f, v in zip(self.data.feature_names, inpvals): - if f not in str(v): - preamble.append('{0} = {1}'.format(f, v)) - else: - preamble.append(v) - - inps = self.data.extended_feature_names_as_array_strings # input (feature value) variables - # print("inps: {0}".format(inps)) - - self.x = SATExplainer(self.enc, inps, preamble, self.data.target_name) - inst = self.data.transform(np.array(inst))[0] - expl = self.x.explain(np.array(inst)) - - return expl - - # copie de git - def explain(self, samples): - # timers - lengths = [] - tested = set() - mSAT, mUNSAT = 0.0, 0.0 - stimes = [] - utimes = [] - nSatCalls = [] - nUnsCalls = [] - - ltimes = [] - ctimes = [] - wins = 0 - - multiple_expls = [] - for i, s in enumerate(samples): - sample = [float(v.strip()) for v in s.split(',')] - - if tuple(sample) in tested: - continue - - # print("inst#{0}".format(i+1)) - - tested.add(tuple(sample)) - # print('sample {0}: {1}'.format(i, ','.join(s.strip().split(',')))) - - self.encode(sample) - expl = self.explain_sample(sample) - multiple_expls.append(expl) - lengths.append(len(expl)) - - nvars = self.enc.cnf.nv - nclauses = len(self.enc.cnf.clauses) - - # mSAT = max(xrf.x.stimes+[mSAT]) - # mUNSAT = max(xrf.x.utimes+[mUNSAT]) - if len(self.x.stimes): - stimes.append(max(self.x.stimes)) - if len(self.x.utimes): - utimes.append(max(self.x.utimes)) - nSatCalls.append(self.x.nsat) - nUnsCalls.append(self.x.nunsat) - - del self.enc - del self.x - - return multiple_expls - - - # ============================================================================== -class SATEncoder(object): - """ - Encoder of Random Forest classifier into SAT. - """ - - def __init__(self, forest, feats, nof_classes, extended_feature_names, from_file=None): - self.forest = forest - # self.feats = {f: i for i, f in enumerate(feats)} - self.num_class = nof_classes - self.vpool = IDPool() - self.extended_feature_names = extended_feature_names - - # encoding formula - self.cnf = None - - # for interval-based encoding - self.intvs, self.imaps, self.ivars, self.thvars = None, None, None, None - - def newVar(self, name): - - if name in self.vpool.obj2id: # var has been already created - return self.vpool.obj2id[name] - - var = self.vpool.id('{0}'.format(name)) - return var - - def printLits(self, lits): - print(["{0}{1}".format("-" if p < 0 else "", self.vpool.obj(abs(p))) for p in lits]) - - def traverse(self, tree, k, clause): - """ - Traverse a tree and encode each node. - """ - - if tree.children: - f = tree.name - v = tree.threshold - pos = neg = [] - if f in self.intvs: - d = self.imaps[f][v] - pos, neg = self.thvars[f][d], -self.thvars[f][d] - else: - var = self.newVar(tree.name) - pos, neg = var, -var - # print("{0} => {1}".format(tree.name, var)) - - assert (pos and neg) - self.traverse(tree.children[0], k, clause + [-neg]) - self.traverse(tree.children[1], k, clause + [-pos]) - else: # leaf node - cvar = self.newVar('class{0}_tr{1}'.format(tree.values, k)) - self.cnf.append(clause + [cvar]) - # self.printLits(clause + [cvar]) - - def compute_intervals(self): - """ - Traverse all trees in the ensemble and extract intervals for each - feature. - At this point, the method only works for numerical datasets! - """ - - def traverse_intervals(tree): - """ - Auxiliary function. Recursive tree traversal. 
- """ - - if tree.children: - f = tree.name - v = tree.threshold - if f in self.intvs: - self.intvs[f].add(v) - - traverse_intervals(tree.children[0]) - traverse_intervals(tree.children[1]) - - # initializing the intervals - self.intvs = {'{0}'.format(f): set([]) for f in self.extended_feature_names if '_' not in f} - - for tree in self.forest.trees: - traverse_intervals(tree) - - # OK, we got all intervals; let's sort the values - self.intvs = {f: sorted(self.intvs[f]) + ([math.inf] if len(self.intvs[f]) else []) for f in - six.iterkeys(self.intvs)} - - self.imaps, self.ivars = {}, {} - self.thvars = {} - for feat, intvs in six.iteritems(self.intvs): - self.imaps[feat] = {} - self.ivars[feat] = [] - self.thvars[feat] = [] - for i, ub in enumerate(intvs): - self.imaps[feat][ub] = i - - ivar = self.newVar('{0}_intv{1}'.format(feat, i)) - self.ivars[feat].append(ivar) - # print('{0}_intv{1}'.format(feat, i)) - - if ub != math.inf: - # assert(i < len(intvs)-1) - thvar = self.newVar('{0}_th{1}'.format(feat, i)) - self.thvars[feat].append(thvar) - # print('{0}_th{1}'.format(feat, i)) - - def encode(self, sample): - """ - Do the job. - """ - - ###print('Encode RF into SAT ...') - - self.cnf = CNF() - # getting a tree ensemble - # self.forest = Forest(self.model, self.extended_feature_names) - num_tree = len(self.forest.trees) - self.forest.predict_inst(sample) - - # introducing class variables - # cvars = [self.newVar('class{0}'.format(i)) for i in range(self.num_class)] - - # define Tautology var - vtaut = self.newVar('Tautology') - - # introducing class-tree variables - ctvars = [[] for t in range(num_tree)] - for k in range(num_tree): - for j in range(self.num_class): - var = self.newVar('class{0}_tr{1}'.format(j, k)) - ctvars[k].append(var) - - # traverse all trees and extract all possible intervals - # for each feature - ###print("compute intervarls ...") - self.compute_intervals() - - # print(self.intvs) - # print([len(self.intvs[f]) for f in self.intvs]) - # print(self.imaps) - # print(self.ivars) - # print(self.thvars) - # print(ctvars) - - ##print("encode trees ...") - # traversing and encoding each tree - for k, tree in enumerate(self.forest.trees): - # print("Encode tree#{0}".format(k)) - # encoding the tree - self.traverse(tree, k, []) - # exactly one class var is true - # self.printLits(ctvars[k]) - card = CardEnc.atmost(lits=ctvars[k], vpool=self.vpool, encoding=EncType.cardnetwrk) - self.cnf.extend(card.clauses) - - # calculate the majority class - self.cmaj = self.forest.predict_inst(sample) - - ##print("encode majority class ...") - # Cardinality constraint AtMostK to capture a j_th class - - if (self.num_class == 2): - rhs = math.floor(num_tree / 2) + 1 - if (self.cmaj == 1 and not num_tree % 2): - rhs = math.floor(num_tree / 2) - lhs = [ctvars[k][1 - self.cmaj] for k in range(num_tree)] - atls = CardEnc.atleast(lits=lhs, bound=rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) - self.cnf.extend(atls) - else: - zvars = [] - zvars.append([self.newVar('z_0_{0}'.format(k)) for k in range(num_tree)]) - zvars.append([self.newVar('z_1_{0}'.format(k)) for k in range(num_tree)]) - ## - rhs = num_tree - lhs0 = zvars[0] + [- ctvars[k][self.cmaj] for k in range(num_tree)] - ##self.printLits(lhs0) - atls = CardEnc.atleast(lits=lhs0, bound=rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) - self.cnf.extend(atls) - ## - # rhs = num_tree - 1 - rhs = num_tree + 1 - ########### - lhs1 = zvars[1] + [- ctvars[k][self.cmaj] for k in range(num_tree)] - ##self.printLits(lhs1) - atls = 
CardEnc.atleast(lits=lhs1, bound=rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) - self.cnf.extend(atls) - # - pvars = [self.newVar('p_{0}'.format(k)) for k in range(self.num_class + 1)] - ##self.printLits(pvars) - for k, p in enumerate(pvars): - for i in range(num_tree): - if k == 0: - z = zvars[0][i] - # self.cnf.append([-p, -z, vtaut]) - self.cnf.append([-p, z, -vtaut]) - # self.printLits([-p, z, -vtaut]) - # print() - elif k == self.cmaj + 1: - z = zvars[1][i] - self.cnf.append([-p, z, -vtaut]) - - # self.printLits([-p, z, -vtaut]) - # print() - - else: - z = zvars[0][i] if (k < self.cmaj + 1) else zvars[1][i] - self.cnf.append([-p, -z, ctvars[i][k - 1]]) - self.cnf.append([-p, z, -ctvars[i][k - 1]]) - - # self.printLits([-p, -z, ctvars[i][k-1] ]) - # self.printLits([-p, z, -ctvars[i][k-1] ]) - # print() - - # - self.cnf.append([-pvars[0], -pvars[self.cmaj + 1]]) - ## - lhs1 = pvars[:(self.cmaj + 1)] - ##self.printLits(lhs1) - eqls = CardEnc.equals(lits=lhs1, bound=1, vpool=self.vpool, encoding=EncType.cardnetwrk) - self.cnf.extend(eqls) - - lhs2 = pvars[(self.cmaj + 1):] - ##self.printLits(lhs2) - eqls = CardEnc.equals(lits=lhs2, bound=1, vpool=self.vpool, encoding=EncType.cardnetwrk) - self.cnf.extend(eqls) - - ##print("exactly-one feat const ...") - # enforce exactly one of the feature values to be chosen - # (for categorical features) - categories = collections.defaultdict(lambda: []) - for f in self.extended_feature_names: - if '_' in f: - categories[f.split('_')[0]].append(self.newVar(f)) - for c, feats in six.iteritems(categories): - # exactly-one feat is True - self.cnf.append(feats) - card = CardEnc.atmost(lits=feats, vpool=self.vpool, encoding=EncType.cardnetwrk) - self.cnf.extend(card.clauses) - # lits of intervals - for f, intvs in six.iteritems(self.ivars): - if not len(intvs): - continue - self.cnf.append(intvs) - card = CardEnc.atmost(lits=intvs, vpool=self.vpool, encoding=EncType.cardnetwrk) - self.cnf.extend(card.clauses) - # self.printLits(intvs) - - for f, threshold in six.iteritems(self.thvars): - for j, thvar in enumerate(threshold): - d = j + 1 - pos, neg = self.ivars[f][d:], self.ivars[f][:d] - - if j == 0: - assert (len(neg) == 1) - self.cnf.append([thvar, neg[-1]]) - self.cnf.append([-thvar, -neg[-1]]) - else: - self.cnf.append([thvar, neg[-1], -threshold[j - 1]]) - self.cnf.append([-thvar, threshold[j - 1]]) - self.cnf.append([-thvar, -neg[-1]]) - - if j == len(threshold) - 1: - assert (len(pos) == 1) - self.cnf.append([-thvar, pos[0]]) - self.cnf.append([thvar, -pos[0]]) - else: - self.cnf.append([-thvar, pos[0], threshold[j + 1]]) - self.cnf.append([thvar, -pos[0]]) - self.cnf.append([thvar, -threshold[j + 1]]) - - return self.cnf, self.intvs, self.imaps, self.ivars - - -# -# ============================================================================== -class SATExplainer(object): - """ - An SAT-inspired minimal explanation extractor for Random Forest models. - """ - - def __init__(self, sat_enc, inps, preamble, target_name, verb=1): - """ - Constructor. - """ - - self.enc = sat_enc - self.inps = inps # input (feature value) variables - self.target_name = target_name - self.preamble = preamble - - self.verbose = verb - - self.slv = None - ##self.slv = Solver(name=options.solver) - ##self.slv = Solver(name="minisat22") - # self.slv = Solver(name="glucose3") - # CNF formula - # self.slv.append_formula(self.enc.cnf) - - def explain(self, sample, smallest=False): - """ - Hypotheses minimization. 
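# ---------------------------------------------------------------------------
# Editor's note: explain() below relies on assumption-based SAT solving; the
# CNF stays fixed while selector literals are switched on and off per query.
# The PySAT pattern in isolation (the two-clause formula is a toy example):
from pysat.formula import CNF
from pysat.solvers import Solver

cnf = CNF(from_clauses=[[-1, 3], [-2, -3]])  # sel1 -> x3, sel2 -> -x3
with Solver(name='glucose3', bootstrap_with=cnf.clauses) as slv:
    print(slv.solve(assumptions=[1]))     # True: sel1 alone is consistent
    print(slv.solve(assumptions=[1, 2]))  # False: the selectors clash on x3
    print(slv.get_core())                 # an unsatisfiable subset, e.g. [1, 2]
# ---------------------------------------------------------------------------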
- """ - expl = {} - expl["Instance"] = '"IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.target_name[self.enc.cmaj]) - - # create a SAT solver - self.slv = Solver(name="glucose3") - - # adapt the solver to deal with the current sample - # self.csel = [] - self.assums = [] # var selectors to be used as assumptions - self.sel2fid = {} # selectors to original feature ids - self.sel2vid = {} # selectors to categorical feature ids - self.sel2v = {} # selectors to (categorical/interval) values - - # for i in range(self.enc.num_class): - # self.csel.append(self.enc.newVar('class{0}'.format(i))) - # self.csel = self.enc.newVar('class{0}'.format(self.enc.cmaj)) - - # preparing the selectors - for i, (inp, val) in enumerate(zip(self.inps, sample), 1): - if '_' in inp: - - assert (inp not in self.enc.intvs) - - feat = inp.split('_')[0] - selv = self.enc.newVar('selv_{0}'.format(feat)) - - self.assums.append(selv) - if selv not in self.sel2fid: - self.sel2fid[selv] = int(feat[1:]) - self.sel2vid[selv] = [i - 1] - else: - self.sel2vid[selv].append(i - 1) - - p = self.enc.newVar(inp) - if not val: - p = -p - else: - self.sel2v[selv] = p - - self.enc.cnf.append([-selv, p]) - - # self.enc.printLits([-selv, p]) - - elif len(self.enc.intvs[inp]): - # v = None - # for intv in self.enc.intvs[inp]: - # if intv > val: - # v = intv - # break - v = next((intv for intv in self.enc.intvs[inp] if intv > val), None) - assert (v is not None) - - selv = self.enc.newVar('selv_{0}'.format(inp)) - self.assums.append(selv) - - assert (selv not in self.sel2fid) - self.sel2fid[selv] = int(inp[1:]) - self.sel2vid[selv] = [i - 1] - - for j, p in enumerate(self.enc.ivars[inp]): - cl = [-selv] - if j == self.enc.imaps[inp][v]: - cl += [p] - self.sel2v[selv] = p - else: - cl += [-p] - - self.enc.cnf.append(cl) - # self.enc.printLits(cl) - ''' - with open("/tmp/pendigits.cnf", 'w') as fp: - fp.write('p cnf {0} {1}\n'.format(self.enc.cnf.nv, len(self.enc.cnf.clauses))) - for p in self.assums + [-self.csel]: - fp.write('{0} 0\n'.format(str(p))) - - for cl in self.enc.cnf.clauses: - fp.write(' '.join([str(p) for p in cl+[0]])) - fp.write('\n') - fp.close() - print(self.assums + [self.csel]) - ''' - - self.assums = sorted(set(self.assums)) - if self.verbose: - expl['hypos:'] = len(self.assums) - - # pass a CNF formula - self.slv.append_formula(self.enc.cnf) - - ''' - # if unsat, then the observation is not implied by the assumptions - if not self.slv.solve(assumptions=self.assums+[self.csel]): - print(' no implication!') - print(self.slv.get_core()) - sys.exit(1) - - if self.verbose > 1: - self.enc.printLits(self.assums+[self.csel]) - self.print_sat_model() - ''' - - if not smallest: - self.compute_minimal() - else: - raise NotImplementedError('Smallest explanation is not yet implemented.') - # self.compute_smallest() - - explanation = sorted([self.sel2fid[h] for h in self.assums if h > 0]) - assert len(explanation), 'PI-explanation cannot be an empty-set! otherwise the RF predicts only one class' - - # delete sat solver - self.slv.delete() - self.slv = None - - expl["expl selctors:"] = explanation - self.preamble = [self.preamble[i] for i in explanation] - expl["explanation :"] = "IF {0} THEN {1}".format(' AND '.join(self.preamble), self.target_name[self.enc.cmaj]) - expl["hypos left :"] = len(explanation) - - return expl - - def compute_minimal(self): - """ - Compute any subset-minimal explanation. 
- """ - nsat, nunsat = 0, 0 - stimes, utimes = [], [] - - vtaut = self.enc.newVar('Tautology') - - # simple deletion-based linear search - for i, p in enumerate(self.assums): - to_test = [vtaut] + self.assums[:i] + self.assums[(i + 1):] + [-p, -self.sel2v[p]] - - t0 = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - sat = self.slv.solve(assumptions=to_test) - - if not sat: - self.assums[i] = -p - elif self.verbose > 1: - self.print_sat_model() - - t = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - t0 - # print("{0} {1:.2f}s".format("SAT" if sat else "UNSAT", t)) - - if sat: - nsat += 1 - stimes.append(t) - if self.verbose > 1: - self.enc.printLits(to_test) - print("SAT") - else: - # print("Core: ",self.slv.get_core()) - nunsat += 1 - utimes.append(t) - if self.verbose > 1: - self.enc.printLits(to_test) - print("UNSAT") - if self.verbose: - print('') - print('#SAT: {0} | #UNSAT: {1}'.format(len(stimes), len(utimes))) - if (nsat): - print('SAT: tot: {0:.2f} | m: {1:.2f} | M: {2:.2f} | avg: {3:.2f}'.format( - sum(stimes), min(stimes), max(stimes), sum(stimes) / len(stimes))) - if (nunsat): - print('UNSAT: tot: {0:.2f} | m: {1:.2f} | M: {2:.2f} | avg: {3:.2f}'.format( - sum(utimes), min(utimes), max(utimes), sum(utimes) / len(utimes))) - print('') - - self.stimes, self.utimes = stimes, utimes - self.nsat, self.nunsat = nsat, nunsat - - def print_sat_model(self): - assert (self.slv.get_model()) - model = [p for p in self.slv.get_model() if self.enc.vpool.obj(abs(p))] - str_model = [] - lits = [] - for p in model: - if self.enc.vpool.obj(abs(p)) in self.inps: - str_model.append((p, self.enc.vpool.obj(abs(p)))) - - elif ("class" in self.enc.vpool.obj(abs(p))): - str_model.append((p, self.enc.vpool.obj(abs(p)))) - - # elif ("intv" in self.enc.vpool.obj(abs(p))) : - # str_model.append((p, self.enc.vpool.obj(abs(p)))) - - if ("_tr" in self.enc.vpool.obj(abs(p))): - lits.append(p) - - if ("p_" in self.enc.vpool.obj(abs(p))): - str_model.append((p, self.enc.vpool.obj(abs(p)))) - if ("z_" in self.enc.vpool.obj(abs(p))): - str_model.append((p, self.enc.vpool.obj(abs(p)))) - - print("Model:", str_model) - ###print(self.slv.get_model()) - - num_tree = len(self.enc.forest.trees) - num_class = self.enc.num_class - occ = [0] * num_class - - for p in lits: - if p > 0: - j = int(self.enc.vpool.obj(abs(p))[5]) - occ[j] += 1 - print(occ) \ No newline at end of file + + def print_accuracy(self, X_test, y_test): + test_acc = accuracy_score(self.predict(X_test), y_test) + print("c Model accuracy: {0:.2f}".format(100. 
* test_acc)) + #print("----------------------") \ No newline at end of file diff --git a/pages/application/RandomForest/utils/xrf/tree.py b/pages/application/RandomForest/utils/xrf/tree.py index bcac5b2721f9640cbbca33699855168ed172e325..7b31681275baf3fcb53c0bb8c5f9b39c8fcf2b3a 100644 --- a/pages/application/RandomForest/utils/xrf/tree.py +++ b/pages/application/RandomForest/utils/xrf/tree.py @@ -1,6 +1,6 @@ # -# ============================================================================== -from anytree import Node, RenderTree, AsciiStyle +#============================================================================== +from anytree import Node, RenderTree,AsciiStyle import json import numpy as np import math @@ -8,34 +8,34 @@ import os # -# ============================================================================== +#============================================================================== class dt_node(Node): - def __init__(self, id, parent=None): + def __init__(self, id, parent = None): Node.__init__(self, id, parent) self.id = id # The node value self.name = None - self.left_node_id = -1 # Left child + self.left_node_id = -1 # Left child self.right_node_id = -1 # Right child self.feature = -1 self.threshold = None - self.values = -1 - # iai - # self.split = None + self.values = -1 + #iai + #self.split = None def __str__(self): pref = ' ' * self.depth - if len(self.children) == 0: - return (pref + "leaf: {} {}".format(self.id, self.values)) + if (len(self.children) == 0): + return (pref+ "leaf: {} {}".format(self.id, self.values)) else: - if (self.name is None): - return (pref + "{} f{}<{}".format(self.id, self.feature, self.threshold)) + if(self.name is None): + return (pref+ "{} f{}<{}".format(self.id, self.feature, self.threshold)) else: - return (pref + "{} \"{}\"<{}".format(self.id, self.name, self.threshold)) + return (pref+ "{} \"{}\"<{}".format(self.id, self.name, self.threshold)) -# ============================================================================== -def build_tree(tree_, feature_names=None): +#============================================================================== +def build_tree(tree_, feature_names = None): ## feature = tree_.feature threshold = tree_.threshold @@ -49,29 +49,29 @@ def build_tree(tree_, feature_names=None): while len(stack) > 0: node_id, parent_depth = stack.pop() node_depth[node_id] = parent_depth + 1 - + # If we have a test node if (children_left[node_id] != children_right[node_id]): stack.append((children_left[node_id], parent_depth + 1)) stack.append((children_right[node_id], parent_depth + 1)) else: - is_leaf[node_id] = True - ## - - m = tree_.node_count + is_leaf[node_id] = True + ## + + m = tree_.node_count assert (m > 0), "Empty tree" - - def extract_data(idx, root=None, feature_names=None): + + def extract_data(idx, root = None, feature_names = None): i = idx assert (i < m), "Error index node" if (root is None): node = dt_node(i) else: - node = dt_node(i, parent=root) - # node.cover = json_node["cover"] + node = dt_node(i, parent = root) + #node.cover = json_node["cover"] if is_leaf[i]: node.values = np.argmax(values[i]) - # if(inverse): + #if(inverse): # node.values = -node.values else: node.feature = feature[i] @@ -80,17 +80,17 @@ def build_tree(tree_, feature_names=None): node.threshold = threshold[i] node.left_node_id = children_left[i] node.right_node_id = children_right[i] - extract_data(node.left_node_id, node, feature_names) # feat < threshold ( < 0.5 False) - extract_data(node.right_node_id, node, feature_names) # 
feat >= threshold ( >= 0.5 True) + extract_data(node.left_node_id, node, feature_names) #feat < threshold ( < 0.5 False) + extract_data(node.right_node_id, node, feature_names) #feat >= threshold ( >= 0.5 True) return node - + root = extract_data(0, None, feature_names) - + return root -# ============================================================================== +#============================================================================== def walk_tree(node): if (len(node.children) == 0): # leaf @@ -100,20 +100,17 @@ def walk_tree(node): walk_tree(node.children[0]) walk_tree(node.children[1]) - def count_nodes(root): def count(node): if len(node.children): - return sum([1 + count(n) for n in node.children]) + return sum([1+count(n) for n in node.children]) else: return 0 - m = count(root) + 1 return m - # -# ============================================================================== +#============================================================================== def predict_tree(node, sample): if (len(node.children) == 0): # leaf @@ -121,33 +118,32 @@ def predict_tree(node, sample): else: feature_branch = node.feature sample_value = sample[feature_branch] - assert (sample_value is not None) - if (sample_value < node.threshold): + assert(sample_value is not None) + if(sample_value < node.threshold): return predict_tree(node.children[0], sample) else: return predict_tree(node.children[1], sample) - + # -# ============================================================================== +#============================================================================== class Forest: """ An ensemble of decision trees. This object provides a common interface to many different types of models. """ - - def __init__(self, rf, feature_names=None): - # self.rf = rf - - self.trees = [build_tree(dt.tree_, feature_names) for dt in rf.estimators()] + def __init__(self, rf, feature_names = None): + #self.rf = rf + self.trees = [ build_tree(dt.tree_, feature_names) for dt in rf.estimators()] self.sz = sum([dt.tree_.node_count for dt in rf.estimators()]) self.md = max([dt.tree_.max_depth for dt in rf.estimators()]) #### nb_nodes = [dt.tree_.node_count for dt in rf.estimators()] - assert ([dt.tree_.node_count for dt in rf.estimators()] == [count_nodes(dt) for dt in self.trees]) - + assert([dt.tree_.node_count for dt in rf.estimators()] == [count_nodes(dt) for dt in self.trees]) + #self.print_trees() + def print_trees(self): - for i, t in enumerate(self.trees): + for i,t in enumerate(self.trees): print("tree number: ", i) walk_tree(t) @@ -156,20 +152,22 @@ class Forest: scores = np.asarray(scores) maj = np.argmax(np.bincount(scores)) return maj - - def predict(self, samples): + + + def predict(self, samples): predictions = [] print("#Trees: ", len(self.trees)) for sample in np.asarray(samples): scores = [] - for i, t in enumerate(self.trees): + for i,t in enumerate(self.trees): s = predict_tree(t, sample) scores.append((s)) scores = np.asarray(scores) predictions.append(scores) - predictions = np.asarray(predictions) - # print(predictions) - # np.bincount(x, weights=self._weights_not_none) + predictions = np.asarray(predictions) + #print(predictions) + #np.bincount(x, weights=self._weights_not_none) maj = np.apply_along_axis(lambda x: np.argmax(np.bincount(x)), axis=1, arr=predictions) + + return maj - return maj diff --git a/pages/application/RandomForest/utils/xrf/xforest.py b/pages/application/RandomForest/utils/xrf/xforest.py new file mode 100644 index 
0000000000000000000000000000000000000000..bcf9cd18a9cc106b651b27ce19180eb9d1e3409e --- /dev/null +++ b/pages/application/RandomForest/utils/xrf/xforest.py @@ -0,0 +1,803 @@ +# from sklearn.ensemble._voting import VotingClassifier +# from sklearn.ensemble import RandomForestClassifier +from sklearn.preprocessing import OneHotEncoder, LabelEncoder +from sklearn.model_selection import train_test_split +# from sklearn.metrics import accuracy_score +import numpy as np + +import collections +from six.moves import range +import six +import math + +from pages.application.RandomForest.utils.data import Data +from pages.application.RandomForest.utils.xrf.tree import Forest + +# from .encode import SATEncoder +from pysat.formula import CNF, WCNF, IDPool +from pysat.solvers import Solver +from pysat.card import CardEnc, EncType +from pysat.examples.lbx import LBX +from pysat.examples.rc2 import RC2 + + +# +# ============================================================================== +class Dataset(Data): + """ + Class for representing dataset (transactions). + """ + + def __init__(self, file, mapfile=None, + separator=',', use_categorical=False): + super().__init__(file, mapfile, separator, use_categorical) + + # split data into X and y + self.feature_names = self.names[:-1] + self.nb_features = len(self.feature_names) + self.use_categorical = use_categorical + + samples = np.asarray(self.samps) + if not all(c.isnumeric() for c in samples[:, -1]): + le = LabelEncoder() + le.fit(samples[:, -1]) + samples[:, -1] = le.transform(samples[:, -1]) + self.class_names = le.classes_ + print(le.classes_) + print(samples[1:4, :]) + else : + self.class_names = samples[:, -1] + + samples = np.asarray(samples, dtype=np.float32) + self.X = samples[:, 0: self.nb_features] + self.y = samples[:, self.nb_features] + self.num_class = len(set(self.y)) + self.target_name = list(range(self.num_class)) + + # check if we have info about categorical features + if (self.use_categorical): + self.target_name = self.class_names + + self.binarizer = {} + for i in self.categorical_features: + self.binarizer.update({i: OneHotEncoder(categories='auto', sparse=False)}) # , + self.binarizer[i].fit(self.X[:, [i]]) + else: + self.categorical_features = [] + self.categorical_names = [] + self.binarizer = [] + # feat map + self.mapping_features() + + def train_test_split(self, test_size=0.2, seed=0): + return train_test_split(self.X, self.y, test_size=test_size, random_state=seed) + + def transform(self, x): + if (len(x) == 0): + return x + if (len(x.shape) == 1): + x = np.expand_dims(x, axis=0) + if (self.use_categorical): + assert (self.binarizer != []) + tx = [] + for i in range(self.nb_features): + # self.binarizer[i].drop = None + if (i in self.categorical_features): + self.binarizer[i].drop = None + tx_aux = self.binarizer[i].transform(x[:, [i]]) + tx_aux = np.vstack(tx_aux) + tx.append(tx_aux) + else: + tx.append(x[:, [i]]) + tx = np.hstack(tx) + return tx + else: + return x + + def transform_inverse(self, x): + if (len(x) == 0): + return x + if (len(x.shape) == 1): + x = np.expand_dims(x, axis=0) + if (self.use_categorical): + assert (self.binarizer != []) + inverse_x = [] + for i, xi in enumerate(x): + inverse_xi = np.zeros(self.nb_features) + for f in range(self.nb_features): + if f in self.categorical_features: + nb_values = len(self.categorical_names[f]) + v = xi[:nb_values] + v = np.expand_dims(v, axis=0) + iv = self.binarizer[f].inverse_transform(v) + inverse_xi[f] = iv + xi = xi[nb_values:] + + else: + inverse_xi[f] = xi[0] 
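+                        # a numerical feature occupies exactly one slot of the
+                        # encoded vector, so advance the read cursor by one below
+                        # (categorical features above consume
+                        # len(self.categorical_names[f]) slots instead)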
+ xi = xi[1:] + inverse_x.append(inverse_xi) + return inverse_x + else: + return x + + def transform_inverse_by_index(self, idx): + if (idx in self.extended_feature_names): + return self.extended_feature_names[idx] + else: + print("Warning there is no feature {} in the internal mapping".format(idx)) + return None + + def transform_by_value(self, feat_value_pair): + if (feat_value_pair in self.extended_feature_names.values()): + keys = ( + list(self.extended_feature_names.keys())[ + list(self.extended_feature_names.values()).index(feat_value_pair)]) + return keys + else: + print("Warning there is no value {} in the internal mapping".format(feat_value_pair)) + return None + + def mapping_features(self): + self.extended_feature_names = {} + self.extended_feature_names_as_array_strings = [] + counter = 0 + if (self.use_categorical): + for i in range(self.nb_features): + if (i in self.categorical_features): + for j, _ in enumerate(self.binarizer[i].categories_[0]): + self.extended_feature_names.update({counter: (self.feature_names[i], j)}) + self.extended_feature_names_as_array_strings.append( + "f{}_{}".format(i, j)) # str(self.feature_names[i]), j)) + counter = counter + 1 + else: + self.extended_feature_names.update({counter: (self.feature_names[i], None)}) + self.extended_feature_names_as_array_strings.append("f{}".format(i)) # (self.feature_names[i]) + counter = counter + 1 + else: + for i in range(self.nb_features): + self.extended_feature_names.update({counter: (self.feature_names[i], None)}) + self.extended_feature_names_as_array_strings.append("f{}".format(i)) # (self.feature_names[i]) + counter = counter + 1 + + def readable_sample(self, x): + readable_x = [] + for i, v in enumerate(x): + if (i in self.categorical_features): + readable_x.append(self.categorical_names[i][int(v)]) + else: + readable_x.append(v) + return np.asarray(readable_x) + + def test_encoding_transformes(self, X_train): + # test encoding + + X = X_train[[0], :] + + print("Sample of length", len(X[0]), " : ", X) + enc_X = self.transform(X) + print("Encoded sample of length", len(enc_X[0]), " : ", enc_X) + inv_X = self.transform_inverse(enc_X) + print("Back to sample", inv_X) + print("Readable sample", self.readable_sample(inv_X[0])) + assert ((inv_X == X).all()) + + ''' + for i in range(len(self.extended_feature_names)): + print(i, self.transform_inverse_by_index(i)) + for key, value in self.extended_feature_names.items(): + print(value, self.transform_by_value(value)) + ''' + # + + +# ============================================================================== +class XRF(object): + """ + class to encode and explain Random Forest classifiers. + """ + + def __init__(self, model, feature_names, class_names, verb=0): + self.cls = model + # self.data = dataset + self.verbose = verb + self.feature_names = feature_names + self.class_names = class_names + self.fnames = [f'f{i}' for i in range(len(feature_names))] + self.f = Forest(model, self.fnames) + + def __del__(self): + if 'enc' in dir(self): + del self.enc + if 'x' in dir(self): + if self.x.slv is not None: + self.x.slv.delete() + del self.x + del self.f + self.f = None + del self.cls + self.cls = None + + def encode(self, inst): + """ + Encode a tree ensemble trained previously. 
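+        A fresh Forest and SATEncoder are built around the given sample,
+        so the resulting CNF is instance-specific. Rough usage sketch
+        (the fitted model is assumed to expose estimators(), as Forest
+        requires; everything else is defined in this class):
+
+            xrf = XRF(model, feature_names, class_names, verb=1)
+            result = xrf.explain(instance, xtype='abd')
+            print(result["Explanation"])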
+ """ + if 'f' not in dir(self): + self.f = Forest(self.cls, self.fnames) + # self.f.print_tree() + + self.enc = SATEncoder(self.f, self.feature_names, len(self.class_names), self.fnames) + + # inst = self.data.transform(np.array(inst))[0] + formula, _, _, _ = self.enc.encode(np.array(inst)) + + def explain(self, inst, xtype='abd'): + """ + Explain a prediction made for a given sample with a previously + trained RF. + """ + + explanation_result = {} + + if 'enc' not in dir(self): + self.encode(inst) + + # inpvals = self.data.readable_sample(inst) + inpvals = np.asarray(inst) + preamble = [] + for f, v in zip(self.feature_names, inpvals): + if f not in str(v): + preamble.append('{0} = {1}'.format(f, v)) + else: + preamble.append(v) + + inps = self.fnames # input (feature value) variables + # print("inps: {0}".format(inps)) + + self.x = SATExplainer(self.enc, inps, preamble, self.class_names, verb=self.verbose) + # inst = self.data.transform(np.array(inst))[0] + explanation_result = self.x.explain(np.array(inst), xtype, explanation_result) + + return explanation_result + + def enumerate(self, inst, xtype='con', smallest=True): + """ + list all XPs + """ + if 'enc' not in dir(self): + self.encode(inst) + + if 'x' not in dir(self): + inpvals = np.asarray(inst) + preamble = [] + for f, v in zip(self.feature_names, inpvals): + if f not in str(v): + preamble.append('{0} = {1}'.format(f, v)) + else: + preamble.append(v) + + inps = self.fnames + self.x = SATExplainer(self.enc, inps, preamble, self.class_names) + + for expl in self.x.enumerate(np.array(inst), xtype, smallest): + yield expl + + +# +# ============================================================================== +class SATEncoder(object): + """ + Encoder of Random Forest classifier into SAT. + """ + + def __init__(self, forest, feats, nof_classes, extended_feature_names, from_file=None): + self.forest = forest + # self.feats = {f: i for i, f in enumerate(feats)} + self.num_class = nof_classes + self.vpool = IDPool() + self.extended_feature_names = extended_feature_names + + # encoding formula + self.cnf = None + + # for interval-based encoding + self.intvs, self.imaps, self.ivars, self.thvars = None, None, None, None + + def newVar(self, name): + """ + If a variable named 'name' already exists then + return its id; otherwise create a new var + """ + if name in self.vpool.obj2id: # var has been already created + return self.vpool.obj2id[name] + var = self.vpool.id('{0}'.format(name)) + return var + + def nameVar(self, vid): + """ + input a var id and return a var name + """ + return self.vpool.obj(abs(vid)) + + def printLits(self, lits): + print(["{0}{1}".format("-" if p < 0 else "", self.vpool.obj(abs(p))) for p in lits]) + + def traverse(self, tree, k, clause): + """ + Traverse a tree and encode each node. + """ + + if tree.children: + f = tree.name + v = tree.threshold + pos = neg = [] + if f in self.intvs: + d = self.imaps[f][v] + pos, neg = self.thvars[f][d], -self.thvars[f][d] + else: + var = self.newVar(tree.name) + pos, neg = var, -var + # print("{0} => {1}".format(tree.name, var)) + + assert (pos and neg) + self.traverse(tree.children[0], k, clause + [-neg]) + self.traverse(tree.children[1], k, clause + [-pos]) + else: # leaf node + cvar = self.newVar('class{0}_tr{1}'.format(tree.values, k)) + self.cnf.append(clause + [cvar]) + # self.printLits(clause + [cvar]) + + def compute_intervals(self): + """ + Traverse all trees in the ensemble and extract intervals for each + feature. 
+ + At this point, the method only works for numerical datasets! + """ + + def traverse_intervals(tree): + """ + Auxiliary function. Recursive tree traversal. + """ + + if tree.children: + f = tree.name + v = tree.threshold + if f in self.intvs: + self.intvs[f].add(v) + + traverse_intervals(tree.children[0]) + traverse_intervals(tree.children[1]) + + # initializing the intervals + self.intvs = {'{0}'.format(f): set([]) for f in self.extended_feature_names if '_' not in f} + + for tree in self.forest.trees: + traverse_intervals(tree) + + # OK, we got all intervals; let's sort the values + self.intvs = {f: sorted(self.intvs[f]) + ([math.inf] if len(self.intvs[f]) else []) for f in + six.iterkeys(self.intvs)} + + self.imaps, self.ivars = {}, {} + self.thvars = {} + for feat, intvs in six.iteritems(self.intvs): + self.imaps[feat] = {} + self.ivars[feat] = [] + self.thvars[feat] = [] + for i, ub in enumerate(intvs): + self.imaps[feat][ub] = i + + ivar = self.newVar('{0}_intv{1}'.format(feat, i)) + self.ivars[feat].append(ivar) + # print('{0}_intv{1}'.format(feat, i)) + + if ub != math.inf: + # assert(i < len(intvs)-1) + thvar = self.newVar('{0}_th{1}'.format(feat, i)) + self.thvars[feat].append(thvar) + # print('{0}_th{1}'.format(feat, i)) + + def encode(self, sample): + """ + Do the job. + """ + + ###print('Encode RF into SAT ...') + + self.cnf = CNF() + # getting a tree ensemble + # self.forest = Forest(self.model, self.extended_feature_names) + num_tree = len(self.forest.trees) + self.forest.predict_inst(sample) + + # introducing class variables + # cvars = [self.newVar('class{0}'.format(i)) for i in range(self.num_class)] + + # define Tautology var + vtaut = self.newVar('Tautology') + self.cnf.append([vtaut]) + + # introducing class-tree variables + ctvars = [[] for t in range(num_tree)] + for k in range(num_tree): + for j in range(self.num_class): + var = self.newVar('class{0}_tr{1}'.format(j, k)) + ctvars[k].append(var) + + # traverse all trees and extract all possible intervals + # for each feature + ###print("compute intervarls ...") + self.compute_intervals() + + # print(self.intvs) + # print([len(self.intvs[f]) for f in self.intvs]) + # print(self.imaps) + # print(self.ivars) + # print(self.thvars) + # print(ctvars) + + ##print("encode trees ...") + # traversing and encoding each tree + for k, tree in enumerate(self.forest.trees): + # print("Encode tree#{0}".format(k)) + # encoding the tree + self.traverse(tree, k, []) + # exactly one class var is true + # self.printLits(ctvars[k]) + card = CardEnc.atmost(lits=ctvars[k], vpool=self.vpool, encoding=EncType.cardnetwrk) + self.cnf.extend(card.clauses) + + # calculate the majority class + self.cmaj = self.forest.predict_inst(sample) + + ##print("encode majority class ...") + # Cardinality constraint AtMostK to capture a j_th class + + if (self.num_class == 2): + rhs = math.floor(num_tree / 2) + 1 + if (self.cmaj == 1 and not num_tree % 2): + rhs = math.floor(num_tree / 2) + lhs = [ctvars[k][1 - self.cmaj] for k in range(num_tree)] + atls = CardEnc.atleast(lits=lhs, bound=rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) + self.cnf.extend(atls) + else: + zvars = [] + zvars.append([self.newVar('z_0_{0}'.format(k)) for k in range(num_tree)]) + zvars.append([self.newVar('z_1_{0}'.format(k)) for k in range(num_tree)]) + ## + rhs = num_tree + lhs0 = zvars[0] + [- ctvars[k][self.cmaj] for k in range(num_tree)] + ##self.printLits(lhs0) + atls = CardEnc.atleast(lits=lhs0, bound=rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) + 
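+        # The free z-variables form shadow vote vectors: the constraint
+        # above forces count(zvars[0]) to reach the number of trees voting
+        # for cmaj, and the one below forces count(zvars[1]) to exceed it
+        # by one. The p-variables further down equate zvars[0]/zvars[1]
+        # with the vote vector of one concrete class, so the formula is
+        # satisfiable exactly when some class j < cmaj ties with cmaj, or
+        # some class j > cmaj strictly outvotes it -- the two cases in
+        # which np.argmax(np.bincount(...)) would not return cmaj.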
self.cnf.extend(atls) + ## + # rhs = num_tree - 1 + rhs = num_tree + 1 + ########### + lhs1 = zvars[1] + [- ctvars[k][self.cmaj] for k in range(num_tree)] + ##self.printLits(lhs1) + atls = CardEnc.atleast(lits=lhs1, bound=rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) + self.cnf.extend(atls) + # + pvars = [self.newVar('p_{0}'.format(k)) for k in range(self.num_class + 1)] + ##self.printLits(pvars) + for k, p in enumerate(pvars): + for i in range(num_tree): + if k == 0: + z = zvars[0][i] + # self.cnf.append([-p, -z, vtaut]) + self.cnf.append([-p, z, -vtaut]) + # self.printLits([-p, z, -vtaut]) + # print() + elif k == self.cmaj + 1: + z = zvars[1][i] + self.cnf.append([-p, z, -vtaut]) + + # self.printLits([-p, z, -vtaut]) + # print() + + else: + z = zvars[0][i] if (k < self.cmaj + 1) else zvars[1][i] + self.cnf.append([-p, -z, ctvars[i][k - 1]]) + self.cnf.append([-p, z, -ctvars[i][k - 1]]) + + # self.printLits([-p, -z, ctvars[i][k-1] ]) + # self.printLits([-p, z, -ctvars[i][k-1] ]) + # print() + + # + self.cnf.append([-pvars[0], -pvars[self.cmaj + 1]]) + ## + lhs1 = pvars[:(self.cmaj + 1)] + ##self.printLits(lhs1) + eqls = CardEnc.equals(lits=lhs1, bound=1, vpool=self.vpool, encoding=EncType.cardnetwrk) + self.cnf.extend(eqls) + + lhs2 = pvars[(self.cmaj + 1):] + ##self.printLits(lhs2) + eqls = CardEnc.equals(lits=lhs2, bound=1, vpool=self.vpool, encoding=EncType.cardnetwrk) + self.cnf.extend(eqls) + + ##print("exactly-one feat const ...") + # enforce exactly one of the feature values to be chosen + # (for categorical features) + categories = collections.defaultdict(lambda: []) + for f in self.extended_feature_names: + if '_' in f: + categories[f.split('_')[0]].append(self.newVar(f)) + for c, feats in six.iteritems(categories): + # exactly-one feat is True + self.cnf.append(feats) + card = CardEnc.atmost(lits=feats, vpool=self.vpool, encoding=EncType.cardnetwrk) + self.cnf.extend(card.clauses) + # lits of intervals + for f, intvs in six.iteritems(self.ivars): + if not len(intvs): + continue + self.cnf.append(intvs) + card = CardEnc.atmost(lits=intvs, vpool=self.vpool, encoding=EncType.cardnetwrk) + self.cnf.extend(card.clauses) + # self.printLits(intvs) + + for f, threshold in six.iteritems(self.thvars): + for j, thvar in enumerate(threshold): + d = j + 1 + pos, neg = self.ivars[f][d:], self.ivars[f][:d] + + if j == 0: + assert (len(neg) == 1) + self.cnf.append([thvar, neg[-1]]) + self.cnf.append([-thvar, -neg[-1]]) + else: + self.cnf.append([thvar, neg[-1], -threshold[j - 1]]) + self.cnf.append([-thvar, threshold[j - 1]]) + self.cnf.append([-thvar, -neg[-1]]) + + if j == len(threshold) - 1: + assert (len(pos) == 1) + self.cnf.append([-thvar, pos[0]]) + self.cnf.append([thvar, -pos[0]]) + else: + self.cnf.append([-thvar, pos[0], threshold[j + 1]]) + self.cnf.append([thvar, -pos[0]]) + self.cnf.append([thvar, -threshold[j + 1]]) + + return self.cnf, self.intvs, self.imaps, self.ivars + + +# +# ============================================================================== +class SATExplainer(object): + """ + An SAT-inspired minimal explanation extractor for Random Forest models. + """ + + def __init__(self, sat_enc, inps, preamble, target_name, verb=1): + """ + Constructor. 
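+        Explanations are computed through assumption-based SAT calls:
+        prepare_selectors() introduces one selector variable per original
+        feature, with clauses of the form (-selv, p), so assuming selv
+        fixes the feature to its value in the sample, while dropping selv
+        from the assumption list frees it. A minimal sketch of the
+        mechanism with PySAT (the CNF object and the selector literals
+        s1..s3 are illustrative, not taken from this class):
+
+            from pysat.solvers import Solver
+            with Solver(name='glucose3', bootstrap_with=cnf.clauses) as slv:
+                slv.solve(assumptions=[s1, s2, s3])  # every feature fixed
+                slv.solve(assumptions=[s1, s3])      # feature behind s2 freed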
+ """ + self.enc = sat_enc + self.inps = inps # input (feature value) variables + self.target_name = target_name + self.preamble = preamble + self.verbose = verb + self.slv = None + + def prepare_selectors(self, sample): + # adapt the solver to deal with the current sample + # self.csel = [] + self.assums = [] # var selectors to be used as assumptions + self.sel2fid = {} # selectors to original feature ids + self.sel2vid = {} # selectors to categorical feature ids + self.sel2v = {} # selectors to (categorical/interval) values + + # for i in range(self.enc.num_class): + # self.csel.append(self.enc.newVar('class{0}'.format(i))) + # self.csel = self.enc.newVar('class{0}'.format(self.enc.cmaj)) + + # preparing the selectors + for i, (inp, val) in enumerate(zip(self.inps, sample), 1): + if '_' in inp: + # binarized (OHE) features + assert (inp not in self.enc.intvs) + + feat = inp.split('_')[0] + selv = self.enc.newVar('selv_{0}'.format(feat)) + + self.assums.append(selv) + if selv not in self.sel2fid: + self.sel2fid[selv] = int(feat[1:]) + self.sel2vid[selv] = [i - 1] + else: + self.sel2vid[selv].append(i - 1) + + p = self.enc.newVar(inp) + if not val: + p = -p + else: + self.sel2v[selv] = p + + self.enc.cnf.append([-selv, p]) + # self.enc.printLits([-selv, p]) + + elif len(self.enc.intvs[inp]): + # v = None + # for intv in self.enc.intvs[inp]: + # if intv > val: + # v = intv + # break + v = next((intv for intv in self.enc.intvs[inp] if intv > val), None) + assert (v is not None) + + selv = self.enc.newVar('selv_{0}'.format(inp)) + self.assums.append(selv) + + assert (selv not in self.sel2fid) + self.sel2fid[selv] = int(inp[1:]) + self.sel2vid[selv] = [i - 1] + + for j, p in enumerate(self.enc.ivars[inp]): + cl = [-selv] + if j == self.enc.imaps[inp][v]: + cl += [p] + self.sel2v[selv] = p + else: + cl += [-p] + + self.enc.cnf.append(cl) + # self.enc.printLits(cl) + + def explain(self, sample, xtype='abd', explanation_result=None, smallest=False): + """ + Hypotheses minimization. + """ + explanation_result["Explaining"] = "IF {0} THEN {1}".format(' AND '.join(self.preamble), + self.target_name[self.enc.cmaj]) + self.prepare_selectors(sample) + + if xtype == 'abd': + # abductive (PI-) explanation + explanation_result = self.compute_axp(explanation_result) + else: + # contrastive explanation + explanation_result = self.compute_cxp(explanation_result) + + # delete sat solver + self.slv.delete() + self.slv = None + + return explanation_result + + def compute_axp(self, explanation_result, smallest=False): + """ + Compute an Abductive eXplanation + """ + self.assums = sorted(set(self.assums)) + + # create a SAT solver + self.slv = Solver(name="glucose3") + + # pass a CNF formula + self.slv.append_formula(self.enc.cnf) + + def minimal(): + vtaut = self.enc.newVar('Tautology') + # simple deletion-based linear search + for i, p in enumerate(self.assums): + to_test = [vtaut] + self.assums[:i] + self.assums[(i + 1):] + [-p, -self.sel2v[p]] + sat = self.slv.solve(assumptions=to_test) + if not sat: + self.assums[i] = -p + return + + if not smallest: + minimal() + else: + raise NotImplementedError('Smallest explanation is not yet implemented.') + # self.compute_smallest() + + expl = sorted([self.sel2fid[h] for h in self.assums if h > 0]) + assert len( + expl), 'Abductive explanation cannot be an empty-set! otherwise RF fcn is const, i.e. 
predicts only one class' + + preamble = [self.preamble[i] for i in expl] + explanation_result["Explanation"] = "IF {0} THEN {1}".format(' AND '.join(preamble), self.target_name[self.enc.cmaj]) + + return explanation_result + + def compute_cxp(self, explanation_result, smallest=True): + """ + Compute a Contrastive eXplanation + """ + self.assums = sorted(set(self.assums)) + if self.verbose: + print(' # hypos:', len(self.assums)) + + wcnf = WCNF() + for cl in self.enc.cnf: + wcnf.append(cl) + for p in self.assums: + wcnf.append([p], weight=1) + + if not smallest: + # mcs solver + self.slv = LBX(wcnf, use_cld=True, solver_name='g3') + mcs = self.slv.compute() + expl = sorted([self.sel2fid[self.assums[i - 1]] for i in mcs]) + else: + # mxsat solver + self.slv = RC2(wcnf) + model = self.slv.compute() + model = [p for p in model if abs(p) in self.assums] + expl = sorted([self.sel2fid[-p] for p in model if p < 0]) + + assert len(expl), 'Contrastive explanation cannot be an empty-set!' + preamble = [self.preamble[i] for i in expl] + pred = self.target_name[self.enc.cmaj] + explanation_result["Explanation"] = f'"IF {" AND ".join([f"!({p})" for p in preamble])} THEN !(class = {pred})"' + + return explanation_result + + def enumerate(self, sample, xtype='con', smallest=True): + """ + list all CXp's or AXp's + """ + if xtype == 'abd': + raise NotImplementedError('Enumerate abductive explanations is not yet implemented.') + + if 'assums' not in dir(self): + self.prepare_selectors(sample) + self.assums = sorted(set(self.assums)) + # + + # compute CXp's/AE's + if self.slv is None: + wcnf = WCNF() + for cl in self.enc.cnf: + wcnf.append(cl) + for p in self.assums: + wcnf.append([p], weight=1) + if smallest: + # incremental maxsat solver + self.slv = RC2(wcnf, adapt=True, exhaust=True, minz=True) + else: + # mcs solver + self.slv = LBX(wcnf, use_cld=True, solver_name='g3') + # self.slv = MCSls(wcnf, use_cld=True, solver_name='g3') + + if smallest: + print('smallest') + for model in self.slv.enumerate(block=-1): + # model = [p for p in model if abs(p) in self.assums] + expl = sorted([self.sel2fid[-p] for p in model if (p < 0 and (-p in self.assums))]) + cxp_feats = [f'f{j}' for j in expl] + advx = [] + for f in cxp_feats: + ps = [p for p in model if (p > 0 and (p in self.enc.ivars[f]))] + assert (len(ps) == 1) + advx.append(tuple([f, self.enc.nameVar(ps[0])])) + # yield expl + print(cxp_feats, advx) + yield advx + else: + print('LBX') + for mcs in self.slv.enumerate(): + expl = sorted([self.sel2fid[self.assums[i - 1]] for i in mcs]) + assumptions = [-p if (i in mcs) else p for i, p in enumerate(self.assums, 1)] + # for k, model in enumerate(self.slv.oracle.enum_models(assumptions), 1): + assert (self.slv.oracle.solve(assumptions)) + model = self.slv.oracle.get_model() + cxp_feats = [f'f{j}' for j in expl] + advx = [] + for f in cxp_feats: + ps = [p for p in model if (p > 0 and (p in self.enc.ivars[f]))] + assert (len(ps) == 1) + advx.append(tuple([f, self.enc.nameVar(ps[0])])) + yield advx + self.slv.block(mcs) + # yield expl + + # + self.slv.delete() + self.slv = None diff --git a/pages/application/application.py b/pages/application/application.py index 6d3303091ec5785980d9b9fc424137248e20b274..2edf40634df0cb01fa9c35eaceb234fca7352689 100644 --- a/pages/application/application.py +++ b/pages/application/application.py @@ -70,6 +70,9 @@ class Model: self.component = self.component_class(self.pretrained_model, info=self.model_info, type_info=model_info_filename) + def update_tree_to_plot(self, id_tree): + 
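+        # delegate to the active component; assumes the loaded component
+        # implements update_plotted_tree (here, the RandomForest component,
+        # which is expected to redraw the estimator with index id_tree)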
self.component.update_plotted_tree(id_tree) + def update_instance(self, instance): self.instance = instance self.list_expls, self.list_cont_expls = self.component.update_with_explicability(self.instance, self.enum, @@ -214,9 +217,16 @@ class View: id='cont_expl_choice', className="dropdown")])]) + self.tree_to_plot = html.Div(id="choosing_tree", hidden=True, + children=[html.H5("Choose a tree to plot: "), + html.Div(children=[dcc.Slider(0, 100, 1, + value=0, + id='choice_tree')])]) + self.layout = dbc.Row([dbc.Col([self.sidebar], width=3, class_name="sidebar"), dbc.Col([dbc.Row(id="graph", children=[]), - dbc.Row(self.expl_choice)], + dbc.Row(self.expl_choice), + dbc.Row(self.tree_to_plot)], width=5, class_name="column_graph"), dbc.Col(html.Main(id="explanation", children=[], hidden=True), width=4)]) diff --git a/utils.py b/utils.py index 4b76ea3834d9fc79d7147e5a0a245e4e4a50858d..fc6dfe1a5ce67a3a9e1f5092745f9b7529e7e39b 100644 --- a/utils.py +++ b/utils.py @@ -8,11 +8,9 @@ from dash import html from pages.application.RandomForest.utils import xrf from pages.application.RandomForest.utils.xrf import * - sys.modules['xrf'] = xrf -from pages.application.RandomForest.utils import options -from pages.application.RandomForest.utils.options import * +from pages.application.RandomForest.utils import options sys.modules['options'] = options @@ -63,7 +61,6 @@ def parse_contents_instance(contents, filename): features_names = str(features_names).strip().split(',') data = str(data).strip().split(',') data = list(tuple([features_names[i], np.float32(data[i])]) for i in range(len(data))) - print(data) elif '.txt' in filename: data = decoded.decode('utf-8') data = str(data).strip().split(',')
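A caveat on the tree-choice slider added above: dcc.Slider(0, 100, 1, ...) hard-codes the range to 0-100 regardless of how many estimators the loaded forest actually contains. A possible follow-up, sketched below under stated assumptions (the Output on the slider's 'max' property, the estimators() accessor on the pretrained model, and the callback wiring are illustrative, not part of this patch), is to resize the slider once a forest is loaded:

from dash.dependencies import Input, Output
from dash.exceptions import PreventUpdate

@app.callback(
    Output('choice_tree', 'max'),
    Input('graph', 'children'),
    prevent_initial_call=True
)
def resize_tree_slider(graph):
    # illustrative sketch: cap the slider at the last tree index of the
    # loaded forest instead of the fixed 0-100 range
    model = page_application.model
    if model.ml_model == "RandomForest" and model.pretrained_model is not None:
        return len(model.pretrained_model.estimators()) - 1
    raise PreventUpdate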