diff --git a/app.py b/app.py
index cea3cfac3d586d456743acd0078c17c35f1f5338..d4becfdf72a575dc49d33620792892869f08f070 100644
--- a/app.py
+++ b/app.py
@@ -19,28 +19,29 @@ app = dash.Dash(external_stylesheets=[dbc.themes.LUX], suppress_callback_excepti
 models_data = open('data_retriever.json')
 data = json.load(models_data)["data"]
 
-#For home directory
+# For home directory
 page_home = dbc.Row([html.H3("Welcome")])
-#For course directory
+# For course directory
 page_course = dbc.Row([])
-#For the application
-names_models, dict_components = extract_data(data)
-model_application = Model(names_models, dict_components)
+# For the application
+names_models, dict_components, dic_solvers, dic_xtypes = extract_data(data)
+model_application = Model(names_models, dict_components, dic_solvers, dic_xtypes)
 view_application = View(model_application)
 page_application = Application(view_application)
 
 app.layout = html.Div([
     dcc.Location(id='url', refresh=False),
-    html.Nav(id='navbar-container',
-             children=[dbc.NavbarSimple(
-                 children=[
-                     dbc.NavItem(dbc.NavLink("Home", id="home-link", href="/")),
-                     dbc.NavItem(dbc.NavLink("Course", id="course-link", href="/course")),
-                     dbc.NavItem(dbc.NavLink("Application on explainable AI", id="application-link", href="/application")),
-                 ],
-                 brand="FX ToolKit",
-                 color="primary",
-                 dark=True,)]),
+    html.Nav(id='navbar-container',
+             children=[dbc.NavbarSimple(
+                 children=[
+                     dbc.NavItem(dbc.NavLink("Home", id="home-link", href="/")),
+                     dbc.NavItem(dbc.NavLink("Course", id="course-link", href="/course")),
+                     dbc.NavItem(
+                         dbc.NavLink("Application on explainable AI", id="application-link", href="/application")),
+                 ],
+                 brand="FX ToolKit",
+                 color="primary",
+                 dark=True, )]),
     html.Div(id='page-content')
 ])
diff --git a/callbacks.py b/callbacks.py
index 25a794b3eb7b330dda86eeb322dc052bd9eb29b9..438d511ac3c1d1fee94043f75e0d4260a86f189b 100644
--- a/callbacks.py
+++ b/callbacks.py
@@ -1,6 +1,4 @@
 import dash
-import pandas as pd
-from dash import Input, Output, State
 from dash.dependencies import Input, Output, State
 from dash.exceptions import PreventUpdate
 
@@ -17,18 +15,28 @@ def register_callbacks(page_home, page_course, page_application, app):
         if pathname == '/':
             return page_home
         if pathname == '/application':
-            return page_application.view.layout
+            return page_application.view.layout
         if pathname == '/course':
             return page_course
 
     @app.callback(Output('home-link', 'active'),
-                  Output('course-link', 'active'),
-                  Output('application-link', 'active'),
-                  Input('url', 'pathname'))
+                  Output('course-link', 'active'),
+                  Output('application-link', 'active'),
+                  Input('url', 'pathname'))
     def navbar_state(pathname):
         active_link = ([pathname == f'/{i}' for i in page_list])
         return active_link[0], active_link[1], active_link[2]
 
+    @app.callback(Output('solver_sat', 'options'),
+                  Output('explanation_type', 'options'),
+                  Input('ml_model_choice', 'value'),
+                  prevent_initial_call=True
+                  )
+    def update_ml_type_options(value_ml_model):
+        model_application = page_application.model
+        model_application.update_ml_model(value_ml_model)
+        return model_application.solvers, model_application.xtypes
+
     @app.callback(
         Output('pretrained_model_filename', 'children'),
         Output('info_filename', 'children'),
@@ -49,84 +57,93 @@ def register_callbacks(page_home, page_course, page_application, app):
         Input('cont_expl_choice', 'value'),
         prevent_initial_call=True
     )
-    def update_ml_type(value_ml_model, pretrained_model_contents, pretrained_model_filename, model_info, model_info_filename, \
-                        instance_contents, instance_filename, enum, xtype, solver, expl_choice, cont_expl_choice):
+    def update_ml_type(value_ml_model, pretrained_model_contents, pretrained_model_filename, model_info,
+                       model_info_filename,
+                       instance_contents, instance_filename, enum, xtype, solver, expl_choice, cont_expl_choice):
         ctx = dash.callback_context
         if ctx.triggered:
             ihm_id = ctx.triggered[0]['prop_id'].split('.')[0]
             model_application = page_application.model
 
             # Choice of model
-            if ihm_id == 'ml_model_choice' :
+            if ihm_id == 'ml_model_choice':
                 model_application.update_ml_model(value_ml_model)
                 return None, None, None, None, None
 
             # Choice of pkl pretrained model
             elif ihm_id == 'ml_pretrained_model_choice':
-                if model_application.ml_model is None :
+                if model_application.ml_model is None:
                     raise PreventUpdate
                 graph = parse_contents_graph(pretrained_model_contents, pretrained_model_filename)
                 model_application.update_pretrained_model(graph)
-                if not model_application.add_info :
+                if not model_application.add_info:
                     model_application.update_pretrained_model_layout()
                     return pretrained_model_filename, None, None, model_application.component.network, None
-                else :
+                else:
                     return pretrained_model_filename, None, None, None, None
 
             # Choice of information for the model
             elif ihm_id == 'model_info_choice':
-                if model_application.ml_model is None :
+                if model_application.ml_model is None:
                     raise PreventUpdate
                 model_info = parse_contents_data(model_info, model_info_filename)
                 model_application.update_pretrained_model_layout_with_info(model_info, model_info_filename)
                 return pretrained_model_filename, model_info_filename, None, model_application.component.network, None
 
             # Choice of instance to explain
-            elif ihm_id == 'ml_instance_choice' :
-                if model_application.ml_model is None or model_application.pretrained_model is None or model_application.enum<=0 or model_application.xtype is None :
+            elif ihm_id == 'ml_instance_choice':
+                if model_application.ml_model is None or model_application.pretrained_model is None or model_application.enum <= 0 or model_application.xtype is None:
                     raise PreventUpdate
                 instance = parse_contents_instance(instance_contents, instance_filename)
                 model_application.update_instance(instance)
-                return pretrained_model_filename, model_info_filename, instance_filename, model_application.component.network, model_application.component.explanation
+                return pretrained_model_filename, model_info_filename, instance_filename, model_application.component.network, model_application.component.explanation
 
-            # Choice of number of expls
-            elif ihm_id == 'number_explanations' :
-                if model_application.ml_model is None or model_application.pretrained_model is None or len(model_application.instance)==0 or model_application.xtype is None:
+            # Choice of number of expls
+            elif ihm_id == 'number_explanations':
+                if model_application.ml_model is None or model_application.pretrained_model is None or len(
+                        model_application.instance) == 0 or model_application.xtype is None:
                     raise PreventUpdate
                 model_application.update_enum(enum)
-                return pretrained_model_filename, model_info_filename, instance_filename, model_application.component.network, model_application.component.explanation
+                return pretrained_model_filename, model_info_filename, instance_filename, model_application.component.network, model_application.component.explanation
 
-            # Choice of AxP or CxP
-            elif ihm_id == 'explanation_type' :
-                if model_application.ml_model is None or model_application.pretrained_model is None or len(model_application.instance)==0 or model_application.enum<=0 :
+            # Choice of AxP or CxP
+            elif ihm_id == 'explanation_type':
+                if model_application.ml_model is None or model_application.pretrained_model is None or len(
+                        model_application.instance) == 0 or model_application.enum <= 0:
                     raise PreventUpdate
                 model_application.update_xtype(xtype)
                 return pretrained_model_filename, model_info_filename, instance_filename, model_application.component.network, model_application.component.explanation
-
+
             # Choice of solver
-            elif ihm_id == 'solver_sat' :
-                if model_application.ml_model is None or model_application.pretrained_model is None or len(model_application.instance)==0 or model_application.enum<=0 or len(model_application.xtype)==0:
+            elif ihm_id == 'solver_sat':
+                if model_application.ml_model is None or model_application.pretrained_model is None or len(
+                        model_application.instance) == 0 or model_application.enum <= 0 or len(
+                        model_application.xtype) == 0:
                     raise PreventUpdate
                 model_application.update_solver(solver)
-                return pretrained_model_filename, model_info_filename, instance_filename, model_application.component.network, model_application.component.explanation
-
-            # Choice of AxP to draw
-            elif ihm_id == 'expl_choice' :
-                if model_application.ml_model is None or model_application.pretrained_model is None or len(model_application.instance)==0 or model_application.enum<=0 or len(model_application.xtype)==0:
+                return pretrained_model_filename, model_info_filename, instance_filename, model_application.component.network, model_application.component.explanation
+
+            # Choice of AxP to draw
+            elif ihm_id == 'expl_choice':
+                if model_application.ml_model is None or model_application.pretrained_model is None or len(
+                        model_application.instance) == 0 or model_application.enum <= 0 or len(
+                        model_application.xtype) == 0:
                     raise PreventUpdate
                 model_application.update_expl(expl_choice)
-                return pretrained_model_filename, model_info_filename, instance_filename, model_application.component.network, model_application.component.explanation
+                return pretrained_model_filename, model_info_filename, instance_filename, model_application.component.network, model_application.component.explanation
 
-            # Choice of CxP to draw
-            elif ihm_id == 'cont_expl_choice' :
-                if model_application.ml_model is None or model_application.pretrained_model is None or len(model_application.instance)==0 or model_application.enum<=0 or len(model_application.xtype)==0:
+            # Choice of CxP to draw
+            elif ihm_id == 'cont_expl_choice':
+                if model_application.ml_model is None or model_application.pretrained_model is None or len(
+                        model_application.instance) == 0 or model_application.enum <= 0 or len(
+                        model_application.xtype) == 0:
                     raise PreventUpdate
                 model_application.update_cont_expl(cont_expl_choice)
-                return pretrained_model_filename, model_info_filename, instance_filename, model_application.component.network, model_application.component.explanation
+                return pretrained_model_filename, model_info_filename, instance_filename, model_application.component.network, model_application.component.explanation
 
     @app.callback(
         Output('explanation', 'hidden'),
-        Output('interaction_graph', 'hidden'),
+        Output('interaction_graph', 'hidden'),
         Output('expl_choice', 'options'),
         Output('cont_expl_choice', 'options'),
         Input('explanation', 'children'),
@@ -134,17 +151,17 @@ def register_callbacks(page_home, page_course, page_application, app):
         prevent_initial_call=True
     )
     def layout_buttons_navigate_expls(explanation, explanation_type):
-        if explanation is None or len(explanation_type)==0:
+        if explanation is None or len(explanation_type) == 0:
             return True, True, {}, {}
         elif "AXp" not in explanation_type and "CXp" in explanation_type:
             return False, True, {}, {}
-        else :
+        else:
             options_expls = {}
             options_cont_expls = {}
             model_application = page_application.model
-            for i in range (len(model_application.list_expls)):
+            for i in range(len(model_application.list_expls)):
                 options_expls[str(model_application.list_expls[i])] = model_application.list_expls[i]
-            for i in range (len(model_application.list_cont_expls)):
+            for i in range(len(model_application.list_cont_expls)):
                 options_cont_expls[str(model_application.list_cont_expls[i])] = model_application.list_cont_expls[i]
             return False, False, options_expls, options_cont_expls
 
@@ -158,5 +175,5 @@ def register_callbacks(page_home, page_course, page_application, app):
         model_application.update_info_needed(add_info_model_choice)
         if add_info_model_choice:
             return False
-        else :
+        else:
             return True
diff --git a/data_retriever.json b/data_retriever.json
index 4f41926570e644654c47c43685059a4121b3af0e..5ef4f56851cd35ba4dfc85f6aea5860c14b5d05e 100644
--- a/data_retriever.json
+++ b/data_retriever.json
@@ -3,11 +3,26 @@
     [
         {
             "ml_type" : "DecisionTree",
-            "component" : "DecisionTreeComponent"
+            "component" : "DecisionTreeComponent",
+            "solvers" : [
+                "g3", "g4", "lgl", "mcb", "mcm", "mpl", "m22", "mc", "mgh"
+            ],
+            "xtypes" : {
+                "AXp": "Abductive Explanation", "CXp": "Contrastive Explanation"}
        },
        {
            "ml_type" : "NaiveBayes",
-           "component" : "NaiveBayesComponent"
+           "component" : "NaiveBayesComponent",
+           "solvers" : [],
+           "xtypes" : {
+               "AXp": "Abductive Explanation", "CXp": "Contrastive Explanation"}
+
+       },
+       {
+           "ml_type" : "RandomForest",
+           "component" : "RandomForestComponent",
+           "solvers" : ["LIME", "ANCHOR", "SHAP"],
+           "xtypes" : {"H": "Heuristic", "HV": "Heuristic and validation", "G": "Global"}
        }
     ]
diff --git a/pages/application/DecisionTree/DecisionTreeComponent.py b/pages/application/DecisionTree/DecisionTreeComponent.py
index f9d34e32c731e7216577b94948964e72b6b54ec5..c177652e7f2be781a7b1bdba4bccb94bec80e34d 100644
--- a/pages/application/DecisionTree/DecisionTreeComponent.py
+++ b/pages/application/DecisionTree/DecisionTreeComponent.py
@@ -1,36 +1,34 @@
-from os import path
-import base64
-
-import dash_bootstrap_components as dbc
 import dash_interactive_graphviz
 import numpy as np
-from dash import dcc, html
-from pages.application.DecisionTree.utils.upload_tree import UploadedDecisionTree
+from dash import html
+
 from pages.application.DecisionTree.utils.data import Data
 from pages.application.DecisionTree.utils.dtree import DecisionTree
-
 from pages.application.DecisionTree.utils.dtviz import (visualize,
                                                         visualize_expl,
                                                         visualize_instance,
                                                         visualize_contrastive_expl)
+from pages.application.DecisionTree.utils.upload_tree import UploadedDecisionTree
+
 
 class DecisionTreeComponent():
 
     def __init__(self, tree, type_tree='SKL', info=None, type_info=''):
 
-        if info is not None and '.csv' in type_info:
+        if info is not None and '.csv' in type_info:
             self.categorical = True
             data = Data(info)
             fvmap = data.mapping_features()
             feature_names = data.names[:-1]
-            self.uploaded_dt = UploadedDecisionTree(tree, type_tree, maxdepth=tree.get_depth(), feature_names=feature_names, nb_classes=tree.n_classes_)
+            self.uploaded_dt = UploadedDecisionTree(tree, type_tree, maxdepth=tree.get_depth(),
+                                                    feature_names=feature_names, nb_classes=tree.n_classes_)
             self.dt_format, self.map, features_names_mapping = self.uploaded_dt.dump(fvmap, feat_names=feature_names)
 
-        elif info is not None and '.txt' in type_info :
+        elif info is not None and '.txt' in type_info:
             self.categorical = True
             fvmap = {}
             feature_names = []
-            for i,line in enumerate(info.split('\n')):
+            for i, line in enumerate(info.split('\n')):
                 fid, TYPE = line.split(',')[:2]
                 dom = line.split(',')[2:]
                 assert (fid not in feature_names)
@@ -38,36 +36,38 @@ class DecisionTreeComponent():
                 assert (TYPE in ['Binary', 'Categorical'])
                 fvmap[f'f{i}'] = dict()
                 dom = sorted(dom)
-                for j,v in enumerate(dom):
+                for j, v in enumerate(dom):
                     fvmap[f'f{i}'][j] = (fid, True, v)
 
-            self.uploaded_dt = UploadedDecisionTree(tree, type_tree, maxdepth=tree.get_depth(), feature_names=feature_names, nb_classes=tree.n_classes_)
+            self.uploaded_dt = UploadedDecisionTree(tree, type_tree, maxdepth=tree.get_depth(),
+                                                    feature_names=feature_names, nb_classes=tree.n_classes_)
             self.dt_format, self.map, features_names_mapping = self.uploaded_dt.dump(fvmap, feat_names=feature_names)
 
-        else :
+        else:
             self.categorical = False
             try:
                 feature_names = tree.feature_names_in_
             except:
                 feature_names = [f'f{i}' for i in range(tree.n_features_in_)]
-            self.uploaded_dt = UploadedDecisionTree(tree, type_tree, maxdepth=tree.get_depth(), feature_names=feature_names, nb_classes=tree.n_classes_)
+            self.uploaded_dt = UploadedDecisionTree(tree, type_tree, maxdepth=tree.get_depth(),
+                                                    feature_names=feature_names, nb_classes=tree.n_classes_)
             self.dt_format, self.map, features_names_mapping = self.uploaded_dt.convert_dt(feat_names=feature_names)
-
+
         self.mapping_instance = self.create_fvmap_inverse(features_names_mapping)
-        self.dt = DecisionTree(from_dt=self.dt_format, mapfile = self.map, feature_names = feature_names)
+        self.dt = DecisionTree(from_dt=self.dt_format, mapfile=self.map, feature_names=feature_names)
         dot_source = visualize(self.dt)
-        self.network = html.Div([dash_interactive_graphviz.DashInteractiveGraphviz(dot_source=dot_source, style = {"width": "60%",
-                                                                                                                   "height": "90%",
-                                                                                                                   "background-color": "transparent"})])
+        self.network = html.Div(
+            [dash_interactive_graphviz.DashInteractiveGraphviz(dot_source=dot_source, style={"width": "60%",
+                                                                                             "height": "90%",
+                                                                                             "background-color": "transparent"})])
         self.explanation = []
 
-
     def create_fvmap_inverse(self, instance):
-        def create_fvmap_inverse_with_info(features_names_mapping) :
+        def create_fvmap_inverse_with_info(features_names_mapping):
            mapping_instance = {}
-            for feat in features_names_mapping :
+            for feat in features_names_mapping:
                feat_dic = {}
                feature_description = feat.split(',')
-                name_feat, id_feat = feature_description[1].split(':')
+                name_feat, id_feat = feature_description[1].split(':')
 
                for mapping in feature_description[2:]:
                    real_value, mapped_value = mapping.split(':')
@@ -76,95 +76,92 @@ class DecisionTreeComponent():
 
            return mapping_instance
 
-        def create_fvmap_inverse_threashold(features_names_mapping) :
+        def create_fvmap_inverse_threashold(features_names_mapping):
            mapping_instance = {}
-            for feat in features_names_mapping :
+            for feat in features_names_mapping:
                feature_description = feat.split(',')
-                name_feat, id_feat = feature_description[1].split(':')
+                name_feat, id_feat = feature_description[1].split(':')
                mapping_instance[name_feat] = float(feature_description[2].split(':')[0])
 
            return mapping_instance
 
-        if self.categorical :
+        if self.categorical:
            return create_fvmap_inverse_with_info(instance)
-        else :
+        else:
            return create_fvmap_inverse_threashold(instance)
 
-
     def translate_instance(self, instance):
        def translate_instance_categorical(instance):
            instance_translated = []
-            for feat, real_value in instance :
+            for feat, real_value in instance:
                instance_translated.append((feat, self.mapping_instance[feat][real_value]))
            return instance_translated
-
+
        def translate_instance_threasholds(instance):
            instance_translated = []
-            for feat, real_value in instance :
-                try:
+            for feat, real_value in instance:
+                try:
                    if real_value <= self.mapping_instance[feat]:
                        instance_translated.append((feat, 0))
-                    else :
+                    else:
                        instance_translated.append((feat, 1))
                except:
                    instance_translated.append((feat, real_value))
            return instance_translated
 
-        if self.categorical :
+        if self.categorical:
            return translate_instance_categorical(instance)
-        else :
+        else:
            return translate_instance_threasholds(instance)
 
+    def update_with_explicability(self, instance, enum, xtype, solver):
 
-    def update_with_explicability(self, instance, enum, xtype, solver) :
-
        instance_translated = self.translate_instance(instance)
 
        self.explanation = []
-        list_explanations_path=[]
-        explanation = self.dt.explain(instance_translated, enum=enum, xtype = xtype, solver=solver)
+        list_explanations_path = []
+        explanation = self.dt.explain(instance_translated, enum=enum, xtype=xtype, solver=solver)
 
        dot_source = visualize_instance(self.dt, instance_translated)
        self.network = html.Div([dash_interactive_graphviz.DashInteractiveGraphviz(
-            dot_source=dot_source, style = {"width": "50%",
-                                            "height": "80%",
-                                            "background-color": "transparent"}
+            dot_source=dot_source, style={"width": "50%",
+                                          "height": "80%",
+                                          "background-color": "transparent"}
        )])
 
-        #Creating a clean and nice text component
-        #instance plotting
+        # Creating a clean and nice text component
+        # instance plotting
        self.explanation.append(html.H4("Instance : \n"))
-        self.explanation.append(html.P(str([str(instance[i]) for i in range (len(instance))])))
-        for k in explanation.keys() :
-            if k != "List of path explanation(s)" and k!= "List of path contrastive explanation(s)" :
-                if k in ["List of abductive explanation(s)","List of contrastive explanation(s)"] :
+        self.explanation.append(html.P(str([str(instance[i]) for i in range(len(instance))])))
+        for k in explanation.keys():
+            if k != "List of path explanation(s)" and k != "List of path contrastive explanation(s)":
+                if k in ["List of abductive explanation(s)", "List of contrastive explanation(s)"]:
                    self.explanation.append(html.H4(k))
-                    for expl in explanation[k] :
+                    for expl in explanation[k]:
                        self.explanation.append(html.Hr())
                        self.explanation.append(html.P(expl))
                    self.explanation.append(html.Hr())
-                else :
+                else:
                    self.explanation.append(html.P(k + explanation[k]))
-            else :
+            else:
                list_explanations_path = explanation["List of path explanation(s)"]
                list_contrastive_explanations_path = explanation["List of path contrastive explanation(s)"]
 
        return list_explanations_path, list_contrastive_explanations_path
 
-    def draw_explanation(self, instance, expl) :
+    def draw_explanation(self, instance, expl):
        instance = self.translate_instance(instance)
        dot_source = visualize_expl(self.dt, instance, expl)
        self.network = html.Div([dash_interactive_graphviz.DashInteractiveGraphviz(
-            dot_source=dot_source,
-            style = {"width": "50%",
-                     "height": "80%",
-                     "background-color": "transparent"})])
+            dot_source=dot_source,
+            style={"width": "50%",
+                   "height": "80%",
+                   "background-color": "transparent"})])
 
-    def draw_contrastive_explanation(self, instance, cont_expl) :
+    def draw_contrastive_explanation(self, instance, cont_expl):
        instance = self.translate_instance(instance)
        dot_source = visualize_contrastive_expl(self.dt, instance, cont_expl)
        self.network = html.Div([dash_interactive_graphviz.DashInteractiveGraphviz(
-            dot_source=dot_source,
-            style = {"width": "50%",
-                     "height": "80%",
-                     "background-color": "transparent"})])
+            dot_source=dot_source,
+            style={"width": "50%",
+                   "height": "80%",
+                   "background-color": "transparent"})])
diff --git a/pages/application/NaiveBayes/utils/generator_cnbc.py b/pages/application/NaiveBayes/utils/generator_cnbc.py
deleted file mode 100644
index 734e2759b160118e3b3cba50231082ee25179e5d..0000000000000000000000000000000000000000
--- a/pages/application/NaiveBayes/utils/generator_cnbc.py
+++ /dev/null
@@ -1,175 +0,0 @@
-import argparse
-import pandas as pd
-from sklearn.model_selection import train_test_split
-from sklearn.naive_bayes import CategoricalNB
-from sklearn.preprocessing import LabelEncoder
-from sklearn.metrics import accuracy_score
-import pickle
-import os
-import numpy as np
-from scipy.special import logsumexp
-
-def predict_proba(X, clf, precision=None):
-    if precision == None:
-        feature_priors = clf.feature_log_prob_
-        class_priors = clf.class_log_prior_
-    else:
-        feature_priors = list(map(lambda x: np.log(np.clip(np.round(np.exp(x), precision), 1e-12, None)), clf.feature_log_prob_))
-        class_priors = list(map(lambda x: np.log(np.clip(np.round(np.exp(x), precision), 1e-12, None)), clf.class_log_prior_))
-    jll = np.zeros((X.shape[0], 2))
-    for i in range(X.shape[1]):
-        indices = X.values[:, i]
-        jll += feature_priors[i][:, indices].T
-    total_ll = jll + class_priors
-
-    log_prob_x = logsumexp(total_ll, axis=1)
-    return np.argmax(np.exp(total_ll - np.atleast_2d(log_prob_x).T), axis=1)
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser('Categorical NBC generator.')
-    parser.add_argument('-d', type=str, help="dataset path")
-    parser.add_argument('-op', type=str, help="output pickle classifier path", default="")
-    parser.add_argument('-oc', type=str, help="output NBC classifier path", default="")
-    parser.add_argument('-oi', type=str, help="output inst path", default="")
-    parser.add_argument('-ox', type=str, help="output xmap path", default="")
-    parser.add_argument('-v', type=int, help="verbose", default=0)
-    parser.add_argument('-p', type=int, help="precision of classifier", default=None)
-    args = parser.parse_args()
-
-    df = pd.read_csv(args.d)
-    df.columns = [s.strip() for s in df.columns.values]
-
-    encoders = dict()
-    min_categories = dict()
-    for column in df.columns:
-        if df[column].apply(type).eq(str).all():
-            df[column] = df[column].str.strip()
-            enc = LabelEncoder()
-            enc.fit(df[column])
-            df[column] = enc.transform(df[column])
-            min_categories[column] = len(enc.classes_)
-            encoders[column] = enc
-
-    X = df.drop(df.columns[-1], axis=1)
-    y = df[df.columns[-1]]
-
-    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)
-    clf = CategoricalNB(min_categories=np.array(list(min_categories.values())).astype(int)[:-1])
-    clf.fit(X_train, y_train)
-
-    if args.v:
-        print("----------------------")
-        print("Initial accuracy:")
-        print("Train accuracy: ", accuracy_score(clf.predict(X_train), y_train))
-        print("Test accuracy: ", accuracy_score(clf.predict(X_test), y_test))
-        print("----------------------")
-
-        if args.p is not None:
-            print("----------------------")
-            print("Rounded accuracy (precision=" + str(args.p) + "):")
-            print("Train accuracy: ", accuracy_score(predict_proba(X_train, clf, args.p), y_train))
-            print("Test accuracy: ", accuracy_score(predict_proba(X_test, clf, args.p), y_test))
-            print("----------------------")
-
-    if args.ox:
-        if not os.path.exists(os.path.dirname(args.ox)):
-            os.makedirs(os.path.dirname(args.ox))
-
-        with open(args.ox, "w") as f:
-            # --------- Target -----------
-            enc = encoders[y.name]
-            C = len(enc.classes_)
-            f.write(str(C) + "\n")
-            for category, target in zip(enc.classes_, enc.transform(enc.classes_)):
-                f.write(str(target) + " " + str(category) + "\n")
-
-            # --------- Features ---------
-            n = X.shape[1]
-            f.write(str(n) + "\n")
-
-            f.write("0" + "\n")
-            f.write(str(n) + "\n")
-            for i, feature in enumerate(X.columns):
-                f.write(str(i) + " " + str(feature) + "\n")
-                enc = encoders[feature]
-                f.write(str(len(enc.classes_)) + "\n")
-                for category, label in zip(enc.classes_, enc.transform(enc.classes_)):
-                    f.write(str(label) + " " + str(category) + "\n")
-
-            """
-            FUTURE DEVELOPMENT
-            # Get types of features (categorical or continuous (=real-valued))
-            dtypes = dict()
-            for column in X.columns:
-                if len(X[column].unique()) < (X.shape[0] / 3):
-                    dtypes[column] = "categorical"
-                else:
-                    dtypes[column] = "continuous"
-            # Real-valued features
-            f.write(str(len(dict((k, v) for k, v in dtypes.items() if v == "continuous"))) + "\n")
-            for i, (feature, dtype) in enumerate(dtypes.items()):
-                if dtype == "continuous":
-                    f.write(str(i) + " " + str(feature) + "\n")
-                    enc = encoders[feature]
-                    f.write(str(len(enc.classes_)) + "\n")
-                    for category, label in zip(enc.classes_, enc.transform(enc.classes_)):
-                        f.write(str(label) + " " + str(category) + "\n")
-
-            # Categorical features
-            f.write(str(len(dict((k, v) for k, v in dtypes.items() if v == "categorical"))) + "\n")
-            for i, (feature, dtype) in enumerate(dtypes.items()):
-                if dtype == "categorical":
-                    f.write(str(i) + " " + str(feature) + "\n")
-                    enc = encoders[feature]
-                    f.write(str(len(enc.classes_)) + "\n")
-                    for category, label in zip(enc.classes_, enc.transform(enc.classes_)):
-                        f.write(str(label) + " " + str(category) + "\n")
-            """
-
-    if args.op:
-        if not os.path.exists(os.path.dirname(args.op)):
-            os.makedirs(os.path.dirname(args.op))
-        pickle.dump(clf, open(args.op, "wb"))
-
-    if args.oc:
-        if not os.path.exists(os.path.dirname(args.oc)):
-            os.makedirs(os.path.dirname(args.oc))
-
-        with open(args.oc, "w") as f:
-            n = len(clf.classes_)
-            f.write(str(n) + "\n")
-            class_priors = np.exp(clf.class_log_prior_)
-            for i in class_priors:
-                if args.p is not None:
-                    f.write(str(np.round(np.format_float_positional(i, trim='-'), args.p)) + "\n")
-                else:
-                    f.write(str(np.format_float_positional(i, trim='-')) + "\n")
-            m = X.shape[1]
-            f.write(str(m) + "\n")
-
-            feature_log_priors = clf.feature_log_prob_
-
-            for feature_log_prior in feature_log_priors:
-                feature_prior = np.exp(feature_log_prior)
-                f.write(str(feature_prior.shape[1]) + "\n")
-                for feature_class_prior in feature_prior:
-                    for v in feature_class_prior:
-                        if args.p is not None:
-                            f.write(str(np.round(np.format_float_positional(v, trim='-'), args.p)) + " ")
-                        else:
-                            f.write(str(np.format_float_positional(v, trim='-')) + " ")
-                    f.write("\n")
-
-    if args.oi:
-        if not os.path.exists(os.path.dirname(args.oi)):
-            os.makedirs(os.path.dirname(args.oi))
-
-        name = next(s for s in reversed(args.oi.split("/")) if s)
-        for i, (_, sample) in enumerate(X.iterrows()):
-            path = os.path.join(args.oi, name + "." + str(i+1) + ".txt")
-            with open(path, "w") as f:
-                f.write(str(len(sample)) + "\n")
-                for value in sample:
-                    f.write(str(value) + "\n")
-                f.write(str(clf.predict([sample])[0]) + "\n")
-
diff --git a/pages/application/NaiveBayes/utils/test.pl b/pages/application/NaiveBayes/utils/test.pl
deleted file mode 100644
index 0736495ee74c54712523b07d421d015293610012..0000000000000000000000000000000000000000
--- a/pages/application/NaiveBayes/utils/test.pl
+++ /dev/null
@@ -1,3 +0,0 @@
-print "Called::\n";
-
-my $f_err_msg = "Please check file name, existence, permissions, etc.\n";
diff --git a/pages/application/RandomForest/RandomForestComponent.py b/pages/application/RandomForest/RandomForestComponent.py
new file mode 100644
index 0000000000000000000000000000000000000000..50dd5d332e9c3b3e1e66ed57007f10415152d221
--- /dev/null
+++ b/pages/application/RandomForest/RandomForestComponent.py
@@ -0,0 +1,83 @@
+import base64
+
+import dash_bootstrap_components as dbc
+import numpy as np
+from dash import dcc, html
+
+from pages.application.RandomForest.utils.data import Data
+from pages.application.RandomForest.utils.anchor_wrap import anchor_call
+from pages.application.RandomForest.utils.lime_wrap import lime_call
+from pages.application.RandomForest.utils.shap_wrap import shap_call
+from pages.application.RandomForest.utils.xgbooster import XGBooster, preprocess_dataset
+from pages.application.RandomForest.utils.xgbrf import XGBRandomForest
+
+
+class RandomForestComponent:
+
+    def __init__(self, model, type_model='SKL', info=None, type_info=''):
+
+        if info is not None and '.csv' in type_info:
+            self.data = Data(info)
+
+        # Conversion model
+        if type_model == "RF":
+            self.random_forest = XGBRandomForest(info, from_model=model)
+        else:
+            self.random_forest = XGBooster(info, from_model=model)
+
+        # self.random_forest.encode(test_on=info)
+
+        self.map_file = ""
+
+        self.network = html.Div([])
+        self.explanation = []
+
+    def update_with_explicability(self, instance, enum_feats=None, validation=None, xtype=None, solver=None, ):
+
+        # Call explanation
+        if not enum_feats and self.data is not None:
+            enum_feats = len(self.data.names) - 1
+
+        expl = self.random_forest.explain(instance,
+                                          use_lime=lime_call if solver == "lime" else None,
+                                          use_anchor=anchor_call if solver == "anchor" else None,
+                                          use_shap=shap_call if solver == "shap" else None,
+                                          nof_feats=enum_feats)
+
+        if validation:
+            coex = self.random_forest.validate(instance, expl)
+            if coex:
+                # repairing the local explanation
+                gexpl = self.random_forest.explain(instance, expl_ext=expl, prefer_ext=True)
+            else:
+                # an attempt to refine the local explanation further
+                gexpl = self.random_forest.explain(instance, expl_ext=expl)
+
+        print(expl)
+
+        self.explanation = []
+        list_explanations_path = []
+        explanation = {}
+
+        self.network = html.Div([])
+
+        # Creating a clean and nice text component
+        # instance plotting
+        self.explanation.append(html.H4("Instance : \n"))
+        self.explanation.append(html.P(str([str(instance[i]) for i in range(len(instance))])))
+        for k in explanation.keys():
+            if k != "List of path explanation(s)" and k != "List of path contrastive explanation(s)":
+                if k in ["List of abductive explanation(s)", "List of contrastive explanation(s)"]:
+                    self.explanation.append(html.H4(k))
+                    for expl in explanation[k]:
+                        self.explanation.append(html.Hr())
+                        self.explanation.append(html.P(expl))
+                    self.explanation.append(html.Hr())
+                else:
+                    self.explanation.append(html.P(k + explanation[k]))
+            else:
+                list_explanations_path = explanation["List of path explanation(s)"]
+                list_contrastive_explanations_path = explanation["List of path contrastive explanation(s)"]
+
+        return list_explanations_path, list_contrastive_explanations_path
diff --git a/pages/application/RandomForest/utils/anchor_wrap/__init__.py b/pages/application/RandomForest/utils/anchor_wrap/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d4bcf2c92b14c5fdda52e9d291aa6995e713bce
--- /dev/null
+++ b/pages/application/RandomForest/utils/anchor_wrap/__init__.py
@@ -0,0 +1 @@
+from .anchor_wrap import *
diff --git a/pages/application/RandomForest/utils/anchor_wrap/anchor_wrap.py b/pages/application/RandomForest/utils/anchor_wrap/anchor_wrap.py
new file mode 100644
index 0000000000000000000000000000000000000000..f66ec0a8fd48dc0e25b2760968d20e16b392968f
--- /dev/null
+++ b/pages/application/RandomForest/utils/anchor_wrap/anchor_wrap.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python
+#-*- coding:utf-8 -*-
+##
+## anchor_wrap.py (reuses parts of the code of SHAP)
+##
+## Created on: Jan 6, 2019
+## Author: Nina Narodytska, Alexey Ignatiev
+## E-mail: narodytska@vmware.com, aignatiev@ciencias.ulisboa.pt
+##
+
+#
+#==============================================================================
+from __future__ import print_function
+import json
+import numpy as np
+import xgboost as xgb
+import math
+import resource
+from anchor import utils
+from anchor import anchor_tabular
+import sklearn
+import sklearn.ensemble
+
+
+#
+#==============================================================================
+def anchor_call(xgb, sample=None, nb_samples=5, feats='all',
+                nb_features_in_exp=5, threshold=0.95):
+
+    timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
+            resource.getrusage(resource.RUSAGE_SELF).ru_utime
+
+    # we need a way to say that features are categorical?
+    # we do not have this information.
+    explainer = anchor_tabular.AnchorTabularExplainer(
+        class_names=xgb.target_name,
+        feature_names=xgb.feature_names,
+        train_data=xgb.X,
+        categorical_names=xgb.categorical_names if xgb.use_categorical else {})
+    # if (len(xgb.X_test) != 0):
+    #     explainer.fit(xgb.X_train, xgb.Y_train, xgb.X_test, xgb.Y_test)
+    # else:
+    #     explainer.fit(xgb.X_train, xgb.Y_train, xgb.X_train, xgb.Y_train)
+    predict_fn_xgb = lambda x: xgb.model.predict(xgb.transform(x)).astype(int)
+
+    f2imap = {}
+    for i, f in enumerate(xgb.feature_names):
+        f2imap[f.strip()] = i
+
+    if (sample is not None):
+        try:
+            feat_sample = np.asarray(sample, dtype=np.float32)
+        except Exception as inst:
+            print("Cannot parse input sample:", sample, inst)
+            exit()
+        print("\n\n\nStarting Anchor explainer... \nConsidering a sample with features:", feat_sample)
+        if not (len(feat_sample) == len(xgb.X_train[0])):
+            print("Unmatched features are not supported: The number of features in a sample {} is not equal to the number of features in this benchmark {}".format(len(feat_sample), len(xgb.X_train[0])))
+            exit()
+
+        # compute boost predictions
+        feat_sample_exp = np.expand_dims(feat_sample, axis=0)
+        feat_sample_exp = xgb.transform(feat_sample_exp)
+        y_pred = xgb.model.predict(feat_sample_exp)[0]
+        y_pred_prob = xgb.model.predict_proba(feat_sample_exp)[0]
+        # hack: testing that we use the same one-hot encoding
+        # test_feat_sample_exp = explainer.encoder.transform(feat_sample_exp)
+        test_y_pred = xgb.model.predict(feat_sample_exp)[0]
+        test_y_pred_prob = xgb.model.predict_proba(feat_sample_exp)[0]
+        assert(np.allclose(y_pred_prob, test_y_pred_prob))
+        print('Prediction: ', explainer.class_names[predict_fn_xgb(feat_sample.reshape(1, -1))[0]])
+        # exp = explainer.explain_instance(feat_sample, xgb.model.predict, threshold=threshold)
+        print('sample ====== ', feat_sample)
+        exp = explainer.explain_instance(feat_sample, predict_fn_xgb, threshold=threshold)
+        print('Anchor: %s' % (' AND '.join(exp.names())))
+        print('Precision: %.2f' % exp.precision())
+        print('Coverage: %.2f' % exp.coverage())
+
+        # explanation
+        expl = []
+
+        if (xgb.use_categorical):
+            for k, v in enumerate(exp.features()):
+                expl.append(v)
+                print("Clause ", k, end=": ")
+                print("feature (", v, ",", explainer.feature_names[v], end="); ")
+                print("value (", feat_sample[v], ",", explainer.categorical_names[v][int(feat_sample[v])], ")")
+        else:
+            print("We only support datasets with categorical features for Anchor. Please pre-process your data.")
+            exit()
+
+        timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
+                resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer
+        print('  time: {0:.2f}'.format(timer))
+
+        return sorted(expl)
+
+    ###################################### TESTING
+    max_sample = nb_samples
+    y_pred_prob = xgb.model.predict_proba(xgb.X_test)
+    y_pred = xgb.model.predict(xgb.X_test)
+
+    nb_tests = min(max_sample, len(xgb.Y_test))
+    top_labels = 1
+    for sample in range(nb_tests):
+        np.set_printoptions(precision=2)
+        feat_sample = xgb.X_test[sample]
+        print("Considering a sample with features:", feat_sample)
+        if (False):
+            feat_sample[4] = 3000
+            y_pred_prob_sample = xgb.model.predict_proba([feat_sample])
+            print(y_pred_prob_sample)
+        print("\t Predictions:", y_pred_prob[sample])
+        exp = explainer.explain_instance(feat_sample,
+                                         predict_fn_xgb,
+                                         num_features= xgb.num_class,
+                                         top_labels = 1,
+                                         labels = list(range(xgb.num_class)))
+        for i in range(xgb.num_class):
+            if (i != y_pred[sample]):
+                continue
+            print("\t \t Explanations for the winner class", i, " (xgboost confidence = ", y_pred_prob[sample][i], ")")
+            print("\t \t Features in explanations: ", exp.as_list(label=i))
+    timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
+            resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer
+    print('  time: {0:.2f}'.format(timer))
+    return
diff --git a/pages/application/RandomForest/utils/data.py b/pages/application/RandomForest/utils/data.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c94e3da69d365d8270afa00f5f8fa7db1506ab7
--- /dev/null
+++ b/pages/application/RandomForest/utils/data.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env python
+#-*- coding:utf-8 -*-
+##
+## data.py
+##
+## Created on: Sep 20, 2017
+## Author: Alexey Ignatiev, Nina Narodytska
+## E-mail: aignatiev@ciencias.ulisboa.pt, narodytska@vmware.com
+##
+
+#
+#==============================================================================
+from __future__ import print_function
+import collections
+import itertools
+import pickle
+import six
+from six.moves import range
+import numpy as np
+
+
+#
+#==============================================================================
+class Data(object):
+    """
+    Class for representing data (transactions).
+    """
+
+    def __init__(self, filename=None, fpointer=None, mapfile=None,
+                 separator=' ', use_categorical = False):
+        """
+        Constructor and parser.
+        """
+
+        self.names = None
+        self.nm2id = None
+        self.samps = None
+        self.wghts = None
+        self.feats = None
+        self.fvmap = None
+        self.ovmap = {}
+        self.fvars = None
+        self.fname = filename
+        self.mname = mapfile
+        self.deleted = set([])
+
+        if filename:
+            with open(filename, 'r') as fp:
+                self.parse(fp, separator)
+        elif fpointer:
+            self.parse(fpointer, separator)
+
+        if self.mname:
+            self.read_orig_values()
+
+        # check if we have extra info about categorical_features
+        if (use_categorical):
+            extra_file = filename+".pkl"
+            try:
+                f = open(extra_file, "rb")
+                print("Attempt: loading extra data from ", extra_file)
+                extra_info = pickle.load(f)
+                print("loaded")
+                f.close()
+                self.categorical_features = extra_info["categorical_features"]
+                self.categorical_names = extra_info["categorical_names"]
+                self.class_names = extra_info["class_names"]
+                self.categorical_onehot_names = extra_info["categorical_names"].copy()
+
+                for i, name in enumerate(self.class_names):
+                    self.class_names[i] = str(name).replace("b'","'")
+                for c in self.categorical_names.items():
+                    clean_feature_names = []
+                    for i, name in enumerate(c[1]):
+                        name = str(name).replace("b'","'")
+                        clean_feature_names.append(name)
+                    self.categorical_names[c[0]] = clean_feature_names
+
+            except Exception as e:
+                f.close()
+                print("Please provide info about categorical features or omit option -c", e)
+                exit()
+
+    def parse(self, fp, separator):
+        """
+        Parse input file.
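+        Expected format (inferred from the parsing code below): the first
+        line holds the feature names joined by ``separator``; every
+        remaining line is one sample, with duplicates collapsed into weights.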
+        """
+
+        # reading data set from file
+        lines = fp.readlines()
+
+        # reading preamble
+        self.names = lines[0].strip().split(separator)
+        self.feats = [set([]) for n in self.names]
+        del(lines[0])
+
+        # filling name to id mapping
+        self.nm2id = {name: i for i, name in enumerate(self.names)}
+
+        self.nonbin2bin = {}
+        for name in self.nm2id:
+            spl = name.rsplit(':',1)
+            if (spl[0] not in self.nonbin2bin):
+                self.nonbin2bin[spl[0]] = [name]
+            else:
+                self.nonbin2bin[spl[0]].append(name)
+
+        # reading training samples
+        self.samps, self.wghts = [], []
+
+        for line, w in six.iteritems(collections.Counter(lines)):
+            sample = line.strip().split(separator)
+            for i, f in enumerate(sample):
+                if f:
+                    self.feats[i].add(f)
+            self.samps.append(sample)
+            self.wghts.append(w)
+
+        # direct and opposite mappings for items
+        idpool = itertools.count(start=0)
+        FVMap = collections.namedtuple('FVMap', ['dir', 'opp'])
+        self.fvmap = FVMap(dir={}, opp={})
+
+        # mapping features to ids
+        for i in range(len(self.names) - 1):
+            feats = sorted(list(self.feats[i]), reverse=True)
+            if len(feats) > 2:
+                for l in feats:
+                    self.fvmap.dir[(self.names[i], l)] = l
+            else:
+                self.fvmap.dir[(self.names[i], feats[0])] = 1
+                if len(feats) == 2:
+                    self.fvmap.dir[(self.names[i], feats[1])] = 0
+
+        # opposite mapping
+        for key, val in six.iteritems(self.fvmap.dir):
+            self.fvmap.opp[val] = key
+
+        # determining feature variables (excluding class variables)
+        for v, pair in six.iteritems(self.fvmap.opp):
+            if pair[0] == self.names[-1]:
+                self.fvars = v - 1
+                break
+
+    def read_orig_values(self):
+        """
+        Read original values for all the features.
+        (from a separate CSV file)
+        """
+
+        self.ovmap = {}
+
+        for line in open(self.mname, 'r'):
+            featval, bits = line.strip().split(',')
+            feat, val = featval.split(':')
+
+            for i, b in enumerate(bits):
+                f = '{0}:b{1}'.format(feat, i + 1)
+                v = self.fvmap.dir[(f, '1')]
+
+                if v not in self.ovmap:
+                    self.ovmap[v] = [feat]
+
+                if -v not in self.ovmap:
+                    self.ovmap[-v] = [feat]
+
+                self.ovmap[v if b == '1' else -v].append(val)
diff --git a/pages/application/RandomForest/utils/lime_wrap/__init__.py b/pages/application/RandomForest/utils/lime_wrap/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..32487979bf8f14f3ab733fa02b82973843e3078b
--- /dev/null
+++ b/pages/application/RandomForest/utils/lime_wrap/__init__.py
@@ -0,0 +1 @@
+from .lime_wrap import *
diff --git a/pages/application/RandomForest/utils/lime_wrap/lime_wrap.py b/pages/application/RandomForest/utils/lime_wrap/lime_wrap.py
new file mode 100644
index 0000000000000000000000000000000000000000..0146ed1f3ae756ea8a0d7bf0f8d6a79449b951fb
--- /dev/null
+++ b/pages/application/RandomForest/utils/lime_wrap/lime_wrap.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python
+#-*- coding:utf-8 -*-
+##
+## lime_wrap.py (reuses parts of the code of SHAP)
+##
+## Created on: Dec 12, 2018
+## Author: Nina Narodytska, Alexey Ignatiev
+## E-mail: narodytska@vmware.com, aignatiev@ciencias.ulisboa.pt
+##
+
+#
+#==============================================================================
+import json
+import numpy as np
+import xgboost as xgb
+import math
+import lime
+import lime.lime_tabular
+import resource
+
+
+#
+#==============================================================================
+def lime_call(xgb, sample = None, nb_samples = 5, feats='all',
+              nb_features_in_exp=5):
+
+    timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
+            resource.getrusage(resource.RUSAGE_SELF).ru_utime
+
+    # we need a way to say that features are categorical?
+    # we do not have this information.
+    predict_fn_xgb = lambda x: xgb.model.predict_proba(xgb.transform(x)).astype(float)
+    explainer = lime.lime_tabular.LimeTabularExplainer(
+        xgb.X_train,
+        feature_names=xgb.feature_names,
+        categorical_features=xgb.categorical_features if xgb.use_categorical else None,
+        class_names=xgb.target_name,
+        discretize_continuous=True,
+    )
+
+    f2imap = {}
+    for i, f in enumerate(xgb.feature_names):
+        f2imap[f.strip()] = i
+
+    if (sample is not None):
+        try:
+            feat_sample = np.asarray(sample, dtype=np.float32)
+        except:
+            print("Cannot parse input sample:", sample)
+            exit()
+        print("\n\n\nStarting LIME explainer... \nConsidering a sample with features:", feat_sample)
+        if not (len(feat_sample) == len(xgb.X_train[0])):
+            print("Unmatched features are not supported: The number of features in a sample {} is not equal to the number of features in this benchmark {}".format(len(feat_sample), len(xgb.X_train[0])))
+            exit()
+
+        # compute boost predictions
+        feat_sample_exp = np.expand_dims(feat_sample, axis=0)
+        feat_sample_exp = xgb.transform(feat_sample_exp)
+        y_pred = xgb.model.predict(feat_sample_exp)[0]
+        y_pred_prob = xgb.model.predict_proba(feat_sample_exp)[0]
+
+        exp = explainer.explain_instance(feat_sample,
+                                         predict_fn_xgb,
+                                         num_features = nb_features_in_exp,
+                                         top_labels = 1)#,
+                                         #labels = list(range(xgb.num_class)))
+
+        expl = []
+
+        # choose which features in the explanation to focus on
+        if feats in ('p', 'pos', '+'):
+            feats = 1
+        elif feats in ('n', 'neg', '-'):
+            feats = -1
+        else:
+            feats = 0
+
+        for i in range(xgb.num_class):
+            if (i != y_pred):
+                continue
+            print("\t \t Explanations for the winner class", i, " (xgboost confidence = ", y_pred_prob[i], ")")
+            print("\t \t Features in explanations: ", exp.as_list(label=i))
+
+            s_human_readable = ""
+            for k, v in enumerate(exp.as_list(label=i)):
+                if (feats == 1 and v[1] < 0) or (feats == -1 and v[1] >= 0):
+                    continue
+
+                if not (('<' in v[0]) or ('>' in v[0])):
+                    a = v[0].split('=')
+                    f = a[0].strip()
+                    l = a[1].strip()
+                    u = l
+
+                    if (xgb.use_categorical):
+                        fid = f2imap[f]
+                        fvid = int(a[1])
+                        #s_human_readable = s_human_readable + f + " = [" + str(xgb.categorical_names[fid][fvid]) +"," + str(v[1])+ "] "
+                        s_human_readable = s_human_readable + "\t \t id = {}, name = {}, score = {}\n".format(fid, f, str(v[1]))
+
+                else:
+                    a = v[0].split('<')
+
+                    if len(a) == 1:
+                        a = v[0].split('>')
+
+                    if len(a) == 2:
+                        f = a[0].strip()
+
+                        if '>' in v[0]:
+                            l, u = float(a[1].strip(' =')), None
+                        else:
+                            l, u = None, float(a[1].strip(' ='))
+                    else:
+                        l = float(a[0].strip())
+                        f = a[1].strip(' =')
+                        u = float(a[2].strip(' ='))
+
+                # expl.append(tuple([f2imap[f], l, u, v[1] >= 0]))
+                expl.append(f2imap[f])
+
+            if (xgb.use_categorical):
+                if (len(s_human_readable) > 0):
+                    print("\t \t Features in explanations (with provided categorical labels): \n", s_human_readable)
+
+        timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
+                resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer
+        print('  time: {0:.2f}'.format(timer))
+
+        return sorted(expl)
+
+    ###################################### TESTING
+    max_sample = nb_samples
+    y_pred_prob = xgb.model.predict_proba(xgb.X_test)
+    y_pred = xgb.model.predict(xgb.X_test)
+
+    nb_tests = min(max_sample, len(xgb.Y_test))
+    top_labels = 1
+    for sample in range(nb_tests):
+        np.set_printoptions(precision=2)
+        feat_sample = xgb.X_test[sample]
+        print("Considering a sample with features:", feat_sample)
+        if (False):
+            feat_sample[4] = 3000
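+            # dead code: this branch never runs; it appears to be a leftover
+            # probe that perturbs one feature to inspect the prediction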
+            y_pred_prob_sample = xgb.model.predict_proba([feat_sample])
+            print(y_pred_prob_sample)
+        print("\t Predictions:", y_pred_prob[sample])
+        exp = explainer.explain_instance(feat_sample,
+                                         predict_fn_xgb,
+                                         num_features= xgb.num_class,
+                                         top_labels = 1,
+                                         labels = list(range(xgb.num_class)))
+        for i in range(xgb.num_class):
+            if (i != y_pred[sample]):
+                continue
+            print("\t \t Explanations for the winner class", i, " (xgboost confidence = ", y_pred_prob[sample][i], ")")
+            print("\t \t Features in explanations: ", exp.as_list(label=i))
+    timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
+            resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer
+    print('  time: {0:.2f}'.format(timer))
+    return
diff --git a/pages/application/RandomForest/utils/options.py b/pages/application/RandomForest/utils/options.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ba5a91707abf42368d67f65044806199cf4c8d4
--- /dev/null
+++ b/pages/application/RandomForest/utils/options.py
@@ -0,0 +1,224 @@
+#!/usr/bin/env python
+#-*- coding:utf-8 -*-
+##
+## options.py
+##
+## Created on: Dec 7, 2018
+## Author: Alexey Ignatiev, Nina Narodytska
+## E-mail: aignatiev@ciencias.ulisboa.pt, narodytska@vmware.com
+##
+
+#
+#==============================================================================
+from __future__ import print_function
+import getopt
+import math
+import os
+import sys
+
+
+#
+#==============================================================================
+class Options(object):
+    """
+    Class for representing command-line options.
+    """
+
+    def __init__(self, command):
+        """
+        Constructor.
+        """
+
+        # actions
+        self.train = False
+        self.encode = 'none'
+        self.explain = ''
+        self.useanchor = False
+        self.uselime = False
+        self.useshap = False
+        self.limefeats = 5
+        self.validate = False
+        self.use_categorical = False
+        self.preprocess_categorical = False
+        self.preprocess_categorical_files = ""
+
+        # training options
+        self.accmin = 0.95
+        self.n_estimators = 100
+        self.num_boost_round = 10
+        self.maxdepth = 3
+        self.testsplit = 0.2
+        self.seed = 7
+
+        # other options
+        self.files = None
+        self.output = 'Classifiers'
+        self.mapfile = None
+        self.separator = ','
+        self.smallest = False
+        self.solver = 'z3'
+        self.verb = 0
+
+        # random forest
+        self.rf = False
+        self.pi_check = False
+        self.repair = False
+        self.refine = False
+
+        if command:
+            self.parse(command)
+
+    def parse(self, command):
+        """
+        Parser.
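+        Maps each recognised getopt flag onto the attributes initialised
+        in the constructor; positional arguments end up in ``self.files``.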
+        """
+
+        self.command = command
+
+        try:
+            opts, args = getopt.getopt(command[1:],
+                                       'a:ce:d:hL:lm:Mn:o:pPr:Rqs:tvVwx:',
+                                       ['accmin=',
+                                        'encode=',
+                                        'help',
+                                        'map-file=',
+                                        'use-anchor=',
+                                        'lime-feats=',
+                                        'use-lime=',
+                                        'use-shap=',
+                                        'use-categorical=',
+                                        'preprocess-categorical=',
+                                        'pfiles=',
+                                        'maxdepth=',
+                                        'minimum',
+                                        'nbestims=',
+                                        'output=',
+                                        'prime-implicant',
+                                        'rounds=',
+                                        'random-forest',
+                                        'repair',
+                                        'refine',
+                                        'seed=',
+                                        'sep=',
+                                        'solver=',
+                                        'testsplit=',
+                                        'train',
+                                        'validate',
+                                        'verbose',
+                                        'explain='
+                                        ])
+        except getopt.GetoptError as err:
+            sys.stderr.write(str(err).capitalize())
+            self.usage()
+            sys.exit(1)
+
+        for opt, arg in opts:
+            if opt in ('-a', '--accmin'):
+                self.accmin = float(arg)
+            elif opt in ('-c', '--use-categorical'):
+                self.use_categorical = True
+            elif opt in ('-d', '--maxdepth'):
+                self.maxdepth = int(arg)
+            elif opt in ('-e', '--encode'):
+                self.encode = str(arg)
+            elif opt in ('-h', '--help'):
+                self.usage()
+                sys.exit(0)
+            elif opt in ('-l', '--use-lime'):
+                self.uselime = True
+            elif opt in ('-L', '--lime-feats'):
+                self.limefeats = 0 if arg == 'all' else int(arg)
+            elif opt in ('-m', '--map-file'):
+                self.mapfile = str(arg)
+            elif opt in ('-M', '--minimum'):
+                self.smallest = True
+            elif opt in ('-n', '--nbestims'):
+                self.n_estimators = int(arg)
+            elif opt in ('-o', '--output'):
+                self.output = str(arg)
+            elif opt in ('-q', '--use-anchor'):
+                self.useanchor = True
+            elif opt in ('-P', '--prime-implicant'):
+                self.pi_check = True
+            elif opt in ('-r', '--rounds'):
+                self.num_boost_round = int(arg)
+            elif opt in ('-R', '--random-forest'):
+                self.rf = True
+            elif opt == '--repair':
+                self.repair = True
+            elif opt == '--refine':
+                self.refine = True
+            elif opt == '--seed':
+                self.seed = int(arg)
+            elif opt == '--sep':
+                self.separator = str(arg)
+            elif opt in ('-s', '--solver'):
+                self.solver = str(arg)
+            elif opt == '--testsplit':
+                self.testsplit = float(arg)
+            elif opt in ('-t', '--train'):
+                self.train = True
+            elif opt in ('-V', '--validate'):
+                self.validate = True
+            elif opt in ('-v', '--verbose'):
+                self.verb += 1
+            elif opt in ('-w', '--use-shap'):
+                self.useshap = True
+            elif opt in ('-x', '--explain'):
+                self.explain = str(arg)
+            elif opt in ('-p', '--preprocess-categorical'):
+                self.preprocess_categorical = True
+            elif opt in ('--pfiles'):
+                self.preprocess_categorical_files = str(arg)  # train_file, test_file (or empty), resulting file
+            else:
+                assert False, 'Unhandled option: {0} {1}'.format(opt, arg)
+
+        if self.encode == 'none':
+            self.encode = None
+
+        self.files = args
+
+    def usage(self):
+        """
+        Print usage message.
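+        (Shown for -h/--help and whenever option parsing fails.)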
+ """ + + print('Usage: ' + os.path.basename(self.command[0]) + ' [options] input-file') + print('Options:') + print(' -a, --accmin=<float> Minimal accuracy') + print(' Available values: [0.0, 1.0] (default = 0.95)') + print(' -c, --use-categorical Treat categorical features as categorical (with categorical features info if available)') + print(' -d, --maxdepth=<int> Maximal depth of a tree') + print(' Available values: [1, INT_MAX] (default = 3)') + print(' -e, --encode=<smt> Encode a previously trained model') + print(' Available values: smt, smtbool, none (default = none)') + print(' -h, --help Show this message') + print(' -l, --use-lime Use LIME to compute an explanation') + print(' -L, --lime-feats Instruct LIME to compute an explanation of this size') + print(' Available values: [1, INT_MAX], all (default = 5)') + print(' -m, --map-file=<string> Path to a file containing a mapping to original feature values. (default: none)') + print(' -M, --minimum Compute a smallest size explanation (instead of a subset-minimal one)') + print(' -n, --nbestims=<int> Number of trees per class') + print(' Available values: [1, INT_MAX] (default = 100)') + print(' -o, --output=<string> Directory where output files will be stored (default: \'temp\')') + print(' -p, Preprocess categorical data') + print(' --pfiles Filenames to use when preprocessing') + print(' --prime-implicant Check explanation if it is a prime implicant') + print(' -q, --use-anchor Use Anchor to compute an explanation') + print(' -r, --rounds=<int> Number of training rounds') + print(' -R, --random-forest Use Random Forest model') + print(' --refine try to refine the (optimistic) local explanation') + print(' --repair try to repair the (pessimistic) local explanation') + print(' Available values: [1, INT_MAX] (default = 10)') + print(' --seed=<int> Seed for random splitting') + print(' Available values: [1, INT_MAX] (default = 7)') + print(' --sep=<string> Field separator used in input file (default = \',\')') + print(' -s, --solver=<string> An SMT reasoner to use') + print(' Available values: cvc4, mathsat, yices, z3 (default = z3)') + print(' -t, --train Train a model of a given dataset') + print(' --testsplit=<float> Training and test sets split') + print(' Available values: [0.0, 1.0] (default = 0.2)') + print(' -v, --verbose Increase verbosity level') + print(' -V, --validate Validate explanation (show that it is too optimistic)') + print(' -w, --use-shap Use SHAP to compute an explanation') + print(' -x, --explain=<string> Explain a decision for a given comma-separated sample (default: none)') diff --git a/pages/application/RandomForest/utils/shap_wrap/__init__.py b/pages/application/RandomForest/utils/shap_wrap/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..845cbd2bb73191edd950fc9eb09a641ab8cf2088 --- /dev/null +++ b/pages/application/RandomForest/utils/shap_wrap/__init__.py @@ -0,0 +1 @@ +from .shap_wrap import * diff --git a/pages/application/RandomForest/utils/shap_wrap/shap_wrap.py b/pages/application/RandomForest/utils/shap_wrap/shap_wrap.py new file mode 100644 index 0000000000000000000000000000000000000000..4eadc21f76cc03f78c656fcf7d7cde1ed4ea2a3b --- /dev/null +++ b/pages/application/RandomForest/utils/shap_wrap/shap_wrap.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## shap_wrap.py (reuses parts of the code of SHAP) +## +## Created on: Sep 25, 2019 +## Author: Nina Narodytska +## E-mail: narodytska@vmware.com +## + +# 
+#============================================================================== +import json +import numpy as np +import xgboost as xgb +import math +import shap +import resource + + +# +#============================================================================== +def shap_call(xgb, sample = None, feats='all', nb_features_in_exp = None): + timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime + + f2imap = {} + for i, f in enumerate(xgb.feature_names): + f2imap[f.strip()] = i + + if (sample is not None): + if (nb_features_in_exp is None): + nb_features_in_exp = len(sample) + + try: + feat_sample = np.asarray(sample, dtype=np.float32) + except: + print("Cannot parse input sample:", sample) + exit() + print("\n\nStarting SHAP explainer... \nConsidering a sample with features:", feat_sample) + if not (len(feat_sample) == len(xgb.X_train[0])): + print("Unmatched features are not supported: The number of features in a sample {} is not equal to the number of features in this benchmark {}".format(len(feat_sample), len(xgb.X_train[0]))) + exit() + + # compute boost predictions + feat_sample_exp = np.expand_dims(feat_sample, axis=0) + feat_sample_exp = xgb.transform(feat_sample_exp) + y_pred = xgb.model.predict(feat_sample_exp)[0] + y_pred_prob = xgb.model.predict_proba(feat_sample_exp)[0] + + # No need to pass dataset as it is recored in model + # https://shap.readthedocs.io/en/latest/ + + explainer = shap.TreeExplainer(xgb.model) + shap_values = explainer.shap_values(feat_sample_exp) + + shap_values_sample = shap_values[-1] + transformed_sample = feat_sample_exp[-1] + + + + + # we need to sum values per feature + # https://github.com/slundberg/shap/issues/397 + sum_values = [] + if (xgb.use_categorical): + p = 0 + for f in xgb.categorical_features: + nb_values = len(xgb.categorical_names[f]) + sum_v = 0 + for i in range(nb_values): + sum_v = sum_v + shap_values_sample[p+i] + p = p + nb_values + sum_values.append(sum_v) + else: + sum_values = shap_values_sample + expl = [] + + # choose which features in the explanation to focus on + if feats in ('p', 'pos', '+'): + feats = 1 + elif feats in ('n', 'neg', '-'): + feats = -1 + else: + feats = 0 + + print("\t \t Explanations for the winner class", y_pred, " (xgboost confidence = ", y_pred_prob[int(y_pred)], ")") + print("base_value = {}, predicted_value = {}".format(explainer.expected_value, np.sum(sum_values) + explainer.expected_value)) + + abs_sum_values = np.abs(sum_values) + sorted_by_abs_sum_values =np.argsort(-abs_sum_values) + + for k1, v1 in enumerate(sorted_by_abs_sum_values): + + k = v1 + v = sum_values[v1] + + if (feats == 1 and v < 0) or (feats == -1 and v >= 0): + continue + + expl.append(f2imap[xgb.feature_names[k]]) + print("id = {}, name = {}, score = {}".format(f2imap[xgb.feature_names[k]], xgb.feature_names[k], v)) + + if (len(expl) == nb_features_in_exp): + break + + timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer + print(' time: {0:.2f}'.format(timer)) + + return sorted(expl[:nb_features_in_exp]) diff --git a/pages/application/RandomForest/utils/xgbooster/__init__.py b/pages/application/RandomForest/utils/xgbooster/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..88bdad8d34e0f8a335ca167595de25c965e8b021 --- /dev/null +++ b/pages/application/RandomForest/utils/xgbooster/__init__.py @@ -0,0 +1,4 @@ +from .encode import * +from .tree import * +from .xgbooster 
import * +from .preprocess import * \ No newline at end of file diff --git a/pages/application/RandomForest/utils/xgbooster/encode.py b/pages/application/RandomForest/utils/xgbooster/encode.py new file mode 100644 index 0000000000000000000000000000000000000000..6a77fb3afb792f0cd15276c11e03b4b4005f5109 --- /dev/null +++ b/pages/application/RandomForest/utils/xgbooster/encode.py @@ -0,0 +1,363 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## encode.py +## +## Created on: Dec 7, 2018 +## Author: Alexey Ignatiev +## E-mail: aignatiev@ciencias.ulisboa.pt +## + +# +#============================================================================== +from __future__ import print_function +import collections +from pysat.formula import IDPool +from pysmt.smtlib.parser import SmtLibParser +from pysmt.shortcuts import And, BOOL, Iff, Implies, Not, Or, Symbol, get_model +from pysmt.shortcuts import Equals, ExactlyOne, LT, Plus, REAL, Real, write_smtlib +from .tree import TreeEnsemble, scores_tree +import six +from six.moves import range + +try: # for Python2 + from cStringIO import StringIO +except ImportError: # for Python3 + from io import StringIO + + +# +#============================================================================== +class SMTEncoder(object): + """ + Encoder of XGBoost tree ensembles into SMT. + """ + + def __init__(self, model, feats, nof_classes, xgb, from_file=None): + """ + Constructor. + """ + + self.model = model + self.feats = {f: i for i, f in enumerate(feats)} + self.nofcl = nof_classes + self.idmgr = IDPool() + self.optns = xgb.options + + # xgbooster will also be needed + self.xgb = xgb + + # for interval-based encoding + self.intvs, self.imaps, self.ivars = None, None, None + + if from_file: + self.load_from(from_file) + + def traverse(self, tree, tvar, prefix=[]): + """ + Traverse a tree and encode each node. + """ + + if tree.children: + pos, neg = self.encode_node(tree) + + self.traverse(tree.children[0], tvar, prefix + [pos]) + self.traverse(tree.children[1], tvar, prefix + [neg]) + else: # leaf node + if prefix: + self.enc.append(Implies(And(prefix), Equals(tvar, Real(tree.values)))) + else: + self.enc.append(Equals(tvar, Real(tree.values))) + + def encode_node(self, node): + """ + Encode a node of a tree. + """ + + if '_' not in node.name: + # continuous features => expecting an upper bound + # feature and its upper bound (value) + f, v = node.name, node.threshold + + existing = True if tuple([f, v]) in self.idmgr.obj2id else False + vid = self.idmgr.id(tuple([f, v])) + bv = Symbol('bvar{0}'.format(vid), typename=BOOL) + + if not existing: + if self.intvs: + d = self.imaps[f][v] + 1 + pos, neg = self.ivars[f][:d], self.ivars[f][d:] + self.enc.append(Iff(bv, Or(pos))) + self.enc.append(Iff(Not(bv), Or(neg))) + else: + fvar, fval = Symbol(f, typename=REAL), Real(v) + self.enc.append(Iff(bv, LT(fvar, fval))) + + return bv, Not(bv) + else: + # all features are expected to be categorical and + # encoded with one-hot encoding into Booleans + # each node is expected to be of the form: f_i < 0.5 + bv = Symbol(node.name, typename=BOOL) + + # left branch is positive, i.e. bv is true + # right branch is negative, i.e. bv is false + return Not(bv), bv + + def compute_intervals(self): + """ + Traverse all trees in the ensemble and extract intervals for each + feature. + + At this point, the method only works for numerical datasets! + """ + + def traverse_intervals(tree): + """ + Auxiliary function. Recursive tree traversal. 
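+
+            For intuition, a minimal sketch of the collection step this
+            performs (names and values are illustrative only): thresholds
+            seen on a feature are gathered into a set and later sorted,
+            with '+' marking the unbounded last interval:
+
+                intvs = {'f0': set()}
+                for v in (7.0, 2.5):    # thresholds met during traversal
+                    intvs['f0'].add(v)
+                intvs = {f: sorted(vs) + ['+'] for f, vs in intvs.items()}
+                # -> {'f0': [2.5, 7.0, '+']}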
+ """ + + if tree.children: + f = tree.name + v = tree.threshold + self.intvs[f].add(v) + + traverse_intervals(tree.children[0]) + traverse_intervals(tree.children[1]) + + # initializing the intervals + self.intvs = {'f{0}'.format(i): set([]) for i in range(len(self.feats))} + + for tree in self.ensemble.trees: + traverse_intervals(tree) + + # OK, we got all intervals; let's sort the values + self.intvs = {f: sorted(self.intvs[f]) + ['+'] for f in six.iterkeys(self.intvs)} + + self.imaps, self.ivars = {}, {} + for feat, intvs in six.iteritems(self.intvs): + self.imaps[feat] = {} + self.ivars[feat] = [] + for i, ub in enumerate(intvs): + self.imaps[feat][ub] = i + + ivar = Symbol(name='{0}_intv{1}'.format(feat, i), typename=BOOL) + self.ivars[feat].append(ivar) + + def encode(self): + """ + Do the job. + """ + + self.enc = [] + + # getting a tree ensemble + self.ensemble = TreeEnsemble(self.model, + self.xgb.extended_feature_names_as_array_strings, + nb_classes=self.nofcl) + + # introducing class score variables + csum = [] + for j in range(self.nofcl): + cvar = Symbol('class{0}_score'.format(j), typename=REAL) + csum.append(tuple([cvar, []])) + + # if targeting interval-based encoding, + # traverse all trees and extract all possible intervals + # for each feature + if self.optns.encode == 'smtbool': + self.compute_intervals() + + # traversing and encoding each tree + for i, tree in enumerate(self.ensemble.trees): + # getting class id + clid = i % self.nofcl + + # encoding the tree + tvar = Symbol('tr{0}_score'.format(i + 1), typename=REAL) + self.traverse(tree, tvar, prefix=[]) + + # this tree contributes to class with clid + csum[clid][1].append(tvar) + + # encoding the sums + for pair in csum: + cvar, tvars = pair + self.enc.append(Equals(cvar, Plus(tvars))) + + # enforce exactly one of the feature values to be chosen + # (for categorical features) + categories = collections.defaultdict(lambda: []) + for f in self.xgb.extended_feature_names_as_array_strings: + if '_' in f: + categories[f.split('_')[0]].append(Symbol(name=f, typename=BOOL)) + for c, feats in six.iteritems(categories): + self.enc.append(ExactlyOne(feats)) + + # number of assertions + nof_asserts = len(self.enc) + + # making conjunction + self.enc = And(self.enc) + + # number of variables + nof_vars = len(self.enc.get_free_variables()) + + if self.optns.verb: + print('encoding vars:', nof_vars) + print('encoding asserts:', nof_asserts) + + return self.enc, self.intvs, self.imaps, self.ivars + + def test_sample(self, sample): + """ + Check whether or not the encoding "predicts" the same class + as the classifier given an input sample. 
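+
+        The classifier-side scores are recomputed by direct tree
+        traversal; since tree i contributes to class i % nofcl, the
+        aggregation amounts to (a sketch using the names from the code
+        below):
+
+            scores = [scores_tree(t, sample_internal) for t in self.ensemble.trees]
+            cscores = [sum(scores[c::self.nofcl]) for c in range(self.nofcl)]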
+ """ + + # first, compute the scores for all classes as would be + # predicted by the classifier + + # score arrays computed for each class + csum = [[] for c in range(self.nofcl)] + + if self.optns.verb: + print('testing sample:', list(sample)) + + sample_internal = list(self.xgb.transform(sample)[0]) + + # traversing all trees + for i, tree in enumerate(self.ensemble.trees): + # getting class id + clid = i % self.nofcl + + # a score computed by the current tree + score = scores_tree(tree, sample_internal) + + # this tree contributes to class with clid + csum[clid].append(score) + + # final scores for each class + cscores = [sum(scores) for scores in csum] + + # second, get the scores computed with the use of the encoding + + # asserting the sample + hypos = [] + + if not self.intvs: + for i, fval in enumerate(sample_internal): + feat, vid = self.xgb.transform_inverse_by_index(i) + fid = self.feats[feat] + + if vid == None: + fvar = Symbol('f{0}'.format(fid), typename=REAL) + hypos.append(Equals(fvar, Real(float(fval)))) + else: + fvar = Symbol('f{0}_{1}'.format(fid, vid), typename=BOOL) + if int(fval) == 1: + hypos.append(fvar) + else: + hypos.append(Not(fvar)) + else: + for i, fval in enumerate(sample_internal): + feat, _ = self.xgb.transform_inverse_by_index(i) + feat = 'f{0}'.format(self.feats[feat]) + + # determining the right interval and the corresponding variable + for ub, fvar in zip(self.intvs[feat], self.ivars[feat]): + if ub == '+' or fval < ub: + hypos.append(fvar) + break + else: + assert 0, 'No proper interval found for {0}'.format(feat) + + # now, getting the model + escores = [] + model = get_model(And(self.enc, *hypos), solver_name=self.optns.solver) + for c in range(self.nofcl): + v = Symbol('class{0}_score'.format(c), typename=REAL) + escores.append(float(model.get_py_value(v))) + + assert all(map(lambda c, e: abs(c - e) <= 0.001, cscores, escores)), \ + 'wrong prediction: {0} vs {1}'.format(cscores, escores) + + if self.optns.verb: + print('xgb scores:', cscores) + print('enc scores:', escores) + + def save_to(self, outfile): + """ + Save the encoding into a file with a given name. + """ + + if outfile.endswith('.txt'): + outfile = outfile[:-3] + 'smt2' + + write_smtlib(self.enc, outfile) + + # appending additional information + with open(outfile, 'r') as fp: + contents = fp.readlines() + + # comments + comments = ['; features: {0}\n'.format(', '.join(self.feats)), + '; classes: {0}\n'.format(self.nofcl)] + + if self.intvs: + for f in self.xgb.extended_feature_names_as_array_strings: + c = '; i {0}: '.format(f) + c += ', '.join(['{0}<->{1}'.format(u, v) for u, v in zip(self.intvs[f], self.ivars[f])]) + comments.append(c + '\n') + + contents = comments + contents + with open(outfile, 'w') as fp: + fp.writelines(contents) + + def load_from(self, infile): + """ + Loads the encoding from an input file. 
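+
+        The file is expected to carry the comment preamble written by
+        save_to() above; an illustrative (not real) preamble:
+
+            ; features: f0, f1
+            ; classes: 2
+            ; i f0: 0.5<->f0_intv0, +<->f0_intv1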
+ """ + + with open(infile, 'r') as fp: + file_content = fp.readlines() + + # empty intervals for the standard encoding + self.intvs, self.imaps, self.ivars = {}, {}, {} + + for line in file_content: + if line[0] != ';': + break + elif line.startswith('; i '): + f, arr = line[4:].strip().split(': ', 1) + f = f.replace('-', '_') + self.intvs[f], self.imaps[f], self.ivars[f] = [], {}, [] + + for i, pair in enumerate(arr.split(', ')): + ub, symb = pair.split('<->') + + if ub[0] != '+': + ub = float(ub) + symb = Symbol(symb, typename=BOOL) + + self.intvs[f].append(ub) + self.ivars[f].append(symb) + self.imaps[f][ub] = i + + elif line.startswith('; features:'): + self.feats = line[11:].strip().split(', ') + elif line.startswith('; classes:'): + self.nofcl = int(line[10:].strip()) + + parser = SmtLibParser() + script = parser.get_script(StringIO(''.join(file_content))) + + self.enc = script.get_last_formula() + + def access(self): + """ + Get access to the encoding, features names, and the number of + classes. + """ + + return self.enc, self.intvs, self.imaps, self.ivars, self.feats, self.nofcl diff --git a/pages/application/RandomForest/utils/xgbooster/explain.py b/pages/application/RandomForest/utils/xgbooster/explain.py new file mode 100644 index 0000000000000000000000000000000000000000..ca078749949fa2ad3fa3d01cb080653965effdf5 --- /dev/null +++ b/pages/application/RandomForest/utils/xgbooster/explain.py @@ -0,0 +1,311 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## explain.py +## +## Created on: Dec 14, 2018 +## Author: Alexey Ignatiev +## E-mail: aignatiev@ciencias.ulisboa.pt +## + +# +#============================================================================== +from __future__ import print_function +import numpy as np +import os +from pysat.examples.hitman import Hitman +from pysat.formula import IDPool +from pysmt.shortcuts import Solver +from pysmt.shortcuts import And, BOOL, Implies, Not, Or, Symbol +from pysmt.shortcuts import Equals, GT, Int, Real, REAL +import resource +from six.moves import range +import sys + + +# +#============================================================================== +class SMTExplainer(object): + """ + An SMT-inspired minimal explanation extractor for XGBoost models. + """ + + def __init__(self, formula, intvs, imaps, ivars, feats, nof_classes, + options, xgb): + """ + Constructor. + """ + + self.feats = feats + self.intvs = intvs + self.imaps = imaps + self.ivars = ivars + self.nofcl = nof_classes + self.optns = options + self.idmgr = IDPool() + + # saving XGBooster + self.xgb = xgb + + self.verbose = self.optns.verb + self.oracle = Solver(name=options.solver) + + self.inps = [] # input (feature value) variables + for f in self.xgb.extended_feature_names_as_array_strings: + if '_' not in f: + self.inps.append(Symbol(f, typename=REAL)) + else: + self.inps.append(Symbol(f, typename=BOOL)) + + self.outs = [] # output (class score) variables + for c in range(self.nofcl): + self.outs.append(Symbol('class{0}_score'.format(c), typename=REAL)) + + # theory + self.oracle.add_assertion(formula) + + # current selector + self.selv = None + + def prepare(self, sample): + """ + Prepare the oracle for computing an explanation. 
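+
+        Every feature value is asserted under a per-sample selector and
+        a per-feature selector, so that single hypotheses can later be
+        relaxed one by one; schematically, in pysmt (a sketch, not the
+        exact code below):
+
+            from pysmt.shortcuts import Equals, Implies, Real, REAL, Symbol
+            f0 = Symbol('f0', typename=REAL)
+            hypo = Implies(Symbol('sample0_selv'),
+                           Implies(Symbol('selv_f0'), Equals(f0, Real(1.0))))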
+ """ + + if self.selv: + # disable the previous assumption if any + self.oracle.add_assertion(Not(self.selv)) + + # creating a fresh selector for a new sample + sname = ','.join([str(v).strip() for v in sample]) + + # the samples should not repeat; otherwise, they will be + # inconsistent with the previously introduced selectors + assert sname not in self.idmgr.obj2id, 'this sample has been considered before (sample {0})'.format(self.idmgr.id(sname)) + self.selv = Symbol('sample{0}_selv'.format(self.idmgr.id(sname)), typename=BOOL) + + self.rhypos = [] # relaxed hypotheses + + # transformed sample + self.sample = list(self.xgb.transform(sample)[0]) + + self.sel2fid = {} # selectors to original feature ids + self.sel2vid = {} # selectors to categorical feature ids + + # preparing the selectors + for i, (inp, val) in enumerate(zip(self.inps, self.sample), 1): + feat = inp.symbol_name().split('_')[0] + selv = Symbol('selv_{0}'.format(feat)) + val = float(val) + + self.rhypos.append(selv) + if selv not in self.sel2fid: + self.sel2fid[selv] = int(feat[1:]) + self.sel2vid[selv] = [i - 1] + else: + self.sel2vid[selv].append(i - 1) + + # adding relaxed hypotheses to the oracle + if not self.intvs: + for inp, val, sel in zip(self.inps, self.sample, self.rhypos): + if '_' not in inp.symbol_name(): + hypo = Implies(self.selv, Implies(sel, Equals(inp, Real(float(val))))) + else: + hypo = Implies(self.selv, Implies(sel, inp if val else Not(inp))) + + self.oracle.add_assertion(hypo) + else: + for inp, val, sel in zip(self.inps, self.sample, self.rhypos): + inp = inp.symbol_name() + # determining the right interval and the corresponding variable + for ub, fvar in zip(self.intvs[inp], self.ivars[inp]): + if ub == '+' or val < ub: + hypo = Implies(self.selv, Implies(sel, fvar)) + break + + self.oracle.add_assertion(hypo) + + # in case of categorical data, there are selector duplicates + # and we need to remove them + self.rhypos = sorted(set(self.rhypos), key=lambda x: int(x.symbol_name()[6:])) + + # propagating the true observation + if self.oracle.solve([self.selv] + self.rhypos): + model = self.oracle.get_model() + else: + assert 0, 'Formula is unsatisfiable under given assumptions' + + # choosing the maximum + outvals = [float(model.get_py_value(o)) for o in self.outs] + maxoval = max(zip(outvals, range(len(outvals)))) + + # correct class id (corresponds to the maximum computed) + self.out_id = maxoval[1] + self.output = self.xgb.target_name[self.out_id] + + # forcing a misclassification, i.e. a wrong observation + disj = [] + for i in range(len(self.outs)): + if i != self.out_id: + disj.append(GT(self.outs[i], self.outs[self.out_id])) + self.oracle.add_assertion(Implies(self.selv, Or(disj))) + + if self.verbose: + inpvals = self.xgb.readable_sample(sample) + + self.preamble = [] + for f, v in zip(self.xgb.feature_names, inpvals): + if f not in str(v): + self.preamble.append('{0} = {1}'.format(f, v)) + else: + self.preamble.append(v) + + print(' explaining: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.output)) + + def explain(self, sample, smallest, expl_ext=None, prefer_ext=False): + """ + Hypotheses minimization. 
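+
+        With smallest=False this delegates to the deletion-based linear
+        search of compute_minimal(); the underlying idea, assuming an
+        abstract entails(hypos) oracle, is simply:
+
+            i = 0
+            while i < len(hypos):
+                rest = hypos[:i] + hypos[i + 1:]
+                if entails(rest):   # hypothesis i is redundant
+                    hypos = rest
+                else:               # hypothesis i is necessary
+                    i += 1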
+ """ + + self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime + + # adapt the solver to deal with the current sample + self.prepare(sample) + + # saving external explanation to be minimized further + if expl_ext == None or prefer_ext: + self.to_consider = [True for h in self.rhypos] + else: + eexpl = set(expl_ext) + self.to_consider = [True if i in eexpl else False for i, h in enumerate(self.rhypos)] + + # if satisfiable, then the observation is not implied by the hypotheses + if self.oracle.solve([self.selv] + [h for h, c in zip(self.rhypos, self.to_consider) if c]): + print(' no implication!') + print(self.oracle.get_model()) + sys.exit(1) + + if not smallest: + self.compute_minimal(prefer_ext=prefer_ext) + else: + self.compute_smallest() + + self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.time + + expl = sorted([self.sel2fid[h] for h in self.rhypos]) + + if self.verbose: + self.preamble = [self.preamble[i] for i in expl] + print(' explanation: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.xgb.target_name[self.out_id])) + print(' # hypos left:', len(self.rhypos)) + print(' time: {0:.2f}'.format(self.time)) + + return expl + + def compute_minimal(self, prefer_ext=False): + """ + Compute any subset-minimal explanation. + """ + + i = 0 + + if not prefer_ext: + # here, we want to reduce external explanation + + # filtering out unnecessary features if external explanation is given + self.rhypos = [h for h, c in zip(self.rhypos, self.to_consider) if c] + else: + # here, we want to compute an explanation that is preferred + # to be similar to the given external one + # for that, we try to postpone removing features that are + # in the external explanation provided + + rhypos = [h for h, c in zip(self.rhypos, self.to_consider) if not c] + rhypos += [h for h, c in zip(self.rhypos, self.to_consider) if c] + self.rhypos = rhypos + + # simple deletion-based linear search + while i < len(self.rhypos): + to_test = self.rhypos[:i] + self.rhypos[(i + 1):] + + if self.oracle.solve([self.selv] + to_test): + i += 1 + else: + self.rhypos = to_test + + def compute_smallest(self): + """ + Compute a cardinality-minimal explanation. 
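+
+        The loop below is an implicit hitting-set procedure built on
+        pysat's Hitman; a self-contained usage sketch of that primitive:
+
+            from pysat.examples.hitman import Hitman
+            with Hitman(bootstrap_with=[[0, 1, 2]]) as h:
+                h.hit([0, 1])    # register another set to hit
+                hs = h.get()     # a smallest hitting set, e.g. [0]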
+ """ + + # result + rhypos = [] + + with Hitman(bootstrap_with=[[i for i in range(len(self.rhypos)) if self.to_consider[i]]]) as hitman: + # computing unit-size MCSes + for i, hypo in enumerate(self.rhypos): + if self.to_consider[i] == False: + continue + + if self.oracle.solve([self.selv] + self.rhypos[:i] + self.rhypos[(i + 1):]): + hitman.hit([i]) + + # main loop + iters = 0 + while True: + hset = hitman.get() + iters += 1 + + if self.verbose > 1: + print('iter:', iters) + print('cand:', hset) + + if self.oracle.solve([self.selv] + [self.rhypos[i] for i in hset]): + to_hit = [] + satisfied, unsatisfied = [], [] + + removed = list(set(range(len(self.rhypos))).difference(set(hset))) + + model = self.oracle.get_model() + for h in removed: + i = self.sel2fid[self.rhypos[h]] + if '_' not in self.inps[i].symbol_name(): + # feature variable and its expected value + var, exp = self.inps[i], self.sample[i] + + # true value + true_val = float(model.get_py_value(var)) + + if not exp - 0.001 <= true_val <= exp + 0.001: + unsatisfied.append(h) + else: + hset.append(h) + else: + for vid in self.sel2vid[self.rhypos[h]]: + var, exp = self.inps[vid], int(self.sample[vid]) + + # true value + true_val = int(model.get_py_value(var)) + + if exp != true_val: + unsatisfied.append(h) + break + else: + hset.append(h) + + # computing an MCS (expensive) + for h in unsatisfied: + if self.oracle.solve([self.selv] + [self.rhypos[i] for i in hset] + [self.rhypos[h]]): + hset.append(h) + else: + to_hit.append(h) + + if self.verbose > 1: + print('coex:', to_hit) + + hitman.hit(to_hit) + else: + self.rhypos = [self.rhypos[i] for i in hset] + break diff --git a/pages/application/RandomForest/utils/xgbooster/preprocess.py b/pages/application/RandomForest/utils/xgbooster/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..d66cc338fe6b232fff0155aea5055e7fe9241dac --- /dev/null +++ b/pages/application/RandomForest/utils/xgbooster/preprocess.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## preprocess.py +## +## Created on: Jan 10, 2019 +## Author: Nina Narodytska +## E-mail: narodytska@vmware.com +## + +# +#============================================================================== +import json +import numpy as np +import xgboost as xgb +import math +import pandas as pd +import numpy as np +import sklearn +import pickle + + +# +#============================================================================== +def preprocess_dataset(raw_data_path, files, use_categ=True): + print("preprocess dataset from ", raw_data_path) + files = files.split(",") + + data_file = files[0] + dataset_name = files[1] + + categorical_features = [] + if use_categ: + try: + catcols = pd.read_csv(raw_data_path + data_file + ".catcol", header = None) + categorical_features = np.concatenate(catcols.values).tolist() + + print(categorical_features) + except Exception as e: + print("Please provide info about categorical columns/original datasets or omit option -p", e) + exit() + + try: + data_raw = pd.read_csv(raw_data_path + data_file, sep=',', na_values= ['']) + #catcols = pd.read_csv(raw_data_path + data_file + ".catcol", header = None) + #categorical_features = np.concatenate(catcols.values).tolist() + + + for i in range(len(data_raw.values[0])): + if i in categorical_features: + data_raw.fillna('',inplace=True) + else: + data_raw.fillna(0,inplace=True) + dataset_all = data_raw + dataset = dataset_all.values.copy() + + print(categorical_features) + except Exception as e: + print("Please 
provide info about categorical columns/original datasets or omit option -p", e)
+        exit()
+
+    # move categorical columns forward
+
+    feature_names = dataset_all.columns
+    print(feature_names)
+
+    # encode categorical features with integer labels
+    extra_info = {}
+    categorical_names = {}
+    print(categorical_features)
+    dataset_new = dataset_all.values.copy()
+    for feature in categorical_features:
+        print("feature", feature)
+        print(dataset[:, feature])
+        le = sklearn.preprocessing.LabelEncoder()
+        le.fit(dataset[:, feature])
+        categorical_names[feature] = le.classes_
+        dataset_new[:, feature] = le.transform(dataset[:, feature])
+
+    # encode the target as categorical as well
+    le = sklearn.preprocessing.LabelEncoder()
+    le.fit(dataset[:, -1])
+    dataset_new[:, -1] = le.transform(dataset[:, -1])
+    class_names = le.classes_
+
+    extra_info = {"categorical_features": categorical_features,
+                  "categorical_names": categorical_names,
+                  "feature_names": feature_names,
+                  "class_names": class_names}
+
+    new_file_train = raw_data_path + dataset_name + '_data.csv'
+    df = pd.DataFrame(data=dataset_new)
+    df.columns = list(feature_names)
+    df.to_csv(new_file_train, mode='w', index=False)
+    print("new dataset", new_file_train)
+
+    with open(raw_data_path + dataset_name + '_data.csv.pkl', "wb") as f:
+        pickle.dump(extra_info, f)
diff --git a/pages/application/RandomForest/utils/xgbooster/tree.py b/pages/application/RandomForest/utils/xgbooster/tree.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebaf24dbf4672e985943f242f238dad8b841e604
--- /dev/null
+++ b/pages/application/RandomForest/utils/xgbooster/tree.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python
+#-*- coding:utf-8 -*-
+##
+## tree.py (reuses parts of the code of SHAP)
+##
+## Created on: Dec 7, 2018
+## Author: Nina Narodytska
+## E-mail: narodytska@vmware.com
+##
+
+#
+#==============================================================================
+from anytree import Node, RenderTree, AsciiStyle
+import json
+import numpy as np
+import xgboost as xgb
+import math
+
+
+#
+#==============================================================================
+class xgnode(Node):
+    def __init__(self, id, parent=None):
+        Node.__init__(self, id, parent)
+        self.id = id  # the node id
+        self.name = None
+        self.left_node_id = -1  # left child
+        self.right_node_id = -1  # right child
+        self.missing_node_id = -1
+
+        self.feature = -1
+        self.threshold = -1
+
+        self.cover = -1
+        self.values = -1
+
+    def __str__(self):
+        pref = ' ' * self.depth
+        if (len(self.children) == 0):
+            return (pref + "leaf: {} {}".format(self.id, self.values))
+        else:
+            if (self.name is None):
+                return (pref + "{} f{}<{}".format(self.id,
self.feature, self.threshold)) + else: + return (pref+ "{} \"{}\"<{}".format(self.id, self.name, self.threshold)) + + +# +#============================================================================== +def build_tree(json_tree, node = None, feature_names = None, inverse = False): + def max_id(node): + if "children" in node: + return max(node["nodeid"], *[max_id(n) for n in node["children"]]) + else: + return node["nodeid"] + m = max_id(json_tree) + 1 + def extract_data(json_node, root = None, feature_names = None): + i = json_node["nodeid"] + if (root is None): + node = xgnode(i) + else: + node = xgnode(i, parent = root) + node.cover = json_node["cover"] + if "children" in json_node: + + node.left_node_id = json_node["yes"] + node.right_node_id = json_node["no"] + node.missing_node_id = json_node["missing"] + node.feature = json_node["split"] + if (feature_names is not None): + node.name = feature_names[node.feature] + node.threshold = json_node["split_condition"] + for c, n in enumerate(json_node["children"]): + child = extract_data(n, node, feature_names) + elif "leaf" in json_node: + node.values = json_node["leaf"] + if(inverse): + node.values = -node.values + return node + + root = extract_data(json_tree, None, feature_names) + return root + +# +#============================================================================== +def walk_tree(node): + if (len(node.children) == 0): + # leaf + print(node) + else: + print(node) + walk_tree(node.children[0]) + walk_tree(node.children[1]) + +def count_nodes(root): + def count(node): + if len(node.children): + return sum([1+count(n) for n in node.children]) + else: + return 0 + m = count(root) + 1 + return m + +# +#============================================================================== +def scores_tree(node, sample): + if (len(node.children) == 0): + # leaf + return node.values + else: + feature_branch = node.feature + sample_value = sample[feature_branch] + assert(sample_value is not None) + if(sample_value < node.threshold): + return scores_tree(node.children[0], sample) + else: + return scores_tree(node.children[1], sample) + + +# +#============================================================================== +class TreeEnsemble: + """ An ensemble of decision trees. + + This object provides a common interface to many different types of models. 
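+
+    XGBoost stores one tree per class per boosting round, so tree i is
+    attributed to class i % nb_classes when scores are summed in
+    predict(); e.g.:
+
+        nb_classes = 3
+        print([i % nb_classes for i in range(6)])   # -> [0, 1, 2, 0, 1, 2]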
+ """ + def __init__(self, model, feature_names = None, nb_classes = 0): + self.model_type = "xgboost" + self.original_model = model.get_booster() + self.base_offset = None + json_trees = get_xgboost_json(self.original_model) + self.trees = [build_tree(json.loads(t), None, feature_names) for t in json_trees] + if(nb_classes == 2): + # NASTY trick for binary + # We change signs of values in leaves so that we can just sum all the values in leaves for class X + # and take max to get the right class + self.otrees = [build_tree(json.loads(t), None, feature_names, inverse = True) for t in json_trees] + self.itrees = [build_tree(json.loads(t), None, feature_names) for t in json_trees] + self.trees = [] + for i,_ in enumerate(self.otrees): + self.trees.append(self.otrees[i]) + self.trees.append(self.itrees[i]) + self.feature_names = feature_names + self.sz = sum([count_nodes(dt) for dt in self.trees]) + def print_tree(self): + for i,t in enumerate(self.trees): + print("tree number: ", i) + walk_tree(t) + + def invert_tree_prob(self, node): + if (len(node.children) == 0): + node.values = -node.values + return node + else: + self.invert_tree_prob(node.children[0]) + self.invert_tree_prob(node.children[1]) + return node + def predict(self, samples, nb_classes): + # https://github.com/dmlc/xgboost/issues/1746#issuecomment-290130695 + prob = [] + for sample in np.asarray(samples): + scores = [] + for i,t in enumerate(self.trees): + s = scores_tree(t, sample) + scores.append((s)) + scores = np.asarray(scores) + class_scores = [] + if (nb_classes == 2): + + for i in range(nb_classes): + class_scores.append(math.exp(-(scores[i::nb_classes]).sum())) # swap signs back as we had to use this trick in the contractor + s0 = class_scores[0] + s1 = class_scores[1] + v0 = 1/(1 + s0) + v1 = 1/(1 + s1) + class_scores[0] = v0 + class_scores[1] = v1 + else: + for i in range(nb_classes): + class_scores.append(math.exp((scores[i::nb_classes]).sum())) + class_scores = np.asarray(class_scores) + prob.append(class_scores/class_scores.sum()) + return np.asarray(prob).reshape((-1, nb_classes)) + + +# +#============================================================================== +def get_xgboost_json(model): + """ REUSED FROM SHAP + This gets a JSON dump of an XGBoost model while ensuring the feature names are their indexes. 
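+
+    Each dumped tree is a JSON object with the fields consumed by
+    build_tree() above; an illustrative (not real) inner node:
+
+        {"nodeid": 0, "split": 2, "split_condition": 0.5, "yes": 1,
+         "no": 2, "missing": 1, "cover": 100.0,
+         "children": [{"nodeid": 1, "leaf": -0.4, "cover": 60.0},
+                      {"nodeid": 2, "leaf": 0.4, "cover": 40.0}]}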
+ """ + fnames = model.feature_names + model.feature_names = None + json_trees = model.get_dump(with_stats=True, dump_format="json") + model.feature_names = fnames + return json_trees diff --git a/pages/application/RandomForest/utils/xgbooster/validate.py b/pages/application/RandomForest/utils/xgbooster/validate.py new file mode 100644 index 0000000000000000000000000000000000000000..c3c6f82a6f241a7f158ead0694b1718893d89c5c --- /dev/null +++ b/pages/application/RandomForest/utils/xgbooster/validate.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## validate.py +## +## Created on: Jan 4, 2019 +## Author: Alexey Ignatiev +## E-mail: aignatiev@ciencias.ulisboa.pt +## + +# +#============================================================================== +from __future__ import print_function +import getopt +import numpy as np +import os +from pysat.formula import IDPool +from pysmt.shortcuts import Solver +from pysmt.shortcuts import And, BOOL, Implies, Not, Or, Symbol +from pysmt.shortcuts import Equals, GE, GT, LE, LT, Real, REAL +import resource +from six.moves import range +import sys + + +# +#============================================================================== +class SMTValidator(object): + """ + Validating Anchor's explanations using SMT solving. + """ + + def __init__(self, formula, feats, nof_classes, xgb): + """ + Constructor. + """ + + self.ftids = {f: i for i, f in enumerate(feats)} + self.nofcl = nof_classes + self.idmgr = IDPool() + self.optns = xgb.options + + # xgbooster will also be needed + self.xgb = xgb + + self.verbose = self.optns.verb + self.oracle = Solver(name=self.xgb.options.solver) + + self.inps = [] # input (feature value) variables + for f in self.xgb.extended_feature_names_as_array_strings: + if '_' not in f: + self.inps.append(Symbol(f, typename=REAL)) + else: + self.inps.append(Symbol(f, typename=BOOL)) + + self.outs = [] # output (class score) variables + for c in range(self.nofcl): + self.outs.append(Symbol('class{0}_score'.format(c), typename=REAL)) + + # theory + self.oracle.add_assertion(formula) + + # current selector + self.selv = None + + def prepare(self, sample, expl): + """ + Prepare the oracle for validating an explanation given a sample. 
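+
+        After asserting only the hypotheses kept in the explanation, a
+        misclassification is requested by asking some other class score
+        to beat the predicted one; schematically (with outs and
+        true_output as in the code below):
+
+            from pysmt.shortcuts import GT, Or
+            disj = Or([GT(outs[i], outs[true_output])
+                       for i in range(len(outs)) if i != true_output])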
+ """ + + if self.selv: + # disable the previous assumption if any + self.oracle.add_assertion(Not(self.selv)) + + # creating a fresh selector for a new sample + sname = ','.join([str(v).strip() for v in sample]) + + # the samples should not repeat; otherwise, they will be + # inconsistent with the previously introduced selectors + assert sname not in self.idmgr.obj2id, 'this sample has been considered before (sample {0})'.format(self.idmgr.id(sname)) + self.selv = Symbol('sample{0}_selv'.format(self.idmgr.id(sname)), typename=BOOL) + + self.rhypos = [] # relaxed hypotheses + + # transformed sample + self.sample = list(self.xgb.transform(sample)[0]) + + # preparing the selectors + for i, (inp, val) in enumerate(zip(self.inps, self.sample), 1): + feat = inp.symbol_name().split('_')[0] + selv = Symbol('selv_{0}'.format(feat)) + val = float(val) + + self.rhypos.append(selv) + + # adding relaxed hypotheses to the oracle + for inp, val, sel in zip(self.inps, self.sample, self.rhypos): + if '_' not in inp.symbol_name(): + hypo = Implies(self.selv, Implies(sel, Equals(inp, Real(float(val))))) + else: + hypo = Implies(self.selv, Implies(sel, inp if val else Not(inp))) + + self.oracle.add_assertion(hypo) + + # propagating the true observation + if self.oracle.solve([self.selv] + self.rhypos): + model = self.oracle.get_model() + else: + assert 0, 'Formula is unsatisfiable under given assumptions' + + # choosing the maximum + outvals = [float(model.get_py_value(o)) for o in self.outs] + maxoval = max(zip(outvals, range(len(outvals)))) + + # correct class id (corresponds to the maximum computed) + true_output = maxoval[1] + + # forcing a misclassification, i.e. a wrong observation + disj = [] + for i in range(len(self.outs)): + if i != true_output: + disj.append(GT(self.outs[i], self.outs[true_output])) + self.oracle.add_assertion(Implies(self.selv, Or(disj))) + + # removing all hypotheses except for those in the explanation + hypos = [] + for i, hypo in enumerate(self.rhypos): + j = self.ftids[self.xgb.transform_inverse_by_index(i)[0]] + if j in expl: + hypos.append(hypo) + self.rhypos = hypos + + if self.verbose: + inpvals = self.xgb.readable_sample(sample) + + preamble = [] + for f, v in zip(self.xgb.feature_names, inpvals): + if f not in v: + preamble.append('{0} = {1}'.format(f, v)) + else: + preamble.append(v) + + print(' explanation for: "IF {0} THEN {1}"'.format(' AND '.join(preamble), self.xgb.target_name[true_output])) + + def validate(self, sample, expl): + """ + Make an effort to show that the explanation is too optimistic. 
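+
+        Returns None if no counterexample exists, and otherwise a pair
+        (inputs, predicted class); an illustrative call, assuming a
+        trained XGBooster instance 'booster' and its encoding 'enc':
+
+            v = SMTValidator(enc, booster.feature_names, booster.num_class, booster)
+            if v.validate(sample, expl) is None:
+                print('explanation is correct')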
+ """ + + self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime + + # adapt the solver to deal with the current sample + self.prepare(sample, expl) + + # if satisfiable, then there is a counterexample + if self.oracle.solve([self.selv] + self.rhypos): + model = self.oracle.get_model() + inpvals = [float(model.get_py_value(i)) for i in self.inps] + outvals = [float(model.get_py_value(o)) for o in self.outs] + maxoval = max(zip(outvals, range(len(outvals)))) + + inpvals = self.xgb.transform_inverse(np.array(inpvals))[0] + self.coex = tuple([inpvals, maxoval[1]]) + inpvals = self.xgb.readable_sample(inpvals) + + if self.verbose: + preamble = [] + for f, v in zip(self.xgb.feature_names, inpvals): + if f not in v: + preamble.append('{0} = {1}'.format(f, v)) + else: + preamble.append(v) + + print(' explanation is incorrect') + print(' counterexample: "IF {0} THEN {1}"'.format(' AND '.join(preamble), self.xgb.target_name[maxoval[1]])) + else: + self.coex = None + + if self.verbose: + print(' explanation is correct') + + self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.time + + if self.verbose: + print(' time: {0:.2f}'.format(self.time)) + + return self.coex diff --git a/pages/application/RandomForest/utils/xgbooster/xgbooster.py b/pages/application/RandomForest/utils/xgbooster/xgbooster.py new file mode 100644 index 0000000000000000000000000000000000000000..25cd86ce6de6ec5e5f0836344354afa7f7b87d26 --- /dev/null +++ b/pages/application/RandomForest/utils/xgbooster/xgbooster.py @@ -0,0 +1,445 @@ +#!/us/bin/env python +# -*- coding:utf-8 -*- +## +## xgbooster.py +## +## Created on: Dec 7, 2018 +## Author: Nina Narodytska, Alexey Ignatiev +## E-mail: narodytska@vmware.com, aignatiev@ciencias.ulisboa.pt +## + +# +# ============================================================================== +from __future__ import print_function +from .validate import SMTValidator +from .encode import SMTEncoder +from .explain import SMTExplainer +import numpy as np +import os +import resource +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score +import sklearn +# print('The scikit-learn version is {}.'.format(sklearn.__version__)) + +from sklearn.preprocessing import OneHotEncoder, LabelEncoder +import sys +from six.moves import range +from .tree import TreeEnsemble +import xgboost as xgb +from xgboost import XGBClassifier, Booster +import pickle + + +# +# ============================================================================== +class XGBooster(object): + """ + The main class to train/encode/explain XGBoost models. + """ + + def __init__(self, options, from_data=None, from_model=None, + from_encoding=None): + """ + Constructor. 
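+
+        Exactly one of the three sources is expected: from_data (train a
+        new model), from_model (a previously trained model), or
+        from_encoding (a saved SMT encoding). An illustrative workflow,
+        assuming an options object as used throughout this module:
+
+            booster = XGBooster(options, from_data=dataset)
+            booster.train()
+            booster.encode()
+            expl = booster.explain(sample)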
+ """ + + assert from_data or from_model or from_encoding, \ + 'At least one input file should be specified' + + self.init_stime = resource.getrusage(resource.RUSAGE_SELF).ru_utime + self.init_ctime = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + + # saving command-line options + self.options = options + self.seed = self.options.seed + np.random.seed(self.seed) + + if from_data: + self.use_categorical = self.options.use_categorical + # saving data + self.data = from_data + ## + samps = np.asarray(self.data.samps) + if not all(c.isnumeric() for c in samps[:, -1]): + le = LabelEncoder() + le.fit(samps[:, -1]) + samps[:, -1] = le.transform(samps[:, -1]) + # self.class_names = le.classes_ + # print(le.classes_) + ## + dataset = np.asarray(samps, dtype=np.float32) + # dataset = np.asarray(self.data.samps, dtype=np.float32) + + # split data into X and y + self.feature_names = self.data.names[:-1] + self.nb_features = len(self.feature_names) + + self.X = dataset[:, 0:self.nb_features] + self.Y = dataset[:, self.nb_features] + self.num_class = len(set(self.Y)) + self.target_name = list(range(self.num_class)) + + param_dist = {'n_estimators': self.options.n_estimators, + 'max_depth': self.options.maxdepth} + + if (self.num_class == 2): + param_dist['objective'] = 'binary:logistic' + + self.model = XGBClassifier(**param_dist) + + # split data into train and test sets + self.test_size = self.options.testsplit + if (self.test_size > 0): + self.X_train, self.X_test, self.Y_train, self.Y_test = \ + train_test_split(self.X, self.Y, test_size=self.test_size, + random_state=self.seed) + else: + self.X_train = self.X + self.X_test = [] # need a fix + self.Y_train = self.Y + self.Y_test = [] # need a fix + + # check if we have info about categorical features + if (self.use_categorical): + self.categorical_features = from_data.categorical_features + self.categorical_names = from_data.categorical_names + self.target_name = from_data.class_names + + #################################### + # this is a set of checks to make sure that we use the same as anchor encoding + cat_names = sorted(self.categorical_names.keys()) + assert (cat_names == self.categorical_features) + self.encoder = {} + for i in self.categorical_features: + self.encoder.update({i: OneHotEncoder(categories='auto', sparse=False)}) # , + self.encoder[i].fit(self.X[:, [i]]) + + else: + self.categorical_features = [] + self.categorical_names = [] + self.encoder = [] + + fname = from_data + + elif from_model: + fname = from_model + self.load_datainfo(from_model) + if (self.use_categorical is False) and (self.options.use_categorical is True): + print( + "Error: Note that the model is trained without categorical features info. 
Please do not use -c option for predictions") + exit() + # load model + + elif from_encoding: + fname = from_encoding + + # encoding, feature names, and number of classes + # are read from an input file + enc = SMTEncoder(None, None, None, self, from_encoding) + self.enc, self.intvs, self.imaps, self.ivars, self.feature_names, \ + self.num_class = enc.access() + + # create extra file names + try: + os.stat(options.output) + except: + os.mkdir(options.output) + + self.mapping_features() + ################# + self.test_encoding_transformes() + + bench_name = os.path.splitext(os.path.basename(options.files[0]))[0] + bench_dir_name = options.output + "/bt/" + bench_name + try: + os.stat(bench_dir_name) + except: + os.mkdir(bench_dir_name) + + self.basename = (os.path.join(bench_dir_name, bench_name + + "_nbestim_" + str(options.n_estimators) + + "_maxdepth_" + str(options.maxdepth) + + "_testsplit_" + str(options.testsplit))) + + data_suffix = '.splitdata.pkl' + self.modfile = self.basename + '.mod.pkl' + + self.mod_plainfile = self.basename + '.mod.txt' + + self.resfile = self.basename + '.res.txt' + self.encfile = self.basename + '.enc.txt' + self.expfile = self.basename + '.exp.txt' + + def load_datainfo(self, model_from_pkl, data_from_pkl): + self.model = XGBClassifier() + self.model = model_from_pkl + loaded_data = data_from_pkl + self.X = loaded_data["X"] + self.Y = loaded_data["Y"] + self.X_train = loaded_data["X_train"] + self.X_test = loaded_data["X_test"] + self.Y_train = loaded_data["Y_train"] + self.Y_test = loaded_data["Y_test"] + self.feature_names = loaded_data["feature_names"] + self.target_name = loaded_data["target_name"] + self.num_class = loaded_data["num_class"] + self.nb_features = len(self.feature_names) + self.categorical_features = loaded_data["categorical_features"] + self.categorical_names = loaded_data["categorical_names"] + self.encoder = loaded_data["encoder"] + self.use_categorical = loaded_data["use_categorical"] + + def train(self, outfile=None): + """ + Train a tree ensemble using XGBoost. + """ + + return self.build_xgbtree(outfile) + + def encode(self, test_on=None): + """ + Encode a tree ensemble trained previously. + """ + + encoder = SMTEncoder(self.model, self.feature_names, self.num_class, self) + self.enc, self.intvs, self.imaps, self.ivars = encoder.encode() + + if test_on: + encoder.test_sample(np.array(test_on)) + + # encoder.save_to(self.encfile) + + def explain(self, sample, use_lime=None, use_anchor=None, use_shap=None, + expl_ext=None, prefer_ext=False, nof_feats=5): + """ + Explain a prediction made for a given sample with a previously + trained tree ensemble. + """ + + if use_lime: + expl = use_lime(self, sample=sample, nb_samples=5, + nb_features_in_exp=nof_feats) + elif use_anchor: + expl = use_anchor(self, sample=sample, nb_samples=5, + nb_features_in_exp=nof_feats, threshold=0.95) + elif use_shap: + expl = use_shap(self, sample=sample, nb_features_in_exp=nof_feats) + else: + if 'x' not in dir(self): + self.x = SMTExplainer(self.enc, self.intvs, self.imaps, + self.ivars, self.feature_names, self.num_class, + self.options, self) + + expl = self.x.explain(np.array(sample), self.options.smallest, + expl_ext, prefer_ext) + + # returning the explanation + return expl + + def validate(self, sample, expl): + """ + Make an attempt to show that a given explanation is optimistic. 
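+
+        The encoding and the validator are built lazily on first use and
+        cached (hence the 'enc' / 'v' in dir(self) checks below); the
+        same pattern written with hasattr, for illustration:
+
+            if not hasattr(self, 'v'):
+                self.v = SMTValidator(self.enc, self.feature_names,
+                                      self.num_class, self)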
+ """ + + # there must exist an encoding + if 'enc' not in dir(self): + encoder = SMTEncoder(self.model, self.feature_names, self.num_class, + self) + self.enc, _, _, _ = encoder.encode() + + if 'v' not in dir(self): + self.v = SMTValidator(self.enc, self.feature_names, self.num_class, + self) + + # try to compute a counterexample + return self.v.validate(np.array(sample), expl) + + def transform(self, x): + if (len(x) == 0): + return x + if (len(x.shape) == 1): + x = np.expand_dims(x, axis=0) + if (self.use_categorical): + assert (self.encoder != []) + tx = [] + for i in range(self.nb_features): + self.encoder[i].drop = None + if (i in self.categorical_features): + tx_aux = self.encoder[i].transform(x[:, [i]]) + tx_aux = np.vstack(tx_aux) + tx.append(tx_aux) + else: + tx.append(x[:, [i]]) + tx = np.hstack(tx) + return tx + else: + return x + + def transform_inverse(self, x): + if (len(x) == 0): + return x + if (len(x.shape) == 1): + x = np.expand_dims(x, axis=0) + if (self.use_categorical): + assert (self.encoder != []) + inverse_x = [] + for i, xi in enumerate(x): + inverse_xi = np.zeros(self.nb_features) + for f in range(self.nb_features): + if f in self.categorical_features: + nb_values = len(self.categorical_names[f]) + v = xi[:nb_values] + v = np.expand_dims(v, axis=0) + iv = self.encoder[f].inverse_transform(v) + inverse_xi[f] = iv + xi = xi[nb_values:] + + else: + inverse_xi[f] = xi[0] + xi = xi[1:] + inverse_x.append(inverse_xi) + return inverse_x + else: + return x + + def transform_inverse_by_index(self, idx): + if (idx in self.extended_feature_names): + return self.extended_feature_names[idx] + else: + print("Warning there is no feature {} in the internal mapping".format(idx)) + return None + + def transform_by_value(self, feat_value_pair): + if (feat_value_pair in self.extended_feature_names.values()): + keys = ( + list(self.extended_feature_names.keys())[list(self.extended_feature_names.values()).index(feat_value_pair)]) + return keys + else: + print("Warning there is no value {} in the internal mapping".format(feat_value_pair)) + return None + + def mapping_features(self): + self.extended_feature_names = {} + self.extended_feature_names_as_array_strings = [] + counter = 0 + if (self.use_categorical): + for i in range(self.nb_features): + if (i in self.categorical_features): + for j, _ in enumerate(self.encoder[i].categories_[0]): + self.extended_feature_names.update({counter: (self.feature_names[i], j)}) + self.extended_feature_names_as_array_strings.append( + "f{}_{}".format(i, j)) # str(self.feature_names[i]), j)) + counter = counter + 1 + else: + self.extended_feature_names.update({counter: (self.feature_names[i], None)}) + self.extended_feature_names_as_array_strings.append("f{}".format(i)) # (self.feature_names[i]) + counter = counter + 1 + else: + for i in range(self.nb_features): + self.extended_feature_names.update({counter: (self.feature_names[i], None)}) + self.extended_feature_names_as_array_strings.append("f{}".format(i)) # (self.feature_names[i]) + counter = counter + 1 + + def readable_sample(self, x): + readable_x = [] + for i, v in enumerate(x): + if (i in self.categorical_features): + readable_x.append(self.categorical_names[i][int(v)]) + else: + readable_x.append(v) + return np.asarray(readable_x) + + def test_encoding_transformes(self): + # test encoding + + X = self.X_train[[0], :] + + print("Sample of length", len(X[0]), " : ", X) + enc_X = self.transform(X) + print("Encoded sample of length", len(enc_X[0]), " : ", enc_X) + inv_X = 
self.transform_inverse(enc_X)
+        print("Back to sample", inv_X)
+        print("Readable sample", self.readable_sample(inv_X[0]))
+        assert ((inv_X == X).all())
+
+        if (self.options.verb > 1):
+            for i in range(len(self.extended_feature_names)):
+                print(i, self.transform_inverse_by_index(i))
+            for key, value in self.extended_feature_names.items():
+                print(value, self.transform_by_value(value))
+
+    def transformed_sample_info(self, i):
+        print(self.encoder[i].categories_)
+
+    def build_xgbtree(self, outfile=None):
+        """
+        Build an ensemble of trees.
+        """
+
+        if (outfile is None):
+            outfile = self.modfile
+        else:
+            self.datafile = self.form_datefile_name(outfile)
+
+        # fit the model on training data
+        if (len(self.X_test) > 0):
+            eval_set = [(self.transform(self.X_train), self.Y_train), (self.transform(self.X_test), self.Y_test)]
+        else:
+            eval_set = [(self.transform(self.X_train), self.Y_train)]
+
+        print("start xgb")
+        self.model.fit(self.transform(self.X_train), self.Y_train,
+                       eval_set=eval_set,
+                       verbose=self.options.verb)
+        print("end xgb")
+
+        evals_result = self.model.evals_result()
+
+        # saving the model
+        self.save_datainfo(outfile)
+        print("saving plain model to ", self.mod_plainfile)
+        self.model._Booster.dump_model(self.mod_plainfile)
+
+        ensemble = TreeEnsemble(self.model, self.extended_feature_names_as_array_strings, nb_classes=self.num_class)
+
+        # sanity check: our tree traversal must reproduce xgboost's probabilities
+        y_pred_prob = self.model.predict_proba(self.transform(self.X_train[:10]))
+        y_pred_prob_compute = ensemble.predict(self.transform(self.X_train[:10]), self.num_class)
+
+        assert (np.absolute(y_pred_prob_compute - y_pred_prob).sum() < 0.01 * len(y_pred_prob))
+
+        # accuracy ('merror' is reported for multi-class, 'error' for binary)
+        try:
+            train_accuracy = round(1 - evals_result['validation_0']['merror'][-1], 2)
+        except KeyError:
+            try:
+                train_accuracy = round(1 - evals_result['validation_0']['error'][-1], 2)
+            except KeyError:
+                assert False, 'no accuracy info in evals_result'
+
+        try:
+            test_accuracy = round(1 - evals_result['validation_1']['merror'][-1], 2)
+        except KeyError:
+            try:
+                test_accuracy = round(1 - evals_result['validation_1']['error'][-1], 2)
+            except KeyError:
+                print("no results for test data")
+                test_accuracy = 0
+
+        # saving results
+        print("saving results to ", self.resfile)
+        with open(self.resfile, 'w') as f:
+            f.write("{} & {} & {} & {} & {} & {} \\\\ \n \\hline \n".format(
+                os.path.basename(self.options.files[0]).replace("_", "-"),
+                train_accuracy,
+                test_accuracy,
+                self.options.n_estimators,
+                self.options.maxdepth,
+                self.options.testsplit))
+
+        print("c BT sz:", ensemble.sz)
+        print("Train accuracy: %.2f%%" % (train_accuracy * 100.0))
+        print("Test accuracy: %.2f%%" % (test_accuracy * 100.0))
+
+        return train_accuracy, test_accuracy, self.model
diff --git a/pages/application/RandomForest/utils/xgbrf/__init__.py b/pages/application/RandomForest/utils/xgbrf/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cf92d3ca20939258ac326649b20fc5c0a79abb7
--- /dev/null
+++ b/pages/application/RandomForest/utils/xgbrf/__init__.py
@@ -0,0 +1,4 @@
+from .encode import *
+from .tree import *
+from .xgb_rf import *
+from .preprocess import *
\ No newline at end of file
diff --git a/pages/application/RandomForest/utils/xgbrf/encode.py b/pages/application/RandomForest/utils/xgbrf/encode.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a77fb3afb792f0cd15276c11e03b4b4005f5109
--- /dev/null
+++ b/pages/application/RandomForest/utils/xgbrf/encode.py
@@ -0,0 +1,363 @@
+#!/usr/bin/env python
+#-*- coding:utf-8 -*-
+##
+## encode.py
+##
+## Created on: Dec 7, 2018
+## Author: Alexey
Ignatiev +## E-mail: aignatiev@ciencias.ulisboa.pt +## + +# +#============================================================================== +from __future__ import print_function +import collections +from pysat.formula import IDPool +from pysmt.smtlib.parser import SmtLibParser +from pysmt.shortcuts import And, BOOL, Iff, Implies, Not, Or, Symbol, get_model +from pysmt.shortcuts import Equals, ExactlyOne, LT, Plus, REAL, Real, write_smtlib +from .tree import TreeEnsemble, scores_tree +import six +from six.moves import range + +try: # for Python2 + from cStringIO import StringIO +except ImportError: # for Python3 + from io import StringIO + + +# +#============================================================================== +class SMTEncoder(object): + """ + Encoder of XGBoost tree ensembles into SMT. + """ + + def __init__(self, model, feats, nof_classes, xgb, from_file=None): + """ + Constructor. + """ + + self.model = model + self.feats = {f: i for i, f in enumerate(feats)} + self.nofcl = nof_classes + self.idmgr = IDPool() + self.optns = xgb.options + + # xgbooster will also be needed + self.xgb = xgb + + # for interval-based encoding + self.intvs, self.imaps, self.ivars = None, None, None + + if from_file: + self.load_from(from_file) + + def traverse(self, tree, tvar, prefix=[]): + """ + Traverse a tree and encode each node. + """ + + if tree.children: + pos, neg = self.encode_node(tree) + + self.traverse(tree.children[0], tvar, prefix + [pos]) + self.traverse(tree.children[1], tvar, prefix + [neg]) + else: # leaf node + if prefix: + self.enc.append(Implies(And(prefix), Equals(tvar, Real(tree.values)))) + else: + self.enc.append(Equals(tvar, Real(tree.values))) + + def encode_node(self, node): + """ + Encode a node of a tree. + """ + + if '_' not in node.name: + # continuous features => expecting an upper bound + # feature and its upper bound (value) + f, v = node.name, node.threshold + + existing = True if tuple([f, v]) in self.idmgr.obj2id else False + vid = self.idmgr.id(tuple([f, v])) + bv = Symbol('bvar{0}'.format(vid), typename=BOOL) + + if not existing: + if self.intvs: + d = self.imaps[f][v] + 1 + pos, neg = self.ivars[f][:d], self.ivars[f][d:] + self.enc.append(Iff(bv, Or(pos))) + self.enc.append(Iff(Not(bv), Or(neg))) + else: + fvar, fval = Symbol(f, typename=REAL), Real(v) + self.enc.append(Iff(bv, LT(fvar, fval))) + + return bv, Not(bv) + else: + # all features are expected to be categorical and + # encoded with one-hot encoding into Booleans + # each node is expected to be of the form: f_i < 0.5 + bv = Symbol(node.name, typename=BOOL) + + # left branch is positive, i.e. bv is true + # right branch is negative, i.e. bv is false + return Not(bv), bv + + def compute_intervals(self): + """ + Traverse all trees in the ensemble and extract intervals for each + feature. + + At this point, the method only works for numerical datasets! + """ + + def traverse_intervals(tree): + """ + Auxiliary function. Recursive tree traversal. 
+ """ + + if tree.children: + f = tree.name + v = tree.threshold + self.intvs[f].add(v) + + traverse_intervals(tree.children[0]) + traverse_intervals(tree.children[1]) + + # initializing the intervals + self.intvs = {'f{0}'.format(i): set([]) for i in range(len(self.feats))} + + for tree in self.ensemble.trees: + traverse_intervals(tree) + + # OK, we got all intervals; let's sort the values + self.intvs = {f: sorted(self.intvs[f]) + ['+'] for f in six.iterkeys(self.intvs)} + + self.imaps, self.ivars = {}, {} + for feat, intvs in six.iteritems(self.intvs): + self.imaps[feat] = {} + self.ivars[feat] = [] + for i, ub in enumerate(intvs): + self.imaps[feat][ub] = i + + ivar = Symbol(name='{0}_intv{1}'.format(feat, i), typename=BOOL) + self.ivars[feat].append(ivar) + + def encode(self): + """ + Do the job. + """ + + self.enc = [] + + # getting a tree ensemble + self.ensemble = TreeEnsemble(self.model, + self.xgb.extended_feature_names_as_array_strings, + nb_classes=self.nofcl) + + # introducing class score variables + csum = [] + for j in range(self.nofcl): + cvar = Symbol('class{0}_score'.format(j), typename=REAL) + csum.append(tuple([cvar, []])) + + # if targeting interval-based encoding, + # traverse all trees and extract all possible intervals + # for each feature + if self.optns.encode == 'smtbool': + self.compute_intervals() + + # traversing and encoding each tree + for i, tree in enumerate(self.ensemble.trees): + # getting class id + clid = i % self.nofcl + + # encoding the tree + tvar = Symbol('tr{0}_score'.format(i + 1), typename=REAL) + self.traverse(tree, tvar, prefix=[]) + + # this tree contributes to class with clid + csum[clid][1].append(tvar) + + # encoding the sums + for pair in csum: + cvar, tvars = pair + self.enc.append(Equals(cvar, Plus(tvars))) + + # enforce exactly one of the feature values to be chosen + # (for categorical features) + categories = collections.defaultdict(lambda: []) + for f in self.xgb.extended_feature_names_as_array_strings: + if '_' in f: + categories[f.split('_')[0]].append(Symbol(name=f, typename=BOOL)) + for c, feats in six.iteritems(categories): + self.enc.append(ExactlyOne(feats)) + + # number of assertions + nof_asserts = len(self.enc) + + # making conjunction + self.enc = And(self.enc) + + # number of variables + nof_vars = len(self.enc.get_free_variables()) + + if self.optns.verb: + print('encoding vars:', nof_vars) + print('encoding asserts:', nof_asserts) + + return self.enc, self.intvs, self.imaps, self.ivars + + def test_sample(self, sample): + """ + Check whether or not the encoding "predicts" the same class + as the classifier given an input sample. 
+ """ + + # first, compute the scores for all classes as would be + # predicted by the classifier + + # score arrays computed for each class + csum = [[] for c in range(self.nofcl)] + + if self.optns.verb: + print('testing sample:', list(sample)) + + sample_internal = list(self.xgb.transform(sample)[0]) + + # traversing all trees + for i, tree in enumerate(self.ensemble.trees): + # getting class id + clid = i % self.nofcl + + # a score computed by the current tree + score = scores_tree(tree, sample_internal) + + # this tree contributes to class with clid + csum[clid].append(score) + + # final scores for each class + cscores = [sum(scores) for scores in csum] + + # second, get the scores computed with the use of the encoding + + # asserting the sample + hypos = [] + + if not self.intvs: + for i, fval in enumerate(sample_internal): + feat, vid = self.xgb.transform_inverse_by_index(i) + fid = self.feats[feat] + + if vid == None: + fvar = Symbol('f{0}'.format(fid), typename=REAL) + hypos.append(Equals(fvar, Real(float(fval)))) + else: + fvar = Symbol('f{0}_{1}'.format(fid, vid), typename=BOOL) + if int(fval) == 1: + hypos.append(fvar) + else: + hypos.append(Not(fvar)) + else: + for i, fval in enumerate(sample_internal): + feat, _ = self.xgb.transform_inverse_by_index(i) + feat = 'f{0}'.format(self.feats[feat]) + + # determining the right interval and the corresponding variable + for ub, fvar in zip(self.intvs[feat], self.ivars[feat]): + if ub == '+' or fval < ub: + hypos.append(fvar) + break + else: + assert 0, 'No proper interval found for {0}'.format(feat) + + # now, getting the model + escores = [] + model = get_model(And(self.enc, *hypos), solver_name=self.optns.solver) + for c in range(self.nofcl): + v = Symbol('class{0}_score'.format(c), typename=REAL) + escores.append(float(model.get_py_value(v))) + + assert all(map(lambda c, e: abs(c - e) <= 0.001, cscores, escores)), \ + 'wrong prediction: {0} vs {1}'.format(cscores, escores) + + if self.optns.verb: + print('xgb scores:', cscores) + print('enc scores:', escores) + + def save_to(self, outfile): + """ + Save the encoding into a file with a given name. + """ + + if outfile.endswith('.txt'): + outfile = outfile[:-3] + 'smt2' + + write_smtlib(self.enc, outfile) + + # appending additional information + with open(outfile, 'r') as fp: + contents = fp.readlines() + + # comments + comments = ['; features: {0}\n'.format(', '.join(self.feats)), + '; classes: {0}\n'.format(self.nofcl)] + + if self.intvs: + for f in self.xgb.extended_feature_names_as_array_strings: + c = '; i {0}: '.format(f) + c += ', '.join(['{0}<->{1}'.format(u, v) for u, v in zip(self.intvs[f], self.ivars[f])]) + comments.append(c + '\n') + + contents = comments + contents + with open(outfile, 'w') as fp: + fp.writelines(contents) + + def load_from(self, infile): + """ + Loads the encoding from an input file. 
+ """ + + with open(infile, 'r') as fp: + file_content = fp.readlines() + + # empty intervals for the standard encoding + self.intvs, self.imaps, self.ivars = {}, {}, {} + + for line in file_content: + if line[0] != ';': + break + elif line.startswith('; i '): + f, arr = line[4:].strip().split(': ', 1) + f = f.replace('-', '_') + self.intvs[f], self.imaps[f], self.ivars[f] = [], {}, [] + + for i, pair in enumerate(arr.split(', ')): + ub, symb = pair.split('<->') + + if ub[0] != '+': + ub = float(ub) + symb = Symbol(symb, typename=BOOL) + + self.intvs[f].append(ub) + self.ivars[f].append(symb) + self.imaps[f][ub] = i + + elif line.startswith('; features:'): + self.feats = line[11:].strip().split(', ') + elif line.startswith('; classes:'): + self.nofcl = int(line[10:].strip()) + + parser = SmtLibParser() + script = parser.get_script(StringIO(''.join(file_content))) + + self.enc = script.get_last_formula() + + def access(self): + """ + Get access to the encoding, features names, and the number of + classes. + """ + + return self.enc, self.intvs, self.imaps, self.ivars, self.feats, self.nofcl diff --git a/pages/application/RandomForest/utils/xgbrf/explain.py b/pages/application/RandomForest/utils/xgbrf/explain.py new file mode 100644 index 0000000000000000000000000000000000000000..7487f0eb683ad30b5c19546a5eb5d8db9009b28e --- /dev/null +++ b/pages/application/RandomForest/utils/xgbrf/explain.py @@ -0,0 +1,312 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## explain.py +## +## Created on: Dec 14, 2018 +## Author: Alexey Ignatiev +## E-mail: aignatiev@ciencias.ulisboa.pt +## + +# +#============================================================================== +from __future__ import print_function +import numpy as np +import os +from pysat.examples.hitman import Hitman +from pysat.formula import IDPool +from pysmt.shortcuts import Solver +from pysmt.shortcuts import And, BOOL, Implies, Not, Or, Symbol +from pysmt.shortcuts import Equals, GT, Int, Real, REAL +import resource +from six.moves import range +import sys + + +# +#============================================================================== +class SMTExplainer(object): + """ + An SMT-inspired minimal explanation extractor for XGBoost models. + """ + + def __init__(self, formula, intvs, imaps, ivars, feats, nof_classes, + options, xgb): + """ + Constructor. + """ + + self.feats = feats + self.intvs = intvs + self.imaps = imaps + self.ivars = ivars + self.nofcl = nof_classes + self.optns = options + self.idmgr = IDPool() + + # saving XGBooster + self.xgb = xgb + + self.verbose = self.optns.verb + self.oracle = Solver(name=options.solver) + + self.inps = [] # input (feature value) variables + for f in self.xgb.extended_feature_names_as_array_strings: + if '_' not in f: + self.inps.append(Symbol(f, typename=REAL)) + else: + self.inps.append(Symbol(f, typename=BOOL)) + + self.outs = [] # output (class score) variables + for c in range(self.nofcl): + self.outs.append(Symbol('class{0}_score'.format(c), typename=REAL)) + + # theory + self.oracle.add_assertion(formula) + + # current selector + self.selv = None + + def prepare(self, sample): + """ + Prepare the oracle for computing an explanation. 
+ """ + + if self.selv: + # disable the previous assumption if any + self.oracle.add_assertion(Not(self.selv)) + + # creating a fresh selector for a new sample + sname = ','.join([str(v).strip() for v in sample]) + + # the samples should not repeat; otherwise, they will be + # inconsistent with the previously introduced selectors + assert sname not in self.idmgr.obj2id, 'this sample has been considered before (sample {0})'.format(self.idmgr.id(sname)) + self.selv = Symbol('sample{0}_selv'.format(self.idmgr.id(sname)), typename=BOOL) + + self.rhypos = [] # relaxed hypotheses + + # transformed sample + self.sample = list(self.xgb.transform(sample)[0]) + + self.sel2fid = {} # selectors to original feature ids + self.sel2vid = {} # selectors to categorical feature ids + + # preparing the selectors + for i, (inp, val) in enumerate(zip(self.inps, self.sample), 1): + feat = inp.symbol_name().split('_')[0] + selv = Symbol('selv_{0}'.format(feat)) + val = float(val) + + self.rhypos.append(selv) + if selv not in self.sel2fid: + self.sel2fid[selv] = int(feat[1:]) + self.sel2vid[selv] = [i - 1] + else: + self.sel2vid[selv].append(i - 1) + + # adding relaxed hypotheses to the oracle + if not self.intvs: + for inp, val, sel in zip(self.inps, self.sample, self.rhypos): + if '_' not in inp.symbol_name(): + hypo = Implies(self.selv, Implies(sel, Equals(inp, Real(float(val))))) + else: + hypo = Implies(self.selv, Implies(sel, inp if val else Not(inp))) + + self.oracle.add_assertion(hypo) + else: + for inp, val, sel in zip(self.inps, self.sample, self.rhypos): + inp = inp.symbol_name() + # determining the right interval and the corresponding variable + for ub, fvar in zip(self.intvs[inp], self.ivars[inp]): + if ub == '+' or val < ub: + hypo = Implies(self.selv, Implies(sel, fvar)) + break + + self.oracle.add_assertion(hypo) + + # in case of categorical data, there are selector duplicates + # and we need to remove them + self.rhypos = sorted(set(self.rhypos), key=lambda x: int(x.symbol_name()[6:])) + + # propagating the true observation + if self.oracle.solve([self.selv] + self.rhypos): + model = self.oracle.get_model() + else: + assert 0, 'Formula is unsatisfiable under given assumptions' + + # choosing the maximum + outvals = [float(model.get_py_value(o)) for o in self.outs] + maxoval = max(zip(outvals, range(len(outvals)))) + + # correct class id (corresponds to the maximum computed) + self.out_id = maxoval[1] + self.output = self.xgb.target_name[self.out_id] + + # forcing a misclassification, i.e. a wrong observation + disj = [] + for i in range(len(self.outs)): + if i != self.out_id: + disj.append(GT(self.outs[i], self.outs[self.out_id])) + self.oracle.add_assertion(Implies(self.selv, Or(disj))) + + if self.verbose: + inpvals = self.xgb.readable_sample(sample) + + self.preamble = [] + for f, v in zip(self.xgb.feature_names, inpvals): + if f not in v: + self.preamble.append('{0} = {1}'.format(f, v)) + else: + self.preamble.append(v) + + print('\n explaining: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.output)) + + def explain(self, sample, smallest, expl_ext=None, prefer_ext=False): + """ + Hypotheses minimization. 
+ """ + + self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime + + # adapt the solver to deal with the current sample + self.prepare(sample) + + # saving external explanation to be minimized further + if expl_ext == None or prefer_ext: + self.to_consider = [True for h in self.rhypos] + else: + eexpl = set(expl_ext) + self.to_consider = [True if i in eexpl else False for i, h in enumerate(self.rhypos)] + + # if satisfiable, then the observation is not implied by the hypotheses + if self.oracle.solve([self.selv] + [h for h, c in zip(self.rhypos, self.to_consider) if c]): + print(' no implication!') + print(self.oracle.get_model()) + sys.exit(1) + + if not smallest: + self.compute_minimal(prefer_ext=prefer_ext) + else: + self.compute_smallest() + + self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.time + + expl = sorted([self.sel2fid[h] for h in self.rhypos]) + #print('expl >>>> : ', expl) + + if self.verbose: + self.preamble = [self.preamble[i] for i in expl] + print(' explanation: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.xgb.target_name[self.out_id])) + print(' # hypos left:', len(self.rhypos)) + print(' time: {0:.2f}'.format(self.time)) + + return expl + + def compute_minimal(self, prefer_ext=False): + """ + Compute any subset-minimal explanation. + """ + + i = 0 + + if not prefer_ext: + # here, we want to reduce external explanation + + # filtering out unnecessary features if external explanation is given + self.rhypos = [h for h, c in zip(self.rhypos, self.to_consider) if c] + else: + # here, we want to compute an explanation that is preferred + # to be similar to the given external one + # for that, we try to postpone removing features that are + # in the external explanation provided + + rhypos = [h for h, c in zip(self.rhypos, self.to_consider) if not c] + rhypos += [h for h, c in zip(self.rhypos, self.to_consider) if c] + self.rhypos = rhypos + + # simple deletion-based linear search + while i < len(self.rhypos): + to_test = self.rhypos[:i] + self.rhypos[(i + 1):] + + if self.oracle.solve([self.selv] + to_test): + i += 1 + else: + self.rhypos = to_test + + def compute_smallest(self): + """ + Compute a cardinality-minimal explanation. 
+ """ + + # result + rhypos = [] + + with Hitman(bootstrap_with=[[i for i in range(len(self.rhypos)) if self.to_consider[i]]]) as hitman: + # computing unit-size MCSes + for i, hypo in enumerate(self.rhypos): + if self.to_consider[i] == False: + continue + + if self.oracle.solve([self.selv] + self.rhypos[:i] + self.rhypos[(i + 1):]): + hitman.hit([i]) + + # main loop + iters = 0 + while True: + hset = hitman.get() + iters += 1 + + if self.verbose > 1: + print('iter:', iters) + print('cand:', hset) + + if self.oracle.solve([self.selv] + [self.rhypos[i] for i in hset]): + to_hit = [] + satisfied, unsatisfied = [], [] + + removed = list(set(range(len(self.rhypos))).difference(set(hset))) + + model = self.oracle.get_model() + for h in removed: + i = self.sel2fid[self.rhypos[h]] + if '_' not in self.inps[i].symbol_name(): + # feature variable and its expected value + var, exp = self.inps[i], self.sample[i] + + # true value + true_val = float(model.get_py_value(var)) + + if not exp - 0.001 <= true_val <= exp + 0.001: + unsatisfied.append(h) + else: + hset.append(h) + else: + for vid in self.sel2vid[self.rhypos[h]]: + var, exp = self.inps[vid], int(self.sample[vid]) + + # true value + true_val = int(model.get_py_value(var)) + + if exp != true_val: + unsatisfied.append(h) + break + else: + hset.append(h) + + # computing an MCS (expensive) + for h in unsatisfied: + if self.oracle.solve([self.selv] + [self.rhypos[i] for i in hset] + [self.rhypos[h]]): + hset.append(h) + else: + to_hit.append(h) + + if self.verbose > 1: + print('coex:', to_hit) + + hitman.hit(to_hit) + else: + self.rhypos = [self.rhypos[i] for i in hset] + break diff --git a/pages/application/RandomForest/utils/xgbrf/pi_checker.py b/pages/application/RandomForest/utils/xgbrf/pi_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..f3f0d33cbd6118a1d746ded0a478f6b4ba4a0d90 --- /dev/null +++ b/pages/application/RandomForest/utils/xgbrf/pi_checker.py @@ -0,0 +1,197 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## pi_checker.py +## +## Created on: +## Author: +## E-mail: +## + +# +#============================================================================== +from __future__ import print_function +import getopt +import numpy as np +import os +from pysat.formula import IDPool +from pysmt.shortcuts import Solver +from pysmt.shortcuts import And, BOOL, Implies, Not, Or, Symbol +from pysmt.shortcuts import Equals, GE, GT, LE, LT, Real, REAL +import resource +from six.moves import range +import sys + + +# +#============================================================================== +class SMTChecker(object): + """ + checking explanation if is a Prime Implicant using SMT solving. + """ + + def __init__(self, formula, feats, nof_classes, xgb): + """ + Constructor. 
+ """ + + self.ftids = {f: i for i, f in enumerate(feats)} + self.nofcl = nof_classes + self.idmgr = IDPool() + self.optns = xgb.options + + # xgbooster will also be needed + self.xgb = xgb + + self.verbose = self.optns.verb + self.oracle = Solver(name=self.xgb.options.solver) + + self.inps = [] # input (feature value) variables + for f in self.xgb.extended_feature_names_as_array_strings: + if '_' not in f: + self.inps.append(Symbol(f, typename=REAL)) + else: + self.inps.append(Symbol(f, typename=BOOL)) + + self.outs = [] # output (class score) variables + for c in range(self.nofcl): + self.outs.append(Symbol('class{0}_score'.format(c), typename=REAL)) + + # theory + self.oracle.add_assertion(formula) + #print('+++++ ',len(self.oracle._assertion_stack)) + + # current selector + self.selv = None + + def prepare(self, sample, expl): + """ + Prepare the oracle for validating an explanation given a sample. + """ + + if self.selv: + # disable the previous assumption if any + self.oracle.add_assertion(Not(self.selv)) + + # creating a fresh selector for a new sample + sname = ','.join([str(v).strip() for v in sample]) + + # the samples should not repeat; otherwise, they will be + # inconsistent with the previously introduced selectors + assert sname not in self.idmgr.obj2id, 'this sample has been considered before (sample {0})'.format(self.idmgr.id(sname)) + self.selv = Symbol('sample{0}_selv'.format(self.idmgr.id(sname)), typename=BOOL) + + self.rhypos = [] # relaxed hypotheses + + # transformed sample + self.sample = list(self.xgb.transform(sample)[0]) + + # preparing the selectors + for i, (inp, val) in enumerate(zip(self.inps, self.sample), 1): + feat = inp.symbol_name().split('_')[0] + selv = Symbol('selv_{0}'.format(feat)) + val = float(val) + + self.rhypos.append(selv) + + + # adding relaxed hypotheses to the oracle + for inp, val, sel in zip(self.inps, self.sample, self.rhypos): + if '_' not in inp.symbol_name(): + hypo = Implies(self.selv, Implies(sel, Equals(inp, Real(float(val))))) + else: + hypo = Implies(self.selv, Implies(sel, inp if val else Not(inp))) + + self.oracle.add_assertion(hypo) + + # propagating the true observation + if self.oracle.solve([self.selv] + self.rhypos): + model = self.oracle.get_model() + else: + assert 0, 'Formula is unsatisfiable under given assumptions' + + # choosing the maximum + outvals = [float(model.get_py_value(o)) for o in self.outs] + maxoval = max(zip(outvals, range(len(outvals)))) + + # correct class id (corresponds to the maximum computed) + true_output = maxoval[1] + + # forcing a misclassification, i.e. 
a wrong observation + disj = [] + for i in range(len(self.outs)): + if i != true_output: + disj.append(GT(self.outs[i], self.outs[true_output])) + self.oracle.add_assertion(Implies(self.selv, Or(disj))) + + # removing all hypotheses except for those in the explanation + hypos = [] + for i, hypo in enumerate(self.rhypos): + j = self.ftids[self.xgb.transform_inverse_by_index(i)[0]] + if j in expl: + hypos.append(hypo) + self.rhypos = hypos + #print('assumps: ', self.rhypos) + #print('expl: ', expl) + + ''' + if self.verbose: + inpvals = self.xgb.readable_sample(sample) + + preamble = [] + for f, v in zip(self.xgb.feature_names, inpvals): + if f not in v: + preamble.append('{0} = {1}'.format(f, v)) + else: + preamble.append(v) + + print(' explanation for: "IF {0} THEN {1}"'.format(' AND '.join(preamble), self.xgb.target_name[true_output])) + ''' + #print('+++++ ',self.oracle._assertion_stack[len(self.oracle._assertion_stack)-1 : ]) + + def check(self, sample, expl): + """ + Check the explanation. + """ + + self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime + + # adapt the solver to deal with the current sample + self.prepare(sample, expl) + + # if satisfiable, then there is a counterexample + if self.oracle.solve([self.selv] + self.rhypos): + print('\n explanation is incorrect') + #print(self.oracle.get_model()) + return False + else: + if self.verbose: + print('\n explanation is correct') + + # in case of categorical data, there are selector duplicates + # and we need to remove them + self.rhypos = sorted(set(self.rhypos), key=lambda x: int(x.symbol_name()[6:])) + #print(self.rhypos) + + i = 0 + # simple deletion-based linear search + while i < len(self.rhypos): + to_test = self.rhypos[:i] + self.rhypos[(i + 1):] + #print(self.rhypos[i]) + + if self.oracle.solve([self.selv] + to_test): + i += 1 + else: + print(' explanation is not a prime implicant') + return False + + + print(' explanation is a prime implicant') + self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.time + + if self.verbose: + print(' time: {0:.2f}'.format(self.time)) + + return True diff --git a/pages/application/RandomForest/utils/xgbrf/preprocess.py b/pages/application/RandomForest/utils/xgbrf/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..cdcd2cb3976c8764a21b512d2a5c80a073b739f1 --- /dev/null +++ b/pages/application/RandomForest/utils/xgbrf/preprocess.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## preprocess.py +## +## Created on: Jan 10, 2019 +## Author: Nina Narodytska +## E-mail: narodytska@vmware.com +## + +# +#============================================================================== +import json +import numpy as np +import xgboost as xgb +import math +import pandas as pd +import numpy as np +import sklearn +import pickle + + +# +#============================================================================== +def preprocess_dataset(raw_data_path, files): + print("preprocess dataset from ", raw_data_path) + files = files.split(",") + + data_file = files[0] + dataset_name = files[1] + + try: + data_raw = pd.read_csv(raw_data_path + data_file, sep=',', na_values= ['']) + catcols = pd.read_csv(raw_data_path + data_file + ".catcol", header = None) + categorical_features = np.concatenate(catcols.values).tolist() + + + for i in range(len(data_raw.values[0])): + if i in categorical_features: + 
data_raw.fillna('',inplace=True) + else: + data_raw.fillna(0,inplace=True) + dataset_all = data_raw + dataset = dataset_all.values.copy() + + print(categorical_features) + except Exception as e: + print("Please provide info about categorical columns/original datasets or omit option -p", e) + exit() + + # move categrorical columns forward + + feature_names = dataset_all.columns + print(feature_names) + + ############################## + extra_info = {} + categorical_names = {} + print(categorical_features) + dataset_new = dataset_all.values.copy() + for feature in categorical_features: + print("feature", feature) + print(dataset[:, feature]) + le = sklearn.preprocessing.LabelEncoder() + le.fit(dataset[:, feature]) + categorical_names[feature] = le.classes_ + dataset_new[:, feature] = le.transform(dataset[:, feature]) + + ###################################3 + # target as categorical + labels_new = [] + + le = sklearn.preprocessing.LabelEncoder() + le.fit(dataset[:, -1]) + dataset_new[:, -1]= le.transform(dataset[:, -1]) + class_names = le.classes_ + ######################################33 + + + if (False): + dataset_new = np.delete(dataset_new, -1, axis=1) + oneencoder = sklearn.preprocessing.OneHotEncoder() + oneencoder.fit(dataset_new[:, categorical_features]) + print(oneencoder.categories_) + n_transformed_features = sum([len(cats) for cats in oneencoder.categories_]) + print(n_transformed_features) + print(dataset_new.shape) + X = dataset_new[:,categorical_features][0] + print(X) + x = np.expand_dims(X, axis=0) + print("x", x, x.shape) + y = dataset_new[0].copy() + print(y.shape, oneencoder.transform(x).shape) + y[categorical_features] = oneencoder.transform(x).toarray() + + print("y", y, y.shape) + + z = oneencoder.inverse_transform(y) + print(z.shape) + exit() + + ###########################################################################3 + extra_info = {"categorical_features": categorical_features, + "categorical_names": categorical_names, + "feature_names": feature_names, + "class_names": class_names} + + new_file_train = raw_data_path + dataset_name + '_data.csv' + df = pd.DataFrame(data=dataset_new) + df.columns = list(feature_names) + df.to_csv(new_file_train, mode = 'w', index=False) + print("new dataset", new_file_train) + + + f = open(raw_data_path + dataset_name + '_data.csv.pkl', "wb") + pickle.dump(extra_info, f) + f.close() diff --git a/pages/application/RandomForest/utils/xgbrf/tree.py b/pages/application/RandomForest/utils/xgbrf/tree.py new file mode 100644 index 0000000000000000000000000000000000000000..afe34d97e331057f9a5b7c0ccef63e85801a572d --- /dev/null +++ b/pages/application/RandomForest/utils/xgbrf/tree.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## tree.py (reuses parts of the code of SHAP) +## +## Created on: Dec 7, 2018 +## Author: Nina Narodytska +## E-mail: narodytska@vmware.com +## + +# +#============================================================================== +from anytree import Node, RenderTree,AsciiStyle +import json +import numpy as np +import xgboost as xgb +import math + + +# +#============================================================================== +class xgnode(Node): + def __init__(self, id, parent = None): + Node.__init__(self, id, parent) + self.id = id # The node value + self.name = None + self.left_node_id = -1 # Left child + self.right_node_id = -1 # Right child + self.missing_node_id = -1 + + self.feature = -1 + self.threshold = -1 + + self.cover = -1 + self.values = -1 + + def __str__(self): + pref 
= ' ' * self.depth + if (len(self.children) == 0): + return (pref+ "leaf: {} {}".format(self.id, self.values)) + else: + if(self.name is None): + return (pref+ "{} f{}<{}".format(self.id, self.feature, self.threshold)) + else: + return (pref+ "{} \"{}\"<{}".format(self.id, self.name, self.threshold)) + + +# +#============================================================================== +def build_tree(json_tree, node = None, feature_names = None, inverse = False): + def max_id(node): + if "children" in node: + return max(node["nodeid"], *[max_id(n) for n in node["children"]]) + else: + return node["nodeid"] + m = max_id(json_tree) + 1 + def extract_data(json_node, root = None, feature_names = None): + i = json_node["nodeid"] + if (root is None): + node = xgnode(i) + else: + node = xgnode(i, parent = root) + node.cover = json_node["cover"] + if "children" in json_node: + + node.left_node_id = json_node["yes"] + node.right_node_id = json_node["no"] + node.missing_node_id = json_node["missing"] + node.feature = json_node["split"] + if (feature_names is not None): + node.name = feature_names[node.feature] + node.threshold = json_node["split_condition"] + for c, n in enumerate(json_node["children"]): + child = extract_data(n, node, feature_names) + elif "leaf" in json_node: + node.values = json_node["leaf"] + if(inverse): + node.values = -node.values + return node + + root = extract_data(json_tree, None, feature_names) + return root + + +# +#============================================================================== +def walk_tree(node): + if (len(node.children) == 0): + # leaf + print(node) + else: + print(node) + walk_tree(node.children[0]) + walk_tree(node.children[1]) + + +# +#============================================================================== +def scores_tree(node, sample): + if (len(node.children) == 0): + # leaf + return node.values + else: + feature_branch = node.feature + sample_value = sample[feature_branch] + assert(sample_value is not None) + if(sample_value < node.threshold): + return scores_tree(node.children[0], sample) + else: + return scores_tree(node.children[1], sample) + + +# +#============================================================================== +class TreeEnsemble: + """ An ensemble of decision trees. + + This object provides a common interface to many different types of models. 
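+
+    For the multi-class case, predict() below turns the per-class sums
+    of leaf values into a distribution via a softmax.  A minimal sketch
+    of that step with hypothetical scores (two classes, two trees per
+    class, grouped consecutively):
+
+        import numpy as np
+
+        scores = np.array([0.2, 0.1, -0.3, 0.4])
+        sums = np.exp([scores[0:2].sum(), scores[2:4].sum()])
+        prob = sums / sums.sum()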
+ """ + def __init__(self, model, feature_names = None, nb_classes = 0): + self.model_type = "xgboost" + #self.original_model = model.get_booster() + self.original_model = model + #### + self.base_offset = None + json_trees = get_xgboost_json(self.original_model) + self.trees = [build_tree(json.loads(t), None, feature_names) for t in json_trees] + if(nb_classes == 2): + # NASTY trick for binary + # We change signs of values in leaves so that we can just sum all the values in leaves for class X + # and take max to get the right class + self.otrees = [build_tree(json.loads(t), None, feature_names, inverse = True) for t in json_trees] + self.itrees = [build_tree(json.loads(t), None, feature_names) for t in json_trees] + self.trees = [] + for i,_ in enumerate(self.otrees): + self.trees.append(self.otrees[i]) + self.trees.append(self.itrees[i]) + self.feature_names = feature_names + def print_tree(self): + for i,t in enumerate(self.trees): + print("tree number: ", i) + walk_tree(t) + + def invert_tree_prob(self, node): + if (len(node.children) == 0): + node.values = -node.values + return node + else: + self.invert_tree_prob(node.children[0]) + self.invert_tree_prob(node.children[1]) + return node + def predict(self, samples, nb_classes): + # https://github.com/dmlc/xgboost/issues/1746#issuecomment-290130695 + prob = [] + nb_estimators = int(len(self.trees)/nb_classes) + for sample in np.asarray(samples): + scores = [] + for i,t in enumerate(self.trees): + s = scores_tree(t, sample) + scores.append((s)) + scores = np.asarray(scores) + class_scores = [] + if (nb_classes == 2): + + for i in range(nb_classes): + class_scores.append(math.exp(-(scores[i::nb_classes]).sum())) # swap signs back as we had to use this trick in the contractor + s0 = class_scores[0] + s1 = class_scores[1] + v0 = 1/(1 + s0) + v1 = 1/(1 + s1) + class_scores[0] = v0 + class_scores[1] = v1 + else: + for i in range(0,nb_classes*nb_estimators,nb_estimators): + class_scores.append(math.exp((scores[i:i+nb_estimators]).sum())) + #for i in range(nb_classes): + # class_scores.append(math.exp((scores[i::nb_classes]).sum())) + class_scores = np.asarray(class_scores) + prob.append(class_scores/class_scores.sum()) + return np.asarray(prob).reshape((-1, nb_classes)) + + +# +#============================================================================== +def get_xgboost_json(model): + """ REUSED FROM SHAP + This gets a JSON dump of an XGBoost model while ensuring the feature names are their indexes. 
+ """ + fnames = model.feature_names + model.feature_names = None + json_trees = model.get_dump(with_stats=True, dump_format="json") + model.feature_names = fnames + return json_trees diff --git a/pages/application/RandomForest/utils/xgbrf/validate.py b/pages/application/RandomForest/utils/xgbrf/validate.py new file mode 100644 index 0000000000000000000000000000000000000000..024a6800f454c7bae39104a0f17781c2755ce0ea --- /dev/null +++ b/pages/application/RandomForest/utils/xgbrf/validate.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## validate.py +## +## Created on: Jan 4, 2019 +## Author: Alexey Ignatiev +## E-mail: aignatiev@ciencias.ulisboa.pt +## + +# +#============================================================================== +from __future__ import print_function +import getopt +import numpy as np +import os +from pysat.formula import IDPool +from pysmt.shortcuts import Solver +from pysmt.shortcuts import And, BOOL, Implies, Not, Or, Symbol +from pysmt.shortcuts import Equals, GE, GT, LE, LT, Real, REAL +import resource +from six.moves import range +import sys + + +# +#============================================================================== +class SMTValidator(object): + """ + Validating Anchor's explanations using SMT solving. + """ + + def __init__(self, formula, feats, nof_classes, xgb): + """ + Constructor. + """ + + self.ftids = {f: i for i, f in enumerate(feats)} + self.nofcl = nof_classes + self.idmgr = IDPool() + self.optns = xgb.options + + # xgbooster will also be needed + self.xgb = xgb + + self.verbose = self.optns.verb + self.oracle = Solver(name=self.xgb.options.solver) + + self.inps = [] # input (feature value) variables + for f in self.xgb.extended_feature_names_as_array_strings: + if '_' not in f: + self.inps.append(Symbol(f, typename=REAL)) + else: + self.inps.append(Symbol(f, typename=BOOL)) + + self.outs = [] # output (class score) variables + for c in range(self.nofcl): + self.outs.append(Symbol('class{0}_score'.format(c), typename=REAL)) + + # theory + self.oracle.add_assertion(formula) + + # current selector + self.selv = None + + def prepare(self, sample, expl): + """ + Prepare the oracle for validating an explanation given a sample. 
+ """ + + if self.selv: + # disable the previous assumption if any + self.oracle.add_assertion(Not(self.selv)) + + # creating a fresh selector for a new sample + sname = ','.join([str(v).strip() for v in sample]) + + # the samples should not repeat; otherwise, they will be + # inconsistent with the previously introduced selectors + assert sname not in self.idmgr.obj2id, 'this sample has been considered before (sample {0})'.format(self.idmgr.id(sname)) + self.selv = Symbol('sample{0}_selv'.format(self.idmgr.id(sname)), typename=BOOL) + + self.rhypos = [] # relaxed hypotheses + + # transformed sample + self.sample = list(self.xgb.transform(sample)[0]) + + # preparing the selectors + for i, (inp, val) in enumerate(zip(self.inps, self.sample), 1): + feat = inp.symbol_name().split('_')[0] + selv = Symbol('selv_{0}'.format(feat)) + val = float(val) + + self.rhypos.append(selv) + + # adding relaxed hypotheses to the oracle + for inp, val, sel in zip(self.inps, self.sample, self.rhypos): + if '_' not in inp.symbol_name(): + hypo = Implies(self.selv, Implies(sel, Equals(inp, Real(float(val))))) + else: + hypo = Implies(self.selv, Implies(sel, inp if val else Not(inp))) + + self.oracle.add_assertion(hypo) + + # propagating the true observation + if self.oracle.solve([self.selv] + self.rhypos): + model = self.oracle.get_model() + else: + assert 0, 'Formula is unsatisfiable under given assumptions' + + # choosing the maximum + outvals = [float(model.get_py_value(o)) for o in self.outs] + maxoval = max(zip(outvals, range(len(outvals)))) + + # correct class id (corresponds to the maximum computed) + true_output = maxoval[1] + + # forcing a misclassification, i.e. a wrong observation + disj = [] + for i in range(len(self.outs)): + if i != true_output: + disj.append(GT(self.outs[i], self.outs[true_output])) + self.oracle.add_assertion(Implies(self.selv, Or(disj))) + + # removing all hypotheses except for those in the explanation + hypos = [] + for i, hypo in enumerate(self.rhypos): + j = self.ftids[self.xgb.transform_inverse_by_index(i)[0]] + if j in expl: + hypos.append(hypo) + self.rhypos = hypos + + if self.verbose: + inpvals = self.xgb.readable_sample(sample) + + preamble = [] + for f, v in zip(self.xgb.feature_names, inpvals): + if f not in v: + preamble.append('{0} = {1}'.format(f, v)) + else: + preamble.append(v) + + print(' explanation for: "IF {0} THEN {1}"'.format(' AND '.join(preamble), self.xgb.target_name[true_output])) + + def validate(self, sample, expl): + """ + Make an effort to show that the explanation is too optimistic. 
+ """ + + self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime + + # adapt the solver to deal with the current sample + self.prepare(sample, expl) + + # if satisfiable, then there is a counterexample + if self.oracle.solve([self.selv] + self.rhypos): + model = self.oracle.get_model() + inpvals = [float(model.get_py_value(i)) for i in self.inps] + outvals = [float(model.get_py_value(o)) for o in self.outs] + maxoval = max(zip(outvals, range(len(outvals)))) + + inpvals = self.xgb.transform_inverse(np.array(inpvals))[0] + self.coex = tuple([inpvals, maxoval[1]]) + inpvals = self.xgb.readable_sample(inpvals) + + if self.verbose: + preamble = [] + for f, v in zip(self.xgb.feature_names, inpvals): + if f not in v: + preamble.append('{0} = {1}'.format(f, v)) + else: + preamble.append(v) + + print(' explanation is incorrect') + print(' counterexample: "IF {0} THEN {1}"'.format(' AND '.join(preamble), self.xgb.target_name[maxoval[1]])) + else: + self.coex = None + + if self.verbose: + print(' explanation is correct') + + self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.time + + if self.verbose: + print(' time: {0:.2f}'.format(self.time)) + + return self.coex + diff --git a/pages/application/RandomForest/utils/xgbrf/xgb_rf.py b/pages/application/RandomForest/utils/xgbrf/xgb_rf.py new file mode 100644 index 0000000000000000000000000000000000000000..024225fe8cf140a3eaaeef5f8dfc653c85db8d0f --- /dev/null +++ b/pages/application/RandomForest/utils/xgbrf/xgb_rf.py @@ -0,0 +1,600 @@ +#!/us/bin/env python +#-*- coding:utf-8 -*- +## +## xgb_rf.py +## +## Created on: May 23, 2020 +## Author: Yacine Izza +## E-mail: yacine.izza@univ-toulouse.fr +## + +# +#============================================================================== +from .validate import SMTValidator +from .pi_checker import SMTChecker +from .encode import SMTEncoder +from .explain import SMTExplainer +import numpy as np +import os +import resource +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score +import sklearn +# print('The scikit-learn version is {}.'.format(sklearn.__version__)) + +from sklearn.preprocessing import OneHotEncoder +import sys +from six.moves import range +from .tree import TreeEnsemble +import xgboost as xgb +from xgboost import XGBRFClassifier, Booster, plot_tree +import matplotlib.pyplot as plt +import pickle + + +# +#============================================================================== +class XGBRandomForest(object): + """ + The main class to train/encode/explain Random Forest models. + """ + + def __init__(self, options, from_data=None, from_model=None, + from_encoding=None): + """ + Constructor. 
+ """ + + assert from_data or from_model or from_encoding, \ + 'At least one input file should be specified' + + self.init_stime = resource.getrusage(resource.RUSAGE_SELF).ru_utime + self.init_ctime = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + + # saving command-line options + self.options = options + self.seed = 42 + np.random.seed(self.seed) + + if from_data: + self.use_categorical = self.options.use_categorical + # saving data + self.data = from_data + dataset = np.asarray(self.data.samps, dtype=np.float32) + + + # split data into X and y + self.feature_names = self.data.names[:-1] + self.nb_features = len(self.feature_names) + + self.X = dataset[:, 0:self.nb_features] + self.Y = dataset[:, self.nb_features] + self.num_class = len(set(self.Y)) + self.target_name = list(range(self.num_class)) + + param_dist = {'n_estimators':self.options.n_estimators, + 'max_depth':self.options.maxdepth} + + self.params = { 'num_parallel_tree': self.options.n_estimators, + 'max_depth': self.options.maxdepth, + 'colsample_bynode': 0.8, 'subsample': 0.8, + 'learning_rate': 1, 'random_state': self.seed, + 'verbosity' : self.options.verb + } + + if(self.num_class == 2): + self.params['eval_metric'] = 'error' + self.params['objective'] = 'binary:logistic' + else: + self.params['eval_metric'] = 'merror' + self.params['num_class'] = self.num_class + self.params['objective'] = 'multi:softprob' + + if(self.num_class == 2): + param_dist['objective'] = 'binary:logistic' + + #self.model = XGBRFClassifier(**param_dist) + self.model = None + + # split data into train and test sets + self.test_size = self.options.testsplit + if (self.test_size > 0): + self.X_train, self.X_test, self.Y_train, self.Y_test = \ + train_test_split(self.X, self.Y, test_size=self.test_size, + random_state=self.seed) + else: + self.X_train = self.X + self.X_test = [] # need a fix + self.Y_train = self.Y + self.Y_test = []# need a fix + + # check if we have info about categorical features + if (self.use_categorical): + self.categorical_features = from_data.categorical_features + self.categorical_names = from_data.categorical_names + self.target_name = from_data.class_names + + #################################### + # this is a set of checks to make sure that we use the same as anchor encoding + cat_names = sorted(self.categorical_names.keys()) + assert(cat_names == self.categorical_features) + self.encoder = {} + for i in self.categorical_features: + self.encoder.update({i: OneHotEncoder(categories='auto', sparse=False)})#, + self.encoder[i].fit(self.X[:,[i]]) + + else: + self.categorical_features = [] + self.categorical_names = [] + self.encoder = [] + + fname = from_data + + elif from_model: + fname = from_model + self.load_datainfo(from_model) + if (self.use_categorical is False) and (self.options.use_categorical is True): + print("Error: Note that the model is trained without categorical features info. 
Please do not use -c option for predictions") + exit() + # load model + + elif from_encoding: + fname = from_encoding + + # encoding, feature names, and number of classes + # are read from an input file + enc = SMTEncoder(None, None, None, self, from_encoding) + self.enc, self.intvs, self.imaps, self.ivars, self.feature_names, \ + self.num_class = enc.access() + + # create extra file names + try: + os.stat(options.output) + except: + os.mkdir(options.output) + + self.mapping_features() + ################# + self.test_encoding_transformes() + + bench_name = os.path.splitext(os.path.basename(options.files[0]))[0] + bench_dir_name = options.output + "/" + bench_name + try: + os.stat(bench_dir_name) + except: + os.mkdir(bench_dir_name) + + self.basename = (os.path.join(bench_dir_name, bench_name + + "_nbestim_" + str(options.n_estimators) + + "_maxdepth_" + str(options.maxdepth) + + "_testsplit_" + str(options.testsplit))) + + data_suffix = '.splitdata.pkl' + self.modfile = self.basename + '.mod.pkl' + + self.mod_plainfile = self.basename + '.mod.txt' + + self.resfile = self.basename + '.res.txt' + self.encfile = self.basename + '.enc.txt' + self.expfile = self.basename + '.exp.txt' + + def form_datefile_name(self, modfile): + data_suffix = '.splitdata.pkl' + return modfile + data_suffix + + def pickle_save_file(self, filename, data): + try: + f = open(filename, "wb") + pickle.dump(data, f) + f.close() + except: + print("Cannot save to file", filename) + exit() + + def pickle_load_file(self, filename): + try: + f = open(filename, "rb") + data = pickle.load(f) + f.close() + return data + except: + print("Cannot load from file", filename) + exit() + + def save_datainfo(self, filename): + + print("saving model to ", filename) + self.pickle_save_file(filename, self.model) + + filename_data = self.form_datefile_name(filename) + print("saving data to ", filename_data) + samples = {} + samples["X"] = self.X + samples["Y"] = self.Y + samples["X_train"] = self.X_train + samples["Y_train"] = self.Y_train + samples["X_test"] = self.X_test + samples["Y_test"] = self.Y_test + samples["feature_names"] = self.feature_names + samples["target_name"] = self.target_name + samples["num_class"] = self.num_class + samples["categorical_features"] = self.categorical_features + samples["categorical_names"] = self.categorical_names + samples["encoder"] = self.encoder + samples["use_categorical"] = self.use_categorical + + + self.pickle_save_file(filename_data, samples) + + def load_datainfo(self, filename): + print("loading model from ", filename) + self.model = XGBRFClassifier() + self.model = self.pickle_load_file(filename) + + datafile = self.form_datefile_name(filename) + print("loading data from ", datafile) + loaded_data = self.pickle_load_file(datafile) + self.X = loaded_data["X"] + self.Y = loaded_data["Y"] + self.X_train = loaded_data["X_train"] + self.X_test = loaded_data["X_test"] + self.Y_train = loaded_data["Y_train"] + self.Y_test = loaded_data["Y_test"] + self.feature_names = loaded_data["feature_names"] + self.target_name = loaded_data["target_name"] + self.num_class = loaded_data["num_class"] + self.nb_features = len(self.feature_names) + self.categorical_features = loaded_data["categorical_features"] + self.categorical_names = loaded_data["categorical_names"] + self.encoder = loaded_data["encoder"] + self.use_categorical = loaded_data["use_categorical"] + + def train(self, outfile=None): + """ + Train a random forest using XGBoost. 
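+
+        Training delegates to build_xgbtree() below, which performs one
+        boosting round over a DMatrix so that the parallel trees form
+        the forest.  The core call, as a self-contained sketch with
+        hypothetical data:
+
+            import numpy as np
+            import xgboost as xgb
+
+            X = np.random.rand(100, 4)
+            y = np.random.randint(0, 2, size=100)
+            dtrain = xgb.DMatrix(X, label=y)
+            params = {'num_parallel_tree': 50, 'learning_rate': 1,
+                      'objective': 'binary:logistic'}
+            booster = xgb.train(params, dtrain, num_boost_round=1)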
+ """ + + return self.build_xgbtree(outfile) + + def encode(self, test_on=None): + """ + Encode a random forest trained previously. + """ + encoder = SMTEncoder(self.model, self.feature_names, self.num_class, self) + self.enc, self.intvs, self.imaps, self.ivars = encoder.encode() + + if test_on: + encoder.test_sample(np.array(test_on)) + + encoder.save_to(self.encfile) + + def explain(self, sample, use_lime=None, use_anchor=None, use_shap=None, + expl_ext=None, prefer_ext=False, nof_feats=5): + """ + Explain a prediction made for a given sample with a previously + trained tree ensemble. + """ + + if use_lime: + expl = use_lime(self, sample=sample, nb_samples=5, nb_features_in_exp=nof_feats) + elif use_anchor: + expl = use_anchor(self, sample=sample, nb_samples=5, + nb_features_in_exp=nof_feats, threshold=0.95) + elif use_shap: + expl = use_shap(self, sample=sample, nb_features_in_exp=nof_feats) + else: + if 'x' not in dir(self): + self.x = SMTExplainer(self.enc, self.intvs, self.imaps, + self.ivars, self.feature_names, self.num_class, + self.options, self) + + expl = self.x.explain(np.array(sample), self.options.smallest, + expl_ext, prefer_ext) + + # returning the explanation + return expl + + def validate(self, sample, expl): + """ + Make an attempt to show that a given explanation is optimistic. + """ + + # there must exist an encoding + if 'enc' not in dir(self): + encoder = SMTEncoder(self.model, self.feature_names, self.num_class, + self) + self.enc, _, _, _ = encoder.encode() + + if 'v' not in dir(self): + self.v = SMTValidator(self.enc, self.feature_names, self.num_class, + self) + + # try to compute a counterexample + return self.v.validate(np.array(sample), expl) + + + def isPrimeImplicant(self, sample, expl): + """ + Check the explnation if it is a prime implicant. + """ + + # there must exist an encoding + if 'enc' not in dir(self): + encoder = SMTEncoder(self.model, self.feature_names, self.num_class, + self) + self.enc, _, _, _ = encoder.encode() + + if 'checker' not in dir(self): + self.checker = SMTChecker(self.enc, self.feature_names, self.num_class, + self) + + # check the explanation + return self.checker.check(np.array(sample), expl) + + def repair(self, sample, expl): + """ + Make an attempt to repair that a given pessimistic (incorrect) explanation. + """ + #encode without sample + self.encode() + gexpl = self.explain(sample, expl_ext=expl, prefer_ext=True) + + #global explanation + return gexpl + + def refine(self, sample, expl): + """ + Make an attempt to refine that a given optimistic explanation. 
+ """ + #encode without sample + self.encode() + gexpl = self.explain(sample, expl_ext=expl) + + #global explanation + return gexpl + + def transform(self, x): + if(len(x) == 0): + return x + if (len(x.shape) == 1): + x = np.expand_dims(x, axis=0) + if (self.use_categorical): + assert(self.encoder != []) + tx = [] + for i in range(self.nb_features): + #self.encoder[i].drop = None + if (i in self.categorical_features): + self.encoder[i].drop = None + tx_aux = self.encoder[i].transform(x[:,[i]]) + tx_aux = np.vstack(tx_aux) + tx.append(tx_aux) + else: + tx.append(x[:,[i]]) + tx = np.hstack(tx) + return tx + else: + return x + + def transform_inverse(self, x): + if(len(x) == 0): + return x + if (len(x.shape) == 1): + x = np.expand_dims(x, axis=0) + if (self.use_categorical): + assert(self.encoder != []) + inverse_x = [] + for i, xi in enumerate(x): + inverse_xi = np.zeros(self.nb_features) + for f in range(self.nb_features): + if f in self.categorical_features: + nb_values = len(self.categorical_names[f]) + v = xi[:nb_values] + v = np.expand_dims(v, axis=0) + iv = self.encoder[f].inverse_transform(v) + inverse_xi[f] =iv + xi = xi[nb_values:] + + else: + inverse_xi[f] = xi[0] + xi = xi[1:] + inverse_x.append(inverse_xi) + return inverse_x + else: + return x + + def transform_inverse_by_index(self, idx): + if (idx in self.extended_feature_names): + return self.extended_feature_names[idx] + else: + print("Warning there is no feature {} in the internal mapping".format(idx)) + return None + + def transform_by_value(self, feat_value_pair): + if (feat_value_pair in self.extended_feature_names.values()): + keys = (list(self.extended_feature_names.keys())[list( self.extended_feature_names.values()).index(feat_value_pair)]) + return keys + else: + print("Warning there is no value {} in the internal mapping".format(feat_value_pair)) + return None + + def mapping_features(self): + self.extended_feature_names = {} + self.extended_feature_names_as_array_strings = [] + counter = 0 + if (self.use_categorical): + for i in range(self.nb_features): + if (i in self.categorical_features): + for j, _ in enumerate(self.encoder[i].categories_[0]): + self.extended_feature_names.update({counter: (self.feature_names[i], j)}) + self.extended_feature_names_as_array_strings.append("f{}_{}".format(i,j)) # str(self.feature_names[i]), j)) + counter = counter + 1 + else: + self.extended_feature_names.update({counter: (self.feature_names[i], None)}) + self.extended_feature_names_as_array_strings.append("f{}".format(i)) #(self.feature_names[i]) + counter = counter + 1 + else: + for i in range(self.nb_features): + self.extended_feature_names.update({counter: (self.feature_names[i], None)}) + self.extended_feature_names_as_array_strings.append("f{}".format(i))#(self.feature_names[i]) + counter = counter + 1 + + def readable_sample(self, x): + readable_x = [] + for i, v in enumerate(x): + if (i in self.categorical_features): + readable_x.append(self.categorical_names[i][int(v)]) + else: + #readable_x.append(v) + readable_x.append(str(v)) + return np.asarray(readable_x) + + def test_encoding_transformes(self): + # test encoding + + X = self.X_train[[0],:] + + print("Sample of length", len(X[0])," : ", X) + enc_X = self.transform(X) + print("Encoded sample of length", len(enc_X[0])," : ", enc_X) + inv_X = self.transform_inverse(enc_X) + print("Back to sample", inv_X) + print("Readable sample", self.readable_sample(inv_X[0])) + assert((inv_X == X).all()) + + if (self.options.verb > 1): + for i in 
range(len(self.extended_feature_names)):
+                print(i, self.transform_inverse_by_index(i))
+            for key, value in self.extended_feature_names.items():
+                print(value, self.transform_by_value(value))
+
+    def transformed_sample_info(self, i):
+        # print the categories of the encoder of (categorical) feature i;
+        # only meaningful when use_categorical is on
+        print(self.encoder[i].categories_)
+
+    def build_xgbtree(self, outfile=None):
+        """
+        Build an ensemble of trees (forest).
+        """
+
+        if (outfile is None):
+            outfile = self.modfile
+        else:
+            self.datafile = self.form_datefile_name(outfile)
+
+        # fit the model on training data
+
+        if (len(self.X_test) > 0):
+            eval_set = [(self.transform(self.X_train), self.Y_train), (self.transform(self.X_test), self.Y_test)]
+        else:
+            eval_set = [(self.transform(self.X_train), self.Y_train)]
+
+        print("start xgb")
+        '''
+        self.model.fit(self.transform(self.X_train), self.Y_train,
+                  eval_set=eval_set,
+                  verbose=self.options.verb)  # eval_set=[(X_test, Y_test)],
+        '''
+        dtrain = xgb.DMatrix(self.transform(self.X_train), label=self.Y_train)
+        dtest = xgb.DMatrix(self.transform(self.X_test), label=self.Y_test)
+        eval_set = [(dtrain, 'train'), (dtest, 'eval')]
+        evals_result = {}
+        self.model = xgb.train(self.params, dtrain, num_boost_round=1,
+                        evals=eval_set, evals_result=evals_result)
+        print("end xgb")
+        print(self.model.get_score())
+        print(len(self.model.get_score()))
+        #for i in range(5):
+        #    xgb.plot_tree(self.model, num_trees=i)
+        #    plt.show()
+
+        try:
+            train_accuracy = round(1 - evals_result['train']['merror'][-1], 2)
+        except KeyError:
+            try:
+                train_accuracy = round(1 - evals_result['train']['error'][-1], 2)
+            except KeyError:
+                assert(False)
+        try:
+            test_accuracy = round(1 - evals_result['eval']['merror'][-1], 2)
+        except KeyError:
+            try:
+                test_accuracy = round(1 - evals_result['eval']['error'][-1], 2)
+            except KeyError:
+                assert(False)
+        #print('Train accuracy_xgb: ', train_accuracy)
+        #print('Test accuracy_xgb: ', test_accuracy)
+
+        ########## saving model
+        self.save_datainfo(outfile)
+        print("saving plain model to ", self.mod_plainfile)
+        self.model.dump_model(self.mod_plainfile)
+
+        ensemble = TreeEnsemble(self.model, self.extended_feature_names_as_array_strings, nb_classes=self.num_class)
+        #ensemble.print_tree()
+
+        classone_probs = self.model.predict(xgb.DMatrix(self.transform(self.X_train[:10])))
+        if self.num_class == 2:
+            classzero_probs = 1.0 - classone_probs
+            y_pred_prob = np.vstack((classzero_probs, classone_probs)).transpose()
+        else:
+            y_pred_prob = classone_probs
+
+        y_pred_prob_compute = ensemble.predict(self.transform(self.X_train[:10]), self.num_class)
+
+        assert(np.absolute(y_pred_prob_compute - y_pred_prob).sum() < 0.01 * len(y_pred_prob))
+
+        #### saving
+
+        print("saving results to ", self.resfile)
+        with open(self.resfile, 'w') as f:
+            f.write("{} & {} & {} &{} &{} & {} \\\\ \n \hline \n".format(
+                            os.path.basename(self.options.files[0]).replace("_","-"),
train_accuracy, + test_accuracy, + self.options.n_estimators, + self.options.maxdepth, + self.options.testsplit)) + f.close() + + print("Train accuracy: %.2f%%" % (train_accuracy * 100.0)) + print("Test accuracy: %.2f%%" % (test_accuracy * 100.0)) + + + return train_accuracy, test_accuracy, self.model + + def predict(self, X): + classone_probs = self.model.predict(xgb.DMatrix(self.transform(X))) + if self.num_class == 2: + classzero_probs = 1.0 - classone_probs + y_pred_prob = np.vstack((classzero_probs, classone_probs)).transpose() + else: + y_pred_prob = classone_probs + return y_pred_prob + diff --git a/pages/application/RandomForest/utils/xpAnchor.py b/pages/application/RandomForest/utils/xpAnchor.py new file mode 100755 index 0000000000000000000000000000000000000000..86958e2bfd42e79e43276b13124c9ffa0f3deed8 --- /dev/null +++ b/pages/application/RandomForest/utils/xpAnchor.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +#-*- coding:utf-8 -*- +## +## lime_wrap.py (reuses parts of the code of SHAP) +## + + +# +#============================================================================== +from __future__ import print_function +#from data import Data +from pages.application.RandomForest.utils.xrf import Dataset +import os +import sys +import pickle + + +import json +import numpy as np +import math +from anchor import utils +from anchor import anchor_tabular +import resource + + +# +#============================================================================== +def anchor_call(model, data, sample, threshold=0.95, verbose=0): + + + classifier_fn = lambda x: model.forest.predict(data.transform(x)).astype(float) + X_train, _, _, _ = data.train_test_split() + + timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime + + explainer = anchor_tabular.AnchorTabularExplainer( + class_names=data.target_name, + feature_names=data.feature_names, + train_data=data.X) + #print(explainer.d_train) + + if (sample is not None): + try: + feat_sample = np.asarray(sample, dtype=np.float32) + except: + print("Cannot parse input sample:", sample) + exit() + if verbose: + print("\n\n\nStarting Anchor explainer... 
\nConsidering a sample with features:", feat_sample) + if not (len(feat_sample) == len(X_train[0])): + print("Unmatched features are not supported: The number of features in a sample {} is not equal to the number of features in this benchmark {}".format(len(feat_sample), len(X_train[0]))) + exit() + + # compute boost predictions + feat_sample_exp = np.expand_dims(feat_sample, axis=0) + feat_sample_exp = data.transform(feat_sample_exp) + ##y_pred = model.forest.predict(feat_sample_exp)[0] + y_pred_prob = model.forest.predict_proba(feat_sample_exp)[0] + y_pred = np.argmax(y_pred_prob) + + + exp = explainer.explain_instance(feat_sample, + classifier_fn, + threshold=threshold) + if verbose: + print('Anchor: %s' % (' AND '.join(exp.names()))) + print('Precision: %.2f' % exp.precision()) + print('Coverage: %.2f' % exp.coverage()) + + + timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer + if verbose: + print(' time: {0:.2f}'.format(timer)) + + expl = [] + return sorted(expl), timer + + +def pickle_load_file(filename): + try: + f = open(filename, "rb") + data = pickle.load(f) + f.close() + return data + except: + print("Cannot load from file", filename) + exit() + + +# +#============================================================================== +if __name__ == '__main__': + # parsing command-line options + options = Options(sys.argv) + + # making output unbuffered + if sys.version_info.major == 2: + sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) + + # showing head + #show_info() + print('Starting LIME explainer...') + + + if options.files: + cls = None + + print("loading data ...") + data = Dataset(filename=options.files[0], mapfile=options.mapfile, + separator=options.separator, use_categorical = False) + + + if options.explain: + mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \ + resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss + + if not cls: + print("loading model ...") + cls = pickle_load_file(options.files[1]) + cls.print_accuracy(data) # print test accuray + + samps_file = options.explain.strip() + print(samps_file) + with open(samps_file, 'r') as fp: + lines = fp.readlines() + + # timers + atimes = [] + tested = set() + + for i, s in enumerate(lines): + sample = [float(v.strip()) for v in s.split(',')] + + if tuple(sample) in tested: + continue + + #print("inst#{0}".format(i+1)) + + tested.add(tuple(sample)) + #print('sample {0}: {1}'.format(i, ','.join(s.strip().split(',')))) + + expl, time = anchor_call(cls, data, sample, verbose=options.verb) # call lime + + atimes.append(time) + + + #if i == 100: + # break + + mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \ + resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss - mem + + + # reporting the time spent + print('') + print('tot time: {0:.2f}'.format(sum(atimes))) + print('max time: {0:.2f}'.format(max(atimes))) + print('min time: {0:.2f}'.format(min(atimes))) + print('avg time: {0:.2f}'.format(sum(atimes) / len(atimes))) + print('') + print("c mem used: {0:.2f} Mb".format(mem/(1024*1024))) + \ No newline at end of file diff --git a/pages/application/RandomForest/utils/xpLime.py b/pages/application/RandomForest/utils/xpLime.py new file mode 100755 index 0000000000000000000000000000000000000000..9945bf7943e755633b688ff183efadea1d7a641a --- /dev/null +++ b/pages/application/RandomForest/utils/xpLime.py @@ -0,0 +1,227 @@ +#!/usr/bin/env python3 +#-*- coding:utf-8 -*- +## +## lime_wrap.py (reuses parts of the code of SHAP) +## + 
+ +# +#============================================================================== +from __future__ import print_function +#from data import Data +from pages.application.RandomForest.utils.xrf import Dataset +import os +import sys +import pickle + + +import json +import numpy as np +import math +import lime +import lime.lime_tabular +import resource + + +# +#============================================================================== +def lime_call(model, data, sample, nb_samples = 50, feats='all', + nb_features_in_exp=10, verbose=0): + + # we need a way to say that features are categorical ? + # we do not have this informations. + predict_fn_rf = lambda x: model.forest.predict_proba(data.transform(x)).astype(float) + X_train, _, _, _ = data.train_test_split() + + timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime + + explainer = lime.lime_tabular.LimeTabularExplainer( + X_train, + feature_names=data.feature_names, + categorical_features= None, + class_names=data.target_name, + discretize_continuous=True, + ) + + f2imap = {} + for i, f in enumerate(data.feature_names): + f2imap[f.strip()] = i + + if (sample is not None): + try: + feat_sample = np.asarray(sample, dtype=np.float32) + except: + print("Cannot parse input sample:", sample) + exit() + if verbose: + print("\n\n\nStarting LIME explainer... \nConsidering a sample with features:", feat_sample) + if not (len(feat_sample) == len(X_train[0])): + print("Unmatched features are not supported: The number of features in a sample {} is not equal to the number of features in this benchmark {}".format(len(feat_sample), len(X_train[0]))) + exit() + + # compute boost predictions + feat_sample_exp = np.expand_dims(feat_sample, axis=0) + feat_sample_exp = data.transform(feat_sample_exp) + #y_pred = model.forest.predict(feat_sample_exp)[0] + y_pred_prob = model.forest.predict_proba(feat_sample_exp)[0] + y_pred = np.argmax(y_pred_prob) + + + exp = explainer.explain_instance(feat_sample, + predict_fn_rf, + num_features = nb_features_in_exp, + top_labels = 1)#, + #labels = list(range(xgb.num_class))) + + expl = [] + + # choose which features in the explanation to focus on + if feats in ('p', 'pos', '+'): + feats = 1 + elif feats in ('n', 'neg', '-'): + feats = -1 + else: + feats = 0 + + for i in range(data.num_class): + if (i != y_pred): + continue + if verbose: + print("\t \t Explanations for the winner class", i, " ( confidence = ", y_pred_prob[i], ")") + print("\t \t Features in explanations: ", exp.as_list(label=i)) + + s_human_readable = "" + for k, v in enumerate(exp.as_list(label=i)): + if (feats == 1 and v[1] < 0) or (feats == -1 and v[1] >= 0): + continue + + if not (('<' in v[0]) or ('>' in v[0])): + a = v[0].split('=') + f = a[0].strip() + l = a[1].strip() + u = l + +# if (xgb.use_categorical): +# fid = f2imap[f] +# fvid = int(a[1]) +# s_human_readable = s_human_readable + "\t \t id = {}, name = {}, score = {}\n".format(fid, f, str(v[1])) + + + else: + a = v[0].split('<') + + if len(a) == 1: + a = v[0].split('>') + + if len(a) == 2: + f = a[0].strip() + + if '>' in v[0]: + l, u = float(a[1].strip(' =')), None + else: + l, u = None, float(a[1].strip(' =')) + else: + l = float(a[0].strip()) + f = a[1].strip(' =') + u = float(a[2].strip(' =')) + + # expl.append(tuple([f2imap[f], l, u, v[1] >= 0])) + expl.append(f2imap[f]) + +# if (xgb.use_categorical): +# if (len(s_human_readable) > 0): +# print("\t \t Features in explanations (with provided categorical labels): \n", 
s_human_readable)
+
+    timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
+            resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer
+    if verbose:
+        print('  time: {0:.2f}'.format(timer))
+
+
+    return sorted(expl), timer
+
+
+def pickle_load_file(filename):
+    try:
+        f = open(filename, "rb")
+        data = pickle.load(f)
+        f.close()
+        return data
+    except:
+        print("Cannot load from file", filename)
+        exit()
+
+
+#
+#==============================================================================
+if __name__ == '__main__':
+    # parsing command-line options
+    options = Options(sys.argv)
+
+    # making output unbuffered
+    if sys.version_info.major == 2:
+        sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
+
+    # showing head
+    #show_info()
+    print('Starting LIME explainer...')
+
+
+    if options.files:
+        cls = None
+
+        print("loading data ...")
+        data = Dataset(filename=options.files[0], mapfile=options.mapfile,
+                       separator=options.separator, use_categorical=False)
+
+
+        if options.explain:
+            mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \
+                  resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
+
+            if not cls:
+                print("loading model ...")
+                cls = pickle_load_file(options.files[1])
+            cls.print_accuracy(data)  # print test accuracy
+
+            samps_file = options.explain.strip()
+            print(samps_file)
+            with open(samps_file, 'r') as fp:
+                lines = fp.readlines()
+
+            # timers
+            atimes = []
+            tested = set()
+
+            for i, s in enumerate(lines):
+                sample = [float(v.strip()) for v in s.split(',')]
+
+                if tuple(sample) in tested:
+                    continue
+
+                #print("inst#{0}".format(i+1))
+
+                tested.add(tuple(sample))
+                #print('sample {0}: {1}'.format(i, ','.join(s.strip().split(','))))
+
+                expl, time = lime_call(cls, data, sample, verbose=options.verb)  # call lime
+
+                atimes.append(time)
+
+
+                #if i == 3:
+                #    break
+
+            mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \
+                  resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss - mem
+
+
+            # reporting the time spent
+            print('')
+            print('tot time: {0:.2f}'.format(sum(atimes)))
+            print('max time: {0:.2f}'.format(max(atimes)))
+            print('min time: {0:.2f}'.format(min(atimes)))
+            print('avg time: {0:.2f}'.format(sum(atimes) / len(atimes)))
+            print('')
+            print("c mem used: {0:.2f} Mb".format(mem/(1024*1024)))
\ No newline at end of file
diff --git a/pages/application/RandomForest/utils/xprf.py b/pages/application/RandomForest/utils/xprf.py
new file mode 100755
index 0000000000000000000000000000000000000000..acd3fd81c9201a52d6460d5d8397c7074e7f2e8d
--- /dev/null
+++ b/pages/application/RandomForest/utils/xprf.py
@@ -0,0 +1,274 @@
+#!/usr/bin/env python3
+#-*- coding:utf-8 -*-
+##
+## xprf.py
+##
+##  Created on: Oct 08, 2020
+##      Author: Yacine Izza
+##      E-mail: yacine.izza@univ-toulouse.fr
+##
+
+#
+#==============================================================================
+from __future__ import print_function
+from pages.application.RandomForest.utils.data import Data
+import os
+import sys
+import pickle
+import resource
+
+from pages.application.RandomForest.utils.xgbooster import preprocess_dataset
+
+from pages.application.RandomForest.utils.xrf import XRF, RF2001, Dataset, Checker
+import numpy as np
+
+##################
+from pages.application.RandomForest.utils.xpLime import lime_call
+import math
+import lime
+import lime.lime_tabular
+###
+from pages.application.RandomForest.utils.xpAnchor import anchor_call
+#from anchor import utils
+from anchor import anchor_tabular
+################
+
+#
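+# Illustrative round trip of the pickle helpers defined below (the file
+# name is hypothetical):
+#
+#   pickle_save_file('/tmp/rf.mod.pkl', cls)
+#   cls = pickle_load_file('/tmp/rf.mod.pkl')
+#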
+#============================================================================== +def show_info(): + """ + Print info message. + """ + print("c XRF: eXplaining Random Forest.") + print('c') + + +# +#============================================================================== +def pickle_save_file(filename, data): + try: + f = open(filename, "wb") + pickle.dump(data, f) + f.close() + except: + print("Cannot save to file", filename) + exit() + +def pickle_load_file(filename): + try: + f = open(filename, "rb") + data = pickle.load(f) + f.close() + return data + except: + print("Cannot load from file", filename) + exit() + + +# +#============================================================================== +if __name__ == '__main__': + # parsing command-line options + options = Options(sys.argv) + + # making output unbuffered + if sys.version_info.major == 2: + sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) + + # showing head + show_info() + + if (options.preprocess_categorical): + preprocess_dataset(options.files[0], options.preprocess_categorical_files, options.use_categorical) + exit() + + + if options.files: + cls = None + xrf = None + + print("loading data ...") + data = Dataset(filename=options.files[0], mapfile=options.mapfile, + separator=options.separator, use_categorical = options.use_categorical) + + if options.train: + ''' + data = Dataset(filename=options.files[0], mapfile=options.mapfile, + separator=options.separator, + use_categorical = options.use_categorical) + ''' + + cls = RF2001(options) + train_accuracy, test_accuracy = cls.train(data) + + if options.verb == 1: + print("----------------------") + print("Train accuracy: {0:.2f}".format(100. * train_accuracy)) + print("Test accuracy: {0:.2f}".format(100. * test_accuracy)) + print("----------------------") + + xrf = XRF(options, cls, data) + #xrf.test_tree_ensemble() + + bench_name = os.path.splitext(os.path.basename(options.files[0]))[0] + bench_dir_name = options.output + "/RF/" + bench_name + try: + os.stat(bench_dir_name) + except: + os.mkdir(bench_dir_name) + + basename = (os.path.join(bench_dir_name, bench_name + + "_nbestim_" + str(options.n_estimators) + + "_maxdepth_" + str(options.maxdepth))) + + modfile = basename + '.mod.pkl' + print("saving model to ", modfile) + pickle_save_file(modfile, cls) + + #data_suffix = '.splitdata.pkl' + #filename_data = basename + data_suffix + #print("saving data to ", filename_data) + #pickle_save_file(filename_data, data) + + + # read a sample from options.explain + #if options.explain: + # options.explain = [float(v.strip()) for v in options.explain.split(',')] + + ''' + if options.encode: + # encode it and save the encoding to another file + #xrf.encode(test_on=options.explain) + xrf.encode(options.explain) + ''' + if options.explain: + mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \ + resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss + + if not xrf: + print("loading model ...") + cls = pickle_load_file(options.files[1]) + xrf = XRF(options, cls, data) + + + #expl = xrf.explain(options.explain) + + #expl_checker = Checker(xrf.f, data.num_class, data.extended_feature_names_as_array_strings) + + cls.print_accuracy(data) # print test accuracy of the RF model + + samps_file = options.explain.strip() + print(samps_file) + with open(samps_file, 'r') as fp: + lines = fp.readlines() + + # timers + atimes = [] + lengths = [] + tested = set() + mSAT, mUNSAT = 0.0, 0.0 + stimes = [] + utimes = [] + nSatCalls = [] + nUnsCalls = [] + + ltimes = [] + ctimes = [] + 
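+            # bookkeeping: atimes/lengths record the SAT explainer's
+            # per-instance runtime and explanation size; stimes/utimes keep
+            # the slowest SAT/UNSAT oracle calls, nSatCalls/nUnsCalls their
+            # counts; ltimes/ctimes hold LIME/Anchor runtimes, and 'wins'
+            # counts how often the SAT explainer is faster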
wins = 0
+
+            for i, s in enumerate(lines):
+                sample = [float(v.strip()) for v in s.split(',')]
+
+                if tuple(sample) in tested:
+                    continue
+
+                #print("inst#{0}".format(i+1))
+
+                tested.add(tuple(sample))
+                #print('sample {0}: {1}'.format(i, ','.join(s.strip().split(','))))
+
+                xrf.encode(sample)
+                expl = xrf.explain(sample)
+                atimes.append(xrf.x.time)
+                lengths.append(len(expl))
+
+                nvars = xrf.enc.cnf.nv
+                nclauses = len(xrf.enc.cnf.clauses)
+
+                #mSAT = max(xrf.x.stimes+[mSAT])
+                #mUNSAT = max(xrf.x.utimes+[mUNSAT])
+                if len(xrf.x.stimes):
+                    stimes.append(max(xrf.x.stimes))
+                if len(xrf.x.utimes):
+                    utimes.append(max(xrf.x.utimes))
+                nSatCalls.append(xrf.x.nsat)
+                nUnsCalls.append(xrf.x.nunsat)
+
+                #inst = data.transform(np.array(sample))[0]
+                #expl_checker.check(np.array(inst), expl)
+                #####check_expl(np.array(inst), expl, xrf.enc.forest, xrf.enc.intvs)
+
+                del xrf.enc
+                del xrf.x
+
+                #####################LIME###########
+                '''
+                _, ltime = lime_call(cls, data, sample, verbose=options.verb) # call lime
+                ltimes.append(ltime)
+                #wins += 1
+                if atimes[-1] < ltime:
+                    wins += 1
+                '''
+
+                _, ctime = anchor_call(cls, data, sample, verbose=options.verb)  # call anchor
+                ctimes.append(ctime)
+                if atimes[-1] < ctime:
+                    wins += 1
+
+                #if i == 1:
+                #    break
+
+            mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \
+                  resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss - mem
+
+
+            # reporting the time spent
+            print('')
+            print('tot time: {0:.2f}'.format(sum(atimes)))
+            print('max time: {0:.2f}'.format(max(atimes)))
+            print('min time: {0:.2f}'.format(min(atimes)))
+            print('avg time: {0:.2f}'.format(sum(atimes) / len(atimes)))
+            print('')
+            ####
+            # average explanation length, as a percentage of the number of features
+            print('avg length: {0:.0f}'.format(round(sum(lengths) / len(lengths))*100/len(sample)))
+            #print('max SAT: {0:.2f}'.format(mSAT))
+            #print('max UNSAT: {0:.2f}'.format(mUNSAT))
+            print('max SAT: {0:.2f}'.format(max(stimes)))
+            print('max UNSAT: {0:.2f}'.format(max(utimes)))
+            print('avg #SAT: {0:.0f}'.format(sum(nSatCalls) / len(nSatCalls)))
+            print('avg #UNSAT: {0:.0f}'.format(sum(nUnsCalls) / len(nUnsCalls)))
+            print('')
+            # reporting nof vars and nof clauses
+            print('c nof vars: {0}'.format(nvars))
+            print('c nof clauses: {0}'.format(nclauses))
+            #
+            print('c nof instances: {0}'.format(len(tested)))
+            print("c mem used: {0:.2f} Mb".format(mem/(1024*1024)))
+
+
+            # LIME runtimes
+            '''
+            print('')
+            print('min time for Lime: {0:.2f}'.format(min(ltimes)))
+            print('avg time for Lime: {0:.2f}'.format(sum(ltimes) / len(ltimes)))
+            print('#wins {0} out of {1}'.format(wins, len(tested)))
+            '''
+
+            # Anchor runtimes
+            print('')
+            print('tot time for Anchor: {0:.2f}'.format(sum(ctimes)))
+            print('max time for Anchor: {0:.2f}'.format(max(ctimes)))
+            print('min time for Anchor: {0:.2f}'.format(min(ctimes)))
+            print('avg time for Anchor: {0:.2f}'.format(sum(ctimes) / len(ctimes)))
+            print('#wins {0} out of {1}'.format(wins, len(tested)))
\ No newline at end of file
diff --git a/pages/application/RandomForest/utils/xrf/__init__.py b/pages/application/RandomForest/utils/xrf/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ae37dec55e919f83a87d18f7c9271aa7baab3c2
--- /dev/null
+++ b/pages/application/RandomForest/utils/xrf/__init__.py
@@ -0,0 +1,5 @@
+#from .encode import *
+#from .tree import *
+from .rndmforest import *
+#from .checker import check_expl
+from .checker import Checker
\ No newline at end of file
diff --git a/pages/application/RandomForest/utils/xrf/archive/build/lib.macosx-10.9-x86_64-3.8/pysortn.cpython-38-darwin.so 
b/pages/application/RandomForest/utils/xrf/archive/build/lib.macosx-10.9-x86_64-3.8/pysortn.cpython-38-darwin.so new file mode 100755 index 0000000000000000000000000000000000000000..8037905889ba350ca148832a64d0dfd51f823953 Binary files /dev/null and b/pages/application/RandomForest/utils/xrf/archive/build/lib.macosx-10.9-x86_64-3.8/pysortn.cpython-38-darwin.so differ diff --git a/pages/application/RandomForest/utils/xrf/archive/build/lib.macosx-10.9-x86_64-3.8/pysortnetwrk.cpython-38-darwin.so b/pages/application/RandomForest/utils/xrf/archive/build/lib.macosx-10.9-x86_64-3.8/pysortnetwrk.cpython-38-darwin.so new file mode 100755 index 0000000000000000000000000000000000000000..7aa11a7e970e9b6d0a19149ddf63f889f952f4a7 Binary files /dev/null and b/pages/application/RandomForest/utils/xrf/archive/build/lib.macosx-10.9-x86_64-3.8/pysortnetwrk.cpython-38-darwin.so differ diff --git a/pages/application/RandomForest/utils/xrf/archive/build/temp.macosx-10.9-x86_64-3.8/pysortn.o b/pages/application/RandomForest/utils/xrf/archive/build/temp.macosx-10.9-x86_64-3.8/pysortn.o new file mode 100644 index 0000000000000000000000000000000000000000..ba7953dbc2b8e701d652333c93f2d2aa33024d72 Binary files /dev/null and b/pages/application/RandomForest/utils/xrf/archive/build/temp.macosx-10.9-x86_64-3.8/pysortn.o differ diff --git a/pages/application/RandomForest/utils/xrf/archive/build/temp.macosx-10.9-x86_64-3.8/pysortnetwrk.o b/pages/application/RandomForest/utils/xrf/archive/build/temp.macosx-10.9-x86_64-3.8/pysortnetwrk.o new file mode 100644 index 0000000000000000000000000000000000000000..972595fc8d85029e1a21ad144b452cd34d8306b4 Binary files /dev/null and b/pages/application/RandomForest/utils/xrf/archive/build/temp.macosx-10.9-x86_64-3.8/pysortnetwrk.o differ diff --git a/pages/application/RandomForest/utils/xrf/archive/encode.py b/pages/application/RandomForest/utils/xrf/archive/encode.py new file mode 100644 index 0000000000000000000000000000000000000000..3478ebc15ec768b853dfbca892b966d2177bac58 --- /dev/null +++ b/pages/application/RandomForest/utils/xrf/archive/encode.py @@ -0,0 +1,276 @@ + +from pysat.formula import CNF, IDPool +from pysat.solvers import Solver +from pysat.card import * +from itertools import combinations + +import collections +import six +from six.moves import range + +from .tree import Forest, predict_tree +from .sortnetwrk import HSorNetwrk + +# +#============================================================================== +class SATEncoder(object): + """ + Encoder of Random Forest classifier into SAT. + """ + + def __init__(self, forest, feats, nof_classes, extended_feature_names=None, from_file=None): + #self.model = model + self.forest = forest + self.feats = {f: i for i, f in enumerate(feats)} + self.num_class = nof_classes + self.vpool = IDPool() + #self.optns = xgb.options + self.extended_feature_names = extended_feature_names + + #encoding formula + self.cnf = None + + # for interval-based encoding + self.intvs, self.imaps, self.ivars = None, None, None + + #if from_file: + # self.load_from(from_file) + + def newVar(self, name): + assert(name) + + if name in self.vpool.obj2id: #var has been already created + return self.vpool.obj2id[name] + + var = self.vpool.id('{0}'.format(name)) + return var + + + def traverse(self, tree, k, clause): + """ + Traverse a tree and encode each node. 
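+
+        For illustration (a hypothetical depth-1 tree in position k): if
+        the root tests feature 'f0' with var = newVar('f0'), the left leaf
+        (taken when var is false) contributes the clause
+        [var, class<l>_tr<k>] and the right leaf contributes
+        [-var, class<r>_tr<k>], so any assignment following a root-to-leaf
+        path forces that leaf's class variable to be true.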
+        """
+
+        if tree.children:
+            var = self.newVar(tree.name)
+            #print("{0} => {1}".format(tree.name, var))
+            pos, neg = var, -var
+
+            self.traverse(tree.children[0], k, clause + [-neg]) # -var
+            self.traverse(tree.children[1], k, clause + [-pos]) # --var
+        else: # leaf node
+            cvar = self.newVar('class{0}_tr{1}'.format(tree.values,k))
+            print('c: ', clause + [cvar])
+            self.cnf.append(clause + [cvar])
+
+
+    '''
+    def encode_node(self, node):
+        """
+        Encode a node of a tree.
+        """
+
+        if '_' not in node.name:
+            # continuous features => expecting an upper bound
+            # feature and its upper bound (value)
+            f, v = node.name, node.threshold
+
+            existing = True if tuple([f, v]) in self.idmgr.obj2id else False
+            vid = self.idmgr.id(tuple([f, v]))
+            bv = Symbol('bvar{0}'.format(vid), typename=BOOL)
+
+            if not existing:
+                if self.intvs:
+                    d = self.imaps[f][v] + 1
+                    pos, neg = self.ivars[f][:d], self.ivars[f][d:]
+                    self.enc.append(Iff(bv, Or(pos)))
+                    self.enc.append(Iff(Not(bv), Or(neg)))
+                else:
+                    fvar, fval = Symbol(f, typename=REAL), Real(v)
+                    self.enc.append(Iff(bv, LT(fvar, fval)))
+
+            return bv, Not(bv)
+        else:
+            # all features are expected to be categorical and
+            # encoded with one-hot encoding into Booleans
+            # each node is expected to be of the form: f_i < 0.5
+            bv = Symbol(node.name, typename=BOOL)
+
+            # left branch is positive,  i.e. bv is true
+            # right branch is negative, i.e. bv is false
+            return Not(bv), bv
+    '''
+
+
+    def compute_intervals(self):
+        """
+        Traverse all trees in the ensemble and extract intervals for each
+        feature.
+
+        At this point, the method only works for numerical datasets!
+        """
+
+        def traverse_intervals(tree):
+            """
+            Auxiliary function. Recursive tree traversal.
+            """
+
+            if tree.children:
+                f = tree.name
+                v = tree.threshold
+                self.intvs[f].add(v)
+
+                traverse_intervals(tree.children[0])
+                traverse_intervals(tree.children[1])
+
+        # initializing the intervals
+        self.intvs = {'f{0}'.format(i): set([]) for i in range(len(self.feats))}
+
+        for tree in self.forest.trees:
+            traverse_intervals(tree)
+
+        # OK, we got all intervals; let's sort the values
+        self.intvs = {f: sorted(self.intvs[f]) + ['+'] for f in six.iterkeys(self.intvs)}
+
+        self.imaps, self.ivars = {}, {}
+        for feat, intvs in six.iteritems(self.intvs):
+            self.imaps[feat] = {}
+            self.ivars[feat] = []
+            for i, ub in enumerate(intvs):
+                self.imaps[feat][ub] = i
+
+                # NOTE: Symbol/BOOL come from pysmt, which is not imported in
+                # this module; this interval-based path is unused by the SAT
+                # encoding below
+                ivar = Symbol(name='{0}_intv{1}'.format(feat, i), typename=BOOL)
+                self.ivars[feat].append(ivar)
+
+    def encode(self, sample):
+        """
+        Encode the forest and the majority vote for the given sample.
+        """
+
+        self.cnf = CNF()
+        # getting a tree ensemble
+        #self.forest = Forest(self.model, self.extended_feature_names)
+        num_tree = len(self.forest.trees)
+
+        # introducing class score variables
+        cvars = [[] for t in range(num_tree)]
+        for k in range(len(self.forest.trees)):
+            for j in range(self.num_class):
+                var = self.newVar('class{0}_tr{1}'.format(j,k))
+                cvars[k].append(-var)
+
+        # if targeting interval-based encoding,
+        # traverse all trees and extract all possible intervals
+        # for each feature
+        '''
+        if self.optns.encode == 'smtbool':
+            self.compute_intervals()
+        '''
+
+        # traversing and encoding each tree
+        for k, tree in enumerate(self.forest.trees):
+            print("Encode tree#{0}".format(k))
+            # encoding the tree
+            self.traverse(tree, k, [])
+            # exactly one class var is true per tree; this could be squeezed
+            # further to reduce the number of binary clauses
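+            # cvars[k] holds the negated class variables of tree k, so the
+            # AtMost1 constraint below is over the positive literals
+            # [-v for v in cvars[k]]: each tree votes for at most one class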
+ enc = CardEnc.atmost(lits=[-v for v in cvars[k]], + vpool=self.vpool, + encoding=EncType.cardnetwrk) #AtMostOne constraint + self.cnf.extend(enc.clauses) + + + csum = [[] for c in range(self.num_class)] + for k, tree in enumerate(self.forest.trees): + c = predict_tree(tree, sample) + csum[c].append(k) + cvars[k][c] = - cvars[k][c] + + # encoding the majority + cmaj,_ = max(enumerate(csum), key=(lambda x: len(x[1]))) + sorted_lits = [[] for c in range(self.num_class)] + #sorting bits + for j in range(self.num_class): + tvars = [cvars[k][j] for k in range(num_tree)] + clauses, vout, _ = HSorNetwrk(lits=tvars, vpool = self.vpool) + self.cnf.extend(clauses) + sorted_lits[j] = vout + #print("tvars: {0} ==> {3} \nclauses: {1}\ntop: {2}".format(tvars, clauses, self.vpool.top, vout)) + #compare bits + for j in range(self.num_class): + if j == cmaj: + continue + for k in range(num_tree): + self.cnf.append([ -sorted_lits[j][k], sorted_lits[cmaj][k] ]) # (v1 => v2) + #print("-{0} => {1}".format(sorted_lits[j][k], sorted_lits[cmaj][k])) + + ''' + # enforce exactly one of the feature values to be chosen + # (for categorical features) + categories = collections.defaultdict(lambda: []) + for f in self.extended_feature_names: + if '_' in f: + categories[f.split('_')[0]].append(self.newVar(f)) + + for c, feats in six.iteritems(categories): + #ExactlyOne feat is True + self.cnf.append(feats) + enc = CardEnc.atmost(lits=feats, vpool=self.vpool, encoding=EncType.cardnetwrk) + self.cnf.extend(enc.clauses) + ''' + + #if self.optns.verb: + # number of variables + print('#vars:', self.cnf.nv) + # number of clauses + print('#clauses:', len(self.cnf.clauses)) + #print(self.cnf.clauses) + + return self.cnf, self.intvs, self.imaps, self.ivars + + ''' + def test_sample(self, sample): + """ + Check whether or not the encoding "predicts" the same class + as the classifier given an input sample. + """ + + # first, compute the scores for all classes as would be + # predicted by the classifier + + # score arrays computed for each class + csum = [[] for c in range(self.num_class)] + + #if self.optns.verb: + print('testing sample:', list(sample)) + + # traversing all trees + for i, tree in enumerate(self.forest.trees): + c = predict_tree(tree, sample) + csum[c].append(i) + + # encoding the majority + cmaj,_ = max(enumerate(csum), key=(lambda x: len(x[1]))) + + # second, get the scores computed with the use of the encoding + assert self.cnf, "There is no encoding." + + slv = Solver(name="minisat22") + slv.add_formula(self.cnf) + + + # asserting the sample + hypos = [] + + #for i, fval in enumerate(sample): + + ''' + + + def access(self): + """ + Get access to the encoding, features names, and the number of + classes. 
+ """ + + return self.cnf, self.intvs, self.imaps, self.ivars, self.feats, self.num_class \ No newline at end of file diff --git a/pages/application/RandomForest/utils/xrf/archive/pysortnetwrk.cc b/pages/application/RandomForest/utils/xrf/archive/pysortnetwrk.cc new file mode 100644 index 0000000000000000000000000000000000000000..8a44d9682946bf694983856d759abfaab3351f42 --- /dev/null +++ b/pages/application/RandomForest/utils/xrf/archive/pysortnetwrk.cc @@ -0,0 +1,248 @@ + + +#define PY_SSIZE_T_CLEAN + +#include <setjmp.h> +#include <signal.h> +#include <stdio.h> +#include <Python.h> + +#include "sortcard.hh" + +using namespace std; + +// docstrings +//============================================================================= +static char module_docstring[] = "This module provides an interface for " + "encoding a few types of cardinality " + "constraints"; +//static char atmost_docstring[] = "Create an AtMost(k) constraint."; +//static char atleast_docstring[] = "Create an AtLeast(k) constraint."; +static char sortn_docstring[] = "Sort an array of bits."; + +static PyObject *CardError; +static jmp_buf env; + +// function declaration for functions available in module +//============================================================================= +extern "C" { + //static PyObject *py_encode_atmost (PyObject *, PyObject *); + //static PyObject *py_encode_atleast (PyObject *, PyObject *); + static PyObject *py_sortn (PyObject *, PyObject *); +} + +// module specification +//============================================================================= +static PyMethodDef module_methods[] = { + //{ "encode_atmost", py_encode_atmost, METH_VARARGS, atmost_docstring }, + //{ "encode_atleast", py_encode_atleast, METH_VARARGS, atleast_docstring }, + { "HSort", py_sortn, METH_VARARGS, sortn_docstring }, + + { NULL, NULL, 0, NULL } +}; + +extern "C" { + +// signal handler for SIGINT +//============================================================================= +static void sigint_handler(int signum) +{ + longjmp(env, -1); +} + +//#if PY_MAJOR_VERSION >= 3 // for Python3 +// PyInt_asLong() +//============================================================================= +static int pyint_to_cint(PyObject *i_obj) +{ + return PyLong_AsLong(i_obj); +} + +// PyInt_fromLong() +//============================================================================= +static PyObject *pyint_from_cint(int i) +{ + return PyLong_FromLong(i); +} + +// PyCapsule_New() +//============================================================================= +static PyObject *void_to_pyobj(void *ptr) +{ + return PyCapsule_New(ptr, NULL, NULL); +} + +// PyCapsule_GetPointer() +//============================================================================= +static void *pyobj_to_void(PyObject *obj) +{ + return PyCapsule_GetPointer(obj, NULL); +} + +// PyInt_Check() +//============================================================================= +static int pyint_check(PyObject *i_obj) +{ + return PyLong_Check(i_obj); +} + +// module initialization +//============================================================================= +static struct PyModuleDef module_def = { + PyModuleDef_HEAD_INIT, + "pysortnetwrk", /* m_name */ + module_docstring, /* m_doc */ + -1, /* m_size */ + module_methods, /* m_methods */ + NULL, /* m_reload */ + NULL, /* m_traverse */ + NULL, /* m_clear */ + NULL, /* m_free */ +}; + +/* +PyMODINIT_FUNC PyInit_pycard(void) +{ + PyObject *m = PyModule_Create(&module_def); + + if (m == NULL) + return NULL; + + 
CardError = PyErr_NewException((char *)"pycard.error", NULL, NULL); + Py_INCREF(CardError); + + if (PyModule_AddObject(m, "error", CardError) < 0) { + Py_DECREF(CardError); + return NULL; + } + + return m; +}*/ + +PyMODINIT_FUNC PyInit_pysortnetwrk(void) +{ + PyObject *m = PyModule_Create(&module_def); + + if (m == NULL) + return NULL; + + CardError = PyErr_NewException((char *)"pycard.error", NULL, NULL); + Py_INCREF(CardError); + + if (PyModule_AddObject(m, "error", CardError) < 0) { + Py_DECREF(CardError); + return NULL; + } + + return m; +} + + +// auxiliary function for translating an iterable to a vector<int> +//============================================================================= +static bool pyiter_to_vector(PyObject *obj, vector<int>& vect) +{ + PyObject *i_obj = PyObject_GetIter(obj); + + if (i_obj == NULL) { + PyErr_SetString(PyExc_RuntimeError, + "Object does not seem to be an iterable."); + return false; + } + + PyObject *l_obj; + while ((l_obj = PyIter_Next(i_obj)) != NULL) { + if (!pyint_check(l_obj)) { + Py_DECREF(l_obj); + Py_DECREF(i_obj); + PyErr_SetString(PyExc_TypeError, "integer expected"); + return false; + } + + int l = pyint_to_cint(l_obj); + Py_DECREF(l_obj); + + if (l == 0) { + Py_DECREF(i_obj); + PyErr_SetString(PyExc_ValueError, "non-zero integer expected"); + return false; + } + + vect.push_back(l); + } + + Py_DECREF(i_obj); + return true; +} + +// +//============================================================================= +static PyObject *py_sortn(PyObject *self, PyObject *args) +{ + + PyObject *av_obj; + //PyObject *cv_obj; + int top; + int zvar; + + //PyObject *lhs_obj; + //int rhs; + //int top; + //int enc; + int main_thread; + + if (!PyArg_ParseTuple(args, "Oiii", &av_obj, &top, &zvar, + &main_thread)) + return NULL; + + vector<int> av; + if (pyiter_to_vector(av_obj, av) == false) + return NULL; + + PyOS_sighandler_t sig_save; + if (main_thread) { + sig_save = PyOS_setsig(SIGINT, sigint_handler); + + if (setjmp(env) != 0) { + PyErr_SetString(CardError, "Caught keyboard interrupt"); + return NULL; + } + } + + // calling encoder + ClauseSet dest; + vector<int> cv; + sortn_half_sorter_recur(top, dest, av, cv, zvar); + //_encode_atmost(dest, lhs, rhs, top, enc); + + if (main_thread) + PyOS_setsig(SIGINT, sig_save); + + // creating the resulting clause set + PyObject *dest_obj = PyList_New(dest.size()); + for (size_t i = 0; i < dest.size(); ++i) { + PyObject *cl_obj = PyList_New(dest[i].size()); + + for (size_t j = 0; j < dest[i].size(); ++j) { + PyObject *lit_obj = pyint_from_cint(dest[i][j]); + PyList_SetItem(cl_obj, j, lit_obj); + } + + PyList_SetItem(dest_obj, i, cl_obj); + } + + PyObject *cv_obj = PyList_New(cv.size()); + for (size_t i = 0; i < cv.size(); ++i) { + PyObject *lit_obj = pyint_from_cint(cv[i]); + PyList_SetItem(cv_obj, i, lit_obj); + } + + PyObject *ret = Py_BuildValue("OOn", dest_obj, cv_obj, (Py_ssize_t)top); + Py_DECREF(dest_obj); + Py_DECREF(cv_obj); + + return ret; +} + + +} // extern "C" diff --git a/pages/application/RandomForest/utils/xrf/archive/pysortnetwrk.so b/pages/application/RandomForest/utils/xrf/archive/pysortnetwrk.so new file mode 100755 index 0000000000000000000000000000000000000000..7aa11a7e970e9b6d0a19149ddf63f889f952f4a7 Binary files /dev/null and b/pages/application/RandomForest/utils/xrf/archive/pysortnetwrk.so differ diff --git a/pages/application/RandomForest/utils/xrf/archive/rfc.py b/pages/application/RandomForest/utils/xrf/archive/rfc.py new file mode 100644 index 
0000000000000000000000000000000000000000..411ad46c1e635664cd64695394bc2093710c2368 --- /dev/null +++ b/pages/application/RandomForest/utils/xrf/archive/rfc.py @@ -0,0 +1,636 @@ +from sklearn.ensemble._voting import VotingClassifier +from sklearn.ensemble import RandomForestClassifier +from sklearn.preprocessing import OneHotEncoder, LabelEncoder +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score +import numpy as np +import sys +import os +import resource + +import collections +from six.moves import range +import six + +from pages.application.RandomForest.utils.data import Data +from .tree import Forest, predict_tree +# from .encode import SATEncoder +from .sortnetwrk import HSorNetwrk +from pysat.formula import CNF, IDPool +from pysat.solvers import Solver +from pysat.card import CardEnc, EncType +from itertools import combinations + + +# +# ============================================================================== +class Dataset(Data): + """ + Class for representing dataset (transactions). + """ + + def __init__(self, filename=None, fpointer=None, mapfile=None, + separator=' ', use_categorical=False): + super().__init__(filename, fpointer, mapfile, separator, use_categorical) + + # split data into X and y + self.feature_names = self.names[:-1] + self.nb_features = len(self.feature_names) + self.use_categorical = use_categorical + samples = np.asarray(self.samps, dtype=np.float32) + self.X = samples[:, 0: self.nb_features] + self.y = samples[:, self.nb_features] + self.num_class = len(set(self.y)) + self.target_name = list(range(self.num_class)) + + print("c nof_features: {0}".format(self.nb_features)) + print("c nof_samples: {0}".format(len(self.samps))) + + # check if we have info about categorical features + if (self.use_categorical): + self.binarizer = {} + for i in self.categorical_features: + self.binarizer.update({i: OneHotEncoder(categories='auto', sparse=False)}) # , + self.binarizer[i].fit(self.X[:, [i]]) + else: + self.binarize = [] + # feat map + self.mapping_features() + + def train_test_split(self, test_size=0.2, seed=0): + return train_test_split(self.X, self.y, test_size=test_size, random_state=seed) + + def transform(self, x): + if (len(x) == 0): + return x + if (len(x.shape) == 1): + x = np.expand_dims(x, axis=0) + if (self.use_categorical): + assert (self.binarizer != []) + tx = [] + for i in range(self.nb_features): + self.binarizer[i].drop = None + if (i in self.categorical_features): + tx_aux = self.binarizer[i].transform(x[:, [i]]) + tx_aux = np.vstack(tx_aux) + tx.append(tx_aux) + else: + tx.append(x[:, [i]]) + tx = np.hstack(tx) + return tx + else: + return x + + def transform_inverse(self, x): + if (len(x) == 0): + return x + if (len(x.shape) == 1): + x = np.expand_dims(x, axis=0) + if (self.use_categorical): + assert (self.binarizer != []) + inverse_x = [] + for i, xi in enumerate(x): + inverse_xi = np.zeros(self.nb_features) + for f in range(self.nb_features): + if f in self.categorical_features: + nb_values = len(self.categorical_names[f]) + v = xi[:nb_values] + v = np.expand_dims(v, axis=0) + iv = self.binarizer[f].inverse_transform(v) + inverse_xi[f] = iv + xi = xi[nb_values:] + + else: + inverse_xi[f] = xi[0] + xi = xi[1:] + inverse_x.append(inverse_xi) + return inverse_x + else: + return x + + def transform_inverse_by_index(self, idx): + if (idx in self.extended_feature_names): + return self.extended_feature_names[idx] + else: + print("Warning there is no feature {} in the internal mapping".format(idx)) + 
return None + + def transform_by_value(self, feat_value_pair): + if (feat_value_pair in self.extended_feature_names.values()): + keys = ( + list(self.extended_feature_names.keys())[list(self.extended_feature_names.values()).index(feat_value_pair)]) + return keys + else: + print("Warning there is no value {} in the internal mapping".format(feat_value_pair)) + return None + + def mapping_features(self): + self.extended_feature_names = {} + self.extended_feature_names_as_array_strings = [] + counter = 0 + if (self.use_categorical): + for i in range(self.nb_features): + if (i in self.categorical_features): + for j, _ in enumerate(self.binarizer[i].categories_[0]): + self.extended_feature_names.update({counter: (self.feature_names[i], j)}) + self.extended_feature_names_as_array_strings.append( + "f{}_{}".format(i, j)) # str(self.feature_names[i]), j)) + counter = counter + 1 + else: + self.extended_feature_names.update({counter: (self.feature_names[i], None)}) + self.extended_feature_names_as_array_strings.append("f{}".format(i)) # (self.feature_names[i]) + counter = counter + 1 + else: + for i in range(self.nb_features): + self.extended_feature_names.update({counter: (self.feature_names[i], None)}) + self.extended_feature_names_as_array_strings.append("f{}".format(i)) # (self.feature_names[i]) + counter = counter + 1 + + def readable_sample(self, x): + readable_x = [] + for i, v in enumerate(x): + if (i in self.categorical_features): + readable_x.append(self.categorical_names[i][int(v)]) + else: + readable_x.append(v) + return np.asarray(readable_x) + + def test_encoding_transformes(self, X_train): + # test encoding + + X = X_train[[0], :] + + print("Sample of length", len(X[0]), " : ", X) + enc_X = self.transform(X) + print("Encoded sample of length", len(enc_X[0]), " : ", enc_X) + inv_X = self.transform_inverse(enc_X) + print("Back to sample", inv_X) + print("Readable sample", self.readable_sample(inv_X[0])) + assert ((inv_X == X).all()) + + ''' + for i in range(len(self.extended_feature_names)): + print(i, self.transform_inverse_by_index(i)) + for key, value in self.extended_feature_names.items(): + print(value, self.transform_by_value(value)) + ''' + + +# +# ============================================================================== +class VotingRF(VotingClassifier): + """ + Majority rule classifier + """ + + def fit(self, X, y, sample_weight=None): + self.estimators_ = [] + for _, est in self.estimators: + self.estimators_.append(est) + + self.le_ = LabelEncoder().fit(y) + self.classes_ = self.le_.classes_ + + def predict(self, X): + """Predict class labels for X. + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. + Returns + ------- + maj : array-like of shape (n_samples,) + Predicted class labels. + """ + # check_is_fitted(self) + + # 'hard' voting + predictions = self._predict(X) + predictions = np.asarray(predictions, np.int64) # NEED TO BE CHECKED + maj = np.apply_along_axis( + lambda x: np.argmax( + np.bincount(x, weights=self._weights_not_none)), + axis=1, arr=predictions) + + maj = self.le_.inverse_transform(maj) + + return maj + + +# +# ============================================================================== +class RF2001(object): + """ + The main class to train Random Forest Classifier (RFC). + """ + + def __init__(self, options, from_data=None, from_model=None): + """ + Constructor. 
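+
+        options.n_estimators and options.maxdepth are forwarded to
+        sklearn's RandomForestClassifier as n_estimators and max_depth
+        (all other hyperparameters keep their sklearn defaults).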
+ """ + self.forest = None + self.voting = None + self.opt = options + + param_dist = {'n_estimators': options.n_estimators, + 'max_depth': options.maxdepth} + + self.forest = RandomForestClassifier(**param_dist) + + def train(self, dataset, outfile=None): + """ + Train a random forest. + """ + + X_train, X_test, y_train, y_test = dataset.train_test_split() + + dataset.test_encoding_transformes(X_train) + X_train = dataset.transform(X_train) + X_test = dataset.transform(X_test) + + print("Build a random forest.") + self.forest.fit(X_train, y_train) + + rtrees = [('dt', dt) for i, dt in enumerate(self.forest.estimators_)] + self.voting = VotingRF(estimators=rtrees) + self.voting.fit(X_train, y_train) + + train_acc = accuracy_score(self.voting.predict(X_train), y_train) + ''' + print(X_test[[0],:]) + print("RF: ",np.asarray(self.voting.predict(X_test[[0],:]))) + for i,t in enumerate(self.forest.estimators_): + print("DT_{0}: {1}".format(i,np.asarray(t.predict(X_test[[0],:])))) + ''' + test_acc = accuracy_score(self.voting.predict(X_test), y_test) + print("----------------------") + print("RF2001:") + print("Train accuracy RF2001: {0:.2f}".format(100. * train_acc)) + print("Test accuracy RF2001: {0:.2f}".format(100. * test_acc)) + print("----------------------") + + train_acc = accuracy_score(self.forest.predict(X_train), y_train) + test_acc = accuracy_score(self.forest.predict(X_test), y_test) + print("RF-scikit:") + print("Train accuracy RF-scikit: {0:.2f}".format(100. * train_acc)) + print("Test accuracy RF-scikit: {0:.2f}".format(100. * test_acc)) + print("----------------------") + + return train_acc, test_acc + + def predict(self, X): + return self.voting.predict(X) + + def estimators(self): + assert (self.forest.estimators_ is not None) + return self.forest.estimators_ + + def n_estimators(self): + return self.forest.n_estimators + + +# +# ============================================================================== +class XRF(object): + """ + class to encode and explain Random Forest classifiers. + """ + + def __init__(self, options, model): + self.cls = model + self.f = Forest(model) + # self.f.print_tree() + self.verbose = options.verb + + def encode(self, data): + """ + Encode a tree ensemble trained previously. + """ + ########## + self.f = Forest(self.cls, data.extended_feature_names_as_array_strings) + self.f.print_tree() + ####### + self.sat_enc = SATEncoder(self.f, data.feature_names, data.num_class, + extended_feature_names=data.extended_feature_names_as_array_strings) + + _, X_test, _, y_test = data.train_test_split() + + inst = X_test[[1], :] + inst = data.transform(inst)[0] + self.sat_enc.encode(inst) + self.explain(inst, data) + + def explain(self, sample, data): + """ + Explain a prediction made for a given sample with a previously + trained RF. 
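+
+        For illustration (hypothetical values): with feature_names
+        ['age', 'sex'] and readable inputs ['42', 'male'], the preamble
+        built below becomes ['age = 42', 'sex = male'] and the result is
+        reported as 'IF age = 42 AND sex = male THEN <class>'; a readable
+        value that already contains the feature name is kept as is.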
+        """
+
+        preamble = None
+        if self.verbose:
+            inpvals = data.readable_sample(sample)
+
+            preamble = []
+            for f, v in zip(data.feature_names, inpvals):
+                if f not in v:
+                    preamble.append('{0} = {1}'.format(f, v))
+                else:
+                    preamble.append(v)
+
+        inps = data.extended_feature_names_as_array_strings  # input (feature value) variables
+        # print("inps: {0}".format(inps))
+
+        if 'x' not in dir(self):
+            self.x = SATExplainer(self.sat_enc, inps, preamble, data.target_name)
+
+        expl = self.x.explain(np.array(sample))
+
+        # returning the explanation
+        return expl
+
+    def test_tree_ensemble(self, dataset):
+        _, X_test, _, y_test = dataset.train_test_split()
+        X_test = dataset.transform(X_test)
+
+        y_pred_forest = self.f.predict(X_test)
+        acc = accuracy_score(y_pred_forest, y_test)
+        print("Test accuracy: {0:.2f}".format(100. * acc))
+
+        y_pred_cls = self.cls.predict(X_test)
+        # print(np.asarray(y_pred_cls, np.int64))
+        # print(y_pred_forest)
+
+        assert ((y_pred_cls == y_pred_forest).all())
+
+
+#
+# ==============================================================================
+class SATEncoder(object):
+    """
+    Encoder of Random Forest classifier into SAT.
+    """
+
+    def __init__(self, forest, feats, nof_classes, extended_feature_names=None, from_file=None):
+        # self.model = model
+        self.forest = forest
+        self.feats = {f: i for i, f in enumerate(feats)}
+        self.num_class = nof_classes
+        self.vpool = IDPool()
+        # self.optns = xgb.options
+        self.extended_feature_names = extended_feature_names
+
+        # encoding formula
+        self.cnf = None
+
+        # for interval-based encoding
+        self.intvs, self.imaps, self.ivars = None, None, None
+
+        # if from_file:
+        #    self.load_from(from_file)
+
+    def newVar(self, name):
+        assert (name)
+
+        if name in self.vpool.obj2id:  # var has already been created
+            return self.vpool.obj2id[name]
+
+        var = self.vpool.id('{0}'.format(name))
+        return var
+
+    def traverse(self, tree, k, clause):
+        """
+        Traverse a tree and encode each node.
+        """
+
+        if tree.children:
+            var = self.newVar(tree.name)
+            # print("{0} => {1}".format(tree.name, var))
+            pos, neg = var, -var
+
+            self.traverse(tree.children[0], k, clause + [-neg])  # -var
+            self.traverse(tree.children[1], k, clause + [-pos])  # --var
+        else:  # leaf node
+            cvar = self.newVar('class{0}_tr{1}'.format(tree.values, k))
+            # print('c: ', clause + [cvar])
+            self.cnf.append(clause + [cvar])
+
+    def encode(self, sample):
+        """
+        Encode the forest and the majority vote for the given sample.
+        """
+
+        self.cnf = CNF()
+        # getting a tree ensemble
+        # self.forest = Forest(self.model, self.extended_feature_names)
+        num_tree = len(self.forest.trees)
+
+        # introducing class variables (one indicator per class)
+        cvars = [self.newVar('class{0}'.format(i)) for i in range(self.num_class)]
+
+        # introducing class-tree variables
+        ctvars = [[] for t in range(num_tree)]
+        for k in range(num_tree):
+            for j in range(self.num_class):
+                var = self.newVar('class{0}_tr{1}'.format(j, k))
+                ctvars[k].append(var)
+
+        # if targeting interval-based encoding,
+        # traverse all trees and extract all possible intervals
+        # for each feature
+        '''
+        if self.optns.encode == 'smtbool':
+            self.compute_intervals()
+        '''
+
+        # traversing and encoding each tree
+        for k, tree in enumerate(self.forest.trees):
+            # print("Encode tree#{0}".format(k))
+            # encoding the tree
+            self.traverse(tree, k, [])
+            # exactly one class var is true per tree; this could be squeezed
+            # further to reduce the number of binary clauses
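+            # ctvars[k] lists the positive class{j}_tr{k} variables of tree
+            # k, so the AtMost1 constraint below lets each tree vote for at
+            # most one class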
+            enc = CardEnc.atmost(lits=ctvars[k],
+                                 vpool=self.vpool,
+                                 encoding=EncType.cardnetwrk)  # AtMostOne constraint
+            self.cnf.extend(enc.clauses)
+
+        csum = [[] for c in range(self.num_class)]
+        for k, tree in enumerate(self.forest.trees):
+            c = predict_tree(tree, sample)
+            csum[c].append(k)
+
+        # encoding the majority
+        self.cmaj, _ = max(enumerate(csum), key=(lambda x: len(x[1])))
+        sorted_lits = [[] for c in range(self.num_class)]
+        # sorting bits
+        for j in range(self.num_class):
+            lits = [ctvars[k][j] for k in range(num_tree)]
+            clauses, vout, _ = HSorNetwrk(lits=lits, vpool=self.vpool)
+            self.cnf.extend(clauses)
+            sorted_lits[j] = vout
+            print("{0}:{2} => {1}".format(j, vout, lits))
+        # compare bits: no class j != cmaj may outvote cmaj
+        for j in range(self.num_class):
+            if j == self.cmaj:
+                continue
+            for k in range(num_tree):
+                self.cnf.append([-sorted_lits[j][k], sorted_lits[self.cmaj][k]])  # (v1 => v2)
+                # print("-{0} => {1}".format(sorted_lits[j][k], sorted_lits[self.cmaj][k]))
+
+        '''
+        # enforce exactly one of the feature values to be chosen
+        # (for categorical features)
+        categories = collections.defaultdict(lambda: [])
+        for f in self.extended_feature_names:
+            if '_' in f:
+                categories[f.split('_')[0]].append(self.newVar(f))
+
+        for c, feats in six.iteritems(categories):
+            #ExactlyOne feat is True
+            self.cnf.append(feats)
+            enc = CardEnc.atmost(lits=feats, vpool=self.vpool, encoding=EncType.cardnetwrk)
+            self.cnf.extend(enc.clauses)
+        '''
+        for cl in self.cnf:
+            print("{0} == {1}".format(cl,
+                                      [self.vpool.obj(abs(p)) if p > 0 else "!" + str(self.vpool.obj(abs(p))) for p in
+                                       cl]))
+
+        # if self.optns.verb:
+        # number of variables
+        print('#vars:', self.cnf.nv)
+        # number of clauses
+        print('#clauses:', len(self.cnf.clauses))
+        # print(self.cnf.clauses)
+
+        return self.cnf, self.intvs, self.imaps, self.ivars
+
+
+#
+# ==============================================================================
+class SATExplainer(object):
+    """
+    A SAT-based minimal explanation extractor for Random Forest models.
+    """
+
+    def __init__(self, sat_enc, inps, preamble, target_name):
+        """
+        Constructor.
+        """
+
+        self.enc = sat_enc
+        # self.optns = options
+        self.inps = inps  # input (feature value) variables
+        self.target_name = target_name
+        self.preamble = preamble
+
+        self.verbose = True  # self.optns.verb
+        # self.slv = Solver(name=options.solver)
+        self.slv = Solver(name="minisat22")
+
+        # CNF formula
+        self.slv.append_formula(self.enc.cnf)
+
+        # current selector
+        # self.selv = None
+
+    def explain(self, sample, smallest=False):
+        """
+        Hypotheses minimization.
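+
+        In outline: one selector variable is assumed per feature, clauses
+        are added that force every tree to vote for a class other than
+        cmaj (so a satisfying assignment would be a counterexample), and
+        if the query is unsatisfiable the fixed features are taken to
+        imply the prediction; compute_minimal() then tries to drop
+        selectors one by one.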
+        """
+        if self.verbose:
+            print(
+                '  explaining:  "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.target_name[self.enc.cmaj]))
+
+        self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
+                    resource.getrusage(resource.RUSAGE_SELF).ru_utime
+
+        # adapt the solver to deal with the current sample
+        self.assums = []  # var selectors to be used as assumptions
+        self.sel2fid = {}  # selectors to original feature ids
+        self.sel2vid = {}  # selectors to categorical feature ids
+
+        # preparing the selectors
+        for i, (inp, val) in enumerate(zip(self.inps, sample), 1):
+            feat = inp.split('_')[0]
+            selv = self.enc.newVar('selv_{0}'.format(feat))
+
+            self.assums.append(selv)
+            if selv not in self.sel2fid:
+                self.sel2fid[selv] = int(feat[1:])
+                self.sel2vid[selv] = [i - 1]
+            else:
+                self.sel2vid[selv].append(i - 1)
+
+        if not self.enc.intvs:
+            for inp, val, sel in zip(self.inps, sample, self.assums):
+                p = self.enc.newVar(inp)
+                hypo = [-sel, p if val else -p]
+                print("{0} => {1}".format(self.enc.vpool.obj(sel), inp if val else "!" + inp))
+                self.slv.add_clause(hypo)
+        else:
+            raise NotImplementedError('Intervals are not supported.')
+
+        self.assums = sorted(set(self.assums))
+        # print("selectors: ", self.assums)
+
+        self.slv.solve(assumptions=self.assums)
+        print("Model1:")
+        for p in self.slv.get_model():
+            # if self.enc.vpool.obj(abs(p)) and self.enc.vpool.obj(abs(p)) in self.inps:
+            if self.enc.vpool.obj(abs(p)) and "class" in self.enc.vpool.obj(abs(p)):
+                print((p, self.enc.vpool.obj(abs(p))))
+        print(self.slv.get_model())
+
+        # forcing a misclassification, i.e. a wrong observation
+        for k in range(len(self.enc.forest.trees)):
+            cl = []
+            for j in range(self.enc.num_class):
+                if j != self.enc.cmaj:
+                    cl.append(self.enc.newVar('class{0}_tr{1}'.format(j, k)))
+            self.slv.add_clause(cl)
+
+        # if satisfiable, then the observation is not implied by the hypotheses
+        if self.slv.solve(assumptions=self.assums):
+            print('  no implication!')
+            print(self.slv.get_model())
+            # print("Model: {0}".format([ (p, self.enc.vpool.obj(abs(p))) for p in self.slv.get_model()]))
+            sys.exit(1)
+
+        if not smallest:
+            self.compute_minimal()
+        else:
+            raise NotImplementedError('Smallest explanation is not yet implemented.')
+            # self.compute_smallest()
+
+        self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
+                    resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.time
+
+        expl = sorted([self.sel2fid[h] for h in self.assums if h > 0])
+        print("expl-selectors: ", expl)
+
+        if self.verbose:
+            self.preamble = [self.preamble[i] for i in expl]
+            # print("cmaj: ", self.enc.cmaj)
+            print(
+                '  explanation: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.target_name[self.enc.cmaj]))
+            print('  # hypos left:', len(self.assums))
+            print('  time: {0:.2f}'.format(self.time))
+
+        return expl
+
+    def compute_minimal(self):
+        """
+        Compute any subset-minimal explanation.
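+
+        Deletion-based linear search, one oracle call per hypothesis: each
+        selector p is tentatively replaced by -p in the assumptions; if
+        the solver reports satisfiable, the flip is kept (p is treated as
+        droppable), otherwise p stays in the assumption set.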
+ """ + i = 0 + # simple deletion-based linear search + for i, p in enumerate(self.assums): + to_test = self.assums[:i] + self.assums[(i + 1):] + [-p] + # print(to_test) + if self.slv.solve(assumptions=to_test): + self.assums[i] = -p + print("Model:") + for p in self.slv.get_model(): + if self.enc.vpool.obj(abs(p)) and self.enc.vpool.obj(abs(p)) in self.inps: + print((p, self.enc.vpool.obj(abs(p)))) diff --git a/pages/application/RandomForest/utils/xrf/archive/setup.py b/pages/application/RandomForest/utils/xrf/archive/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..15c642788ddbea871d9a2a51fa61101d62230808 --- /dev/null +++ b/pages/application/RandomForest/utils/xrf/archive/setup.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python3 + +from distutils.core import setup, Extension + +pysortn_ext = Extension('pysortnetwrk', + sources=['pysortnetwrk.cc'], + include_dirs=['sortcard'], + language='c++') + +setup(name='pysortnetwrk', + version='1.0', + description='This module provides a sorting network to sort a vector of bits', + py_modules=['pysortnetwrk'], + ext_modules=[pysortn_ext]) diff --git a/pages/application/RandomForest/utils/xrf/archive/sortcard.hh b/pages/application/RandomForest/utils/xrf/archive/sortcard.hh new file mode 100644 index 0000000000000000000000000000000000000000..e6410504144682d39d43fcd172ce1c0c10a58f53 --- /dev/null +++ b/pages/application/RandomForest/utils/xrf/archive/sortcard.hh @@ -0,0 +1,298 @@ + +#ifndef SORTCARD_HH_ +#define SORTCARD_HH_ + +#include <vector> +#include <algorithm> +#include <vector> +#include <ostream> + + +#define NOPTCLS true + + +using namespace std; + +class ClauseSet { +public: + ClauseSet() : clauses(0) {} + ClauseSet(ClauseSet& orig) : clauses(orig.clauses) {} + + void clear() + { + clauses.clear(); + } + + size_t size() + { + return clauses.size(); + } + + void resize(size_t sz_new) + { + return clauses.resize(sz_new); + } + + vector<int>& operator[](size_t i) + { + return clauses[i]; + } + + void erase(vector<int>& cl) + { + clauses.erase(std::find(clauses.begin(), clauses.end(), cl)); + } + + void erase_subset(size_t start, ClauseSet& clset) + { + if (clset.size()) { + vector<int>& cl_first = clset[0]; + vector<vector<int> >::iterator begin = std::find(clauses.begin() + start, clauses.end(), cl_first); + clauses.erase(begin, begin + clset.size()); + } + } + + vector<vector<int> >& get_clauses() + { + return clauses; + } + + void add_clause(vector<int> cl) + { + clauses.push_back(cl); + } + + void add_clause_ref(vector<int>& cl) + { + clauses.push_back(cl); + } + + void create_clause(vector<int>& cl) + { + add_clause(cl); + } + + void create_unit_clause(int l) + { + vector<int> cl; cl.push_back(l); + clauses.push_back(cl); + } + + void create_binary_clause(int l1, int l2) + { + vector<int> cl; + cl.push_back(l1); + cl.push_back(l2); + + clauses.push_back(cl); + } + + void create_ternary_clause(int l1, int l2, int l3) + { + vector<int> cl; + cl.push_back(l1); + cl.push_back(l2); + cl.push_back(l3); + + clauses.push_back(cl); + } + + void dump(ostream& out) + { + for (size_t i = 0; i < clauses.size(); ++i) + dump_clause(out, clauses[i]); + } +private: + void dump_clause(ostream& out, vector<int>& cl) + { + for (size_t i = 0; i < cl.size(); ++i) + out << cl[i] << " "; + out << "0" << endl; + } +protected: + vector<vector<int> > clauses; +}; + + + +// +//============================================================================= +inline void create_vvect(int& top_id, vector<int>& ov, size_t nvars) +{ + assert(nvars > 
0);
+
+    size_t refnv = ov.size();
+    size_t tvars = refnv + nvars;
+    ov.resize(tvars, 0);
+
+    for (size_t k = refnv; k < tvars; ++k)
+        ov[k] = ++top_id;
+
+    assert(ov.size() > 0);
+}
+
+
+//
+//=============================================================================
+inline void copy_vvect(int& top_id, vector<int>& ov, vector<int>& iv)
+{
+    size_t refnv = ov.size();
+    ov.resize(refnv + iv.size(), 0);
+
+    for (size_t k = 0; k < iv.size(); ++k)
+        ov[refnv + k] = iv[k];
+
+    assert(ov.size() > 0);
+}
+
+
+
+//
+//=============================================================================
+inline void mk_half_vect(vector<int>& ov, vector<int>& iv, size_t offset)
+{
+    assert(iv.size() > 0);
+
+    size_t ns = iv.size() / 2;
+    ov.resize(ns, 0);
+
+    for (size_t k = 0; k < ns; ++k)
+        ov[k] = iv[offset + k];
+}
+
+//
+//=============================================================================
+inline void mk_odd_vect(vector<int>& ov, vector<int>& iv)
+{
+    assert(iv.size() > 0);
+
+    size_t ns = iv.size() / 2;
+    ov.resize(ns, 0);
+
+    for (size_t k = 0; k < ns; ++k)
+        ov[k] = iv[k * 2];
+}
+
+//=============================================================================
+inline void mk_even_vect(vector<int>& ov, vector<int>& iv)
+{
+    assert(iv.size() > 0);
+
+    size_t ns = iv.size() / 2;
+    ov.resize(ns, 0);
+
+    for (size_t k = 0; k < ns; ++k)
+        ov[k] = iv[k * 2 + 1];
+}
+
+// sorting networks
+//=============================================================================
+inline void sortn_half_merge_recur(
+    int& top_id,
+    ClauseSet& clset,
+    vector<int>& av,
+    vector<int>& bv,
+    vector<int>& cv,
+    size_t zvar
+)
+{
+    assert(bv.size() == av.size());
+
+    if (av.size() == 1) { // vectors of size 1
+        assert(av[0] != 0);
+        if (NOPTCLS || (av[0] != zvar && bv[0] != zvar)) {
+            create_vvect(top_id, cv, 2);
+            clset.create_binary_clause (-av[0], cv[0]);
+            clset.create_binary_clause (-bv[0], cv[0]);
+            clset.create_ternary_clause(-av[0], -bv[0], cv[1]);
+        }
+        else {
+            if (av[0] == zvar) {
+                cv.push_back(bv[0]);
+                cv.push_back(av[0]);
+            }
+            else {
+                assert(bv[0] == zvar);
+                cv.push_back(av[0]);
+                cv.push_back(bv[0]);
+            }
+        }
+    }
+    else {
+        if (NOPTCLS ||
+            ((av[0] != zvar || av[av.size() - 1] != zvar) &&
+             (bv[0] != zvar || bv[av.size() - 1] != zvar))) {
+            vector<int> aodd, aeven, bodd, beven, dv, ev;
+
+            mk_odd_vect(aodd, av); mk_even_vect(aeven, av);
+            mk_odd_vect(bodd, bv); mk_even_vect(beven, bv);
+
+            sortn_half_merge_recur(top_id, clset, aodd, bodd, dv, zvar);
+            sortn_half_merge_recur(top_id, clset, aeven, beven, ev, zvar);
+
+            assert(cv.size() == 0);
+            cv.push_back(dv[0]);
+            create_vvect(top_id, cv, 2 * av.size() - 2);
+            cv.push_back(ev[ev.size() - 1]);
+
+            for (size_t i = 0; i < av.size() - 1; ++i) {
+                assert(i + 1 < dv.size());
+                assert(i < ev.size());
+                assert(2 * i + 1 < cv.size());
+
+                clset.create_binary_clause (-dv[i + 1], cv[2 * i + 1]);
+                clset.create_binary_clause (-ev[i    ], cv[2 * i + 1]);
+                clset.create_ternary_clause(-dv[i + 1], -ev[i], cv[2 * i + 2]);
+            }
+        }
+        else {
+            if (av[0] == zvar && av[av.size() - 1] == zvar) {
+                copy_vvect(top_id, cv, bv);
+                copy_vvect(top_id, cv, av);
+            }
+            else {
+                assert(bv[0] == zvar && bv[av.size() - 1] == zvar);
+                copy_vvect(top_id, cv, av);
+                copy_vvect(top_id, cv, bv);
+            }
+        }
+    }
+
+    assert(cv.size() > 0);
+}
+
+//
+//=============================================================================
+inline vector<int>& sortn_half_sorter_recur(
+    int& top_id,
+    ClauseSet& clset,
+    vector<int>& av,
+    vector<int>& cv,
+    size_t zvar
+)
+{
+    assert(av.size() > 
1);
+
+    if (av.size() == 2) {
+        assert(av[0] != 0 && av[1] != 0);
+
+        vector<int> xav, xbv; xav.push_back(av[0]); xbv.push_back(av[1]);
+        sortn_half_merge_recur(top_id, clset, xav, xbv, cv, zvar);
+    }
+    else {
+        vector<int> dv1, dv2, lav, uav;
+        mk_half_vect(lav, av, 0);
+        mk_half_vect(uav, av, av.size() / 2);
+
+        assert(lav.size() == uav.size());
+        sortn_half_sorter_recur(top_id, clset, lav, dv1, zvar); assert(dv1.size() > 0);
+        sortn_half_sorter_recur(top_id, clset, uav, dv2, zvar); assert(dv2.size() > 0);
+        sortn_half_merge_recur (top_id, clset, dv1, dv2, cv, zvar);
+    }
+
+    assert(cv.size() > 0);
+    return cv;
+}
+
+
+#endif // SORTCARD_HH_
diff --git a/pages/application/RandomForest/utils/xrf/archive/sortnetwrk.py b/pages/application/RandomForest/utils/xrf/archive/sortnetwrk.py
new file mode 100644
index 0000000000000000000000000000000000000000..f18c26ece6ad9804c0d40f4e584df7e149b075c0
--- /dev/null
+++ b/pages/application/RandomForest/utils/xrf/archive/sortnetwrk.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+#
+from math import exp, log, trunc
+
+from pysat._utils import MainThread
+from pysat.formula import CNF, IDPool
+import pysortnetwrk
+
+
+
+def HSorNetwrk(lits, top_id=None, vpool=None):
+    assert not top_id or not vpool, \
+        'Use either a top id or a pool of variables but not both.'
+
+
+    # if the list of literals is empty, there is nothing to sort
+    if not lits:
+        return [], [], top_id if top_id is not None else (vpool.top if vpool else 0)
+
+    # obtaining the top id from the variable pool
+    if vpool:
+        top_id = vpool.top
+
+
+    # making sure we are dealing with a list of literals
+    lits = list(lits)
+
+    # choosing the maximum id among the current top and the list of literals
+    top_id = max(map(lambda x: abs(x), lits + [top_id if top_id != None else 0]))
+
+
+    nvars = len(lits)
+
+    # get the smallest power of 2 that is not smaller than the number of vars
+    exponent = trunc(log(nvars) / log(2))  # tentative exponent
+    nvtmp = exp(log(2) * exponent)
+
+    # check if number of vars already is power of 2; correct exponent if required
+    exponent = exponent if (nvars - nvtmp < 0.000001) else exponent + 1
+    nnvars = trunc(exp(log(2) * exponent) + 0.1)
+
+    cl = None
+    zvar = 0
+    if (nnvars != nvars):
+        top_id += 1
+        zvar = top_id
+        lits.extend([zvar] * (nnvars - nvars))
+        cl = [-zvar]
+
+    # generate odd-even sorting network
+    clset, slits, top = pysortnetwrk.HSort(lits, top_id, zvar, int(MainThread.check()))
+
+    clset = clset + [cl] if (cl is not None) else clset
+
+
+    # updating vpool if necessary
+    if vpool:
+        vpool.top = top - 1
+        vpool._next()
+
+
+
+    return clset, slits, top
+
+if __name__ == '__main__':
+    print("Sorting Network:")
+    lits = [1, 2, 3]
+    top_id = 5
+    clauses, slits, top = HSorNetwrk(lits, top_id)
+    print(clauses)
+    print(slits)
\ No newline at end of file
diff --git a/pages/application/RandomForest/utils/xrf/archive/tree.py b/pages/application/RandomForest/utils/xrf/archive/tree.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e3794544ac8d0e2baded3a4271bab312f4aa86e
--- /dev/null
+++ b/pages/application/RandomForest/utils/xrf/archive/tree.py
@@ -0,0 +1,154 @@
+#
+#==============================================================================
+from anytree import Node, RenderTree,AsciiStyle
+import json
+import numpy as np
+import math
+import os
+
+
+#
+#==============================================================================
+class xgnode(Node):
+    def __init__(self, id, parent = None):
+        Node.__init__(self, id, parent)
+        self.id = id  # The node value
+        self.name = None
self.left_node_id = -1   # Left child
+        self.right_node_id = -1  # Right child
+
+        self.feature = -1
+        self.threshold = None
+        self.values = -1
+        #iai
+        #self.split = None
+
+    def __str__(self):
+        pref = ' ' * self.depth
+        if (len(self.children) == 0):
+            return (pref + "leaf: {}  {}".format(self.id, self.values))
+        else:
+            if (self.name is None):
+                return (pref + "{} f{}<{}".format(self.id, self.feature, self.threshold))
+            else:
+                return (pref + "{} \"{}\"<{}".format(self.id, self.name, self.threshold))
+
+
+#==============================================================================
+def build_tree(tree_, feature_names = None):
+    ##
+    feature = tree_.feature
+    threshold = tree_.threshold
+    values = tree_.value
+    n_nodes = tree_.node_count
+    children_left = tree_.children_left
+    children_right = tree_.children_right
+    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
+    is_leaf = np.zeros(shape=n_nodes, dtype=bool)
+    stack = [(0, -1)]  # seed is the root node id and its parent depth
+    while len(stack) > 0:
+        node_id, parent_depth = stack.pop()
+        node_depth[node_id] = parent_depth + 1
+
+        # If we have a test node
+        if (children_left[node_id] != children_right[node_id]):
+            stack.append((children_left[node_id], parent_depth + 1))
+            stack.append((children_right[node_id], parent_depth + 1))
+        else:
+            is_leaf[node_id] = True
+    ##
+
+    m = tree_.node_count
+    assert (m > 0), "Empty tree"
+
+    def extract_data(idx, root = None, feature_names = None):
+        i = idx
+        assert (i < m), "Error index node"
+        if (root is None):
+            node = xgnode(i)
+        else:
+            node = xgnode(i, parent = root)
+        #node.cover = json_node["cover"]
+        if is_leaf[i]:
+            node.values = np.argmax(values[i])
+            #if(inverse):
+            #    node.values = -node.values
+        else:
+            node.feature = feature[i]
+            if (feature_names is not None):
+                node.name = feature_names[feature[i]]
+            node.threshold = threshold[i]
+            node.left_node_id = children_left[i]
+            node.right_node_id = children_right[i]
+            extract_data(node.left_node_id, node, feature_names)   # feat < 0.5 (False)
+            extract_data(node.right_node_id, node, feature_names)  # feat >= 0.5 (True)
+
+        return node
+
+    root = extract_data(0, None, feature_names)
+
+    return root
+
+
+#==============================================================================
+def walk_tree(node):
+    if (len(node.children) == 0):
+        # leaf
+        print(node)
+    else:
+        print(node)
+        walk_tree(node.children[0])
+        walk_tree(node.children[1])
+
+#
+#==============================================================================
+def predict_tree(node, sample):
+    if (len(node.children) == 0):
+        # leaf
+        return node.values
+    else:
+        feature_branch = node.feature
+        sample_value = sample[feature_branch]
+        assert(sample_value is not None)
+        if (sample_value < node.threshold):
+            return predict_tree(node.children[0], sample)
+        else:
+            return predict_tree(node.children[1], sample)
+
+
+#
+#==============================================================================
+class Forest:
+    """ An ensemble of decision trees.
+
+    This object provides a common interface to many different types of models.
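+
+    Illustrative use (names as defined in this module; `cls` is assumed to
+    be an RF2001-style wrapper exposing estimators() and n_estimators()):
+
+        forest = Forest(cls, feature_names)
+        y_maj = forest.predict(X_test)   # majority vote over all trees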
+ """ + def __init__(self, rf, feature_names = None): + self.rf = rf + ##self.feature_names = feature_names + self.trees = [ build_tree(dt.tree_, feature_names) for dt in self.rf.estimators()] + #self.print_trees() + + def print_trees(self): + for i,t in enumerate(self.trees): + print("tree number: ", i) + walk_tree(t) + + def predict(self, samples): + predictions = [] + n_estimators = self.rf.n_estimators() + print("#Trees: ", n_estimators) + for sample in np.asarray(samples): + scores = [] + for i,t in enumerate(self.trees): + s = predict_tree(t, sample) + scores.append((s)) + scores = np.asarray(scores) + predictions.append(scores) + predictions = np.asarray(predictions) + #print(predictions) + #np.bincount(x, weights=self._weights_not_none) + maj = np.apply_along_axis(lambda x: np.argmax(np.bincount(x)), axis=1, arr=predictions) + + return maj + diff --git a/pages/application/RandomForest/utils/xrf/checker.py b/pages/application/RandomForest/utils/xrf/checker.py new file mode 100644 index 0000000000000000000000000000000000000000..5fb8650613bc4fe4e8d9d033e71476729a051164 --- /dev/null +++ b/pages/application/RandomForest/utils/xrf/checker.py @@ -0,0 +1,346 @@ +# +#============================================================================== +import numpy as np +import math + +from .tree import Forest, dt_node +import six +from pysat.formula import CNF, IDPool +from pysat.solvers import Solver +from pysat.card import CardEnc, EncType +#from itertools import combinations + +# +#============================================================================== +def predict_tree(node, sample): + if (len(node.children) == 0): + # leaf + return node.values + else: + feat = node.feature + sample_value = sample[feat] + if sample_value is None: + return predict_tree(node.children[0], sample) + elif(sample_value < node.threshold): + return predict_tree(node.children[0], sample) + else: + return predict_tree(node.children[1], sample) + + +# +#============================================================================== +class Checker: + + def __init__(self, forest, num_class, feature_names): + self.forest = forest + self.num_class = num_class + self.feature_names = feature_names + self.cnf = None + self.vpool = IDPool() + + + self.intvs = None + self.intvs = {'{0}'.format(f): set([]) for f in feature_names if '_' not in f} + for tree in self.forest.trees: + self.traverse_intervals(tree) + self.intvs = {f: sorted(self.intvs[f]) + + ([math.inf] if len(self.intvs[f]) else []) + for f in six.iterkeys(self.intvs)} + self.imaps, self.ivars = {}, {} + self.thvars = {} + for feat, intvs in six.iteritems(self.intvs): + self.imaps[feat] = {} + self.ivars[feat] = [] + self.thvars[feat] = [] + for i, ub in enumerate(intvs): + self.imaps[feat][ub] = i + ivar = self.newVar('{0}_intv{1}'.format(feat, i)) + self.ivars[feat].append(ivar) + if ub != math.inf: + thvar = self.newVar('{0}_th{1}'.format(feat, i)) + self.thvars[feat].append(thvar) + + + self.cnf = CNF() + #### + cvars = [self.newVar('class{0}'.format(i)) for i in range(num_class)] + num_tree = len(self.forest.trees) + ctvars = [[] for t in range(num_tree)] + for k in range(num_tree): + for j in range(self.num_class): + var = self.newVar('class{0}_tr{1}'.format(j,k)) + ctvars[k].append(var) + ##### + for k, tree in enumerate(self.forest.trees): + self.traverse(tree, k, []) + card = CardEnc.atmost(lits=ctvars[k], vpool=self.vpool,encoding=EncType.cardnetwrk) + self.cnf.extend(card.clauses) + ###### + for f, intvs in six.iteritems(self.ivars): + if not 
len(intvs): + continue + self.cnf.append(intvs) + card = CardEnc.atmost(lits=intvs, vpool=self.vpool, encoding=EncType.cardnetwrk) + self.cnf.extend(card.clauses) + for f, threshold in six.iteritems(self.thvars): + for j, thvar in enumerate(threshold): + d = j+1 + pos, neg = self.ivars[f][d:], self.ivars[f][:d] + if j == 0: + self.cnf.append([thvar, neg[-1]]) + self.cnf.append([-thvar, -neg[-1]]) + else: + self.cnf.append([thvar, neg[-1], -threshold[j-1]]) + self.cnf.append([-thvar, threshold[j-1]]) + self.cnf.append([-thvar, -neg[-1]]) + + if j == len(threshold) - 1: + self.cnf.append([-thvar, pos[0]]) + self.cnf.append([thvar, -pos[0]]) + else: + self.cnf.append([-thvar, pos[0], threshold[j+1]]) + self.cnf.append([thvar, -pos[0]]) + self.cnf.append([thvar, -threshold[j+1]]) + + + def newVar(self, name): + if name in self.vpool.obj2id: #var has been already created + return self.vpool.obj2id[name] + var = self.vpool.id('{0}'.format(name)) + return var + + def traverse(self, tree, k, clause): + if tree.children: + f = tree.name + v = tree.threshold + pos = neg = [] + if f in self.intvs: + d = self.imaps[f][v] + pos, neg = self.thvars[f][d], -self.thvars[f][d] + else: + var = self.newVar(tree.name) + pos, neg = var, -var + self.traverse(tree.children[0], k, clause + [-neg]) + self.traverse(tree.children[1], k, clause + [-pos]) + else: # leaf node + cvar = self.newVar('class{0}_tr{1}'.format(tree.values,k)) + self.cnf.append(clause + [cvar]) + #self.printLits(clause + [cvar]) + + + + def traverse_intervals(self, tree): + if tree.children: + f = tree.name + v = tree.threshold + if f in self.intvs: + self.intvs[f].add(v) + self.traverse_intervals(tree.children[0]) + self.traverse_intervals(tree.children[1]) + + + def check(self, sample, expl): + print("check PI-expl") + slv = Solver(name="glucose3") + slv.append_formula(self.cnf) + + pred = self.forest.predict_inst(sample) + num_tree = len(self.forest.trees) + ##### + cvars = [self.newVar('class{0}'.format(i)) for i in range(self.num_class)] + ctvars = [[] for t in range(num_tree)] + for k in range(num_tree): + for j in range(self.num_class): + var = self.newVar('class{0}_tr{1}'.format(j,k)) + ctvars[k].append(var) + # + rhs = num_tree - 1 + for j in range(pred): + lhs = [ctvars[k][j] for k in range(num_tree)] + [ - ctvars[k][pred] for k in range(num_tree)] + atms = CardEnc.atmost(lits = lhs, bound = rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) + #add maj class selector to activate/deactivate eq atmsk + #self.cnf.extend([cl + [-cvars[pred]] for cl in atms]) + slv.append_formula([cl + [-cvars[pred]] for cl in atms]) + rhs = num_tree + for j in range(pred + 1, self.num_class): + lhs = [ctvars[k][j] for k in range(num_tree)] + [ - ctvars[k][pred] for k in range(num_tree)] + atms = CardEnc.atmost(lits = lhs, bound = rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) + #self.cnf.extend([cl + [-cvars[pred]] for cl in atms]) + slv.append_formula([cl + [-cvars[pred]] for cl in atms]) + ######## + ######## + rhs = num_tree + for j in range(pred): + lhs = [ - ctvars[k][j] for k in range(num_tree)] + [ctvars[k][pred] for k in range(num_tree)] + atms = CardEnc.atmost(lits = lhs, bound = rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) + #self.cnf.extend([cl+[-cvars[j]] for cl in atms]) + slv.append_formula([cl+[-cvars[j]] for cl in atms]) + rhs = num_tree - 1 + for j in range(pred + 1, self.num_class): + lhs = [ - ctvars[k][j] for k in range(num_tree)] + [ctvars[k][pred] for k in range(num_tree)] + atms = CardEnc.atmost(lits = lhs, bound = rhs, 
vpool=self.vpool, encoding=EncType.cardnetwrk) + #self.cnf.extend([cl+[-cvars[j]] for cl in atms]) + slv.append_formula([cl+[-cvars[j]] for cl in atms]) + ############ + #self.cnf.append(cvars) + card = CardEnc.atmost(lits=cvars, vpool=self.vpool, encoding=EncType.cardnetwrk) #AtMostOne constraint + #self.cnf.extend(card.clauses) + slv.add_clause(cvars) + slv.append_formula(card.clauses) + + assums = [] # var selectors to be used as assumptions + #sel2fid = {} # selectors to original feature ids + #sel2vid = {} # selectors to categorical feature ids + #sel2v = {} # selectors to (categorical/interval) values + sel_expl = [] + + #inps = ['f{0}'.format(f) for f in range(len(sample))] # works only with pure continuous feats + inps = self.feature_names + + for i, (inp, val) in enumerate(zip(inps, sample)): + if len(self.intvs[inp]): + v = next((intv for intv in self.intvs[inp] if intv > val), None) + assert(v is not None) + selv = self.newVar('selv_{0}'.format(inp)) + assums.append(selv) + ## + if i in expl: + sel_expl.append(selv) + #print('{0}={1}'.format('selv_{0}'.format(inp), val)) + ## + for j,p in enumerate(self.ivars[inp]): + cl = [-selv] + if j == self.imaps[inp][v]: + cl += [p] + #self.sel2v[selv] = p + else: + cl += [-p] + #self.cnf.append(cl) + slv.add_clause(cl) + assums = sorted(set(assums)) + #print(sel_expl, assums) + sel_pred = cvars[pred] + + #slv = Solver(name="glucose3") + #slv.append_formula(self.cnf) + + + assert (slv.solve(assumptions=sel_expl+[sel_pred])), '{0} is not an explanation.'.format(expl) + print('expl:{0} is valid'.format(expl)) + + for i, p in enumerate(sel_expl): + #print(i,p) + to_test = sel_expl[:i] + sel_expl[(i + 1):] + [-sel_pred] + print(to_test) + assert slv.solve(assumptions=to_test), '{0} is not minimal explanation.'.format(expl) + + # delete sat solver + slv.delete() + slv = None + + print('expl:{0} is minimal'.format(expl)) + print() + + +def check_expl(sample, expl, forest, intvs): + + print("check PI-expl") + + pred = forest.predict_inst(sample) + + sample_expl = [None]*len(sample) + for p in expl: + sample_expl[p] = sample[p] + + # initializing the intervals + #intvs = {'f{0}'.format(f): set([]) for f in range(len(sample))} + #for tree in forest.trees: + # traverse_intervals(tree) + + # first, check if expl is an explanation + scores = [predict_tree(dt, sample_expl) for dt in forest.trees] + scores = np.asarray(scores) + maj = np.argmax(np.bincount(scores)) + + assert maj == pred, '{0} is not an explanation.'.format(expl) + + print('expl:{0} is valid'.format(expl)) + print("pred = ", pred) + + sample_expl = sample + + feats = ['f{0}'.format(f) for f in expl] + univ = [(i, f) for i, f in enumerate(intvs) if (len(intvs[f]) and (f not in feats))] + + # Now, check if expl is a minimal + for p, f in zip(expl, feats): + print("{0}={1}".format(f, sample_expl[p])) + print([-math.inf]+intvs[f]) + assert(len(intvs[f])) + + # calculate possible values for f + possible_val = [] + d = next((i for i, v in enumerate(intvs[f]) if v > sample_expl[p]), None) + assert(d is not None) + print("d=",d) + + if d: + #possible_val.append(intvs[f][0] - 1) + possible_val.append(-math.inf) + print(intvs[f][:d-1]) + for i, v in enumerate(intvs[f][:d-1]): + possible_val.append((v + intvs[f][i + 1]) * 0.5) + + for i, v in enumerate(intvs[f][d+1:]): + #print('{0} + {1}'.format(v , intvs[f][d+i])) + possible_val.append((v + intvs[f][d+i]) * 0.5) + #if v == math.inf: + # assert(v == intvs[f][-1]) + # possible_val.append(v + 1) + #else: + # possible_val.append((v + intvs[f][i - 
1]) * 0.5) + + + ## print("{0} => {1} | {2} , {3}".format(f,sample_expl[p], [-math.inf]+intvs[f], possible_val)) + for v in possible_val: + sample_expl[p] = v + for uf in univ: + for x in ([-math.inf]+intvs[uf[1]]): + print('{0}={1}'.format(uf[1], x)) + sample_expl[uf[0]] = x + scores = [predict_tree(dt, sample_expl) for dt in forest.trees] + scores = np.asarray(scores) + maj = np.argmax(np.bincount(scores)) + #print("maj: {0} | {1}={2}".format( maj, f, v)) + if maj != pred: + break + sample_expl[uf[0]] = sample[p] + + print("maj: {0} | {1}={2}".format( maj, f, v)) + + else: + assert False, '{0} is not minimal explanation.'.format(expl) + + sample_expl[p] = sample[p] + + print('expl:{0} is minimal'.format(expl)) + print() + + return True + +''' +def traverse_intervals(tree, intvs): + if tree.children: + f = tree.name + v = tree.threshold + if f in self.intvs: + intvs[p].add(v) + + l_intvs = traverse_intervals(tree.children[0]) + r_intvs = traverse_intervals(tree.children[1]) + return {**l_intvs, **r_intvs} + + else: + return intvs +''' + diff --git a/pages/application/RandomForest/utils/xrf/rndmforest.py b/pages/application/RandomForest/utils/xrf/rndmforest.py new file mode 100644 index 0000000000000000000000000000000000000000..583fe91b8dcd0fab92b9be65b30794636497eb7d --- /dev/null +++ b/pages/application/RandomForest/utils/xrf/rndmforest.py @@ -0,0 +1,976 @@ + +from sklearn.ensemble._voting import VotingClassifier +from sklearn.ensemble import RandomForestClassifier +from sklearn.preprocessing import OneHotEncoder, LabelEncoder +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score +import numpy as np +import sys +import os +import resource + +import collections +from six.moves import range +import six +import math + +from pages.application.RandomForest.utils.data import Data +from .tree import Forest, predict_tree +#from .encode import SATEncoder +from pysat.formula import CNF, IDPool +from pysat.solvers import Solver +from pysat.card import CardEnc, EncType +from itertools import combinations + + + +# +#============================================================================== +class Dataset(Data): + """ + Class for representing dataset (transactions). 
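+
+    Loads the samples from a Data source, label-encodes a non-numeric
+    target column and, when use_categorical is set, fits a one-hot
+    binarizer per categorical feature.
+
+    Illustrative usage (the file name is hypothetical):
+
+        data = Dataset(filename='iris.csv', separator=',')
+        X_train, X_test, y_train, y_test = data.train_test_split()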
+ """ + def __init__(self, filename=None, fpointer=None, mapfile=None, + separator=' ', use_categorical = False): + super().__init__(filename, fpointer, mapfile, separator, use_categorical) + + # split data into X and y + self.feature_names = self.names[:-1] + self.nb_features = len(self.feature_names) + self.use_categorical = use_categorical + + samples = np.asarray(self.samps) + if not all(c.isnumeric() for c in samples[:, -1]): + le = LabelEncoder() + le.fit(samples[:, -1]) + samples[:, -1]= le.transform(samples[:, -1]) + #self.class_names = le.classes_ + print(le.classes_) + print(samples[1:4, :]) + + samples = np.asarray(samples, dtype=np.float32) + self.X = samples[:, 0: self.nb_features] + self.y = samples[:, self.nb_features] + self.num_class = len(set(self.y)) + self.target_name = list(range(self.num_class)) + + print("c nof features: {0}".format(self.nb_features)) + print("c nof classes: {0}".format(self.num_class)) + print("c nof samples: {0}".format(len(self.samps))) + + # check if we have info about categorical features + if (self.use_categorical): + self.target_name = self.class_names + + self.binarizer = {} + for i in self.categorical_features: + self.binarizer.update({i: OneHotEncoder(categories='auto', sparse=False)})#, + self.binarizer[i].fit(self.X[:,[i]]) + else: + self.categorical_features = [] + self.categorical_names = [] + self.binarizer = [] + #feat map + self.mapping_features() + + + + def train_test_split(self, test_size=0.2, seed=0): + return train_test_split(self.X, self.y, test_size=test_size, random_state=seed) + + + def transform(self, x): + if(len(x) == 0): + return x + if (len(x.shape) == 1): + x = np.expand_dims(x, axis=0) + if (self.use_categorical): + assert(self.binarizer != []) + tx = [] + for i in range(self.nb_features): + #self.binarizer[i].drop = None + if (i in self.categorical_features): + self.binarizer[i].drop = None + tx_aux = self.binarizer[i].transform(x[:,[i]]) + tx_aux = np.vstack(tx_aux) + tx.append(tx_aux) + else: + tx.append(x[:,[i]]) + tx = np.hstack(tx) + return tx + else: + return x + + def transform_inverse(self, x): + if(len(x) == 0): + return x + if (len(x.shape) == 1): + x = np.expand_dims(x, axis=0) + if (self.use_categorical): + assert(self.binarizer != []) + inverse_x = [] + for i, xi in enumerate(x): + inverse_xi = np.zeros(self.nb_features) + for f in range(self.nb_features): + if f in self.categorical_features: + nb_values = len(self.categorical_names[f]) + v = xi[:nb_values] + v = np.expand_dims(v, axis=0) + iv = self.binarizer[f].inverse_transform(v) + inverse_xi[f] =iv + xi = xi[nb_values:] + + else: + inverse_xi[f] = xi[0] + xi = xi[1:] + inverse_x.append(inverse_xi) + return inverse_x + else: + return x + + def transform_inverse_by_index(self, idx): + if (idx in self.extended_feature_names): + return self.extended_feature_names[idx] + else: + print("Warning there is no feature {} in the internal mapping".format(idx)) + return None + + def transform_by_value(self, feat_value_pair): + if (feat_value_pair in self.extended_feature_names.values()): + keys = (list(self.extended_feature_names.keys())[list( self.extended_feature_names.values()).index(feat_value_pair)]) + return keys + else: + print("Warning there is no value {} in the internal mapping".format(feat_value_pair)) + return None + + def mapping_features(self): + self.extended_feature_names = {} + self.extended_feature_names_as_array_strings = [] + counter = 0 + if (self.use_categorical): + for i in range(self.nb_features): + if (i in self.categorical_features): + 
for j, _ in enumerate(self.binarizer[i].categories_[0]): + self.extended_feature_names.update({counter: (self.feature_names[i], j)}) + self.extended_feature_names_as_array_strings.append("f{}_{}".format(i,j)) # str(self.feature_names[i]), j)) + counter = counter + 1 + else: + self.extended_feature_names.update({counter: (self.feature_names[i], None)}) + self.extended_feature_names_as_array_strings.append("f{}".format(i)) #(self.feature_names[i]) + counter = counter + 1 + else: + for i in range(self.nb_features): + self.extended_feature_names.update({counter: (self.feature_names[i], None)}) + self.extended_feature_names_as_array_strings.append("f{}".format(i))#(self.feature_names[i]) + counter = counter + 1 + + def readable_sample(self, x): + readable_x = [] + for i, v in enumerate(x): + if (i in self.categorical_features): + readable_x.append(self.categorical_names[i][int(v)]) + else: + readable_x.append(v) + return np.asarray(readable_x) + + + def test_encoding_transformes(self, X_train): + # test encoding + + X = X_train[[0],:] + + print("Sample of length", len(X[0])," : ", X) + enc_X = self.transform(X) + print("Encoded sample of length", len(enc_X[0])," : ", enc_X) + inv_X = self.transform_inverse(enc_X) + print("Back to sample", inv_X) + print("Readable sample", self.readable_sample(inv_X[0])) + assert((inv_X == X).all()) + + ''' + for i in range(len(self.extended_feature_names)): + print(i, self.transform_inverse_by_index(i)) + for key, value in self.extended_feature_names.items(): + print(value, self.transform_by_value(value)) + ''' +# +#============================================================================== +class VotingRF(VotingClassifier): + """ + Majority rule classifier + """ + + def fit(self, X, y, sample_weight=None): + self.estimators_ = [] + for _, est in self.estimators: + self.estimators_.append(est) + + self.le_ = LabelEncoder().fit(y) + self.classes_ = self.le_.classes_ + + + def predict(self, X): + """Predict class labels for X. + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. + Returns + ------- + maj : array-like of shape (n_samples,) + Predicted class labels. + """ + #check_is_fitted(self) + + # 'hard' voting + predictions = self._predict(X) + predictions = np.asarray(predictions, np.int64) #NEED TO BE CHECKED + maj = np.apply_along_axis( + lambda x: np.argmax( + np.bincount(x, weights=self._weights_not_none)), + axis=1, arr=predictions) + + maj = self.le_.inverse_transform(maj) + + return maj + + +# +#============================================================================== +class RF2001(object): + """ + The main class to train Random Forest Classifier (RFC). + """ + + def __init__(self, options): + """ + Constructor. + """ + self.forest = None + self.voting = None + self.opt = options + + param_dist = {'n_estimators':options.n_estimators, + 'max_depth':options.maxdepth, + 'criterion':'entropy', + 'random_state':324089} + + self.forest = RandomForestClassifier(**param_dist) + + + + def train(self, dataset, outfile=None): + """ + Train a random forest. 
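+
+        Fits the sklearn RandomForestClassifier, wraps its estimators
+        in a VotingRF majority-vote classifier, and returns the pair
+        (train accuracy, test accuracy).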
+ """ + + X_train, X_test, y_train, y_test = dataset.train_test_split() + + if self.opt.verb: + dataset.test_encoding_transformes(X_train) + + X_train = dataset.transform(X_train) + X_test = dataset.transform(X_test) + + print("Build a random forest.") + self.forest.fit(X_train,y_train) + + rtrees = [ ('dt', dt) for i, dt in enumerate(self.forest.estimators_)] + self.voting = VotingRF(estimators=rtrees) + self.voting.fit(X_train,y_train) + + ''' + print(X_test[[0],:]) + print("RF: ",np.asarray(self.voting.predict(X_test[[0],:]))) + for i,t in enumerate(self.forest.estimators_): + print("DT_{0}: {1}".format(i,np.asarray(t.predict(X_test[[0],:])))) + ''' + + train_acc = accuracy_score(self.predict(X_train), y_train) + test_acc = accuracy_score(self.predict(X_test), y_test) + + if self.opt.verb > 1: + self.print_acc_vote(X_train, X_test, y_train, y_test) + self.print_acc_prob(X_train, X_test, y_train, y_test) + + return train_acc, test_acc + + def predict(self, X): + return self.voting.predict(X) + + def predict_prob(self, X): + self.forest.predict(X) + + def estimators(self): + assert(self.forest.estimators_ is not None) + return self.forest.estimators_ + + def n_estimators(self): + return self.forest.n_estimators + + def print_acc_vote(self, X_train, X_test, y_train, y_test): + train_acc = accuracy_score(self.predict(X_train), y_train) + test_acc = accuracy_score(self.predict(X_test), y_test) + print("----------------------") + print("RF2001:") + print("Train accuracy RF2001: {0:.2f}".format(100. * train_acc)) + print("Test accuracy RF2001: {0:.2f}".format(100. * test_acc)) + print("----------------------") + + def print_acc_prob(self, X_train, X_test, y_train, y_test): + train_acc = accuracy_score(self.forest.predict(X_train), y_train) + test_acc = accuracy_score(self.forest.predict(X_test), y_test) + print("RF-scikit:") + print("Train accuracy RF-scikit: {0:.2f}".format(100. * train_acc)) + print("Test accuracy RF-scikit: {0:.2f}".format(100. * test_acc)) + print("----------------------") + + def print_accuracy(self, data): + _, X_test, _, y_test = data.train_test_split() + #X_train = dataset.transform(X_train) + X_test = data.transform(X_test) + test_acc = accuracy_score(self.predict(X_test), y_test) + #print("----------------------") + #print("Train accuracy : {0:.2f}".format(100. * train_acc)) + #print("Test accuracy : {0:.2f}".format(100. * test_acc)) + print("c Cross-Validation: {0:.2f}".format(100. * test_acc)) + #print("----------------------") +# +#============================================================================== +class XRF(object): + """ + class to encode and explain Random Forest classifiers. + """ + + def __init__(self, options, model, dataset): + self.cls = model + self.data = dataset + self.verbose = options.verb + self.f = Forest(model, dataset.extended_feature_names_as_array_strings) + + if options.verb > 2: + self.f.print_trees() + print("c RF sz:", self.f.sz) + print('c max-depth:', self.f.md) + print('c nof DTs:', len(self.f.trees)) + + + def encode(self, inst): + """ + Encode a tree ensemble trained previously. 
+ """ + if 'f' not in dir(self): + self.f = Forest(self.cls, self.data.extended_feature_names_as_array_strings) + #self.f.print_tree() + + time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime + + self.enc = SATEncoder(self.f, self.data.feature_names, self.data.num_class, \ + self.data.extended_feature_names_as_array_strings) + + inst = self.data.transform(np.array(inst))[0] + formula, _, _, _ = self.enc.encode(inst) + + time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime - time + + if self.verbose: + print('c nof vars:', formula.nv) # number of variables + print('c nof clauses:', len(formula.clauses)) # number of clauses + print('c encoding time: {0:.3f}'.format(time)) + + def explain(self, inst): + """ + Explain a prediction made for a given sample with a previously + trained RF. + """ + + time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime + + if 'enc' not in dir(self): + self.encode(inst) + + #if self.verbose: + # print("instance: {0}".format(np.array(inst)) ) + + inpvals = self.data.readable_sample(inst) + preamble = [] + for f, v in zip(self.data.feature_names, inpvals): + if f not in str(v): + preamble.append('{0} = {1}'.format(f, v)) + else: + preamble.append(v) + + inps = self.data.extended_feature_names_as_array_strings # input (feature value) variables + #print("inps: {0}".format(inps)) + + self.x = SATExplainer(self.enc, inps, preamble, self.data.target_name, verb=self.verbose) + inst = self.data.transform(np.array(inst))[0] + expl = self.x.explain(np.array(inst)) + + time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime - time + + if self.verbose: + print("c Total time: {0:.3f}".format(time)) + + return expl + + + def test_tree_ensemble(self): + if 'f' not in dir(self): + self.f = Forest(self.cls) + + _, X_test, _, y_test = self.data.train_test_split() + X_test = self.data.transform(X_test) + + y_pred_forest = self.f.predict(X_test) + acc = accuracy_score(y_pred_forest, y_test) + print("Test accuracy: {0:.2f}".format(100. * acc)) + + y_pred_cls = self.cls.predict(X_test) + #print(np.asarray(y_pred_cls, np.int64)) + #print(y_pred_forest) + + assert((y_pred_cls == y_pred_forest).all()) + + +# +#============================================================================== +class SATEncoder(object): + """ + Encoder of Random Forest classifier into SAT. + """ + + def __init__(self, forest, feats, nof_classes, extended_feature_names, from_file=None): + self.forest = forest + #self.feats = {f: i for i, f in enumerate(feats)} + self.num_class = nof_classes + self.vpool = IDPool() + self.extended_feature_names = extended_feature_names + + #encoding formula + self.cnf = None + + # for interval-based encoding + self.intvs, self.imaps, self.ivars, self.thvars = None, None, None, None + + + def newVar(self, name): + + if name in self.vpool.obj2id: #var has been already created + return self.vpool.obj2id[name] + + var = self.vpool.id('{0}'.format(name)) + return var + + def printLits(self, lits): + print(["{0}{1}".format("-" if p<0 else "",self.vpool.obj(abs(p))) for p in lits]) + + def traverse(self, tree, k, clause): + """ + Traverse a tree and encode each node. 
+ """ + + if tree.children: + f = tree.name + v = tree.threshold + pos = neg = [] + if f in self.intvs: + d = self.imaps[f][v] + pos, neg = self.thvars[f][d], -self.thvars[f][d] + else: + var = self.newVar(tree.name) + pos, neg = var, -var + #print("{0} => {1}".format(tree.name, var)) + + assert (pos and neg) + self.traverse(tree.children[0], k, clause + [-neg]) + self.traverse(tree.children[1], k, clause + [-pos]) + else: # leaf node + cvar = self.newVar('class{0}_tr{1}'.format(tree.values,k)) + self.cnf.append(clause + [cvar]) + #self.printLits(clause + [cvar]) + + def compute_intervals(self): + """ + Traverse all trees in the ensemble and extract intervals for each + feature. + + At this point, the method only works for numerical datasets! + """ + + def traverse_intervals(tree): + """ + Auxiliary function. Recursive tree traversal. + """ + + if tree.children: + f = tree.name + v = tree.threshold + if f in self.intvs: + self.intvs[f].add(v) + + traverse_intervals(tree.children[0]) + traverse_intervals(tree.children[1]) + + # initializing the intervals + self.intvs = {'{0}'.format(f): set([]) for f in self.extended_feature_names if '_' not in f} + + for tree in self.forest.trees: + traverse_intervals(tree) + + # OK, we got all intervals; let's sort the values + self.intvs = {f: sorted(self.intvs[f]) + ([math.inf] if len(self.intvs[f]) else []) for f in six.iterkeys(self.intvs)} + + self.imaps, self.ivars = {}, {} + self.thvars = {} + for feat, intvs in six.iteritems(self.intvs): + self.imaps[feat] = {} + self.ivars[feat] = [] + self.thvars[feat] = [] + for i, ub in enumerate(intvs): + self.imaps[feat][ub] = i + + ivar = self.newVar('{0}_intv{1}'.format(feat, i)) + self.ivars[feat].append(ivar) + #print('{0}_intv{1}'.format(feat, i)) + + if ub != math.inf: + #assert(i < len(intvs)-1) + thvar = self.newVar('{0}_th{1}'.format(feat, i)) + self.thvars[feat].append(thvar) + #print('{0}_th{1}'.format(feat, i)) + + + + def encode(self, sample): + """ + Do the job. 
+ """ + + ###print('Encode RF into SAT ...') + + self.cnf = CNF() + # getting a tree ensemble + #self.forest = Forest(self.model, self.extended_feature_names) + num_tree = len(self.forest.trees) + self.forest.predict_inst(sample) + + #introducing class variables + #cvars = [self.newVar('class{0}'.format(i)) for i in range(self.num_class)] + + # define Tautology var + vtaut = self.newVar('Tautology') + + # introducing class-tree variables + ctvars = [[] for t in range(num_tree)] + for k in range(num_tree): + for j in range(self.num_class): + var = self.newVar('class{0}_tr{1}'.format(j,k)) + ctvars[k].append(var) + + # traverse all trees and extract all possible intervals + # for each feature + ###print("compute intervarls ...") + self.compute_intervals() + + #print(self.intvs) + #print([len(self.intvs[f]) for f in self.intvs]) + #print(self.imaps) + #print(self.ivars) + #print(self.thvars) + #print(ctvars) + + + ##print("encode trees ...") + # traversing and encoding each tree + for k, tree in enumerate(self.forest.trees): + #print("Encode tree#{0}".format(k)) + # encoding the tree + self.traverse(tree, k, []) + # exactly one class var is true + #self.printLits(ctvars[k]) + card = CardEnc.atmost(lits=ctvars[k], vpool=self.vpool,encoding=EncType.cardnetwrk) + self.cnf.extend(card.clauses) + + + + # calculate the majority class + self.cmaj = self.forest.predict_inst(sample) + + ##print("encode majority class ...") + #Cardinality constraint AtMostK to capture a j_th class + + if(self.num_class == 2): + rhs = math.floor(num_tree / 2) + 1 + if(self.cmaj==1 and not num_tree%2): + rhs = math.floor(num_tree / 2) + lhs = [ctvars[k][1 - self.cmaj] for k in range(num_tree)] + atls = CardEnc.atleast(lits = lhs, bound = rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) + self.cnf.extend(atls) + else: + zvars = [] + zvars.append([self.newVar('z_0_{0}'.format(k)) for k in range (num_tree) ]) + zvars.append([self.newVar('z_1_{0}'.format(k)) for k in range (num_tree) ]) + ## + rhs = num_tree + lhs0 = zvars[0] + [ - ctvars[k][self.cmaj] for k in range(num_tree)] + ##self.printLits(lhs0) + atls = CardEnc.atleast(lits = lhs0, bound = rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) + self.cnf.extend(atls) + ## + #rhs = num_tree - 1 + rhs = num_tree + 1 + ########### + lhs1 = zvars[1] + [ - ctvars[k][self.cmaj] for k in range(num_tree)] + ##self.printLits(lhs1) + atls = CardEnc.atleast(lits = lhs1, bound = rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) + self.cnf.extend(atls) + # + pvars = [self.newVar('p_{0}'.format(k)) for k in range(self.num_class + 1)] + ##self.printLits(pvars) + for k,p in enumerate(pvars): + for i in range(num_tree): + if k == 0: + z = zvars[0][i] + #self.cnf.append([-p, -z, vtaut]) + self.cnf.append([-p, z, -vtaut]) + #self.printLits([-p, z, -vtaut]) + #print() + elif k == self.cmaj+1: + z = zvars[1][i] + self.cnf.append([-p, z, -vtaut]) + + #self.printLits([-p, z, -vtaut]) + #print() + + else: + z = zvars[0][i] if (k<self.cmaj+1) else zvars[1][i] + self.cnf.append([-p, -z, ctvars[i][k-1] ]) + self.cnf.append([-p, z, -ctvars[i][k-1] ]) + + #self.printLits([-p, -z, ctvars[i][k-1] ]) + #self.printLits([-p, z, -ctvars[i][k-1] ]) + #print() + + # + self.cnf.append([-pvars[0], -pvars[self.cmaj+1]]) + ## + lhs1 = pvars[:(self.cmaj+1)] + ##self.printLits(lhs1) + eqls = CardEnc.equals(lits = lhs1, bound = 1, vpool=self.vpool, encoding=EncType.cardnetwrk) + self.cnf.extend(eqls) + + + lhs2 = pvars[(self.cmaj + 1):] + ##self.printLits(lhs2) + eqls = CardEnc.equals(lits = lhs2, bound = 1, 
vpool=self.vpool, encoding=EncType.cardnetwrk) + self.cnf.extend(eqls) + + + + ##print("exactly-one feat const ...") + # enforce exactly one of the feature values to be chosen + # (for categorical features) + categories = collections.defaultdict(lambda: []) + for f in self.extended_feature_names: + if '_' in f: + categories[f.split('_')[0]].append(self.newVar(f)) + for c, feats in six.iteritems(categories): + # exactly-one feat is True + self.cnf.append(feats) + card = CardEnc.atmost(lits=feats, vpool=self.vpool, encoding=EncType.cardnetwrk) + self.cnf.extend(card.clauses) + # lits of intervals + for f, intvs in six.iteritems(self.ivars): + if not len(intvs): + continue + self.cnf.append(intvs) + card = CardEnc.atmost(lits=intvs, vpool=self.vpool, encoding=EncType.cardnetwrk) + self.cnf.extend(card.clauses) + #self.printLits(intvs) + + + + for f, threshold in six.iteritems(self.thvars): + for j, thvar in enumerate(threshold): + d = j+1 + pos, neg = self.ivars[f][d:], self.ivars[f][:d] + + if j == 0: + assert(len(neg) == 1) + self.cnf.append([thvar, neg[-1]]) + self.cnf.append([-thvar, -neg[-1]]) + else: + self.cnf.append([thvar, neg[-1], -threshold[j-1]]) + self.cnf.append([-thvar, threshold[j-1]]) + self.cnf.append([-thvar, -neg[-1]]) + + if j == len(threshold) - 1: + assert(len(pos) == 1) + self.cnf.append([-thvar, pos[0]]) + self.cnf.append([thvar, -pos[0]]) + else: + self.cnf.append([-thvar, pos[0], threshold[j+1]]) + self.cnf.append([thvar, -pos[0]]) + self.cnf.append([thvar, -threshold[j+1]]) + + + + return self.cnf, self.intvs, self.imaps, self.ivars + + +# +#============================================================================== +class SATExplainer(object): + """ + An SAT-inspired minimal explanation extractor for Random Forest models. + """ + + def __init__(self, sat_enc, inps, preamble, target_name, verb=1): + """ + Constructor. + """ + + self.enc = sat_enc + self.inps = inps # input (feature value) variables + self.target_name = target_name + self.preamble = preamble + + self.verbose = verb + + self.slv = None + ##self.slv = Solver(name=options.solver) + ##self.slv = Solver(name="minisat22") + #self.slv = Solver(name="glucose3") + # CNF formula + #self.slv.append_formula(self.enc.cnf) + + + def explain(self, sample, smallest=False): + """ + Hypotheses minimization. 
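+
+        Builds one selector variable per feature, then runs a
+        deletion-based linear search over these hypotheses to extract
+        a subset-minimal PI-explanation, returned as a sorted list of
+        feature ids.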
+ """ + if self.verbose: + print(' explaining: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.target_name[self.enc.cmaj])) + + #create a SAT solver + self.slv = Solver(name="glucose3") + + self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime + + # adapt the solver to deal with the current sample + #self.csel = [] + self.assums = [] # var selectors to be used as assumptions + self.sel2fid = {} # selectors to original feature ids + self.sel2vid = {} # selectors to categorical feature ids + self.sel2v = {} # selectors to (categorical/interval) values + + #for i in range(self.enc.num_class): + # self.csel.append(self.enc.newVar('class{0}'.format(i))) + #self.csel = self.enc.newVar('class{0}'.format(self.enc.cmaj)) + + # preparing the selectors + for i, (inp, val) in enumerate(zip(self.inps, sample), 1): + if '_' in inp: + + assert (inp not in self.enc.intvs) + + feat = inp.split('_')[0] + selv = self.enc.newVar('selv_{0}'.format(feat)) + + self.assums.append(selv) + if selv not in self.sel2fid: + self.sel2fid[selv] = int(feat[1:]) + self.sel2vid[selv] = [i - 1] + else: + self.sel2vid[selv].append(i - 1) + + p = self.enc.newVar(inp) + if not val: + p = -p + else: + self.sel2v[selv] = p + + self.enc.cnf.append([-selv, p]) + + #self.enc.printLits([-selv, p]) + + elif len(self.enc.intvs[inp]): + #v = None + #for intv in self.enc.intvs[inp]: + # if intv > val: + # v = intv + # break + v = next((intv for intv in self.enc.intvs[inp] if intv > val), None) + assert(v is not None) + + selv = self.enc.newVar('selv_{0}'.format(inp)) + self.assums.append(selv) + + assert (selv not in self.sel2fid) + self.sel2fid[selv] = int(inp[1:]) + self.sel2vid[selv] = [i - 1] + + for j,p in enumerate(self.enc.ivars[inp]): + cl = [-selv] + if j == self.enc.imaps[inp][v]: + cl += [p] + self.sel2v[selv] = p + else: + cl += [-p] + + self.enc.cnf.append(cl) + #self.enc.printLits(cl) + ''' + with open("/tmp/pendigits.cnf", 'w') as fp: + fp.write('p cnf {0} {1}\n'.format(self.enc.cnf.nv, len(self.enc.cnf.clauses))) + for p in self.assums + [-self.csel]: + fp.write('{0} 0\n'.format(str(p))) + + for cl in self.enc.cnf.clauses: + fp.write(' '.join([str(p) for p in cl+[0]])) + fp.write('\n') + fp.close() + print(self.assums + [self.csel]) + ''' + + self.assums = sorted(set(self.assums)) + if self.verbose: + print(' # hypos:', len(self.assums)) + + # pass a CNF formula + self.slv.append_formula(self.enc.cnf) + + ''' + # if unsat, then the observation is not implied by the assumptions + if not self.slv.solve(assumptions=self.assums+[self.csel]): + print(' no implication!') + print(self.slv.get_core()) + sys.exit(1) + + if self.verbose > 1: + self.enc.printLits(self.assums+[self.csel]) + self.print_sat_model() + ''' + + if not smallest: + self.compute_minimal() + else: + raise NotImplementedError('Smallest explanation is not yet implemented.') + #self.compute_smallest() + + self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.time + + expl = sorted([self.sel2fid[h] for h in self.assums if h>0 ]) + assert len(expl), 'PI-explanation cannot be an empty-set! 
otherwise the RF predicts only one class' + + # delete sat solver + self.slv.delete() + self.slv = None + + if self.verbose: + print("expl-selctors: ", expl) + self.preamble = [self.preamble[i] for i in expl] + print(' explanation: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.target_name[self.enc.cmaj])) + print(' # hypos left:', len(expl)) + print(' time: {0:.3f}'.format(self.time)) + + return expl + + def compute_minimal(self): + """ + Compute any subset-minimal explanation. + """ + nsat, nunsat = 0, 0 + stimes, utimes = [], [] + + vtaut = self.enc.newVar('Tautology') + + # simple deletion-based linear search + for i, p in enumerate(self.assums): + to_test = [vtaut] + self.assums[:i] + self.assums[(i + 1):] + [-p, -self.sel2v[p]] + + t0 = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime + + sat = self.slv.solve(assumptions=to_test) + + if not sat: + self.assums[i] = -p + elif self.verbose > 1: + self.print_sat_model() + + t = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ + resource.getrusage(resource.RUSAGE_SELF).ru_utime - t0 + #print("{0} {1:.2f}s".format("SAT" if sat else "UNSAT", t)) + + if sat: + nsat += 1 + stimes.append(t) + if self.verbose > 1: + self.enc.printLits(to_test) + print("SAT") + else: + #print("Core: ",self.slv.get_core()) + nunsat += 1 + utimes.append(t) + if self.verbose > 1: + self.enc.printLits(to_test) + print("UNSAT") + if self.verbose: + print('') + print('#SAT: {0} | #UNSAT: {1}'.format(len(stimes), len(utimes))) + if(nsat): + print('SAT: tot: {0:.2f} | m: {1:.2f} | M: {2:.2f} | avg: {3:.2f}'.format( + sum(stimes), min(stimes), max(stimes), sum(stimes) / len(stimes))) + if(nunsat): + print('UNSAT: tot: {0:.2f} | m: {1:.2f} | M: {2:.2f} | avg: {3:.2f}'.format( + sum(utimes), min(utimes), max(utimes), sum(utimes) / len(utimes))) + print('') + + self.stimes, self.utimes = stimes, utimes + self.nsat, self.nunsat = nsat, nunsat + + + def print_sat_model(self): + assert(self.slv.get_model()) + model = [ p for p in self.slv.get_model() if self.enc.vpool.obj(abs(p)) ] + str_model = [] + lits = [] + for p in model: + if self.enc.vpool.obj(abs(p)) in self.inps: + str_model.append((p, self.enc.vpool.obj(abs(p)))) + + elif ("class" in self.enc.vpool.obj(abs(p))): + str_model.append((p, self.enc.vpool.obj(abs(p)))) + + #elif ("intv" in self.enc.vpool.obj(abs(p))) : + # str_model.append((p, self.enc.vpool.obj(abs(p)))) + + if ("_tr" in self.enc.vpool.obj(abs(p))) : + lits.append(p) + + if ("p_" in self.enc.vpool.obj(abs(p))) : + str_model.append((p, self.enc.vpool.obj(abs(p)))) + if ("z_" in self.enc.vpool.obj(abs(p))) : + str_model.append((p, self.enc.vpool.obj(abs(p)))) + + print("Model:", str_model) + ###print(self.slv.get_model()) + + num_tree = len(self.enc.forest.trees) + num_class = self.enc.num_class + occ = [0]*num_class + + for p in lits: + if p > 0: + j = int(self.enc.vpool.obj(abs(p))[5]) + occ[j] +=1 + print(occ) + diff --git a/pages/application/RandomForest/utils/xrf/tree.py b/pages/application/RandomForest/utils/xrf/tree.py new file mode 100644 index 0000000000000000000000000000000000000000..da81c9820d69d96061446e9d1eafbcb265bf1351 --- /dev/null +++ b/pages/application/RandomForest/utils/xrf/tree.py @@ -0,0 +1,174 @@ +# +#============================================================================== +from anytree import Node, RenderTree,AsciiStyle +import json +import numpy as np +import math +import os + + +# 
+#============================================================================== +class dt_node(Node): + def __init__(self, id, parent = None): + Node.__init__(self, id, parent) + self.id = id # The node value + self.name = None + self.left_node_id = -1 # Left child + self.right_node_id = -1 # Right child + + self.feature = -1 + self.threshold = None + self.values = -1 + #iai + #self.split = None + + def __str__(self): + pref = ' ' * self.depth + if len(self.children) == 0: + return (pref+ "leaf: {} {}".format(self.id, self.values)) + else: + if(self.name is None): + return (pref+ "{} f{}<{}".format(self.id, self.feature, self.threshold)) + else: + return (pref+ "{} \"{}\"<{}".format(self.id, self.name, self.threshold)) + + +#============================================================================== +def build_tree(tree_, feature_names = None): + ## + feature = tree_.feature + threshold = tree_.threshold + values = tree_.value + n_nodes = tree_.node_count + children_left = tree_.children_left + children_right = tree_.children_right + node_depth = np.zeros(shape=n_nodes, dtype=np.int64) + is_leaf = np.zeros(shape=n_nodes, dtype=bool) + stack = [(0, -1)] # seed is the root node id and its parent depth + while len(stack) > 0: + node_id, parent_depth = stack.pop() + node_depth[node_id] = parent_depth + 1 + + # If we have a test node + if (children_left[node_id] != children_right[node_id]): + stack.append((children_left[node_id], parent_depth + 1)) + stack.append((children_right[node_id], parent_depth + 1)) + else: + is_leaf[node_id] = True + ## + + m = tree_.node_count + assert (m > 0), "Empty tree" + + def extract_data(idx, root = None, feature_names = None): + i = idx + assert (i < m), "Error index node" + if (root is None): + node = dt_node(i) + else: + node = dt_node(i, parent = root) + #node.cover = json_node["cover"] + if is_leaf[i]: + node.values = np.argmax(values[i]) + #if(inverse): + # node.values = -node.values + else: + node.feature = feature[i] + if (feature_names is not None): + node.name = feature_names[feature[i]] + node.threshold = threshold[i] + node.left_node_id = children_left[i] + node.right_node_id = children_right[i] + extract_data(node.left_node_id, node, feature_names) #feat < threshold ( < 0.5 False) + extract_data(node.right_node_id, node, feature_names) #feat >= threshold ( >= 0.5 True) + + return node + + root = extract_data(0, None, feature_names) + + return root + + +#============================================================================== +def walk_tree(node): + if (len(node.children) == 0): + # leaf + print(node) + else: + print(node) + walk_tree(node.children[0]) + walk_tree(node.children[1]) + +def count_nodes(root): + def count(node): + if len(node.children): + return sum([1+count(n) for n in node.children]) + else: + return 0 + m = count(root) + 1 + return m + +# +#============================================================================== +def predict_tree(node, sample): + if (len(node.children) == 0): + # leaf + return node.values + else: + feature_branch = node.feature + sample_value = sample[feature_branch] + assert(sample_value is not None) + if(sample_value < node.threshold): + return predict_tree(node.children[0], sample) + else: + return predict_tree(node.children[1], sample) + + +# +#============================================================================== +class Forest: + """ An ensemble of decision trees. + + This object provides a common interface to many different types of models. 
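+
+    Unlike the archived version, it also records the total number of
+    nodes (sz) and the maximal depth (md) of the ensemble, and offers
+    predict_inst() for a single sample.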
+ """ + def __init__(self, rf, feature_names = None): + #self.rf = rf + self.trees = [ build_tree(dt.tree_, feature_names) for dt in rf.estimators()] + self.sz = sum([dt.tree_.node_count for dt in rf.estimators()]) + self.md = max([dt.tree_.max_depth for dt in rf.estimators()]) + #### + nb_nodes = [dt.tree_.node_count for dt in rf.estimators()] + print("min: {0} | max: {1}".format(min(nb_nodes), max(nb_nodes))) + assert([dt.tree_.node_count for dt in rf.estimators()] == [count_nodes(dt) for dt in self.trees]) + #self.print_trees() + + def print_trees(self): + for i,t in enumerate(self.trees): + print("tree number: ", i) + walk_tree(t) + + def predict_inst(self, inst): + scores = [predict_tree(dt, inst) for dt in self.trees] + scores = np.asarray(scores) + maj = np.argmax(np.bincount(scores)) + return maj + + + def predict(self, samples): + predictions = [] + print("#Trees: ", len(self.trees)) + for sample in np.asarray(samples): + scores = [] + for i,t in enumerate(self.trees): + s = predict_tree(t, sample) + scores.append((s)) + scores = np.asarray(scores) + predictions.append(scores) + predictions = np.asarray(predictions) + #print(predictions) + #np.bincount(x, weights=self._weights_not_none) + maj = np.apply_along_axis(lambda x: np.argmax(np.bincount(x)), axis=1, arr=predictions) + + return maj + diff --git a/pages/application/application.py b/pages/application/application.py index 29e01f9bc87e6f9d3c57e82ebb2547b17ca70a17..65475b9e6b50c7213810ddf1e10a0dae6e9ed148 100644 --- a/pages/application/application.py +++ b/pages/application/application.py @@ -4,7 +4,10 @@ import dash_daq as daq from pages.application.DecisionTree.DecisionTreeComponent import DecisionTreeComponent from pages.application.NaiveBayes.NaiveBayesComponent import NaiveBayesComponent -import subprocess +from pages.application.RandomForest.RandomForestComponent import RandomForestComponent + +import subprocess + class Application(): def __init__(self, view): @@ -14,9 +17,10 @@ class Application(): class Model(): - def __init__(self, names_models, dict_components): - + def __init__(self, names_models, dict_components, dic_solvers, dic_xtypes): self.dict_components = dict_components + self.dic_solvers = dic_solvers + self.dic_xtypes = dic_xtypes self.ml_models = names_models self.ml_model = '' @@ -26,17 +30,21 @@ class Model(): self.add_info = False self.model_info = '' - self.enum=1 - self.xtype = ['AXp', 'CXp'] - self.solver="g3" + self.enum = 1 + + self.xtypes = [] + self.xtype = "" + + self.solvers = [] + self.solver = "" self.instance = '' self.list_expls = [] self.list_cont_expls = [] - self.expl='' - self.cont_expl='' + self.expl = '' + self.cont_expl = '' self.component_class = '' self.component = '' @@ -44,7 +52,9 @@ class Model(): def update_ml_model(self, ml_model_update): self.ml_model = ml_model_update self.component_class = self.dict_components[self.ml_model] - self.component_class = globals()[self.component_class] + self.component_class = globals()[self.component_class] + self.solvers = self.dic_solvers[self.ml_model] + self.xtypes = self.dic_xtypes[self.ml_model] def update_pretrained_model(self, pretrained_model_update): self.pretrained_model = pretrained_model_update @@ -57,24 +67,29 @@ class Model(): def update_pretrained_model_layout_with_info(self, model_info, model_info_filename): self.model_info = model_info - self.component = self.component_class(self.pretrained_model, info=self.model_info, type_info=model_info_filename) + self.component = self.component_class(self.pretrained_model, 
info=self.model_info, + type_info=model_info_filename) def update_instance(self, instance): self.instance = instance - self.list_expls, self.list_cont_expls = self.component.update_with_explicability(self.instance, self.enum, self.xtype, self.solver) - + self.list_expls, self.list_cont_expls = self.component.update_with_explicability(self.instance, self.enum, + self.xtype, self.solver) + def update_enum(self, enum): self.enum = enum - self.list_expls, self.list_cont_expls = self.component.update_with_explicability(self.instance, self.enum, self.xtype, self.solver) - + self.list_expls, self.list_cont_expls = self.component.update_with_explicability(self.instance, self.enum, + self.xtype, self.solver) + def update_xtype(self, xtype): self.xtype = xtype - self.list_expls, self.list_cont_expls = self.component.update_with_explicability(self.instance, self.enum, self.xtype, self.solver) + self.list_expls, self.list_cont_expls = self.component.update_with_explicability(self.instance, self.enum, + self.xtype, self.solver) def update_solver(self, solver): self.solver = solver - self.list_expls, self.list_cont_expls = self.component.update_with_explicability(self.instance, self.enum, self.xtype, self.solver) - + self.list_expls, self.list_cont_expls = self.component.update_with_explicability(self.instance, self.enum, + self.xtype, self.solver) + def update_expl(self, expl): self.expl = expl self.component.draw_explanation(self.instance, expl) @@ -83,123 +98,125 @@ class Model(): self.expl = cont_expl self.component.draw_contrastive_explanation(self.instance, cont_expl) + class View(): def __init__(self, model): self.model = model - self.ml_menu_models = html.Div([ - html.Br(), - html.Label("Choose the Machine Learning algorithm :"), - html.Br(), - dcc.Dropdown(self.model.ml_models, - id='ml_model_choice', - className="dropdown")]) + self.ml_menu_models = html.Div([ + html.Br(), + html.Label("Choose the Machine Learning algorithm :"), + html.Br(), + dcc.Dropdown(self.model.ml_models, + id='ml_model_choice', + className="dropdown")]) self.pretrained_model_upload = html.Div([ - html.Hr(), - html.Label("Choose the pretrained model : "), - html.Br(), - dcc.Upload( - id='ml_pretrained_model_choice', - children=html.Div([ - 'Drag and Drop or ', - html.A('Select File') - ]), - className="upload" - ), - html.Div(id='pretrained_model_filename')]) + html.Hr(), + html.Label("Choose the pretrained model : "), + html.Br(), + dcc.Upload( + id='ml_pretrained_model_choice', + children=html.Div([ + 'Drag and Drop or ', + html.A('Select File') + ]), + className="upload" + ), + html.Div(id='pretrained_model_filename')]) self.add_model_info_choice = html.Div([ - html.Hr(), - html.Label("Do you wish to upload more info for your model ? : "), - html.Br(), - daq.BooleanSwitch(id='add_info_model_choice', on=False, color="#000000",)]) + html.Hr(), + html.Label("Do you wish to upload more info for your model ? 
: "), + html.Br(), + daq.BooleanSwitch(id='add_info_model_choice', on=False, color="#000000", )]) self.model_info = html.Div(id="choice_info_div", - hidden=True, - children=[ - html.Hr(), - html.Label("Choose the pretrained model dataset (csv) or feature definition file (txt): "), - html.Br(), - dcc.Upload( - id='model_info_choice', - children=html.Div([ - 'Drag and Drop or ', - html.A('Select File') - ]), - className="upload" - ), - html.Div(id='info_filename')]) + hidden=True, + children=[ + html.Hr(), + html.Label( + "Choose the pretrained model dataset (csv) or feature definition file (txt): "), + html.Br(), + dcc.Upload( + id='model_info_choice', + children=html.Div([ + 'Drag and Drop or ', + html.A('Select File') + ]), + className="upload" + ), + html.Div(id='info_filename')]) self.instance_upload = html.Div([ - html.Hr(), - html.Label("Choose the instance to explain : "), - html.Br(), - dcc.Upload( - id='ml_instance_choice', - children=html.Div([ - 'Drag and Drop or ', - html.A('Select instance') - ]), - className="upload" - ), - html.Div(id='instance_filename')]) + html.Hr(), + html.Label("Choose the instance to explain : "), + html.Br(), + dcc.Upload( + id='ml_instance_choice', + children=html.Div([ + 'Drag and Drop or ', + html.A('Select instance') + ]), + className="upload" + ), + html.Div(id='instance_filename')]) self.num_explanation = html.Div([ - html.Label("Choose the number of explanations : "), - html.Br(), - dcc.Input( - id="number_explanations", - value=1, - type="number", - placeholder="How many explanations ?", - className="dropdown"), - html.Hr()]) + html.Label("Choose the number of explanations : "), + html.Br(), + dcc.Input( + id="number_explanations", + value=1, + type="number", + placeholder="How many explanations ?", + className="dropdown"), + html.Hr()]) self.type_explanation = html.Div([ - html.Label("Choose the kind of explanation : "), - html.Br(), - dcc.Checklist( - id="explanation_type", - options={'AXp' : "Abductive Explanation", 'CXp': "Contrastive explanation"}, - value = ['AXp', 'CXp'], - className="check-boxes", - inline=True), - html.Hr()]) - - self.solver = html.Div([ html.Label("Choose the SAT solver : "), - html.Br(), - dcc.Dropdown(['g3', 'g4', 'lgl', 'mcb', 'mcm', 'mpl', 'm22', 'mc', 'mgh'], 'g3', id='solver_sat') ]) - + html.Label("Choose the kind of explanation : "), + html.Br(), + dcc.Checklist( + id="explanation_type", + options=self.model.xtypes, + className="check-boxes", + inline=True), + html.Hr()]) + + self.solver = html.Div([html.Label("Choose the SAT solver : "), + html.Br(), + dcc.Dropdown(self.model.solvers, + id='solver_sat')]) + self.sidebar = dcc.Tabs(children=[ - dcc.Tab(label='Basic Parameters', children = [ - self.ml_menu_models, - self.pretrained_model_upload, - self.add_model_info_choice, - self.model_info, - self.instance_upload], className="sidebar"), - dcc.Tab(label='Advanced Parameters', children = [ - html.Br(), - self.num_explanation, - self.type_explanation, - self.solver - ], className="sidebar")]) - - - self.expl_choice = html.Div(id = "interaction_graph", hidden=True, - children=[html.H5("Navigate through the explanations and plot them on the tree : "), - html.Div(children = [dcc.Dropdown(self.model.list_expls, - id='expl_choice', - className="dropdown")]), - html.H5("Navigate through the contrastive explanations and plot them on the tree : "), - html.Div(children = [dcc.Dropdown(self.model.list_cont_expls, - id='cont_expl_choice', - className="dropdown")])]) - - self.layout = dbc.Row([ dbc.Col([self.sidebar], - 
width=3, class_name="sidebar"), - dbc.Col([dbc.Row(id = "graph", children=[]), - dbc.Row(self.expl_choice)], - width=5, class_name="column_graph"), - dbc.Col(html.Main(id = "explanation", children=[], hidden=True), width=4)]) \ No newline at end of file + dcc.Tab(label='Basic Parameters', children=[ + self.ml_menu_models, + self.pretrained_model_upload, + self.add_model_info_choice, + self.model_info, + self.instance_upload], className="sidebar"), + dcc.Tab(label='Advanced Parameters', children=[ + html.Br(), + self.num_explanation, + self.type_explanation, + self.solver + ], className="sidebar")]) + + self.expl_choice = html.Div(id="interaction_graph", hidden=True, + children=[html.H5("Navigate through the explanations and plot them on the tree : "), + html.Div(children=[dcc.Dropdown(self.model.list_expls, + id='expl_choice', + className="dropdown")]), + html.H5( + "Navigate through the contrastive explanations and plot them on the tree : "), + html.Div(children=[dcc.Dropdown(self.model.list_cont_expls, + id='cont_expl_choice', + className="dropdown")])]) + + self.layout = dbc.Row([dbc.Col([self.sidebar], + width=3, class_name="sidebar"), + dbc.Col([dbc.Row(id="graph", children=[]), + dbc.Row(self.expl_choice)], + width=5, class_name="column_graph"), + dbc.Col(html.Main(id="explanation", children=[], hidden=True), width=4)]) diff --git a/requirements.txt b/requirements.txt index 4f70ae69cdd7f78304ed576c0745589b7d79e127..8f12a33646aa22ec091e89844763ad78408bc265 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,6 +11,14 @@ scipy>=1.2.1 dash_bootstrap_components dash_interactive_graphviz python-sat[pblib,aiger] -pygraphviz==1.9 +pygraphviz anytree==2.8.0 -dash_daq==0.5.0 \ No newline at end of file +dash_daq==0.5.0 +matplotlib +six +xgboost +lime +shap +anchor-exp +pysmt +anytree diff --git a/utils.py b/utils.py index d9996f5443cd611798e68d37beb3f73d70aa4cc3..9fd8a5498e8323b33ae9d840ddb75025e86d226a 100644 --- a/utils.py +++ b/utils.py @@ -1,19 +1,32 @@ import base64 import io -import pickle -import joblib import json -import numpy as np +import joblib +import pickle from dash import html +from pages.application.RandomForest.utils import xrf +from pages.application.RandomForest.utils.xrf import * +sys.modules['xrf'] = xrf +from pages.application.RandomForest.utils import options +from pages.application.RandomForest.utils.options import * +sys.modules['options'] = options def parse_contents_graph(contents, filename): + content_type, content_string = contents.split(',') decoded = base64.b64decode(content_string) - try: - if '.pkl' in filename: + try: + if 'mod.pkl' in filename: + print("in") + print(io.BytesIO(decoded)) + print(pickle.load(io.BytesIO(decoded))) + data = pickle.load(io.BytesIO(decoded)) + elif '.pkl' in filename: data = joblib.load(io.BytesIO(decoded)) + elif '.txt' in filename: + data = decoded.decode('utf-8').strip() except Exception as e: print(e) return html.Div([ @@ -22,10 +35,11 @@ def parse_contents_graph(contents, filename): return data + def parse_contents_data(contents, filename): content_type, content_string = contents.split(',') decoded = base64.b64decode(content_string) - try: + try: if '.csv' in filename: data = decoded.decode('utf-8').strip() if '.txt' in filename: @@ -38,6 +52,7 @@ def parse_contents_data(contents, filename): return data + def parse_contents_instance(contents, filename): content_type, content_string = contents.split(',') decoded = base64.b64decode(content_string) @@ -49,7 +64,7 @@ def parse_contents_instance(contents, filename): elif 
'.txt' in filename: data = decoded.decode('utf-8') data = str(data).strip().split(',') - data = list(map(lambda i: tuple([i[0], np.float32(i[1])]), [i.split('=') for i in data])) + data = list(map(lambda i: tuple([i[0], np.float32(i[1])]), [i.split('=') for i in data])) elif '.json' in filename: data = decoded.decode('utf-8').strip() data = json.loads(data) @@ -68,11 +83,15 @@ def parse_contents_instance(contents, filename): def extract_data(data): - names_models = [data[i]['ml_type'] for i in range (len(data))] + names_models = [data[i]['ml_type'] for i in range(len(data))] dict_components = {} - - for i in range (len(data)) : + dic_solvers = {} + dic_xtypes = {} + + for i in range(len(data)): ml_type = data[i]['ml_type'] dict_components[ml_type] = data[i]['component'] + dic_solvers[ml_type] = data[i]['solvers'] + dic_xtypes[ml_type] = data[i]['xtypes'] - return names_models, dict_components + return names_models, dict_components, dic_solvers, dic_xtypes