diff --git a/app.py b/app.py
index d4becfdf72a575dc49d33620792892869f08f070..60e0b07722af78050dfa9f5a514ce79b6a27bb60 100644
--- a/app.py
+++ b/app.py
@@ -8,6 +8,7 @@ from dash import dcc, html
 
 from callbacks import register_callbacks
 from pages.application.application import Application, Model, View
+
 from utils import extract_data
 
 app = dash.Dash(external_stylesheets=[dbc.themes.LUX],
                 suppress_callback_exceptions=True,
@@ -20,9 +21,26 @@
 models_data = open('data_retriever.json')
 data = json.load(models_data)["data"]
 
 # For home directory
-page_home = dbc.Row([html.H3("Welcome")])
+welcome_message = html.Div(html.Iframe(
+    src=app.get_asset_url("welcome.html"),
+    style={"height": "1067px", "width": "100%"},
+))
+page_home = dbc.Row([welcome_message])
+
 # For course directory
-page_course = dbc.Row([])
+course_data_format = html.Div(html.Iframe(
+    src=app.get_asset_url("course_data_format.html"),
+    style={"height": "1067px", "width": "100%"},
+))
+course_decision_tree = html.Iframe(
+    src=app.get_asset_url("course_decision_tree.html"),
+    style={"height": "1067px", "width": "100%"},
+)
+main_course = dcc.Tabs(children=[
+    dcc.Tab(label='Data format', children=[course_data_format]),
+    dcc.Tab(label='Course Decision Tree', children=[course_decision_tree])])
+page_course = dbc.Row([main_course])
+
 # For the application
 names_models, dict_components, dic_solvers, dic_xtypes = extract_data(data)
 model_application = Model(names_models, dict_components, dic_solvers, dic_xtypes)
diff --git a/assets/course_data_format.html b/assets/course_data_format.html
new file mode 100644
index 0000000000000000000000000000000000000000..686579221d16c611799f2d3d3de7034878947ae7
--- /dev/null
+++ b/assets/course_data_format.html
@@ -0,0 +1,38 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>What kind of model, data, or instance can I upload?</title>
+</head>
+<body>
+
+<h1> What kind of model, data, or instance can I upload? </h1>
+
+<h2> What is the option to add information on the model? </h2>
+
+<h3> Why? </h3>
+<p> There is a switch button you can use when you want to attach the CSV you trained your model on, or a feature mapping. This is useful when you didn't dump the feature names in your model's .pkl file but still want them to show up, or when the values are categorical.
+</p>
+<h3> How? </h3>
+
+<h2> What kind of model can I upload? </h2>
+
+<p> You can only import .pkl models.</p>
+
+<h2> What should the format of the instance be? </h2>
+
+<p> You can either upload a .txt file containing the instance in the format feature1=value1,feature2=value2,... where feature1, feature2, ... are the names of the columns.
+
+But you can also upload your instance as a JSON file.</p>
+
+<p> Attention! If the feature names appear nowhere in your model or in the additional information, the instance must use the format f1=...,f2=... precisely.</p>
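+
+<!-- Illustrative example added for clarity: the three feature names and binary values below are assumptions for demonstration, not fixed by the platform. -->
+<p> For example, for a model with three binary features and no stored feature names, a valid .txt instance could be the single line <code>f1=1,f2=0,f3=1</code>.</p>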
+
+<h1> What are the advanced parameters? </h1>
+
+<p> These are specific to the kind of model you selected: mainly how the explanation should be computed and which kind of explanation is produced.</p>
+
+</body>
+</html>
\ No newline at end of file
diff --git a/assets/course_decision_tree.html b/assets/course_decision_tree.html
new file mode 100644
index 0000000000000000000000000000000000000000..b07a373ce003b2127983fa81d3dfb93c5e81771c
--- /dev/null
+++ b/assets/course_decision_tree.html
@@ -0,0 +1,14 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>What library am I able to use on the platform?</title>
+</head>
+<body>
+
+<h1> What library am I able to use on the platform? </h1>
+
+<p> Only models from scikit-learn are allowed.</p>
+
+</body>
+</html>
\ No newline at end of file
diff --git a/assets/header.css b/assets/header.css
index 3a882eae1b7fdffd40dd37f87941a5508a4f29d1..9005062397d55f1264835cb8ef77e8544e217a36 100644
--- a/assets/header.css
+++ b/assets/header.css
@@ -1,3 +1,5 @@
+
+
 /* NAVBAR */
 
 .navbar-dark .navbar-brand {
diff --git a/assets/welcome.html b/assets/welcome.html
new file mode 100644
index 0000000000000000000000000000000000000000..ff972271b73ccd66ad0fc0f98fc19f7df8f1f1dc
--- /dev/null
+++ b/assets/welcome.html
@@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>Welcome</title>
+</head>
+<body>
+
+<h1> Welcome </h1>
+
+</body>
+</html>
\ No newline at end of file
diff --git a/callbacks.py b/callbacks.py
index 438d511ac3c1d1fee94043f75e0d4260a86f189b..18c46c5be617a681f43bb20e37006203dff82e5d 100644
--- a/callbacks.py
+++ b/callbacks.py
@@ -28,14 +28,16 @@ def register_callbacks(page_home, page_course, page_application, app):
         return active_link[0], active_link[1], active_link[2]
 
     @app.callback(Output('solver_sat', 'options'),
+                  Output('solver_sat', 'value'),
                   Output('explanation_type', 'options'),
+                  Output('explanation_type', 'value'),
                   Input('ml_model_choice', 'value'),
                   prevent_initial_call=True
                   )
     def update_ml_type_options(value_ml_model):
         model_application = page_application.model
         model_application.update_ml_model(value_ml_model)
-        return model_application.solvers, model_application.xtypes
+        return model_application.solvers, model_application.solvers[0], model_application.xtypes, [list(model_application.xtypes.keys())[0]]
 
     @app.callback(
         Output('pretrained_model_filename', 'children'),
diff --git a/data_retriever.json b/data_retriever.json
index 5ef4f56851cd35ba4dfc85f6aea5860c14b5d05e..014d9f766cdec29d7131b283c3ef64b07f60a0c7 100644
--- a/data_retriever.json
+++ b/data_retriever.json
@@ -9,20 +9,6 @@
     ],
     "xtypes" : {
         "AXp": "Abductive Explanation", "CXp": "Contrastive explanation"}
-    },
-    {
-        "ml_type" : "NaiveBayes",
-        "component" : "NaiveBayesComponent",
-        "solvers" : [],
-        "xtypes" : {
-            "AXp": "Abductive Explanation", "CXp": "Contrastive explanation"}
-
-    },
-    {
-        "ml_type" : "RandomForest",
-        "component" : "RandomForestComponent",
-        "solvers" : ["LIME", "ANCHOR", "SHAP"],
-        "xtypes" : {"H": "Heuristic", "HV": "Heuristic and validation", "G": "Global"}
     }
 ]
diff --git a/pages/application/DecisionTree/utils/dtree.py b/pages/application/DecisionTree/utils/dtree.py
index c2775dfac8d7adbaca3ff30e8033b92ea2570c26..b67d26db0165bdf0dba4ded598ad60d394e79373 100644
--- a/pages/application/DecisionTree/utils/dtree.py
+++ b/pages/application/DecisionTree/utils/dtree.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-#-*- coding:utf-8 -*-
+# -*- coding:utf-8 -*-
 ##
 ## dtree.py
 ##
@@ -9,7 +9,7 @@
 ##
 
 #
-#============================================================================== +# ============================================================================== from __future__ import print_function import collections from functools import reduce @@ -25,6 +25,7 @@ except ImportError: # for Python3 from sklearn.tree import _tree import numpy as np + class Node(): """ Node class. @@ -38,8 +39,9 @@ class Node(): self.feat = feat self.vals = vals + # -#============================================================================== +# ============================================================================== class DecisionTree(): """ Simple decision tree class. @@ -62,14 +64,14 @@ class DecisionTree(): self.feids = {} self.fdoms = {} self.fvmap = {} - self.feature_names = {f'f{i}' : feature_names[i] for i, f in enumerate(feature_names)} + self.feature_names = {f'f{i}': feature_names[i] for i, f in enumerate(feature_names)} # OHE mapping OHEMap = collections.namedtuple('OHEMap', ['dir', 'opp']) self.ohmap = OHEMap(dir={}, opp={}) if from_dt: - self.from_dt(from_dt) + self.from_dt(from_dt) if mapfile: self.parse_mapping(mapfile) @@ -103,7 +105,7 @@ class DecisionTree(): self.terms = {} for i in range(self.nof_terms): nd, _, t = lines[i + 4].strip().split() - self.terms[int(nd)] = t #int(t) + self.terms[int(nd)] = t # int(t) # finally, reading the nodes self.nodes = collections.defaultdict(lambda: Node(feat='', vals={})) @@ -132,7 +134,7 @@ class DecisionTree(): # simplifying the features and their domains self.feats = sorted(self.feats) - #self.feids = {f: i for i, f in enumerate(self.feats)} + # self.feids = {f: i for i, f in enumerate(self.feats)} self.fdoms = {f: sorted(self.fdoms[f]) for f in self.fdoms} # here we assume all features are present in the tree @@ -175,10 +177,10 @@ class DecisionTree(): for line in lines[1:]: feat, val, real = line.split() self.fvmap[tuple([feat, int(val)])] = '{0}{1}'.format(self.feature_names[feat], real) - #if feat not in self.feids: + # if feat not in self.feids: # self.feids[feat] = len(self.feids) - #assert len(self.feids) == self.nof_feats + # assert len(self.feids) == self.nof_feats def convert_to_multiedges(self): """ @@ -324,34 +326,38 @@ class DecisionTree(): # returning the set of sets with no duplicates return list(dict.fromkeys(sets)) - def explain(self, inst, enum=1, pathlits=False, xtype = ["AXp"], solver='g3', htype='sorted'): + def explain(self, inst, enum=1, pathlits=False, xtype=["AXp"], solver='g3', htype='sorted'): """ Compute a given number of explanations. 
""" - #contaiins all the elements for explanation + # contaiins all the elements for explanation explanation_dic = {} self.feids = {f'f{i}': i for i, f in enumerate(inst)} - inst = [(f'f{i}', int(inst[i][1])) for i,f in enumerate(inst)] + inst = [(f'f{i}', int(inst[i][1])) for i, f in enumerate(inst)] path, term, depth = self.execute(inst, pathlits) - #decision path - decision_path_str = 'IF {0} THEN class={1}'.format(' AND '.join([self.fvmap[inst[self.feids[self.nodes[n].feat]]] for n in path]), term) + # decision path + decision_path_str = 'IF {0} THEN class={1}'.format( + ' AND '.join([self.fvmap[inst[self.feids[self.nodes[n].feat]]] for n in path]), term) explanation_dic["Decision path of instance : "] = decision_path_str - explanation_dic["Decision path length : "] = 'Path length is :'+ str(depth) + explanation_dic["Decision path length : "] = 'Path length is :' + str(depth) if self.ohmap.dir: f2v = {fv[0]: fv[1] for fv in inst} # updating fvmap for printing ohe features for fo, fis in self.ohmap.dir.items(): - self.fvmap[tuple([fo, None])] = '(' + ' AND '.join([self.fvmap[tuple([fi, f2v[fi]])] for fi in fis]) + ')' + self.fvmap[tuple([fo, None])] = '(' + ' AND '.join( + [self.fvmap[tuple([fi, f2v[fi]])] for fi in fis]) + ')' # computing the sets to hit to_hit = self.prepare_sets(inst, term) - for type in xtype : + explanation_dic["List of path explanation(s)"] = [] + explanation_dic["List of path contrastive explanation(s)"] = [] + for type in xtype: if type == "AXp": explanation_dic.update(self.enumerate_abductive(to_hit, enum, solver, htype, term)) - else : + else: explanation_dic.update(self.enumerate_contrastive(to_hit, term)) return explanation_dic @@ -367,7 +373,8 @@ class DecisionTree(): expls = [] for i, expl in enumerate(hitman.enumerate(), 1): list_expls.append([self.fvmap[p] for p in sorted(expl, key=lambda p: p[0])]) - list_expls_str.append('Explanation: IF {0} THEN class={1}'.format(' AND '.join([self.fvmap[p] for p in sorted(expl, key=lambda p: p[0])]), term)) + list_expls_str.append('Explanation: IF {0} THEN class={1}'.format( + ' AND '.join([self.fvmap[p] for p in sorted(expl, key=lambda p: p[0])]), term)) expls.append(expl) if i == enum: @@ -375,9 +382,10 @@ class DecisionTree(): explanation["List of path explanation(s)"] = list_expls explanation["List of abductive explanation(s)"] = list_expls_str explanation["Number of abductive explanation(s) : "] = str(i) - explanation["Minimal abductive explanation : "] = str( min([len(e) for e in expls])) - explanation["Maximal abductive explanation : "] = str( max([len(e) for e in expls])) - explanation["Average abductive explanation : "] = '{0:.2f}'.format(sum([len(e) for e in expls]) / len(expls)) + explanation["Minimal abductive explanation : "] = str(min([len(e) for e in expls])) + explanation["Maximal abductive explanation : "] = str(max([len(e) for e in expls])) + explanation["Average abductive explanation : "] = '{0:.2f}'.format( + sum([len(e) for e in expls]) / len(expls)) return explanation @@ -385,6 +393,7 @@ class DecisionTree(): """ Enumerate contrastive explanations. 
""" + def process_set(done, target): for s in done: if s <= target: @@ -401,14 +410,15 @@ class DecisionTree(): list_expls_str = [] explanation = {} for expl in expls: - list_contrastive_expls.append([self.fvmap[(p[0],1-p[1])] for p in sorted(expl, key=lambda p: p[0])]) - list_expls_str.append('Contrastive: IF {0} THEN class!={1}'.format(' OR '.join(['!{0}'.format(self.fvmap[p]) for p in sorted(expl, key=lambda p: p[0])]), term)) + list_contrastive_expls.append([self.fvmap[(p[0], 1 - p[1])] for p in sorted(expl, key=lambda p: p[0])]) + list_expls_str.append('Contrastive: IF {0} THEN class!={1}'.format( + ' OR '.join(['!{0}'.format(self.fvmap[p]) for p in sorted(expl, key=lambda p: p[0])]), term)) explanation["List of path contrastive explanation(s)"] = list_contrastive_expls explanation["List of contrastive explanation(s)"] = list_expls_str - explanation["Number of contrastive explanation(s) : "]=str(len(expls)) - explanation["Minimal contrastive explanation : "]= str( min([len(e) for e in expls])) - explanation["Maximal contrastive explanation : "]= str( max([len(e) for e in expls])) - explanation["Average contrastive explanation : "]='{0:.2f}'.format(sum([len(e) for e in expls]) / len(expls)) + explanation["Number of contrastive explanation(s) : "] = str(len(expls)) + explanation["Minimal contrastive explanation : "] = str(min([len(e) for e in expls])) + explanation["Maximal contrastive explanation : "] = str(max([len(e) for e in expls])) + explanation["Average contrastive explanation : "] = '{0:.2f}'.format(sum([len(e) for e in expls]) / len(expls)) return explanation diff --git a/pages/application/NaiveBayes/NaiveBayesComponent.py b/pages/application/NaiveBayes/NaiveBayesComponent.py deleted file mode 100644 index 98c7848a12a2e33bc270c8d30a0caab823f10c05..0000000000000000000000000000000000000000 --- a/pages/application/NaiveBayes/NaiveBayesComponent.py +++ /dev/null @@ -1,57 +0,0 @@ -from os import path -import base64 - -import dash_bootstrap_components as dbc -import numpy as np -from dash import dcc, html -import subprocess -import shlex - - - -class NaiveBayesComponent(): - - def __init__(self, model, type_model='SKL', info=None, type_info=''): - - #Conversion model - p=subprocess.Popen(['perl','pages/application/NaiveBayes/utils/cnbc2xlc.pl', model],stdout=subprocess.PIPE) - print(p.stdout.read()) - - self.naive_bayes = model - self.map_file = "" - - self.network = html.Div([]) - self.explanation = [] - - - def update_with_explicability(self, instance, enum, xtype, solver) : - - # Call explanation - p=subprocess.Popen(['perl','pages/application/NaiveBayes/utils/xpxlc.pl', self.naive_bayes, instance, self.map_file],stdout=subprocess.PIPE) - print(p.stdout.read()) - - self.explanation = [] - list_explanations_path=[] - explanation = {} - - self.network = html.Div([]) - - #Creating a clean and nice text component - #instance plotting - self.explanation.append(html.H4("Instance : \n")) - self.explanation.append(html.P(str([str(instance[i]) for i in range (len(instance))]))) - for k in explanation.keys() : - if k != "List of path explanation(s)" and k!= "List of path contrastive explanation(s)" : - if k in ["List of abductive explanation(s)","List of contrastive explanation(s)"] : - self.explanation.append(html.H4(k)) - for expl in explanation[k] : - self.explanation.append(html.Hr()) - self.explanation.append(html.P(expl)) - self.explanation.append(html.Hr()) - else : - self.explanation.append(html.P(k + explanation[k])) - else : - list_explanations_path = explanation["List of path 
explanation(s)"] - list_contrastive_explanations_path = explanation["List of path contrastive explanation(s)"] - - return list_explanations_path, list_contrastive_explanations_path diff --git a/pages/application/NaiveBayes/utils/Parsers.pm b/pages/application/NaiveBayes/utils/Parsers.pm deleted file mode 100644 index 2fd493bc549790d254761c02222414a6a082eee2..0000000000000000000000000000000000000000 --- a/pages/application/NaiveBayes/utils/Parsers.pm +++ /dev/null @@ -1,319 +0,0 @@ -package Parsers; - -use strict; -use warnings; - -use Data::Dumper; - -use POSIX qw( !assert ); -use Exporter; - -require Utils; # Must use require, to get INC updated -import Utils qw( &get_progname &get_progpath ); - -BEGIN { - @Parsers::ISA = ('Exporter'); - @Parsers::EXPORT_OK = - qw( &parse_xlc &parse_cnbc &parse_xmap - &parse_instance &parse_explanations - &parse_blc &parse_acc ); -} - -use constant F_ERR_MSG => - "Please check file name, existence, permissions, etc.\n"; -use constant HLPMAP => 1; -use constant CCAT_CH => '_'; -use constant CCHK => 0; - -if (CCHK) { - ## Uncomment to use assertions && debug messages - #use Carp::Assert; # Assertions are on. -} - - -# Parse XLC format -sub parse_xlc() -{ - my ($opts, $xlc, $fname) = @_; - - open(my $fh, "<$fname") || die "Unable to open file $fname. " . F_ERR_MSG; - my ($nc, $nr, $rmode) = (0, 0, 0); - while(<$fh>) { - chomp; - next if m/^\s*c\s+$/; - if ($rmode == 0) { # Read number of features - m/^\s*(\d+)\s*$/ || die "Unable to match: $_\n"; - ($xlc->{NV}, $rmode) = ($1, 1); - } - elsif ($rmode == 1) { # Read w0 - m/^\s*(\-?\d+\.?\d*)\s*$/ || die "Unable to match: $_\n"; - ($xlc->{W0}, $rmode) = ($1, 2); - } - elsif ($rmode == 2) { # Read number of real-valued features - m/^\s*(\d+)\s*$/ || die "Unable to match: $_\n"; - ($xlc->{NReal}, $rmode) = ($1, 3); - if ($xlc->{NReal} == 0) { $rmode = 4; } - } - elsif ($rmode == 3) { # Read real-valued coefficients - m/^\s*(\-?\d+\.?\d*)\s*$/ || die "Unable to match: $_\n"; - push @{$xlc->{RVs}}, $1; - if (++$nr == $xlc->{NReal}) { ($nr, $rmode) = (0, 4); } - } - elsif ($rmode == 4) { # Read number of categorical features - m/^\s*(\d+)\s*$/ || die "Unable to match: $_\n"; - ($xlc->{NCat}, $rmode) = ($1, 5); - } - elsif ($rmode == 5) { # Read domains and weights of cat. features - my $cvi = "CVs$nc"; - @{$xlc->{$cvi}} = split(/ +/); - push @{$xlc->{CDs}}, shift @{$xlc->{$cvi}}; - if (++$nc == $xlc->{NCat}) { $rmode = 6; } - } - else { die "Invalid state with input: $_\n"; } - } - close($fh); -} - - -# Parse map file -sub parse_xmap() -{ - my ($opts, $xmap, $fname) = @_; - - open(my $fh, "<$fname") || die "Unable to open file $fname. " . 
F_ERR_MSG; - my ($cc, $nv, $nc, $nr, $rmode) = (0, 0, 0, 0, 0); - while(<$fh>) { - chomp; - next if m/^\s*c\s+$/; - if ($rmode == 0) { # Read number of classes - m/^\s*(\d+)\s*$/ || die "Unable to match: $_\n"; - ($xmap->{NC}, $rmode, $cc) = ($1, 1, 0); - if ($xmap->{NC} == 0) { $rmode = 2; } - } - elsif ($rmode == 1) { # Read class name maps - my @toks = split(/ +/); - my $cid = shift @toks; - ${$xmap->{ClMap}}[$cid] = join(CCAT_CH, @toks); - if (++$cc == $xmap->{NC}) { $rmode = 2; } - } - elsif ($rmode == 2) { # Read number of features - m/^\s*(\d+)\s*$/ || die "Unable to match \@ $rmode: $_\n"; - ($xmap->{NV}, $rmode) = ($1, 3); - } - elsif ($rmode == 3) { # Read number of real-valued features - m/^\s*(\d+)\s*$/ || die "Unable to match \@ $rmode: $_\n"; - ($xmap->{NReal}, $rmode, $nr) = ($1, 4, 0); - if ($xmap->{NReal} == 0) { $rmode = 5; } - } - elsif ($rmode == 4) { # Read map of real-value features - my @toks = split(/ +/); - my $rid = shift @toks; - ${$xmap->{VMap}}[$rid] = join(CCAT_CH, @toks); - if (++$nr == $xmap->{NReal}) { $rmode = 5; } - } - elsif ($rmode == 5) { # Read number of categorical features - m/^\s*(\d+)\s*$/ || die "Unable to match \@ $rmode: $_\n"; - ($xmap->{NCat}, $rmode, $nc) = ($1, 6, $nr); - } - elsif ($rmode == 6) { # Read categorical feature - my @toks = split(/ +/); - my $cid = shift @toks; - if (!HLPMAP) { - ${$xmap->{VMap}}[$cid] = join(CCAT_CH, @toks); } - else { - my ($sch, $ech, $jch) = ('', '', ''); - if ($#toks > 0) { ($sch, $ech, $jch) = ('\'', '\'', ' '); } - ${$xmap->{VMap}}[$cid] = $sch . join($jch, @toks) . $ech; - } - $rmode = 7; - if (CCHK) { assert($cid == $nc, "Invalid categorical ID"); } - } - elsif ($rmode == 7) { # Read domain size of current feature - m/^\s*(\d+)\s*$/ || die "Unable to match \@ $rmode: $_\n"; - ($xmap->{CDs}->{$nc}, $rmode, $nv) = ($1, 8, 0); - } - elsif ($rmode == 8) { # Read values of categorical feature - my @toks = split(/ +/); - my $vid = shift @toks; - if (!HLPMAP) { - ${$xmap->{CMap}->{$nc}}[$vid] = join(CCAT_CH, @toks); } - else { - my ($repl, $sch, $ech, $jch) = (0, '', '', ''); - for (my $i=0; $i<=$#toks; ++$i) { - if ($toks[$i] =~ m/${$xmap->{VMap}}[$nc]/) { - $toks[$i] =~ s/${$xmap->{VMap}}[$nc]/\?\?/g; - $repl = 1; - } - } - if ($#toks > 0 && !$repl) { ($sch,$ech,$jch)=('\'','\'',' '); } - ${$xmap->{CMap}->{$nc}}[$vid] = $sch . join($jch, @toks) . $ech; - } - if (++$nv == $xmap->{CDs}->{$nc}) { - if (++$nc == $xmap->{NReal}+$xmap->{NCat}) { $rmode = 9; } - else { $rmode = 6; } - } - } - else { die "Invalid state with input \@ $rmode: $_\n"; } - } - close($fh); -} - - -# Parse CNBC format -- currently hard-coded for 2 classes -sub parse_cnbc() -{ - my ($opts, $cnbc, $fname) = @_; - - open(my $fh, "<$fname") || die "Unable to open file $fname. " . 
F_ERR_MSG; - my ($cc, $cv, $pol, $rmode) = (0, 0, 0, 0); - while(<$fh>) { - chomp; - if ($rmode == 0) { # Read number of classes - m/^\s*(\d+)\s*$/ || die "Unable to match: $_\n"; - ($cnbc->{NC}, $rmode, $cc) = ($1, 1, 0); - } - elsif ($rmode == 1) { # Read priors - m/^\s*(\-?\d+\.?\d*)\s*$/ || die "Unable to match: $_\n"; - push @{$cnbc->{Prior}}, $1; - if (++$cc == $cnbc->{NC}) { $rmode = 2; } - } - elsif ($rmode == 2) { # Read number of features - m/^\s*(\d+)\s*$/ || die "Unable to match: $_\n"; - ($cnbc->{NV}, $cv, $rmode) = ($1, 0, 3); - } - elsif ($rmode == 3) { # Read domain size of feature - my $cpt = "CPT$cv"; - if ($cv == $cnbc->{NV}) { die "Too many features specified?\n"; } - m/^\s*(\d+)\s*$/ || die "Unable to match: $_\n"; - ($cnbc->{$cpt}->{D}, $cc, $rmode) = ($1, 0, 4); - } - elsif ($rmode == 4) { # Read CPT for feature - my $cpt = "CPT$cv"; - my $ccl = "C$cc"; - my @probs = split(/ +/); - if ($#probs+1 != $cnbc->{$cpt}->{D}) { die "Invalid CPT def\n"; } - for (my $i=0; $i<=$#probs; ++$i) { - $probs[$i] =~ m/(\-?\d+\.?\d*)/ || die "Unable to match: $_\n"; - push @{$cnbc->{$cpt}->{$ccl}}, $probs[$i]; - } - if (++$cc == $cnbc->{NC}) { - ($cv, $cc, $rmode) = ($cv+1, 0, 3); # Move to next feature - } - } else { die "Unexpected read mode in CNBC file\n"; } - } - close($fh); -} - - -# Parse BLC format -sub parse_blc() -{ - my ($opts, $blc, $fname) = @_; - open(my $fh, "<$fname") || die "Unable to open file $fname. " . F_ERR_MSG; - my ($rmode, $cnt) = (0, 0); - while(<$fh>) { - next if m/^\s*$/ || m/^c\s+/; - chomp; - if ($rmode == 0) { - m/\s*(\d+)\s*$/ || die "Unable to match: $_\n"; - ($blc->{NV}, $rmode) = ($1, 1); - } - elsif ($rmode == 1) { - if ($cnt == $blc->{NV}+1) { - die "Too many lines in BLC description??\n"; } - m/^\s*(\-?\d+\.?\d*)\s*$/ || die "Unable to match: $_\n"; - ${$blc->{Ws}}[$cnt++] = $1; - } - } - close($fh); -} - -# Parse ACC format -sub parse_acc() -{ - my ($opts, $acc, $fname) = @_; - - open(my $fh, "<$fname") || die "Unable to open file $fname. " . F_ERR_MSG; - my ($cc, $cv, $pol, $rmode) = (0, 0, 0, 0); - while(<$fh>) { - next if m/^\s*$/ || m/^c\s+/; - chomp; - if ($rmode == 0) { - m/\s*(\d)\s*$/ || die "Unable to match: $_\n"; - ($acc->{NC}, $rmode) = ($1, 1); - } - elsif ($rmode == 1) { - m/\s*(\d+)\s*$/ || die "Unable to match: $_\n"; - ($acc->{NV}, $rmode) = ($1, 2); - } - elsif ($rmode == 2) { - my $class = "C$cc"; - m/^\s*(\-?\d+\.?\d*)\s*$/ || die "Unable to match: $_\n"; - $acc->{VV}->{$class}->{W0} = $1; - $rmode = 3; - } - elsif ($rmode == 3) { - my $class = "C$cc"; - my $polarity = "P$pol"; - m/^\s*(\-?\d+\.?\d*)\s*$/ || die "Unable to match: $_\n"; - ${$acc->{VV}->{$class}->{$polarity}}[$cv] = $1; - $pol = 1 - $pol; - if ($pol == 0) { $cv++; } - if ($cv == $acc->{NV}) { - ($cc, $cv, $pol) = ($cc+1, 0, 0); - if ($cc == $acc->{NC}) { last; } - $rmode = 2; - } - } - } - close($fh); -} - - -# Parse instance format -sub parse_instance() -{ - my ($opts, $inst, $fname) = @_; - - open(my $fh, "<$fname") || die "Unable to open file $fname. " . 
F_ERR_MSG; - my ($cnt, $rmode) = (0, 0); - while(<$fh>) { - next if m/^\s*$/ || m/^c\s+/; - chomp; - if ($rmode == 0) { - m/\s*(\d+)\s*$/ || die "Unable to match: $_\n"; - ($inst->{NV}, $rmode) = ($1, 1); - } - elsif ($rmode == 1) { - m/\s*(\d+)\s*$/ || die "Unable to match: $_\n"; - ${$inst->{E}}[$cnt++] = $1; - if ($cnt == $inst->{NV}) { $rmode = 2; } - } - elsif ($rmode == 2) { - m/\s*(\d+)\s*$/ || die "Unable to match: $_\n"; - $inst->{C} = $1; - } - } - close($fh); -} - -# Parse explanations -sub parse_explanations() -{ - my ($fname, $xpl) = @_; - open(my $fh, "<$fname") || die "Unable to open file $fname. " . F_ERR_MSG; - while(<$fh>) { - next if m/^\s*$/ || m/^c\s+/; - chomp; - my @lits = split(/ +/); - shift @lits; # Drop 'Expl: ' - push @{$xpl->{Expl}}, \@lits; - } - close($fh); -} - - -END { -} - -1; # to ensure that the 'require' or 'use' succeeds diff --git a/pages/application/NaiveBayes/utils/Utils.pm b/pages/application/NaiveBayes/utils/Utils.pm deleted file mode 100644 index d694e44dec936f672410a803ea8bda1eaf08a93d..0000000000000000000000000000000000000000 --- a/pages/application/NaiveBayes/utils/Utils.pm +++ /dev/null @@ -1,114 +0,0 @@ -package Utils; - -use strict; -use warnings; - -use Data::Dumper; - -use POSIX; -use Exporter(); -use Sys::Hostname; - -BEGIN { - @Utils::ISA = ('Exporter'); - @Utils::EXPORT_OK = qw( &get_progname &get_progpath &round &SIG_handler ); -} - - -#------------------------------------------------------------------------------# -# Execution path handling -#------------------------------------------------------------------------------# - -sub get_progname() { - my @progname_toks = split(/\//, $0); - my $progname = $progname_toks[$#progname_toks]; - #print "$progname\n"; - return $progname; -} - -sub get_progpath() { - my @progname_toks = split(/\//, $0); - pop @progname_toks; - my $progpath = join('/', @progname_toks); - if ($progpath eq '') { $progpath = '\.\/'; } - #print "Prog Path: $progpath\n"; #exit; - return $progpath; -} - -sub get_hostname() { - my $full_host_name = &Sys::Hostname::hostname(); - $full_host_name =~ m/(\w+)\.?/; - my $rhname = $1; - #print "|$hostname|\n"; exit; - return $rhname; -} - -sub resolve_inc() { # Kept here as a template; need a copy in each script... - my ($cref, $pmname) = @_; - my @progname_toks = split(/\//, $0); - pop @progname_toks; - my $progpath = join('/', @progname_toks); - my $fullname = $progpath . '/' . $pmname; - my $fh; - open($fh, "<$fullname") || die "non-existing file: $pmname\n"; - return $fh; -} - - -#------------------------------------------------------------------------------# -# Signal handling utilities -#------------------------------------------------------------------------------# - -sub register_handlers() -{ - $SIG{'INT'} = 'Utils::INT_handler'; - $SIG{'TERM'} = 'Utils::INT_handler'; - $SIG{'ABRT'} = 'Utils::SIG_handler'; - $SIG{'SEGV'} = 'Utils::SIG_handler'; - $SIG{'BUS'} = 'Utils::SIG_handler'; - $SIG{'QUIT'} = 'Utils::SIG_handler'; - $SIG{'XCPU'} = 'Utils::SIG_handler'; -} - -my @args = (); -my @callback = (); - -sub push_arg() -{ - push @args, shift; -} - -sub push_callback() -{ - push @callback, shift; -} - -sub SIG_handler() -{ - &Utils::INT_handler(); -} - -sub INT_handler() -{ - # call any declared callbacks, e.g. to prints stats, summaries, etc. - print "\nReceived system signal. 
Cleaning up & terminating...\n"; - foreach my $cback (@callback) { - &{$cback}(\@args); - } - exit 20; # 20 denotes resources exceeded condition (see below) -} - - -#------------------------------------------------------------------------------# -# Useful utils -#------------------------------------------------------------------------------# - -sub round() { - my ($rval) = @_; - return int($rval + 0.5); -} - -END { -} - -1; # to ensure that the 'require' or 'use' succeeds diff --git a/pages/application/NaiveBayes/utils/Writers.pm b/pages/application/NaiveBayes/utils/Writers.pm deleted file mode 100644 index e281cec9d853630b4b6e7c8c5486f9d37974c693..0000000000000000000000000000000000000000 --- a/pages/application/NaiveBayes/utils/Writers.pm +++ /dev/null @@ -1,42 +0,0 @@ -package Writers; - -use strict; -use warnings; - -use Data::Dumper; - -use POSIX; -use Exporter; - -require Utils; # Must use require, to get INC updated -import Utils qw( &get_progname &get_progpath ); - -BEGIN { - @Writers::ISA = ('Exporter'); - @Writers::EXPORT_OK = qw( &write_xlc ); -} - - -# Export XLC format -sub write_xlc() -{ - my ($opts, $xlc) = @_; - print("$xlc->{NV}\n"); - print("$xlc->{W0}\n"); - print("$xlc->{NReal}\n"); - for (my $i=0; $i<$xlc->{NReal}; ++$i) { - print("${$xlc->{RVs}}[$i]\n"); - } - print("$xlc->{NCat}\n"); - for (my $i=0; $i<$xlc->{NCat}; ++$i) { - my $cvi = "CVs$i"; - print("${$xlc->{CDs}}[$i] "); - print("@{$xlc->{$cvi}}\n"); - } -} - - -END { -} - -1; # to ensure that the 'require' or 'use' succeeds diff --git a/pages/application/NaiveBayes/utils/cnbc2xlc.pl b/pages/application/NaiveBayes/utils/cnbc2xlc.pl deleted file mode 100755 index b0383d086c28cc4dc7ad51c7361f71540963eb4a..0000000000000000000000000000000000000000 --- a/pages/application/NaiveBayes/utils/cnbc2xlc.pl +++ /dev/null @@ -1,247 +0,0 @@ -#!/usr/bin/env perl - -## Tool for translating the probabilities of an CNBC into a -## sequence of non-negative weights which are then represented -## in the XLC format. -## Script specifically assumes *2* classes - -push @INC, \&resolve_inc; - -use strict; -use warnings; -use Data::Dumper; -use Getopt::Std; - -require Parsers; -import Parsers qw( parse_cnbc ); - -require Writers; -import Writers qw( write_xlc ); - -use constant DBG => 0; ## Also, comment out unused 'uses' -use constant CHK => 0; - -my $f_err_msg = "Please check file name, existence, permissions, etc.\n"; - -# 0. Read command line arguments -my %opts = (); -&read_opts(\%opts); - - -if ((CHK || DBG) && (defined($opts{k}) || defined($opts{d}))) { - ## Uncomment to use assertions && debug messages - #use Carp::Assert; # Assertions are on. - #if (DBG && $opts{d}) { - # use Data::Dumper; - #} -} -if (defined($opts{o})) { - open ($opts{FH}, '>', $opts{o}); - select($opts{FH}); -} - - -# 1. Data structures -my %cnbc = (); -my %xlc = (); -my $mval = 0; -my $tval = 0; - -# 2. Read ML model (definition of (C)NBC in CNBC format) -&parse_cnbc(\%opts, \%cnbc, $opts{f}); -if ($opts{d}) { warn Data::Dumper->Dump([ \%cnbc ], [ qw(cnbc) ]); } - -# 3. Translate CNBC weights (i.e. probs) into CNBC weights (i.e. additive & >=0) -&process_weights(\%opts, \%cnbc); -if ($opts{d}) { warn Data::Dumper->Dump([ \%cnbc ], [ qw(cnbc) ]); } - -#4. Reduce CNBC (w/ weights) into XLC -&reduce_cnbc_xlc(\%opts, \%cnbc, \%xlc); -if ($opts{d}) { warn Data::Dumper->Dump([ \%xlc ], [ qw(xlc) ]); } - -# 4. 
Print ML model in ACC format -&write_xlc(\%opts, \%xlc); - -1; - - -# Core functions - -# Goal is to apply a translation to the prob values -sub process_weights() -{ - my ($opts, $cnbc) = @_; - if (CHK && $opts->{k}) { - assert($cnbc->{NC}==2, "Cannot handle $cnbc->{NC} classes\n"); - } - - # 1. First traversal: compute & sum logarithms and flag 0 probs - my ($hasp0, $sumlogs, $minv, $logv) = (0, 0, 0, 0); - for(my $i=0; $i<=$#{$cnbc->{Prior}}; ++$i) { - if (${$cnbc->{Prior}}[$i] == 0) { $hasp0 = 1; } - else { - $logv = log(${$cnbc->{Prior}}[$i]); - $sumlogs += $logv; - ${$cnbc->{Prior}}[$i] = $logv; - if ($logv < $minv) { $minv = $logv; } - } - } - for(my $j=0; $j<$cnbc->{NV}; ++$j) { - my $cpt = "CPT$j"; - for(my $i=0; $i<=$#{$cnbc->{Prior}}; ++$i) { - my $ccl = "C$i"; - for(my $k=0; $k<$cnbc->{$cpt}->{D}; ++$k) { - if (${$cnbc->{$cpt}->{$ccl}}[$k] == 0) { $hasp0 = 1; } - else { - $logv = log(${$cnbc->{$cpt}->{$ccl}}[$k]); - $sumlogs += $logv; - ${$cnbc->{$cpt}->{$ccl}}[$k] = $logv; - if ($logv < $minv) { $minv = $logv; } - } - } - } - } - $mval = $sumlogs - 1; - $tval = ($hasp0) ? -$mval : -$minv; - # 2. Second traversal: update 0 probs, offset weights by T - for(my $i=0; $i<=$#{$cnbc->{Prior}}; ++$i) { - if (${$cnbc->{Prior}}[$i] == 0) { - ${$cnbc->{Prior}}[$i] = $mval; - } - ${$cnbc->{Prior}}[$i] += $tval; - } - for(my $j=0; $j<$cnbc->{NV}; ++$j) { - my $cpt = "CPT$j"; - for(my $i=0; $i<=$#{$cnbc->{Prior}}; ++$i) { - my $ccl = "C$i"; - for(my $k=0; $k<$cnbc->{$cpt}->{D}; ++$k) { - if (${$cnbc->{$cpt}->{$ccl}}[$k] == 0) { - ${$cnbc->{$cpt}->{$ccl}}[$k] = $mval; - } - ${$cnbc->{$cpt}->{$ccl}}[$k] += $tval; - } - } - } - if ($opts->{d}) { warn Data::Dumper->Dump([ $cnbc ], [ qw(cnbc_pw) ]); } -} - -sub reduce_cnbc_xlc() -{ - my ($opts, $cnbc, $xlc) = @_; - $xlc->{NV} = $cnbc->{NV}; - $xlc->{W0} = ${$cnbc->{Prior}}[0] - ${$cnbc->{Prior}}[1]; - $xlc->{NReal} = 0; - $xlc->{NCat} = $cnbc->{NV}; - for(my $j=0; $j<$cnbc->{NV}; ++$j) { - my $cpt = "CPT$j"; - my $cvj = "CVs$j"; - my ($ccl0, $ccl1) = ('C0', 'C1'); - push @{$xlc->{CDs}}, $cnbc->{$cpt}->{D}; - for(my $k=0; $k<$cnbc->{$cpt}->{D}; ++$k) { - my $vdiff = - ${$cnbc->{$cpt}->{$ccl0}}[$k] - ${$cnbc->{$cpt}->{$ccl1}}[$k]; - push @{$xlc->{$cvj}}, $vdiff; - } - } -} - - -# Format parsing functions - -sub read_acc_spec() -{ - my ($fname, $acc) = @_; - - die "Must use common parser!!!!\n"; - - open(my $fh, "<$fname") || - die "Unable to open file $fname. " . 
$f_err_msg; - my ($cc, $cv, $pol, $rmode) = (0, 0, 0, 0); - while(<$fh>) { - chomp; - if ($rmode == 0) { - m/\s*(\d)\s*$/ || die "Unable to match: $_\n"; - ($acc->{NC}, $rmode) = ($1, 1); - } - elsif ($rmode == 1) { - m/\s*(\d+)\s*$/ || die "Unable to match: $_\n"; - ($acc->{NV}, $rmode) = ($1, 2); - } - elsif ($rmode == 2) { - my $class = "C$cc"; - m/\s*(\-?\d+\.?\d*)\s*$/ || die "Unable to match: $_\n"; - $acc->{VV}->{$class}->{W0} = $1; - $rmode = 3; - } - elsif ($rmode == 3) { - my $class = "C$cc"; - my $polarity = "P$pol"; - m/\s*(\-?\d+\.?\d*)\s*$/ || die "Unable to match: $_\n"; - ${$acc->{VV}->{$class}->{$polarity}}[$cv] = $1; - $pol = 1 - $pol; - if ($pol == 0) { $cv++; } - if ($cv == $acc->{NV}) { - ($cc, $cv, $pol) = ($cc+1, 0, 0); - if ($cc == $acc->{NC}) { last; } - $rmode = 2; - } - } else { die "Unexpected line in file: $_\n"; } - } - close($fh); -} - -# Utilities - -sub read_opts() -{ - my ($opts) = @_; - getopts("hdvkf:o:", $opts); - - if ($opts->{h}) { - &prt_help(); - } - elsif (!defined($opts->{f})) { - die "Usage: $0 [-h] [-d] [-v] [-k] [-o <out-file>] -f <cnbc-file>\n" ; - } -} - -sub prt_help() -{ - my $tname = &toolname($0); - print <<"EOF"; -$tname: Translate CNBC format into XLC format -Usage: $tname [-h] [-d] [-v] [-k] [-o <out-file>] -f <cnbc-file> - -f <cnbc-file> specification of CNBC file - -o <out-file> output file for exporting XLC format - -k perform consistency checks & exit if error - -v verbose mode - -d debug mode - -h prints this help - Author: joao.marques-silva\@univ-toulouse.fr -EOF - exit(); -} - -sub toolname() -{ - my ($tname) = @_; - $tname =~ m/([\.\_\-a-zA-Z0-9]+)$/; - return $1; -} - - -#------------------------------------------------------------------------------# -# Auxiliary functions -#------------------------------------------------------------------------------# - -sub resolve_inc() { # Copy from template kept in UTILS package - my ($cref, $pmname) = @_; - my @progname_toks = split(/\//, $0); - pop @progname_toks; - my $progpath = join('/', @progname_toks); - my $fullname = $progpath . '/' . $pmname; - open(my $fh, "<$fullname") || die "non-existing file: $pmname\n"; - return $fh; -} - -# jpms diff --git a/pages/application/NaiveBayes/utils/xpxlc.pl b/pages/application/NaiveBayes/utils/xpxlc.pl deleted file mode 100755 index cbd47596c624e073c40595c393c73e980ab429f8..0000000000000000000000000000000000000000 --- a/pages/application/NaiveBayes/utils/xpxlc.pl +++ /dev/null @@ -1,722 +0,0 @@ -#!/usr/bin/env perl - -## Tool for reasoning about explanations in XLC's. Starting from a -## XLC and associated instance, the tool can enumerate one or more -## explanations. The tool can also validate explanations given in a -## file of explanations. The default mode of operation is to enumerate -## all explanations. -## One example of an ML model that can be reduced to XLC is the NBC. -## The details of the algorithm are included in the accompanying paper. -## The script specifically assumes classification problems with *two* -## classes. The handling of multiple classes is beyond the scope of -## the work. 
- -## To run the tool: -## <script-name> [-h] [-d] [-v] [-C] [-t] [-s] [-w] [-x] [-k <KKK>] [-n <NNN>] [-p <prt-file>] [-c <xpl-file> [-r]] [-m <xmap-file] -i <cat-inst-file> -f <xlc-file> - -push @INC, \&resolve_inc; - -use strict; -use warnings; - -use Getopt::Std; -use List::Util qw(sum0); ##qw( max min sum sum0); - -use constant DBG => 0; ## Also, comment out unused 'uses' -use constant CHK => 0; - -require Parsers; -import Parsers qw( parse_xlc parse_instance parse_explanations parse_xmap ); - -# 0. Read command line arguments -my %opts = (); -&read_opts(\%opts); - -if ((CHK || DBG) && (defined($opts{k}) || defined($opts{d}))) { - ## Uncomment to use assertions && debug messages - #use Carp::Assert; # Assertions are on. - #if (DBG && $opts{d}) { - # use Data::Dumper; - #} -} -if (defined($opts{p})) { - open ($opts{FH}, '>', $opts{p}); - select($opts{FH}); -} - - -# 1a. Data structures -my %xlc = (); -my %xmap = (); -my %inst = (); -my %xpl = (); - -# 1b. Prepare interrupts -if ($opts{C}) { # If catching system signals - &Utils::register_handlers(); - &Utils::push_arg(\%opts); - &Utils::push_arg(\%xlc); - if ($opts{t}) { &Utils::push_callback(\&print_stats_int); } - if ($opts{s}) { &Utils::push_callback(\&print_summaries_int); } -} - -# 2. Parse NBC XLC -&parse_xlc(\%opts, \%xlc, $opts{f}); -if (DBG && $opts{d}) { print Data::Dumper->Dump([ \%xlc ], [ qw(xlc) ]); } -if (CHK && $xlc{NReal}!=0) { die "Unable to handle real-valued features.\n"; } - -# 3. Parse instance -&parse_instance(\%opts, \%inst, $opts{i}); -if (DBG && $opts{d}) { print Data::Dumper->Dump([ \%inst ], [ qw(inst) ]); } - -# 4. If map specified, load map -if (defined($opts{m})) { - &parse_xmap(\%opts, \%xmap, $opts{m}); -} else { - &set_def_xmap(\%opts, \%xmap, \%xlc); -} -if (DBG && $opts{d}) { print Data::Dumper->Dump([ \%xmap ], [ qw(xmap) ]); } - -# 5. Compute XLC values & preprocess XLC -&simulate_xlc(\%opts, \%xlc, \%inst); -&preprocess_xlc(\%opts, \%xlc, \%inst); -&initialize_data(\%opts, \%xlc, \%inst, \%xmap); - -# 6. If *check* mode: read & validate one or more explanations -if ($opts{c}) { - &parse_explanations($opts{c}, \%xpl); - &validate_explanations(\%opts, \%xlc, \%inst, \%xmap, \%xpl); - &print_xpl_status(\%opts, \%xpl); - exit(); -} - -# 7. Else, compute & report explanations -if ($opts{x}) { - &compute_explanations_xl(\%opts, \%xlc, \%inst); -} -else { - &compute_explanations(\%opts, \%xlc, \%inst); -} - -# 8. Print summaries & stats -if ($opts{s}) { &print_summaries(\%opts, \%xlc); } -if ($opts{t}) { &print_stats(\%opts, \%xlc); } - -1; - -# Simulate XLC -sub simulate_xlc() -{ - my ($opts, $xlc, $inst) = @_; - - # Start with the intercept W0 - my $simval = $xlc->{W0}; - - # Add the contribution of real-value variables (currently assumed to be 0) - # ... - if (CHK && $xlc->{NReal} > 0) { - die "Simulation of real-valued features no ready yet.\n"; } - # ... - - # Add the contribution of categorical variables - for (my $i=0; $i<$xlc->{NCat}; ++$i) { - my $cvi = "CVs$i"; - $simval += ${$xlc->{$cvi}}[${$inst->{E}}[$i]]; - } - $xlc->{C} = ($simval > 0) ? 
0 : 1; - $xlc->{Gamma} = abs($simval); - - # Validate results - if (CHK && defined($opts->{k})) { - assert($xlc->{C} == $inst->{C}, 'simulated prediction differs'); } - if ($xlc->{C} == 1) { &complement_parameters($opts, $xlc, $inst); } - if (DBG && $opts->{d}) { print Data::Dumper->Dump([$xlc], [qw(xlc_sim)]); } -} - -# If class is 1, then complement all values -sub complement_parameters() -{ - my ($opts, $xlc, $inst) = @_; - - $xlc->{W0} = -$xlc->{W0}; - for(my $i=0; $i<$xlc->{NReal}; ++$i) { - ${$xlc->{RVs}}[$i] = -${$xlc->{RVs}}[$i]; - } - for(my $i=0; $i<$xlc->{NCat}; ++$i) { - my $cvi = "CVs$i"; - for(my $j=0; $j<${$xlc->{CDs}}[$i]; ++$j) { - ${$xlc->{$cvi}}[$j] = -${$xlc->{$cvi}}[$j]; - } - } - $xlc->{C} = 1 - $xlc->{C}; - - if (DBG && $opts->{d}) { print Data::Dumper->Dump([$xlc], [qw(xlc_cps)]); } -} - -# Preprocess XLC -sub preprocess_xlc() -{ - my ($opts, $xlc, $inst) = @_; - # Compute delta's, Delta and Phi [$xlc->{Delta}, $xlc->{DeltaSum}] - &compute_deltas($opts, $xlc, $inst); - if (DBG && $opts->{d}) { print Data::Dumper->Dump([$xlc], [qw(xlc_Ds)]); } - # - # Sort delta's by non-increasing value [$xlc->{SortedDelta}] - &reorder_deltas($opts, $xlc, $inst); - if (DBG && $opts->{d}) { print Data::Dumper->Dump([$xlc], [qw(xlc_Sort)]); } - # - &calc_partial_sums($opts, $xlc); -} - -sub compute_deltas() -{ - my ($opts, $xlc, $inst) = @_; - - # a. For each feature, sort weights, and pick smallest - my $sumdelta = 0; - for (my $i=0; $i<$xlc->{NCat}; ++$i) { - my $cvi = "CVs$i"; - my $tval = ${$xlc->{$cvi}}[${$inst->{E}}[$i]]; - my @scvs = sort { $a <=> $b} @{$xlc->{$cvi}}; - ${$xlc->{Delta}}[$i] = $tval - $scvs[0]; - ##print ("i=$i: tval=$tval vs. minv=$scvs[0] vs. delta=${$xlc->{Delta}}[$i]\n"); - $sumdelta += ${$xlc->{Delta}}[$i]; - } - $xlc->{DeltaSum} = $sumdelta; - $xlc->{Phi} = $xlc->{DeltaSum} - $xlc->{Gamma}; - $xlc->{PhiRef} = $xlc->{Phi}; - - # b. Validations - if (DBG && $opts->{d}) { print "SumDelta: $sumdelta\n"; } - if (CHK && defined($opts->{k}) && $sumdelta <= $xlc->{Phi}) { - my $msg = 'XLC prediction cannot be changed!?'; - if ($opts->{k}>1) { &prt_err_exit($msg); } - elsif ($opts->{k}==1) { &prt_warn($msg); } - } -} - -sub reorder_deltas() -{ - my ($opts, $xlc, $inst) = @_; - - my %DMap = (); - $xlc->{DeltaMap} = {}; - for(my $i=0; $i<$xlc->{NCat}; ++$i) { - my $rval = ${$xlc->{Delta}}[$i]; - push @{$xlc->{DeltaMap}->{$rval}}, $i; - $DMap{$rval} = 1; - } - @{$xlc->{SortedDelta}} = sort { $b <=> $a } @{$xlc->{Delta}}; - @{$xlc->{SDelta}} = (); - for(my $i=0; $i<=$#{$xlc->{SortedDelta}}; ++$i) { - my $rval = ${$xlc->{SortedDelta}}[$i]; - if ($DMap{$rval} == 0) { next; } - if (DBG && $opts->{d}) { - print "A: SDelta \@ i=$i: @{$xlc->{SDelta}} && rval=$rval\n"; } - push @{$xlc->{SDelta}}, @{$xlc->{DeltaMap}->{$rval}}; - $DMap{$rval} = 0; - } - if (DBG && $opts->{d}) { print Data::Dumper->Dump([$xlc], [qw(xlc_Reord)]); } - if (CHK && defined($opts->{k})) { - my ($sdz, $dz) = ($#{$xlc->{SDelta}}+1, $#{$xlc->{Delta}}+1); - assert($sdz == $dz, "Different sizes: $sdz vs. $dz"); } -} - -sub calc_partial_sums() -{ - my ($opts, $xlc) = @_; - - my ($depth, $tmpv) = ($xlc->{NCat}-1, 0); - while($depth>=0) { - $tmpv += ${$xlc->{SortedDelta}}[$depth]; - $xlc->{SumFrom}[$depth--] = $tmpv; - } -} - -sub set_def_xmap() -{ - my ($opts, $xmap, $xlc) = @_; - - $xmap->{NC} = 2; # Default number of classes... 
- @{$xmap->{ClMap}} = ('0', '1'); - $xmap->{NC} = $xlc->{NV}; - $xmap->{NReal} = $xlc->{NReal}; - for (my $i=0; $i<$xlc->{NReal}; ++$i) { - ${$xmap->{VMap}}[$i] = "v$i"; - } - $xmap->{NCat} = $xlc->{NCat}; - @{$xmap->{CDs}} = @{$xlc->{CDs}}; - for (my $i=0; $i<$xlc->{NCat}; ++$i) { - my $cid = $xmap->{NReal}+$i; - ${$xmap->{VMap}}[$i] = "v$cid"; - for (my $j=0; $j<${$xlc->{CDs}}[$i]; ++$j) { - ${$xmap->{CMap}->{$i}}[$j] = "$j"; - } - } - if (DBG && $opts->{d}) { print Data::Dumper->Dump([$xmap],[qw(xmap_def)]); } -} - -sub initialize_data() -{ - my ($opts, $xlc, $inst, $xmap) = @_; - - ($xlc->{XplNum}, $xlc->{XplSz}) = (0, 0); - ($xlc->{XplMin}, $xlc->{XplMax}) = ($xlc->{NV}, 0); - for(my $idx=0; $idx<=$xlc->{NV}; ++$idx) { - ${$xlc->{CNTS}}[$idx] = 0; - } - for(my $idx=0; $idx<$xlc->{NReal}; ++$idx) { - die "Handling of real-valued features not yet implemented...\n"; - } - for(my $idx=0; $idx<$xlc->{NCat}; ++$idx) { - # Categorical feature name - my $cval = ${$inst->{E}}[$idx]; - my $vname = ${$xmap->{VMap}}[$xmap->{NReal}+$idx]; - my $cname = ${$xmap->{CMap}->{$idx}}[$cval]; - ${$xlc->{LITS}}[$idx] = "$vname=$cname"; - } - if (DBG && $opts->{d}) { print Data::Dumper->Dump([$xlc], [qw(xlc_init)]); } -} - - -# Reference implementation -sub compute_explanations() -{ - my ($opts, $xlc, $inst) = @_; - - ##if ($xlc->{Lambda} < 0) { print ("Expl: true\n"); return; } ???? - # - my @xp = (-1) x ($xlc->{NV}+1); my @tog = (-1) x $xlc->{NV}; my $depth=-1; - my $cntxp = (defined($opts->{n})) ? 1 : 0; my $numxp = 0; $xp[0] = 1; - if (DBG && $opts->{d}) { print Data::Dumper->Dump([$xlc], [qw(xlc_xpA)]); } - while (1) { - # 1. Find another explanation - #if (DBG && $opts->{d}) { print("\@Depth: $depth\n"); - # &prt_xp_snapshot($xlc,\%xp,\@tog,$depth,1); } - $depth = &find_one_explanation($opts, $xlc, $inst, \@tog, \@xp, $depth); - &report_explanation($opts, $xlc, \@xp); - if ($cntxp && ++$numxp == $opts->{n}) { last; } - # 2. 
Enter consistent state - $depth = &enter_valid_state($opts, $xlc, $inst, \@tog, \@xp, $depth); - if ($depth < 0) { return; } - if (DBG && $opts->{d}) { &prt_xp_snapshot($xlc,\@xp,\@tog,$depth,0); } - } - if (DBG && $opts->{d}) { print Data::Dumper->Dump([$xlc], [qw(xlc_xpB) ]); } -} - -sub find_one_explanation() -{ - my ($opts, $xlc, $inst, $tog, $xp, $idx) = @_; - - while ($xlc->{Phi} >= 0) { - if (DBG && defined($opts->{d})) { print "Depth(down): $idx\n"; } - if (CHK && defined($opts->{k})) { - assert($idx<$xlc->{NV}); - assert($idx==$xlc->{NV}-1 || ${$tog}[$idx+1]==-1); } - ${$tog}[++$idx] = 0; - $xlc->{Phi} -= ${$xlc->{SortedDelta}}[$idx]; - ®_literal($opts, $xp, $xlc, $inst, $idx); - if (DBG && $opts->{d}) { &prt_xp_snapshot($xlc, $xp, $tog, $idx, 0); } - } - if (CHK && defined($opts->{k})) { - assert($xlc->{Phi}<0); &chk_explanation($opts, $xlc, $xp, $tog); } - return $idx; -} - -sub enter_valid_state() -{ - my ($opts, $xlc, $inst, $tog, $xp, $idx) = @_; - - while (!&consistent_state($opts, $xlc, $idx)) { - if (DBG && defined($opts->{d})) { print "Depth(up): $idx\n"; } - while ($idx>=0 && ${$tog}[$idx]==1) { ${$tog}[$idx--] = -1; } - if ($idx < 0) { return $idx; } # Terminate - # Drop literal from explanation - if (CHK && defined($opts->{k})) { assert(${$tog}[$idx]==0); } - &unreg_literal($opts, $xp, $xlc, $inst, $idx); - $xlc->{Phi} += ${$xlc->{SortedDelta}}[$idx]; - if (CHK && defined($opts->{k})) { assert(${$tog}[$idx]==0); } - ${$tog}[$idx] = 1; - if (DBG && $opts->{d}) { &prt_xp_snapshot($xlc, $xp, $tog, $idx, 1); } - } - return $idx; -} - -sub consistent_state() -{ - my ($opts, $xlc, $idx) = @_; - - my $stok = - ($xlc->{Phi} < 0 || $idx == $xlc->{NV}-1 || - ${$xlc->{SumFrom}}[$idx+1] <= $xlc->{Phi}); - return ($stok) ? 0 : 1; -} - -sub reg_literal() -{ - my ($opts, $xp, $xlc, $inst, $idx) = @_; - my $lit = ${$xlc->{SDelta}}[$idx]; - if (CHK) { assert(${$xp}[0] <= $#{$xp}, "XP idx above limit??"); } - ${$xp}[${$xp}[0]++] = $lit; - if (CHK) { assert(${$xp}[${$xp}[0]] == -1, "Pointing to wrong pos!?"); } -} - -sub unreg_literal() -{ - my ($opts, $xp, $xlc, $inst, $idx) = @_; - if (CHK) { assert(${$xp}[0] > 0, "XP idx below limit??"); } - ${$xp}[--${$xp}[0]] = -1; - if (CHK) { assert(${$xp}[${$xp}[0]] == -1, "Pointing to wrong pos!?"); } -} - -sub report_explanation() -{ - my ($opts, $xlc, $xp) = @_; - - # Obs: No actual need to sort; we can keep a sorted list. This is faster... - if ($opts->{w}) { - ##$" = ', '; - my $tlits = $xlc->{LITS}; - my @slice = @{$xp}[1 .. (${$xp}[0]-1)]; - if (DBG && $opts->{d}) { print ("Slice: @slice\n"); } - my @sslice = sort { ${$tlits}[$a] cmp ${$tlits}[$b] } @slice; - if (DBG && $opts->{d}) { print ("Sorted Slice: @slice\n"); } - my @xplits = map { ${$xlc->{LITS}}[$_] } @sslice; - if (DBG && $opts->{d}) { print ("Exp Lits: @xplits\n"); } - - if (CHK && $opts->{k}) { - for(my $i=1; $i<=$#xplits; ++$i) { - assert($xplits[$i-1] ne $xplits[$i], - "Duplicate literals in explanation: $xplits[$i-1] vs. $xplits[$i]\n" . - "Exp: @xplits\n"); - } - } - - #my @xplits = sort {abs($a) <=> abs($b)} keys %{$xp}; - if (!$opts->{v}) { - ##local $"=', '; - print("Expl: @xplits\n"); - } - else { - my $sz = sprintf("_(\#%d/%d", $#xplits+1,$xlc->{NV}); - my $wt = (defined($opts->{k})) ? 
- sprintf(";W:%3.2f)", $xlc->{Phi}) : ')'; - ##{ ##local $"=', '; - print("Expl$sz$wt: @xplits\n"); - ## } - } - } - if ($opts->{t} || $opts->{s}) { - if ($opts->{t}) { - my $nlits = ${$xp}[0]-1; - $xlc->{XplSz} += $nlits; - if ($xlc->{XplMin} > $nlits) { $xlc->{XplMin} = $nlits; } - if ($xlc->{XplMax} < $nlits) { $xlc->{XplMax} = $nlits; } - } - if ($opts->{s}) { - my ($Cnts, $num) = ($xlc->{CNTS}, $xp->[0]); - for (my $idx=1; $idx<$num; ++$idx) { - ${$Cnts}[$xp->[$idx]]++; - } - } - $xlc->{XplNum}++; - } - if (DBG && $opts->{d}) { &prt_flush(); } -} - -sub prt_xp_snapshot() -{ - my ($xlc, $xp, $tog, $depth, $mf) = @_; - - my $msg = ($mf) ? '@Up:' : '@Down:'; - print ("$msg\n"); - print ("Phi: $xlc->{Phi}\n"); - print ("Deltas: [ @{$xlc->{SortedDelta}} ]\n"); - print ("SDelta: [ @{$xlc->{SDelta}} ]\n"); - print ("CNTS: [ @{$xlc->{CNTS}} ]\n"); - print ("LITS: [ @{$xlc->{LITS}} ]\n"); - my $lstidx = ${$xp}[0]-1; - print ("XP keys: ${$xp}[0] + [ @{$xp}[1..$lstidx] ]\n"); - print ("XP vect: [ @{$xp} ]\n"); - print ("Togs: [ @{$tog} ]\n"); - print ("Depth: $depth\n"); - &prt_flush(); -} - -sub chk_explanation() -{ - my ($opts, $xlc, $xp, $tog) = @_; - - my ($phi, $ntogs) = ($xlc->{PhiRef}, 0); - for(my $i=0; $i<=$#{$tog}; ++$i) { - if (${$tog}[$i]==0) { - $phi -= ${$xlc->{SortedDelta}}[$i]; - $ntogs++; - } - } - assert($phi < 0); - assert($ntogs == ${$xp}[0]-1); -} - - -# Alternative (faster) implementation -sub compute_explanations_xl() -{ - my ($opts, $xlc, $inst) = @_; - - my @SortedDelta = @{$xlc->{SortedDelta}}; - my @SDelta = @{$xlc->{SDelta}}; - my @SumFrom = @{$xlc->{SumFrom}}; - my @xp = (-1) x ($xlc->{NV}+1); my @tog = (-1) x $xlc->{NV}; my $depth=-1; - my $cntxp = (defined($opts->{n})) ? 1 : 0; my $numxp = 0; $xp[0]=1; - if (DBG && $opts->{d}) { print Data::Dumper->Dump([$xlc], [qw(xlc_xpA)]); } - while (1) { - # 1. Find another explanation - while ($xlc->{Phi} >= 0) { - if (DBG && defined($opts->{d})) { print "Depth(down): $depth\n"; } - if (CHK && defined($opts->{k})) { - assert($depth<$xlc->{NV}); - assert($depth==$xlc->{NV}-1 || $tog[$depth+1]==-1); } - $tog[++$depth] = 0; - $xlc->{Phi} -= $SortedDelta[$depth]; - $xp[$xp[0]++] = $SDelta[$depth]; - if (DBG && $opts->{d}) { - &prt_xp_snapshot($xlc,\@xp,\@tog,$depth,0); } - } - if (CHK && defined($opts->{k})) { - assert($xlc->{Phi}<0); &chk_explanation($opts,$xlc,\@xp,\@tog); } - &report_explanation($opts, $xlc, \@xp); - if ($cntxp && ++$numxp == $opts->{n}) { last; } - - # 2. 
Enter consistent state - while ($xlc->{Phi} < 0 || $depth == $xlc->{NV}-1 || - $SumFrom[$depth+1] <= $xlc->{Phi}) { - if (DBG && defined($opts->{d})) { print "Depth(up): $depth\n"; } - while ($depth>=0 && $tog[$depth]==1) { $tog[$depth--] = -1; } - if ($depth < 0) { return $depth; } # Terminate - # Drop literal from explanation - if (CHK && defined($opts->{k})) { assert($tog[$depth]==0); } - $xp[--$xp[0]] = 0; - $xlc->{Phi} += $SortedDelta[$depth]; - if (CHK && defined($opts->{k})) { assert($tog[$depth]==0); } - $tog[$depth] = 1; - if (DBG && $opts->{d}) { - &prt_xp_snapshot($xlc,\@xp,\@tog,$depth,1); } - } - if ($depth < 0) { return; } - if (DBG && $opts->{d}) { &prt_xp_snapshot($xlc,\@xp,\@tog,$depth,0); } - } - if (DBG && $opts->{d}) { print Data::Dumper->Dump([$xlc], [qw(xlc_xpB) ]); } -} - -sub validate_explanations() -{ - my ($opts, $xlc, $inst, $xmap, $xpl) = @_; - - %{$xmap->{IVMap}} = (); - for (my $i=0; $i<=$#{$xmap->{VMap}}; ++$i) { - $xmap->{IVMap}->{${$xmap->{VMap}}[$i]} = $i; - } - # Traverse & validate given explanations - ($xpl->{XPStr}, $xpl->{Status}, $xpl->{RedLits}) = ([], [], []); - foreach my $xpvec (@{$xpl->{Expl}}) { - push @{$xpl->{XPStr}}, "@{$xpvec}"; - # 1. Check entailment - my $phi = $xlc->{PhiRef}; - foreach my $lit (@{$xpvec}) { - $lit =~ m/([^=]+)=([^=]+)/ || die "Unable to match literal: $lit\n"; - my ($svar, $sval) = ($1, $2); - ##print ("(svar,sval)=($svar,$sval)\n"); - ##print ("IVMap{svar}: $xmap->{IVMap}->{$svar}\n"); - my $var = $xmap->{IVMap}->{$svar}-$xmap->{NReal}; - $phi -= ${$xlc->{Delta}}[$var]; - ##print ("Current Phi:$phi\n"); - } - if ($phi >= 0) { - push @{$xpl->{Status}}, -1; - push @{$xpl->{RedLits}}, []; - next; - } - # 2. Check redundancy - if (CHK && defined($opts->{k})) { assert($phi < 0); } - my $RedLits = []; - foreach my $lit (@{$xpvec}) { - $lit =~ m/([^=]+)=([^=]+)/ || die "Unable to match literal: $lit\n"; - my ($svar, $sval) = ($1, $2); - my $var = $xmap->{IVMap}->{$svar}-$xmap->{NReal}; - if ($phi + $xlc->{Delta}[$var] < 0) { push @{$RedLits}, $lit; } - } - push @{$xpl->{RedLits}}, $RedLits; - if (@{$RedLits}) { push @{$xpl->{Status}}, 1; next; } - push @{$xpl->{Status}}, 0; - } - return; -} - -sub print_xpl_status() -{ - my ($opts, $xpl) = @_; - - ###($xpl->{XPStr}, $xpl->{Status}, $xpl->{RedLits}) = ([], [], []); - for(my $i=0; $i<=$#{$xpl->{XPStr}}; ++$i) { - print ("Expl: ${$xpl->{XPStr}}[$i] => "); - my $xpst = ${$xpl->{Status}}[$i]; - my ($msg, $redlits) = ('', ''); - if ($xpst == 0) { - $msg = 'Confirmed as (subset-minimal) explanation'; - } - elsif ($xpst < 0) { - $msg = 'NOT an explanation, i.e. entailment does not hold'; - } - else { - $msg = 'Redundant explanation. Example of redundant literals: '; - $redlits = "@{${$xpl->{RedLits}}[$i]}"; - } - print ("$msg$redlits\n"); - } -} - -sub print_stats() -{ - my ($opts, $xlc) = @_; - - my $tname = uc(&toolname($0)); - print "\n$tname stats:\n"; - my $tsz = (defined($opts->{n})) ? 
"$opts->{n}" : 'all'; - print ("Target explanations: $tsz\n"); - my $avgsz = sprintf("%.2f", $xlc->{XplSz} / $xlc->{XplNum}); - print ("Number of explanations: $xlc->{XplNum}\n"); - print ("Average explanation size: $avgsz\n"); - print ("Smallest explanation: $xlc->{XplMin}\n"); - print ("Largest explanation: $xlc->{XplMax}\n"); -} - -sub print_summaries() -{ - my ($opts, $xlc) = @_; - - my $tname = uc(&toolname($0)); - print "\n$tname summary:\n"; - my $hsz = 0; - for (my $idx=0; $idx <= $#{$xlc->{CNTS}}; ++$idx) { - if (${$xlc->{CNTS}}[$idx] != 0) { $hsz++; } - } - my $tsz = (defined($opts->{n})) ? "$opts->{n}" : 'all'; - print "Target explanations: $tsz\n"; - my $avgsz = sprintf("%.2f", $xlc->{XplSz} / $xlc->{XplNum}); - print "Number of explanations: $xlc->{XplNum}\n"; - print "Histogram size: $hsz\n"; - print "Literal distribution in explanations:\n"; - my $tcnts = $xlc->{CNTS}; - my @skeys = (0 .. $xlc->{NV}-1); - @skeys = sort { abs(${$tcnts}[$a]) <=> abs(${$tcnts}[$b]) } @skeys; - foreach my $key (@skeys) { - next if ${$xlc->{CNTS}}[$key] <= 0; - my $lit = ${$xlc->{LITS}}[$key]; - print("$lit: ${$xlc->{CNTS}}[$key]\n"); - } -} - -sub print_stats_int() -{ - my $args = shift @_; - - my ($opts, $xlc) = @{$args}; - &print_stats($opts, $xlc); -} - -sub print_summaries_int() -{ - my $args = shift @_; - - my ($opts, $xlc) = @{$args}; - &print_summaries($opts, $xlc); -} - - -# Utilities - -sub read_opts() -{ - my ($opts) = @_; - getopts("hdvCtswxk:n:c:rp:m:f:i:", $opts); - - if ($opts->{h}) { - &prt_help(); - } - elsif (!defined($opts->{f}) || !defined($opts->{i})) { - ##|| - ##(defined($opts->{c}) && defined($opts->{i})) || - ##(!defined($opts->{c}) && !defined($opts->{i}))) { - die "Usage: $0 [-h] [-d] [-v] [-C] [-t] [-s] [-w] [-x] [-k <KKK>] [-n <NNN>] [-p <prt-file>] [-c <chk-xpl> [-r]] [-m <xmap-file>] -i <cat-inst-file> -f <xlc-file>\n" ; - } -} - -sub prt_help() -{ - my $tname = &toolname($0); - print <<"EOF"; -$tname: Compute explanations of XLCs (including NBCs) with polynomial delay -Usage: $tname [-h] [-d] [-v] [-C] [-t] [-s] [-w] [-x] [-k <KKK>] [-n <NNN>] [-p <prt-file>] [-c <xpl-file> [-r]] [-m <xmap-file>] -i <cat-inst-file> -f <xlc-file> - -f <xlc-file> specification of XLC file - -i <inst-file> specification of instance - -c <xpl-file> check/validate explanation - -m <xmap-file> map file - -p <prt-file> print to file - -n <NNN> number of NNN explanations to list (the default is all) - -k <KKK> apply consistency checks & issue warnings (1) or exit (>1) - -r repair explanations (when validating explanations) [not yet available] - -x run faster implementation - -w write computed explanations - -s summarize computed explanations - -t gather stats on computed explanations - -C enable catching system signals - -v verbose mode - -d debug mode - -h prints this help - Author: joao.marques-silva\@univ-toulouse.fr -EOF - exit(); -} - -sub prt_warn() -{ - my ($msg) = @_; - print("*** $0 warning ***: $msg\n"); -} - -sub prt_err_exit() -{ - my ($msg) = @_; - print("*** $0 error ***: $msg\n"); - exit(); -} - -sub toolname() -{ - my ($tname) = @_; - $tname =~ m/([\.\_\-a-zA-Z0-9]+)$/; - return $1; -} - -sub prt_flush() -{ - select()->flush(); -} - - -#------------------------------------------------------------------------------# -# Auxiliary functions -#------------------------------------------------------------------------------# - -sub resolve_inc() { # Copy from template kept in UTILS package - my ($cref, $pmname) = @_; - my @progname_toks = split(/\//, $0); - pop @progname_toks; - 
my $progpath = join('/', @progname_toks); - my $fullname = $progpath . '/' . $pmname; - open(my $fh, "<$fullname") || die "non-existing file: $pmname\n"; - return $fh; -} - -# jpms diff --git a/pages/application/RandomForest/RandomForestComponent.py b/pages/application/RandomForest/RandomForestComponent.py deleted file mode 100644 index 50dd5d332e9c3b3e1e66ed57007f10415152d221..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/RandomForestComponent.py +++ /dev/null @@ -1,83 +0,0 @@ -import base64 - -import dash_bootstrap_components as dbc -import numpy as np -from dash import dcc, html - -from pages.application.RandomForest.utils.data import Data -from pages.application.RandomForest.utils.anchor_wrap import anchor_call -from pages.application.RandomForest.utils.lime_wrap import lime_call -from pages.application.RandomForest.utils.shap_wrap import shap_call -from pages.application.RandomForest.utils.xgbooster import XGBooster, preprocess_dataset -from pages.application.RandomForest.utils.xgbrf import XGBRandomForest - - -class RandomForestComponent: - - def __init__(self, model, type_model='SKL', info=None, type_info=''): - - if info is not None and '.csv' in type_info: - self.data = Data(info) - - # Conversion model - if type_model == "RF": - self.random_forest = XGBRandomForest(info, from_model=model) - else: - self.random_forest = XGBooster(info, from_model=model) - - # self.random_forest.encode(test_on=info) - - self.map_file = "" - - self.network = html.Div([]) - self.explanation = [] - - def update_with_explicability(self, instance, enum_feats=None, validation=None, xtype=None, solver=None, ): - - # Call explanation - - if not enum_feats and self.data is not None: - enum_feats = len(self.data.names) - 1 - - expl = self.random_forest.explain(instance, - use_lime=lime_call if solver == "lime" else None, - use_anchor=anchor_call if solver == "anchor" else None, - use_shap=shap_call if solver == "shap" else None, - nof_feats=enum_feats) - - if validation: - coex = self.random_forest.validate(instance, expl) - if coex: - # repairing the local explanation - gexpl = self.random_forest.explain(instance, expl_ext=expl, prefer_ext=True) - else: - # an attempt to refine the local explanation further - gexpl = self.random_forest.explain(instance, expl_ext=expl) - - print(expl) - - self.explanation = [] - list_explanations_path = [] - explanation = {} - - self.network = html.Div([]) - - # Creating a clean and nice text component - # instance plotting - self.explanation.append(html.H4("Instance : \n")) - self.explanation.append(html.P(str([str(instance[i]) for i in range(len(instance))]))) - for k in explanation.keys(): - if k != "List of path explanation(s)" and k != "List of path contrastive explanation(s)": - if k in ["List of abductive explanation(s)", "List of contrastive explanation(s)"]: - self.explanation.append(html.H4(k)) - for expl in explanation[k]: - self.explanation.append(html.Hr()) - self.explanation.append(html.P(expl)) - self.explanation.append(html.Hr()) - else: - self.explanation.append(html.P(k + explanation[k])) - else: - list_explanations_path = explanation["List of path explanation(s)"] - list_contrastive_explanations_path = explanation["List of path contrastive explanation(s)"] - - return list_explanations_path, list_contrastive_explanations_path diff --git a/pages/application/RandomForest/utils/anchor_wrap/__init__.py b/pages/application/RandomForest/utils/anchor_wrap/__init__.py deleted file mode 100644 index 
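Most of the deleted RandomForestComponent.update_with_explicability body builds Dash components out of an explanation dict. A standalone sketch of that rendering loop (the dict keys and helper name are illustrative, mirroring the removed code rather than reproducing it):

```python
from dash import html

def explanation_to_children(instance, explanation):
    """Render an instance and an explanation dict as Dash children."""
    children = [html.H4("Instance : \n"),
                html.P(str([str(v) for v in instance]))]
    for key, value in explanation.items():
        if isinstance(value, list):   # e.g. abductive/contrastive explanation lists
            children.append(html.H4(key))
            for expl in value:
                children.extend([html.Hr(), html.P(expl), html.Hr()])
        else:
            children.append(html.P(key + str(value)))
    return children
```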
3d4bcf2c92b14c5fdda52e9d291aa6995e713bce..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/anchor_wrap/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .anchor_wrap import * diff --git a/pages/application/RandomForest/utils/anchor_wrap/anchor_wrap.py b/pages/application/RandomForest/utils/anchor_wrap/anchor_wrap.py deleted file mode 100644 index f66ec0a8fd48dc0e25b2760968d20e16b392968f..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/anchor_wrap/anchor_wrap.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## anchor_wrap.py (reuses parts of the code of SHAP) -## -## Created on: Jan 6, 2019 -## Author: Nina Narodytska, Alexey Ignatiev -## E-mail: narodytska@vmware.com, aignatiev@ciencias.ulisboa.pt -## - -# -#============================================================================== -from __future__ import print_function -import json -import numpy as np -import xgboost as xgb -import math -import resource -from anchor import utils -from anchor import anchor_tabular -import sklearn -import sklearn.ensemble - - -# -#============================================================================== -def anchor_call(xgb, sample=None, nb_samples=5, feats='all', - nb_features_in_exp=5, threshold=0.95): - - timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - # we need a way to flag which features are categorical; - # we do not have that information here. - explainer = anchor_tabular.AnchorTabularExplainer( - class_names=xgb.target_name, - feature_names=xgb.feature_names, - train_data=xgb.X, - categorical_names=xgb.categorical_names if xgb.use_categorical else {}) - # if (len(xgb.X_test) != 0): - # explainer.fit(xgb.X_train, xgb.Y_train, xgb.X_test, xgb.Y_test) - # else: - # explainer.fit(xgb.X_train, xgb.Y_train, xgb.X_train, xgb.Y_train) - predict_fn_xgb = lambda x: xgb.model.predict(xgb.transform(x)).astype(int) - - f2imap = {} - for i, f in enumerate(xgb.feature_names): - f2imap[f.strip()] = i - - if (sample is not None): - try: - feat_sample = np.asarray(sample, dtype=np.float32) - except Exception as inst: - print("Cannot parse input sample:", sample, inst) - exit() - print("\n\n\nStarting Anchor explainer... 
\nConsidering a sample with features:", feat_sample) - if not (len(feat_sample) == len(xgb.X_train[0])): - print("Unmatched features are not supported: The number of features in a sample {} is not equal to the number of features in this benchmark {}".format(len(feat_sample), len(xgb.X_train[0]))) - exit() - - # compute boost predictions - feat_sample_exp = np.expand_dims(feat_sample, axis=0) - feat_sample_exp = xgb.transform(feat_sample_exp) - y_pred = xgb.model.predict(feat_sample_exp)[0] - y_pred_prob = xgb.model.predict_proba(feat_sample_exp)[0] - #hack testiing that we use the same onehot encoding - # test_feat_sample_exp = explainer.encoder.transform(feat_sample_exp) - test_y_pred = xgb.model.predict(feat_sample_exp)[0] - test_y_pred_prob = xgb.model.predict_proba(feat_sample_exp)[0] - assert(np.allclose(y_pred_prob, test_y_pred_prob)) - print('Prediction: ', explainer.class_names[predict_fn_xgb(feat_sample.reshape(1, -1))[0]]) - # exp = explainer.explain_instance(feat_sample, xgb.model.predict, threshold=threshold) - print('sample ====== ', feat_sample) - exp = explainer.explain_instance(feat_sample, predict_fn_xgb, threshold=threshold) - print('Anchor: %s' % (' AND '.join(exp.names()))) - print('Precision: %.2f' % exp.precision()) - print('Coverage: %.2f' % exp.coverage()) - - # explanation - expl = [] - - if (xgb.use_categorical): - for k, v in enumerate(exp.features()): - expl.append(v) - print("Clause ", k, end=": ") - print("feature (", v, ",", explainer.feature_names[v], end="); ") - print("value (", feat_sample[v], ",", explainer.categorical_names[v][int(feat_sample[v])] , ")") - else: - print("We only support datasets with categorical features for Anchor. Please pre-process your data.") - exit() - - timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer - print(' time: {0:.2f}'.format(timer)) - - return sorted(expl) - - ###################################### TESTING - max_sample = nb_samples - y_pred_prob = xgb.model.predict_proba(xgb.X_test) - y_pred = xgb.model.predict(xgb.X_test) - - nb_tests = min(max_sample,len(xgb.Y_test)) - top_labels = 1 - for sample in range(nb_tests): - np.set_printoptions(precision=2) - feat_sample = xgb.X_test[sample] - print("Considering a sample with features:", feat_sample) - if (False): - feat_sample[4] = 3000 - y_pred_prob_sample = xgb.model.predict_proba([feat_sample]) - print(y_pred_prob_sample) - print("\t Predictions:", y_pred_prob[sample]) - exp = explainer.explain_instance(feat_sample, - predict_fn_xgb, - num_features= xgb.num_class, - top_labels = 1, - labels = list(range(xgb.num_class))) - for i in range(xgb.num_class): - if (i != y_pred[sample]): - continue - print("\t \t Explanations for the winner class", i, " (xgboost confidence = ", y_pred_prob[sample][i], ")") - print("\t \t Features in explanations: ", exp.as_list(label=i)) - timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer - print(' time: {0:.2f}'.format(timer)) - return diff --git a/pages/application/RandomForest/utils/data.py b/pages/application/RandomForest/utils/data.py deleted file mode 100644 index 6c94e3da69d365d8270afa00f5f8fa7db1506ab7..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/data.py +++ /dev/null @@ -1,168 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## data.py -## -## Created on: Sep 20, 2017 -## Author: Alexey Ignatiev, Nina Narodytska -## E-mail: 
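Stripped of the XGBoost plumbing, the deleted Anchor wrapper reduces to a handful of calls into the anchor package. A condensed sketch, where X_train, feature_names, class_names, clf and sample are placeholders for objects the wrapper took from its xgb argument:

```python
import numpy as np
from anchor import anchor_tabular

explainer = anchor_tabular.AnchorTabularExplainer(
    class_names=class_names,
    feature_names=feature_names,
    train_data=X_train,
    categorical_names={})   # {} when no feature is categorical
predict_fn = lambda x: clf.predict(x).astype(int)
exp = explainer.explain_instance(np.asarray(sample, dtype=np.float32),
                                 predict_fn, threshold=0.95)
print('Anchor: %s' % ' AND '.join(exp.names()))
print('Precision: %.2f' % exp.precision())
print('Coverage: %.2f' % exp.coverage())
```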
aignatiev@ciencias.ulisboa.pt, narodytska@vmware.com -## - -# -#============================================================================== -from __future__ import print_function -import collections -import itertools -import pickle -import six -from six.moves import range -import numpy as np - - -# -#============================================================================== -class Data(object): - """ - Class for representing data (transactions). - """ - - def __init__(self, filename=None, fpointer=None, mapfile=None, - separator=' ', use_categorical = False): - """ - Constructor and parser. - """ - - self.names = None - self.nm2id = None - self.samps = None - self.wghts = None - self.feats = None - self.fvmap = None - self.ovmap = {} - self.fvars = None - self.fname = filename - self.mname = mapfile - self.deleted = set([]) - - if filename: - with open(filename, 'r') as fp: - self.parse(fp, separator) - elif fpointer: - self.parse(fpointer, separator) - - if self.mname: - self.read_orig_values() - - # check if we have extra info about categorical_features - - if (use_categorical): - extra_file = filename+".pkl" - try: - f = open(extra_file, "rb") - print("Attempt: loading extra data from ", extra_file) - extra_info = pickle.load(f) - print("loaded") - f.close() - self.categorical_features = extra_info["categorical_features"] - self.categorical_names = extra_info["categorical_names"] - self.class_names = extra_info["class_names"] - self.categorical_onehot_names = extra_info["categorical_names"].copy() - - for i, name in enumerate(self.class_names): - self.class_names[i] = str(name).replace("b'","'") - for c in self.categorical_names.items(): - clean_feature_names = [] - for i, name in enumerate(c[1]): - name = str(name).replace("b'","'") - clean_feature_names.append(name) - self.categorical_names[c[0]] = clean_feature_names - - except Exception as e: - f.close() - print("Please provide info about categorical features or omit option -c", e) - exit() - - def parse(self, fp, separator): - """ - Parse input file. 
- """ - - # reading data set from file - lines = fp.readlines() - - # reading preamble - self.names = lines[0].strip().split(separator) - self.feats = [set([]) for n in self.names] - del(lines[0]) - - # filling name to id mapping - self.nm2id = {name: i for i, name in enumerate(self.names)} - - self.nonbin2bin = {} - for name in self.nm2id: - spl = name.rsplit(':',1) - if (spl[0] not in self.nonbin2bin): - self.nonbin2bin[spl[0]] = [name] - else: - self.nonbin2bin[spl[0]].append(name) - - # reading training samples - self.samps, self.wghts = [], [] - - for line, w in six.iteritems(collections.Counter(lines)): - sample = line.strip().split(separator) - for i, f in enumerate(sample): - if f: - self.feats[i].add(f) - self.samps.append(sample) - self.wghts.append(w) - - # direct and opposite mappings for items - idpool = itertools.count(start=0) - FVMap = collections.namedtuple('FVMap', ['dir', 'opp']) - self.fvmap = FVMap(dir={}, opp={}) - - # mapping features to ids - for i in range(len(self.names) - 1): - feats = sorted(list(self.feats[i]), reverse=True) - if len(feats) > 2: - for l in feats: - self.fvmap.dir[(self.names[i], l)] = l - else: - self.fvmap.dir[(self.names[i], feats[0])] = 1 - if len(feats) == 2: - self.fvmap.dir[(self.names[i], feats[1])] = 0 - - # opposite mapping - for key, val in six.iteritems(self.fvmap.dir): - self.fvmap.opp[val] = key - - # determining feature variables (excluding class variables) - for v, pair in six.iteritems(self.fvmap.opp): - if pair[0] == self.names[-1]: - self.fvars = v - 1 - break - - def read_orig_values(self): - """ - Read original values for all the features. - (from a separate CSV file) - """ - - self.ovmap = {} - - for line in open(self.mname, 'r'): - featval, bits = line.strip().split(',') - feat, val = featval.split(':') - - for i, b in enumerate(bits): - f = '{0}:b{1}'.format(feat, i + 1) - v = self.fvmap.dir[(f, '1')] - - if v not in self.ovmap: - self.ovmap[v] = [feat] - - if -v not in self.ovmap: - self.ovmap[-v] = [feat] - - self.ovmap[v if b == '1' else -v].append(val) diff --git a/pages/application/RandomForest/utils/lime_wrap/__init__.py b/pages/application/RandomForest/utils/lime_wrap/__init__.py deleted file mode 100644 index 32487979bf8f14f3ab733fa02b82973843e3078b..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/lime_wrap/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .lime_wrap import * diff --git a/pages/application/RandomForest/utils/lime_wrap/lime_wrap.py b/pages/application/RandomForest/utils/lime_wrap/lime_wrap.py deleted file mode 100644 index 0146ed1f3ae756ea8a0d7bf0f8d6a79449b951fb..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/lime_wrap/lime_wrap.py +++ /dev/null @@ -1,162 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## lime_wrap.py (reuses parts of the code of SHAP) -## -## Created on: Dec 12, 2018 -## Author: Nina Narodytska, Alexey Ignatiev -## E-mail: narodytska@vmware.com, aignatiev@ciencias.ulisboa.pt -## - -# -#============================================================================== -import json -import numpy as np -import xgboost as xgb -import math -import lime -import lime.lime_tabular -import resource - - -# -#============================================================================== -def lime_call(xgb, sample = None, nb_samples = 5, feats='all', - nb_features_in_exp=5): - - timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - # we need a way to 
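One detail of the deleted Data.parse worth keeping in mind: it never stores duplicate training rows; identical lines are collapsed through collections.Counter into a single sample plus an integer weight. The same idea in isolation:

```python
import collections

def read_weighted_samples(lines, separator=' '):
    """Collapse duplicate rows into (sample, weight) pairs, as Data.parse did."""
    samples, weights = [], []
    for line, count in collections.Counter(lines).items():
        samples.append(line.strip().split(separator))
        weights.append(count)
    return samples, weights
```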
say that features are categorical ? - # we do not have this informations. - predict_fn_xgb = lambda x: xgb.model.predict_proba(xgb.transform(x)).astype(float) - explainer = lime.lime_tabular.LimeTabularExplainer( - xgb.X_train, - feature_names=xgb.feature_names, - categorical_features=xgb.categorical_features if xgb.use_categorical else None, - class_names=xgb.target_name, - discretize_continuous=True, - ) - - f2imap = {} - for i, f in enumerate(xgb.feature_names): - f2imap[f.strip()] = i - - if (sample is not None): - try: - feat_sample = np.asarray(sample, dtype=np.float32) - except: - print("Cannot parse input sample:", sample) - exit() - print("\n\n\nStarting LIME explainer... \nConsidering a sample with features:", feat_sample) - if not (len(feat_sample) == len(xgb.X_train[0])): - print("Unmatched features are not supported: The number of features in a sample {} is not equal to the number of features in this benchmark {}".format(len(feat_sample), len(xgb.X_train[0]))) - exit() - - # compute boost predictions - feat_sample_exp = np.expand_dims(feat_sample, axis=0) - feat_sample_exp = xgb.transform(feat_sample_exp) - y_pred = xgb.model.predict(feat_sample_exp)[0] - y_pred_prob = xgb.model.predict_proba(feat_sample_exp)[0] - - exp = explainer.explain_instance(feat_sample, - predict_fn_xgb, - num_features = nb_features_in_exp, - top_labels = 1)#, - #labels = list(range(xgb.num_class))) - - expl = [] - - # choose which features in the explanation to focus on - if feats in ('p', 'pos', '+'): - feats = 1 - elif feats in ('n', 'neg', '-'): - feats = -1 - else: - feats = 0 - - for i in range(xgb.num_class): - if (i != y_pred): - continue - print("\t \t Explanations for the winner class", i, " (xgboost confidence = ", y_pred_prob[i], ")") - print("\t \t Features in explanations: ", exp.as_list(label=i)) - - s_human_readable = "" - for k, v in enumerate(exp.as_list(label=i)): - if (feats == 1 and v[1] < 0) or (feats == -1 and v[1] >= 0): - continue - - if not (('<' in v[0]) or ('>' in v[0])): - a = v[0].split('=') - f = a[0].strip() - l = a[1].strip() - u = l - - if (xgb.use_categorical): - fid = f2imap[f] - fvid = int(a[1]) - #s_human_readable = s_human_readable + f + " = [" + str(xgb.categorical_names[fid][fvid]) +"," + str(v[1])+ "] " - s_human_readable = s_human_readable + "\t \t id = {}, name = {}, score = {}\n".format(fid, f, str(v[1])) - - - else: - a = v[0].split('<') - - if len(a) == 1: - a = v[0].split('>') - - if len(a) == 2: - f = a[0].strip() - - if '>' in v[0]: - l, u = float(a[1].strip(' =')), None - else: - l, u = None, float(a[1].strip(' =')) - else: - l = float(a[0].strip()) - f = a[1].strip(' =') - u = float(a[2].strip(' =')) - - # expl.append(tuple([f2imap[f], l, u, v[1] >= 0])) - expl.append(f2imap[f]) - - if (xgb.use_categorical): - if (len(s_human_readable) > 0): - print("\t \t Features in explanations (with provided categorical labels): \n", s_human_readable) - - timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer - print(' time: {0:.2f}'.format(timer)) - - return sorted(expl) - - ###################################### TESTING - max_sample = nb_samples - y_pred_prob = xgb.model.predict_proba(xgb.X_test) - y_pred = xgb.model.predict(xgb.X_test) - - nb_tests = min(max_sample,len(xgb.Y_test)) - top_labels = 1 - for sample in range(nb_tests): - np.set_printoptions(precision=2) - feat_sample = xgb.X_test[sample] - print("Considering a sample with features:", feat_sample) - if (False): - feat_sample[4] = 
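The LIME interaction in this wrapper is likewise small once the XGBoost-specific code is removed. A self-contained sketch, where X_train, feature_names, class_names, clf and feat_sample are placeholders:

```python
import lime.lime_tabular

explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train,
    feature_names=feature_names,
    class_names=class_names,
    discretize_continuous=True)
exp = explainer.explain_instance(feat_sample,
                                 lambda x: clf.predict_proba(x).astype(float),
                                 num_features=5, top_labels=1)
label = exp.available_labels()[0]
print(exp.as_list(label=label))   # [(condition string, signed weight), ...]
```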
3000 - y_pred_prob_sample = xgb.model.predict_proba([feat_sample]) - print(y_pred_prob_sample) - print("\t Predictions:", y_pred_prob[sample]) - exp = explainer.explain_instance(feat_sample, - predict_fn_xgb, - num_features= xgb.num_class, - top_labels = 1, - labels = list(range(xgb.num_class))) - for i in range(xgb.num_class): - if (i != y_pred[sample]): - continue - print("\t \t Explanations for the winner class", i, " (xgboost confidence = ", y_pred_prob[sample][i], ")") - print("\t \t Features in explanations: ", exp.as_list(label=i)) - timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer - print(' time: {0:.2f}'.format(timer)) - return diff --git a/pages/application/RandomForest/utils/options.py b/pages/application/RandomForest/utils/options.py deleted file mode 100644 index 8ba5a91707abf42368d67f65044806199cf4c8d4..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/options.py +++ /dev/null @@ -1,224 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## options.py -## -## Created on: Dec 7, 2018 -## Author: Alexey Ignatiev, Nina Narodytska -## E-mail: aignatiev@ciencias.ulisboa.pt, narodytska@vmware.com -## - -# -#============================================================================== -from __future__ import print_function -import getopt -import math -import os -import sys - - -# -#============================================================================== -class Options(object): - """ - Class for representing command-line options. - """ - - def __init__(self, command): - """ - Constructor. - """ - - # actions - self.train = False - self.encode = 'none' - self.explain = '' - self.useanchor = False - self.uselime = False - self.useshap = False - self.limefeats = 5 - self.validate = False - self.use_categorical = False - self.preprocess_categorical = False - self.preprocess_categorical_files = "" - - # training options - self.accmin = 0.95 - self.n_estimators = 100 - self.num_boost_round = 10 - self.maxdepth = 3 - self.testsplit = 0.2 - self.seed = 7 - - # other options - self.files = None - self.output = 'Classifiers' - self.mapfile = None - self.separator = ',' - self.smallest = False - self.solver = 'z3' - self.verb = 0 - - # random forest - self.rf = False - self.pi_check = False - self.repair = False - self.refine = False - - if command: - self.parse(command) - - def parse(self, command): - """ - Parser. 
- """ - - self.command = command - - try: - opts, args = getopt.getopt(command[1:], - 'a:ce:d:hL:lm:Mn:o:pPr:Rqs:tvVwx:', - ['accmin=', - 'encode=', - 'help', - 'map-file=', - 'use-anchor=', - 'lime-feats=', - 'use-lime=', - 'use-shap=', - 'use-categorical=', - 'preprocess-categorical=', - 'pfiles=', - 'maxdepth=', - 'minimum', - 'nbestims=', - 'output=', - 'prime-implicant', - 'rounds=', - 'random-forest', - 'repair', - 'refine', - 'seed=', - 'sep=', - 'solver=', - 'testsplit=', - 'train', - 'validate', - 'verbose', - 'explain=' - ]) - except getopt.GetoptError as err: - sys.stderr.write(str(err).capitalize()) - self.usage() - sys.exit(1) - - for opt, arg in opts: - if opt in ('-a', '--accmin'): - self.accmin = float(arg) - elif opt in ('-c', '--use-categorical'): - self.use_categorical = True - elif opt in ('-d', '--maxdepth'): - self.maxdepth = int(arg) - elif opt in ('-e', '--encode'): - self.encode = str(arg) - elif opt in ('-h', '--help'): - self.usage() - sys.exit(0) - elif opt in ('-l', '--use-lime'): - self.uselime = True - elif opt in ('-L', '--lime-feats'): - self.limefeats = 0 if arg == 'all' else int(arg) - elif opt in ('-m', '--map-file'): - self.mapfile = str(arg) - elif opt in ('-M', '--minimum'): - self.smallest = True - elif opt in ('-n', '--nbestims'): - self.n_estimators = int(arg) - elif opt in ('-o', '--output'): - self.output = str(arg) - elif opt in ('-q', '--use-anchor'): - self.useanchor = True - elif opt in ('-P', '--prime-implicant'): - self.pi_check = True - elif opt in ('-r', '--rounds'): - self.num_boost_round = int(arg) - elif opt in ('-R', '--random-forest'): - self.rf = True - elif opt == '--repair': - self.repair = True - elif opt == '--refine': - self.refine = True - elif opt == '--seed': - self.seed = int(arg) - elif opt == '--sep': - self.separator = str(arg) - elif opt in ('-s', '--solver'): - self.solver = str(arg) - elif opt == '--testsplit': - self.testsplit = float(arg) - elif opt in ('-t', '--train'): - self.train = True - elif opt in ('-V', '--validate'): - self.validate = True - elif opt in ('-v', '--verbose'): - self.verb += 1 - elif opt in ('-w', '--use-shap'): - self.useshap = True - elif opt in ('-x', '--explain'): - self.explain = str(arg) - elif opt in ('-p', '--preprocess-categorical'): - self.preprocess_categorical = True - elif opt in ('--pfiles'): - self.preprocess_categorical_files = str(arg) #train_file, test_file(or empty, resulting file - else: - assert False, 'Unhandled option: {0} {1}'.format(opt, arg) - - if self.encode == 'none': - self.encode = None - - self.files = args - - def usage(self): - """ - Print usage message. 
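The deleted Options.parse is the standard getopt pattern: a ':' after a short option (or '=' after a long one) marks an option that takes a value. A reduced skeleton with three illustrative options:

```python
import getopt
import sys

verbose, n_estimators = False, 100
try:
    opts, args = getopt.getopt(sys.argv[1:], 'hvn:',
                               ['help', 'verbose', 'nbestims='])
except getopt.GetoptError as err:
    sys.exit(str(err).capitalize())
for opt, arg in opts:
    if opt in ('-h', '--help'):
        print('Usage: prog [-h] [-v] [-n <int>] input-file')
        sys.exit(0)
    elif opt in ('-v', '--verbose'):
        verbose = True
    elif opt in ('-n', '--nbestims'):
        n_estimators = int(arg)
```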
- """ - - print('Usage: ' + os.path.basename(self.command[0]) + ' [options] input-file') - print('Options:') - print(' -a, --accmin=<float> Minimal accuracy') - print(' Available values: [0.0, 1.0] (default = 0.95)') - print(' -c, --use-categorical Treat categorical features as categorical (with categorical features info if available)') - print(' -d, --maxdepth=<int> Maximal depth of a tree') - print(' Available values: [1, INT_MAX] (default = 3)') - print(' -e, --encode=<smt> Encode a previously trained model') - print(' Available values: smt, smtbool, none (default = none)') - print(' -h, --help Show this message') - print(' -l, --use-lime Use LIME to compute an explanation') - print(' -L, --lime-feats Instruct LIME to compute an explanation of this size') - print(' Available values: [1, INT_MAX], all (default = 5)') - print(' -m, --map-file=<string> Path to a file containing a mapping to original feature values. (default: none)') - print(' -M, --minimum Compute a smallest size explanation (instead of a subset-minimal one)') - print(' -n, --nbestims=<int> Number of trees per class') - print(' Available values: [1, INT_MAX] (default = 100)') - print(' -o, --output=<string> Directory where output files will be stored (default: \'temp\')') - print(' -p, Preprocess categorical data') - print(' --pfiles Filenames to use when preprocessing') - print(' --prime-implicant Check explanation if it is a prime implicant') - print(' -q, --use-anchor Use Anchor to compute an explanation') - print(' -r, --rounds=<int> Number of training rounds') - print(' -R, --random-forest Use Random Forest model') - print(' --refine try to refine the (optimistic) local explanation') - print(' --repair try to repair the (pessimistic) local explanation') - print(' Available values: [1, INT_MAX] (default = 10)') - print(' --seed=<int> Seed for random splitting') - print(' Available values: [1, INT_MAX] (default = 7)') - print(' --sep=<string> Field separator used in input file (default = \',\')') - print(' -s, --solver=<string> An SMT reasoner to use') - print(' Available values: cvc4, mathsat, yices, z3 (default = z3)') - print(' -t, --train Train a model of a given dataset') - print(' --testsplit=<float> Training and test sets split') - print(' Available values: [0.0, 1.0] (default = 0.2)') - print(' -v, --verbose Increase verbosity level') - print(' -V, --validate Validate explanation (show that it is too optimistic)') - print(' -w, --use-shap Use SHAP to compute an explanation') - print(' -x, --explain=<string> Explain a decision for a given comma-separated sample (default: none)') diff --git a/pages/application/RandomForest/utils/shap_wrap/__init__.py b/pages/application/RandomForest/utils/shap_wrap/__init__.py deleted file mode 100644 index 845cbd2bb73191edd950fc9eb09a641ab8cf2088..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/shap_wrap/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .shap_wrap import * diff --git a/pages/application/RandomForest/utils/shap_wrap/shap_wrap.py b/pages/application/RandomForest/utils/shap_wrap/shap_wrap.py deleted file mode 100644 index 4eadc21f76cc03f78c656fcf7d7cde1ed4ea2a3b..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/shap_wrap/shap_wrap.py +++ /dev/null @@ -1,111 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## shap_wrap.py (reuses parts of the code of SHAP) -## -## Created on: Sep 25, 2019 -## Author: Nina Narodytska -## E-mail: narodytska@vmware.com -## - -# 
-#============================================================================== -import json -import numpy as np -import xgboost as xgb -import math -import shap -import resource - - -# -#============================================================================== -def shap_call(xgb, sample = None, feats='all', nb_features_in_exp = None): - timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - f2imap = {} - for i, f in enumerate(xgb.feature_names): - f2imap[f.strip()] = i - - if (sample is not None): - if (nb_features_in_exp is None): - nb_features_in_exp = len(sample) - - try: - feat_sample = np.asarray(sample, dtype=np.float32) - except: - print("Cannot parse input sample:", sample) - exit() - print("\n\nStarting SHAP explainer... \nConsidering a sample with features:", feat_sample) - if not (len(feat_sample) == len(xgb.X_train[0])): - print("Unmatched features are not supported: The number of features in a sample {} is not equal to the number of features in this benchmark {}".format(len(feat_sample), len(xgb.X_train[0]))) - exit() - - # compute boost predictions - feat_sample_exp = np.expand_dims(feat_sample, axis=0) - feat_sample_exp = xgb.transform(feat_sample_exp) - y_pred = xgb.model.predict(feat_sample_exp)[0] - y_pred_prob = xgb.model.predict_proba(feat_sample_exp)[0] - - # No need to pass dataset as it is recored in model - # https://shap.readthedocs.io/en/latest/ - - explainer = shap.TreeExplainer(xgb.model) - shap_values = explainer.shap_values(feat_sample_exp) - - shap_values_sample = shap_values[-1] - transformed_sample = feat_sample_exp[-1] - - - - - # we need to sum values per feature - # https://github.com/slundberg/shap/issues/397 - sum_values = [] - if (xgb.use_categorical): - p = 0 - for f in xgb.categorical_features: - nb_values = len(xgb.categorical_names[f]) - sum_v = 0 - for i in range(nb_values): - sum_v = sum_v + shap_values_sample[p+i] - p = p + nb_values - sum_values.append(sum_v) - else: - sum_values = shap_values_sample - expl = [] - - # choose which features in the explanation to focus on - if feats in ('p', 'pos', '+'): - feats = 1 - elif feats in ('n', 'neg', '-'): - feats = -1 - else: - feats = 0 - - print("\t \t Explanations for the winner class", y_pred, " (xgboost confidence = ", y_pred_prob[int(y_pred)], ")") - print("base_value = {}, predicted_value = {}".format(explainer.expected_value, np.sum(sum_values) + explainer.expected_value)) - - abs_sum_values = np.abs(sum_values) - sorted_by_abs_sum_values =np.argsort(-abs_sum_values) - - for k1, v1 in enumerate(sorted_by_abs_sum_values): - - k = v1 - v = sum_values[v1] - - if (feats == 1 and v < 0) or (feats == -1 and v >= 0): - continue - - expl.append(f2imap[xgb.feature_names[k]]) - print("id = {}, name = {}, score = {}".format(f2imap[xgb.feature_names[k]], xgb.feature_names[k], v)) - - if (len(expl) == nb_features_in_exp): - break - - timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer - print(' time: {0:.2f}'.format(timer)) - - return sorted(expl[:nb_features_in_exp]) diff --git a/pages/application/RandomForest/utils/xgbooster/__init__.py b/pages/application/RandomForest/utils/xgbooster/__init__.py deleted file mode 100644 index 88bdad8d34e0f8a335ca167595de25c965e8b021..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbooster/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .encode import * -from .tree import * -from 
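The essential SHAP calls in the deleted shap_call are just two: build a TreeExplainer and ask for shap_values. A sketch assuming a trained tree model `model` and one transformed sample `x` of shape (1, n_features); note that for multi-class models shap_values may return a list with one array per class, which the wrapper handled by taking the last entry:

```python
import numpy as np
import shap

explainer = shap.TreeExplainer(model)
sv = explainer.shap_values(x)
contrib = np.asarray(sv[-1] if isinstance(sv, list) else sv)[0]
# local additivity: base value plus contributions approximates the output
print('base_value =', explainer.expected_value)
for k in np.argsort(-np.abs(contrib)):   # strongest features first
    print('feature', k, 'score', contrib[k])
```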
.xgbooster import * -from .preprocess import * \ No newline at end of file diff --git a/pages/application/RandomForest/utils/xgbooster/encode.py b/pages/application/RandomForest/utils/xgbooster/encode.py deleted file mode 100644 index 6a77fb3afb792f0cd15276c11e03b4b4005f5109..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbooster/encode.py +++ /dev/null @@ -1,363 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## encode.py -## -## Created on: Dec 7, 2018 -## Author: Alexey Ignatiev -## E-mail: aignatiev@ciencias.ulisboa.pt -## - -# -#============================================================================== -from __future__ import print_function -import collections -from pysat.formula import IDPool -from pysmt.smtlib.parser import SmtLibParser -from pysmt.shortcuts import And, BOOL, Iff, Implies, Not, Or, Symbol, get_model -from pysmt.shortcuts import Equals, ExactlyOne, LT, Plus, REAL, Real, write_smtlib -from .tree import TreeEnsemble, scores_tree -import six -from six.moves import range - -try: # for Python2 - from cStringIO import StringIO -except ImportError: # for Python3 - from io import StringIO - - -# -#============================================================================== -class SMTEncoder(object): - """ - Encoder of XGBoost tree ensembles into SMT. - """ - - def __init__(self, model, feats, nof_classes, xgb, from_file=None): - """ - Constructor. - """ - - self.model = model - self.feats = {f: i for i, f in enumerate(feats)} - self.nofcl = nof_classes - self.idmgr = IDPool() - self.optns = xgb.options - - # xgbooster will also be needed - self.xgb = xgb - - # for interval-based encoding - self.intvs, self.imaps, self.ivars = None, None, None - - if from_file: - self.load_from(from_file) - - def traverse(self, tree, tvar, prefix=[]): - """ - Traverse a tree and encode each node. - """ - - if tree.children: - pos, neg = self.encode_node(tree) - - self.traverse(tree.children[0], tvar, prefix + [pos]) - self.traverse(tree.children[1], tvar, prefix + [neg]) - else: # leaf node - if prefix: - self.enc.append(Implies(And(prefix), Equals(tvar, Real(tree.values)))) - else: - self.enc.append(Equals(tvar, Real(tree.values))) - - def encode_node(self, node): - """ - Encode a node of a tree. - """ - - if '_' not in node.name: - # continuous features => expecting an upper bound - # feature and its upper bound (value) - f, v = node.name, node.threshold - - existing = True if tuple([f, v]) in self.idmgr.obj2id else False - vid = self.idmgr.id(tuple([f, v])) - bv = Symbol('bvar{0}'.format(vid), typename=BOOL) - - if not existing: - if self.intvs: - d = self.imaps[f][v] + 1 - pos, neg = self.ivars[f][:d], self.ivars[f][d:] - self.enc.append(Iff(bv, Or(pos))) - self.enc.append(Iff(Not(bv), Or(neg))) - else: - fvar, fval = Symbol(f, typename=REAL), Real(v) - self.enc.append(Iff(bv, LT(fvar, fval))) - - return bv, Not(bv) - else: - # all features are expected to be categorical and - # encoded with one-hot encoding into Booleans - # each node is expected to be of the form: f_i < 0.5 - bv = Symbol(node.name, typename=BOOL) - - # left branch is positive, i.e. bv is true - # right branch is negative, i.e. bv is false - return Not(bv), bv - - def compute_intervals(self): - """ - Traverse all trees in the ensemble and extract intervals for each - feature. - - At this point, the method only works for numerical datasets! - """ - - def traverse_intervals(tree): - """ - Auxiliary function. Recursive tree traversal. 
- """ - - if tree.children: - f = tree.name - v = tree.threshold - self.intvs[f].add(v) - - traverse_intervals(tree.children[0]) - traverse_intervals(tree.children[1]) - - # initializing the intervals - self.intvs = {'f{0}'.format(i): set([]) for i in range(len(self.feats))} - - for tree in self.ensemble.trees: - traverse_intervals(tree) - - # OK, we got all intervals; let's sort the values - self.intvs = {f: sorted(self.intvs[f]) + ['+'] for f in six.iterkeys(self.intvs)} - - self.imaps, self.ivars = {}, {} - for feat, intvs in six.iteritems(self.intvs): - self.imaps[feat] = {} - self.ivars[feat] = [] - for i, ub in enumerate(intvs): - self.imaps[feat][ub] = i - - ivar = Symbol(name='{0}_intv{1}'.format(feat, i), typename=BOOL) - self.ivars[feat].append(ivar) - - def encode(self): - """ - Do the job. - """ - - self.enc = [] - - # getting a tree ensemble - self.ensemble = TreeEnsemble(self.model, - self.xgb.extended_feature_names_as_array_strings, - nb_classes=self.nofcl) - - # introducing class score variables - csum = [] - for j in range(self.nofcl): - cvar = Symbol('class{0}_score'.format(j), typename=REAL) - csum.append(tuple([cvar, []])) - - # if targeting interval-based encoding, - # traverse all trees and extract all possible intervals - # for each feature - if self.optns.encode == 'smtbool': - self.compute_intervals() - - # traversing and encoding each tree - for i, tree in enumerate(self.ensemble.trees): - # getting class id - clid = i % self.nofcl - - # encoding the tree - tvar = Symbol('tr{0}_score'.format(i + 1), typename=REAL) - self.traverse(tree, tvar, prefix=[]) - - # this tree contributes to class with clid - csum[clid][1].append(tvar) - - # encoding the sums - for pair in csum: - cvar, tvars = pair - self.enc.append(Equals(cvar, Plus(tvars))) - - # enforce exactly one of the feature values to be chosen - # (for categorical features) - categories = collections.defaultdict(lambda: []) - for f in self.xgb.extended_feature_names_as_array_strings: - if '_' in f: - categories[f.split('_')[0]].append(Symbol(name=f, typename=BOOL)) - for c, feats in six.iteritems(categories): - self.enc.append(ExactlyOne(feats)) - - # number of assertions - nof_asserts = len(self.enc) - - # making conjunction - self.enc = And(self.enc) - - # number of variables - nof_vars = len(self.enc.get_free_variables()) - - if self.optns.verb: - print('encoding vars:', nof_vars) - print('encoding asserts:', nof_asserts) - - return self.enc, self.intvs, self.imaps, self.ivars - - def test_sample(self, sample): - """ - Check whether or not the encoding "predicts" the same class - as the classifier given an input sample. 
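To see what traverse and encode_node emit, here is the encoding of a single stump, "if f0 < 2.5 then 0.3 else -0.4", written out by hand with the same pysmt primitives (variable names are illustrative; running get_model requires an installed SMT solver):

```python
from pysmt.shortcuts import (And, BOOL, Equals, Iff, Implies, LT, Not, REAL,
                             Real, Symbol, get_model)

f0 = Symbol('f0', typename=REAL)
bv = Symbol('bvar0', typename=BOOL)   # Boolean guard for the split f0 < 2.5
tr = Symbol('tr1_score', typename=REAL)
enc = And(Iff(bv, LT(f0, Real(2.5))),                 # tie guard to the test
          Implies(bv, Equals(tr, Real(0.3))),         # left leaf
          Implies(Not(bv), Equals(tr, Real(-0.4))))   # right leaf
model = get_model(And(enc, Equals(f0, Real(1.0))))
print(model.get_py_value(tr))   # 3/10
```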
- """ - - # first, compute the scores for all classes as would be - # predicted by the classifier - - # score arrays computed for each class - csum = [[] for c in range(self.nofcl)] - - if self.optns.verb: - print('testing sample:', list(sample)) - - sample_internal = list(self.xgb.transform(sample)[0]) - - # traversing all trees - for i, tree in enumerate(self.ensemble.trees): - # getting class id - clid = i % self.nofcl - - # a score computed by the current tree - score = scores_tree(tree, sample_internal) - - # this tree contributes to class with clid - csum[clid].append(score) - - # final scores for each class - cscores = [sum(scores) for scores in csum] - - # second, get the scores computed with the use of the encoding - - # asserting the sample - hypos = [] - - if not self.intvs: - for i, fval in enumerate(sample_internal): - feat, vid = self.xgb.transform_inverse_by_index(i) - fid = self.feats[feat] - - if vid == None: - fvar = Symbol('f{0}'.format(fid), typename=REAL) - hypos.append(Equals(fvar, Real(float(fval)))) - else: - fvar = Symbol('f{0}_{1}'.format(fid, vid), typename=BOOL) - if int(fval) == 1: - hypos.append(fvar) - else: - hypos.append(Not(fvar)) - else: - for i, fval in enumerate(sample_internal): - feat, _ = self.xgb.transform_inverse_by_index(i) - feat = 'f{0}'.format(self.feats[feat]) - - # determining the right interval and the corresponding variable - for ub, fvar in zip(self.intvs[feat], self.ivars[feat]): - if ub == '+' or fval < ub: - hypos.append(fvar) - break - else: - assert 0, 'No proper interval found for {0}'.format(feat) - - # now, getting the model - escores = [] - model = get_model(And(self.enc, *hypos), solver_name=self.optns.solver) - for c in range(self.nofcl): - v = Symbol('class{0}_score'.format(c), typename=REAL) - escores.append(float(model.get_py_value(v))) - - assert all(map(lambda c, e: abs(c - e) <= 0.001, cscores, escores)), \ - 'wrong prediction: {0} vs {1}'.format(cscores, escores) - - if self.optns.verb: - print('xgb scores:', cscores) - print('enc scores:', escores) - - def save_to(self, outfile): - """ - Save the encoding into a file with a given name. - """ - - if outfile.endswith('.txt'): - outfile = outfile[:-3] + 'smt2' - - write_smtlib(self.enc, outfile) - - # appending additional information - with open(outfile, 'r') as fp: - contents = fp.readlines() - - # comments - comments = ['; features: {0}\n'.format(', '.join(self.feats)), - '; classes: {0}\n'.format(self.nofcl)] - - if self.intvs: - for f in self.xgb.extended_feature_names_as_array_strings: - c = '; i {0}: '.format(f) - c += ', '.join(['{0}<->{1}'.format(u, v) for u, v in zip(self.intvs[f], self.ivars[f])]) - comments.append(c + '\n') - - contents = comments + contents - with open(outfile, 'w') as fp: - fp.writelines(contents) - - def load_from(self, infile): - """ - Loads the encoding from an input file. 
- """ - - with open(infile, 'r') as fp: - file_content = fp.readlines() - - # empty intervals for the standard encoding - self.intvs, self.imaps, self.ivars = {}, {}, {} - - for line in file_content: - if line[0] != ';': - break - elif line.startswith('; i '): - f, arr = line[4:].strip().split(': ', 1) - f = f.replace('-', '_') - self.intvs[f], self.imaps[f], self.ivars[f] = [], {}, [] - - for i, pair in enumerate(arr.split(', ')): - ub, symb = pair.split('<->') - - if ub[0] != '+': - ub = float(ub) - symb = Symbol(symb, typename=BOOL) - - self.intvs[f].append(ub) - self.ivars[f].append(symb) - self.imaps[f][ub] = i - - elif line.startswith('; features:'): - self.feats = line[11:].strip().split(', ') - elif line.startswith('; classes:'): - self.nofcl = int(line[10:].strip()) - - parser = SmtLibParser() - script = parser.get_script(StringIO(''.join(file_content))) - - self.enc = script.get_last_formula() - - def access(self): - """ - Get access to the encoding, features names, and the number of - classes. - """ - - return self.enc, self.intvs, self.imaps, self.ivars, self.feats, self.nofcl diff --git a/pages/application/RandomForest/utils/xgbooster/explain.py b/pages/application/RandomForest/utils/xgbooster/explain.py deleted file mode 100644 index ca078749949fa2ad3fa3d01cb080653965effdf5..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbooster/explain.py +++ /dev/null @@ -1,311 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## explain.py -## -## Created on: Dec 14, 2018 -## Author: Alexey Ignatiev -## E-mail: aignatiev@ciencias.ulisboa.pt -## - -# -#============================================================================== -from __future__ import print_function -import numpy as np -import os -from pysat.examples.hitman import Hitman -from pysat.formula import IDPool -from pysmt.shortcuts import Solver -from pysmt.shortcuts import And, BOOL, Implies, Not, Or, Symbol -from pysmt.shortcuts import Equals, GT, Int, Real, REAL -import resource -from six.moves import range -import sys - - -# -#============================================================================== -class SMTExplainer(object): - """ - An SMT-inspired minimal explanation extractor for XGBoost models. - """ - - def __init__(self, formula, intvs, imaps, ivars, feats, nof_classes, - options, xgb): - """ - Constructor. - """ - - self.feats = feats - self.intvs = intvs - self.imaps = imaps - self.ivars = ivars - self.nofcl = nof_classes - self.optns = options - self.idmgr = IDPool() - - # saving XGBooster - self.xgb = xgb - - self.verbose = self.optns.verb - self.oracle = Solver(name=options.solver) - - self.inps = [] # input (feature value) variables - for f in self.xgb.extended_feature_names_as_array_strings: - if '_' not in f: - self.inps.append(Symbol(f, typename=REAL)) - else: - self.inps.append(Symbol(f, typename=BOOL)) - - self.outs = [] # output (class score) variables - for c in range(self.nofcl): - self.outs.append(Symbol('class{0}_score'.format(c), typename=REAL)) - - # theory - self.oracle.add_assertion(formula) - - # current selector - self.selv = None - - def prepare(self, sample): - """ - Prepare the oracle for computing an explanation. 
- """ - - if self.selv: - # disable the previous assumption if any - self.oracle.add_assertion(Not(self.selv)) - - # creating a fresh selector for a new sample - sname = ','.join([str(v).strip() for v in sample]) - - # the samples should not repeat; otherwise, they will be - # inconsistent with the previously introduced selectors - assert sname not in self.idmgr.obj2id, 'this sample has been considered before (sample {0})'.format(self.idmgr.id(sname)) - self.selv = Symbol('sample{0}_selv'.format(self.idmgr.id(sname)), typename=BOOL) - - self.rhypos = [] # relaxed hypotheses - - # transformed sample - self.sample = list(self.xgb.transform(sample)[0]) - - self.sel2fid = {} # selectors to original feature ids - self.sel2vid = {} # selectors to categorical feature ids - - # preparing the selectors - for i, (inp, val) in enumerate(zip(self.inps, self.sample), 1): - feat = inp.symbol_name().split('_')[0] - selv = Symbol('selv_{0}'.format(feat)) - val = float(val) - - self.rhypos.append(selv) - if selv not in self.sel2fid: - self.sel2fid[selv] = int(feat[1:]) - self.sel2vid[selv] = [i - 1] - else: - self.sel2vid[selv].append(i - 1) - - # adding relaxed hypotheses to the oracle - if not self.intvs: - for inp, val, sel in zip(self.inps, self.sample, self.rhypos): - if '_' not in inp.symbol_name(): - hypo = Implies(self.selv, Implies(sel, Equals(inp, Real(float(val))))) - else: - hypo = Implies(self.selv, Implies(sel, inp if val else Not(inp))) - - self.oracle.add_assertion(hypo) - else: - for inp, val, sel in zip(self.inps, self.sample, self.rhypos): - inp = inp.symbol_name() - # determining the right interval and the corresponding variable - for ub, fvar in zip(self.intvs[inp], self.ivars[inp]): - if ub == '+' or val < ub: - hypo = Implies(self.selv, Implies(sel, fvar)) - break - - self.oracle.add_assertion(hypo) - - # in case of categorical data, there are selector duplicates - # and we need to remove them - self.rhypos = sorted(set(self.rhypos), key=lambda x: int(x.symbol_name()[6:])) - - # propagating the true observation - if self.oracle.solve([self.selv] + self.rhypos): - model = self.oracle.get_model() - else: - assert 0, 'Formula is unsatisfiable under given assumptions' - - # choosing the maximum - outvals = [float(model.get_py_value(o)) for o in self.outs] - maxoval = max(zip(outvals, range(len(outvals)))) - - # correct class id (corresponds to the maximum computed) - self.out_id = maxoval[1] - self.output = self.xgb.target_name[self.out_id] - - # forcing a misclassification, i.e. a wrong observation - disj = [] - for i in range(len(self.outs)): - if i != self.out_id: - disj.append(GT(self.outs[i], self.outs[self.out_id])) - self.oracle.add_assertion(Implies(self.selv, Or(disj))) - - if self.verbose: - inpvals = self.xgb.readable_sample(sample) - - self.preamble = [] - for f, v in zip(self.xgb.feature_names, inpvals): - if f not in str(v): - self.preamble.append('{0} = {1}'.format(f, v)) - else: - self.preamble.append(v) - - print(' explaining: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.output)) - - def explain(self, sample, smallest, expl_ext=None, prefer_ext=False): - """ - Hypotheses minimization. 
- """ - - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - # adapt the solver to deal with the current sample - self.prepare(sample) - - # saving external explanation to be minimized further - if expl_ext == None or prefer_ext: - self.to_consider = [True for h in self.rhypos] - else: - eexpl = set(expl_ext) - self.to_consider = [True if i in eexpl else False for i, h in enumerate(self.rhypos)] - - # if satisfiable, then the observation is not implied by the hypotheses - if self.oracle.solve([self.selv] + [h for h, c in zip(self.rhypos, self.to_consider) if c]): - print(' no implication!') - print(self.oracle.get_model()) - sys.exit(1) - - if not smallest: - self.compute_minimal(prefer_ext=prefer_ext) - else: - self.compute_smallest() - - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.time - - expl = sorted([self.sel2fid[h] for h in self.rhypos]) - - if self.verbose: - self.preamble = [self.preamble[i] for i in expl] - print(' explanation: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.xgb.target_name[self.out_id])) - print(' # hypos left:', len(self.rhypos)) - print(' time: {0:.2f}'.format(self.time)) - - return expl - - def compute_minimal(self, prefer_ext=False): - """ - Compute any subset-minimal explanation. - """ - - i = 0 - - if not prefer_ext: - # here, we want to reduce external explanation - - # filtering out unnecessary features if external explanation is given - self.rhypos = [h for h, c in zip(self.rhypos, self.to_consider) if c] - else: - # here, we want to compute an explanation that is preferred - # to be similar to the given external one - # for that, we try to postpone removing features that are - # in the external explanation provided - - rhypos = [h for h, c in zip(self.rhypos, self.to_consider) if not c] - rhypos += [h for h, c in zip(self.rhypos, self.to_consider) if c] - self.rhypos = rhypos - - # simple deletion-based linear search - while i < len(self.rhypos): - to_test = self.rhypos[:i] + self.rhypos[(i + 1):] - - if self.oracle.solve([self.selv] + to_test): - i += 1 - else: - self.rhypos = to_test - - def compute_smallest(self): - """ - Compute a cardinality-minimal explanation. 
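compute_minimal above is the classic deletion-based linear search for a subset-minimal set. Abstracted away from the SMT oracle it fits in a few lines; still_entails is a placeholder for "the oracle is unsatisfiable under these hypotheses plus the forced misclassification":

```python
def minimize(hypos, still_entails):
    """Return a subset-minimal sublist of hypos that still entails."""
    i = 0
    while i < len(hypos):
        candidate = hypos[:i] + hypos[i + 1:]
        if still_entails(candidate):
            hypos = candidate   # hypothesis i was unnecessary, drop it
        else:
            i += 1              # hypothesis i is needed, keep it
    return hypos
```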
- """ - - # result - rhypos = [] - - with Hitman(bootstrap_with=[[i for i in range(len(self.rhypos)) if self.to_consider[i]]]) as hitman: - # computing unit-size MCSes - for i, hypo in enumerate(self.rhypos): - if self.to_consider[i] == False: - continue - - if self.oracle.solve([self.selv] + self.rhypos[:i] + self.rhypos[(i + 1):]): - hitman.hit([i]) - - # main loop - iters = 0 - while True: - hset = hitman.get() - iters += 1 - - if self.verbose > 1: - print('iter:', iters) - print('cand:', hset) - - if self.oracle.solve([self.selv] + [self.rhypos[i] for i in hset]): - to_hit = [] - satisfied, unsatisfied = [], [] - - removed = list(set(range(len(self.rhypos))).difference(set(hset))) - - model = self.oracle.get_model() - for h in removed: - i = self.sel2fid[self.rhypos[h]] - if '_' not in self.inps[i].symbol_name(): - # feature variable and its expected value - var, exp = self.inps[i], self.sample[i] - - # true value - true_val = float(model.get_py_value(var)) - - if not exp - 0.001 <= true_val <= exp + 0.001: - unsatisfied.append(h) - else: - hset.append(h) - else: - for vid in self.sel2vid[self.rhypos[h]]: - var, exp = self.inps[vid], int(self.sample[vid]) - - # true value - true_val = int(model.get_py_value(var)) - - if exp != true_val: - unsatisfied.append(h) - break - else: - hset.append(h) - - # computing an MCS (expensive) - for h in unsatisfied: - if self.oracle.solve([self.selv] + [self.rhypos[i] for i in hset] + [self.rhypos[h]]): - hset.append(h) - else: - to_hit.append(h) - - if self.verbose > 1: - print('coex:', to_hit) - - hitman.hit(to_hit) - else: - self.rhypos = [self.rhypos[i] for i in hset] - break diff --git a/pages/application/RandomForest/utils/xgbooster/preprocess.py b/pages/application/RandomForest/utils/xgbooster/preprocess.py deleted file mode 100644 index d66cc338fe6b232fff0155aea5055e7fe9241dac..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbooster/preprocess.py +++ /dev/null @@ -1,128 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## preprocess.py -## -## Created on: Jan 10, 2019 -## Author: Nina Narodytska -## E-mail: narodytska@vmware.com -## - -# -#============================================================================== -import json -import numpy as np -import xgboost as xgb -import math -import pandas as pd -import numpy as np -import sklearn -import pickle - - -# -#============================================================================== -def preprocess_dataset(raw_data_path, files, use_categ=True): - print("preprocess dataset from ", raw_data_path) - files = files.split(",") - - data_file = files[0] - dataset_name = files[1] - - categorical_features = [] - if use_categ: - try: - catcols = pd.read_csv(raw_data_path + data_file + ".catcol", header = None) - categorical_features = np.concatenate(catcols.values).tolist() - - print(categorical_features) - except Exception as e: - print("Please provide info about categorical columns/original datasets or omit option -p", e) - exit() - - try: - data_raw = pd.read_csv(raw_data_path + data_file, sep=',', na_values= ['']) - #catcols = pd.read_csv(raw_data_path + data_file + ".catcol", header = None) - #categorical_features = np.concatenate(catcols.values).tolist() - - - for i in range(len(data_raw.values[0])): - if i in categorical_features: - data_raw.fillna('',inplace=True) - else: - data_raw.fillna(0,inplace=True) - dataset_all = data_raw - dataset = dataset_all.values.copy() - - print(categorical_features) - except Exception as e: - print("Please 
provide info about categorical columns/original datasets or omit option -p", e) - exit() - - # move categrorical columns forward - - feature_names = dataset_all.columns - print(feature_names) - - ############################## - extra_info = {} - categorical_names = {} - print(categorical_features) - dataset_new = dataset_all.values.copy() - for feature in categorical_features: - print("feature", feature) - print(dataset[:, feature]) - le = sklearn.preprocessing.LabelEncoder() - le.fit(dataset[:, feature]) - categorical_names[feature] = le.classes_ - dataset_new[:, feature] = le.transform(dataset[:, feature]) - - ###################################3 - # target as categorical - labels_new = [] - - le = sklearn.preprocessing.LabelEncoder() - le.fit(dataset[:, -1]) - dataset_new[:, -1]= le.transform(dataset[:, -1]) - class_names = le.classes_ - ######################################33 - - - if (False): - dataset_new = np.delete(dataset_new, -1, axis=1) - oneencoder = sklearn.preprocessing.OneHotEncoder() - oneencoder.fit(dataset_new[:, categorical_features]) - print(oneencoder.categories_) - n_transformed_features = sum([len(cats) for cats in oneencoder.categories_]) - print(n_transformed_features) - print(dataset_new.shape) - X = dataset_new[:,categorical_features][0] - print(X) - x = np.expand_dims(X, axis=0) - print("x", x, x.shape) - y = dataset_new[0].copy() - print(y.shape, oneencoder.transform(x).shape) - y[categorical_features] = oneencoder.transform(x).toarray() - - print("y", y, y.shape) - - z = oneencoder.inverse_transform(y) - print(z.shape) - exit() - - ###########################################################################3 - extra_info = {"categorical_features": categorical_features, - "categorical_names": categorical_names, - "feature_names": feature_names, - "class_names": class_names} - - new_file_train = raw_data_path + dataset_name + '_data.csv' - df = pd.DataFrame(data=dataset_new) - df.columns = list(feature_names) - df.to_csv(new_file_train, mode = 'w', index=False) - print("new dataset", new_file_train) - - - f = open(raw_data_path + dataset_name + '_data.csv.pkl', "wb") - pickle.dump(extra_info, f) - f.close() diff --git a/pages/application/RandomForest/utils/xgbooster/tree.py b/pages/application/RandomForest/utils/xgbooster/tree.py deleted file mode 100644 index ebaf24dbf4672e985943f242f238dad8b841e604..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbooster/tree.py +++ /dev/null @@ -1,196 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## tree.py (reuses parts of the code of SHAP) -## -## Created on: Dec 7, 2018 -## Author: Nina Narodytska -## E-mail: narodytska@vmware.com -## - -# -#============================================================================== -from anytree import Node, RenderTree,AsciiStyle -import json -import numpy as np -import xgboost as xgb -import math - - -# -#============================================================================== -class xgnode(Node): - def __init__(self, id, parent = None): - Node.__init__(self, id, parent) - self.id = id # The node value - self.name = None - self.left_node_id = -1 # Left child - self.right_node_id = -1 # Right child - self.missing_node_id = -1 - - self.feature = -1 - self.threshold = -1 - - self.cover = -1 - self.values = -1 - - def __str__(self): - pref = ' ' * self.depth - if (len(self.children) == 0): - return (pref+ "leaf: {} {}".format(self.id, self.values)) - else: - if(self.name is None): - return (pref+ "{} f{}<{}".format(self.id, 
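The core of the deleted preprocess_dataset is per-column label encoding. A compact sketch of the same transformation on a pandas frame (function and argument names are illustrative):

```python
import pandas as pd
import sklearn.preprocessing

def encode_categoricals(df: pd.DataFrame, categorical_columns):
    """Map each categorical column to integer codes; keep the class names."""
    out, categorical_names = df.copy(), {}
    for col in categorical_columns:
        le = sklearn.preprocessing.LabelEncoder()
        out[col] = le.fit_transform(df[col].astype(str))
        categorical_names[col] = le.classes_
    return out, categorical_names
```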
self.feature, self.threshold)) - else: - return (pref+ "{} \"{}\"<{}".format(self.id, self.name, self.threshold)) - - -# -#============================================================================== -def build_tree(json_tree, node = None, feature_names = None, inverse = False): - def max_id(node): - if "children" in node: - return max(node["nodeid"], *[max_id(n) for n in node["children"]]) - else: - return node["nodeid"] - m = max_id(json_tree) + 1 - def extract_data(json_node, root = None, feature_names = None): - i = json_node["nodeid"] - if (root is None): - node = xgnode(i) - else: - node = xgnode(i, parent = root) - node.cover = json_node["cover"] - if "children" in json_node: - - node.left_node_id = json_node["yes"] - node.right_node_id = json_node["no"] - node.missing_node_id = json_node["missing"] - node.feature = json_node["split"] - if (feature_names is not None): - node.name = feature_names[node.feature] - node.threshold = json_node["split_condition"] - for c, n in enumerate(json_node["children"]): - child = extract_data(n, node, feature_names) - elif "leaf" in json_node: - node.values = json_node["leaf"] - if(inverse): - node.values = -node.values - return node - - root = extract_data(json_tree, None, feature_names) - return root - -# -#============================================================================== -def walk_tree(node): - if (len(node.children) == 0): - # leaf - print(node) - else: - print(node) - walk_tree(node.children[0]) - walk_tree(node.children[1]) - -def count_nodes(root): - def count(node): - if len(node.children): - return sum([1+count(n) for n in node.children]) - else: - return 0 - m = count(root) + 1 - return m - -# -#============================================================================== -def scores_tree(node, sample): - if (len(node.children) == 0): - # leaf - return node.values - else: - feature_branch = node.feature - sample_value = sample[feature_branch] - assert(sample_value is not None) - if(sample_value < node.threshold): - return scores_tree(node.children[0], sample) - else: - return scores_tree(node.children[1], sample) - - -# -#============================================================================== -class TreeEnsemble: - """ An ensemble of decision trees. - - This object provides a common interface to many different types of models. 
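build_tree and scores_tree above rebuild each tree from the booster's JSON dump and then follow one root-to-leaf path per sample. The same traversal can be done directly on the parsed JSON. A sketch that assumes the dump names splits 'f0', 'f1', ... (which get_xgboost_json below enforces by clearing feature_names before dumping); this is an assumption about the dump format, not a drop-in replacement:

```python
import json

def score_json_tree(node, sample):
    """Follow one root-to-leaf path of a dumped XGBoost tree."""
    if 'leaf' in node:
        return node['leaf']
    fidx = int(str(node['split']).lstrip('f'))   # 'f3' -> 3
    target = node['yes'] if sample[fidx] < node['split_condition'] else node['no']
    child = next(c for c in node['children'] if c['nodeid'] == target)
    return score_json_tree(child, sample)

# trees = [json.loads(t) for t in booster.get_dump(with_stats=True, dump_format="json")]
```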
- """ - def __init__(self, model, feature_names = None, nb_classes = 0): - self.model_type = "xgboost" - self.original_model = model.get_booster() - self.base_offset = None - json_trees = get_xgboost_json(self.original_model) - self.trees = [build_tree(json.loads(t), None, feature_names) for t in json_trees] - if(nb_classes == 2): - # NASTY trick for binary - # We change signs of values in leaves so that we can just sum all the values in leaves for class X - # and take max to get the right class - self.otrees = [build_tree(json.loads(t), None, feature_names, inverse = True) for t in json_trees] - self.itrees = [build_tree(json.loads(t), None, feature_names) for t in json_trees] - self.trees = [] - for i,_ in enumerate(self.otrees): - self.trees.append(self.otrees[i]) - self.trees.append(self.itrees[i]) - self.feature_names = feature_names - self.sz = sum([count_nodes(dt) for dt in self.trees]) - def print_tree(self): - for i,t in enumerate(self.trees): - print("tree number: ", i) - walk_tree(t) - - def invert_tree_prob(self, node): - if (len(node.children) == 0): - node.values = -node.values - return node - else: - self.invert_tree_prob(node.children[0]) - self.invert_tree_prob(node.children[1]) - return node - def predict(self, samples, nb_classes): - # https://github.com/dmlc/xgboost/issues/1746#issuecomment-290130695 - prob = [] - for sample in np.asarray(samples): - scores = [] - for i,t in enumerate(self.trees): - s = scores_tree(t, sample) - scores.append((s)) - scores = np.asarray(scores) - class_scores = [] - if (nb_classes == 2): - - for i in range(nb_classes): - class_scores.append(math.exp(-(scores[i::nb_classes]).sum())) # swap signs back as we had to use this trick in the contractor - s0 = class_scores[0] - s1 = class_scores[1] - v0 = 1/(1 + s0) - v1 = 1/(1 + s1) - class_scores[0] = v0 - class_scores[1] = v1 - else: - for i in range(nb_classes): - class_scores.append(math.exp((scores[i::nb_classes]).sum())) - class_scores = np.asarray(class_scores) - prob.append(class_scores/class_scores.sum()) - return np.asarray(prob).reshape((-1, nb_classes)) - - -# -#============================================================================== -def get_xgboost_json(model): - """ REUSED FROM SHAP - This gets a JSON dump of an XGBoost model while ensuring the feature names are their indexes. 
- """ - fnames = model.feature_names - model.feature_names = None - json_trees = model.get_dump(with_stats=True, dump_format="json") - model.feature_names = fnames - return json_trees diff --git a/pages/application/RandomForest/utils/xgbooster/validate.py b/pages/application/RandomForest/utils/xgbooster/validate.py deleted file mode 100644 index c3c6f82a6f241a7f158ead0694b1718893d89c5c..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbooster/validate.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## validate.py -## -## Created on: Jan 4, 2019 -## Author: Alexey Ignatiev -## E-mail: aignatiev@ciencias.ulisboa.pt -## - -# -#============================================================================== -from __future__ import print_function -import getopt -import numpy as np -import os -from pysat.formula import IDPool -from pysmt.shortcuts import Solver -from pysmt.shortcuts import And, BOOL, Implies, Not, Or, Symbol -from pysmt.shortcuts import Equals, GE, GT, LE, LT, Real, REAL -import resource -from six.moves import range -import sys - - -# -#============================================================================== -class SMTValidator(object): - """ - Validating Anchor's explanations using SMT solving. - """ - - def __init__(self, formula, feats, nof_classes, xgb): - """ - Constructor. - """ - - self.ftids = {f: i for i, f in enumerate(feats)} - self.nofcl = nof_classes - self.idmgr = IDPool() - self.optns = xgb.options - - # xgbooster will also be needed - self.xgb = xgb - - self.verbose = self.optns.verb - self.oracle = Solver(name=self.xgb.options.solver) - - self.inps = [] # input (feature value) variables - for f in self.xgb.extended_feature_names_as_array_strings: - if '_' not in f: - self.inps.append(Symbol(f, typename=REAL)) - else: - self.inps.append(Symbol(f, typename=BOOL)) - - self.outs = [] # output (class score) variables - for c in range(self.nofcl): - self.outs.append(Symbol('class{0}_score'.format(c), typename=REAL)) - - # theory - self.oracle.add_assertion(formula) - - # current selector - self.selv = None - - def prepare(self, sample, expl): - """ - Prepare the oracle for validating an explanation given a sample. 
- """ - - if self.selv: - # disable the previous assumption if any - self.oracle.add_assertion(Not(self.selv)) - - # creating a fresh selector for a new sample - sname = ','.join([str(v).strip() for v in sample]) - - # the samples should not repeat; otherwise, they will be - # inconsistent with the previously introduced selectors - assert sname not in self.idmgr.obj2id, 'this sample has been considered before (sample {0})'.format(self.idmgr.id(sname)) - self.selv = Symbol('sample{0}_selv'.format(self.idmgr.id(sname)), typename=BOOL) - - self.rhypos = [] # relaxed hypotheses - - # transformed sample - self.sample = list(self.xgb.transform(sample)[0]) - - # preparing the selectors - for i, (inp, val) in enumerate(zip(self.inps, self.sample), 1): - feat = inp.symbol_name().split('_')[0] - selv = Symbol('selv_{0}'.format(feat)) - val = float(val) - - self.rhypos.append(selv) - - # adding relaxed hypotheses to the oracle - for inp, val, sel in zip(self.inps, self.sample, self.rhypos): - if '_' not in inp.symbol_name(): - hypo = Implies(self.selv, Implies(sel, Equals(inp, Real(float(val))))) - else: - hypo = Implies(self.selv, Implies(sel, inp if val else Not(inp))) - - self.oracle.add_assertion(hypo) - - # propagating the true observation - if self.oracle.solve([self.selv] + self.rhypos): - model = self.oracle.get_model() - else: - assert 0, 'Formula is unsatisfiable under given assumptions' - - # choosing the maximum - outvals = [float(model.get_py_value(o)) for o in self.outs] - maxoval = max(zip(outvals, range(len(outvals)))) - - # correct class id (corresponds to the maximum computed) - true_output = maxoval[1] - - # forcing a misclassification, i.e. a wrong observation - disj = [] - for i in range(len(self.outs)): - if i != true_output: - disj.append(GT(self.outs[i], self.outs[true_output])) - self.oracle.add_assertion(Implies(self.selv, Or(disj))) - - # removing all hypotheses except for those in the explanation - hypos = [] - for i, hypo in enumerate(self.rhypos): - j = self.ftids[self.xgb.transform_inverse_by_index(i)[0]] - if j in expl: - hypos.append(hypo) - self.rhypos = hypos - - if self.verbose: - inpvals = self.xgb.readable_sample(sample) - - preamble = [] - for f, v in zip(self.xgb.feature_names, inpvals): - if f not in v: - preamble.append('{0} = {1}'.format(f, v)) - else: - preamble.append(v) - - print(' explanation for: "IF {0} THEN {1}"'.format(' AND '.join(preamble), self.xgb.target_name[true_output])) - - def validate(self, sample, expl): - """ - Make an effort to show that the explanation is too optimistic. 
- """ - - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - # adapt the solver to deal with the current sample - self.prepare(sample, expl) - - # if satisfiable, then there is a counterexample - if self.oracle.solve([self.selv] + self.rhypos): - model = self.oracle.get_model() - inpvals = [float(model.get_py_value(i)) for i in self.inps] - outvals = [float(model.get_py_value(o)) for o in self.outs] - maxoval = max(zip(outvals, range(len(outvals)))) - - inpvals = self.xgb.transform_inverse(np.array(inpvals))[0] - self.coex = tuple([inpvals, maxoval[1]]) - inpvals = self.xgb.readable_sample(inpvals) - - if self.verbose: - preamble = [] - for f, v in zip(self.xgb.feature_names, inpvals): - if f not in v: - preamble.append('{0} = {1}'.format(f, v)) - else: - preamble.append(v) - - print(' explanation is incorrect') - print(' counterexample: "IF {0} THEN {1}"'.format(' AND '.join(preamble), self.xgb.target_name[maxoval[1]])) - else: - self.coex = None - - if self.verbose: - print(' explanation is correct') - - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.time - - if self.verbose: - print(' time: {0:.2f}'.format(self.time)) - - return self.coex diff --git a/pages/application/RandomForest/utils/xgbooster/xgbooster.py b/pages/application/RandomForest/utils/xgbooster/xgbooster.py deleted file mode 100644 index 25cd86ce6de6ec5e5f0836344354afa7f7b87d26..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbooster/xgbooster.py +++ /dev/null @@ -1,445 +0,0 @@ -#!/us/bin/env python -# -*- coding:utf-8 -*- -## -## xgbooster.py -## -## Created on: Dec 7, 2018 -## Author: Nina Narodytska, Alexey Ignatiev -## E-mail: narodytska@vmware.com, aignatiev@ciencias.ulisboa.pt -## - -# -# ============================================================================== -from __future__ import print_function -from .validate import SMTValidator -from .encode import SMTEncoder -from .explain import SMTExplainer -import numpy as np -import os -import resource -from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score -import sklearn -# print('The scikit-learn version is {}.'.format(sklearn.__version__)) - -from sklearn.preprocessing import OneHotEncoder, LabelEncoder -import sys -from six.moves import range -from .tree import TreeEnsemble -import xgboost as xgb -from xgboost import XGBClassifier, Booster -import pickle - - -# -# ============================================================================== -class XGBooster(object): - """ - The main class to train/encode/explain XGBoost models. - """ - - def __init__(self, options, from_data=None, from_model=None, - from_encoding=None): - """ - Constructor. 
- """ - - assert from_data or from_model or from_encoding, \ - 'At least one input file should be specified' - - self.init_stime = resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.init_ctime = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime - - # saving command-line options - self.options = options - self.seed = self.options.seed - np.random.seed(self.seed) - - if from_data: - self.use_categorical = self.options.use_categorical - # saving data - self.data = from_data - ## - samps = np.asarray(self.data.samps) - if not all(c.isnumeric() for c in samps[:, -1]): - le = LabelEncoder() - le.fit(samps[:, -1]) - samps[:, -1] = le.transform(samps[:, -1]) - # self.class_names = le.classes_ - # print(le.classes_) - ## - dataset = np.asarray(samps, dtype=np.float32) - # dataset = np.asarray(self.data.samps, dtype=np.float32) - - # split data into X and y - self.feature_names = self.data.names[:-1] - self.nb_features = len(self.feature_names) - - self.X = dataset[:, 0:self.nb_features] - self.Y = dataset[:, self.nb_features] - self.num_class = len(set(self.Y)) - self.target_name = list(range(self.num_class)) - - param_dist = {'n_estimators': self.options.n_estimators, - 'max_depth': self.options.maxdepth} - - if (self.num_class == 2): - param_dist['objective'] = 'binary:logistic' - - self.model = XGBClassifier(**param_dist) - - # split data into train and test sets - self.test_size = self.options.testsplit - if (self.test_size > 0): - self.X_train, self.X_test, self.Y_train, self.Y_test = \ - train_test_split(self.X, self.Y, test_size=self.test_size, - random_state=self.seed) - else: - self.X_train = self.X - self.X_test = [] # need a fix - self.Y_train = self.Y - self.Y_test = [] # need a fix - - # check if we have info about categorical features - if (self.use_categorical): - self.categorical_features = from_data.categorical_features - self.categorical_names = from_data.categorical_names - self.target_name = from_data.class_names - - #################################### - # this is a set of checks to make sure that we use the same as anchor encoding - cat_names = sorted(self.categorical_names.keys()) - assert (cat_names == self.categorical_features) - self.encoder = {} - for i in self.categorical_features: - self.encoder.update({i: OneHotEncoder(categories='auto', sparse=False)}) # , - self.encoder[i].fit(self.X[:, [i]]) - - else: - self.categorical_features = [] - self.categorical_names = [] - self.encoder = [] - - fname = from_data - - elif from_model: - fname = from_model - self.load_datainfo(from_model) - if (self.use_categorical is False) and (self.options.use_categorical is True): - print( - "Error: Note that the model is trained without categorical features info. 
Please do not use -c option for predictions") - exit() - # load model - - elif from_encoding: - fname = from_encoding - - # encoding, feature names, and number of classes - # are read from an input file - enc = SMTEncoder(None, None, None, self, from_encoding) - self.enc, self.intvs, self.imaps, self.ivars, self.feature_names, \ - self.num_class = enc.access() - - # create extra file names - try: - os.stat(options.output) - except: - os.mkdir(options.output) - - self.mapping_features() - ################# - self.test_encoding_transformes() - - bench_name = os.path.splitext(os.path.basename(options.files[0]))[0] - bench_dir_name = options.output + "/bt/" + bench_name - try: - os.stat(bench_dir_name) - except: - os.mkdir(bench_dir_name) - - self.basename = (os.path.join(bench_dir_name, bench_name + - "_nbestim_" + str(options.n_estimators) + - "_maxdepth_" + str(options.maxdepth) + - "_testsplit_" + str(options.testsplit))) - - data_suffix = '.splitdata.pkl' - self.modfile = self.basename + '.mod.pkl' - - self.mod_plainfile = self.basename + '.mod.txt' - - self.resfile = self.basename + '.res.txt' - self.encfile = self.basename + '.enc.txt' - self.expfile = self.basename + '.exp.txt' - - def load_datainfo(self, model_from_pkl, data_from_pkl): - self.model = XGBClassifier() - self.model = model_from_pkl - loaded_data = data_from_pkl - self.X = loaded_data["X"] - self.Y = loaded_data["Y"] - self.X_train = loaded_data["X_train"] - self.X_test = loaded_data["X_test"] - self.Y_train = loaded_data["Y_train"] - self.Y_test = loaded_data["Y_test"] - self.feature_names = loaded_data["feature_names"] - self.target_name = loaded_data["target_name"] - self.num_class = loaded_data["num_class"] - self.nb_features = len(self.feature_names) - self.categorical_features = loaded_data["categorical_features"] - self.categorical_names = loaded_data["categorical_names"] - self.encoder = loaded_data["encoder"] - self.use_categorical = loaded_data["use_categorical"] - - def train(self, outfile=None): - """ - Train a tree ensemble using XGBoost. - """ - - return self.build_xgbtree(outfile) - - def encode(self, test_on=None): - """ - Encode a tree ensemble trained previously. - """ - - encoder = SMTEncoder(self.model, self.feature_names, self.num_class, self) - self.enc, self.intvs, self.imaps, self.ivars = encoder.encode() - - if test_on: - encoder.test_sample(np.array(test_on)) - - # encoder.save_to(self.encfile) - - def explain(self, sample, use_lime=None, use_anchor=None, use_shap=None, - expl_ext=None, prefer_ext=False, nof_feats=5): - """ - Explain a prediction made for a given sample with a previously - trained tree ensemble. - """ - - if use_lime: - expl = use_lime(self, sample=sample, nb_samples=5, - nb_features_in_exp=nof_feats) - elif use_anchor: - expl = use_anchor(self, sample=sample, nb_samples=5, - nb_features_in_exp=nof_feats, threshold=0.95) - elif use_shap: - expl = use_shap(self, sample=sample, nb_features_in_exp=nof_feats) - else: - if 'x' not in dir(self): - self.x = SMTExplainer(self.enc, self.intvs, self.imaps, - self.ivars, self.feature_names, self.num_class, - self.options, self) - - expl = self.x.explain(np.array(sample), self.options.smallest, - expl_ext, prefer_ext) - - # returning the explanation - return expl - - def validate(self, sample, expl): - """ - Make an attempt to show that a given explanation is optimistic. 
- """ - - # there must exist an encoding - if 'enc' not in dir(self): - encoder = SMTEncoder(self.model, self.feature_names, self.num_class, - self) - self.enc, _, _, _ = encoder.encode() - - if 'v' not in dir(self): - self.v = SMTValidator(self.enc, self.feature_names, self.num_class, - self) - - # try to compute a counterexample - return self.v.validate(np.array(sample), expl) - - def transform(self, x): - if (len(x) == 0): - return x - if (len(x.shape) == 1): - x = np.expand_dims(x, axis=0) - if (self.use_categorical): - assert (self.encoder != []) - tx = [] - for i in range(self.nb_features): - self.encoder[i].drop = None - if (i in self.categorical_features): - tx_aux = self.encoder[i].transform(x[:, [i]]) - tx_aux = np.vstack(tx_aux) - tx.append(tx_aux) - else: - tx.append(x[:, [i]]) - tx = np.hstack(tx) - return tx - else: - return x - - def transform_inverse(self, x): - if (len(x) == 0): - return x - if (len(x.shape) == 1): - x = np.expand_dims(x, axis=0) - if (self.use_categorical): - assert (self.encoder != []) - inverse_x = [] - for i, xi in enumerate(x): - inverse_xi = np.zeros(self.nb_features) - for f in range(self.nb_features): - if f in self.categorical_features: - nb_values = len(self.categorical_names[f]) - v = xi[:nb_values] - v = np.expand_dims(v, axis=0) - iv = self.encoder[f].inverse_transform(v) - inverse_xi[f] = iv - xi = xi[nb_values:] - - else: - inverse_xi[f] = xi[0] - xi = xi[1:] - inverse_x.append(inverse_xi) - return inverse_x - else: - return x - - def transform_inverse_by_index(self, idx): - if (idx in self.extended_feature_names): - return self.extended_feature_names[idx] - else: - print("Warning there is no feature {} in the internal mapping".format(idx)) - return None - - def transform_by_value(self, feat_value_pair): - if (feat_value_pair in self.extended_feature_names.values()): - keys = ( - list(self.extended_feature_names.keys())[list(self.extended_feature_names.values()).index(feat_value_pair)]) - return keys - else: - print("Warning there is no value {} in the internal mapping".format(feat_value_pair)) - return None - - def mapping_features(self): - self.extended_feature_names = {} - self.extended_feature_names_as_array_strings = [] - counter = 0 - if (self.use_categorical): - for i in range(self.nb_features): - if (i in self.categorical_features): - for j, _ in enumerate(self.encoder[i].categories_[0]): - self.extended_feature_names.update({counter: (self.feature_names[i], j)}) - self.extended_feature_names_as_array_strings.append( - "f{}_{}".format(i, j)) # str(self.feature_names[i]), j)) - counter = counter + 1 - else: - self.extended_feature_names.update({counter: (self.feature_names[i], None)}) - self.extended_feature_names_as_array_strings.append("f{}".format(i)) # (self.feature_names[i]) - counter = counter + 1 - else: - for i in range(self.nb_features): - self.extended_feature_names.update({counter: (self.feature_names[i], None)}) - self.extended_feature_names_as_array_strings.append("f{}".format(i)) # (self.feature_names[i]) - counter = counter + 1 - - def readable_sample(self, x): - readable_x = [] - for i, v in enumerate(x): - if (i in self.categorical_features): - readable_x.append(self.categorical_names[i][int(v)]) - else: - readable_x.append(v) - return np.asarray(readable_x) - - def test_encoding_transformes(self): - # test encoding - - X = self.X_train[[0], :] - - print("Sample of length", len(X[0]), " : ", X) - enc_X = self.transform(X) - print("Encoded sample of length", len(enc_X[0]), " : ", enc_X) - inv_X = 
self.transform_inverse(enc_X) - print("Back to sample", inv_X) - print("Readable sample", self.readable_sample(inv_X[0])) - assert ((inv_X == X).all()) - - if (self.options.verb > 1): - for i in range(len(self.extended_feature_names)): - print(i, self.transform_inverse_by_index(i)) - for key, value in self.extended_feature_names.items(): - print(value, self.transform_by_value(value)) - - def transfomed_sample_info(self, i): - print(enc.categories_) - - def build_xgbtree(self, outfile=None): - """ - Build an ensemble of trees. - """ - - if (outfile is None): - outfile = self.modfile - else: - self.datafile = sefl.form_datefile_name(outfile) - - # fit model no training data - - if (len(self.X_test) > 0): - eval_set = [(self.transform(self.X_train), self.Y_train), (self.transform(self.X_test), self.Y_test)] - else: - eval_set = [(self.transform(self.X_train), self.Y_train)] - - print("start xgb") - self.model.fit(self.transform(self.X_train), self.Y_train, - eval_set=eval_set, - verbose=self.options.verb) # eval_set=[(X_test, Y_test)], - print("end xgb") - - evals_result = self.model.evals_result() - ########## saving model - self.save_datainfo(outfile) - print("saving plain model to ", self.mod_plainfile) - self.model._Booster.dump_model(self.mod_plainfile) - - ensemble = TreeEnsemble(self.model, self.extended_feature_names_as_array_strings, nb_classes=self.num_class) - - y_pred_prob = self.model.predict_proba(self.transform(self.X_train[:10])) - y_pred_prob_compute = ensemble.predict(self.transform(self.X_train[:10]), self.num_class) - - assert (np.absolute(y_pred_prob_compute - y_pred_prob).sum() < 0.01 * len(y_pred_prob)) - - ### accuracy - try: - train_accuracy = round(1 - evals_result['validation_0']['merror'][-1], 2) - except: - try: - train_accuracy = round(1 - evals_result['validation_0']['error'][-1], 2) - except: - assert (False) - - try: - test_accuracy = round(1 - evals_result['validation_1']['merror'][-1], 2) - except: - try: - test_accuracy = round(1 - evals_result['validation_1']['error'][-1], 2) - except: - print("no results test data") - test_accuracy = 0 - - #### saving - print("saving results to ", self.resfile) - with open(self.resfile, 'w') as f: - f.write("{} & {} & {} &{} &{} & {} \\\\ \n \hline \n".format( - os.path.basename(self.options.files[0]).replace("_", "-"), - train_accuracy, - test_accuracy, - self.options.n_estimators, - self.options.maxdepth, - self.options.testsplit)) - f.close() - - print("c BT sz:", ensemble.sz) - print("Train accuracy: %.2f%%" % (train_accuracy * 100.0)) - print("Test accuracy: %.2f%%" % (test_accuracy * 100.0)) - - return train_accuracy, test_accuracy, self.model diff --git a/pages/application/RandomForest/utils/xgbrf/__init__.py b/pages/application/RandomForest/utils/xgbrf/__init__.py deleted file mode 100644 index 7cf92d3ca20939258ac326649b20fc5c0a79abb7..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbrf/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .encode import * -from .tree import * -from .xgb_rf import * -from .preprocess import * \ No newline at end of file diff --git a/pages/application/RandomForest/utils/xgbrf/encode.py b/pages/application/RandomForest/utils/xgbrf/encode.py deleted file mode 100644 index 6a77fb3afb792f0cd15276c11e03b4b4005f5109..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbrf/encode.py +++ /dev/null @@ -1,363 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## encode.py -## -## Created on: Dec 7, 2018 -## Author: 
Alexey Ignatiev -## E-mail: aignatiev@ciencias.ulisboa.pt -## - -# -#============================================================================== -from __future__ import print_function -import collections -from pysat.formula import IDPool -from pysmt.smtlib.parser import SmtLibParser -from pysmt.shortcuts import And, BOOL, Iff, Implies, Not, Or, Symbol, get_model -from pysmt.shortcuts import Equals, ExactlyOne, LT, Plus, REAL, Real, write_smtlib -from .tree import TreeEnsemble, scores_tree -import six -from six.moves import range - -try: # for Python2 - from cStringIO import StringIO -except ImportError: # for Python3 - from io import StringIO - - -# -#============================================================================== -class SMTEncoder(object): - """ - Encoder of XGBoost tree ensembles into SMT. - """ - - def __init__(self, model, feats, nof_classes, xgb, from_file=None): - """ - Constructor. - """ - - self.model = model - self.feats = {f: i for i, f in enumerate(feats)} - self.nofcl = nof_classes - self.idmgr = IDPool() - self.optns = xgb.options - - # xgbooster will also be needed - self.xgb = xgb - - # for interval-based encoding - self.intvs, self.imaps, self.ivars = None, None, None - - if from_file: - self.load_from(from_file) - - def traverse(self, tree, tvar, prefix=[]): - """ - Traverse a tree and encode each node. - """ - - if tree.children: - pos, neg = self.encode_node(tree) - - self.traverse(tree.children[0], tvar, prefix + [pos]) - self.traverse(tree.children[1], tvar, prefix + [neg]) - else: # leaf node - if prefix: - self.enc.append(Implies(And(prefix), Equals(tvar, Real(tree.values)))) - else: - self.enc.append(Equals(tvar, Real(tree.values))) - - def encode_node(self, node): - """ - Encode a node of a tree. - """ - - if '_' not in node.name: - # continuous features => expecting an upper bound - # feature and its upper bound (value) - f, v = node.name, node.threshold - - existing = True if tuple([f, v]) in self.idmgr.obj2id else False - vid = self.idmgr.id(tuple([f, v])) - bv = Symbol('bvar{0}'.format(vid), typename=BOOL) - - if not existing: - if self.intvs: - d = self.imaps[f][v] + 1 - pos, neg = self.ivars[f][:d], self.ivars[f][d:] - self.enc.append(Iff(bv, Or(pos))) - self.enc.append(Iff(Not(bv), Or(neg))) - else: - fvar, fval = Symbol(f, typename=REAL), Real(v) - self.enc.append(Iff(bv, LT(fvar, fval))) - - return bv, Not(bv) - else: - # all features are expected to be categorical and - # encoded with one-hot encoding into Booleans - # each node is expected to be of the form: f_i < 0.5 - bv = Symbol(node.name, typename=BOOL) - - # left branch is positive, i.e. bv is true - # right branch is negative, i.e. bv is false - return Not(bv), bv - - def compute_intervals(self): - """ - Traverse all trees in the ensemble and extract intervals for each - feature. - - At this point, the method only works for numerical datasets! - """ - - def traverse_intervals(tree): - """ - Auxiliary function. Recursive tree traversal. 
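    What the enclosing compute_intervals() builds out of the collected
    thresholds, as a standalone sketch (toy thresholds for a single
    feature 'f0'):

        thresholds = {"f0": {5.0, 2.5}}
        intvs = {f: sorted(vs) + ["+"] for f, vs in thresholds.items()}
        imaps = {f: {ub: i for i, ub in enumerate(ubs)}
                 for f, ubs in intvs.items()}

        # '+' is the open upper end: (-inf, 2.5), [2.5, 5.0), [5.0, +inf)
        assert intvs["f0"] == [2.5, 5.0, "+"]
        assert imaps["f0"][5.0] == 1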
- """ - - if tree.children: - f = tree.name - v = tree.threshold - self.intvs[f].add(v) - - traverse_intervals(tree.children[0]) - traverse_intervals(tree.children[1]) - - # initializing the intervals - self.intvs = {'f{0}'.format(i): set([]) for i in range(len(self.feats))} - - for tree in self.ensemble.trees: - traverse_intervals(tree) - - # OK, we got all intervals; let's sort the values - self.intvs = {f: sorted(self.intvs[f]) + ['+'] for f in six.iterkeys(self.intvs)} - - self.imaps, self.ivars = {}, {} - for feat, intvs in six.iteritems(self.intvs): - self.imaps[feat] = {} - self.ivars[feat] = [] - for i, ub in enumerate(intvs): - self.imaps[feat][ub] = i - - ivar = Symbol(name='{0}_intv{1}'.format(feat, i), typename=BOOL) - self.ivars[feat].append(ivar) - - def encode(self): - """ - Do the job. - """ - - self.enc = [] - - # getting a tree ensemble - self.ensemble = TreeEnsemble(self.model, - self.xgb.extended_feature_names_as_array_strings, - nb_classes=self.nofcl) - - # introducing class score variables - csum = [] - for j in range(self.nofcl): - cvar = Symbol('class{0}_score'.format(j), typename=REAL) - csum.append(tuple([cvar, []])) - - # if targeting interval-based encoding, - # traverse all trees and extract all possible intervals - # for each feature - if self.optns.encode == 'smtbool': - self.compute_intervals() - - # traversing and encoding each tree - for i, tree in enumerate(self.ensemble.trees): - # getting class id - clid = i % self.nofcl - - # encoding the tree - tvar = Symbol('tr{0}_score'.format(i + 1), typename=REAL) - self.traverse(tree, tvar, prefix=[]) - - # this tree contributes to class with clid - csum[clid][1].append(tvar) - - # encoding the sums - for pair in csum: - cvar, tvars = pair - self.enc.append(Equals(cvar, Plus(tvars))) - - # enforce exactly one of the feature values to be chosen - # (for categorical features) - categories = collections.defaultdict(lambda: []) - for f in self.xgb.extended_feature_names_as_array_strings: - if '_' in f: - categories[f.split('_')[0]].append(Symbol(name=f, typename=BOOL)) - for c, feats in six.iteritems(categories): - self.enc.append(ExactlyOne(feats)) - - # number of assertions - nof_asserts = len(self.enc) - - # making conjunction - self.enc = And(self.enc) - - # number of variables - nof_vars = len(self.enc.get_free_variables()) - - if self.optns.verb: - print('encoding vars:', nof_vars) - print('encoding asserts:', nof_asserts) - - return self.enc, self.intvs, self.imaps, self.ivars - - def test_sample(self, sample): - """ - Check whether or not the encoding "predicts" the same class - as the classifier given an input sample. 
- """ - - # first, compute the scores for all classes as would be - # predicted by the classifier - - # score arrays computed for each class - csum = [[] for c in range(self.nofcl)] - - if self.optns.verb: - print('testing sample:', list(sample)) - - sample_internal = list(self.xgb.transform(sample)[0]) - - # traversing all trees - for i, tree in enumerate(self.ensemble.trees): - # getting class id - clid = i % self.nofcl - - # a score computed by the current tree - score = scores_tree(tree, sample_internal) - - # this tree contributes to class with clid - csum[clid].append(score) - - # final scores for each class - cscores = [sum(scores) for scores in csum] - - # second, get the scores computed with the use of the encoding - - # asserting the sample - hypos = [] - - if not self.intvs: - for i, fval in enumerate(sample_internal): - feat, vid = self.xgb.transform_inverse_by_index(i) - fid = self.feats[feat] - - if vid == None: - fvar = Symbol('f{0}'.format(fid), typename=REAL) - hypos.append(Equals(fvar, Real(float(fval)))) - else: - fvar = Symbol('f{0}_{1}'.format(fid, vid), typename=BOOL) - if int(fval) == 1: - hypos.append(fvar) - else: - hypos.append(Not(fvar)) - else: - for i, fval in enumerate(sample_internal): - feat, _ = self.xgb.transform_inverse_by_index(i) - feat = 'f{0}'.format(self.feats[feat]) - - # determining the right interval and the corresponding variable - for ub, fvar in zip(self.intvs[feat], self.ivars[feat]): - if ub == '+' or fval < ub: - hypos.append(fvar) - break - else: - assert 0, 'No proper interval found for {0}'.format(feat) - - # now, getting the model - escores = [] - model = get_model(And(self.enc, *hypos), solver_name=self.optns.solver) - for c in range(self.nofcl): - v = Symbol('class{0}_score'.format(c), typename=REAL) - escores.append(float(model.get_py_value(v))) - - assert all(map(lambda c, e: abs(c - e) <= 0.001, cscores, escores)), \ - 'wrong prediction: {0} vs {1}'.format(cscores, escores) - - if self.optns.verb: - print('xgb scores:', cscores) - print('enc scores:', escores) - - def save_to(self, outfile): - """ - Save the encoding into a file with a given name. - """ - - if outfile.endswith('.txt'): - outfile = outfile[:-3] + 'smt2' - - write_smtlib(self.enc, outfile) - - # appending additional information - with open(outfile, 'r') as fp: - contents = fp.readlines() - - # comments - comments = ['; features: {0}\n'.format(', '.join(self.feats)), - '; classes: {0}\n'.format(self.nofcl)] - - if self.intvs: - for f in self.xgb.extended_feature_names_as_array_strings: - c = '; i {0}: '.format(f) - c += ', '.join(['{0}<->{1}'.format(u, v) for u, v in zip(self.intvs[f], self.ivars[f])]) - comments.append(c + '\n') - - contents = comments + contents - with open(outfile, 'w') as fp: - fp.writelines(contents) - - def load_from(self, infile): - """ - Loads the encoding from an input file. 
- """ - - with open(infile, 'r') as fp: - file_content = fp.readlines() - - # empty intervals for the standard encoding - self.intvs, self.imaps, self.ivars = {}, {}, {} - - for line in file_content: - if line[0] != ';': - break - elif line.startswith('; i '): - f, arr = line[4:].strip().split(': ', 1) - f = f.replace('-', '_') - self.intvs[f], self.imaps[f], self.ivars[f] = [], {}, [] - - for i, pair in enumerate(arr.split(', ')): - ub, symb = pair.split('<->') - - if ub[0] != '+': - ub = float(ub) - symb = Symbol(symb, typename=BOOL) - - self.intvs[f].append(ub) - self.ivars[f].append(symb) - self.imaps[f][ub] = i - - elif line.startswith('; features:'): - self.feats = line[11:].strip().split(', ') - elif line.startswith('; classes:'): - self.nofcl = int(line[10:].strip()) - - parser = SmtLibParser() - script = parser.get_script(StringIO(''.join(file_content))) - - self.enc = script.get_last_formula() - - def access(self): - """ - Get access to the encoding, features names, and the number of - classes. - """ - - return self.enc, self.intvs, self.imaps, self.ivars, self.feats, self.nofcl diff --git a/pages/application/RandomForest/utils/xgbrf/explain.py b/pages/application/RandomForest/utils/xgbrf/explain.py deleted file mode 100644 index 7487f0eb683ad30b5c19546a5eb5d8db9009b28e..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbrf/explain.py +++ /dev/null @@ -1,312 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## explain.py -## -## Created on: Dec 14, 2018 -## Author: Alexey Ignatiev -## E-mail: aignatiev@ciencias.ulisboa.pt -## - -# -#============================================================================== -from __future__ import print_function -import numpy as np -import os -from pysat.examples.hitman import Hitman -from pysat.formula import IDPool -from pysmt.shortcuts import Solver -from pysmt.shortcuts import And, BOOL, Implies, Not, Or, Symbol -from pysmt.shortcuts import Equals, GT, Int, Real, REAL -import resource -from six.moves import range -import sys - - -# -#============================================================================== -class SMTExplainer(object): - """ - An SMT-inspired minimal explanation extractor for XGBoost models. - """ - - def __init__(self, formula, intvs, imaps, ivars, feats, nof_classes, - options, xgb): - """ - Constructor. - """ - - self.feats = feats - self.intvs = intvs - self.imaps = imaps - self.ivars = ivars - self.nofcl = nof_classes - self.optns = options - self.idmgr = IDPool() - - # saving XGBooster - self.xgb = xgb - - self.verbose = self.optns.verb - self.oracle = Solver(name=options.solver) - - self.inps = [] # input (feature value) variables - for f in self.xgb.extended_feature_names_as_array_strings: - if '_' not in f: - self.inps.append(Symbol(f, typename=REAL)) - else: - self.inps.append(Symbol(f, typename=BOOL)) - - self.outs = [] # output (class score) variables - for c in range(self.nofcl): - self.outs.append(Symbol('class{0}_score'.format(c), typename=REAL)) - - # theory - self.oracle.add_assertion(formula) - - # current selector - self.selv = None - - def prepare(self, sample): - """ - Prepare the oracle for computing an explanation. 
- """ - - if self.selv: - # disable the previous assumption if any - self.oracle.add_assertion(Not(self.selv)) - - # creating a fresh selector for a new sample - sname = ','.join([str(v).strip() for v in sample]) - - # the samples should not repeat; otherwise, they will be - # inconsistent with the previously introduced selectors - assert sname not in self.idmgr.obj2id, 'this sample has been considered before (sample {0})'.format(self.idmgr.id(sname)) - self.selv = Symbol('sample{0}_selv'.format(self.idmgr.id(sname)), typename=BOOL) - - self.rhypos = [] # relaxed hypotheses - - # transformed sample - self.sample = list(self.xgb.transform(sample)[0]) - - self.sel2fid = {} # selectors to original feature ids - self.sel2vid = {} # selectors to categorical feature ids - - # preparing the selectors - for i, (inp, val) in enumerate(zip(self.inps, self.sample), 1): - feat = inp.symbol_name().split('_')[0] - selv = Symbol('selv_{0}'.format(feat)) - val = float(val) - - self.rhypos.append(selv) - if selv not in self.sel2fid: - self.sel2fid[selv] = int(feat[1:]) - self.sel2vid[selv] = [i - 1] - else: - self.sel2vid[selv].append(i - 1) - - # adding relaxed hypotheses to the oracle - if not self.intvs: - for inp, val, sel in zip(self.inps, self.sample, self.rhypos): - if '_' not in inp.symbol_name(): - hypo = Implies(self.selv, Implies(sel, Equals(inp, Real(float(val))))) - else: - hypo = Implies(self.selv, Implies(sel, inp if val else Not(inp))) - - self.oracle.add_assertion(hypo) - else: - for inp, val, sel in zip(self.inps, self.sample, self.rhypos): - inp = inp.symbol_name() - # determining the right interval and the corresponding variable - for ub, fvar in zip(self.intvs[inp], self.ivars[inp]): - if ub == '+' or val < ub: - hypo = Implies(self.selv, Implies(sel, fvar)) - break - - self.oracle.add_assertion(hypo) - - # in case of categorical data, there are selector duplicates - # and we need to remove them - self.rhypos = sorted(set(self.rhypos), key=lambda x: int(x.symbol_name()[6:])) - - # propagating the true observation - if self.oracle.solve([self.selv] + self.rhypos): - model = self.oracle.get_model() - else: - assert 0, 'Formula is unsatisfiable under given assumptions' - - # choosing the maximum - outvals = [float(model.get_py_value(o)) for o in self.outs] - maxoval = max(zip(outvals, range(len(outvals)))) - - # correct class id (corresponds to the maximum computed) - self.out_id = maxoval[1] - self.output = self.xgb.target_name[self.out_id] - - # forcing a misclassification, i.e. a wrong observation - disj = [] - for i in range(len(self.outs)): - if i != self.out_id: - disj.append(GT(self.outs[i], self.outs[self.out_id])) - self.oracle.add_assertion(Implies(self.selv, Or(disj))) - - if self.verbose: - inpvals = self.xgb.readable_sample(sample) - - self.preamble = [] - for f, v in zip(self.xgb.feature_names, inpvals): - if f not in v: - self.preamble.append('{0} = {1}'.format(f, v)) - else: - self.preamble.append(v) - - print('\n explaining: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.output)) - - def explain(self, sample, smallest, expl_ext=None, prefer_ext=False): - """ - Hypotheses minimization. 
- """ - - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - # adapt the solver to deal with the current sample - self.prepare(sample) - - # saving external explanation to be minimized further - if expl_ext == None or prefer_ext: - self.to_consider = [True for h in self.rhypos] - else: - eexpl = set(expl_ext) - self.to_consider = [True if i in eexpl else False for i, h in enumerate(self.rhypos)] - - # if satisfiable, then the observation is not implied by the hypotheses - if self.oracle.solve([self.selv] + [h for h, c in zip(self.rhypos, self.to_consider) if c]): - print(' no implication!') - print(self.oracle.get_model()) - sys.exit(1) - - if not smallest: - self.compute_minimal(prefer_ext=prefer_ext) - else: - self.compute_smallest() - - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.time - - expl = sorted([self.sel2fid[h] for h in self.rhypos]) - #print('expl >>>> : ', expl) - - if self.verbose: - self.preamble = [self.preamble[i] for i in expl] - print(' explanation: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.xgb.target_name[self.out_id])) - print(' # hypos left:', len(self.rhypos)) - print(' time: {0:.2f}'.format(self.time)) - - return expl - - def compute_minimal(self, prefer_ext=False): - """ - Compute any subset-minimal explanation. - """ - - i = 0 - - if not prefer_ext: - # here, we want to reduce external explanation - - # filtering out unnecessary features if external explanation is given - self.rhypos = [h for h, c in zip(self.rhypos, self.to_consider) if c] - else: - # here, we want to compute an explanation that is preferred - # to be similar to the given external one - # for that, we try to postpone removing features that are - # in the external explanation provided - - rhypos = [h for h, c in zip(self.rhypos, self.to_consider) if not c] - rhypos += [h for h, c in zip(self.rhypos, self.to_consider) if c] - self.rhypos = rhypos - - # simple deletion-based linear search - while i < len(self.rhypos): - to_test = self.rhypos[:i] + self.rhypos[(i + 1):] - - if self.oracle.solve([self.selv] + to_test): - i += 1 - else: - self.rhypos = to_test - - def compute_smallest(self): - """ - Compute a cardinality-minimal explanation. 
- """ - - # result - rhypos = [] - - with Hitman(bootstrap_with=[[i for i in range(len(self.rhypos)) if self.to_consider[i]]]) as hitman: - # computing unit-size MCSes - for i, hypo in enumerate(self.rhypos): - if self.to_consider[i] == False: - continue - - if self.oracle.solve([self.selv] + self.rhypos[:i] + self.rhypos[(i + 1):]): - hitman.hit([i]) - - # main loop - iters = 0 - while True: - hset = hitman.get() - iters += 1 - - if self.verbose > 1: - print('iter:', iters) - print('cand:', hset) - - if self.oracle.solve([self.selv] + [self.rhypos[i] for i in hset]): - to_hit = [] - satisfied, unsatisfied = [], [] - - removed = list(set(range(len(self.rhypos))).difference(set(hset))) - - model = self.oracle.get_model() - for h in removed: - i = self.sel2fid[self.rhypos[h]] - if '_' not in self.inps[i].symbol_name(): - # feature variable and its expected value - var, exp = self.inps[i], self.sample[i] - - # true value - true_val = float(model.get_py_value(var)) - - if not exp - 0.001 <= true_val <= exp + 0.001: - unsatisfied.append(h) - else: - hset.append(h) - else: - for vid in self.sel2vid[self.rhypos[h]]: - var, exp = self.inps[vid], int(self.sample[vid]) - - # true value - true_val = int(model.get_py_value(var)) - - if exp != true_val: - unsatisfied.append(h) - break - else: - hset.append(h) - - # computing an MCS (expensive) - for h in unsatisfied: - if self.oracle.solve([self.selv] + [self.rhypos[i] for i in hset] + [self.rhypos[h]]): - hset.append(h) - else: - to_hit.append(h) - - if self.verbose > 1: - print('coex:', to_hit) - - hitman.hit(to_hit) - else: - self.rhypos = [self.rhypos[i] for i in hset] - break diff --git a/pages/application/RandomForest/utils/xgbrf/pi_checker.py b/pages/application/RandomForest/utils/xgbrf/pi_checker.py deleted file mode 100644 index f3f0d33cbd6118a1d746ded0a478f6b4ba4a0d90..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbrf/pi_checker.py +++ /dev/null @@ -1,197 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## pi_checker.py -## -## Created on: -## Author: -## E-mail: -## - -# -#============================================================================== -from __future__ import print_function -import getopt -import numpy as np -import os -from pysat.formula import IDPool -from pysmt.shortcuts import Solver -from pysmt.shortcuts import And, BOOL, Implies, Not, Or, Symbol -from pysmt.shortcuts import Equals, GE, GT, LE, LT, Real, REAL -import resource -from six.moves import range -import sys - - -# -#============================================================================== -class SMTChecker(object): - """ - checking explanation if is a Prime Implicant using SMT solving. - """ - - def __init__(self, formula, feats, nof_classes, xgb): - """ - Constructor. 
- """ - - self.ftids = {f: i for i, f in enumerate(feats)} - self.nofcl = nof_classes - self.idmgr = IDPool() - self.optns = xgb.options - - # xgbooster will also be needed - self.xgb = xgb - - self.verbose = self.optns.verb - self.oracle = Solver(name=self.xgb.options.solver) - - self.inps = [] # input (feature value) variables - for f in self.xgb.extended_feature_names_as_array_strings: - if '_' not in f: - self.inps.append(Symbol(f, typename=REAL)) - else: - self.inps.append(Symbol(f, typename=BOOL)) - - self.outs = [] # output (class score) variables - for c in range(self.nofcl): - self.outs.append(Symbol('class{0}_score'.format(c), typename=REAL)) - - # theory - self.oracle.add_assertion(formula) - #print('+++++ ',len(self.oracle._assertion_stack)) - - # current selector - self.selv = None - - def prepare(self, sample, expl): - """ - Prepare the oracle for validating an explanation given a sample. - """ - - if self.selv: - # disable the previous assumption if any - self.oracle.add_assertion(Not(self.selv)) - - # creating a fresh selector for a new sample - sname = ','.join([str(v).strip() for v in sample]) - - # the samples should not repeat; otherwise, they will be - # inconsistent with the previously introduced selectors - assert sname not in self.idmgr.obj2id, 'this sample has been considered before (sample {0})'.format(self.idmgr.id(sname)) - self.selv = Symbol('sample{0}_selv'.format(self.idmgr.id(sname)), typename=BOOL) - - self.rhypos = [] # relaxed hypotheses - - # transformed sample - self.sample = list(self.xgb.transform(sample)[0]) - - # preparing the selectors - for i, (inp, val) in enumerate(zip(self.inps, self.sample), 1): - feat = inp.symbol_name().split('_')[0] - selv = Symbol('selv_{0}'.format(feat)) - val = float(val) - - self.rhypos.append(selv) - - - # adding relaxed hypotheses to the oracle - for inp, val, sel in zip(self.inps, self.sample, self.rhypos): - if '_' not in inp.symbol_name(): - hypo = Implies(self.selv, Implies(sel, Equals(inp, Real(float(val))))) - else: - hypo = Implies(self.selv, Implies(sel, inp if val else Not(inp))) - - self.oracle.add_assertion(hypo) - - # propagating the true observation - if self.oracle.solve([self.selv] + self.rhypos): - model = self.oracle.get_model() - else: - assert 0, 'Formula is unsatisfiable under given assumptions' - - # choosing the maximum - outvals = [float(model.get_py_value(o)) for o in self.outs] - maxoval = max(zip(outvals, range(len(outvals)))) - - # correct class id (corresponds to the maximum computed) - true_output = maxoval[1] - - # forcing a misclassification, i.e. 
a wrong observation - disj = [] - for i in range(len(self.outs)): - if i != true_output: - disj.append(GT(self.outs[i], self.outs[true_output])) - self.oracle.add_assertion(Implies(self.selv, Or(disj))) - - # removing all hypotheses except for those in the explanation - hypos = [] - for i, hypo in enumerate(self.rhypos): - j = self.ftids[self.xgb.transform_inverse_by_index(i)[0]] - if j in expl: - hypos.append(hypo) - self.rhypos = hypos - #print('assumps: ', self.rhypos) - #print('expl: ', expl) - - ''' - if self.verbose: - inpvals = self.xgb.readable_sample(sample) - - preamble = [] - for f, v in zip(self.xgb.feature_names, inpvals): - if f not in v: - preamble.append('{0} = {1}'.format(f, v)) - else: - preamble.append(v) - - print(' explanation for: "IF {0} THEN {1}"'.format(' AND '.join(preamble), self.xgb.target_name[true_output])) - ''' - #print('+++++ ',self.oracle._assertion_stack[len(self.oracle._assertion_stack)-1 : ]) - - def check(self, sample, expl): - """ - Check the explanation. - """ - - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - # adapt the solver to deal with the current sample - self.prepare(sample, expl) - - # if satisfiable, then there is a counterexample - if self.oracle.solve([self.selv] + self.rhypos): - print('\n explanation is incorrect') - #print(self.oracle.get_model()) - return False - else: - if self.verbose: - print('\n explanation is correct') - - # in case of categorical data, there are selector duplicates - # and we need to remove them - self.rhypos = sorted(set(self.rhypos), key=lambda x: int(x.symbol_name()[6:])) - #print(self.rhypos) - - i = 0 - # simple deletion-based linear search - while i < len(self.rhypos): - to_test = self.rhypos[:i] + self.rhypos[(i + 1):] - #print(self.rhypos[i]) - - if self.oracle.solve([self.selv] + to_test): - i += 1 - else: - print(' explanation is not a prime implicant') - return False - - - print(' explanation is a prime implicant') - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.time - - if self.verbose: - print(' time: {0:.2f}'.format(self.time)) - - return True diff --git a/pages/application/RandomForest/utils/xgbrf/preprocess.py b/pages/application/RandomForest/utils/xgbrf/preprocess.py deleted file mode 100644 index cdcd2cb3976c8764a21b512d2a5c80a073b739f1..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbrf/preprocess.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## preprocess.py -## -## Created on: Jan 10, 2019 -## Author: Nina Narodytska -## E-mail: narodytska@vmware.com -## - -# -#============================================================================== -import json -import numpy as np -import xgboost as xgb -import math -import pandas as pd -import numpy as np -import sklearn -import pickle - - -# -#============================================================================== -def preprocess_dataset(raw_data_path, files): - print("preprocess dataset from ", raw_data_path) - files = files.split(",") - - data_file = files[0] - dataset_name = files[1] - - try: - data_raw = pd.read_csv(raw_data_path + data_file, sep=',', na_values= ['']) - catcols = pd.read_csv(raw_data_path + data_file + ".catcol", header = None) - categorical_features = np.concatenate(catcols.values).tolist() - - - for i in range(len(data_raw.values[0])): - if i in categorical_features: - 
data_raw.fillna('',inplace=True) - else: - data_raw.fillna(0,inplace=True) - dataset_all = data_raw - dataset = dataset_all.values.copy() - - print(categorical_features) - except Exception as e: - print("Please provide info about categorical columns/original datasets or omit option -p", e) - exit() - - # move categrorical columns forward - - feature_names = dataset_all.columns - print(feature_names) - - ############################## - extra_info = {} - categorical_names = {} - print(categorical_features) - dataset_new = dataset_all.values.copy() - for feature in categorical_features: - print("feature", feature) - print(dataset[:, feature]) - le = sklearn.preprocessing.LabelEncoder() - le.fit(dataset[:, feature]) - categorical_names[feature] = le.classes_ - dataset_new[:, feature] = le.transform(dataset[:, feature]) - - ###################################3 - # target as categorical - labels_new = [] - - le = sklearn.preprocessing.LabelEncoder() - le.fit(dataset[:, -1]) - dataset_new[:, -1]= le.transform(dataset[:, -1]) - class_names = le.classes_ - ######################################33 - - - if (False): - dataset_new = np.delete(dataset_new, -1, axis=1) - oneencoder = sklearn.preprocessing.OneHotEncoder() - oneencoder.fit(dataset_new[:, categorical_features]) - print(oneencoder.categories_) - n_transformed_features = sum([len(cats) for cats in oneencoder.categories_]) - print(n_transformed_features) - print(dataset_new.shape) - X = dataset_new[:,categorical_features][0] - print(X) - x = np.expand_dims(X, axis=0) - print("x", x, x.shape) - y = dataset_new[0].copy() - print(y.shape, oneencoder.transform(x).shape) - y[categorical_features] = oneencoder.transform(x).toarray() - - print("y", y, y.shape) - - z = oneencoder.inverse_transform(y) - print(z.shape) - exit() - - ###########################################################################3 - extra_info = {"categorical_features": categorical_features, - "categorical_names": categorical_names, - "feature_names": feature_names, - "class_names": class_names} - - new_file_train = raw_data_path + dataset_name + '_data.csv' - df = pd.DataFrame(data=dataset_new) - df.columns = list(feature_names) - df.to_csv(new_file_train, mode = 'w', index=False) - print("new dataset", new_file_train) - - - f = open(raw_data_path + dataset_name + '_data.csv.pkl', "wb") - pickle.dump(extra_info, f) - f.close() diff --git a/pages/application/RandomForest/utils/xgbrf/tree.py b/pages/application/RandomForest/utils/xgbrf/tree.py deleted file mode 100644 index afe34d97e331057f9a5b7c0ccef63e85801a572d..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbrf/tree.py +++ /dev/null @@ -1,193 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## tree.py (reuses parts of the code of SHAP) -## -## Created on: Dec 7, 2018 -## Author: Nina Narodytska -## E-mail: narodytska@vmware.com -## - -# -#============================================================================== -from anytree import Node, RenderTree,AsciiStyle -import json -import numpy as np -import xgboost as xgb -import math - - -# -#============================================================================== -class xgnode(Node): - def __init__(self, id, parent = None): - Node.__init__(self, id, parent) - self.id = id # The node value - self.name = None - self.left_node_id = -1 # Left child - self.right_node_id = -1 # Right child - self.missing_node_id = -1 - - self.feature = -1 - self.threshold = -1 - - self.cover = -1 - self.values = -1 - - def __str__(self): - 
pref = ' ' * self.depth - if (len(self.children) == 0): - return (pref+ "leaf: {} {}".format(self.id, self.values)) - else: - if(self.name is None): - return (pref+ "{} f{}<{}".format(self.id, self.feature, self.threshold)) - else: - return (pref+ "{} \"{}\"<{}".format(self.id, self.name, self.threshold)) - - -# -#============================================================================== -def build_tree(json_tree, node = None, feature_names = None, inverse = False): - def max_id(node): - if "children" in node: - return max(node["nodeid"], *[max_id(n) for n in node["children"]]) - else: - return node["nodeid"] - m = max_id(json_tree) + 1 - def extract_data(json_node, root = None, feature_names = None): - i = json_node["nodeid"] - if (root is None): - node = xgnode(i) - else: - node = xgnode(i, parent = root) - node.cover = json_node["cover"] - if "children" in json_node: - - node.left_node_id = json_node["yes"] - node.right_node_id = json_node["no"] - node.missing_node_id = json_node["missing"] - node.feature = json_node["split"] - if (feature_names is not None): - node.name = feature_names[node.feature] - node.threshold = json_node["split_condition"] - for c, n in enumerate(json_node["children"]): - child = extract_data(n, node, feature_names) - elif "leaf" in json_node: - node.values = json_node["leaf"] - if(inverse): - node.values = -node.values - return node - - root = extract_data(json_tree, None, feature_names) - return root - - -# -#============================================================================== -def walk_tree(node): - if (len(node.children) == 0): - # leaf - print(node) - else: - print(node) - walk_tree(node.children[0]) - walk_tree(node.children[1]) - - -# -#============================================================================== -def scores_tree(node, sample): - if (len(node.children) == 0): - # leaf - return node.values - else: - feature_branch = node.feature - sample_value = sample[feature_branch] - assert(sample_value is not None) - if(sample_value < node.threshold): - return scores_tree(node.children[0], sample) - else: - return scores_tree(node.children[1], sample) - - -# -#============================================================================== -class TreeEnsemble: - """ An ensemble of decision trees. - - This object provides a common interface to many different types of models. 
- """ - def __init__(self, model, feature_names = None, nb_classes = 0): - self.model_type = "xgboost" - #self.original_model = model.get_booster() - self.original_model = model - #### - self.base_offset = None - json_trees = get_xgboost_json(self.original_model) - self.trees = [build_tree(json.loads(t), None, feature_names) for t in json_trees] - if(nb_classes == 2): - # NASTY trick for binary - # We change signs of values in leaves so that we can just sum all the values in leaves for class X - # and take max to get the right class - self.otrees = [build_tree(json.loads(t), None, feature_names, inverse = True) for t in json_trees] - self.itrees = [build_tree(json.loads(t), None, feature_names) for t in json_trees] - self.trees = [] - for i,_ in enumerate(self.otrees): - self.trees.append(self.otrees[i]) - self.trees.append(self.itrees[i]) - self.feature_names = feature_names - def print_tree(self): - for i,t in enumerate(self.trees): - print("tree number: ", i) - walk_tree(t) - - def invert_tree_prob(self, node): - if (len(node.children) == 0): - node.values = -node.values - return node - else: - self.invert_tree_prob(node.children[0]) - self.invert_tree_prob(node.children[1]) - return node - def predict(self, samples, nb_classes): - # https://github.com/dmlc/xgboost/issues/1746#issuecomment-290130695 - prob = [] - nb_estimators = int(len(self.trees)/nb_classes) - for sample in np.asarray(samples): - scores = [] - for i,t in enumerate(self.trees): - s = scores_tree(t, sample) - scores.append((s)) - scores = np.asarray(scores) - class_scores = [] - if (nb_classes == 2): - - for i in range(nb_classes): - class_scores.append(math.exp(-(scores[i::nb_classes]).sum())) # swap signs back as we had to use this trick in the contractor - s0 = class_scores[0] - s1 = class_scores[1] - v0 = 1/(1 + s0) - v1 = 1/(1 + s1) - class_scores[0] = v0 - class_scores[1] = v1 - else: - for i in range(0,nb_classes*nb_estimators,nb_estimators): - class_scores.append(math.exp((scores[i:i+nb_estimators]).sum())) - #for i in range(nb_classes): - # class_scores.append(math.exp((scores[i::nb_classes]).sum())) - class_scores = np.asarray(class_scores) - prob.append(class_scores/class_scores.sum()) - return np.asarray(prob).reshape((-1, nb_classes)) - - -# -#============================================================================== -def get_xgboost_json(model): - """ REUSED FROM SHAP - This gets a JSON dump of an XGBoost model while ensuring the feature names are their indexes. 
- """ - fnames = model.feature_names - model.feature_names = None - json_trees = model.get_dump(with_stats=True, dump_format="json") - model.feature_names = fnames - return json_trees diff --git a/pages/application/RandomForest/utils/xgbrf/validate.py b/pages/application/RandomForest/utils/xgbrf/validate.py deleted file mode 100644 index 024a6800f454c7bae39104a0f17781c2755ce0ea..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbrf/validate.py +++ /dev/null @@ -1,190 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## validate.py -## -## Created on: Jan 4, 2019 -## Author: Alexey Ignatiev -## E-mail: aignatiev@ciencias.ulisboa.pt -## - -# -#============================================================================== -from __future__ import print_function -import getopt -import numpy as np -import os -from pysat.formula import IDPool -from pysmt.shortcuts import Solver -from pysmt.shortcuts import And, BOOL, Implies, Not, Or, Symbol -from pysmt.shortcuts import Equals, GE, GT, LE, LT, Real, REAL -import resource -from six.moves import range -import sys - - -# -#============================================================================== -class SMTValidator(object): - """ - Validating Anchor's explanations using SMT solving. - """ - - def __init__(self, formula, feats, nof_classes, xgb): - """ - Constructor. - """ - - self.ftids = {f: i for i, f in enumerate(feats)} - self.nofcl = nof_classes - self.idmgr = IDPool() - self.optns = xgb.options - - # xgbooster will also be needed - self.xgb = xgb - - self.verbose = self.optns.verb - self.oracle = Solver(name=self.xgb.options.solver) - - self.inps = [] # input (feature value) variables - for f in self.xgb.extended_feature_names_as_array_strings: - if '_' not in f: - self.inps.append(Symbol(f, typename=REAL)) - else: - self.inps.append(Symbol(f, typename=BOOL)) - - self.outs = [] # output (class score) variables - for c in range(self.nofcl): - self.outs.append(Symbol('class{0}_score'.format(c), typename=REAL)) - - # theory - self.oracle.add_assertion(formula) - - # current selector - self.selv = None - - def prepare(self, sample, expl): - """ - Prepare the oracle for validating an explanation given a sample. 
- """ - - if self.selv: - # disable the previous assumption if any - self.oracle.add_assertion(Not(self.selv)) - - # creating a fresh selector for a new sample - sname = ','.join([str(v).strip() for v in sample]) - - # the samples should not repeat; otherwise, they will be - # inconsistent with the previously introduced selectors - assert sname not in self.idmgr.obj2id, 'this sample has been considered before (sample {0})'.format(self.idmgr.id(sname)) - self.selv = Symbol('sample{0}_selv'.format(self.idmgr.id(sname)), typename=BOOL) - - self.rhypos = [] # relaxed hypotheses - - # transformed sample - self.sample = list(self.xgb.transform(sample)[0]) - - # preparing the selectors - for i, (inp, val) in enumerate(zip(self.inps, self.sample), 1): - feat = inp.symbol_name().split('_')[0] - selv = Symbol('selv_{0}'.format(feat)) - val = float(val) - - self.rhypos.append(selv) - - # adding relaxed hypotheses to the oracle - for inp, val, sel in zip(self.inps, self.sample, self.rhypos): - if '_' not in inp.symbol_name(): - hypo = Implies(self.selv, Implies(sel, Equals(inp, Real(float(val))))) - else: - hypo = Implies(self.selv, Implies(sel, inp if val else Not(inp))) - - self.oracle.add_assertion(hypo) - - # propagating the true observation - if self.oracle.solve([self.selv] + self.rhypos): - model = self.oracle.get_model() - else: - assert 0, 'Formula is unsatisfiable under given assumptions' - - # choosing the maximum - outvals = [float(model.get_py_value(o)) for o in self.outs] - maxoval = max(zip(outvals, range(len(outvals)))) - - # correct class id (corresponds to the maximum computed) - true_output = maxoval[1] - - # forcing a misclassification, i.e. a wrong observation - disj = [] - for i in range(len(self.outs)): - if i != true_output: - disj.append(GT(self.outs[i], self.outs[true_output])) - self.oracle.add_assertion(Implies(self.selv, Or(disj))) - - # removing all hypotheses except for those in the explanation - hypos = [] - for i, hypo in enumerate(self.rhypos): - j = self.ftids[self.xgb.transform_inverse_by_index(i)[0]] - if j in expl: - hypos.append(hypo) - self.rhypos = hypos - - if self.verbose: - inpvals = self.xgb.readable_sample(sample) - - preamble = [] - for f, v in zip(self.xgb.feature_names, inpvals): - if f not in v: - preamble.append('{0} = {1}'.format(f, v)) - else: - preamble.append(v) - - print(' explanation for: "IF {0} THEN {1}"'.format(' AND '.join(preamble), self.xgb.target_name[true_output])) - - def validate(self, sample, expl): - """ - Make an effort to show that the explanation is too optimistic. 
- """ - - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - # adapt the solver to deal with the current sample - self.prepare(sample, expl) - - # if satisfiable, then there is a counterexample - if self.oracle.solve([self.selv] + self.rhypos): - model = self.oracle.get_model() - inpvals = [float(model.get_py_value(i)) for i in self.inps] - outvals = [float(model.get_py_value(o)) for o in self.outs] - maxoval = max(zip(outvals, range(len(outvals)))) - - inpvals = self.xgb.transform_inverse(np.array(inpvals))[0] - self.coex = tuple([inpvals, maxoval[1]]) - inpvals = self.xgb.readable_sample(inpvals) - - if self.verbose: - preamble = [] - for f, v in zip(self.xgb.feature_names, inpvals): - if f not in v: - preamble.append('{0} = {1}'.format(f, v)) - else: - preamble.append(v) - - print(' explanation is incorrect') - print(' counterexample: "IF {0} THEN {1}"'.format(' AND '.join(preamble), self.xgb.target_name[maxoval[1]])) - else: - self.coex = None - - if self.verbose: - print(' explanation is correct') - - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.time - - if self.verbose: - print(' time: {0:.2f}'.format(self.time)) - - return self.coex - diff --git a/pages/application/RandomForest/utils/xgbrf/xgb_rf.py b/pages/application/RandomForest/utils/xgbrf/xgb_rf.py deleted file mode 100644 index 024225fe8cf140a3eaaeef5f8dfc653c85db8d0f..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xgbrf/xgb_rf.py +++ /dev/null @@ -1,600 +0,0 @@ -#!/us/bin/env python -#-*- coding:utf-8 -*- -## -## xgb_rf.py -## -## Created on: May 23, 2020 -## Author: Yacine Izza -## E-mail: yacine.izza@univ-toulouse.fr -## - -# -#============================================================================== -from .validate import SMTValidator -from .pi_checker import SMTChecker -from .encode import SMTEncoder -from .explain import SMTExplainer -import numpy as np -import os -import resource -from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score -import sklearn -# print('The scikit-learn version is {}.'.format(sklearn.__version__)) - -from sklearn.preprocessing import OneHotEncoder -import sys -from six.moves import range -from .tree import TreeEnsemble -import xgboost as xgb -from xgboost import XGBRFClassifier, Booster, plot_tree -import matplotlib.pyplot as plt -import pickle - - -# -#============================================================================== -class XGBRandomForest(object): - """ - The main class to train/encode/explain Random Forest models. - """ - - def __init__(self, options, from_data=None, from_model=None, - from_encoding=None): - """ - Constructor. 
- """ - - assert from_data or from_model or from_encoding, \ - 'At least one input file should be specified' - - self.init_stime = resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.init_ctime = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime - - # saving command-line options - self.options = options - self.seed = 42 - np.random.seed(self.seed) - - if from_data: - self.use_categorical = self.options.use_categorical - # saving data - self.data = from_data - dataset = np.asarray(self.data.samps, dtype=np.float32) - - - # split data into X and y - self.feature_names = self.data.names[:-1] - self.nb_features = len(self.feature_names) - - self.X = dataset[:, 0:self.nb_features] - self.Y = dataset[:, self.nb_features] - self.num_class = len(set(self.Y)) - self.target_name = list(range(self.num_class)) - - param_dist = {'n_estimators':self.options.n_estimators, - 'max_depth':self.options.maxdepth} - - self.params = { 'num_parallel_tree': self.options.n_estimators, - 'max_depth': self.options.maxdepth, - 'colsample_bynode': 0.8, 'subsample': 0.8, - 'learning_rate': 1, 'random_state': self.seed, - 'verbosity' : self.options.verb - } - - if(self.num_class == 2): - self.params['eval_metric'] = 'error' - self.params['objective'] = 'binary:logistic' - else: - self.params['eval_metric'] = 'merror' - self.params['num_class'] = self.num_class - self.params['objective'] = 'multi:softprob' - - if(self.num_class == 2): - param_dist['objective'] = 'binary:logistic' - - #self.model = XGBRFClassifier(**param_dist) - self.model = None - - # split data into train and test sets - self.test_size = self.options.testsplit - if (self.test_size > 0): - self.X_train, self.X_test, self.Y_train, self.Y_test = \ - train_test_split(self.X, self.Y, test_size=self.test_size, - random_state=self.seed) - else: - self.X_train = self.X - self.X_test = [] # need a fix - self.Y_train = self.Y - self.Y_test = []# need a fix - - # check if we have info about categorical features - if (self.use_categorical): - self.categorical_features = from_data.categorical_features - self.categorical_names = from_data.categorical_names - self.target_name = from_data.class_names - - #################################### - # this is a set of checks to make sure that we use the same as anchor encoding - cat_names = sorted(self.categorical_names.keys()) - assert(cat_names == self.categorical_features) - self.encoder = {} - for i in self.categorical_features: - self.encoder.update({i: OneHotEncoder(categories='auto', sparse=False)})#, - self.encoder[i].fit(self.X[:,[i]]) - - else: - self.categorical_features = [] - self.categorical_names = [] - self.encoder = [] - - fname = from_data - - elif from_model: - fname = from_model - self.load_datainfo(from_model) - if (self.use_categorical is False) and (self.options.use_categorical is True): - print("Error: Note that the model is trained without categorical features info. 
Please do not use -c option for predictions") - exit() - # load model - - elif from_encoding: - fname = from_encoding - - # encoding, feature names, and number of classes - # are read from an input file - enc = SMTEncoder(None, None, None, self, from_encoding) - self.enc, self.intvs, self.imaps, self.ivars, self.feature_names, \ - self.num_class = enc.access() - - # create extra file names - try: - os.stat(options.output) - except: - os.mkdir(options.output) - - self.mapping_features() - ################# - self.test_encoding_transformes() - - bench_name = os.path.splitext(os.path.basename(options.files[0]))[0] - bench_dir_name = options.output + "/" + bench_name - try: - os.stat(bench_dir_name) - except: - os.mkdir(bench_dir_name) - - self.basename = (os.path.join(bench_dir_name, bench_name + - "_nbestim_" + str(options.n_estimators) + - "_maxdepth_" + str(options.maxdepth) + - "_testsplit_" + str(options.testsplit))) - - data_suffix = '.splitdata.pkl' - self.modfile = self.basename + '.mod.pkl' - - self.mod_plainfile = self.basename + '.mod.txt' - - self.resfile = self.basename + '.res.txt' - self.encfile = self.basename + '.enc.txt' - self.expfile = self.basename + '.exp.txt' - - def form_datefile_name(self, modfile): - data_suffix = '.splitdata.pkl' - return modfile + data_suffix - - def pickle_save_file(self, filename, data): - try: - f = open(filename, "wb") - pickle.dump(data, f) - f.close() - except: - print("Cannot save to file", filename) - exit() - - def pickle_load_file(self, filename): - try: - f = open(filename, "rb") - data = pickle.load(f) - f.close() - return data - except: - print("Cannot load from file", filename) - exit() - - def save_datainfo(self, filename): - - print("saving model to ", filename) - self.pickle_save_file(filename, self.model) - - filename_data = self.form_datefile_name(filename) - print("saving data to ", filename_data) - samples = {} - samples["X"] = self.X - samples["Y"] = self.Y - samples["X_train"] = self.X_train - samples["Y_train"] = self.Y_train - samples["X_test"] = self.X_test - samples["Y_test"] = self.Y_test - samples["feature_names"] = self.feature_names - samples["target_name"] = self.target_name - samples["num_class"] = self.num_class - samples["categorical_features"] = self.categorical_features - samples["categorical_names"] = self.categorical_names - samples["encoder"] = self.encoder - samples["use_categorical"] = self.use_categorical - - - self.pickle_save_file(filename_data, samples) - - def load_datainfo(self, filename): - print("loading model from ", filename) - self.model = XGBRFClassifier() - self.model = self.pickle_load_file(filename) - - datafile = self.form_datefile_name(filename) - print("loading data from ", datafile) - loaded_data = self.pickle_load_file(datafile) - self.X = loaded_data["X"] - self.Y = loaded_data["Y"] - self.X_train = loaded_data["X_train"] - self.X_test = loaded_data["X_test"] - self.Y_train = loaded_data["Y_train"] - self.Y_test = loaded_data["Y_test"] - self.feature_names = loaded_data["feature_names"] - self.target_name = loaded_data["target_name"] - self.num_class = loaded_data["num_class"] - self.nb_features = len(self.feature_names) - self.categorical_features = loaded_data["categorical_features"] - self.categorical_names = loaded_data["categorical_names"] - self.encoder = loaded_data["encoder"] - self.use_categorical = loaded_data["use_categorical"] - - def train(self, outfile=None): - """ - Train a random forest using XGBoost. 
- """ - - return self.build_xgbtree(outfile) - - def encode(self, test_on=None): - """ - Encode a random forest trained previously. - """ - encoder = SMTEncoder(self.model, self.feature_names, self.num_class, self) - self.enc, self.intvs, self.imaps, self.ivars = encoder.encode() - - if test_on: - encoder.test_sample(np.array(test_on)) - - encoder.save_to(self.encfile) - - def explain(self, sample, use_lime=None, use_anchor=None, use_shap=None, - expl_ext=None, prefer_ext=False, nof_feats=5): - """ - Explain a prediction made for a given sample with a previously - trained tree ensemble. - """ - - if use_lime: - expl = use_lime(self, sample=sample, nb_samples=5, nb_features_in_exp=nof_feats) - elif use_anchor: - expl = use_anchor(self, sample=sample, nb_samples=5, - nb_features_in_exp=nof_feats, threshold=0.95) - elif use_shap: - expl = use_shap(self, sample=sample, nb_features_in_exp=nof_feats) - else: - if 'x' not in dir(self): - self.x = SMTExplainer(self.enc, self.intvs, self.imaps, - self.ivars, self.feature_names, self.num_class, - self.options, self) - - expl = self.x.explain(np.array(sample), self.options.smallest, - expl_ext, prefer_ext) - - # returning the explanation - return expl - - def validate(self, sample, expl): - """ - Make an attempt to show that a given explanation is optimistic. - """ - - # there must exist an encoding - if 'enc' not in dir(self): - encoder = SMTEncoder(self.model, self.feature_names, self.num_class, - self) - self.enc, _, _, _ = encoder.encode() - - if 'v' not in dir(self): - self.v = SMTValidator(self.enc, self.feature_names, self.num_class, - self) - - # try to compute a counterexample - return self.v.validate(np.array(sample), expl) - - - def isPrimeImplicant(self, sample, expl): - """ - Check the explnation if it is a prime implicant. - """ - - # there must exist an encoding - if 'enc' not in dir(self): - encoder = SMTEncoder(self.model, self.feature_names, self.num_class, - self) - self.enc, _, _, _ = encoder.encode() - - if 'checker' not in dir(self): - self.checker = SMTChecker(self.enc, self.feature_names, self.num_class, - self) - - # check the explanation - return self.checker.check(np.array(sample), expl) - - def repair(self, sample, expl): - """ - Make an attempt to repair that a given pessimistic (incorrect) explanation. - """ - #encode without sample - self.encode() - gexpl = self.explain(sample, expl_ext=expl, prefer_ext=True) - - #global explanation - return gexpl - - def refine(self, sample, expl): - """ - Make an attempt to refine that a given optimistic explanation. 
- """ - #encode without sample - self.encode() - gexpl = self.explain(sample, expl_ext=expl) - - #global explanation - return gexpl - - def transform(self, x): - if(len(x) == 0): - return x - if (len(x.shape) == 1): - x = np.expand_dims(x, axis=0) - if (self.use_categorical): - assert(self.encoder != []) - tx = [] - for i in range(self.nb_features): - #self.encoder[i].drop = None - if (i in self.categorical_features): - self.encoder[i].drop = None - tx_aux = self.encoder[i].transform(x[:,[i]]) - tx_aux = np.vstack(tx_aux) - tx.append(tx_aux) - else: - tx.append(x[:,[i]]) - tx = np.hstack(tx) - return tx - else: - return x - - def transform_inverse(self, x): - if(len(x) == 0): - return x - if (len(x.shape) == 1): - x = np.expand_dims(x, axis=0) - if (self.use_categorical): - assert(self.encoder != []) - inverse_x = [] - for i, xi in enumerate(x): - inverse_xi = np.zeros(self.nb_features) - for f in range(self.nb_features): - if f in self.categorical_features: - nb_values = len(self.categorical_names[f]) - v = xi[:nb_values] - v = np.expand_dims(v, axis=0) - iv = self.encoder[f].inverse_transform(v) - inverse_xi[f] =iv - xi = xi[nb_values:] - - else: - inverse_xi[f] = xi[0] - xi = xi[1:] - inverse_x.append(inverse_xi) - return inverse_x - else: - return x - - def transform_inverse_by_index(self, idx): - if (idx in self.extended_feature_names): - return self.extended_feature_names[idx] - else: - print("Warning there is no feature {} in the internal mapping".format(idx)) - return None - - def transform_by_value(self, feat_value_pair): - if (feat_value_pair in self.extended_feature_names.values()): - keys = (list(self.extended_feature_names.keys())[list( self.extended_feature_names.values()).index(feat_value_pair)]) - return keys - else: - print("Warning there is no value {} in the internal mapping".format(feat_value_pair)) - return None - - def mapping_features(self): - self.extended_feature_names = {} - self.extended_feature_names_as_array_strings = [] - counter = 0 - if (self.use_categorical): - for i in range(self.nb_features): - if (i in self.categorical_features): - for j, _ in enumerate(self.encoder[i].categories_[0]): - self.extended_feature_names.update({counter: (self.feature_names[i], j)}) - self.extended_feature_names_as_array_strings.append("f{}_{}".format(i,j)) # str(self.feature_names[i]), j)) - counter = counter + 1 - else: - self.extended_feature_names.update({counter: (self.feature_names[i], None)}) - self.extended_feature_names_as_array_strings.append("f{}".format(i)) #(self.feature_names[i]) - counter = counter + 1 - else: - for i in range(self.nb_features): - self.extended_feature_names.update({counter: (self.feature_names[i], None)}) - self.extended_feature_names_as_array_strings.append("f{}".format(i))#(self.feature_names[i]) - counter = counter + 1 - - def readable_sample(self, x): - readable_x = [] - for i, v in enumerate(x): - if (i in self.categorical_features): - readable_x.append(self.categorical_names[i][int(v)]) - else: - #readable_x.append(v) - readable_x.append(str(v)) - return np.asarray(readable_x) - - def test_encoding_transformes(self): - # test encoding - - X = self.X_train[[0],:] - - print("Sample of length", len(X[0])," : ", X) - enc_X = self.transform(X) - print("Encoded sample of length", len(enc_X[0])," : ", enc_X) - inv_X = self.transform_inverse(enc_X) - print("Back to sample", inv_X) - print("Readable sample", self.readable_sample(inv_X[0])) - assert((inv_X == X).all()) - - if (self.options.verb > 1): - for i in 
-    def transfomed_sample_info(self, i):
-        print(enc.categories_)
-
-    def build_xgbtree(self, outfile=None):
-        """
-            Build an ensemble of trees (forest).
-        """
-
-        if (outfile is None):
-            outfile = self.modfile
-        else:
-            self.datafile = self.form_datefile_name(outfile)
-
-        # fit the model on training data
-
-        if (len(self.X_test) > 0):
-            eval_set=[(self.transform(self.X_train), self.Y_train), (self.transform(self.X_test), self.Y_test)]
-        else:
-            eval_set=[(self.transform(self.X_train), self.Y_train)]
-
-        print("start xgb")
-        '''
-        self.model.fit(self.transform(self.X_train), self.Y_train,
-                  eval_set=eval_set,
-                  verbose=self.options.verb) # eval_set=[(X_test, Y_test)],
-        '''
-        dtrain = xgb.DMatrix(self.transform(self.X_train), label=self.Y_train)
-        dtest = xgb.DMatrix(self.transform(self.X_test), label=self.Y_test)
-        eval_set = [(dtrain, 'train'), (dtest, 'eval')]
-        evals_result = {}
-        self.model = xgb.train(self.params, dtrain, num_boost_round=1,
-                               evals=eval_set, evals_result=evals_result)
-        print("end xgb")
-        print(self.model.get_score())
-        print(len(self.model.get_score()))
-        #for i in range(5):
-        #    xgb.plot_tree(self.model, num_trees=i)
-        #    plt.show()
-
-
-        try:
-            train_accuracy = round(1 - evals_result['train']['merror'][-1],2)
-        except:
-            try:
-                train_accuracy = round(1 - evals_result['train']['error'][-1],2)
-            except:
-                assert(False)
-        try:
-            test_accuracy = round(1 - evals_result['eval']['merror'][-1],2)
-        except:
-            try:
-                test_accuracy = round(1 - evals_result['eval']['error'][-1],2)
-            except:
-                assert(False)
-        #print('Train accuracy_xgb: ',train_accuracy)
-        #print('Test accuracy_xgb: ', test_accuracy)
-
-
-        #evals_result = self.model.evals_result()
-        ########## saving model
-        self.save_datainfo(outfile)
-        print("saving plain model to ", self.mod_plainfile)
-        #self.model._Booster.dump_model(self.mod_plainfile)
-        self.model.dump_model(self.mod_plainfile)
-
-        ensemble = TreeEnsemble(self.model, self.extended_feature_names_as_array_strings, nb_classes = self.num_class)
-        #ensemble.print_tree()
-
-        #y_pred_prob = self.model.predict_proba(self.transform(self.X_train[:10]))
-        classone_probs = self.model.predict(xgb.DMatrix(self.transform(self.X_train[:10])))
-        if self.num_class == 2:
-            classzero_probs = 1.0 - classone_probs
-            y_pred_prob = np.vstack((classzero_probs, classone_probs)).transpose()
-        else:
-            y_pred_prob = classone_probs
-
-        y_pred_prob_compute = ensemble.predict(self.transform(self.X_train[:10]), self.num_class)
-        #print('y_pred_prob: \n', y_pred_prob)
-        #print('y_pred_prob_compute: \n',y_pred_prob_compute)
-        #print('\n\n')
-
-        assert(np.absolute(y_pred_prob_compute - y_pred_prob).sum() < 0.01*len(y_pred_prob))
-
-        ### accuracy
-        '''
-        try:
-            train_accuracy = round(1 - evals_result['validation_0']['merror'][-1],2)
-        except:
-            try:
-                train_accuracy = round(1 - evals_result['validation_0']['error'][-1],2)
-            except:
-                assert(False)
-
-        try:
-            test_accuracy = round(1 - evals_result['validation_1']['merror'][-1],2)
-        except:
-            try:
-                test_accuracy = round(1 - evals_result['validation_1']['error'][-1],2)
-            except:
-                print("no results test data")
-                test_accuracy = 0
-        '''
-
-        #### saving
-
-        print("saving results to ", self.resfile)
-        with open(self.resfile, 'w') as f:
-            f.write("{} & {} & {} &{} &{} & {} \\\\ \n \hline \n".format(
-                           os.path.basename(self.options.files[0]).replace("_","-"),
-
train_accuracy, - test_accuracy, - self.options.n_estimators, - self.options.maxdepth, - self.options.testsplit)) - f.close() - - print("Train accuracy: %.2f%%" % (train_accuracy * 100.0)) - print("Test accuracy: %.2f%%" % (test_accuracy * 100.0)) - - - return train_accuracy, test_accuracy, self.model - - def predict(self, X): - classone_probs = self.model.predict(xgb.DMatrix(self.transform(X))) - if self.num_class == 2: - classzero_probs = 1.0 - classone_probs - y_pred_prob = np.vstack((classzero_probs, classone_probs)).transpose() - else: - y_pred_prob = classone_probs - return y_pred_prob - diff --git a/pages/application/RandomForest/utils/xpAnchor.py b/pages/application/RandomForest/utils/xpAnchor.py deleted file mode 100755 index 86958e2bfd42e79e43276b13124c9ffa0f3deed8..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xpAnchor.py +++ /dev/null @@ -1,164 +0,0 @@ -#!/usr/bin/env python3 -#-*- coding:utf-8 -*- -## -## lime_wrap.py (reuses parts of the code of SHAP) -## - - -# -#============================================================================== -from __future__ import print_function -#from data import Data -from pages.application.RandomForest.utils.xrf import Dataset -import os -import sys -import pickle - - -import json -import numpy as np -import math -from anchor import utils -from anchor import anchor_tabular -import resource - - -# -#============================================================================== -def anchor_call(model, data, sample, threshold=0.95, verbose=0): - - - classifier_fn = lambda x: model.forest.predict(data.transform(x)).astype(float) - X_train, _, _, _ = data.train_test_split() - - timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - explainer = anchor_tabular.AnchorTabularExplainer( - class_names=data.target_name, - feature_names=data.feature_names, - train_data=data.X) - #print(explainer.d_train) - - if (sample is not None): - try: - feat_sample = np.asarray(sample, dtype=np.float32) - except: - print("Cannot parse input sample:", sample) - exit() - if verbose: - print("\n\n\nStarting Anchor explainer... 
\nConsidering a sample with features:", feat_sample) - if not (len(feat_sample) == len(X_train[0])): - print("Unmatched features are not supported: The number of features in a sample {} is not equal to the number of features in this benchmark {}".format(len(feat_sample), len(X_train[0]))) - exit() - - # compute boost predictions - feat_sample_exp = np.expand_dims(feat_sample, axis=0) - feat_sample_exp = data.transform(feat_sample_exp) - ##y_pred = model.forest.predict(feat_sample_exp)[0] - y_pred_prob = model.forest.predict_proba(feat_sample_exp)[0] - y_pred = np.argmax(y_pred_prob) - - - exp = explainer.explain_instance(feat_sample, - classifier_fn, - threshold=threshold) - if verbose: - print('Anchor: %s' % (' AND '.join(exp.names()))) - print('Precision: %.2f' % exp.precision()) - print('Coverage: %.2f' % exp.coverage()) - - - timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer - if verbose: - print(' time: {0:.2f}'.format(timer)) - - expl = [] - return sorted(expl), timer - - -def pickle_load_file(filename): - try: - f = open(filename, "rb") - data = pickle.load(f) - f.close() - return data - except: - print("Cannot load from file", filename) - exit() - - -# -#============================================================================== -if __name__ == '__main__': - # parsing command-line options - options = Options(sys.argv) - - # making output unbuffered - if sys.version_info.major == 2: - sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) - - # showing head - #show_info() - print('Starting LIME explainer...') - - - if options.files: - cls = None - - print("loading data ...") - data = Dataset(filename=options.files[0], mapfile=options.mapfile, - separator=options.separator, use_categorical = False) - - - if options.explain: - mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \ - resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss - - if not cls: - print("loading model ...") - cls = pickle_load_file(options.files[1]) - cls.print_accuracy(data) # print test accuray - - samps_file = options.explain.strip() - print(samps_file) - with open(samps_file, 'r') as fp: - lines = fp.readlines() - - # timers - atimes = [] - tested = set() - - for i, s in enumerate(lines): - sample = [float(v.strip()) for v in s.split(',')] - - if tuple(sample) in tested: - continue - - #print("inst#{0}".format(i+1)) - - tested.add(tuple(sample)) - #print('sample {0}: {1}'.format(i, ','.join(s.strip().split(',')))) - - expl, time = anchor_call(cls, data, sample, verbose=options.verb) # call lime - - atimes.append(time) - - - #if i == 100: - # break - - mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \ - resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss - mem - - - # reporting the time spent - print('') - print('tot time: {0:.2f}'.format(sum(atimes))) - print('max time: {0:.2f}'.format(max(atimes))) - print('min time: {0:.2f}'.format(min(atimes))) - print('avg time: {0:.2f}'.format(sum(atimes) / len(atimes))) - print('') - print("c mem used: {0:.2f} Mb".format(mem/(1024*1024))) - \ No newline at end of file diff --git a/pages/application/RandomForest/utils/xpLime.py b/pages/application/RandomForest/utils/xpLime.py deleted file mode 100755 index 9945bf7943e755633b688ff183efadea1d7a641a..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xpLime.py +++ /dev/null @@ -1,227 +0,0 @@ -#!/usr/bin/env python3 -#-*- coding:utf-8 -*- -## -## lime_wrap.py (reuses parts of the code of SHAP) 
-## - - -# -#============================================================================== -from __future__ import print_function -#from data import Data -from pages.application.RandomForest.utils.xrf import Dataset -import os -import sys -import pickle - - -import json -import numpy as np -import math -import lime -import lime.lime_tabular -import resource - - -# -#============================================================================== -def lime_call(model, data, sample, nb_samples = 50, feats='all', - nb_features_in_exp=10, verbose=0): - - # we need a way to say that features are categorical ? - # we do not have this informations. - predict_fn_rf = lambda x: model.forest.predict_proba(data.transform(x)).astype(float) - X_train, _, _, _ = data.train_test_split() - - timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - explainer = lime.lime_tabular.LimeTabularExplainer( - X_train, - feature_names=data.feature_names, - categorical_features= None, - class_names=data.target_name, - discretize_continuous=True, - ) - - f2imap = {} - for i, f in enumerate(data.feature_names): - f2imap[f.strip()] = i - - if (sample is not None): - try: - feat_sample = np.asarray(sample, dtype=np.float32) - except: - print("Cannot parse input sample:", sample) - exit() - if verbose: - print("\n\n\nStarting LIME explainer... \nConsidering a sample with features:", feat_sample) - if not (len(feat_sample) == len(X_train[0])): - print("Unmatched features are not supported: The number of features in a sample {} is not equal to the number of features in this benchmark {}".format(len(feat_sample), len(X_train[0]))) - exit() - - # compute boost predictions - feat_sample_exp = np.expand_dims(feat_sample, axis=0) - feat_sample_exp = data.transform(feat_sample_exp) - #y_pred = model.forest.predict(feat_sample_exp)[0] - y_pred_prob = model.forest.predict_proba(feat_sample_exp)[0] - y_pred = np.argmax(y_pred_prob) - - - exp = explainer.explain_instance(feat_sample, - predict_fn_rf, - num_features = nb_features_in_exp, - top_labels = 1)#, - #labels = list(range(xgb.num_class))) - - expl = [] - - # choose which features in the explanation to focus on - if feats in ('p', 'pos', '+'): - feats = 1 - elif feats in ('n', 'neg', '-'): - feats = -1 - else: - feats = 0 - - for i in range(data.num_class): - if (i != y_pred): - continue - if verbose: - print("\t \t Explanations for the winner class", i, " ( confidence = ", y_pred_prob[i], ")") - print("\t \t Features in explanations: ", exp.as_list(label=i)) - - s_human_readable = "" - for k, v in enumerate(exp.as_list(label=i)): - if (feats == 1 and v[1] < 0) or (feats == -1 and v[1] >= 0): - continue - - if not (('<' in v[0]) or ('>' in v[0])): - a = v[0].split('=') - f = a[0].strip() - l = a[1].strip() - u = l - -# if (xgb.use_categorical): -# fid = f2imap[f] -# fvid = int(a[1]) -# s_human_readable = s_human_readable + "\t \t id = {}, name = {}, score = {}\n".format(fid, f, str(v[1])) - - - else: - a = v[0].split('<') - - if len(a) == 1: - a = v[0].split('>') - - if len(a) == 2: - f = a[0].strip() - - if '>' in v[0]: - l, u = float(a[1].strip(' =')), None - else: - l, u = None, float(a[1].strip(' =')) - else: - l = float(a[0].strip()) - f = a[1].strip(' =') - u = float(a[2].strip(' =')) - - # expl.append(tuple([f2imap[f], l, u, v[1] >= 0])) - expl.append(f2imap[f]) - -# if (xgb.use_categorical): -# if (len(s_human_readable) > 0): -# print("\t \t Features in explanations (with provided categorical labels): \n", 
s_human_readable) - timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer - if verbose: - print(' time: {0:.2f}'.format(timer)) - - - return sorted(expl), timer - - -def pickle_load_file(filename): - try: - f = open(filename, "rb") - data = pickle.load(f) - f.close() - return data - except: - print("Cannot load from file", filename) - exit() - - -# -#============================================================================== -if __name__ == '__main__': - # parsing command-line options - options = Options(sys.argv) - - # making output unbuffered - if sys.version_info.major == 2: - sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) - - # showing head - #show_info() - print('Starting LIME explainer...') - - - if options.files: - cls = None - - print("loading data ...") - data = Dataset(filename=options.files[0], mapfile=options.mapfile, - separator=options.separator, use_categorical = False) - - - if options.explain: - mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \ - resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss - - if not cls: - print("loading model ...") - cls = pickle_load_file(options.files[1]) - cls.print_accuracy(data) # print test accuray - - samps_file = options.explain.strip() - print(samps_file) - with open(samps_file, 'r') as fp: - lines = fp.readlines() - - # timers - atimes = [] - tested = set() - - for i, s in enumerate(lines): - sample = [float(v.strip()) for v in s.split(',')] - - if tuple(sample) in tested: - continue - - #print("inst#{0}".format(i+1)) - - tested.add(tuple(sample)) - #print('sample {0}: {1}'.format(i, ','.join(s.strip().split(',')))) - - expl, time = lime_call(cls, data, sample, verbose=options.verb) # call lime - - atimes.append(time) - - - #if i == 3: - # break - - mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \ - resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss - mem - - - # reporting the time spent - print('') - print('tot time: {0:.2f}'.format(sum(atimes))) - print('max time: {0:.2f}'.format(max(atimes))) - print('min time: {0:.2f}'.format(min(atimes))) - print('avg time: {0:.2f}'.format(sum(atimes) / len(atimes))) - print('') - print("c mem used: {0:.2f} Mb".format(mem/(1024*1024))) - \ No newline at end of file diff --git a/pages/application/RandomForest/utils/xprf.py b/pages/application/RandomForest/utils/xprf.py deleted file mode 100755 index acd3fd81c9201a52d6460d5d8397c7074e7f2e8d..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xprf.py +++ /dev/null @@ -1,274 +0,0 @@ -#!/usr/bin/env python3 -#-*- coding:utf-8 -*- -## -## xrf.py -## -## Created on: Oct 08, 2020 -## Author: Yacine Izza -## E-mail: yacine.izza@univ-toulouse.fr -## - -# -#============================================================================== -from __future__ import print_function -from pages.application.RandomForest.utils.data import Data -import os -import sys -import pickle -import resource - -from pages.application.RandomForest.utils.xgbooster import preprocess_dataset - -from pages.application.RandomForest.utils.xrf import XRF, RF2001, Dataset, Checker -import numpy as np - -################## -from pages.application.RandomForest.utils.xpLime import lime_call -import math -import lime -import lime.lime_tabular -### -from pages.application.RandomForest.utils.xpAnchor import anchor_call -#from anchor import utils -from anchor import anchor_tabular -################ - -# 
-#============================================================================== -def show_info(): - """ - Print info message. - """ - print("c XRF: eXplaining Random Forest.") - print('c') - - -# -#============================================================================== -def pickle_save_file(filename, data): - try: - f = open(filename, "wb") - pickle.dump(data, f) - f.close() - except: - print("Cannot save to file", filename) - exit() - -def pickle_load_file(filename): - try: - f = open(filename, "rb") - data = pickle.load(f) - f.close() - return data - except: - print("Cannot load from file", filename) - exit() - - -# -#============================================================================== -if __name__ == '__main__': - # parsing command-line options - options = Options(sys.argv) - - # making output unbuffered - if sys.version_info.major == 2: - sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) - - # showing head - show_info() - - if (options.preprocess_categorical): - preprocess_dataset(options.files[0], options.preprocess_categorical_files, options.use_categorical) - exit() - - - if options.files: - cls = None - xrf = None - - print("loading data ...") - data = Dataset(filename=options.files[0], mapfile=options.mapfile, - separator=options.separator, use_categorical = options.use_categorical) - - if options.train: - ''' - data = Dataset(filename=options.files[0], mapfile=options.mapfile, - separator=options.separator, - use_categorical = options.use_categorical) - ''' - - cls = RF2001(options) - train_accuracy, test_accuracy = cls.train(data) - - if options.verb == 1: - print("----------------------") - print("Train accuracy: {0:.2f}".format(100. * train_accuracy)) - print("Test accuracy: {0:.2f}".format(100. * test_accuracy)) - print("----------------------") - - xrf = XRF(options, cls, data) - #xrf.test_tree_ensemble() - - bench_name = os.path.splitext(os.path.basename(options.files[0]))[0] - bench_dir_name = options.output + "/RF/" + bench_name - try: - os.stat(bench_dir_name) - except: - os.mkdir(bench_dir_name) - - basename = (os.path.join(bench_dir_name, bench_name + - "_nbestim_" + str(options.n_estimators) + - "_maxdepth_" + str(options.maxdepth))) - - modfile = basename + '.mod.pkl' - print("saving model to ", modfile) - pickle_save_file(modfile, cls) - - #data_suffix = '.splitdata.pkl' - #filename_data = basename + data_suffix - #print("saving data to ", filename_data) - #pickle_save_file(filename_data, data) - - - # read a sample from options.explain - #if options.explain: - # options.explain = [float(v.strip()) for v in options.explain.split(',')] - - ''' - if options.encode: - # encode it and save the encoding to another file - #xrf.encode(test_on=options.explain) - xrf.encode(options.explain) - ''' - if options.explain: - mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \ - resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss - - if not xrf: - print("loading model ...") - cls = pickle_load_file(options.files[1]) - xrf = XRF(options, cls, data) - - - #expl = xrf.explain(options.explain) - - #expl_checker = Checker(xrf.f, data.num_class, data.extended_feature_names_as_array_strings) - - cls.print_accuracy(data) # print test accuracy of the RF model - - samps_file = options.explain.strip() - print(samps_file) - with open(samps_file, 'r') as fp: - lines = fp.readlines() - - # timers - atimes = [] - lengths = [] - tested = set() - mSAT, mUNSAT = 0.0, 0.0 - stimes = [] - utimes = [] - nSatCalls = [] - nUnsCalls = [] - - ltimes = [] - ctimes = [] - 
wins = 0 - - for i, s in enumerate(lines): - sample = [float(v.strip()) for v in s.split(',')] - - if tuple(sample) in tested: - continue - - #print("inst#{0}".format(i+1)) - - tested.add(tuple(sample)) - #print('sample {0}: {1}'.format(i, ','.join(s.strip().split(',')))) - - xrf.encode(sample) - expl = xrf.explain(sample) - atimes.append(xrf.x.time) - lengths.append(len(expl)) - - nvars = xrf.enc.cnf.nv - nclauses = len(xrf.enc.cnf.clauses) - - #mSAT = max(xrf.x.stimes+[mSAT]) - #mUNSAT = max(xrf.x.utimes+[mUNSAT]) - if len(xrf.x.stimes): - stimes.append(max(xrf.x.stimes)) - if len(xrf.x.utimes): - utimes.append(max(xrf.x.utimes)) - nSatCalls.append(xrf.x.nsat) - nUnsCalls.append(xrf.x.nunsat) - - #inst = data.transform(np.array(sample))[0] - #expl_checker.check(np.array(inst), expl) - #####check_expl(np.array(inst), expl, xrf.enc.forest, xrf.enc.intvs) - - del xrf.enc - del xrf.x - - #####################LIME########### - ''' - _, ltime = lime_call(cls, data, sample, verbose=options.verb) # call lime - ltimes.append(ltime) - #wins += 1 - if atimes[-1] < ltime: - wins += 1 - ''' - - _, ctime = anchor_call(cls, data, sample, verbose=options.verb) # call lime - ctimes.append(ctime) - if atimes[-1] < ctime: - wins += 1 - - #if i == 1: - # break - - mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \ - resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss - mem - - - # reporting the time spent - print('') - print('tot time: {0:.2f}'.format(sum(atimes))) - print('max time: {0:.2f}'.format(max(atimes))) - print('min time: {0:.2f}'.format(min(atimes))) - print('avg time: {0:.2f}'.format(sum(atimes) / len(atimes))) - print('') - #### - print('avg length: {0:.0f}'.format(round(sum(lengths) / len(lengths))*100/len(sample))) - #print('max SAT: {0:.2f}'.format(mSAT)) - #print('max UNSAT: {0:.2f}'.format(mUNSAT)) - print('max SAT: {0:.2f}'.format(max(stimes))) - print('max UNSAT: {0:.2f}'.format(max(utimes))) - print('avg #SAT: {0:.0f}'.format(sum(nSatCalls) / len(nSatCalls))) - print('avg #UNSAT: {0:.0f}'.format(sum(nUnsCalls) / len(nUnsCalls))) - print('') - #reporting nof_vars and nof_clauses - print('c nof vars: {0}'.format(nvars)) - print('c nof clauses: {0}'.format(nclauses)) - # - print('c nof instances: {0}'.format(len(tested))) - print("c mem used: {0:.2f} Mb".format(mem/(1024*1024))) - - - # LIME runtimes - ''' - print('') - print('min time for Lime: {0:.2f}'.format(min(ltimes))) - print('avg time for Lime: {0:.2f}'.format(sum(ltimes) / len(ltimes))) - print('#wins {0} out of {1}'.format(wins, len(tested)) ) - ''' - - # Anchor runtimes - print('') - print('tot time for Anchor: {0:.2f}'.format(sum(ctimes))) - print('max time for Anchor: {0:.2f}'.format(max(ctimes))) - print('min time for Anchor: {0:.2f}'.format(min(ctimes))) - print('avg time for Anchor: {0:.2f}'.format(sum(ctimes) / len(ctimes))) - print('#wins {0} out of {1}'.format(wins, len(tested)) ) - \ No newline at end of file diff --git a/pages/application/RandomForest/utils/xrf/__init__.py b/pages/application/RandomForest/utils/xrf/__init__.py deleted file mode 100644 index 7ae37dec55e919f83a87d18f7c9271aa7baab3c2..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xrf/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -#from .encode import * -#from .tree import * -from .rndmforest import * -#from .checker import check_expl -from .checker import Checker \ No newline at end of file diff --git 
a/pages/application/RandomForest/utils/xrf/archive/build/lib.macosx-10.9-x86_64-3.8/pysortn.cpython-38-darwin.so b/pages/application/RandomForest/utils/xrf/archive/build/lib.macosx-10.9-x86_64-3.8/pysortn.cpython-38-darwin.so deleted file mode 100755 index 8037905889ba350ca148832a64d0dfd51f823953..0000000000000000000000000000000000000000 Binary files a/pages/application/RandomForest/utils/xrf/archive/build/lib.macosx-10.9-x86_64-3.8/pysortn.cpython-38-darwin.so and /dev/null differ diff --git a/pages/application/RandomForest/utils/xrf/archive/build/lib.macosx-10.9-x86_64-3.8/pysortnetwrk.cpython-38-darwin.so b/pages/application/RandomForest/utils/xrf/archive/build/lib.macosx-10.9-x86_64-3.8/pysortnetwrk.cpython-38-darwin.so deleted file mode 100755 index 7aa11a7e970e9b6d0a19149ddf63f889f952f4a7..0000000000000000000000000000000000000000 Binary files a/pages/application/RandomForest/utils/xrf/archive/build/lib.macosx-10.9-x86_64-3.8/pysortnetwrk.cpython-38-darwin.so and /dev/null differ diff --git a/pages/application/RandomForest/utils/xrf/archive/build/temp.macosx-10.9-x86_64-3.8/pysortn.o b/pages/application/RandomForest/utils/xrf/archive/build/temp.macosx-10.9-x86_64-3.8/pysortn.o deleted file mode 100644 index ba7953dbc2b8e701d652333c93f2d2aa33024d72..0000000000000000000000000000000000000000 Binary files a/pages/application/RandomForest/utils/xrf/archive/build/temp.macosx-10.9-x86_64-3.8/pysortn.o and /dev/null differ diff --git a/pages/application/RandomForest/utils/xrf/archive/build/temp.macosx-10.9-x86_64-3.8/pysortnetwrk.o b/pages/application/RandomForest/utils/xrf/archive/build/temp.macosx-10.9-x86_64-3.8/pysortnetwrk.o deleted file mode 100644 index 972595fc8d85029e1a21ad144b452cd34d8306b4..0000000000000000000000000000000000000000 Binary files a/pages/application/RandomForest/utils/xrf/archive/build/temp.macosx-10.9-x86_64-3.8/pysortnetwrk.o and /dev/null differ diff --git a/pages/application/RandomForest/utils/xrf/archive/encode.py b/pages/application/RandomForest/utils/xrf/archive/encode.py deleted file mode 100644 index 3478ebc15ec768b853dfbca892b966d2177bac58..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xrf/archive/encode.py +++ /dev/null @@ -1,276 +0,0 @@ - -from pysat.formula import CNF, IDPool -from pysat.solvers import Solver -from pysat.card import * -from itertools import combinations - -import collections -import six -from six.moves import range - -from .tree import Forest, predict_tree -from .sortnetwrk import HSorNetwrk - -# -#============================================================================== -class SATEncoder(object): - """ - Encoder of Random Forest classifier into SAT. - """ - - def __init__(self, forest, feats, nof_classes, extended_feature_names=None, from_file=None): - #self.model = model - self.forest = forest - self.feats = {f: i for i, f in enumerate(feats)} - self.num_class = nof_classes - self.vpool = IDPool() - #self.optns = xgb.options - self.extended_feature_names = extended_feature_names - - #encoding formula - self.cnf = None - - # for interval-based encoding - self.intvs, self.imaps, self.ivars = None, None, None - - #if from_file: - # self.load_from(from_file) - - def newVar(self, name): - assert(name) - - if name in self.vpool.obj2id: #var has been already created - return self.vpool.obj2id[name] - - var = self.vpool.id('{0}'.format(name)) - return var - - - def traverse(self, tree, k, clause): - """ - Traverse a tree and encode each node. 
- """ - - if tree.children: - var = self.newVar(tree.name) - #print("{0} => {1}".format(tree.name, var)) - pos, neg = var, -var - - self.traverse(tree.children[0], k, clause + [-neg]) # -var - self.traverse(tree.children[1], k, clause + [-pos]) # --var - else: # leaf node - cvar = self.newVar('class{0}_tr{1}'.format(tree.values,k)) - print('c: ', clause + [cvar]) - self.cnf.append(clause + [cvar]) - - - ''' - def encode_node(self, node): - """ - Encode a node of a tree. - """ - - if '_' not in node.name: - # continuous features => expecting an upper bound - # feature and its upper bound (value) - f, v = node.name, node.threshold - - existing = True if tuple([f, v]) in self.idmgr.obj2id else False - vid = self.idmgr.id(tuple([f, v])) - bv = Symbol('bvar{0}'.format(vid), typename=BOOL) - - if not existing: - if self.intvs: - d = self.imaps[f][v] + 1 - pos, neg = self.ivars[f][:d], self.ivars[f][d:] - self.enc.append(Iff(bv, Or(pos))) - self.enc.append(Iff(Not(bv), Or(neg))) - else: - fvar, fval = Symbol(f, typename=REAL), Real(v) - self.enc.append(Iff(bv, LT(fvar, fval))) - - return bv, Not(bv) - else: - # all features are expected to be categorical and - # encoded with one-hot encoding into Booleans - # each node is expected to be of the form: f_i < 0.5 - bv = Symbol(node.name, typename=BOOL) - - # left branch is positive, i.e. bv is true - # right branch is negative, i.e. bv is false - return Not(bv), bv - ''' - - - def compute_intervals(self): - """ - Traverse all trees in the ensemble and extract intervals for each - feature. - - At this point, the method only works for numerical datasets! - """ - - def traverse_intervals(tree): - """ - Auxiliary function. Recursive tree traversal. - """ - - if tree.children: - f = tree.name - v = tree.threshold - self.intvs[f].add(v) - - traverse_intervals(tree.children[0]) - traverse_intervals(tree.children[1]) - - # initializing the intervals - self.intvs = {'f{0}'.format(i): set([]) for i in range(len(self.feats))} - - for tree in self.forest.trees: - traverse_intervals(tree) - - # OK, we got all intervals; let's sort the values - self.intvs = {f: sorted(self.intvs[f]) + ['+'] for f in six.iterkeys(self.intvs)} - - self.imaps, self.ivars = {}, {} - for feat, intvs in six.iteritems(self.intvs): - self.imaps[feat] = {} - self.ivars[feat] = [] - for i, ub in enumerate(intvs): - self.imaps[feat][ub] = i - - ivar = Symbol(name='{0}_intv{1}'.format(feat, i), typename=BOOL) - self.ivars[feat].append(ivar) - - def encode(self, sample): - """ - Do the job. - """ - - self.cnf = CNF() - # getting a tree ensemble - #self.forest = Forest(self.model, self.extended_feature_names) - num_tree = len(self.forest.trees) - - # introducing class score variables - cvars = [[] for t in range(num_tree)] - for k in range(len(self.forest.trees)): - for j in range(self.num_class): - var = self.newVar('class{0}_tr{1}'.format(j,k)) - cvars[k].append(-var) - - # if targeting interval-based encoding, - # traverse all trees and extract all possible intervals - # for each feature - ''' - if self.optns.encode == 'smtbool': - self.compute_intervals() - ''' - - # traversing and encoding each tree - for k, tree in enumerate(self.forest.trees): - print("Encode tree#{0}".format(k)) - # encoding the tree - self.traverse(tree, k, []) - #exactly one class var is true this could could be squeezed - # more to reduce NB binary clauses!!!!!!! 
- enc = CardEnc.atmost(lits=[-v for v in cvars[k]], - vpool=self.vpool, - encoding=EncType.cardnetwrk) #AtMostOne constraint - self.cnf.extend(enc.clauses) - - - csum = [[] for c in range(self.num_class)] - for k, tree in enumerate(self.forest.trees): - c = predict_tree(tree, sample) - csum[c].append(k) - cvars[k][c] = - cvars[k][c] - - # encoding the majority - cmaj,_ = max(enumerate(csum), key=(lambda x: len(x[1]))) - sorted_lits = [[] for c in range(self.num_class)] - #sorting bits - for j in range(self.num_class): - tvars = [cvars[k][j] for k in range(num_tree)] - clauses, vout, _ = HSorNetwrk(lits=tvars, vpool = self.vpool) - self.cnf.extend(clauses) - sorted_lits[j] = vout - #print("tvars: {0} ==> {3} \nclauses: {1}\ntop: {2}".format(tvars, clauses, self.vpool.top, vout)) - #compare bits - for j in range(self.num_class): - if j == cmaj: - continue - for k in range(num_tree): - self.cnf.append([ -sorted_lits[j][k], sorted_lits[cmaj][k] ]) # (v1 => v2) - #print("-{0} => {1}".format(sorted_lits[j][k], sorted_lits[cmaj][k])) - - ''' - # enforce exactly one of the feature values to be chosen - # (for categorical features) - categories = collections.defaultdict(lambda: []) - for f in self.extended_feature_names: - if '_' in f: - categories[f.split('_')[0]].append(self.newVar(f)) - - for c, feats in six.iteritems(categories): - #ExactlyOne feat is True - self.cnf.append(feats) - enc = CardEnc.atmost(lits=feats, vpool=self.vpool, encoding=EncType.cardnetwrk) - self.cnf.extend(enc.clauses) - ''' - - #if self.optns.verb: - # number of variables - print('#vars:', self.cnf.nv) - # number of clauses - print('#clauses:', len(self.cnf.clauses)) - #print(self.cnf.clauses) - - return self.cnf, self.intvs, self.imaps, self.ivars - - ''' - def test_sample(self, sample): - """ - Check whether or not the encoding "predicts" the same class - as the classifier given an input sample. - """ - - # first, compute the scores for all classes as would be - # predicted by the classifier - - # score arrays computed for each class - csum = [[] for c in range(self.num_class)] - - #if self.optns.verb: - print('testing sample:', list(sample)) - - # traversing all trees - for i, tree in enumerate(self.forest.trees): - c = predict_tree(tree, sample) - csum[c].append(i) - - # encoding the majority - cmaj,_ = max(enumerate(csum), key=(lambda x: len(x[1]))) - - # second, get the scores computed with the use of the encoding - assert self.cnf, "There is no encoding." - - slv = Solver(name="minisat22") - slv.add_formula(self.cnf) - - - # asserting the sample - hypos = [] - - #for i, fval in enumerate(sample): - - ''' - - - def access(self): - """ - Get access to the encoding, features names, and the number of - classes. 
- """ - - return self.cnf, self.intvs, self.imaps, self.ivars, self.feats, self.num_class \ No newline at end of file diff --git a/pages/application/RandomForest/utils/xrf/archive/pysortnetwrk.cc b/pages/application/RandomForest/utils/xrf/archive/pysortnetwrk.cc deleted file mode 100644 index 8a44d9682946bf694983856d759abfaab3351f42..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xrf/archive/pysortnetwrk.cc +++ /dev/null @@ -1,248 +0,0 @@ - - -#define PY_SSIZE_T_CLEAN - -#include <setjmp.h> -#include <signal.h> -#include <stdio.h> -#include <Python.h> - -#include "sortcard.hh" - -using namespace std; - -// docstrings -//============================================================================= -static char module_docstring[] = "This module provides an interface for " - "encoding a few types of cardinality " - "constraints"; -//static char atmost_docstring[] = "Create an AtMost(k) constraint."; -//static char atleast_docstring[] = "Create an AtLeast(k) constraint."; -static char sortn_docstring[] = "Sort an array of bits."; - -static PyObject *CardError; -static jmp_buf env; - -// function declaration for functions available in module -//============================================================================= -extern "C" { - //static PyObject *py_encode_atmost (PyObject *, PyObject *); - //static PyObject *py_encode_atleast (PyObject *, PyObject *); - static PyObject *py_sortn (PyObject *, PyObject *); -} - -// module specification -//============================================================================= -static PyMethodDef module_methods[] = { - //{ "encode_atmost", py_encode_atmost, METH_VARARGS, atmost_docstring }, - //{ "encode_atleast", py_encode_atleast, METH_VARARGS, atleast_docstring }, - { "HSort", py_sortn, METH_VARARGS, sortn_docstring }, - - { NULL, NULL, 0, NULL } -}; - -extern "C" { - -// signal handler for SIGINT -//============================================================================= -static void sigint_handler(int signum) -{ - longjmp(env, -1); -} - -//#if PY_MAJOR_VERSION >= 3 // for Python3 -// PyInt_asLong() -//============================================================================= -static int pyint_to_cint(PyObject *i_obj) -{ - return PyLong_AsLong(i_obj); -} - -// PyInt_fromLong() -//============================================================================= -static PyObject *pyint_from_cint(int i) -{ - return PyLong_FromLong(i); -} - -// PyCapsule_New() -//============================================================================= -static PyObject *void_to_pyobj(void *ptr) -{ - return PyCapsule_New(ptr, NULL, NULL); -} - -// PyCapsule_GetPointer() -//============================================================================= -static void *pyobj_to_void(PyObject *obj) -{ - return PyCapsule_GetPointer(obj, NULL); -} - -// PyInt_Check() -//============================================================================= -static int pyint_check(PyObject *i_obj) -{ - return PyLong_Check(i_obj); -} - -// module initialization -//============================================================================= -static struct PyModuleDef module_def = { - PyModuleDef_HEAD_INIT, - "pysortnetwrk", /* m_name */ - module_docstring, /* m_doc */ - -1, /* m_size */ - module_methods, /* m_methods */ - NULL, /* m_reload */ - NULL, /* m_traverse */ - NULL, /* m_clear */ - NULL, /* m_free */ -}; - -/* -PyMODINIT_FUNC PyInit_pycard(void) -{ - PyObject *m = PyModule_Create(&module_def); - - if (m == NULL) - return NULL; - - 
CardError = PyErr_NewException((char *)"pycard.error", NULL, NULL); - Py_INCREF(CardError); - - if (PyModule_AddObject(m, "error", CardError) < 0) { - Py_DECREF(CardError); - return NULL; - } - - return m; -}*/ - -PyMODINIT_FUNC PyInit_pysortnetwrk(void) -{ - PyObject *m = PyModule_Create(&module_def); - - if (m == NULL) - return NULL; - - CardError = PyErr_NewException((char *)"pycard.error", NULL, NULL); - Py_INCREF(CardError); - - if (PyModule_AddObject(m, "error", CardError) < 0) { - Py_DECREF(CardError); - return NULL; - } - - return m; -} - - -// auxiliary function for translating an iterable to a vector<int> -//============================================================================= -static bool pyiter_to_vector(PyObject *obj, vector<int>& vect) -{ - PyObject *i_obj = PyObject_GetIter(obj); - - if (i_obj == NULL) { - PyErr_SetString(PyExc_RuntimeError, - "Object does not seem to be an iterable."); - return false; - } - - PyObject *l_obj; - while ((l_obj = PyIter_Next(i_obj)) != NULL) { - if (!pyint_check(l_obj)) { - Py_DECREF(l_obj); - Py_DECREF(i_obj); - PyErr_SetString(PyExc_TypeError, "integer expected"); - return false; - } - - int l = pyint_to_cint(l_obj); - Py_DECREF(l_obj); - - if (l == 0) { - Py_DECREF(i_obj); - PyErr_SetString(PyExc_ValueError, "non-zero integer expected"); - return false; - } - - vect.push_back(l); - } - - Py_DECREF(i_obj); - return true; -} - -// -//============================================================================= -static PyObject *py_sortn(PyObject *self, PyObject *args) -{ - - PyObject *av_obj; - //PyObject *cv_obj; - int top; - int zvar; - - //PyObject *lhs_obj; - //int rhs; - //int top; - //int enc; - int main_thread; - - if (!PyArg_ParseTuple(args, "Oiii", &av_obj, &top, &zvar, - &main_thread)) - return NULL; - - vector<int> av; - if (pyiter_to_vector(av_obj, av) == false) - return NULL; - - PyOS_sighandler_t sig_save; - if (main_thread) { - sig_save = PyOS_setsig(SIGINT, sigint_handler); - - if (setjmp(env) != 0) { - PyErr_SetString(CardError, "Caught keyboard interrupt"); - return NULL; - } - } - - // calling encoder - ClauseSet dest; - vector<int> cv; - sortn_half_sorter_recur(top, dest, av, cv, zvar); - //_encode_atmost(dest, lhs, rhs, top, enc); - - if (main_thread) - PyOS_setsig(SIGINT, sig_save); - - // creating the resulting clause set - PyObject *dest_obj = PyList_New(dest.size()); - for (size_t i = 0; i < dest.size(); ++i) { - PyObject *cl_obj = PyList_New(dest[i].size()); - - for (size_t j = 0; j < dest[i].size(); ++j) { - PyObject *lit_obj = pyint_from_cint(dest[i][j]); - PyList_SetItem(cl_obj, j, lit_obj); - } - - PyList_SetItem(dest_obj, i, cl_obj); - } - - PyObject *cv_obj = PyList_New(cv.size()); - for (size_t i = 0; i < cv.size(); ++i) { - PyObject *lit_obj = pyint_from_cint(cv[i]); - PyList_SetItem(cv_obj, i, lit_obj); - } - - PyObject *ret = Py_BuildValue("OOn", dest_obj, cv_obj, (Py_ssize_t)top); - Py_DECREF(dest_obj); - Py_DECREF(cv_obj); - - return ret; -} - - -} // extern "C" diff --git a/pages/application/RandomForest/utils/xrf/archive/pysortnetwrk.so b/pages/application/RandomForest/utils/xrf/archive/pysortnetwrk.so deleted file mode 100755 index 7aa11a7e970e9b6d0a19149ddf63f889f952f4a7..0000000000000000000000000000000000000000 Binary files a/pages/application/RandomForest/utils/xrf/archive/pysortnetwrk.so and /dev/null differ diff --git a/pages/application/RandomForest/utils/xrf/archive/rfc.py b/pages/application/RandomForest/utils/xrf/archive/rfc.py deleted file mode 100644 index 
411ad46c1e635664cd64695394bc2093710c2368..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xrf/archive/rfc.py +++ /dev/null @@ -1,636 +0,0 @@ -from sklearn.ensemble._voting import VotingClassifier -from sklearn.ensemble import RandomForestClassifier -from sklearn.preprocessing import OneHotEncoder, LabelEncoder -from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score -import numpy as np -import sys -import os -import resource - -import collections -from six.moves import range -import six - -from pages.application.RandomForest.utils.data import Data -from .tree import Forest, predict_tree -# from .encode import SATEncoder -from .sortnetwrk import HSorNetwrk -from pysat.formula import CNF, IDPool -from pysat.solvers import Solver -from pysat.card import CardEnc, EncType -from itertools import combinations - - -# -# ============================================================================== -class Dataset(Data): - """ - Class for representing dataset (transactions). - """ - - def __init__(self, filename=None, fpointer=None, mapfile=None, - separator=' ', use_categorical=False): - super().__init__(filename, fpointer, mapfile, separator, use_categorical) - - # split data into X and y - self.feature_names = self.names[:-1] - self.nb_features = len(self.feature_names) - self.use_categorical = use_categorical - samples = np.asarray(self.samps, dtype=np.float32) - self.X = samples[:, 0: self.nb_features] - self.y = samples[:, self.nb_features] - self.num_class = len(set(self.y)) - self.target_name = list(range(self.num_class)) - - print("c nof_features: {0}".format(self.nb_features)) - print("c nof_samples: {0}".format(len(self.samps))) - - # check if we have info about categorical features - if (self.use_categorical): - self.binarizer = {} - for i in self.categorical_features: - self.binarizer.update({i: OneHotEncoder(categories='auto', sparse=False)}) # , - self.binarizer[i].fit(self.X[:, [i]]) - else: - self.binarize = [] - # feat map - self.mapping_features() - - def train_test_split(self, test_size=0.2, seed=0): - return train_test_split(self.X, self.y, test_size=test_size, random_state=seed) - - def transform(self, x): - if (len(x) == 0): - return x - if (len(x.shape) == 1): - x = np.expand_dims(x, axis=0) - if (self.use_categorical): - assert (self.binarizer != []) - tx = [] - for i in range(self.nb_features): - self.binarizer[i].drop = None - if (i in self.categorical_features): - tx_aux = self.binarizer[i].transform(x[:, [i]]) - tx_aux = np.vstack(tx_aux) - tx.append(tx_aux) - else: - tx.append(x[:, [i]]) - tx = np.hstack(tx) - return tx - else: - return x - - def transform_inverse(self, x): - if (len(x) == 0): - return x - if (len(x.shape) == 1): - x = np.expand_dims(x, axis=0) - if (self.use_categorical): - assert (self.binarizer != []) - inverse_x = [] - for i, xi in enumerate(x): - inverse_xi = np.zeros(self.nb_features) - for f in range(self.nb_features): - if f in self.categorical_features: - nb_values = len(self.categorical_names[f]) - v = xi[:nb_values] - v = np.expand_dims(v, axis=0) - iv = self.binarizer[f].inverse_transform(v) - inverse_xi[f] = iv - xi = xi[nb_values:] - - else: - inverse_xi[f] = xi[0] - xi = xi[1:] - inverse_x.append(inverse_xi) - return inverse_x - else: - return x - - def transform_inverse_by_index(self, idx): - if (idx in self.extended_feature_names): - return self.extended_feature_names[idx] - else: - print("Warning there is no feature {} in the internal mapping".format(idx)) - 
return None - - def transform_by_value(self, feat_value_pair): - if (feat_value_pair in self.extended_feature_names.values()): - keys = ( - list(self.extended_feature_names.keys())[list(self.extended_feature_names.values()).index(feat_value_pair)]) - return keys - else: - print("Warning there is no value {} in the internal mapping".format(feat_value_pair)) - return None - - def mapping_features(self): - self.extended_feature_names = {} - self.extended_feature_names_as_array_strings = [] - counter = 0 - if (self.use_categorical): - for i in range(self.nb_features): - if (i in self.categorical_features): - for j, _ in enumerate(self.binarizer[i].categories_[0]): - self.extended_feature_names.update({counter: (self.feature_names[i], j)}) - self.extended_feature_names_as_array_strings.append( - "f{}_{}".format(i, j)) # str(self.feature_names[i]), j)) - counter = counter + 1 - else: - self.extended_feature_names.update({counter: (self.feature_names[i], None)}) - self.extended_feature_names_as_array_strings.append("f{}".format(i)) # (self.feature_names[i]) - counter = counter + 1 - else: - for i in range(self.nb_features): - self.extended_feature_names.update({counter: (self.feature_names[i], None)}) - self.extended_feature_names_as_array_strings.append("f{}".format(i)) # (self.feature_names[i]) - counter = counter + 1 - - def readable_sample(self, x): - readable_x = [] - for i, v in enumerate(x): - if (i in self.categorical_features): - readable_x.append(self.categorical_names[i][int(v)]) - else: - readable_x.append(v) - return np.asarray(readable_x) - - def test_encoding_transformes(self, X_train): - # test encoding - - X = X_train[[0], :] - - print("Sample of length", len(X[0]), " : ", X) - enc_X = self.transform(X) - print("Encoded sample of length", len(enc_X[0]), " : ", enc_X) - inv_X = self.transform_inverse(enc_X) - print("Back to sample", inv_X) - print("Readable sample", self.readable_sample(inv_X[0])) - assert ((inv_X == X).all()) - - ''' - for i in range(len(self.extended_feature_names)): - print(i, self.transform_inverse_by_index(i)) - for key, value in self.extended_feature_names.items(): - print(value, self.transform_by_value(value)) - ''' - - -# -# ============================================================================== -class VotingRF(VotingClassifier): - """ - Majority rule classifier - """ - - def fit(self, X, y, sample_weight=None): - self.estimators_ = [] - for _, est in self.estimators: - self.estimators_.append(est) - - self.le_ = LabelEncoder().fit(y) - self.classes_ = self.le_.classes_ - - def predict(self, X): - """Predict class labels for X. - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The input samples. - Returns - ------- - maj : array-like of shape (n_samples,) - Predicted class labels. - """ - # check_is_fitted(self) - - # 'hard' voting - predictions = self._predict(X) - predictions = np.asarray(predictions, np.int64) # NEED TO BE CHECKED - maj = np.apply_along_axis( - lambda x: np.argmax( - np.bincount(x, weights=self._weights_not_none)), - axis=1, arr=predictions) - - maj = self.le_.inverse_transform(maj) - - return maj - - -# -# ============================================================================== -class RF2001(object): - """ - The main class to train Random Forest Classifier (RFC). - """ - - def __init__(self, options, from_data=None, from_model=None): - """ - Constructor. 
- """ - self.forest = None - self.voting = None - self.opt = options - - param_dist = {'n_estimators': options.n_estimators, - 'max_depth': options.maxdepth} - - self.forest = RandomForestClassifier(**param_dist) - - def train(self, dataset, outfile=None): - """ - Train a random forest. - """ - - X_train, X_test, y_train, y_test = dataset.train_test_split() - - dataset.test_encoding_transformes(X_train) - X_train = dataset.transform(X_train) - X_test = dataset.transform(X_test) - - print("Build a random forest.") - self.forest.fit(X_train, y_train) - - rtrees = [('dt', dt) for i, dt in enumerate(self.forest.estimators_)] - self.voting = VotingRF(estimators=rtrees) - self.voting.fit(X_train, y_train) - - train_acc = accuracy_score(self.voting.predict(X_train), y_train) - ''' - print(X_test[[0],:]) - print("RF: ",np.asarray(self.voting.predict(X_test[[0],:]))) - for i,t in enumerate(self.forest.estimators_): - print("DT_{0}: {1}".format(i,np.asarray(t.predict(X_test[[0],:])))) - ''' - test_acc = accuracy_score(self.voting.predict(X_test), y_test) - print("----------------------") - print("RF2001:") - print("Train accuracy RF2001: {0:.2f}".format(100. * train_acc)) - print("Test accuracy RF2001: {0:.2f}".format(100. * test_acc)) - print("----------------------") - - train_acc = accuracy_score(self.forest.predict(X_train), y_train) - test_acc = accuracy_score(self.forest.predict(X_test), y_test) - print("RF-scikit:") - print("Train accuracy RF-scikit: {0:.2f}".format(100. * train_acc)) - print("Test accuracy RF-scikit: {0:.2f}".format(100. * test_acc)) - print("----------------------") - - return train_acc, test_acc - - def predict(self, X): - return self.voting.predict(X) - - def estimators(self): - assert (self.forest.estimators_ is not None) - return self.forest.estimators_ - - def n_estimators(self): - return self.forest.n_estimators - - -# -# ============================================================================== -class XRF(object): - """ - class to encode and explain Random Forest classifiers. - """ - - def __init__(self, options, model): - self.cls = model - self.f = Forest(model) - # self.f.print_tree() - self.verbose = options.verb - - def encode(self, data): - """ - Encode a tree ensemble trained previously. - """ - ########## - self.f = Forest(self.cls, data.extended_feature_names_as_array_strings) - self.f.print_tree() - ####### - self.sat_enc = SATEncoder(self.f, data.feature_names, data.num_class, - extended_feature_names=data.extended_feature_names_as_array_strings) - - _, X_test, _, y_test = data.train_test_split() - - inst = X_test[[1], :] - inst = data.transform(inst)[0] - self.sat_enc.encode(inst) - self.explain(inst, data) - - def explain(self, sample, data): - """ - Explain a prediction made for a given sample with a previously - trained RF. 
- """ - - preamble = None - if self.verbose: - inpvals = data.readable_sample(sample) - - preamble = [] - for f, v in zip(data.feature_names, inpvals): - if f not in v: - preamble.append('{0} = {1}'.format(f, v)) - else: - preamble.append(v) - - inps = data.extended_feature_names_as_array_strings # input (feature value) variables - # print("inps: {0}".format(inps)) - - if 'x' not in dir(self): - self.x = SATExplainer(self.sat_enc, inps, preamble, data.target_name) - - expl = self.x.explain(np.array(sample)) - - # returning the explanation - return expl - - def test_tree_ensemble(self, dataset): - _, X_test, _, y_test = dataset.train_test_split() - X_test = dataset.transform(X_test) - - y_pred_forest = self.f.predict(X_test) - acc = accuracy_score(y_pred_forest, y_test) - print("Test accuracy: {0:.2f}".format(100. * acc)) - - y_pred_cls = self.cls.predict(X_test) - # print(np.asarray(y_pred_cls, np.int64)) - # print(y_pred_forest) - - assert ((y_pred_cls == y_pred_forest).all()) - - -# -# ============================================================================== -class SATEncoder(object): - """ - Encoder of Random Forest classifier into SAT. - """ - - def __init__(self, forest, feats, nof_classes, extended_feature_names=None, from_file=None): - # self.model = model - self.forest = forest - self.feats = {f: i for i, f in enumerate(feats)} - self.num_class = nof_classes - self.vpool = IDPool() - # self.optns = xgb.options - self.extended_feature_names = extended_feature_names - - # encoding formula - self.cnf = None - - # for interval-based encoding - self.intvs, self.imaps, self.ivars = None, None, None - - # if from_file: - # self.load_from(from_file) - - def newVar(self, name): - assert (name) - - if name in self.vpool.obj2id: # var has been already created - return self.vpool.obj2id[name] - - var = self.vpool.id('{0}'.format(name)) - return var - - def traverse(self, tree, k, clause): - """ - Traverse a tree and encode each node. - """ - - if tree.children: - var = self.newVar(tree.name) - # print("{0} => {1}".format(tree.name, var)) - pos, neg = var, -var - - self.traverse(tree.children[0], k, clause + [-neg]) # -var - self.traverse(tree.children[1], k, clause + [-pos]) # --var - else: # leaf node - cvar = self.newVar('class{0}_tr{1}'.format(tree.values, k)) - # print('c: ', clause + [cvar]) - self.cnf.append(clause + [cvar]) - - def encode(self, sample): - """ - Do the job. - """ - - self.cnf = CNF() - # getting a tree ensemble - # self.forest = Forest(self.model, self.extended_feature_names) - num_tree = len(self.forest.trees) - - # introducing class variables - cvars = [self.newVar('class{0}'.format(i)) for i in range(self.num_tree)] - - # introducing class-tree variables - ctvars = [[] for t in range(num_tree)] - for k in range(num_tree): - for j in range(self.num_class): - var = self.newVar('class{0}_tr{1}'.format(j, k)) - ctvars[k].append(var) - - # if targeting interval-based encoding, - # traverse all trees and extract all possible intervals - # for each feature - ''' - if self.optns.encode == 'smtbool': - self.compute_intervals() - ''' - - # traversing and encoding each tree - for k, tree in enumerate(self.forest.trees): - # print("Encode tree#{0}".format(k)) - # encoding the tree - self.traverse(tree, k, []) - # exactly one class var is true this could could be squeezed - # more to reduce NB binary clauses!!!!!!! 
-            enc = CardEnc.atmost(lits=ctvars[k],
-                                 vpool=self.vpool,
-                                 encoding=EncType.cardnetwrk)  # AtMostOne constraint
-            self.cnf.extend(enc.clauses)
-
-        csum = [[] for c in range(self.num_class)]
-        for k, tree in enumerate(self.forest.trees):
-            c = predict_tree(tree, sample)
-            csum[c].append(k)
-
-        # encoding the majority
-        self.cmaj, _ = max(enumerate(csum), key=(lambda x: len(x[1])))
-        sorted_lits = [[] for c in range(self.num_class)]
-        # sorting bits
-        for j in range(self.num_class):
-            lits = [ctvars[k][j] for k in range(num_tree)]
-            clauses, vout, _ = HSorNetwrk(lits=lits, vpool=self.vpool)
-            self.cnf.extend(clauses)
-            sorted_lits[j] = vout
-            print("{0}:{2} => {1}".format(j, vout, lits))
-        # compare bits: no class j != cmaj may outvote cmaj
-        for j in list(range(self.cmaj)) + list(range(self.cmaj + 1, self.num_class)):
-            for k in range(num_tree):
-                self.cnf.append([-sorted_lits[j][k], sorted_lits[self.cmaj][k]])  # (v1 => v2)
-                # print("-{0} => {1}".format(sorted_lits[j][k], sorted_lits[self.cmaj][k]))
-
-        '''
-        # enforce exactly one of the feature values to be chosen
-        # (for categorical features)
-        categories = collections.defaultdict(lambda: [])
-        for f in self.extended_feature_names:
-            if '_' in f:
-                categories[f.split('_')[0]].append(self.newVar(f))
-
-        for c, feats in six.iteritems(categories):
-            # ExactlyOne feat is True
-            self.cnf.append(feats)
-            enc = CardEnc.atmost(lits=feats, vpool=self.vpool, encoding=EncType.cardnetwrk)
-            self.cnf.extend(enc.clauses)
-        '''
-        for cl in self.cnf:
-            print("{0} == {1}".format(cl,
-                                      [self.vpool.obj(abs(p)) if p > 0 else "!" + str(self.vpool.obj(abs(p))) for p in
-                                       cl]))
-
-        # if self.optns.verb:
-        # number of variables
-        print('#vars:', self.cnf.nv)
-        # number of clauses
-        print('#clauses:', len(self.cnf.clauses))
-        # print(self.cnf.clauses)
-
-        return self.cnf, self.intvs, self.imaps, self.ivars
-
-
-#
-# ==============================================================================
-class SATExplainer(object):
-    """
-    A SAT-inspired minimal explanation extractor for Random Forest models.
-    """
-
-    def __init__(self, sat_enc, inps, preamble, target_name):
-        """
-        Constructor.
-        """
-
-        self.enc = sat_enc
-        # self.optns = options
-        self.inps = inps  # input (feature value) variables
-        self.target_name = target_name
-        self.preamble = preamble
-
-        self.verbose = True  # self.optns.verb
-        # self.slv = Solver(name=options.solver)
-        self.slv = Solver(name="minisat22")
-
-        # CNF formula
-        self.slv.append_formula(self.enc.cnf)
-
-        # current selector
-        # self.selv = None
-
-    def explain(self, sample, smallest=False):
-        """
-        Hypotheses minimization.
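-
-        Each feature is guarded by a selector variable: assuming a
-        selector activates the unit clauses fixing that feature to its
-        observed value.  Solving under all selectors and then relaxing
-        them one at a time (see compute_minimal) leaves a
-        subset-minimal set of features that still entails the
-        prediction.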
- """ - if self.verbose: - print( - ' explaining: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.target_name[self.enc.cmaj])) - - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - # adapt the solver to deal with the current sample - self.assums = [] # var selectors to be used as assumptions - self.sel2fid = {} # selectors to original feature ids - self.sel2vid = {} # selectors to categorical feature ids - - # preparing the selectors - for i, (inp, val) in enumerate(zip(self.inps, sample), 1): - feat = inp.split('_')[0] - selv = self.enc.newVar('selv_{0}'.format(feat)) - - self.assums.append(selv) - if selv not in self.sel2fid: - self.sel2fid[selv] = int(feat[1:]) - self.sel2vid[selv] = [i - 1] - else: - self.sel2vid[selv].append(i - 1) - - if not self.enc.intvs: - for inp, val, sel in zip(self.inps, sample, self.assums): - p = self.enc.newVar(inp) - hypo = [-sel, p if val else -p] - print("{0} => {1}".format(self.enc.vpool.obj(sel), inp if val else "!" + inp)) - self.slv.add_clause(hypo) - else: - raise NotImplementedError('Intervals are not supported.') - - self.assums = sorted(set(self.assums)) - # print("selctors: ", self.assums) - - self.slv.solve(assumptions=self.assums) - print("Model1:") - for p in self.slv.get_model(): - # if self.enc.vpool.obj(abs(p)) :and self.enc.vpool.obj(abs(p)) in self.inps: - if self.enc.vpool.obj(abs(p)) and "class" in self.enc.vpool.obj(abs(p)): - print((p, self.enc.vpool.obj(abs(p)))) - print(self.slv.get_model()) - - # forcing a misclassification, i.e. a wrong observation - for k in range(len(self.enc.forest.trees)): - cl = [] - for j in range(self.enc.num_class): - if j != self.enc.cmaj: - cl.append(self.enc.newVar('class{0}_tr{1}'.format(j, k))) - self.slv.add_clause(cl) - - # if satisfiable, then the observation is not implied by the hypotheses - if self.slv.solve(assumptions=self.assums): - print(' no implication!') - print(self.slv.get_model()) - # print("Model: {0}".format([ (p, self.enc.vpool.obj(abs(p))) for p in self.slv.get_model()])) - sys.exit(1) - - if not smallest: - self.compute_minimal() - else: - raise NotImplementedError('Smallest explanation is not yet implemented.') - # self.compute_smallest() - - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.time - - expl = sorted([self.sel2fid[h] for h in self.assums if h > 0]) - print("expl-selctors: ", expl) - - if self.verbose: - self.preamble = [self.preamble[i] for i in expl] - # print("cmaj: ", self.enc.cmaj) - print( - ' explanation: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.target_name[self.enc.cmaj])) - print(' # hypos left:', len(self.assums)) - print(' time: {0:.2f}'.format(self.time)) - - return expl - - def compute_minimal(self): - """ - Compute any subset-minimal explanation. 
- """ - i = 0 - # simple deletion-based linear search - for i, p in enumerate(self.assums): - to_test = self.assums[:i] + self.assums[(i + 1):] + [-p] - # print(to_test) - if self.slv.solve(assumptions=to_test): - self.assums[i] = -p - print("Model:") - for p in self.slv.get_model(): - if self.enc.vpool.obj(abs(p)) and self.enc.vpool.obj(abs(p)) in self.inps: - print((p, self.enc.vpool.obj(abs(p)))) diff --git a/pages/application/RandomForest/utils/xrf/archive/setup.py b/pages/application/RandomForest/utils/xrf/archive/setup.py deleted file mode 100644 index 15c642788ddbea871d9a2a51fa61101d62230808..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xrf/archive/setup.py +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env python3 - -from distutils.core import setup, Extension - -pysortn_ext = Extension('pysortnetwrk', - sources=['pysortnetwrk.cc'], - include_dirs=['sortcard'], - language='c++') - -setup(name='pysortnetwrk', - version='1.0', - description='This module provides a sorting network to sort a vector of bits', - py_modules=['pysortnetwrk'], - ext_modules=[pysortn_ext]) diff --git a/pages/application/RandomForest/utils/xrf/archive/sortcard.hh b/pages/application/RandomForest/utils/xrf/archive/sortcard.hh deleted file mode 100644 index e6410504144682d39d43fcd172ce1c0c10a58f53..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xrf/archive/sortcard.hh +++ /dev/null @@ -1,298 +0,0 @@ - -#ifndef SORTCARD_HH_ -#define SORTCARD_HH_ - -#include <vector> -#include <algorithm> -#include <vector> -#include <ostream> - - -#define NOPTCLS true - - -using namespace std; - -class ClauseSet { -public: - ClauseSet() : clauses(0) {} - ClauseSet(ClauseSet& orig) : clauses(orig.clauses) {} - - void clear() - { - clauses.clear(); - } - - size_t size() - { - return clauses.size(); - } - - void resize(size_t sz_new) - { - return clauses.resize(sz_new); - } - - vector<int>& operator[](size_t i) - { - return clauses[i]; - } - - void erase(vector<int>& cl) - { - clauses.erase(std::find(clauses.begin(), clauses.end(), cl)); - } - - void erase_subset(size_t start, ClauseSet& clset) - { - if (clset.size()) { - vector<int>& cl_first = clset[0]; - vector<vector<int> >::iterator begin = std::find(clauses.begin() + start, clauses.end(), cl_first); - clauses.erase(begin, begin + clset.size()); - } - } - - vector<vector<int> >& get_clauses() - { - return clauses; - } - - void add_clause(vector<int> cl) - { - clauses.push_back(cl); - } - - void add_clause_ref(vector<int>& cl) - { - clauses.push_back(cl); - } - - void create_clause(vector<int>& cl) - { - add_clause(cl); - } - - void create_unit_clause(int l) - { - vector<int> cl; cl.push_back(l); - clauses.push_back(cl); - } - - void create_binary_clause(int l1, int l2) - { - vector<int> cl; - cl.push_back(l1); - cl.push_back(l2); - - clauses.push_back(cl); - } - - void create_ternary_clause(int l1, int l2, int l3) - { - vector<int> cl; - cl.push_back(l1); - cl.push_back(l2); - cl.push_back(l3); - - clauses.push_back(cl); - } - - void dump(ostream& out) - { - for (size_t i = 0; i < clauses.size(); ++i) - dump_clause(out, clauses[i]); - } -private: - void dump_clause(ostream& out, vector<int>& cl) - { - for (size_t i = 0; i < cl.size(); ++i) - out << cl[i] << " "; - out << "0" << endl; - } -protected: - vector<vector<int> > clauses; -}; - - - -// -//============================================================================= -inline void create_vvect(int& top_id, vector<int>& ov, size_t nvars) -{ - 
assert(nvars > 0); - - size_t refnv = ov.size(); - size_t tvars = refnv + nvars; - ov.resize(tvars, 0); - - for (size_t k = refnv; k < tvars; ++k) - ov[k] = ++top_id; - - assert(ov.size() > 0); -} - - -// -//============================================================================= -inline void copy_vvect(int& top_id, vector<int>& ov, vector<int>& iv) -{ - size_t refnv = ov.size(); - ov.resize(refnv + iv.size(), 0); - - for (size_t k = 0; k < iv.size(); ++k) - ov[refnv + k] = iv[k]; - - assert(ov.size() > 0); -} - - - -// -//============================================================================= -inline void mk_half_vect(vector<int>& ov, vector<int>& iv, size_t offset) -{ - assert(iv.size() > 0); - - size_t ns = iv.size() / 2; - ov.resize(ns, 0); - - for (size_t k = 0; k < ns; ++k) - ov[k] = iv[offset + k]; -} - -// -//============================================================================= -inline void mk_odd_vect(vector<int>& ov, vector<int>& iv) -{ - assert(iv.size() > 0); - - size_t ns = iv.size() / 2; - ov.resize(ns, 0); - - for (size_t k = 0; k < ns; ++k) - ov[k] = iv[k * 2]; -} - -//============================================================================= -inline void mk_even_vect(vector<int>& ov, vector<int>& iv) -{ - assert(iv.size() > 0); - - size_t ns = iv.size() / 2; - ov.resize(ns, 0); - - for (size_t k = 0; k < ns; ++k) - ov[k] = iv[k * 2 + 1]; -} - -// sorting networks -//============================================================================= -inline void sortn_half_merge_recur( - int& top_id, - ClauseSet& clset, - vector<int>& av, - vector<int>& bv, - vector<int>& cv, - size_t zvar -) -{ - assert(bv.size() == av.size()); - - if (av.size() == 1) { // vectors of size 1 - assert(av[0] != 0); - if (NOPTCLS || (av[0] != zvar && bv[0] != zvar)) { - create_vvect(top_id, cv, 2); - clset.create_binary_clause (-av[0], cv[0]); - clset.create_binary_clause (-bv[0], cv[0]); - clset.create_ternary_clause(-av[0], -bv[0], cv[1]); - } - else { - if (av[0] == zvar) { - cv.push_back(bv[0]); - cv.push_back(av[0]); - } - else { - assert(bv[0] == zvar); - cv.push_back(av[0]); - cv.push_back(bv[0]); - } - } - } - else { - if (NOPTCLS || - ((av[0] != zvar || av[av.size() - 1] != zvar) && - (bv[0] != zvar || bv[av.size() - 1] != zvar))) { - vector<int> aodd, aeven, bodd, beven, dv, ev; - - mk_odd_vect(aodd, av); mk_even_vect(aeven, av); - mk_odd_vect(bodd, bv); mk_even_vect(beven, bv); - - sortn_half_merge_recur(top_id, clset, aodd, bodd, dv, zvar); - sortn_half_merge_recur(top_id, clset, aeven, beven, ev, zvar); - - assert(cv.size() == 0); - cv.push_back(dv[0]); - create_vvect(top_id, cv, 2 * av.size() - 2); - cv.push_back(ev[ev.size() - 1]); - - for (size_t i = 0; i < av.size() - 1; ++i) { - assert(i + 1 < dv.size()); - assert(i < ev.size()); - assert(2 * 1 + 1 < cv.size()); - - clset.create_binary_clause (-dv[i + 1], cv[2 * i + 1]); - clset.create_binary_clause (-ev[i ], cv[2 * i + 1]); - clset.create_ternary_clause(-dv[i + 1], -ev[i], cv[2 * i + 2]); - } - } - else { - if (av[0] == zvar && av[av.size() - 1] == zvar) { - copy_vvect(top_id, cv, bv); - copy_vvect(top_id, cv, av); - } - else { - assert(bv[0] == zvar && bv[av.size() - 1] == zvar); - copy_vvect(top_id, cv, av); - copy_vvect(top_id, cv, bv); - } - } - } - - assert(cv.size() > 0); -} - -// -//============================================================================= -inline vector<int>& sortn_half_sorter_recur( - int& top_id, - ClauseSet& clset, - vector<int>& av, - vector<int>& cv, - size_t zvar -) -{ - 
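-    // Batcher-style half-sorter: the input vector (padded elsewhere to a
-    // power-of-two length with the dummy literal zvar) is split into two
-    // halves, each half is sorted recursively, and the two sorted halves
-    // are combined with sortn_half_merge_recur above, which emits the
-    // binary/ternary comparator clauses into clset.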
assert(av.size() > 1); - - if (av.size() == 2) { - assert(av[0] != 0 && av[1] != 0); - - vector<int> xav, xbv; xav.push_back(av[0]); xbv.push_back(av[1]); - sortn_half_merge_recur(top_id, clset, xav, xbv, cv, zvar); - } - else { - vector<int> dv1, dv2, lav, uav; - mk_half_vect(lav, av, 0); - mk_half_vect(uav, av, av.size() / 2); - - assert(lav.size() == uav.size()); - sortn_half_sorter_recur(top_id, clset, lav, dv1, zvar); assert(dv1.size() > 0); - sortn_half_sorter_recur(top_id, clset, uav, dv2, zvar); assert(dv2.size() > 0); - sortn_half_merge_recur (top_id, clset, dv1, dv2, cv, zvar); - } - - assert(cv.size() > 0); - return cv; -} - - -#endif // SORTCARD_HH_ diff --git a/pages/application/RandomForest/utils/xrf/archive/sortnetwrk.py b/pages/application/RandomForest/utils/xrf/archive/sortnetwrk.py deleted file mode 100644 index f18c26ece6ad9804c0d40f4e584df7e149b075c0..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xrf/archive/sortnetwrk.py +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env python3 -# -from math import exp, log, trunc - -from pysat._utils import MainThread -from pysat.formula import CNF, IDPool -import pysortnetwrk - - - -def HSorNetwrk(lits, top_id=None, vpool=None): - assert not top_id or not vpool, \ - 'Use either a top id or a pool of variables but not both.' - - - # we are going to return this formula - #ret = CNF() - - # if the list of literals is empty, return empty formula - if not lits: - return ret - - # obtaining the top id from the variable pool - if vpool: - top_id = vpool.top - - - # making sure we are dealing with a list of literals - lits = list(lits) - - # choosing the maximum id among the current top and the list of literals - top_id = max(map(lambda x: abs(x), lits + [top_id if top_id != None else 0])) - - - nvars = len(lits) - - #get smallest power of 2 larger than number of vars - exponent = trunc(log(nvars) / log(2)) # assume exponent - nvtmp = exp(log(2) * exponent) - - # check if number of vars already is power of 2; correct exponent if required - exponent = exponent if (nvars - nvtmp < 0.000001) else exponent + 1 - nnvars = trunc(exp(log(2) * exponent) + 0.1) - - cl = None - zvar = 0 - if (nnvars != nvars): - top_id += 1 - zvar = top_id - lits.extend([zvar] * (nnvars - nvars)) - cl = [-zvar] - - # generate odd-even sorting network - clset,slits,top = pysortnetwrk.HSort(lits, top_id, zvar, int(MainThread.check())) - - clset = clset +[cl] if (cl is not None) else clset - - - # updating vpool if necessary - if vpool: - vpool.top = top - 1 - vpool._next() - - - - return clset, slits, top - -if __name__ == '__main__': - print("Sorting Network:") - lits = [1, 2, 3] - top_id = 5 - clauses, slits, top = HSorNetwrk(lits, top_id) - print(clauses) - print(slits) \ No newline at end of file diff --git a/pages/application/RandomForest/utils/xrf/archive/tree.py b/pages/application/RandomForest/utils/xrf/archive/tree.py deleted file mode 100644 index 9e3794544ac8d0e2baded3a4271bab312f4aa86e..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xrf/archive/tree.py +++ /dev/null @@ -1,154 +0,0 @@ -# -#============================================================================== -from anytree import Node, RenderTree,AsciiStyle -import json -import numpy as np -import math -import os - - -# -#============================================================================== -class xgnode(Node): - def __init__(self, id, parent = None): - Node.__init__(self, id, parent) - self.id = id # The node value - 
self.name = None - self.left_node_id = -1 # Left child - self.right_node_id = -1 # Right child - - self.feature = -1 - self.threshold = None - self.values = -1 - #iai - #self.split = None - - def __str__(self): - pref = ' ' * self.depth - if (len(self.children) == 0): - return (pref+ "leaf: {} {}".format(self.id, self.values)) - else: - if(self.name is None): - return (pref+ "{} f{}<{}".format(self.id, self.feature, self.threshold)) - else: - return (pref+ "{} \"{}\"<{}".format(self.id, self.name, self.threshold)) - - -#============================================================================== -def build_tree(tree_, feature_names = None): - ## - feature = tree_.feature - threshold = tree_.threshold - values = tree_.value - n_nodes = tree_.node_count - children_left = tree_.children_left - children_right = tree_.children_right - node_depth = np.zeros(shape=n_nodes, dtype=np.int64) - is_leaf = np.zeros(shape=n_nodes, dtype=bool) - stack = [(0, -1)] # seed is the root node id and its parent depth - while len(stack) > 0: - node_id, parent_depth = stack.pop() - node_depth[node_id] = parent_depth + 1 - - # If we have a test node - if (children_left[node_id] != children_right[node_id]): - stack.append((children_left[node_id], parent_depth + 1)) - stack.append((children_right[node_id], parent_depth + 1)) - else: - is_leaf[node_id] = True - ## - - m = tree_.node_count - assert (m > 0), "Empty tree" - - def extract_data(idx, root = None, feature_names = None): - i = idx - assert (i < m), "Error index node" - if (root is None): - node = xgnode(i) - else: - node = xgnode(i, parent = root) - #node.cover = json_node["cover"] - if is_leaf[i]: - node.values = np.argmax(values[i]) - #if(inverse): - # node.values = -node.values - else: - node.feature = feature[i] - if (feature_names is not None): - node.name = feature_names[feature[i]] - node.threshold = threshold[i] - node.left_node_id = children_left[i] - node.right_node_id = children_right[i] - extract_data(node.left_node_id, node, feature_names) #feat < 0.5 (False) - extract_data(node.right_node_id, node, feature_names) #feat > 0.% (True) - - return node - - root = extract_data(0, None, feature_names) - - return root - - -#============================================================================== -def walk_tree(node): - if (len(node.children) == 0): - # leaf - print(node) - else: - print(node) - walk_tree(node.children[0]) - walk_tree(node.children[1]) - -# -#============================================================================== -def predict_tree(node, sample): - if (len(node.children) == 0): - # leaf - return node.values - else: - feature_branch = node.feature - sample_value = sample[feature_branch] - assert(sample_value is not None) - if(sample_value < node.threshold): - return predict_tree(node.children[0], sample) - else: - return predict_tree(node.children[1], sample) - - -# -#============================================================================== -class Forest: - """ An ensemble of decision trees. - - This object provides a common interface to many different types of models. 
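-
-    Prediction is a hard majority vote over the per-tree predictions;
-    a minimal sketch of the voting step used in predict() below:
-
-        import numpy as np
-
-        scores = np.array([[0, 1, 1], [2, 0, 0]])  # votes of 3 trees for 2 samples
-        maj = np.apply_along_axis(lambda x: np.argmax(np.bincount(x)),
-                                  axis=1, arr=scores)
-        # maj == array([1, 0])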
- """ - def __init__(self, rf, feature_names = None): - self.rf = rf - ##self.feature_names = feature_names - self.trees = [ build_tree(dt.tree_, feature_names) for dt in self.rf.estimators()] - #self.print_trees() - - def print_trees(self): - for i,t in enumerate(self.trees): - print("tree number: ", i) - walk_tree(t) - - def predict(self, samples): - predictions = [] - n_estimators = self.rf.n_estimators() - print("#Trees: ", n_estimators) - for sample in np.asarray(samples): - scores = [] - for i,t in enumerate(self.trees): - s = predict_tree(t, sample) - scores.append((s)) - scores = np.asarray(scores) - predictions.append(scores) - predictions = np.asarray(predictions) - #print(predictions) - #np.bincount(x, weights=self._weights_not_none) - maj = np.apply_along_axis(lambda x: np.argmax(np.bincount(x)), axis=1, arr=predictions) - - return maj - diff --git a/pages/application/RandomForest/utils/xrf/checker.py b/pages/application/RandomForest/utils/xrf/checker.py deleted file mode 100644 index 5fb8650613bc4fe4e8d9d033e71476729a051164..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xrf/checker.py +++ /dev/null @@ -1,346 +0,0 @@ -# -#============================================================================== -import numpy as np -import math - -from .tree import Forest, dt_node -import six -from pysat.formula import CNF, IDPool -from pysat.solvers import Solver -from pysat.card import CardEnc, EncType -#from itertools import combinations - -# -#============================================================================== -def predict_tree(node, sample): - if (len(node.children) == 0): - # leaf - return node.values - else: - feat = node.feature - sample_value = sample[feat] - if sample_value is None: - return predict_tree(node.children[0], sample) - elif(sample_value < node.threshold): - return predict_tree(node.children[0], sample) - else: - return predict_tree(node.children[1], sample) - - -# -#============================================================================== -class Checker: - - def __init__(self, forest, num_class, feature_names): - self.forest = forest - self.num_class = num_class - self.feature_names = feature_names - self.cnf = None - self.vpool = IDPool() - - - self.intvs = None - self.intvs = {'{0}'.format(f): set([]) for f in feature_names if '_' not in f} - for tree in self.forest.trees: - self.traverse_intervals(tree) - self.intvs = {f: sorted(self.intvs[f]) + - ([math.inf] if len(self.intvs[f]) else []) - for f in six.iterkeys(self.intvs)} - self.imaps, self.ivars = {}, {} - self.thvars = {} - for feat, intvs in six.iteritems(self.intvs): - self.imaps[feat] = {} - self.ivars[feat] = [] - self.thvars[feat] = [] - for i, ub in enumerate(intvs): - self.imaps[feat][ub] = i - ivar = self.newVar('{0}_intv{1}'.format(feat, i)) - self.ivars[feat].append(ivar) - if ub != math.inf: - thvar = self.newVar('{0}_th{1}'.format(feat, i)) - self.thvars[feat].append(thvar) - - - self.cnf = CNF() - #### - cvars = [self.newVar('class{0}'.format(i)) for i in range(num_class)] - num_tree = len(self.forest.trees) - ctvars = [[] for t in range(num_tree)] - for k in range(num_tree): - for j in range(self.num_class): - var = self.newVar('class{0}_tr{1}'.format(j,k)) - ctvars[k].append(var) - ##### - for k, tree in enumerate(self.forest.trees): - self.traverse(tree, k, []) - card = CardEnc.atmost(lits=ctvars[k], vpool=self.vpool,encoding=EncType.cardnetwrk) - self.cnf.extend(card.clauses) - ###### - for f, intvs in six.iteritems(self.ivars): - if not 
len(intvs): - continue - self.cnf.append(intvs) - card = CardEnc.atmost(lits=intvs, vpool=self.vpool, encoding=EncType.cardnetwrk) - self.cnf.extend(card.clauses) - for f, threshold in six.iteritems(self.thvars): - for j, thvar in enumerate(threshold): - d = j+1 - pos, neg = self.ivars[f][d:], self.ivars[f][:d] - if j == 0: - self.cnf.append([thvar, neg[-1]]) - self.cnf.append([-thvar, -neg[-1]]) - else: - self.cnf.append([thvar, neg[-1], -threshold[j-1]]) - self.cnf.append([-thvar, threshold[j-1]]) - self.cnf.append([-thvar, -neg[-1]]) - - if j == len(threshold) - 1: - self.cnf.append([-thvar, pos[0]]) - self.cnf.append([thvar, -pos[0]]) - else: - self.cnf.append([-thvar, pos[0], threshold[j+1]]) - self.cnf.append([thvar, -pos[0]]) - self.cnf.append([thvar, -threshold[j+1]]) - - - def newVar(self, name): - if name in self.vpool.obj2id: #var has been already created - return self.vpool.obj2id[name] - var = self.vpool.id('{0}'.format(name)) - return var - - def traverse(self, tree, k, clause): - if tree.children: - f = tree.name - v = tree.threshold - pos = neg = [] - if f in self.intvs: - d = self.imaps[f][v] - pos, neg = self.thvars[f][d], -self.thvars[f][d] - else: - var = self.newVar(tree.name) - pos, neg = var, -var - self.traverse(tree.children[0], k, clause + [-neg]) - self.traverse(tree.children[1], k, clause + [-pos]) - else: # leaf node - cvar = self.newVar('class{0}_tr{1}'.format(tree.values,k)) - self.cnf.append(clause + [cvar]) - #self.printLits(clause + [cvar]) - - - - def traverse_intervals(self, tree): - if tree.children: - f = tree.name - v = tree.threshold - if f in self.intvs: - self.intvs[f].add(v) - self.traverse_intervals(tree.children[0]) - self.traverse_intervals(tree.children[1]) - - - def check(self, sample, expl): - print("check PI-expl") - slv = Solver(name="glucose3") - slv.append_formula(self.cnf) - - pred = self.forest.predict_inst(sample) - num_tree = len(self.forest.trees) - ##### - cvars = [self.newVar('class{0}'.format(i)) for i in range(self.num_class)] - ctvars = [[] for t in range(num_tree)] - for k in range(num_tree): - for j in range(self.num_class): - var = self.newVar('class{0}_tr{1}'.format(j,k)) - ctvars[k].append(var) - # - rhs = num_tree - 1 - for j in range(pred): - lhs = [ctvars[k][j] for k in range(num_tree)] + [ - ctvars[k][pred] for k in range(num_tree)] - atms = CardEnc.atmost(lits = lhs, bound = rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) - #add maj class selector to activate/deactivate eq atmsk - #self.cnf.extend([cl + [-cvars[pred]] for cl in atms]) - slv.append_formula([cl + [-cvars[pred]] for cl in atms]) - rhs = num_tree - for j in range(pred + 1, self.num_class): - lhs = [ctvars[k][j] for k in range(num_tree)] + [ - ctvars[k][pred] for k in range(num_tree)] - atms = CardEnc.atmost(lits = lhs, bound = rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) - #self.cnf.extend([cl + [-cvars[pred]] for cl in atms]) - slv.append_formula([cl + [-cvars[pred]] for cl in atms]) - ######## - ######## - rhs = num_tree - for j in range(pred): - lhs = [ - ctvars[k][j] for k in range(num_tree)] + [ctvars[k][pred] for k in range(num_tree)] - atms = CardEnc.atmost(lits = lhs, bound = rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) - #self.cnf.extend([cl+[-cvars[j]] for cl in atms]) - slv.append_formula([cl+[-cvars[j]] for cl in atms]) - rhs = num_tree - 1 - for j in range(pred + 1, self.num_class): - lhs = [ - ctvars[k][j] for k in range(num_tree)] + [ctvars[k][pred] for k in range(num_tree)] - atms = CardEnc.atmost(lits = lhs, bound = rhs, 
vpool=self.vpool, encoding=EncType.cardnetwrk) - #self.cnf.extend([cl+[-cvars[j]] for cl in atms]) - slv.append_formula([cl+[-cvars[j]] for cl in atms]) - ############ - #self.cnf.append(cvars) - card = CardEnc.atmost(lits=cvars, vpool=self.vpool, encoding=EncType.cardnetwrk) #AtMostOne constraint - #self.cnf.extend(card.clauses) - slv.add_clause(cvars) - slv.append_formula(card.clauses) - - assums = [] # var selectors to be used as assumptions - #sel2fid = {} # selectors to original feature ids - #sel2vid = {} # selectors to categorical feature ids - #sel2v = {} # selectors to (categorical/interval) values - sel_expl = [] - - #inps = ['f{0}'.format(f) for f in range(len(sample))] # works only with pure continuous feats - inps = self.feature_names - - for i, (inp, val) in enumerate(zip(inps, sample)): - if len(self.intvs[inp]): - v = next((intv for intv in self.intvs[inp] if intv > val), None) - assert(v is not None) - selv = self.newVar('selv_{0}'.format(inp)) - assums.append(selv) - ## - if i in expl: - sel_expl.append(selv) - #print('{0}={1}'.format('selv_{0}'.format(inp), val)) - ## - for j,p in enumerate(self.ivars[inp]): - cl = [-selv] - if j == self.imaps[inp][v]: - cl += [p] - #self.sel2v[selv] = p - else: - cl += [-p] - #self.cnf.append(cl) - slv.add_clause(cl) - assums = sorted(set(assums)) - #print(sel_expl, assums) - sel_pred = cvars[pred] - - #slv = Solver(name="glucose3") - #slv.append_formula(self.cnf) - - - assert (slv.solve(assumptions=sel_expl+[sel_pred])), '{0} is not an explanation.'.format(expl) - print('expl:{0} is valid'.format(expl)) - - for i, p in enumerate(sel_expl): - #print(i,p) - to_test = sel_expl[:i] + sel_expl[(i + 1):] + [-sel_pred] - print(to_test) - assert slv.solve(assumptions=to_test), '{0} is not minimal explanation.'.format(expl) - - # delete sat solver - slv.delete() - slv = None - - print('expl:{0} is minimal'.format(expl)) - print() - - -def check_expl(sample, expl, forest, intvs): - - print("check PI-expl") - - pred = forest.predict_inst(sample) - - sample_expl = [None]*len(sample) - for p in expl: - sample_expl[p] = sample[p] - - # initializing the intervals - #intvs = {'f{0}'.format(f): set([]) for f in range(len(sample))} - #for tree in forest.trees: - # traverse_intervals(tree) - - # first, check if expl is an explanation - scores = [predict_tree(dt, sample_expl) for dt in forest.trees] - scores = np.asarray(scores) - maj = np.argmax(np.bincount(scores)) - - assert maj == pred, '{0} is not an explanation.'.format(expl) - - print('expl:{0} is valid'.format(expl)) - print("pred = ", pred) - - sample_expl = sample - - feats = ['f{0}'.format(f) for f in expl] - univ = [(i, f) for i, f in enumerate(intvs) if (len(intvs[f]) and (f not in feats))] - - # Now, check if expl is a minimal - for p, f in zip(expl, feats): - print("{0}={1}".format(f, sample_expl[p])) - print([-math.inf]+intvs[f]) - assert(len(intvs[f])) - - # calculate possible values for f - possible_val = [] - d = next((i for i, v in enumerate(intvs[f]) if v > sample_expl[p]), None) - assert(d is not None) - print("d=",d) - - if d: - #possible_val.append(intvs[f][0] - 1) - possible_val.append(-math.inf) - print(intvs[f][:d-1]) - for i, v in enumerate(intvs[f][:d-1]): - possible_val.append((v + intvs[f][i + 1]) * 0.5) - - for i, v in enumerate(intvs[f][d+1:]): - #print('{0} + {1}'.format(v , intvs[f][d+i])) - possible_val.append((v + intvs[f][d+i]) * 0.5) - #if v == math.inf: - # assert(v == intvs[f][-1]) - # possible_val.append(v + 1) - #else: - # possible_val.append((v + intvs[f][i - 
1]) * 0.5) - - - ## print("{0} => {1} | {2} , {3}".format(f,sample_expl[p], [-math.inf]+intvs[f], possible_val)) - for v in possible_val: - sample_expl[p] = v - for uf in univ: - for x in ([-math.inf]+intvs[uf[1]]): - print('{0}={1}'.format(uf[1], x)) - sample_expl[uf[0]] = x - scores = [predict_tree(dt, sample_expl) for dt in forest.trees] - scores = np.asarray(scores) - maj = np.argmax(np.bincount(scores)) - #print("maj: {0} | {1}={2}".format( maj, f, v)) - if maj != pred: - break - sample_expl[uf[0]] = sample[p] - - print("maj: {0} | {1}={2}".format( maj, f, v)) - - else: - assert False, '{0} is not minimal explanation.'.format(expl) - - sample_expl[p] = sample[p] - - print('expl:{0} is minimal'.format(expl)) - print() - - return True - -''' -def traverse_intervals(tree, intvs): - if tree.children: - f = tree.name - v = tree.threshold - if f in self.intvs: - intvs[p].add(v) - - l_intvs = traverse_intervals(tree.children[0]) - r_intvs = traverse_intervals(tree.children[1]) - return {**l_intvs, **r_intvs} - - else: - return intvs -''' - diff --git a/pages/application/RandomForest/utils/xrf/rndmforest.py b/pages/application/RandomForest/utils/xrf/rndmforest.py deleted file mode 100644 index 583fe91b8dcd0fab92b9be65b30794636497eb7d..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xrf/rndmforest.py +++ /dev/null @@ -1,976 +0,0 @@ - -from sklearn.ensemble._voting import VotingClassifier -from sklearn.ensemble import RandomForestClassifier -from sklearn.preprocessing import OneHotEncoder, LabelEncoder -from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score -import numpy as np -import sys -import os -import resource - -import collections -from six.moves import range -import six -import math - -from pages.application.RandomForest.utils.data import Data -from .tree import Forest, predict_tree -#from .encode import SATEncoder -from pysat.formula import CNF, IDPool -from pysat.solvers import Solver -from pysat.card import CardEnc, EncType -from itertools import combinations - - - -# -#============================================================================== -class Dataset(Data): - """ - Class for representing dataset (transactions). 
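-
-    Expects tabular input whose last column is the class label: the
-    remaining columns become self.X, the label column becomes self.y,
-    and non-numeric labels are first mapped to integers with a
-    LabelEncoder.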
- """ - def __init__(self, filename=None, fpointer=None, mapfile=None, - separator=' ', use_categorical = False): - super().__init__(filename, fpointer, mapfile, separator, use_categorical) - - # split data into X and y - self.feature_names = self.names[:-1] - self.nb_features = len(self.feature_names) - self.use_categorical = use_categorical - - samples = np.asarray(self.samps) - if not all(c.isnumeric() for c in samples[:, -1]): - le = LabelEncoder() - le.fit(samples[:, -1]) - samples[:, -1]= le.transform(samples[:, -1]) - #self.class_names = le.classes_ - print(le.classes_) - print(samples[1:4, :]) - - samples = np.asarray(samples, dtype=np.float32) - self.X = samples[:, 0: self.nb_features] - self.y = samples[:, self.nb_features] - self.num_class = len(set(self.y)) - self.target_name = list(range(self.num_class)) - - print("c nof features: {0}".format(self.nb_features)) - print("c nof classes: {0}".format(self.num_class)) - print("c nof samples: {0}".format(len(self.samps))) - - # check if we have info about categorical features - if (self.use_categorical): - self.target_name = self.class_names - - self.binarizer = {} - for i in self.categorical_features: - self.binarizer.update({i: OneHotEncoder(categories='auto', sparse=False)})#, - self.binarizer[i].fit(self.X[:,[i]]) - else: - self.categorical_features = [] - self.categorical_names = [] - self.binarizer = [] - #feat map - self.mapping_features() - - - - def train_test_split(self, test_size=0.2, seed=0): - return train_test_split(self.X, self.y, test_size=test_size, random_state=seed) - - - def transform(self, x): - if(len(x) == 0): - return x - if (len(x.shape) == 1): - x = np.expand_dims(x, axis=0) - if (self.use_categorical): - assert(self.binarizer != []) - tx = [] - for i in range(self.nb_features): - #self.binarizer[i].drop = None - if (i in self.categorical_features): - self.binarizer[i].drop = None - tx_aux = self.binarizer[i].transform(x[:,[i]]) - tx_aux = np.vstack(tx_aux) - tx.append(tx_aux) - else: - tx.append(x[:,[i]]) - tx = np.hstack(tx) - return tx - else: - return x - - def transform_inverse(self, x): - if(len(x) == 0): - return x - if (len(x.shape) == 1): - x = np.expand_dims(x, axis=0) - if (self.use_categorical): - assert(self.binarizer != []) - inverse_x = [] - for i, xi in enumerate(x): - inverse_xi = np.zeros(self.nb_features) - for f in range(self.nb_features): - if f in self.categorical_features: - nb_values = len(self.categorical_names[f]) - v = xi[:nb_values] - v = np.expand_dims(v, axis=0) - iv = self.binarizer[f].inverse_transform(v) - inverse_xi[f] =iv - xi = xi[nb_values:] - - else: - inverse_xi[f] = xi[0] - xi = xi[1:] - inverse_x.append(inverse_xi) - return inverse_x - else: - return x - - def transform_inverse_by_index(self, idx): - if (idx in self.extended_feature_names): - return self.extended_feature_names[idx] - else: - print("Warning there is no feature {} in the internal mapping".format(idx)) - return None - - def transform_by_value(self, feat_value_pair): - if (feat_value_pair in self.extended_feature_names.values()): - keys = (list(self.extended_feature_names.keys())[list( self.extended_feature_names.values()).index(feat_value_pair)]) - return keys - else: - print("Warning there is no value {} in the internal mapping".format(feat_value_pair)) - return None - - def mapping_features(self): - self.extended_feature_names = {} - self.extended_feature_names_as_array_strings = [] - counter = 0 - if (self.use_categorical): - for i in range(self.nb_features): - if (i in self.categorical_features): - 
for j, _ in enumerate(self.binarizer[i].categories_[0]): - self.extended_feature_names.update({counter: (self.feature_names[i], j)}) - self.extended_feature_names_as_array_strings.append("f{}_{}".format(i,j)) # str(self.feature_names[i]), j)) - counter = counter + 1 - else: - self.extended_feature_names.update({counter: (self.feature_names[i], None)}) - self.extended_feature_names_as_array_strings.append("f{}".format(i)) #(self.feature_names[i]) - counter = counter + 1 - else: - for i in range(self.nb_features): - self.extended_feature_names.update({counter: (self.feature_names[i], None)}) - self.extended_feature_names_as_array_strings.append("f{}".format(i))#(self.feature_names[i]) - counter = counter + 1 - - def readable_sample(self, x): - readable_x = [] - for i, v in enumerate(x): - if (i in self.categorical_features): - readable_x.append(self.categorical_names[i][int(v)]) - else: - readable_x.append(v) - return np.asarray(readable_x) - - - def test_encoding_transformes(self, X_train): - # test encoding - - X = X_train[[0],:] - - print("Sample of length", len(X[0])," : ", X) - enc_X = self.transform(X) - print("Encoded sample of length", len(enc_X[0])," : ", enc_X) - inv_X = self.transform_inverse(enc_X) - print("Back to sample", inv_X) - print("Readable sample", self.readable_sample(inv_X[0])) - assert((inv_X == X).all()) - - ''' - for i in range(len(self.extended_feature_names)): - print(i, self.transform_inverse_by_index(i)) - for key, value in self.extended_feature_names.items(): - print(value, self.transform_by_value(value)) - ''' -# -#============================================================================== -class VotingRF(VotingClassifier): - """ - Majority rule classifier - """ - - def fit(self, X, y, sample_weight=None): - self.estimators_ = [] - for _, est in self.estimators: - self.estimators_.append(est) - - self.le_ = LabelEncoder().fit(y) - self.classes_ = self.le_.classes_ - - - def predict(self, X): - """Predict class labels for X. - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The input samples. - Returns - ------- - maj : array-like of shape (n_samples,) - Predicted class labels. - """ - #check_is_fitted(self) - - # 'hard' voting - predictions = self._predict(X) - predictions = np.asarray(predictions, np.int64) #NEED TO BE CHECKED - maj = np.apply_along_axis( - lambda x: np.argmax( - np.bincount(x, weights=self._weights_not_none)), - axis=1, arr=predictions) - - maj = self.le_.inverse_transform(maj) - - return maj - - -# -#============================================================================== -class RF2001(object): - """ - The main class to train Random Forest Classifier (RFC). - """ - - def __init__(self, options): - """ - Constructor. - """ - self.forest = None - self.voting = None - self.opt = options - - param_dist = {'n_estimators':options.n_estimators, - 'max_depth':options.maxdepth, - 'criterion':'entropy', - 'random_state':324089} - - self.forest = RandomForestClassifier(**param_dist) - - - - def train(self, dataset, outfile=None): - """ - Train a random forest. 
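-
-        Splits the dataset (80/20 by default, via
-        Dataset.train_test_split), fits the scikit-learn
-        RandomForestClassifier, then wraps the fitted trees in a
-        VotingRF so that predictions follow the same hard majority
-        vote that the SAT encoding reasons about.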
- """ - - X_train, X_test, y_train, y_test = dataset.train_test_split() - - if self.opt.verb: - dataset.test_encoding_transformes(X_train) - - X_train = dataset.transform(X_train) - X_test = dataset.transform(X_test) - - print("Build a random forest.") - self.forest.fit(X_train,y_train) - - rtrees = [ ('dt', dt) for i, dt in enumerate(self.forest.estimators_)] - self.voting = VotingRF(estimators=rtrees) - self.voting.fit(X_train,y_train) - - ''' - print(X_test[[0],:]) - print("RF: ",np.asarray(self.voting.predict(X_test[[0],:]))) - for i,t in enumerate(self.forest.estimators_): - print("DT_{0}: {1}".format(i,np.asarray(t.predict(X_test[[0],:])))) - ''' - - train_acc = accuracy_score(self.predict(X_train), y_train) - test_acc = accuracy_score(self.predict(X_test), y_test) - - if self.opt.verb > 1: - self.print_acc_vote(X_train, X_test, y_train, y_test) - self.print_acc_prob(X_train, X_test, y_train, y_test) - - return train_acc, test_acc - - def predict(self, X): - return self.voting.predict(X) - - def predict_prob(self, X): - self.forest.predict(X) - - def estimators(self): - assert(self.forest.estimators_ is not None) - return self.forest.estimators_ - - def n_estimators(self): - return self.forest.n_estimators - - def print_acc_vote(self, X_train, X_test, y_train, y_test): - train_acc = accuracy_score(self.predict(X_train), y_train) - test_acc = accuracy_score(self.predict(X_test), y_test) - print("----------------------") - print("RF2001:") - print("Train accuracy RF2001: {0:.2f}".format(100. * train_acc)) - print("Test accuracy RF2001: {0:.2f}".format(100. * test_acc)) - print("----------------------") - - def print_acc_prob(self, X_train, X_test, y_train, y_test): - train_acc = accuracy_score(self.forest.predict(X_train), y_train) - test_acc = accuracy_score(self.forest.predict(X_test), y_test) - print("RF-scikit:") - print("Train accuracy RF-scikit: {0:.2f}".format(100. * train_acc)) - print("Test accuracy RF-scikit: {0:.2f}".format(100. * test_acc)) - print("----------------------") - - def print_accuracy(self, data): - _, X_test, _, y_test = data.train_test_split() - #X_train = dataset.transform(X_train) - X_test = data.transform(X_test) - test_acc = accuracy_score(self.predict(X_test), y_test) - #print("----------------------") - #print("Train accuracy : {0:.2f}".format(100. * train_acc)) - #print("Test accuracy : {0:.2f}".format(100. * test_acc)) - print("c Cross-Validation: {0:.2f}".format(100. * test_acc)) - #print("----------------------") -# -#============================================================================== -class XRF(object): - """ - class to encode and explain Random Forest classifiers. - """ - - def __init__(self, options, model, dataset): - self.cls = model - self.data = dataset - self.verbose = options.verb - self.f = Forest(model, dataset.extended_feature_names_as_array_strings) - - if options.verb > 2: - self.f.print_trees() - print("c RF sz:", self.f.sz) - print('c max-depth:', self.f.md) - print('c nof DTs:', len(self.f.trees)) - - - def encode(self, inst): - """ - Encode a tree ensemble trained previously. 
- """ - if 'f' not in dir(self): - self.f = Forest(self.cls, self.data.extended_feature_names_as_array_strings) - #self.f.print_tree() - - time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - self.enc = SATEncoder(self.f, self.data.feature_names, self.data.num_class, \ - self.data.extended_feature_names_as_array_strings) - - inst = self.data.transform(np.array(inst))[0] - formula, _, _, _ = self.enc.encode(inst) - - time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - time - - if self.verbose: - print('c nof vars:', formula.nv) # number of variables - print('c nof clauses:', len(formula.clauses)) # number of clauses - print('c encoding time: {0:.3f}'.format(time)) - - def explain(self, inst): - """ - Explain a prediction made for a given sample with a previously - trained RF. - """ - - time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - if 'enc' not in dir(self): - self.encode(inst) - - #if self.verbose: - # print("instance: {0}".format(np.array(inst)) ) - - inpvals = self.data.readable_sample(inst) - preamble = [] - for f, v in zip(self.data.feature_names, inpvals): - if f not in str(v): - preamble.append('{0} = {1}'.format(f, v)) - else: - preamble.append(v) - - inps = self.data.extended_feature_names_as_array_strings # input (feature value) variables - #print("inps: {0}".format(inps)) - - self.x = SATExplainer(self.enc, inps, preamble, self.data.target_name, verb=self.verbose) - inst = self.data.transform(np.array(inst))[0] - expl = self.x.explain(np.array(inst)) - - time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - time - - if self.verbose: - print("c Total time: {0:.3f}".format(time)) - - return expl - - - def test_tree_ensemble(self): - if 'f' not in dir(self): - self.f = Forest(self.cls) - - _, X_test, _, y_test = self.data.train_test_split() - X_test = self.data.transform(X_test) - - y_pred_forest = self.f.predict(X_test) - acc = accuracy_score(y_pred_forest, y_test) - print("Test accuracy: {0:.2f}".format(100. * acc)) - - y_pred_cls = self.cls.predict(X_test) - #print(np.asarray(y_pred_cls, np.int64)) - #print(y_pred_forest) - - assert((y_pred_cls == y_pred_forest).all()) - - -# -#============================================================================== -class SATEncoder(object): - """ - Encoder of Random Forest classifier into SAT. - """ - - def __init__(self, forest, feats, nof_classes, extended_feature_names, from_file=None): - self.forest = forest - #self.feats = {f: i for i, f in enumerate(feats)} - self.num_class = nof_classes - self.vpool = IDPool() - self.extended_feature_names = extended_feature_names - - #encoding formula - self.cnf = None - - # for interval-based encoding - self.intvs, self.imaps, self.ivars, self.thvars = None, None, None, None - - - def newVar(self, name): - - if name in self.vpool.obj2id: #var has been already created - return self.vpool.obj2id[name] - - var = self.vpool.id('{0}'.format(name)) - return var - - def printLits(self, lits): - print(["{0}{1}".format("-" if p<0 else "",self.vpool.obj(abs(p))) for p in lits]) - - def traverse(self, tree, k, clause): - """ - Traverse a tree and encode each node. 
- """ - - if tree.children: - f = tree.name - v = tree.threshold - pos = neg = [] - if f in self.intvs: - d = self.imaps[f][v] - pos, neg = self.thvars[f][d], -self.thvars[f][d] - else: - var = self.newVar(tree.name) - pos, neg = var, -var - #print("{0} => {1}".format(tree.name, var)) - - assert (pos and neg) - self.traverse(tree.children[0], k, clause + [-neg]) - self.traverse(tree.children[1], k, clause + [-pos]) - else: # leaf node - cvar = self.newVar('class{0}_tr{1}'.format(tree.values,k)) - self.cnf.append(clause + [cvar]) - #self.printLits(clause + [cvar]) - - def compute_intervals(self): - """ - Traverse all trees in the ensemble and extract intervals for each - feature. - - At this point, the method only works for numerical datasets! - """ - - def traverse_intervals(tree): - """ - Auxiliary function. Recursive tree traversal. - """ - - if tree.children: - f = tree.name - v = tree.threshold - if f in self.intvs: - self.intvs[f].add(v) - - traverse_intervals(tree.children[0]) - traverse_intervals(tree.children[1]) - - # initializing the intervals - self.intvs = {'{0}'.format(f): set([]) for f in self.extended_feature_names if '_' not in f} - - for tree in self.forest.trees: - traverse_intervals(tree) - - # OK, we got all intervals; let's sort the values - self.intvs = {f: sorted(self.intvs[f]) + ([math.inf] if len(self.intvs[f]) else []) for f in six.iterkeys(self.intvs)} - - self.imaps, self.ivars = {}, {} - self.thvars = {} - for feat, intvs in six.iteritems(self.intvs): - self.imaps[feat] = {} - self.ivars[feat] = [] - self.thvars[feat] = [] - for i, ub in enumerate(intvs): - self.imaps[feat][ub] = i - - ivar = self.newVar('{0}_intv{1}'.format(feat, i)) - self.ivars[feat].append(ivar) - #print('{0}_intv{1}'.format(feat, i)) - - if ub != math.inf: - #assert(i < len(intvs)-1) - thvar = self.newVar('{0}_th{1}'.format(feat, i)) - self.thvars[feat].append(thvar) - #print('{0}_th{1}'.format(feat, i)) - - - - def encode(self, sample): - """ - Do the job. 
- """ - - ###print('Encode RF into SAT ...') - - self.cnf = CNF() - # getting a tree ensemble - #self.forest = Forest(self.model, self.extended_feature_names) - num_tree = len(self.forest.trees) - self.forest.predict_inst(sample) - - #introducing class variables - #cvars = [self.newVar('class{0}'.format(i)) for i in range(self.num_class)] - - # define Tautology var - vtaut = self.newVar('Tautology') - - # introducing class-tree variables - ctvars = [[] for t in range(num_tree)] - for k in range(num_tree): - for j in range(self.num_class): - var = self.newVar('class{0}_tr{1}'.format(j,k)) - ctvars[k].append(var) - - # traverse all trees and extract all possible intervals - # for each feature - ###print("compute intervarls ...") - self.compute_intervals() - - #print(self.intvs) - #print([len(self.intvs[f]) for f in self.intvs]) - #print(self.imaps) - #print(self.ivars) - #print(self.thvars) - #print(ctvars) - - - ##print("encode trees ...") - # traversing and encoding each tree - for k, tree in enumerate(self.forest.trees): - #print("Encode tree#{0}".format(k)) - # encoding the tree - self.traverse(tree, k, []) - # exactly one class var is true - #self.printLits(ctvars[k]) - card = CardEnc.atmost(lits=ctvars[k], vpool=self.vpool,encoding=EncType.cardnetwrk) - self.cnf.extend(card.clauses) - - - - # calculate the majority class - self.cmaj = self.forest.predict_inst(sample) - - ##print("encode majority class ...") - #Cardinality constraint AtMostK to capture a j_th class - - if(self.num_class == 2): - rhs = math.floor(num_tree / 2) + 1 - if(self.cmaj==1 and not num_tree%2): - rhs = math.floor(num_tree / 2) - lhs = [ctvars[k][1 - self.cmaj] for k in range(num_tree)] - atls = CardEnc.atleast(lits = lhs, bound = rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) - self.cnf.extend(atls) - else: - zvars = [] - zvars.append([self.newVar('z_0_{0}'.format(k)) for k in range (num_tree) ]) - zvars.append([self.newVar('z_1_{0}'.format(k)) for k in range (num_tree) ]) - ## - rhs = num_tree - lhs0 = zvars[0] + [ - ctvars[k][self.cmaj] for k in range(num_tree)] - ##self.printLits(lhs0) - atls = CardEnc.atleast(lits = lhs0, bound = rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) - self.cnf.extend(atls) - ## - #rhs = num_tree - 1 - rhs = num_tree + 1 - ########### - lhs1 = zvars[1] + [ - ctvars[k][self.cmaj] for k in range(num_tree)] - ##self.printLits(lhs1) - atls = CardEnc.atleast(lits = lhs1, bound = rhs, vpool=self.vpool, encoding=EncType.cardnetwrk) - self.cnf.extend(atls) - # - pvars = [self.newVar('p_{0}'.format(k)) for k in range(self.num_class + 1)] - ##self.printLits(pvars) - for k,p in enumerate(pvars): - for i in range(num_tree): - if k == 0: - z = zvars[0][i] - #self.cnf.append([-p, -z, vtaut]) - self.cnf.append([-p, z, -vtaut]) - #self.printLits([-p, z, -vtaut]) - #print() - elif k == self.cmaj+1: - z = zvars[1][i] - self.cnf.append([-p, z, -vtaut]) - - #self.printLits([-p, z, -vtaut]) - #print() - - else: - z = zvars[0][i] if (k<self.cmaj+1) else zvars[1][i] - self.cnf.append([-p, -z, ctvars[i][k-1] ]) - self.cnf.append([-p, z, -ctvars[i][k-1] ]) - - #self.printLits([-p, -z, ctvars[i][k-1] ]) - #self.printLits([-p, z, -ctvars[i][k-1] ]) - #print() - - # - self.cnf.append([-pvars[0], -pvars[self.cmaj+1]]) - ## - lhs1 = pvars[:(self.cmaj+1)] - ##self.printLits(lhs1) - eqls = CardEnc.equals(lits = lhs1, bound = 1, vpool=self.vpool, encoding=EncType.cardnetwrk) - self.cnf.extend(eqls) - - - lhs2 = pvars[(self.cmaj + 1):] - ##self.printLits(lhs2) - eqls = CardEnc.equals(lits = lhs2, bound = 1, 
vpool=self.vpool, encoding=EncType.cardnetwrk) - self.cnf.extend(eqls) - - - - ##print("exactly-one feat const ...") - # enforce exactly one of the feature values to be chosen - # (for categorical features) - categories = collections.defaultdict(lambda: []) - for f in self.extended_feature_names: - if '_' in f: - categories[f.split('_')[0]].append(self.newVar(f)) - for c, feats in six.iteritems(categories): - # exactly-one feat is True - self.cnf.append(feats) - card = CardEnc.atmost(lits=feats, vpool=self.vpool, encoding=EncType.cardnetwrk) - self.cnf.extend(card.clauses) - # lits of intervals - for f, intvs in six.iteritems(self.ivars): - if not len(intvs): - continue - self.cnf.append(intvs) - card = CardEnc.atmost(lits=intvs, vpool=self.vpool, encoding=EncType.cardnetwrk) - self.cnf.extend(card.clauses) - #self.printLits(intvs) - - - - for f, threshold in six.iteritems(self.thvars): - for j, thvar in enumerate(threshold): - d = j+1 - pos, neg = self.ivars[f][d:], self.ivars[f][:d] - - if j == 0: - assert(len(neg) == 1) - self.cnf.append([thvar, neg[-1]]) - self.cnf.append([-thvar, -neg[-1]]) - else: - self.cnf.append([thvar, neg[-1], -threshold[j-1]]) - self.cnf.append([-thvar, threshold[j-1]]) - self.cnf.append([-thvar, -neg[-1]]) - - if j == len(threshold) - 1: - assert(len(pos) == 1) - self.cnf.append([-thvar, pos[0]]) - self.cnf.append([thvar, -pos[0]]) - else: - self.cnf.append([-thvar, pos[0], threshold[j+1]]) - self.cnf.append([thvar, -pos[0]]) - self.cnf.append([thvar, -threshold[j+1]]) - - - - return self.cnf, self.intvs, self.imaps, self.ivars - - -# -#============================================================================== -class SATExplainer(object): - """ - An SAT-inspired minimal explanation extractor for Random Forest models. - """ - - def __init__(self, sat_enc, inps, preamble, target_name, verb=1): - """ - Constructor. - """ - - self.enc = sat_enc - self.inps = inps # input (feature value) variables - self.target_name = target_name - self.preamble = preamble - - self.verbose = verb - - self.slv = None - ##self.slv = Solver(name=options.solver) - ##self.slv = Solver(name="minisat22") - #self.slv = Solver(name="glucose3") - # CNF formula - #self.slv.append_formula(self.enc.cnf) - - - def explain(self, sample, smallest=False): - """ - Hypotheses minimization. 
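The majority-class constraints above lean on pysat's cardinality encodings. A self-contained sketch of the same `IDPool`/`CardEnc` pattern, assuming the `python-sat` package is installed; the literals `x1..x4` and the bound are made up for illustration:

from pysat.card import CardEnc, EncType
from pysat.formula import IDPool

vpool = IDPool()
lits = [vpool.id('x{0}'.format(i)) for i in range(1, 5)]
# CNF clauses enforcing "at least 2 of x1..x4 are true"
atls = CardEnc.atleast(lits=lits, bound=2, vpool=vpool, encoding=EncType.cardnetwrk)
print(atls.clauses)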
- """ - if self.verbose: - print(' explaining: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.target_name[self.enc.cmaj])) - - #create a SAT solver - self.slv = Solver(name="glucose3") - - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - # adapt the solver to deal with the current sample - #self.csel = [] - self.assums = [] # var selectors to be used as assumptions - self.sel2fid = {} # selectors to original feature ids - self.sel2vid = {} # selectors to categorical feature ids - self.sel2v = {} # selectors to (categorical/interval) values - - #for i in range(self.enc.num_class): - # self.csel.append(self.enc.newVar('class{0}'.format(i))) - #self.csel = self.enc.newVar('class{0}'.format(self.enc.cmaj)) - - # preparing the selectors - for i, (inp, val) in enumerate(zip(self.inps, sample), 1): - if '_' in inp: - - assert (inp not in self.enc.intvs) - - feat = inp.split('_')[0] - selv = self.enc.newVar('selv_{0}'.format(feat)) - - self.assums.append(selv) - if selv not in self.sel2fid: - self.sel2fid[selv] = int(feat[1:]) - self.sel2vid[selv] = [i - 1] - else: - self.sel2vid[selv].append(i - 1) - - p = self.enc.newVar(inp) - if not val: - p = -p - else: - self.sel2v[selv] = p - - self.enc.cnf.append([-selv, p]) - - #self.enc.printLits([-selv, p]) - - elif len(self.enc.intvs[inp]): - #v = None - #for intv in self.enc.intvs[inp]: - # if intv > val: - # v = intv - # break - v = next((intv for intv in self.enc.intvs[inp] if intv > val), None) - assert(v is not None) - - selv = self.enc.newVar('selv_{0}'.format(inp)) - self.assums.append(selv) - - assert (selv not in self.sel2fid) - self.sel2fid[selv] = int(inp[1:]) - self.sel2vid[selv] = [i - 1] - - for j,p in enumerate(self.enc.ivars[inp]): - cl = [-selv] - if j == self.enc.imaps[inp][v]: - cl += [p] - self.sel2v[selv] = p - else: - cl += [-p] - - self.enc.cnf.append(cl) - #self.enc.printLits(cl) - ''' - with open("/tmp/pendigits.cnf", 'w') as fp: - fp.write('p cnf {0} {1}\n'.format(self.enc.cnf.nv, len(self.enc.cnf.clauses))) - for p in self.assums + [-self.csel]: - fp.write('{0} 0\n'.format(str(p))) - - for cl in self.enc.cnf.clauses: - fp.write(' '.join([str(p) for p in cl+[0]])) - fp.write('\n') - fp.close() - print(self.assums + [self.csel]) - ''' - - self.assums = sorted(set(self.assums)) - if self.verbose: - print(' # hypos:', len(self.assums)) - - # pass a CNF formula - self.slv.append_formula(self.enc.cnf) - - ''' - # if unsat, then the observation is not implied by the assumptions - if not self.slv.solve(assumptions=self.assums+[self.csel]): - print(' no implication!') - print(self.slv.get_core()) - sys.exit(1) - - if self.verbose > 1: - self.enc.printLits(self.assums+[self.csel]) - self.print_sat_model() - ''' - - if not smallest: - self.compute_minimal() - else: - raise NotImplementedError('Smallest explanation is not yet implemented.') - #self.compute_smallest() - - self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.time - - expl = sorted([self.sel2fid[h] for h in self.assums if h>0 ]) - assert len(expl), 'PI-explanation cannot be an empty-set! 
otherwise the RF predicts only one class' - - # delete sat solver - self.slv.delete() - self.slv = None - - if self.verbose: - print("expl-selctors: ", expl) - self.preamble = [self.preamble[i] for i in expl] - print(' explanation: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.target_name[self.enc.cmaj])) - print(' # hypos left:', len(expl)) - print(' time: {0:.3f}'.format(self.time)) - - return expl - - def compute_minimal(self): - """ - Compute any subset-minimal explanation. - """ - nsat, nunsat = 0, 0 - stimes, utimes = [], [] - - vtaut = self.enc.newVar('Tautology') - - # simple deletion-based linear search - for i, p in enumerate(self.assums): - to_test = [vtaut] + self.assums[:i] + self.assums[(i + 1):] + [-p, -self.sel2v[p]] - - t0 = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - - sat = self.slv.solve(assumptions=to_test) - - if not sat: - self.assums[i] = -p - elif self.verbose > 1: - self.print_sat_model() - - t = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \ - resource.getrusage(resource.RUSAGE_SELF).ru_utime - t0 - #print("{0} {1:.2f}s".format("SAT" if sat else "UNSAT", t)) - - if sat: - nsat += 1 - stimes.append(t) - if self.verbose > 1: - self.enc.printLits(to_test) - print("SAT") - else: - #print("Core: ",self.slv.get_core()) - nunsat += 1 - utimes.append(t) - if self.verbose > 1: - self.enc.printLits(to_test) - print("UNSAT") - if self.verbose: - print('') - print('#SAT: {0} | #UNSAT: {1}'.format(len(stimes), len(utimes))) - if(nsat): - print('SAT: tot: {0:.2f} | m: {1:.2f} | M: {2:.2f} | avg: {3:.2f}'.format( - sum(stimes), min(stimes), max(stimes), sum(stimes) / len(stimes))) - if(nunsat): - print('UNSAT: tot: {0:.2f} | m: {1:.2f} | M: {2:.2f} | avg: {3:.2f}'.format( - sum(utimes), min(utimes), max(utimes), sum(utimes) / len(utimes))) - print('') - - self.stimes, self.utimes = stimes, utimes - self.nsat, self.nunsat = nsat, nunsat - - - def print_sat_model(self): - assert(self.slv.get_model()) - model = [ p for p in self.slv.get_model() if self.enc.vpool.obj(abs(p)) ] - str_model = [] - lits = [] - for p in model: - if self.enc.vpool.obj(abs(p)) in self.inps: - str_model.append((p, self.enc.vpool.obj(abs(p)))) - - elif ("class" in self.enc.vpool.obj(abs(p))): - str_model.append((p, self.enc.vpool.obj(abs(p)))) - - #elif ("intv" in self.enc.vpool.obj(abs(p))) : - # str_model.append((p, self.enc.vpool.obj(abs(p)))) - - if ("_tr" in self.enc.vpool.obj(abs(p))) : - lits.append(p) - - if ("p_" in self.enc.vpool.obj(abs(p))) : - str_model.append((p, self.enc.vpool.obj(abs(p)))) - if ("z_" in self.enc.vpool.obj(abs(p))) : - str_model.append((p, self.enc.vpool.obj(abs(p)))) - - print("Model:", str_model) - ###print(self.slv.get_model()) - - num_tree = len(self.enc.forest.trees) - num_class = self.enc.num_class - occ = [0]*num_class - - for p in lits: - if p > 0: - j = int(self.enc.vpool.obj(abs(p))[5]) - occ[j] +=1 - print(occ) - diff --git a/pages/application/RandomForest/utils/xrf/tree.py b/pages/application/RandomForest/utils/xrf/tree.py deleted file mode 100644 index da81c9820d69d96061446e9d1eafbcb265bf1351..0000000000000000000000000000000000000000 --- a/pages/application/RandomForest/utils/xrf/tree.py +++ /dev/null @@ -1,174 +0,0 @@ -# -#============================================================================== -from anytree import Node, RenderTree,AsciiStyle -import json -import numpy as np -import math -import os - - -# 
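`compute_minimal` above is a deletion-based linear search: each selector is tentatively dropped, and a single SAT call decides whether it was necessary for the explanation. Its generic shape as a hedged sketch, assuming `python-sat`; `cnf` is an iterable of clauses, `assumptions` a list of positive selector literals, and `minimal_subset` an illustrative name:

from pysat.solvers import Solver

def minimal_subset(cnf, assumptions):
    assums = list(assumptions)
    with Solver(name='glucose3', bootstrap_with=cnf) as slv:
        for i, p in enumerate(assums):
            # test whether the formula stays unsatisfiable without hypothesis p
            if not slv.solve(assumptions=assums[:i] + assums[i + 1:] + [-p]):
                assums[i] = -p  # p is redundant; keep it dropped
    return [p for p in assums if p > 0]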
-#============================================================================== -class dt_node(Node): - def __init__(self, id, parent = None): - Node.__init__(self, id, parent) - self.id = id # The node value - self.name = None - self.left_node_id = -1 # Left child - self.right_node_id = -1 # Right child - - self.feature = -1 - self.threshold = None - self.values = -1 - #iai - #self.split = None - - def __str__(self): - pref = ' ' * self.depth - if len(self.children) == 0: - return (pref+ "leaf: {} {}".format(self.id, self.values)) - else: - if(self.name is None): - return (pref+ "{} f{}<{}".format(self.id, self.feature, self.threshold)) - else: - return (pref+ "{} \"{}\"<{}".format(self.id, self.name, self.threshold)) - - -#============================================================================== -def build_tree(tree_, feature_names = None): - ## - feature = tree_.feature - threshold = tree_.threshold - values = tree_.value - n_nodes = tree_.node_count - children_left = tree_.children_left - children_right = tree_.children_right - node_depth = np.zeros(shape=n_nodes, dtype=np.int64) - is_leaf = np.zeros(shape=n_nodes, dtype=bool) - stack = [(0, -1)] # seed is the root node id and its parent depth - while len(stack) > 0: - node_id, parent_depth = stack.pop() - node_depth[node_id] = parent_depth + 1 - - # If we have a test node - if (children_left[node_id] != children_right[node_id]): - stack.append((children_left[node_id], parent_depth + 1)) - stack.append((children_right[node_id], parent_depth + 1)) - else: - is_leaf[node_id] = True - ## - - m = tree_.node_count - assert (m > 0), "Empty tree" - - def extract_data(idx, root = None, feature_names = None): - i = idx - assert (i < m), "Error index node" - if (root is None): - node = dt_node(i) - else: - node = dt_node(i, parent = root) - #node.cover = json_node["cover"] - if is_leaf[i]: - node.values = np.argmax(values[i]) - #if(inverse): - # node.values = -node.values - else: - node.feature = feature[i] - if (feature_names is not None): - node.name = feature_names[feature[i]] - node.threshold = threshold[i] - node.left_node_id = children_left[i] - node.right_node_id = children_right[i] - extract_data(node.left_node_id, node, feature_names) #feat < threshold ( < 0.5 False) - extract_data(node.right_node_id, node, feature_names) #feat >= threshold ( >= 0.5 True) - - return node - - root = extract_data(0, None, feature_names) - - return root - - -#============================================================================== -def walk_tree(node): - if (len(node.children) == 0): - # leaf - print(node) - else: - print(node) - walk_tree(node.children[0]) - walk_tree(node.children[1]) - -def count_nodes(root): - def count(node): - if len(node.children): - return sum([1+count(n) for n in node.children]) - else: - return 0 - m = count(root) + 1 - return m - -# -#============================================================================== -def predict_tree(node, sample): - if (len(node.children) == 0): - # leaf - return node.values - else: - feature_branch = node.feature - sample_value = sample[feature_branch] - assert(sample_value is not None) - if(sample_value < node.threshold): - return predict_tree(node.children[0], sample) - else: - return predict_tree(node.children[1], sample) - - -# -#============================================================================== -class Forest: - """ An ensemble of decision trees. - - This object provides a common interface to many different types of models. 
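`build_tree` and `predict_tree` above re-package scikit-learn's flat tree arrays into anytree nodes. The same traversal can be done directly on the low-level arrays; a sketch assuming `dt` is a fitted sklearn `DecisionTreeClassifier` (note that sklearn routes `x[f] <= threshold` to the left child):

import numpy as np

def predict_one(dt, x):
    t = dt.tree_
    node = 0
    while t.children_left[node] != t.children_right[node]:  # internal node
        if x[t.feature[node]] <= t.threshold[node]:
            node = t.children_left[node]
        else:
            node = t.children_right[node]
    return int(np.argmax(t.value[node]))  # majority class stored at the leaf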
- """ - def __init__(self, rf, feature_names = None): - #self.rf = rf - self.trees = [ build_tree(dt.tree_, feature_names) for dt in rf.estimators()] - self.sz = sum([dt.tree_.node_count for dt in rf.estimators()]) - self.md = max([dt.tree_.max_depth for dt in rf.estimators()]) - #### - nb_nodes = [dt.tree_.node_count for dt in rf.estimators()] - print("min: {0} | max: {1}".format(min(nb_nodes), max(nb_nodes))) - assert([dt.tree_.node_count for dt in rf.estimators()] == [count_nodes(dt) for dt in self.trees]) - #self.print_trees() - - def print_trees(self): - for i,t in enumerate(self.trees): - print("tree number: ", i) - walk_tree(t) - - def predict_inst(self, inst): - scores = [predict_tree(dt, inst) for dt in self.trees] - scores = np.asarray(scores) - maj = np.argmax(np.bincount(scores)) - return maj - - - def predict(self, samples): - predictions = [] - print("#Trees: ", len(self.trees)) - for sample in np.asarray(samples): - scores = [] - for i,t in enumerate(self.trees): - s = predict_tree(t, sample) - scores.append((s)) - scores = np.asarray(scores) - predictions.append(scores) - predictions = np.asarray(predictions) - #print(predictions) - #np.bincount(x, weights=self._weights_not_none) - maj = np.apply_along_axis(lambda x: np.argmax(np.bincount(x)), axis=1, arr=predictions) - - return maj - diff --git a/pages/application/application.py b/pages/application/application.py index 65475b9e6b50c7213810ddf1e10a0dae6e9ed148..84ffdd5b2db368a22794ba2be29a6bf6246ebc85 100644 --- a/pages/application/application.py +++ b/pages/application/application.py @@ -3,11 +3,6 @@ import dash_bootstrap_components as dbc import dash_daq as daq from pages.application.DecisionTree.DecisionTreeComponent import DecisionTreeComponent -from pages.application.NaiveBayes.NaiveBayesComponent import NaiveBayesComponent -from pages.application.RandomForest.RandomForestComponent import RandomForestComponent - -import subprocess - class Application(): def __init__(self, view): @@ -54,7 +49,9 @@ class Model(): self.component_class = self.dict_components[self.ml_model] self.component_class = globals()[self.component_class] self.solvers = self.dic_solvers[self.ml_model] + self.solver = self.solvers[0] self.xtypes = self.dic_xtypes[self.ml_model] + self.xtype = [list(self.xtypes.keys())[0]] def update_pretrained_model(self, pretrained_model_update): self.pretrained_model = pretrained_model_update @@ -192,9 +189,9 @@ class View(): self.sidebar = dcc.Tabs(children=[ dcc.Tab(label='Basic Parameters', children=[ self.ml_menu_models, - self.pretrained_model_upload, self.add_model_info_choice, self.model_info, + self.pretrained_model_upload, self.instance_upload], className="sidebar"), dcc.Tab(label='Advanced Parameters', children=[ html.Br(), diff --git a/pages/course/main_course.py b/pages/course/main_course.py new file mode 100644 index 0000000000000000000000000000000000000000..bcd6b6a65e1737e68d9e7e6032ab824454e83352 --- /dev/null +++ b/pages/course/main_course.py @@ -0,0 +1,17 @@ +from dash import dcc, html +import dash_bootstrap_components as dbc +import dash_daq as daq +from dash import html + +course_data_format = html.Div(html.Iframe( + src="assets/course_data_format.html", + style={"height": "1067px", "width": "100%"}, + )) +course_decision_tree = html.Iframe( + src="assets/course_data_format.html", + style={"height": "1067px", "width": "100%"}, + ) + +main_course = dcc.Tabs(children=[ + dcc.Tab(label='Data format', children=[course_data_format]), + dcc.Tab(label='Course Decision Tree', 
diff --git a/requirements.txt b/requirements.txt
index 8f12a33646aa22ec091e89844763ad78408bc265..7999a92565259a58e27f2c240e05b57f93f26c6b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,5 +20,4 @@ xgboost
 lime
 shap
 anchor-exp
 pysmt
-anytree
diff --git a/utils.py b/utils.py
index 9fd8a5498e8323b33ae9d840ddb75025e86d226a..4cab3ac21eae535367d725dace1822fc000eecb2 100644
--- a/utils.py
+++ b/utils.py
@@ -6,25 +6,17 @@ import joblib
 import pickle
 from dash import html
 
-from pages.application.RandomForest.utils import xrf
-from pages.application.RandomForest.utils.xrf import *
-sys.modules['xrf'] = xrf
-from pages.application.RandomForest.utils import options
-from pages.application.RandomForest.utils.options import *
-sys.modules['options'] = options
 
 def parse_contents_graph(contents, filename):
     content_type, content_string = contents.split(',')
     decoded = base64.b64decode(content_string)
     try:
-        if 'mod.pkl' in filename:
-            print("in")
-            print(io.BytesIO(decoded))
-            print(pickle.load(io.BytesIO(decoded)))
-            data = pickle.load(io.BytesIO(decoded))
-        elif '.pkl' in filename:
-            data = joblib.load(io.BytesIO(decoded))
+        if '.pkl' in filename:
+            try:
+                data = joblib.load(io.BytesIO(decoded))
+            except Exception:
+                data = pickle.load(io.BytesIO(decoded))
         elif '.txt' in filename:
             data = decoded.decode('utf-8').strip()
     except Exception as e:
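For reference, the rewritten `parse_contents_graph` path boils down to base64-decoding the Dash upload payload and trying joblib before falling back to plain pickle. A condensed, self-contained sketch (`load_uploaded_pkl` is an illustrative name):

import base64
import io
import pickle

import joblib

def load_uploaded_pkl(contents):
    # Dash dcc.Upload delivers "data:<mime>;base64,<payload>"
    _content_type, content_string = contents.split(',')
    decoded = base64.b64decode(content_string)
    try:
        return joblib.load(io.BytesIO(decoded))   # joblib/sklearn dumps
    except Exception:
        return pickle.load(io.BytesIO(decoded))  # plain-pickle fallback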