Commit 6caa741f authored by Caroline DE POURTALES

update

parent a8d388a1
Showing with 285 additions and 2329 deletions
......@@ -22,7 +22,7 @@
"ml_type" : "RandomForest",
"component" : "RandomForestComponent",
"solvers" : ["LIME", "ANCHOR", "SHAP"],
"xtypes" : {"H": "Heuristic", "HV": "Heuristic and validation", "G": "Global"}
"xtypes" : {"S": "Smallest", "NS": "Not smallest"}
}
]
......
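For context on the config change above, a minimal hypothetical sketch (not part of the repo) of how the new "xtypes" mapping can be consumed: the short codes are what the UI passes to the components (see xtype == "S" in RandomForestComponent further down), the values are display labels.

import json

cfg = json.loads('''[{"ml_type": "RandomForest",
                      "component": "RandomForestComponent",
                      "solvers": ["LIME", "ANCHOR", "SHAP"],
                      "xtypes": {"S": "Smallest", "NS": "Not smallest"}}]''')
for code, label in cfg[0]["xtypes"].items():
    print(code, "->", label)  # S -> Smallest, NS -> Not smallest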
......@@ -11,16 +11,16 @@ from pages.application.DecisionTree.utils.dtviz import (visualize,
from pages.application.DecisionTree.utils.upload_tree import UploadedDecisionTree
class DecisionTreeComponent():
class DecisionTreeComponent:
def __init__(self, tree, type_tree='SKL', info=None, type_info=''):
def __init__(self, tree, info=None, type_info=''):
if info is not None and '.csv' in type_info:
self.categorical = True
data = Data(info)
fvmap = data.mapping_features()
feature_names = data.names[:-1]
self.uploaded_dt = UploadedDecisionTree(tree, type_tree, maxdepth=tree.get_depth(),
self.uploaded_dt = UploadedDecisionTree(tree, "SKL", maxdepth=tree.get_depth(),
feature_names=feature_names, nb_classes=tree.n_classes_)
self.dt_format, self.map, features_names_mapping = self.uploaded_dt.dump(fvmap, feat_names=feature_names)
......@@ -38,7 +38,7 @@ class DecisionTreeComponent():
dom = sorted(dom)
for j, v in enumerate(dom):
fvmap[f'f{i}'][j] = (fid, True, v)
self.uploaded_dt = UploadedDecisionTree(tree, type_tree, maxdepth=tree.get_depth(),
self.uploaded_dt = UploadedDecisionTree(tree, "SKL", maxdepth=tree.get_depth(),
feature_names=feature_names, nb_classes=tree.n_classes_)
self.dt_format, self.map, features_names_mapping = self.uploaded_dt.dump(fvmap, feat_names=feature_names)
......@@ -48,7 +48,7 @@ class DecisionTreeComponent():
feature_names = tree.feature_names_in_
except:
feature_names = [f'f{i}' for i in range(tree.n_features_in_)]
self.uploaded_dt = UploadedDecisionTree(tree, type_tree, maxdepth=tree.get_depth(),
self.uploaded_dt = UploadedDecisionTree(tree, "SKL", maxdepth=tree.get_depth(),
feature_names=feature_names, nb_classes=tree.n_classes_)
self.dt_format, self.map, features_names_mapping = self.uploaded_dt.convert_dt(feat_names=feature_names)
......
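A hedged usage sketch of the simplified DecisionTreeComponent constructor: the type_tree argument is gone and "SKL" is hard-coded, so only a scikit-learn tree is expected. The component call is commented out because it needs the full application package; the scikit-learn part runs on its own.

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
clf = DecisionTreeClassifier(max_depth=3).fit(X, y)
# Before: DecisionTreeComponent(clf, type_tree='SKL', info=csv_text, type_info='.csv')
# After:  DecisionTreeComponent(clf, info=csv_text, type_info='.csv')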
......@@ -11,7 +11,7 @@ import shlex
class NaiveBayesComponent():
def __init__(self, model, type_model='SKL', info=None, type_info=''):
def __init__(self, model, info=None, type_info=''):
#Conversion model
p=subprocess.Popen(['perl','pages/application/NaiveBayes/utils/cnbc2xlc.pl', model],stdout=subprocess.PIPE)
......
import base64
from dash import html
from io import StringIO
import pandas as pd
import dash_bootstrap_components as dbc
import numpy as np
from dash import dcc, html
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble._voting import VotingClassifier
from pages.application.RandomForest.utils import xrf
from pages.application.RandomForest.utils.xrf.rndmforest import XRF, RF2001, VotingRF, Dataset
from xgboost import XGBClassifier, XGBRFClassifier
from pages.application.RandomForest.utils.data import Data
from pages.application.RandomForest.utils.anchor_wrap import anchor_call
from pages.application.RandomForest.utils.lime_wrap import lime_call
from pages.application.RandomForest.utils.shap_wrap import shap_call
from pages.application.RandomForest.utils.xgbooster import XGBooster, preprocess_dataset
from pages.application.RandomForest.utils.xgbrf import XGBRandomForest
from pages.application.RandomForest.utils.xgbooster import XGBooster
from pages.application.RandomForest.utils.xgbrf import XGBRandomForest
class RandomForestComponent:
def __init__(self, model, type_model='SKL', info=None, type_info=''):
#################################################################
##################### Questions #################################
##### Can upload sklearn model or need to translate to xrf ######
# separate XGBRandomForest and XGBooster #
# data format: there is a weird data file to deal with here #
# not suitable, get_dump? many classes, a class diagram is needed #
if info is not None and '.csv' in type_info:
self.data = Data(info)
# Conversion model
if type_model == "RF":
self.random_forest = XGBRandomForest(info, from_model=model)
else:
self.random_forest = XGBooster(info, from_model=model)
class RandomForestComponent:
# self.random_forest.encode(test_on=info)
def __init__(self, model, info=None, type_info=''):
self.map_file = ""
# Conversion model
options = {}
if info is not None and '.csv' in type_info:
self.data = Dataset(info)
self.data.mapping_features()
options["n_classes"] = self.data.num_class
options["feature_names"] = self.data.feature_names
options["n_features"] = self.data.nb_features
if isinstance(model, RandomForestClassifier) or isinstance(model, VotingClassifier) or isinstance(model,
xrf.rndmforest.RF2001):
self.random_forest = XRF(model, self.data)
elif isinstance(model, XGBRFClassifier):
self.random_forest = XGBRandomForest(options, from_model=model)
elif isinstance(model, XGBClassifier):
self.random_forest = XGBooster(options, from_model=model)
self.network = html.Div([])
self.explanation = []
def update_with_explicability(self, instance, enum_feats=None, validation=None, xtype=None, solver=None, ):
def update_with_explicability(self, instance, enum_feats=None, xtype=None, solver=None, ):
# Call explanation
if not enum_feats and self.data is not None:
enum_feats = len(self.data.names) - 1
enum_feats = self.data.nb_features - 1  # nb_features is an int count; len() would raise here
expl = self.random_forest.explain(instance,
use_lime=lime_call if solver == "lime" else None,
use_anchor=anchor_call if solver == "anchor" else None,
use_shap=shap_call if solver == "shap" else None,
nof_feats=enum_feats)
if validation:
coex = self.random_forest.validate(instance, expl)
if coex:
# repairing the local explanation
gexpl = self.random_forest.explain(instance, expl_ext=expl, prefer_ext=True)
else:
# an attempt to refine the local explanation further
gexpl = self.random_forest.explain(instance, expl_ext=expl)
print(expl)
smallest = True if xtype == "S" else False
if isinstance(self.random_forest, XRF):
explanation_result = self.random_forest.explain(instance)
else:
explanation_result = self.random_forest.explain(instance, smallest, solver,
use_lime=lime_call if solver == "LIME" else None,
use_anchor=anchor_call if solver == "ANCHOR" else None,
use_shap=shap_call if solver == "SHAP" else None,
nof_feats=enum_feats)
self.explanation = []
list_explanations_path = []
explanation = {}
self.network = html.Div([])
# Creating a clean and nice text component
# instance plotting
self.explanation.append(html.H4("Instance : \n"))
self.explanation.append(html.P(str([str(instance[i]) for i in range(len(instance))])))
for k in explanation.keys():
if k != "List of path explanation(s)" and k != "List of path contrastive explanation(s)":
if k in ["List of abductive explanation(s)", "List of contrastive explanation(s)"]:
self.explanation.append(html.H4(k))
for expl in explanation[k]:
self.explanation.append(html.Hr())
self.explanation.append(html.P(expl))
self.explanation.append(html.Hr())
else:
self.explanation.append(html.P(k + explanation[k]))
else:
list_explanations_path = explanation["List of path explanation(s)"]
list_contrastive_explanations_path = explanation["List of path contrastive explanation(s)"]
return list_explanations_path, list_contrastive_explanations_path
compt=0
for sample_expl in explanation_result:
compt+=1
self.explanation.append(html.H4("Sample{0} : \n".format(compt)))
for k in sample_expl.keys():
self.explanation.append(html.H5(k))
self.explanation.append(html.Hr())
self.explanation.append(html.P(sample_expl[k]))
self.explanation.append(html.Hr())
return list_explanations_path, []
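A hedged sketch of calling the reworked method (sample values are hypothetical; the xtype codes come from the updated config at the top of this commit):

# expls, contrastive = component.update_with_explicability(
#     instance=[5.1, 3.5, 1.4, 0.2],  # hypothetical sample
#     enum_feats=None,                # falls back to nb_features - 1
#     xtype="S",                      # "S" = Smallest, "NS" = Not smallest
#     solver="SHAP")                  # one of the configured solvers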
......@@ -26,8 +26,7 @@ class Data(object):
Class for representing data (transactions).
"""
def __init__(self, filename=None, fpointer=None, mapfile=None,
separator=' ', use_categorical = False):
def __init__(self, file=None, mapfile=None, separator=',', use_categorical = False):
"""
Constructor and parser.
"""
......@@ -40,55 +39,21 @@ class Data(object):
self.fvmap = None
self.ovmap = {}
self.fvars = None
self.fname = filename
self.mname = mapfile
self.deleted = set([])
if filename:
with open(filename, 'r') as fp:
self.parse(fp, separator)
elif fpointer:
self.parse(fpointer, separator)
self.parse(file, separator)
if self.mname:
self.read_orig_values()
# check if we have extra info about categorical_features
if (use_categorical):
extra_file = filename+".pkl"
try:
f = open(extra_file, "rb")
print("Attempt: loading extra data from ", extra_file)
extra_info = pickle.load(f)
print("loaded")
f.close()
self.categorical_features = extra_info["categorical_features"]
self.categorical_names = extra_info["categorical_names"]
self.class_names = extra_info["class_names"]
self.categorical_onehot_names = extra_info["categorical_names"].copy()
for i, name in enumerate(self.class_names):
self.class_names[i] = str(name).replace("b'","'")
for c in self.categorical_names.items():
clean_feature_names = []
for i, name in enumerate(c[1]):
name = str(name).replace("b'","'")
clean_feature_names.append(name)
self.categorical_names[c[0]] = clean_feature_names
except Exception as e:
f.close()
print("Please provide info about categorical features or omit option -c", e)
exit()
def parse(self, fp, separator):
"""
Parse input file.
"""
# reading data set from file
lines = fp.readlines()
lines = fp.split('\n')
# reading preamble
self.names = lines[0].strip().split(separator)
......
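The parser now receives the raw file content as a string and splits on newlines, instead of calling readlines() on a file pointer; a minimal self-contained illustration with hypothetical data:

raw = "age,salary,target\n25,1000,0\n40,2000,1"
lines = raw.split('\n')
names = lines[0].strip().split(',')
print(names)  # ['age', 'salary', 'target']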
......@@ -42,7 +42,6 @@ class SMTEncoder(object):
self.feats = {f: i for i, f in enumerate(feats)}
self.nofcl = nof_classes
self.idmgr = IDPool()
self.optns = xgb.options
# xgbooster will also be needed
self.xgb = xgb
......@@ -165,8 +164,8 @@ class SMTEncoder(object):
# if targeting interval-based encoding,
# traverse all trees and extract all possible intervals
# for each feature
if self.optns.encode == 'smtbool':
self.compute_intervals()
# if self.optns.encode == 'smtbool':
self.compute_intervals()
# traversing and encoding each tree
for i, tree in enumerate(self.ensemble.trees):
......@@ -203,13 +202,9 @@ class SMTEncoder(object):
# number of variables
nof_vars = len(self.enc.get_free_variables())
if self.optns.verb:
print('encoding vars:', nof_vars)
print('encoding asserts:', nof_asserts)
return self.enc, self.intvs, self.imaps, self.ivars
def test_sample(self, sample):
def test_sample(self, sample, solver=None):
"""
Check whether or not the encoding "predicts" the same class
as the classifier given an input sample.
......@@ -221,9 +216,6 @@ class SMTEncoder(object):
# score arrays computed for each class
csum = [[] for c in range(self.nofcl)]
if self.optns.verb:
print('testing sample:', list(sample))
sample_internal = list(self.xgb.transform(sample)[0])
# traversing all trees
......@@ -274,7 +266,7 @@ class SMTEncoder(object):
# now, getting the model
escores = []
model = get_model(And(self.enc, *hypos), solver_name=self.optns.solver)
model = get_model(And(self.enc, *hypos), solver_name=solver)
for c in range(self.nofcl):
v = Symbol('class{0}_score'.format(c), typename=REAL)
escores.append(float(model.get_py_value(v)))
......@@ -282,10 +274,6 @@ class SMTEncoder(object):
assert all(map(lambda c, e: abs(c - e) <= 0.001, cscores, escores)), \
'wrong prediction: {0} vs {1}'.format(cscores, escores)
if self.optns.verb:
print('xgb scores:', cscores)
print('enc scores:', escores)
def save_to(self, outfile):
"""
Save the encoding into a file with a given name.
......
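With this change the SMT backend is chosen per call instead of being read from self.optns; a hedged sketch of the new call path (pysmt's get_model does accept a solver_name keyword; the sample and solver name are illustrative):

# encoder.test_sample(np.array([5.1, 3.5, 1.4, 0.2]), solver="z3")
# which internally resolves the model via:
# model = get_model(And(self.enc, *hypos), solver_name="z3")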
#!/usr/bin/env python
#-*- coding:utf-8 -*-
# -*- coding:utf-8 -*-
##
## explain.py
##
......@@ -9,7 +9,7 @@
##
#
#==============================================================================
# ==============================================================================
from __future__ import print_function
import numpy as np
import os
......@@ -23,6 +23,7 @@ from six.moves import range
import sys
#
#==============================================================================
class SMTExplainer(object):
......@@ -31,7 +32,7 @@ class SMTExplainer(object):
"""
def __init__(self, formula, intvs, imaps, ivars, feats, nof_classes,
options, xgb):
solver, xgb):
"""
Constructor.
"""
......@@ -41,14 +42,12 @@ class SMTExplainer(object):
self.imaps = imaps
self.ivars = ivars
self.nofcl = nof_classes
self.optns = options
self.idmgr = IDPool()
# saving XGBooster
self.xgb = xgb
self.verbose = self.optns.verb
self.oracle = Solver(name=options.solver)
self.oracle = Solver(name=solver)
self.inps = [] # input (feature value) variables
for f in self.xgb.extended_feature_names_as_array_strings:
......@@ -150,28 +149,26 @@ class SMTExplainer(object):
disj.append(GT(self.outs[i], self.outs[self.out_id]))
self.oracle.add_assertion(Implies(self.selv, Or(disj)))
if self.verbose:
inpvals = self.xgb.readable_sample(sample)
inpvals = self.xgb.readable_sample(sample)
self.preamble = []
for f, v in zip(self.xgb.feature_names, inpvals):
if f not in str(v):
self.preamble.append('{0} = {1}'.format(f, v))
else:
self.preamble.append(v)
self.preamble = []
for f, v in zip(self.xgb.feature_names, inpvals):
if f not in str(v):
self.preamble.append('{0} = {1}'.format(f, v))
else:
self.preamble.append(v)
print(' explaining: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.output))
explanation_dic = {}
explanation_dic["explaning instance"] = ' explaining: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.output)
return explanation_dic
def explain(self, sample, smallest, expl_ext=None, prefer_ext=False):
"""
Hypotheses minimization.
"""
self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
resource.getrusage(resource.RUSAGE_SELF).ru_utime
# adapt the solver to deal with the current sample
self.prepare(sample)
explanation_dic = self.prepare(sample)
# saving external explanation to be minimized further
if expl_ext == None or prefer_ext:
......@@ -182,27 +179,20 @@ class SMTExplainer(object):
# if satisfiable, then the observation is not implied by the hypotheses
if self.oracle.solve([self.selv] + [h for h, c in zip(self.rhypos, self.to_consider) if c]):
print(' no implication!')
print(self.oracle.get_model())
sys.exit(1)
if not smallest:
self.compute_minimal(prefer_ext=prefer_ext)
else:
self.compute_smallest()
self.time = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
resource.getrusage(resource.RUSAGE_SELF).ru_utime - self.time
expl = sorted([self.sel2fid[h] for h in self.rhypos])
explanation_dic["no implication"] = self.oracle.get_model()
else :
if not smallest:
self.compute_minimal(prefer_ext=prefer_ext)
else:
self.compute_smallest()
if self.verbose:
expl = sorted([self.sel2fid[h] for h in self.rhypos])
explanation_dic["explanation brute "] = expl
self.preamble = [self.preamble[i] for i in expl]
print(' explanation: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.xgb.target_name[self.out_id]))
print(' # hypos left:', len(self.rhypos))
print(' time: {0:.2f}'.format(self.time))
explanation_dic["explanation"] = ' explanation: "IF {0} THEN {1}"'.format(' AND '.join(self.preamble), self.xgb.target_name[self.out_id])
explanation_dic["Hyphothesis left"] = ' # hypos left:' + str(len(self.rhypos))
return expl
return explanation_dic
def compute_minimal(self, prefer_ext=False):
"""
......@@ -258,10 +248,6 @@ class SMTExplainer(object):
hset = hitman.get()
iters += 1
if self.verbose > 1:
print('iter:', iters)
print('cand:', hset)
if self.oracle.solve([self.selv] + [self.rhypos[i] for i in hset]):
to_hit = []
satisfied, unsatisfied = [], []
......@@ -302,10 +288,7 @@ class SMTExplainer(object):
else:
to_hit.append(h)
if self.verbose > 1:
print('coex:', to_hit)
hitman.hit(to_hit)
else:
self.rhypos = [self.rhypos[i] for i in hset]
break
break
\ No newline at end of file
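After this refactor explain() returns a dictionary instead of printing; a hedged sketch of the shape callers can expect (keys taken from the code above, values illustrative):

# {
#   "explaining instance": ' explaining: "IF f0 = 1 AND f1 = 0 THEN class1"',
#   "raw explanation": [0, 1],              # sorted feature ids
#   "explanation": ' explanation: "IF f0 = 1 THEN class1"',
#   "Hypotheses left": ' # hypos left: 2',
# }
# or, if the oracle is satisfiable: {"no implication": <pysmt model>}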
......@@ -11,24 +11,16 @@
#
# ==============================================================================
from __future__ import print_function
from .validate import SMTValidator
from .encode import SMTEncoder
from .explain import SMTExplainer
import numpy as np
import os
import resource
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import sklearn
# print('The scikit-learn version is {}.'.format(sklearn.__version__))
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import sys
import random
import numpy as np
from six.moves import range
from .tree import TreeEnsemble
import xgboost as xgb
from xgboost import XGBClassifier, Booster
import pickle
from xgboost import XGBClassifier
from .encode import SMTEncoder
from .explain import SMTExplainer
from .validate import SMTValidator
#
......@@ -38,164 +30,20 @@ class XGBooster(object):
The main class to train/encode/explain XGBoost models.
"""
def __init__(self, options, from_data=None, from_model=None,
from_encoding=None):
def __init__(self, options, from_model=None):
"""
Constructor.
"""
np.random.seed(random.randint(1, 100))
assert from_data or from_model or from_encoding, \
'At least one input file should be specified'
self.init_stime = resource.getrusage(resource.RUSAGE_SELF).ru_utime
self.init_ctime = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime
# saving command-line options
self.options = options
self.seed = self.options.seed
np.random.seed(self.seed)
if from_data:
self.use_categorical = self.options.use_categorical
# saving data
self.data = from_data
##
samps = np.asarray(self.data.samps)
if not all(c.isnumeric() for c in samps[:, -1]):
le = LabelEncoder()
le.fit(samps[:, -1])
samps[:, -1] = le.transform(samps[:, -1])
# self.class_names = le.classes_
# print(le.classes_)
##
dataset = np.asarray(samps, dtype=np.float32)
# dataset = np.asarray(self.data.samps, dtype=np.float32)
# split data into X and y
self.feature_names = self.data.names[:-1]
self.nb_features = len(self.feature_names)
self.X = dataset[:, 0:self.nb_features]
self.Y = dataset[:, self.nb_features]
self.num_class = len(set(self.Y))
self.target_name = list(range(self.num_class))
param_dist = {'n_estimators': self.options.n_estimators,
'max_depth': self.options.maxdepth}
if (self.num_class == 2):
param_dist['objective'] = 'binary:logistic'
self.model = XGBClassifier(**param_dist)
# split data into train and test sets
self.test_size = self.options.testsplit
if (self.test_size > 0):
self.X_train, self.X_test, self.Y_train, self.Y_test = \
train_test_split(self.X, self.Y, test_size=self.test_size,
random_state=self.seed)
else:
self.X_train = self.X
self.X_test = [] # need a fix
self.Y_train = self.Y
self.Y_test = [] # need a fix
# check if we have info about categorical features
if (self.use_categorical):
self.categorical_features = from_data.categorical_features
self.categorical_names = from_data.categorical_names
self.target_name = from_data.class_names
####################################
# this is a set of checks to make sure that we use the same as anchor encoding
cat_names = sorted(self.categorical_names.keys())
assert (cat_names == self.categorical_features)
self.encoder = {}
for i in self.categorical_features:
self.encoder.update({i: OneHotEncoder(categories='auto', sparse=False)}) # ,
self.encoder[i].fit(self.X[:, [i]])
else:
self.categorical_features = []
self.categorical_names = []
self.encoder = []
fname = from_data
elif from_model:
fname = from_model
self.load_datainfo(from_model)
if (self.use_categorical is False) and (self.options.use_categorical is True):
print(
"Error: Note that the model is trained without categorical features info. Please do not use -c option for predictions")
exit()
# load model
elif from_encoding:
fname = from_encoding
# encoding, feature names, and number of classes
# are read from an input file
enc = SMTEncoder(None, None, None, self, from_encoding)
self.enc, self.intvs, self.imaps, self.ivars, self.feature_names, \
self.num_class = enc.access()
# create extra file names
try:
os.stat(options.output)
except:
os.mkdir(options.output)
self.mapping_features()
#################
self.test_encoding_transformes()
bench_name = os.path.splitext(os.path.basename(options.files[0]))[0]
bench_dir_name = options.output + "/bt/" + bench_name
try:
os.stat(bench_dir_name)
except:
os.mkdir(bench_dir_name)
self.basename = (os.path.join(bench_dir_name, bench_name +
"_nbestim_" + str(options.n_estimators) +
"_maxdepth_" + str(options.maxdepth) +
"_testsplit_" + str(options.testsplit)))
data_suffix = '.splitdata.pkl'
self.modfile = self.basename + '.mod.pkl'
self.mod_plainfile = self.basename + '.mod.txt'
self.resfile = self.basename + '.res.txt'
self.encfile = self.basename + '.enc.txt'
self.expfile = self.basename + '.exp.txt'
def load_datainfo(self, model_from_pkl, data_from_pkl):
self.model = XGBClassifier()
self.model = model_from_pkl
loaded_data = data_from_pkl
self.X = loaded_data["X"]
self.Y = loaded_data["Y"]
self.X_train = loaded_data["X_train"]
self.X_test = loaded_data["X_test"]
self.Y_train = loaded_data["Y_train"]
self.Y_test = loaded_data["Y_test"]
self.feature_names = loaded_data["feature_names"]
self.target_name = loaded_data["target_name"]
self.num_class = loaded_data["num_class"]
self.nb_features = len(self.feature_names)
self.categorical_features = loaded_data["categorical_features"]
self.categorical_names = loaded_data["categorical_names"]
self.encoder = loaded_data["encoder"]
self.use_categorical = loaded_data["use_categorical"]
self.model = from_model
def train(self, outfile=None):
"""
Train a tree ensemble using XGBoost.
"""
self.feature_names = options["feature_names"]
self.num_class = options["n_classes"]
self.nb_features = options["n_features"]
return self.build_xgbtree(outfile)
self.mapping_features()
def encode(self, test_on=None):
"""
......@@ -208,9 +56,7 @@ class XGBooster(object):
if test_on:
encoder.test_sample(np.array(test_on))
# encoder.save_to(self.encfile)
def explain(self, sample, use_lime=None, use_anchor=None, use_shap=None,
def explain(self, sample, smallest, solver, use_lime=None, use_anchor=None, use_shap=None,
expl_ext=None, prefer_ext=False, nof_feats=5):
"""
Explain a prediction made for a given sample with a previously
......@@ -229,12 +75,10 @@ class XGBooster(object):
if 'x' not in dir(self):
self.x = SMTExplainer(self.enc, self.intvs, self.imaps,
self.ivars, self.feature_names, self.num_class,
self.options, self)
solver, self)
expl = self.x.explain(np.array(sample), self.options.smallest,
expl_ext, prefer_ext)
expl = self.x.explain(np.array(sample), smallest, expl_ext, prefer_ext)
# returning the explanation
return expl
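# A hedged usage sketch of the new per-call arguments (sample and solver
# illustrative; smallest/solver replace the old self.options plumbing):
# expl = xgbooster.explain([5.1, 3.5, 1.4, 0.2],
#                          smallest=True,       # cardinality-minimal explanation
#                          solver="z3",         # forwarded to SMTExplainer
#                          use_shap=shap_call,  # optional heuristic hook
#                          nof_feats=5)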
def validate(self, sample, expl):
......@@ -255,191 +99,12 @@ class XGBooster(object):
# try to compute a counterexample
return self.v.validate(np.array(sample), expl)
def transform(self, x):
if (len(x) == 0):
return x
if (len(x.shape) == 1):
x = np.expand_dims(x, axis=0)
if (self.use_categorical):
assert (self.encoder != [])
tx = []
for i in range(self.nb_features):
self.encoder[i].drop = None
if (i in self.categorical_features):
tx_aux = self.encoder[i].transform(x[:, [i]])
tx_aux = np.vstack(tx_aux)
tx.append(tx_aux)
else:
tx.append(x[:, [i]])
tx = np.hstack(tx)
return tx
else:
return x
def transform_inverse(self, x):
if (len(x) == 0):
return x
if (len(x.shape) == 1):
x = np.expand_dims(x, axis=0)
if (self.use_categorical):
assert (self.encoder != [])
inverse_x = []
for i, xi in enumerate(x):
inverse_xi = np.zeros(self.nb_features)
for f in range(self.nb_features):
if f in self.categorical_features:
nb_values = len(self.categorical_names[f])
v = xi[:nb_values]
v = np.expand_dims(v, axis=0)
iv = self.encoder[f].inverse_transform(v)
inverse_xi[f] = iv
xi = xi[nb_values:]
else:
inverse_xi[f] = xi[0]
xi = xi[1:]
inverse_x.append(inverse_xi)
return inverse_x
else:
return x
def transform_inverse_by_index(self, idx):
if (idx in self.extended_feature_names):
return self.extended_feature_names[idx]
else:
print("Warning there is no feature {} in the internal mapping".format(idx))
return None
def transform_by_value(self, feat_value_pair):
if (feat_value_pair in self.extended_feature_names.values()):
keys = (
list(self.extended_feature_names.keys())[list(self.extended_feature_names.values()).index(feat_value_pair)])
return keys
else:
print("Warning there is no value {} in the internal mapping".format(feat_value_pair))
return None
def mapping_features(self):
self.extended_feature_names = {}
self.extended_feature_names_as_array_strings = []
counter = 0
if (self.use_categorical):
for i in range(self.nb_features):
if (i in self.categorical_features):
for j, _ in enumerate(self.encoder[i].categories_[0]):
self.extended_feature_names.update({counter: (self.feature_names[i], j)})
self.extended_feature_names_as_array_strings.append(
"f{}_{}".format(i, j)) # str(self.feature_names[i]), j))
counter = counter + 1
else:
self.extended_feature_names.update({counter: (self.feature_names[i], None)})
self.extended_feature_names_as_array_strings.append("f{}".format(i)) # (self.feature_names[i])
counter = counter + 1
else:
for i in range(self.nb_features):
self.extended_feature_names.update({counter: (self.feature_names[i], None)})
self.extended_feature_names_as_array_strings.append("f{}".format(i)) # (self.feature_names[i])
counter = counter + 1
def readable_sample(self, x):
readable_x = []
for i, v in enumerate(x):
if (i in self.categorical_features):
readable_x.append(self.categorical_names[i][int(v)])
else:
readable_x.append(v)
return np.asarray(readable_x)
def test_encoding_transformes(self):
# test encoding
X = self.X_train[[0], :]
print("Sample of length", len(X[0]), " : ", X)
enc_X = self.transform(X)
print("Encoded sample of length", len(enc_X[0]), " : ", enc_X)
inv_X = self.transform_inverse(enc_X)
print("Back to sample", inv_X)
print("Readable sample", self.readable_sample(inv_X[0]))
assert ((inv_X == X).all())
if (self.options.verb > 1):
for i in range(len(self.extended_feature_names)):
print(i, self.transform_inverse_by_index(i))
for key, value in self.extended_feature_names.items():
print(value, self.transform_by_value(value))
def transfomed_sample_info(self, i):
print(enc.categories_)
def build_xgbtree(self, outfile=None):
"""
Build an ensemble of trees.
"""
if (outfile is None):
outfile = self.modfile
else:
self.datafile = self.form_datefile_name(outfile)
# fit model no training data
if (len(self.X_test) > 0):
eval_set = [(self.transform(self.X_train), self.Y_train), (self.transform(self.X_test), self.Y_test)]
else:
eval_set = [(self.transform(self.X_train), self.Y_train)]
print("start xgb")
self.model.fit(self.transform(self.X_train), self.Y_train,
eval_set=eval_set,
verbose=self.options.verb) # eval_set=[(X_test, Y_test)],
print("end xgb")
evals_result = self.model.evals_result()
########## saving model
self.save_datainfo(outfile)
print("saving plain model to ", self.mod_plainfile)
self.model._Booster.dump_model(self.mod_plainfile)
ensemble = TreeEnsemble(self.model, self.extended_feature_names_as_array_strings, nb_classes=self.num_class)
y_pred_prob = self.model.predict_proba(self.transform(self.X_train[:10]))
y_pred_prob_compute = ensemble.predict(self.transform(self.X_train[:10]), self.num_class)
assert (np.absolute(y_pred_prob_compute - y_pred_prob).sum() < 0.01 * len(y_pred_prob))
### accuracy
try:
train_accuracy = round(1 - evals_result['validation_0']['merror'][-1], 2)
except:
try:
train_accuracy = round(1 - evals_result['validation_0']['error'][-1], 2)
except:
assert (False)
try:
test_accuracy = round(1 - evals_result['validation_1']['merror'][-1], 2)
except:
try:
test_accuracy = round(1 - evals_result['validation_1']['error'][-1], 2)
except:
print("no results test data")
test_accuracy = 0
#### saving
print("saving results to ", self.resfile)
with open(self.resfile, 'w') as f:
f.write("{} & {} & {} &{} &{} & {} \\\\ \n \hline \n".format(
os.path.basename(self.options.files[0]).replace("_", "-"),
train_accuracy,
test_accuracy,
self.options.n_estimators,
self.options.maxdepth,
self.options.testsplit))
f.close()
print("c BT sz:", ensemble.sz)
print("Train accuracy: %.2f%%" % (train_accuracy * 100.0))
print("Test accuracy: %.2f%%" % (test_accuracy * 100.0))
return train_accuracy, test_accuracy, self.model
for i in range(self.nb_features):
self.extended_feature_names.update({counter: (self.feature_names[i], None)})
self.extended_feature_names_as_array_strings.append("f{}".format(i)) # (self.feature_names[i])
counter = counter + 1
......@@ -39,10 +39,10 @@ class SMTEncoder(object):
"""
self.model = model
self.features_names = feats
self.feats = {f: i for i, f in enumerate(feats)}
self.nofcl = nof_classes
self.idmgr = IDPool()
self.optns = xgb.options
# xgbooster will also be needed
self.xgb = xgb
......@@ -165,8 +165,8 @@ class SMTEncoder(object):
# if targeting interval-based encoding,
# traverse all trees and extract all possible intervals
# for each feature
if self.optns.encode == 'smtbool':
self.compute_intervals()
# if self.optns.encode == 'smtbool':
self.compute_intervals()
# traversing and encoding each tree
for i, tree in enumerate(self.ensemble.trees):
......@@ -203,13 +203,9 @@ class SMTEncoder(object):
# number of variables
nof_vars = len(self.enc.get_free_variables())
if self.optns.verb:
print('encoding vars:', nof_vars)
print('encoding asserts:', nof_asserts)
return self.enc, self.intvs, self.imaps, self.ivars
def test_sample(self, sample):
def test_sample(self, sample, solver):
"""
Check whether or not the encoding "predicts" the same class
as the classifier given an input sample.
......@@ -221,9 +217,6 @@ class SMTEncoder(object):
# score arrays computed for each class
csum = [[] for c in range(self.nofcl)]
if self.optns.verb:
print('testing sample:', list(sample))
sample_internal = list(self.xgb.transform(sample)[0])
# traversing all trees
......@@ -274,7 +267,7 @@ class SMTEncoder(object):
# now, getting the model
escores = []
model = get_model(And(self.enc, *hypos), solver_name=self.optns.solver)
model = get_model(And(self.enc, *hypos), solver_name=solver)
for c in range(self.nofcl):
v = Symbol('class{0}_score'.format(c), typename=REAL)
escores.append(float(model.get_py_value(v)))
......@@ -282,10 +275,6 @@ class SMTEncoder(object):
assert all(map(lambda c, e: abs(c - e) <= 0.001, cscores, escores)), \
'wrong prediction: {0} vs {1}'.format(cscores, escores)
if self.optns.verb:
print('xgb scores:', cscores)
print('enc scores:', escores)
def save_to(self, outfile):
"""
Save the encoding into a file with a given name.
......
#!/usr/bin/env python
#-*- coding:utf-8 -*-
# -*- coding:utf-8 -*-
##
## tree.py (reuses parts of the code of SHAP)
##
......@@ -9,8 +9,8 @@
##
#
#==============================================================================
from anytree import Node, RenderTree,AsciiStyle
# ==============================================================================
from anytree import Node, RenderTree, AsciiStyle
import json
import numpy as np
import xgboost as xgb
......@@ -18,13 +18,13 @@ import math
#
#==============================================================================
# ==============================================================================
class xgnode(Node):
def __init__(self, id, parent = None):
def __init__(self, id, parent=None):
Node.__init__(self, id, parent)
self.id = id # The node value
self.name = None
self.left_node_id = -1 # Left child
self.left_node_id = -1 # Left child
self.right_node_id = -1 # Right child
self.missing_node_id = -1
......@@ -37,29 +37,31 @@ class xgnode(Node):
def __str__(self):
pref = ' ' * self.depth
if (len(self.children) == 0):
return (pref+ "leaf: {} {}".format(self.id, self.values))
return (pref + "leaf: {} {}".format(self.id, self.values))
else:
if(self.name is None):
return (pref+ "{} f{}<{}".format(self.id, self.feature, self.threshold))
if (self.name is None):
return (pref + "{} f{}<{}".format(self.id, self.feature, self.threshold))
else:
return (pref+ "{} \"{}\"<{}".format(self.id, self.name, self.threshold))
return (pref + "{} \"{}\"<{}".format(self.id, self.name, self.threshold))
#
#==============================================================================
def build_tree(json_tree, node = None, feature_names = None, inverse = False):
# ==============================================================================
def build_tree(json_tree, node=None, feature_names=None, inverse=False):
def max_id(node):
if "children" in node:
return max(node["nodeid"], *[max_id(n) for n in node["children"]])
else:
return node["nodeid"]
m = max_id(json_tree) + 1
def extract_data(json_node, root = None, feature_names = None):
def extract_data(json_node, root=None, feature_names=None):
i = json_node["nodeid"]
if (root is None):
node = xgnode(i)
else:
node = xgnode(i, parent = root)
node = xgnode(i, parent=root)
node.cover = json_node["cover"]
if "children" in json_node:
......@@ -73,8 +75,8 @@ def build_tree(json_tree, node = None, feature_names = None, inverse = False):
for c, n in enumerate(json_node["children"]):
child = extract_data(n, node, feature_names)
elif "leaf" in json_node:
node.values = json_node["leaf"]
if(inverse):
node.values = json_node["leaf"]
if (inverse):
node.values = -node.values
return node
......@@ -83,7 +85,7 @@ def build_tree(json_tree, node = None, feature_names = None, inverse = False):
#
#==============================================================================
# ==============================================================================
def walk_tree(node):
if (len(node.children) == 0):
# leaf
......@@ -95,7 +97,7 @@ def walk_tree(node):
#
#==============================================================================
# ==============================================================================
def scores_tree(node, sample):
if (len(node.children) == 0):
# leaf
......@@ -103,41 +105,43 @@ def scores_tree(node, sample):
else:
feature_branch = node.feature
sample_value = sample[feature_branch]
assert(sample_value is not None)
if(sample_value < node.threshold):
assert (sample_value is not None)
if (sample_value < node.threshold):
return scores_tree(node.children[0], sample)
else:
return scores_tree(node.children[1], sample)
#
#==============================================================================
# ==============================================================================
class TreeEnsemble:
""" An ensemble of decision trees.
This object provides a common interface to many different types of models.
"""
def __init__(self, model, feature_names = None, nb_classes = 0):
def __init__(self, model, feature_names=None, nb_classes=0):
self.model_type = "xgboost"
#self.original_model = model.get_booster()
# self.original_model = model.get_booster()
self.original_model = model
####
self.base_offset = None
json_trees = get_xgboost_json(self.original_model)
json_trees = get_xgboost_json(self.original_model, feature_names)
self.trees = [build_tree(json.loads(t), None, feature_names) for t in json_trees]
if(nb_classes == 2):
if (nb_classes == 2):
# NASTY trick for binary
# We change signs of values in leaves so that we can just sum all the values in leaves for class X
# and take max to get the right class
self.otrees = [build_tree(json.loads(t), None, feature_names, inverse = True) for t in json_trees]
self.otrees = [build_tree(json.loads(t), None, feature_names, inverse=True) for t in json_trees]
self.itrees = [build_tree(json.loads(t), None, feature_names) for t in json_trees]
self.trees = []
for i,_ in enumerate(self.otrees):
for i, _ in enumerate(self.otrees):
self.trees.append(self.otrees[i])
self.trees.append(self.itrees[i])
self.feature_names = feature_names
def print_tree(self):
for i,t in enumerate(self.trees):
for i, t in enumerate(self.trees):
print("tree number: ", i)
walk_tree(t)
......@@ -149,13 +153,14 @@ class TreeEnsemble:
self.invert_tree_prob(node.children[0])
self.invert_tree_prob(node.children[1])
return node
def predict(self, samples, nb_classes):
# https://github.com/dmlc/xgboost/issues/1746#issuecomment-290130695
prob = []
nb_estimators = int(len(self.trees)/nb_classes)
nb_estimators = int(len(self.trees) / nb_classes)
for sample in np.asarray(samples):
scores = []
for i,t in enumerate(self.trees):
for i, t in enumerate(self.trees):
s = scores_tree(t, sample)
scores.append((s))
scores = np.asarray(scores)
......@@ -163,30 +168,31 @@ class TreeEnsemble:
if (nb_classes == 2):
for i in range(nb_classes):
class_scores.append(math.exp(-(scores[i::nb_classes]).sum())) # swap signs back as we had to use this trick in the contractor
s0 = class_scores[0]
s1 = class_scores[1]
v0 = 1/(1 + s0)
v1 = 1/(1 + s1)
class_scores.append(math.exp(-(
scores[i::nb_classes]).sum())) # swap signs back as we had to use this trick in the contractor
s0 = class_scores[0]
s1 = class_scores[1]
v0 = 1 / (1 + s0)
v1 = 1 / (1 + s1)
class_scores[0] = v0
class_scores[1] = v1
else:
for i in range(0,nb_classes*nb_estimators,nb_estimators):
class_scores.append(math.exp((scores[i:i+nb_estimators]).sum()))
#for i in range(nb_classes):
for i in range(0, nb_classes * nb_estimators, nb_estimators):
class_scores.append(math.exp((scores[i:i + nb_estimators]).sum()))
# for i in range(nb_classes):
# class_scores.append(math.exp((scores[i::nb_classes]).sum()))
class_scores = np.asarray(class_scores)
prob.append(class_scores/class_scores.sum())
prob.append(class_scores / class_scores.sum())
return np.asarray(prob).reshape((-1, nb_classes))
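# Note on the binary branch above: with s = sum of the (sign-flipped) leaf
# values for a class, math.exp(-s) is stored per class and 1 / (1 + exp(-s))
# is the logistic link, so both class scores are probabilities even before
# the final normalisation by class_scores.sum().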
#
#==============================================================================
def get_xgboost_json(model):
# ==============================================================================
def get_xgboost_json(model, feature_names):
""" REUSED FROM SHAP
This gets a JSON dump of an XGBoost model while ensuring the feature names are their indexes.
"""
fnames = model.feature_names
fnames = feature_names
model.feature_names = None
json_trees = model.get_dump(with_stats=True, dump_format="json")
model.feature_names = fnames
......
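A runnable sketch of the dump dance get_xgboost_json performs, with feature_names now supplied by the caller (tiny synthetic model; assumes the standard xgboost Booster API):

import numpy as np
import xgboost as xgb

X = np.random.rand(30, 3)
y = (X[:, 0] > 0.5).astype(int)
booster = xgb.XGBClassifier(n_estimators=2, max_depth=2).fit(X, y).get_booster()

feature_names = booster.feature_names  # the caller now passes these in
booster.feature_names = None           # so the dump uses f0, f1, ... indexes
json_trees = booster.get_dump(with_stats=True, dump_format="json")
booster.feature_names = feature_names  # restore
print(len(json_trees))                 # one JSON string per tree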
This diff is collapsed.
#!/usr/bin/env python3
#-*- coding:utf-8 -*-
# -*- coding:utf-8 -*-
##
## xrf.py
##
......@@ -9,32 +9,34 @@
##
#
#==============================================================================
# ==============================================================================
from __future__ import print_function
from pages.application.RandomForest.utils.data import Data
from pages.application.RandomForest.utils.data import Data
import os
import sys
import pickle
import resource
from pages.application.RandomForest.utils.xgbooster import preprocess_dataset
from pages.application.RandomForest.utils.xgbooster import preprocess_dataset
from pages.application.RandomForest.utils.xrf import XRF, RF2001, Dataset, Checker
from pages.application.RandomForest.utils.xrf import XRF, RF2001, Dataset, Checker
import numpy as np
##################
from pages.application.RandomForest.utils.xpLime import lime_call
from pages.application.RandomForest.utils.xpLime import lime_call
import math
import lime
import lime.lime_tabular
###
from pages.application.RandomForest.utils.xpAnchor import anchor_call
#from anchor import utils
from pages.application.RandomForest.utils.xpAnchor import anchor_call
# from anchor import utils
from anchor import anchor_tabular
################
#
#==============================================================================
# ==============================================================================
def show_info():
"""
Print info message.
......@@ -42,35 +44,37 @@ def show_info():
print("c XRF: eXplaining Random Forest.")
print('c')
#
#==============================================================================
# ==============================================================================
def pickle_save_file(filename, data):
try:
f = open(filename, "wb")
f = open(filename, "wb")
pickle.dump(data, f)
f.close()
except:
print("Cannot save to file", filename)
exit()
def pickle_load_file(filename):
try:
f = open(filename, "rb")
f = open(filename, "rb")
data = pickle.load(f)
f.close()
return data
except:
print("Cannot load from file", filename)
exit()
#
#==============================================================================
exit()
#
# ==============================================================================
if __name__ == '__main__':
# parsing command-line options
options = Options(sys.argv)
# making output unbuffered
if sys.version_info.major == 2:
sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
......@@ -81,35 +85,34 @@ if __name__ == '__main__':
if (options.preprocess_categorical):
preprocess_dataset(options.files[0], options.preprocess_categorical_files, options.use_categorical)
exit()
if options.files:
cls = None
xrf = None
print("loading data ...")
data = Dataset(filename=options.files[0], mapfile=options.mapfile,
separator=options.separator, use_categorical = options.use_categorical)
separator=options.separator, use_categorical=options.use_categorical)
if options.train:
'''
data = Dataset(filename=options.files[0], mapfile=options.mapfile,
separator=options.separator,
use_categorical = options.use_categorical)
'''
'''
cls = RF2001(options)
train_accuracy, test_accuracy = cls.train(data)
if options.verb == 1:
print("----------------------")
print("Train accuracy: {0:.2f}".format(100. * train_accuracy))
print("Test accuracy: {0:.2f}".format(100. * test_accuracy))
print("----------------------")
print("----------------------")
xrf = XRF(options, cls, data)
#xrf.test_tree_ensemble()
# xrf.test_tree_ensemble()
bench_name = os.path.splitext(os.path.basename(options.files[0]))[0]
bench_dir_name = options.output + "/RF/" + bench_name
try:
......@@ -118,23 +121,22 @@ if __name__ == '__main__':
os.mkdir(bench_dir_name)
basename = (os.path.join(bench_dir_name, bench_name +
"_nbestim_" + str(options.n_estimators) +
"_maxdepth_" + str(options.maxdepth)))
"_nbestim_" + str(options.n_estimators) +
"_maxdepth_" + str(options.maxdepth)))
modfile = basename + '.mod.pkl'
modfile = basename + '.mod.pkl'
print("saving model to ", modfile)
pickle_save_file(modfile, cls)
#data_suffix = '.splitdata.pkl'
#filename_data = basename + data_suffix
#print("saving data to ", filename_data)
#pickle_save_file(filename_data, data)
# data_suffix = '.splitdata.pkl'
# filename_data = basename + data_suffix
# print("saving data to ", filename_data)
# pickle_save_file(filename_data, data)
# read a sample from options.explain
#if options.explain:
# if options.explain:
# options.explain = [float(v.strip()) for v in options.explain.split(',')]
'''
if options.encode:
# encode it and save the encoding to another file
......@@ -143,25 +145,24 @@ if __name__ == '__main__':
'''
if options.explain:
mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \
resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
if not xrf:
print("loading model ...")
cls = pickle_load_file(options.files[1])
cls = pickle_load_file(options.files[1])
xrf = XRF(options, cls, data)
#expl = xrf.explain(options.explain)
#expl_checker = Checker(xrf.f, data.num_class, data.extended_feature_names_as_array_strings)
cls.print_accuracy(data) # print test accuracy of the RF model
# expl = xrf.explain(options.explain)
# expl_checker = Checker(xrf.f, data.num_class, data.extended_feature_names_as_array_strings)
cls.print_accuracy(data) # print test accuracy of the RF model
samps_file = options.explain.strip()
print(samps_file)
with open(samps_file, 'r') as fp:
lines = fp.readlines()
# timers
atimes = []
lengths = []
......@@ -171,46 +172,46 @@ if __name__ == '__main__':
utimes = []
nSatCalls = []
nUnsCalls = []
ltimes = []
ctimes = []
wins = 0
for i, s in enumerate(lines):
sample = [float(v.strip()) for v in s.split(',')]
if tuple(sample) in tested:
continue
#print("inst#{0}".format(i+1))
# print("inst#{0}".format(i+1))
tested.add(tuple(sample))
#print('sample {0}: {1}'.format(i, ','.join(s.strip().split(','))))
# print('sample {0}: {1}'.format(i, ','.join(s.strip().split(','))))
xrf.encode(sample)
expl = xrf.explain(sample)
atimes.append(xrf.x.time)
atimes.append(xrf.x.time)
lengths.append(len(expl))
nvars = xrf.enc.cnf.nv
nclauses = len(xrf.enc.cnf.clauses)
#mSAT = max(xrf.x.stimes+[mSAT])
#mUNSAT = max(xrf.x.utimes+[mUNSAT])
# mSAT = max(xrf.x.stimes+[mSAT])
# mUNSAT = max(xrf.x.utimes+[mUNSAT])
if len(xrf.x.stimes):
stimes.append(max(xrf.x.stimes))
if len(xrf.x.utimes):
utimes.append(max(xrf.x.utimes))
nSatCalls.append(xrf.x.nsat)
nUnsCalls.append(xrf.x.nunsat)
#inst = data.transform(np.array(sample))[0]
#expl_checker.check(np.array(inst), expl)
# inst = data.transform(np.array(sample))[0]
# expl_checker.check(np.array(inst), expl)
#####check_expl(np.array(inst), expl, xrf.enc.forest, xrf.enc.intvs)
del xrf.enc
del xrf.x
#####################LIME###########
'''
_, ltime = lime_call(cls, data, sample, verbose=options.verb) # call lime
......@@ -218,20 +219,19 @@ if __name__ == '__main__':
#wins += 1
if atimes[-1] < ltime:
wins += 1
'''
_, ctime = anchor_call(cls, data, sample, verbose=options.verb) # call lime
'''
_, ctime = anchor_call(cls, data, sample, verbose=options.verb) # call lime
ctimes.append(ctime)
if atimes[-1] < ctime:
wins += 1
#if i == 1:
wins += 1
# if i == 1:
# break
mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \
resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss - mem
resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss - mem
# reporting the time spent
print('')
print('tot time: {0:.2f}'.format(sum(atimes)))
......@@ -240,22 +240,21 @@ if __name__ == '__main__':
print('avg time: {0:.2f}'.format(sum(atimes) / len(atimes)))
print('')
####
print('avg length: {0:.0f}'.format(round(sum(lengths) / len(lengths))*100/len(sample)))
#print('max SAT: {0:.2f}'.format(mSAT))
#print('max UNSAT: {0:.2f}'.format(mUNSAT))
print('avg length: {0:.0f}'.format(round(sum(lengths) / len(lengths)) * 100 / len(sample)))
# print('max SAT: {0:.2f}'.format(mSAT))
# print('max UNSAT: {0:.2f}'.format(mUNSAT))
print('max SAT: {0:.2f}'.format(max(stimes)))
print('max UNSAT: {0:.2f}'.format(max(utimes)))
print('max UNSAT: {0:.2f}'.format(max(utimes)))
print('avg #SAT: {0:.0f}'.format(sum(nSatCalls) / len(nSatCalls)))
print('avg #UNSAT: {0:.0f}'.format(sum(nUnsCalls) / len(nUnsCalls)))
print('')
#reporting nof_vars and nof_clauses
# reporting nof_vars and nof_clauses
print('c nof vars: {0}'.format(nvars))
print('c nof clauses: {0}'.format(nclauses))
#
print('c nof instances: {0}'.format(len(tested)))
print("c mem used: {0:.2f} Mb".format(mem/(1024*1024)))
print("c mem used: {0:.2f} Mb".format(mem / (1024 * 1024)))
# LIME runtimes
'''
print('')
......@@ -263,12 +262,11 @@ if __name__ == '__main__':
print('avg time for Lime: {0:.2f}'.format(sum(ltimes) / len(ltimes)))
print('#wins {0} out of {1}'.format(wins, len(tested)) )
'''
# Anchor runtimes
print('')
print('tot time for Anchor: {0:.2f}'.format(sum(ctimes)))
print('max time for Anchor: {0:.2f}'.format(max(ctimes)))
print('min time for Anchor: {0:.2f}'.format(min(ctimes)))
print('avg time for Anchor: {0:.2f}'.format(sum(ctimes) / len(ctimes)))
print('#wins {0} out of {1}'.format(wins, len(tested)) )
\ No newline at end of file
print('#wins {0} out of {1}'.format(wins, len(tested)))
from pysat.formula import CNF, IDPool
from pysat.solvers import Solver
from pysat.card import *
from itertools import combinations
import collections
import six
from six.moves import range
from .tree import Forest, predict_tree
from .sortnetwrk import HSorNetwrk
#
#==============================================================================
class SATEncoder(object):
"""
Encoder of Random Forest classifier into SAT.
"""
def __init__(self, forest, feats, nof_classes, extended_feature_names=None, from_file=None):
#self.model = model
self.forest = forest
self.feats = {f: i for i, f in enumerate(feats)}
self.num_class = nof_classes
self.vpool = IDPool()
#self.optns = xgb.options
self.extended_feature_names = extended_feature_names
#encoding formula
self.cnf = None
# for interval-based encoding
self.intvs, self.imaps, self.ivars = None, None, None
#if from_file:
# self.load_from(from_file)
def newVar(self, name):
assert(name)
if name in self.vpool.obj2id: #var has been already created
return self.vpool.obj2id[name]
var = self.vpool.id('{0}'.format(name))
return var
def traverse(self, tree, k, clause):
"""
Traverse a tree and encode each node.
"""
if tree.children:
var = self.newVar(tree.name)
#print("{0} => {1}".format(tree.name, var))
pos, neg = var, -var
self.traverse(tree.children[0], k, clause + [-neg]) # -var
self.traverse(tree.children[1], k, clause + [-pos]) # --var
else: # leaf node
cvar = self.newVar('class{0}_tr{1}'.format(tree.values,k))
print('c: ', clause + [cvar])
self.cnf.append(clause + [cvar])
'''
def encode_node(self, node):
"""
Encode a node of a tree.
"""
if '_' not in node.name:
# continuous features => expecting an upper bound
# feature and its upper bound (value)
f, v = node.name, node.threshold
existing = True if tuple([f, v]) in self.idmgr.obj2id else False
vid = self.idmgr.id(tuple([f, v]))
bv = Symbol('bvar{0}'.format(vid), typename=BOOL)
if not existing:
if self.intvs:
d = self.imaps[f][v] + 1
pos, neg = self.ivars[f][:d], self.ivars[f][d:]
self.enc.append(Iff(bv, Or(pos)))
self.enc.append(Iff(Not(bv), Or(neg)))
else:
fvar, fval = Symbol(f, typename=REAL), Real(v)
self.enc.append(Iff(bv, LT(fvar, fval)))
return bv, Not(bv)
else:
# all features are expected to be categorical and
# encoded with one-hot encoding into Booleans
# each node is expected to be of the form: f_i < 0.5
bv = Symbol(node.name, typename=BOOL)
# left branch is positive, i.e. bv is true
# right branch is negative, i.e. bv is false
return Not(bv), bv
'''
def compute_intervals(self):
"""
Traverse all trees in the ensemble and extract intervals for each
feature.
At this point, the method only works for numerical datasets!
"""
def traverse_intervals(tree):
"""
Auxiliary function. Recursive tree traversal.
"""
if tree.children:
f = tree.name
v = tree.threshold
self.intvs[f].add(v)
traverse_intervals(tree.children[0])
traverse_intervals(tree.children[1])
# initializing the intervals
self.intvs = {'f{0}'.format(i): set([]) for i in range(len(self.feats))}
for tree in self.forest.trees:
traverse_intervals(tree)
# OK, we got all intervals; let's sort the values
self.intvs = {f: sorted(self.intvs[f]) + ['+'] for f in six.iterkeys(self.intvs)}
self.imaps, self.ivars = {}, {}
for feat, intvs in six.iteritems(self.intvs):
self.imaps[feat] = {}
self.ivars[feat] = []
for i, ub in enumerate(intvs):
self.imaps[feat][ub] = i
ivar = Symbol(name='{0}_intv{1}'.format(feat, i), typename=BOOL)
self.ivars[feat].append(ivar)
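# Illustrative result (thresholds hypothetical): if feature 'f0' is split on
# 2.5 and 7.0 somewhere in the forest, then after this loop
#   self.intvs == {'f0': [2.5, 7.0, '+']}
#   self.imaps == {'f0': {2.5: 0, 7.0: 1, '+': 2}}
# and self.ivars['f0'] holds Booleans f0_intv0, f0_intv1, f0_intv2.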
def encode(self, sample):
"""
Do the job.
"""
self.cnf = CNF()
# getting a tree ensemble
#self.forest = Forest(self.model, self.extended_feature_names)
num_tree = len(self.forest.trees)
# introducing class score variables
cvars = [[] for t in range(num_tree)]
for k in range(len(self.forest.trees)):
for j in range(self.num_class):
var = self.newVar('class{0}_tr{1}'.format(j,k))
cvars[k].append(-var)
# if targeting interval-based encoding,
# traverse all trees and extract all possible intervals
# for each feature
'''
if self.optns.encode == 'smtbool':
self.compute_intervals()
'''
# traversing and encoding each tree
for k, tree in enumerate(self.forest.trees):
print("Encode tree#{0}".format(k))
# encoding the tree
self.traverse(tree, k, [])
# exactly one class var is true; this could be squeezed further
# to reduce the number of binary clauses
enc = CardEnc.atmost(lits=[-v for v in cvars[k]],
vpool=self.vpool,
encoding=EncType.cardnetwrk) #AtMostOne constraint
self.cnf.extend(enc.clauses)
csum = [[] for c in range(self.num_class)]
for k, tree in enumerate(self.forest.trees):
c = predict_tree(tree, sample)
csum[c].append(k)
cvars[k][c] = - cvars[k][c]
# encoding the majority
cmaj,_ = max(enumerate(csum), key=(lambda x: len(x[1])))
sorted_lits = [[] for c in range(self.num_class)]
#sorting bits
for j in range(self.num_class):
tvars = [cvars[k][j] for k in range(num_tree)]
clauses, vout, _ = HSorNetwrk(lits=tvars, vpool = self.vpool)
self.cnf.extend(clauses)
sorted_lits[j] = vout
#print("tvars: {0} ==> {3} \nclauses: {1}\ntop: {2}".format(tvars, clauses, self.vpool.top, vout))
#compare bits
for j in range(self.num_class):
if j == cmaj:
continue
for k in range(num_tree):
self.cnf.append([ -sorted_lits[j][k], sorted_lits[cmaj][k] ]) # (v1 => v2)
#print("-{0} => {1}".format(sorted_lits[j][k], sorted_lits[cmaj][k]))
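# Note: assuming HSorNetwrk emits each class's vote literals in sorted
# (unary) order, the per-position implication above,
#   sorted_lits[j][k] -> sorted_lits[cmaj][k],
# enforces count(class j) <= count(class cmaj), keeping cmaj the majority.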
'''
# enforce exactly one of the feature values to be chosen
# (for categorical features)
categories = collections.defaultdict(lambda: [])
for f in self.extended_feature_names:
if '_' in f:
categories[f.split('_')[0]].append(self.newVar(f))
for c, feats in six.iteritems(categories):
#ExactlyOne feat is True
self.cnf.append(feats)
enc = CardEnc.atmost(lits=feats, vpool=self.vpool, encoding=EncType.cardnetwrk)
self.cnf.extend(enc.clauses)
'''
#if self.optns.verb:
# number of variables
print('#vars:', self.cnf.nv)
# number of clauses
print('#clauses:', len(self.cnf.clauses))
#print(self.cnf.clauses)
return self.cnf, self.intvs, self.imaps, self.ivars
'''
def test_sample(self, sample):
"""
Check whether or not the encoding "predicts" the same class
as the classifier given an input sample.
"""
# first, compute the scores for all classes as would be
# predicted by the classifier
# score arrays computed for each class
csum = [[] for c in range(self.num_class)]
#if self.optns.verb:
print('testing sample:', list(sample))
# traversing all trees
for i, tree in enumerate(self.forest.trees):
c = predict_tree(tree, sample)
csum[c].append(i)
# encoding the majority
cmaj,_ = max(enumerate(csum), key=(lambda x: len(x[1])))
# second, get the scores computed with the use of the encoding
assert self.cnf, "There is no encoding."
slv = Solver(name="minisat22")
slv.add_formula(self.cnf)
# asserting the sample
hypos = []
#for i, fval in enumerate(sample):
'''
def access(self):
"""
Get access to the encoding, features names, and the number of
classes.
"""
return self.cnf, self.intvs, self.imaps, self.ivars, self.feats, self.num_class
\ No newline at end of file
#define PY_SSIZE_T_CLEAN
#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <Python.h>
#include "sortcard.hh"
using namespace std;
// docstrings
//=============================================================================
static char module_docstring[] = "This module provides an interface for "
"encoding a few types of cardinality "
"constraints";
//static char atmost_docstring[] = "Create an AtMost(k) constraint.";
//static char atleast_docstring[] = "Create an AtLeast(k) constraint.";
static char sortn_docstring[] = "Sort an array of bits.";
static PyObject *CardError;
static jmp_buf env;
// function declaration for functions available in module
//=============================================================================
extern "C" {
//static PyObject *py_encode_atmost (PyObject *, PyObject *);
//static PyObject *py_encode_atleast (PyObject *, PyObject *);
static PyObject *py_sortn (PyObject *, PyObject *);
}
// module specification
//=============================================================================
static PyMethodDef module_methods[] = {
//{ "encode_atmost", py_encode_atmost, METH_VARARGS, atmost_docstring },
//{ "encode_atleast", py_encode_atleast, METH_VARARGS, atleast_docstring },
{ "HSort", py_sortn, METH_VARARGS, sortn_docstring },
{ NULL, NULL, 0, NULL }
};
extern "C" {
// signal handler for SIGINT
//=============================================================================
static void sigint_handler(int signum)
{
longjmp(env, -1);
}
//#if PY_MAJOR_VERSION >= 3 // for Python3
// PyInt_asLong()
//=============================================================================
static int pyint_to_cint(PyObject *i_obj)
{
return PyLong_AsLong(i_obj);
}
// PyInt_fromLong()
//=============================================================================
static PyObject *pyint_from_cint(int i)
{
return PyLong_FromLong(i);
}
// PyCapsule_New()
//=============================================================================
static PyObject *void_to_pyobj(void *ptr)
{
return PyCapsule_New(ptr, NULL, NULL);
}
// PyCapsule_GetPointer()
//=============================================================================
static void *pyobj_to_void(PyObject *obj)
{
return PyCapsule_GetPointer(obj, NULL);
}
// PyInt_Check()
//=============================================================================
static int pyint_check(PyObject *i_obj)
{
return PyLong_Check(i_obj);
}
// module initialization
//=============================================================================
static struct PyModuleDef module_def = {
PyModuleDef_HEAD_INIT,
"pysortnetwrk", /* m_name */
module_docstring, /* m_doc */
-1, /* m_size */
module_methods, /* m_methods */
NULL, /* m_reload */
NULL, /* m_traverse */
NULL, /* m_clear */
NULL, /* m_free */
};
/*
PyMODINIT_FUNC PyInit_pycard(void)
{
PyObject *m = PyModule_Create(&module_def);
if (m == NULL)
return NULL;
CardError = PyErr_NewException((char *)"pycard.error", NULL, NULL);
Py_INCREF(CardError);
if (PyModule_AddObject(m, "error", CardError) < 0) {
Py_DECREF(CardError);
return NULL;
}
return m;
}*/
PyMODINIT_FUNC PyInit_pysortnetwrk(void)
{
PyObject *m = PyModule_Create(&module_def);
if (m == NULL)
return NULL;
CardError = PyErr_NewException((char *)"pycard.error", NULL, NULL);
Py_INCREF(CardError);
if (PyModule_AddObject(m, "error", CardError) < 0) {
Py_DECREF(CardError);
return NULL;
}
return m;
}
// auxiliary function for translating an iterable to a vector<int>
//=============================================================================
static bool pyiter_to_vector(PyObject *obj, vector<int>& vect)
{
PyObject *i_obj = PyObject_GetIter(obj);
if (i_obj == NULL) {
PyErr_SetString(PyExc_RuntimeError,
"Object does not seem to be an iterable.");
return false;
}
PyObject *l_obj;
while ((l_obj = PyIter_Next(i_obj)) != NULL) {
if (!pyint_check(l_obj)) {
Py_DECREF(l_obj);
Py_DECREF(i_obj);
PyErr_SetString(PyExc_TypeError, "integer expected");
return false;
}
int l = pyint_to_cint(l_obj);
Py_DECREF(l_obj);
if (l == 0) {
Py_DECREF(i_obj);
PyErr_SetString(PyExc_ValueError, "non-zero integer expected");
return false;
}
vect.push_back(l);
}
Py_DECREF(i_obj);
return true;
}
//
//=============================================================================
static PyObject *py_sortn(PyObject *self, PyObject *args)
{
PyObject *av_obj;
//PyObject *cv_obj;
int top;
int zvar;
//PyObject *lhs_obj;
//int rhs;
//int top;
//int enc;
int main_thread;
if (!PyArg_ParseTuple(args, "Oiii", &av_obj, &top, &zvar,
&main_thread))
return NULL;
vector<int> av;
if (pyiter_to_vector(av_obj, av) == false)
return NULL;
PyOS_sighandler_t sig_save;
if (main_thread) {
sig_save = PyOS_setsig(SIGINT, sigint_handler);
if (setjmp(env) != 0) {
PyErr_SetString(CardError, "Caught keyboard interrupt");
return NULL;
}
}
// calling encoder
ClauseSet dest;
vector<int> cv;
sortn_half_sorter_recur(top, dest, av, cv, zvar);
//_encode_atmost(dest, lhs, rhs, top, enc);
if (main_thread)
PyOS_setsig(SIGINT, sig_save);
// creating the resulting clause set
PyObject *dest_obj = PyList_New(dest.size());
for (size_t i = 0; i < dest.size(); ++i) {
PyObject *cl_obj = PyList_New(dest[i].size());
for (size_t j = 0; j < dest[i].size(); ++j) {
PyObject *lit_obj = pyint_from_cint(dest[i][j]);
PyList_SetItem(cl_obj, j, lit_obj);
}
PyList_SetItem(dest_obj, i, cl_obj);
}
PyObject *cv_obj = PyList_New(cv.size());
for (size_t i = 0; i < cv.size(); ++i) {
PyObject *lit_obj = pyint_from_cint(cv[i]);
PyList_SetItem(cv_obj, i, lit_obj);
}
PyObject *ret = Py_BuildValue("OOn", dest_obj, cv_obj, (Py_ssize_t)top);
Py_DECREF(dest_obj);
Py_DECREF(cv_obj);
return ret;
}
} // extern "C"
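From the Python side, a hedged sketch of what py_sortn exposes, following the "Oiii" tuple parsed above and the final Py_BuildValue (the literal values are illustrative, and the module must be compiled first):

# import pysortnetwrk
# clauses, sorted_lits, new_top = pysortnetwrk.HSort(
#     [3, 5, 7],  # non-zero input literals
#     10,         # current top variable id
#     0,          # zvar, passed through to sortn_half_sorter_recur
#     1)          # main_thread flag (installs the SIGINT handler)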
File deleted
This diff is collapsed.