Source code for qtree.ctree

"""Classical decision trees.
"""
import numpy as np
from itertools import product
from sklearn import metrics
import warnings
import copy
import logging



[docs]
class Node(object):
    """Classical decision tree node (root, inner node or leaf).
    
    Parameters
    ----------
    
    depth : int
        Depth of this node. The root corresponds to ``depth=0``.
        
    y_dim : int
        Number of binary labels.
   
    default_y : int or None
        Default value based on majority of classes in training data. Is used 
        if a classification is ambiguous in a leaf. Set to None to disable.
        
    logger : logging.Logger or None
        Logger for all output purposes. If None, print to std is used.
    """

    def __init__(self, depth, y_dim, default_y=None, logger=None):
        self._depth = depth
        self._y_dim = y_dim
        self._split_index = None
        self._split_0_node = None
        self._split_1_node = None
        self._proba_prediction = None
        self._undefined_prediction = None
        self._default_y = default_y  # if no samples are available: predict class with the highest occurence in training
        self._logger = logger
        self._warning = None  # only store last warning

    @property
    def py_default(self):
        if self._default_y is None:
            py = [[.5] * self._y_dim, [.5] * self._y_dim]
        else:
            py = np.empty((2, self._y_dim))
            for yi in range(self._y_dim):
                if self._default_y[yi] is None:
                    py[:, yi] = [.5, .5]
                elif self._default_y[yi] == 0:
                    py[:, yi] = [1., 0.]
                elif self._default_y[yi] == 1:
                    py[:, yi] = [0., 1.]
                else:
                    raise ValueError
        return py

    def __str__(self):
        return self._to_str()

    def _print(self, msg, lvl=logging.DEBUG):
        if self._logger is not None:
            self._logger.log(lvl, msg)
        else:
            print(msg)

    def _push_warning(self, msg, omit_warnings):
        self._warning = msg
        if not omit_warnings:
            self._print("Warning: {}".format(msg), logging.WARNING)

    def _split(self, tree_dict):
        if len(tree_dict) > 0 and type(list(tree_dict.values())[0]) is list and np.all(
                [type(split) is dict for split in list(tree_dict.values())[0]]):
            self._split_index = list(tree_dict.keys())[0]
            split_list = list(tree_dict.values())[0]
            self._split_0_node = Node(self._depth + 1, self._y_dim, self._default_y, self._logger)
            self._split_0_node._split(split_list[0])
            self._split_1_node = Node(self._depth + 1, self._y_dim, self._default_y, self._logger)
            self._split_1_node._split(split_list[1])
        else:
            self._leaf(tree_dict)

    def _leaf(self, tree_dict):
        if len(tree_dict) == 0 or 0 not in tree_dict or 1 not in tree_dict:
            self._proba_prediction = self.py_default
            self._undefined_prediction = True
        else:
            py = [tree_dict[0], tree_dict[1]]
            self._proba_prediction = py
            self._undefined_prediction = False

    def _to_str(self, indent=0, tab="\t", title=""):
        out = ""
        if len(title) > 0:
            out += "{}{}".format(tab * indent, title) + "\n"
        if self._split_0_node is not None:
            out += "{}x[{}] = {}".format(tab * indent, self._split_index, 0) + "\n"
            out += self._split_0_node._to_str(indent + 1, tab)
        if self._split_1_node is not None:
            out += "{}x[{}] = {}".format(tab * indent, self._split_index, 1) + "\n"
            out += self._split_1_node._to_str(indent + 1, tab)
        if self._proba_prediction is not None:
            if self._default_y is None:
                undef_tag = "?"
            else:
                undef_tag = "! y=[{}]".format(
                    ",".join(["{:d}".format(y) if y is not None else "?" for y in self._default_y]))
            y_predict = self._predict(None, True)
            y_proba_0 = self._proba_prediction[0]
            y_proba_1 = self._proba_prediction[1]
            tag_str = (" " + undef_tag) if self._undefined_prediction else ""
            for idx in range(self._y_dim):
                out += "{}{}".format(tab * indent,
                                     "y_{} = {} ({:.3f}/{:.3f}){}".format(idx, y_predict[idx], y_proba_0[idx],
                                                                          y_proba_1[idx], tag_str)) + "\n"
        return out

    def _predict(self, x=None, omit_warnings=False, verbose=False):
        return np.argmax(self._predict_proba(x, omit_warnings, verbose), axis=0)

    def _predict_proba(self, x=None, omit_warnings=False,
                       verbose=False):  # verbose = True: print path through tree for each prediction
        if verbose:
            self._print("predict_proba: depth={}, split_index={}".format(self._depth, self._split_index))
        if self._proba_prediction is not None:
            if verbose:
                self._print("\tpredict_proba: prediction={}{}".format(self._proba_prediction,
                                                                      " (undefined)" if self._undefined_prediction else ""))
            if self._undefined_prediction:
                self._push_warning("Undefined prediction for x={}!".format(x), omit_warnings)
            return self._proba_prediction  # [[y0=0,y1=0,...],[y0=1,y1=1,...]]
        if x[self._split_index] == 0:
            if verbose:
                self._print("\tpredict_proba: x[{}]==0 split".format(self._split_index))
            return self._split_0_node._predict_proba(x, omit_warnings, verbose)
        else:
            if verbose:
                self._print("\tpredict_proba: x[{}]==1 split".format(self._split_index))
            return self._split_1_node._predict_proba(x, omit_warnings, verbose)




[docs]
class ClassicalTreeBase(Node):
    """Classical decision tree base.
    
    Parameters
    ----------
    
    tree_dict : dict
        Tree structure as dictionary, see ``qtree.QTree``.
        
    y_dim : int or None
        Number of binary labels. If None, extract from ``tree_dict``.
        
    y : array-like of shape (n_samples, n_labels) or None
        Target labels of training data. Used to find majority classes for 
        default predictions. If None, no defaults are used.
   
    tree_dict_detailed : dict or None
        Tree structure as detailed dictionary, see ``qtree.QTree``. Is used to 
        add optional information to each node. Set to None to disable.
        
    logger : logging.Logger or None
        Logger for all output purposes. If None, print to std is used.
    """

    def __init__(self, tree_dict, y_dim, y, tree_dict_detailed, logger):
        # y dimension
        y_dim_, found = self._get_y_dim(tree_dict)
        if not found:
            raise ValueError('tree_dict has wrong format')
        if y_dim is not None:
            assert y_dim == y_dim_, 'y_dim and tree_dict incompatible'
        else:
            y_dim = y_dim_

        # y data
        if y is None:
            default_y = None
        else:
            # to determine default_py0_shift (for default prediction based on training data support if no sampling is present in a leaf)
            y = np.asarray(y)
            assert y.shape[1] == y_dim, 'data and tree_dict incompatible'
            self._y0 = np.sum(y, axis=0)
            self._y1 = y.shape[0] - np.sum(y, axis=0)
            default_y = np.empty(y_dim, dtype=int)
            for yi in range(y_dim):
                if self._y0[yi] > self._y1[yi]:
                    default_y[yi] = 1
                elif self._y0[yi] < self._y1[yi]:
                    default_y[yi] = 0
                else:
                    default_y[yi] = None

                    # construct tree
        super().__init__(depth=0, y_dim=y_dim, default_y=default_y, logger=logger)
        self.from_tree_dict(tree_dict, tree_dict_detailed)

    def _get_y_dim(self, value):
        if type(value) is list:
            for value_ in value:
                y_dim, found = self._get_y_dim(value_)
                if found:
                    return y_dim, True
        elif type(value) is dict:
            if len(value) == 0:
                return None, False
            elif all([type(value) is list and all([type(v) is float or type(v) is int for v in value]) for value in
                      value.values()]):
                y_dim = len(list(value.values())[0])
                return y_dim, True
            else:
                for value_ in value.values():
                    y_dim, found = self._get_y_dim(value_)
                    if found:
                        return y_dim, True
        else:
            raise ValueError

    def _tree_dict_traversal(self, tree_dict, paths, path=[]):
        if len(tree_dict) > 0 and type(list(tree_dict.values())[0]) is list and np.all(
                [type(split) is dict for split in list(tree_dict.values())[0]]):
            split_index = list(tree_dict.keys())[0]
            split_list = list(tree_dict.values())[0]
            self._tree_dict_traversal(split_list[0], paths, path + [[split_index, 0]])
            self._tree_dict_traversal(split_list[1], paths, path + [[split_index, 1]])
        else:
            paths.append(path)




[docs]
class ClassicalTree(ClassicalTreeBase):
    """Classical decision tree base.
    
    Parameters
    ----------
    
    tree_dict : dict
        Tree structure as dictionary, see ``qtree.QTree``.
        
    y_dim : int or None
        Number of binary labels. If None, extract from ``tree_dict``. Defaults 
        to None.
        
    y : array-like of shape (n_samples, n_labels) or None
        Target labels of training data. Used to find majority classes for 
        default predictions. If None, no defaults are used. Defaults to None.
   
    tree_dict_detailed : dict or None
        Tree structure as detailed dictionary, see ``qtree.QTree``. Is used to 
        add optional information to each node. Set to None to disable. Defaults 
        to empty dictionary.
        
    logger : logging.Logger or None
        Logger for all output purposes. If None, print to std is used. Defaults 
        to None.
    """

    def __init__(self, tree_dict, y_dim=None, y=None, tree_dict_detailed={}, logger=None):
        super().__init__(tree_dict=tree_dict, y_dim=y_dim, y=y, tree_dict_detailed=tree_dict_detailed, logger=logger)

    def from_tree_dict(self, tree_dict, tree_dict_detailed={}):
        self._tree_dict = copy.deepcopy(tree_dict)
        if tree_dict_detailed is None:
            tree_dict_detailed = {}
        self._tree_dict_detailed = copy.deepcopy(
            tree_dict_detailed)  # additional information, not actually used for classifier
        self._split(self._tree_dict)
        return self


[docs]
    def predict(self, X, omit_warnings=False, verbose=False):
        """Make class prediction.
        """

        X = np.asarray(X)
        y = np.empty((X.shape[0], self._y_dim), dtype=bool)
        for idx in range(X.shape[0]):
            y[idx, :] = self._predict(X[idx, :], omit_warnings, verbose)
        return y



[docs]
    def predict_proba(self, X, omit_warnings=False, verbose=False):
        """Make class probability prediction.
        """

        X = np.asarray(X)
        p = np.empty((X.shape[0], 2, self._y_dim), dtype=np.double)
        for idx in range(X.shape[0]):
            p[idx, :, :] = np.array(self._predict_proba(X[idx, :], omit_warnings, verbose))  # .reshape(2, self._y_dim)
        return p



[docs]
    def print_classification_report(self, X, y, omit_warnings=False):
        """Print classification report.
        """

        y_predict = self.predict(X, omit_warnings)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            print(metrics.classification_report(y, y_predict))  # true, pred
        return metrics.accuracy_score(y, y_predict)



[docs]
    def get_classification_dict(self, X, y, omit_warnings=False):
        """Return classification report as dictionary.
        """

        y_predict = self.predict(X, omit_warnings)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
        return metrics.classification_report(y, y_predict, output_dict=True)  # true, pred



[docs]
    def calc_exact_probabilities(self, X, y, omit_warnings=False):
        """Calculate the exact class prediction probabilities based on the 
        data ``X`` and ``y``.
        """

        # exact probabilities based on data
        paths = []
        self._tree_dict_traversal(self._tree_dict, paths)

        # exact tree dict
        tree_dict_exact = {}  # p(y)
        for path in paths:
            X_filter = X.copy()
            y_filter = y.copy()
            node = tree_dict_exact
            for split_idx, split_value in path:
                y_filter = y_filter[X_filter[:, split_idx] == split_value, :]
                X_filter = X_filter[X_filter[:, split_idx] == split_value, :]
                if split_idx not in node:
                    node[split_idx] = [{}, {}]
                node = node[split_idx][split_value]
            if y_filter.shape[0] > 0:
                py0 = [y_filter[y_filter[:, y_idx] == 0, y_idx].shape[0] / y_filter.shape[0] for y_idx in
                       range(y_filter.shape[1])]
                py1 = [y_filter[y_filter[:, y_idx] == 1, y_idx].shape[0] / y_filter.shape[0] for y_idx in
                       range(y_filter.shape[1])]
            else:  # fall back to unknown
                self._push_warning("Empty y_filter for path = {}".format(path), omit_warnings)
                py0 = [.5] * self._y_dim
                py1 = [.5] * self._y_dim
            node[0] = py0
            node[1] = py1

        def to_data_probabilities(x):
            N = x.shape[1]
            data_probabilities = {}
            for key in product('01', repeat=N):
                data_probabilities["".join(key)] = 0
            for key in product('01', repeat=N):
                key = "".join(key)
                values = [bool(int(value)) for value in key]
                if x.shape[0] > 0:
                    p = x[np.logical_and.reduce([x[:, index] == value for index, value in enumerate(values)])].shape[
                            0] / x.shape[0]
                else:
                    self._push_warning("Empty x for path = {}".format(path), omit_warnings)
                    p = .5
                key = key[::-1]  # correct qiskit ordering
                data_probabilities[key] = p
            return data_probabilities

        # exact conditional probabilities
        tree_dict_exact_data_probabilities = {}
        for path in paths:
            X_filter = X.copy()
            y_filter = y.copy()
            node = tree_dict_exact_data_probabilities
            split_idx_list = []
            for split_idx, split_value in path:
                y_filter = y_filter[X_filter[:, split_idx] == split_value, :]
                X_filter = X_filter[X_filter[:, split_idx] == split_value, :]
                split_idx_list.append(split_idx)
                if split_idx not in node:
                    node[split_idx] = [{}, {}]
                node = node[split_idx][split_value]
            X_filter = np.delete(X_filter, split_idx_list,
                                 axis=1)  # enable: save only reduced, disable: save everything
            Xy_filter = np.concatenate((X_filter, y_filter), axis=1)
            data_probabilities_X = to_data_probabilities(X_filter)
            node['p(x|cl)'] = data_probabilities_X
            data_probabilities_y = to_data_probabilities(y_filter)
            node['p(y|cl)'] = data_probabilities_y
            data_probabilities_Xy = to_data_probabilities(Xy_filter)
            node['p(x,y|cl)'] = data_probabilities_Xy
            p_cl = X_filter.shape[0] / X.shape[0]
            node['p(cl)'] = p_cl

        return paths, tree_dict_exact, tree_dict_exact_data_probabilities



[docs]
    def calc_difference_to_exact(self, paths, tree_dict_exact, tree_dict_exact_data_probabilities={}, N=None,
                                 epsilon=None, omit_warnings=False):
        """Calculate the distance between the class probabilities in the 
        leaves between the tree dictionary and another tree dictionary.
        """

        # distance between p(y) from two tree_dicts
        tree_dict = self._tree_dict
        tree_dict_detailed = self._tree_dict_detailed

        def p_distance(p_q, p_e):
            p_q = np.array(p_q)  # 1-dim
            p_e = np.array(p_e)  # 1-dim
            # from scipy.spatial.distance import jensenshannon
            # return jensenshannon(p_q, p_e)
            return np.linalg.norm(p_q - p_e)

        # 'p(y)'
        py_dist_list = []
        for path in paths:
            node_q = tree_dict
            node_e = tree_dict_exact
            for split_idx, split_value in path:
                node_q = node_q[split_idx][split_value]
                node_e = node_e[split_idx][split_value]
            if 0 in node_q and 1 in node_q:
                p0_q = node_q[0]
                p1_q = node_q[1]
            else:  # fall back to unknown
                self._push_warning("0, 1 not keys in node_q for path = {}".format(path), omit_warnings)
                p0_q = [.5] * self._y_dim
                p1_q = [.5] * self._y_dim
            p_q = np.array([p0_q, p1_q])[0, :]  # (2, y_dim) -> (y_dim)
            if 0 in node_e and 1 in node_e:
                p0_e = node_e[0]
                p1_e = node_e[1]
            else:  # fall back to unknown
                self._push_warning("0, 1 not keys in node_e for path = {}".format(path), omit_warnings)
                p0_e = [.5] * self._y_dim
                p1_e = [.5] * self._y_dim
            p_e = np.array([p0_e, p1_e])[0, :]  # (2, y_dim) -> (y_dim)
            d = p_distance(p_q, p_e)
            py_dist_list.append(d)
        py_dist_list = np.array(py_dist_list)
        py_dist_mean = np.mean(py_dist_list)

        # 'p(cl)'
        pcl_dist_list = []
        if len(tree_dict_detailed) > 0 and len(tree_dict_exact_data_probabilities) > 0:
            for path in paths:
                node_q = tree_dict_detailed
                node_e = tree_dict_exact_data_probabilities
                for split_idx, split_value in path:
                    node_q = node_q[split_idx][split_value]
                    node_e = node_e[split_idx][split_value]
                if 'p(cl)' in node_q:
                    p_q = node_q['p(cl)']
                else:  # fall back to unknown
                    self._push_warning("'p(cl)' not key in node_q for path = {}".format(path), omit_warnings)
                    p_q = 0
                if 'p(cl)' in node_e:
                    p_e = node_e['p(cl)']
                else:  # fall back to unknown
                    self._push_warning("'p(cl)' not key in node_e for path = {}".format(path), omit_warnings)
                    p_e = 0
                d = p_distance(p_q, p_e)
                pcl_dist_list.append(d)
            pcl_dist_mean = np.mean(pcl_dist_list)
        else:
            pcl_dist_mean = np.nan
        pcl_dist_list = np.array(pcl_dist_list)

        return py_dist_list, py_dist_mean, pcl_dist_list, pcl_dist_mean