# Source code for DLL.MachineLearning.SupervisedLearning.Trees._AdaBoostClassifier

import torch
from math import log
from warnings import warn

from ._DecisionTree import DecisionTree
from ....Exceptions import NotFittedError



# [docs]
class AdaBoostClassifier:
    """
    AdaBoostClassifier implements a classification algorithm fitting many consecutive :class:`DecisionTrees <DLL.MachineLearning.SupervisedLearning.Trees.DecisionTree>` to previously misclassified samples.

    Args:
        n_trees (int, optional): The number of trees used for predicting. Defaults to 10. Must be a positive integer.
        max_depth (int, optional): The maximum depth of the tree. Defaults to 25. Must be a positive integer.
        min_samples_split (int, optional): The minimum required samples in a leaf to make a split. Defaults to 2. Must be a positive integer.
        criterion (str, optional): The information criterion used to select optimal splits. Must be one of "entropy" or "gini". Defaults to "gini".
    Attributes:
        n_trees (int): The number of trees. Overwritten by fit() with the number of trees actually kept if boosting stops early.
        n_features (int): The number of features. Available after fitting.
        n_classes (int): The number of classes. 2 for binary classification. Available after fitting.
        classes (numpy.ndarray): The sorted unique labels seen during fitting. Available after fitting.
        confidences (list of 0-dimensional torch.Tensor): The confidence (vote weight) of each kept tree. Available after fitting.
        trees (list or None): The fitted weak learners. None until fit() has been called.
    """
    def __init__(self, n_trees=10, max_depth=25, min_samples_split=2, criterion="gini"):
        if not isinstance(n_trees, int) or n_trees < 1:
            raise ValueError("n_trees must be a positive integer.")
        if not isinstance(max_depth, int) or max_depth < 1:
            raise ValueError("max_depth must be a positive integer.")
        if not isinstance(min_samples_split, int) or min_samples_split < 1:
            raise ValueError("min_samples_split must be a positive integer.")
        if criterion not in ["entropy", "gini"]:
            raise ValueError('The chosen criterion must be one of "entropy" or "gini".')

        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        # None marks the model as unfitted; predict() and predict_proba() check this.
        self.trees = None

    def fit(self, X, y, verbose=True):
        """
        Fits the AdaBoostClassifier model to the input data by fitting trees to the errors made by previous trees.

        Each tree is trained on a bootstrap sample drawn according to the current sample weights. The weights of
        the samples the tree misclassifies are then increased. Boosting stops early as soon as a tree is not
        better than random guessing. In that case the tree is discarded and ``n_trees`` is set to the number of
        trees kept.

        Args:
            X (torch.Tensor of shape (n_samples, n_features)): The input data, where each row is a sample and each column is a feature.
            y (torch.Tensor of shape (n_samples,)): The labels corresponding to each sample. Every element must be in [0, ..., n_classes - 1].
            verbose (bool, optional): Determines if warnings are given if the training ends due to a weak learner being worse than random guessing. Defaults to True.
        Returns:
            list of 0-dimensional torch.Tensor: The weighted training error of every fitted tree, including a final rejected tree if training stopped early.
        Raises:
            TypeError: If the input matrix or the label vector is not a PyTorch tensor or if verbose is not a boolean.
            ValueError: If the input matrix or the label vector is not the correct shape, is empty or the label vector contains wrong values.
        """
        if not isinstance(X, torch.Tensor) or not isinstance(y, torch.Tensor):
            raise TypeError("The input matrix and the label matrix must be a PyTorch tensor.")
        if X.ndim != 2:
            raise ValueError("The input matrix must be a 2 dimensional tensor.")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("The labels must be 1 dimensional with the same number of samples as the input data")
        if len(y) == 0:
            # Without this guard the uniform weight 1 / len(y) below would raise ZeroDivisionError.
            raise ValueError("The input data must contain at least one sample.")
        unique_labels = torch.unique(y).detach().cpu()
        if set(unique_labels.tolist()) != set(range(len(unique_labels))):
            raise ValueError("y must only contain the values in [0, ..., n_classes - 1].")
        if not isinstance(verbose, bool):
            raise TypeError("verbose must be a boolean.")

        self.n_classes = len(unique_labels)
        self.classes = unique_labels.numpy()
        y = y.to(X.dtype)
        self.n_features = X.shape[1]

        n_samples = len(y)
        trees = []
        # Start from a uniform distribution over the training samples.
        weights = torch.full_like(y, 1 / n_samples)
        self.confidences = []
        errors = []

        for i in range(self.n_trees):
            # Bootstrap sample in which hard (heavily weighted) samples are drawn more often.
            indices = torch.multinomial(weights, n_samples, replacement=True)
            X_sample, y_sample = self._add_missing_classes(X, y, X[indices], y[indices])

            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split, criterion=self.criterion)
            tree.fit(X_sample, y_sample)
            incorrect = tree.predict(X) != y
            # Weighted error on the full training set (weights sum to one).
            eps = torch.sum(incorrect * weights)
            errors.append(eps)

            # Only keep the learner if it is better than random guessing.
            if eps < 1 - 1 / self.n_classes:
                # The 1e-8 avoids a division by zero for a perfect learner.
                alpha = 0.5 * (torch.log((1 - eps) / (eps + 1e-8)) + log(self.n_classes - 1))
                weights = weights * torch.exp(alpha * incorrect)
                weights /= weights.sum()  # keep weights as a probability distribution
                self.confidences.append(alpha)
            else:
                if verbose:
                    warn(f"The latest weak learner is worse than random guessing. The training is stopped to reduce over fitting. Only {i} learners are used.")
                self.n_trees = i
                break

            trees.append(tree)

        self.trees = trees
        return errors

    def _add_missing_classes(self, X, y, X_sample, y_sample):
        """
        Makes sure that a bootstrap sample contains at least one datapoint from every class.

        For every class in [0, ..., n_classes - 1] that is absent from ``y_sample``, one randomly chosen
        training sample of that class is appended. Classes already present are left untouched.

        Args:
            X (torch.Tensor of shape (n_samples, n_features)): The full training data.
            y (torch.Tensor of shape (n_samples,)): The full training labels.
            X_sample (torch.Tensor of shape (n_drawn, n_features)): The bootstrap sample of the data.
            y_sample (torch.Tensor of shape (n_drawn,)): The labels of the bootstrap sample.
        Returns:
            tuple of torch.Tensor: The possibly extended ``(X_sample, y_sample)``.
        """
        # Compare plain Python numbers. Tensor elements hash by identity and would never match.
        present = set(torch.unique(y_sample).tolist())
        missing = [label for label in range(self.n_classes) if label not in present]
        if not missing:
            return X_sample, y_sample

        # The 0/1 mask used as multinomial weights draws uniformly among the samples of that class.
        extra = torch.cat([torch.multinomial((y == label).float(), 1) for label in missing])
        return torch.cat([X_sample, X[extra]]), torch.cat([y_sample, y[extra]])

    def _validate_predict_input(self, X):
        """
        Checks that the model is fitted and that X is a 2 dimensional tensor with the fitted number of features.

        Raises:
            NotFittedError: If the AdaBoostClassifier model has not been fitted.
            TypeError: If the input matrix is not a PyTorch tensor.
            ValueError: If the input matrix is not the correct shape.
        """
        if self.trees is None:
            raise NotFittedError("AdaBoostClassifier.fit() must be called before predicting.")
        if not isinstance(X, torch.Tensor):
            raise TypeError("The input matrix must be a PyTorch tensor.")
        if X.ndim != 2 or X.shape[1] != self.n_features:
            raise ValueError("The input matrix must be a 2 dimensional tensor with the same number of features as the fitted tensor.")

    def _weighted_votes(self, X):
        """
        Sums the confidence weighted one-hot predictions of every tree.

        Returns:
            torch.Tensor of shape (n_samples, n_classes): The accumulated vote of each class for each sample.
        """
        votes = torch.zeros((len(X), self.n_classes))
        one_hot = torch.eye(self.n_classes)
        for alpha, tree in zip(self.confidences, self.trees):
            # Index tensors are converted to long, which every PyTorch version accepts.
            labels = tree.predict(X).long()
            votes += alpha * one_hot[labels]
        return votes

    def predict(self, X):
        """
        Applies the fitted AdaBoostClassifier model to the input data, predicting the correct classes.

        Args:
            X (torch.Tensor of shape (n_samples, n_features)): The input data to be classified.

        Returns:
            labels (torch.Tensor of shape (n_samples,)): The predicted labels corresponding to each sample.

        Raises:
            NotFittedError: If the AdaBoostClassifier model has not been fitted before predicting.
            TypeError: If the input matrix is not a PyTorch tensor.
            ValueError: If the input matrix is not the correct shape.
        """
        self._validate_predict_input(X)
        return self._weighted_votes(X).argmax(dim=1)

    def predict_proba(self, X):
        """
        Applies the fitted AdaBoostClassifier model to the input data, predicting the probabilities of each class.

        The probabilities are the softmax of the confidence weighted votes of the trees.

        Args:
            X (torch.Tensor of shape (n_samples, n_features)): The input data for which to predict probabilities.

        Returns:
            torch.Tensor: The predicted probabilities. For binary classification a tensor of shape (n_samples,) containing the probability of class 1, otherwise a tensor of shape (n_samples, n_classes).

        Raises:
            NotFittedError: If the AdaBoostClassifier model has not been fitted before predicting.
            TypeError: If the input matrix is not a PyTorch tensor.
            ValueError: If the input matrix is not the correct shape.
        """
        self._validate_predict_input(X)
        probs = torch.softmax(self._weighted_votes(X), dim=1)
        if self.n_classes == 2:
            return probs[:, 1]
        return probs