import torch
from math import log
from warnings import warn
from ._DecisionTree import DecisionTree
from ....Exceptions import NotFittedError
[docs]
class AdaBoostClassifier:
"""
AdaBoostClassifier implements a classification algorithm fitting many consecutive :class:`DecisionTrees <DLL.MachineLearning.SupervisedLearning.Trees.DecisionTree>` to previously missclassified samples.
Args:
n_trees (int, optional): The number of trees used for predicting. Defaults to 10. Must be a positive integer.
max_depth (int, optional): The maximum depth of the tree. Defaults to 25. Must be a positive integer.
min_samples_split (int, optional): The minimum required samples in a leaf to make a split. Defaults to 2. Must be a positive integer.
criterion (str, optional): The information criterion used to select optimal splits. Must be one of "entropy" or "gini". Defaults to "gini".
Attributes:
n_features (int): The number of features. Available after fitting.
n_classes (int): The number of classes. 2 for binary classification. Available after fitting.
confidences (torch.tensor of shape (n_trees,)): The confidence on each tree.
"""
def __init__(self, n_trees=10, max_depth=25, min_samples_split=2, criterion="gini"):
if not isinstance(n_trees, int) or n_trees < 1:
raise ValueError("n_trees must be a positive integer.")
if not isinstance(max_depth, int) or max_depth < 1:
raise ValueError("max_depth must be a positive integer.")
if not isinstance(min_samples_split, int) or min_samples_split < 1:
raise ValueError("min_samples_split must be a positive integer.")
if criterion not in ["entropy", "gini"]:
raise ValueError('The chosen criterion must be one of "entropy" or "gini".')
self.n_trees = n_trees
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.criterion = criterion
self.trees = None
[docs]
def fit(self, X, y, verbose=True):
"""
Fits the AdaBoostClassifier model to the input data by fitting trees to the errors made by previous trees.
Args:
X (torch.Tensor of shape (n_samples, n_features)): The input data, where each row is a sample and each column is a feature.
y (torch.Tensor of shape (n_samples,)): The labels corresponding to each sample. Every element must be in [0, ..., n_classes - 1].
verbose (bool, optional): Determines if warnings are given if the training ends due to a weak learner being worse than random guessing. Defaults to True.
Returns:
The average errors after each tree.
Raises:
TypeError: If the input matrix or the label vector is not a PyTorch tensor or if the problem is binary and metrics is not a list or a tuple.
ValueError: If the input matrix or the label vector is not the correct shape or the label vector contains wrong values.
"""
if not isinstance(X, torch.Tensor) or not isinstance(y, torch.Tensor):
raise TypeError("The input matrix and the label matrix must be a PyTorch tensor.")
if X.ndim != 2:
raise ValueError("The input matrix must be a 2 dimensional tensor.")
if y.ndim != 1 or y.shape[0] != X.shape[0]:
raise ValueError("The labels must be 1 dimensional with the same number of samples as the input data")
vals = torch.unique(y).numpy()
if set(vals) != {*range(len(vals))}:
raise ValueError("y must only contain the values in [0, ..., n_classes - 1].")
if not isinstance(verbose, bool):
raise TypeError("verbose must be a boolean.")
self.n_classes = len(vals)
self.classes = vals
y = y.to(X.dtype)
self.n_features = X.shape[1]
trees = []
weights = torch.full_like(y, 1 / len(y))
self.confidences = []
errors = []
for i in range(self.n_trees):
indices = torch.multinomial(weights, len(y), replacement=True)
X_sample = X[indices]
y_sample = y[indices]
# Make sure atleast 1 datapoint is present from each class
sample_classes = torch.unique(y_sample)
if len(sample_classes) < self.n_classes:
unseen_classes = list(set(self.classes) - set(sample_classes))
X_new = []
y_new = []
for class_ in unseen_classes:
index = torch.multinomial((y == class_).float(), 1)
X_new.append(X[index].squeeze(0))
y_new.append(y[index].squeeze(0))
X_sample = torch.cat([X_sample, torch.stack(X_new)])
y_sample = torch.cat([y_sample, torch.stack(y_new)])
tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split, criterion=self.criterion)
tree.fit(X_sample, y_sample)
prediction = tree.predict(X)
incorrect = prediction != y
eps = torch.sum(incorrect * weights)
errors.append(eps)
# If better than random quessing, continue
if eps < 1 - 1 / self.n_classes:
alpha = 0.5 * (torch.log((1 - eps) / (eps + 1e-8)) + log(self.n_classes - 1))
weights = weights * torch.exp(alpha * incorrect)
weights /= weights.sum() # keep weights as a probability distribution
self.confidences.append(alpha)
else:
if verbose: warn(f"The latest weak learner is worse than random guessing. The training is stopped to reduce over fitting. Only {i} learners are used.")
self.n_trees = i
break
trees.append(tree)
self.trees = trees
return errors
[docs]
def predict(self, X):
"""
Applies the fitted AdaBoostClassifier model to the input data, predicting the correct classes.
Args:
X (torch.Tensor of shape (n_samples, n_features)): The input data to be classified.
Returns:
labels (torch.Tensor of shape (n_samples,)): The predicted labels corresponding to each sample.
Raises:
NotFittedError: If the AdaBoostClassifier model has not been fitted before predicting.
TypeError: If the input matrix is not a PyTorch tensor.
ValueError: If the input matrix is not the correct shape.
"""
if self.trees is None:
raise NotFittedError("AdaBoostClassifier.fit() must be called before predicting.")
if not isinstance(X, torch.Tensor):
raise TypeError("The input matrix must be a PyTorch tensor.")
if X.ndim != 2 or X.shape[1] != self.n_features:
raise ValueError("The input matrix must be a 2 dimensional tensor with the same number of features as the fitted tensor.")
preds = torch.zeros((len(X), self.n_classes))
for alpha, tree in zip(self.confidences, self.trees):
pred = tree.predict(X).int()
one_hot_pred = torch.eye(self.n_classes)[pred]
preds += alpha * one_hot_pred
return preds.argmax(dim=1)
[docs]
def predict_proba(self, X):
"""
Applies the fitted AdaBoostClassifier model to the input data, predicting the probabilities of each class.
Args:
X (torch.Tensor of shape (n_samples, n_features)): The input data for which to predict probabilities.
Returns:
torch.Tensor of shape (n_samples, n_classes): The predicted probabilities for each class.
Raises:
NotFittedError: If the AdaBoostClassifier model has not been fitted before predicting.
TypeError: If the input matrix is not a PyTorch tensor.
ValueError: If the input matrix is not the correct shape.
"""
if self.trees is None:
raise NotFittedError("AdaBoostClassifier.fit() must be called before predicting.")
if not isinstance(X, torch.Tensor):
raise TypeError("The input matrix must be a PyTorch tensor.")
if X.ndim != 2 or X.shape[1] != self.n_features:
raise ValueError("The input matrix must be a 2 dimensional tensor with the same number of features as the fitted tensor.")
preds = torch.zeros((len(X), self.n_classes))
for alpha, tree in zip(self.confidences, self.trees):
pred = tree.predict(X).int()
one_hot_pred = torch.eye(self.n_classes)[pred]
preds += alpha * one_hot_pred
probs = torch.exp(preds - torch.max(preds, dim=1, keepdim=True).values)
probs = probs / torch.sum(probs, dim=1, keepdim=True)
if self.n_classes == 2:
return probs[:, 1]
return probs