Source code for DLL.MachineLearning.UnsupervisedLearning.OutlierDetection._IsolationForest

import torch
from math import log

from ...SupervisedLearning.Trees._DecisionTree import Node
from ....Exceptions import NotFittedError


[docs] class IsolationForest: """ IsolationForest implements an algorithm to detect outliers in the data by fitting may isolation trees to the data. Args: n_trees (int, optional): The number of trees used for predictiong. Defaults to 10. Must be a positive integer. max_depth (int, optional): The maximum depth of the tree. Defaults to 10. Must be a positive integer. min_samples_split (int, optional): The minimum required samples in a leaf to make a split. Defaults to 2. Must be a positive integer. bootstrap (bool, optional): Determines if the samples for fitting are boostrapped from the given data. Must be a boolean. Defaults to False. threshold (int | float, optional): Determines how many standard deviations away from the mean score a datapoint must be to be considered an outlier. Must be a non-ngeative real number. Defaults to 4. """ def __init__(self, n_trees=10, max_depth=25, min_samples_split=2, bootstrap=False, threshold=4): if not isinstance(n_trees, int) or n_trees < 1: raise ValueError("n_trees must be a positive integer.") if not isinstance(max_depth, int) or max_depth < 1: raise ValueError("max_depth must be a positive integer.") if not isinstance(min_samples_split, int) or min_samples_split < 1: raise ValueError("min_samples_split must be a positive integer.") if not isinstance(bootstrap, bool): raise TypeError("bootstrap must be a boolean.") if not isinstance(threshold, int | float) or threshold < 0: raise ValueError("threshold must be a non-negative real number.") self.n_trees = n_trees self.max_depth = max_depth self.min_samples_split = min_samples_split self.bootstrap = bootstrap self.threshold = threshold self.trees = [IsolationTree(max_depth, min_samples_split) for _ in range(n_trees)]
[docs] def fit(self, X): """ Fits the IsolationTree model to the input data by generating a tree, which splits the data randomly. Args: X (torch.Tensor of shape (n_samples, n_features)): The input data, where each row is a sample and each column is a feature. Returns: None Raises: TypeError: If the input matrix is not a PyTorch tensor. ValueError: If the input matrix is not the correct shape. """ if not isinstance(X, torch.Tensor): raise TypeError("The input matrix must be a PyTorch tensor.") if X.ndim != 2: raise ValueError("The input matrix must be a 2 dimensional tensor.") for tree in self.trees: _X = self._bootstrap_sample(X) if self.bootstrap else X tree.fit(_X)
def _bootstrap_sample(self, x): indices = torch.randint(high=len(x), size=(len(x),)) return x[indices]
[docs] def predict(self, X, return_scores=False): """ Predicts the outliers in the input by considering scores, which are threshold standard deviations away from the mean. Args: X (torch.Tensor of shape (n_samples, n_features)): The input data, where each row is a sample and each column is a feature. return_scores (bool, optional): Determines if the scores of each datapoint are returned. Defaults to False. """ if not isinstance(X, torch.Tensor): raise TypeError("The input matrix must be a PyTorch tensor.") if X.ndim != 2: raise ValueError("The input matrix must be a 2 dimensional tensor.") if not isinstance(return_scores, bool): raise TypeError("return_scores must be a boolean.") scores = torch.zeros((self.n_trees, len(X))) for i, tree in enumerate(self.trees): tree_scores = torch.tensor([tree._path_length(point, tree.root) for point in X]) scores[i] = tree_scores scores = 2 ** -(torch.mean(scores, dim=0) / self._c(len(X))) if return_scores: return scores threshold = torch.mean(scores) + self.threshold * torch.std(scores) return scores > threshold
[docs] def fit_predict(self, X, return_scores=False): """ First fits the model to the input and then predicts, which of the inputs are outliers. Args: X (torch.Tensor of shape (n_samples, n_features)): The input data, where each row is a sample and each column is a feature. return_scores (bool, optional): Determines if the scores of each datapoint are returned. Defaults to False. """ self.fit(X) return self.predict(X, return_scores=return_scores)
def _c(self, n): return 2 * (log(n - 1) + 0.5772156649) - (2 * (n - 1) / n)
class IsolationTree: """ IsolationTree implements an algorithm to detect outliers in the data. Args: max_depth (int, optional): The maximum depth of the tree. Defaults to 25. Must be a positive integer. min_samples_split (int, optional): The minimum required samples in a leaf to make a split. Defaults to 2. Must be a positive integer. """ def __init__(self, max_depth=25, min_samples_split=2): if not isinstance(max_depth, int) or max_depth < 1: raise ValueError("max_depth must be a positive integer.") if not isinstance(min_samples_split, int) or min_samples_split < 1: raise ValueError("min_samples_split must be a positive integer.") self.max_depth = max_depth self.min_samples_split = min_samples_split self.root = None def fit(self, X): """ Fits the IsolationTree model to the input data by generating a tree, which splits the data randomly. Args: X (torch.Tensor of shape (n_samples, n_features)): The input data, where each row is a sample and each column is a feature. Returns: None Raises: TypeError: If the input matrix is not a PyTorch tensor. ValueError: If the input matrix is not the correct shape. """ if not isinstance(X, torch.Tensor): raise TypeError("The input matrix must be a PyTorch tensor.") if X.ndim != 2: raise ValueError("The input matrix must be a 2 dimensional tensor.") self.n_features = X.shape[1] self.root = self._grow_tree(X, 0) def _grow_tree(self, x, depth): self.n_samples, n_features = x.size() if depth >= self.max_depth or self.n_samples < self.min_samples_split: return Node(size=self.n_samples, depth=depth) split_threshold, split_index = self._random_split(x) if split_threshold is None: return Node(size=self.n_samples, depth=depth) left_indicies, right_indicies = self._split(x[:, split_index], split_threshold) left = self._grow_tree(x[left_indicies], depth + 1) right = self._grow_tree(x[right_indicies], depth + 1) return Node(left, right, split_threshold, split_index) def _random_split(self, x): split_index = torch.randint(x.shape[1], size=tuple()) feature_min, feature_max = torch.min(x[:, split_index]), torch.max(x[:, split_index]) if feature_min == feature_max: return None, None split_threshold = (feature_max - feature_min) * torch.rand(size=tuple()) + feature_min return split_threshold, split_index def _split(self, feature_values, threshold): left_indicies = torch.argwhere(feature_values <= threshold).flatten() right_indicies = torch.argwhere(feature_values > threshold).flatten() return left_indicies, right_indicies def _path_length(self, point, current_node): if current_node.depth is not None: return current_node.depth if point[current_node.feature_index] <= current_node.threshold: return self._path_length(point, current_node.left) return self._path_length(point, current_node.right)