// File: ml.js
/**
 * @class WmbataBotML
 * @description A foundational Machine Learning module for the WmbataBot library,
 * providing implementations of core ML algorithms from scratch for classification and clustering,
 * along with essential linear algebra utilities and data preprocessing tools.
 */
class WmbataBotML {
  constructor() {
    // --- Naive Bayes Classifier Model ---
    this._naiveBayesModel = {
      classPriors: new Map(),        // P(Class)
      featureLikelihoods: new Map(), // P(Feature | Class)
      classFeatureTotals: new Map(), // Total count of features in each class
      vocabularySize: 0,             // Total number of unique features/words in the vocabulary
      classes: new Set(),            // All unique class labels
      alpha: 1                       // Laplace smoothing parameter
    };

    // --- Logistic Regression Classifier Model ---
    this._logisticRegressionModel = {
      weights: [],    // Model parameters
      bias: 0,
      classes: [],    // Unique class labels (for binary classification, [negative_class, positive_class])
      featureCount: 0
    };

    // --- Decision Tree Classifier Model ---
    this._decisionTreeModel = {
      tree: null,       // The root node of the decision tree
      featureNames: [], // Names of features, if available, for debugging/interpretation
      classes: []       // All unique class labels
    };

    // --- Random Forest Classifier Model ---
    this._randomForestModel = {
      trees: [],        // Array of individual decision trees
      featureNames: [],
      classes: []
    };

    // --- Linear SVM Model ---
    this._linearSVMModel = {
      weights: [],
      bias: 0,
      classes: [],    // [negative_class, positive_class]
      featureCount: 0
    };

    // --- KNN Classifier/Regressor Model ---
    this._knnModel = {
      X_train: [],
      y_train: [],
      classes: []     // For classification
    };

    // --- MinMaxScaler Properties ---
    this._minMaxScaler = {
      min: [],        // Minimum value for each feature
      max: [],        // Maximum value for each feature
      isFitted: false
    };

    // --- StandardScaler Properties ---
    this._standardScaler = {
      mean: [],
      stdDev: [],
      isFitted: false
    };

    // --- OneHotEncoder Properties ---
    this._oneHotEncoder = {
      categoryMaps: new Map(),   // Map of featureIndex -> Map of category -> one-hot index
      featureLengths: new Map(), // Map of featureIndex -> number of categories
      isFitted: false
    };
  }

  /**
   * =====================================================================
   * LINEAR ALGEBRA UTILITIES (Static Methods)
   * =====================================================================
   * These static methods provide fundamental vector and matrix operations,
   * useful for many machine learning algorithms.
   */

  /**
   * Calculates the dot product of two vectors.
   * @param {number[]} v1 - The first vector.
   * @param {number[]} v2 - The second vector.
   * @returns {number} The dot product.
   * @throws {Error} If vectors have different lengths.
   */
  static dotProduct(v1, v2) {
    if (v1.length !== v2.length) {
      throw new Error("Vectors must have the same length for dot product.");
    }
    let sum = 0;
    for (let i = 0; i < v1.length; i++) {
      sum += v1[i] * v2[i];
    }
    return sum;
  }

  /**
   * Calculates the magnitude (Euclidean norm) of a vector.
   * @param {number[]} v - The vector.
   * @returns {number} The magnitude of the vector.
   */
  static magnitude(v) {
    let sumSq = 0;
    for (let i = 0; i < v.length; i++) {
      sumSq += v[i] * v[i];
    }
    return Math.sqrt(sumSq);
  }

  /**
   * Normalizes a vector to a unit vector (magnitude 1).
   * @param {number[]} v - The vector to normalize.
   * @returns {number[]} The normalized vector.
   */
  static normalizeVector(v) {
    const mag = WmbataBotML.magnitude(v);
    if (mag === 0) {
      return new Array(v.length).fill(0); // Return zero vector if magnitude is zero
    }
    return v.map(component => component / mag);
  }

  /**
   * Calculates the Euclidean distance between two vectors.
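   *
   * A quick illustrative check (hedged example, not from a test suite): for the
   * points [0, 0] and [3, 4] the distance is sqrt(3*3 + 4*4) = 5.
   *
   *   WmbataBotML.euclideanDistance([0, 0], [3, 4]); // -> 5
   *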
* @param {number[]} v1 - The first vector. * @param {number[]} v2 - The second vector. * @returns {number} The Euclidean distance. * @throws {Error} If vectors have different lengths. */ static euclideanDistance(v1, v2) { if (v1.length !== v2.length) { throw new Error("Vectors must have the same length for Euclidean distance."); } let sumSqDiff = 0; for (let i = 0; i < v1.length; i++) { sumSqDiff += Math.pow(v1[i] - v2[i], 2); } return Math.sqrt(sumSqDiff); } /** * Calculates the cosine similarity between two vectors. * Cosine Similarity = (A . B) / (||A|| * ||B||) * @param {number[]} v1 - The first vector. * @param {number[]} v2 - The second vector. * @returns {number} The cosine similarity score (-1 to 1). * @throws {Error} If vectors have different lengths. */ static cosineSimilarity(v1, v2) { if (v1.length !== v2.length) { throw new Error("Vectors must have the same length for cosine similarity."); } const dot = WmbataBotML.dotProduct(v1, v2); const mag1 = WmbataBotML.magnitude(v1); const mag2 = WmbataBotML.magnitude(v2); if (mag1 === 0 || mag2 === 0) { return 0; // Or handle as an error if appropriate for your use case } return dot / (mag1 * mag2); } /** * Adds two vectors element-wise. * @param {number[]} v1 - The first vector. * @param {number[]} v2 - The second vector. * @returns {number[]} The resulting vector. * @throws {Error} If vectors have different lengths. */ static addVectors(v1, v2) { if (v1.length !== v2.length) { throw new Error("Vectors must have the same length for addition."); } return v1.map((val, i) => val + v2[i]); } /** * Subtracts the second vector from the first vector element-wise. * @param {number[]} v1 - The first vector. * @param {number[]} v2 - The second vector. * @returns {number[]} The resulting vector. * @throws {Error} If vectors have different lengths. */ static subtractVectors(v1, v2) { if (v1.length !== v2.length) { throw new Error("Vectors must have the same length for subtraction."); } return v1.map((val, i) => val - v2[i]); } /** * Multiplies a vector by a scalar. * @param {number} scalar - The scalar value. * @param {number[]} v - The vector. * @returns {number[]} The resulting vector. */ static scalarMultiplyVector(scalar, v) { if (typeof scalar !== 'number') { throw new TypeError("Scalar must be a number."); } if (!Array.isArray(v) || !v.every(n => typeof n === 'number')) { throw new TypeError("Vector must be an array of numbers."); } return v.map(val => val * scalar); } /** * Performs element-wise multiplication (Hadamard product) of two vectors. * @param {number[]} v1 - The first vector. * @param {number[]} v2 - The second vector. * @returns {number[]} The resulting vector. * @throws {Error} If vectors have different lengths. */ static hadamardProduct(v1, v2) { if (v1.length !== v2.length) { throw new Error("Vectors must have the same length for Hadamard product."); } return v1.map((val, i) => val * v2[i]); } /** * Computes the transpose of a matrix. * @param {number[][]} matrix - The input matrix. * @returns {number[][]} The transposed matrix. * @throws {Error} If input is not a valid matrix. 
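 * @example
 * // Illustrative sketch: a 2x3 matrix becomes a 3x2 matrix.
 * WmbataBotML.transposeMatrix([[1, 2, 3], [4, 5, 6]]);
 * // -> [[1, 4], [2, 5], [3, 6]]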
*/ static transposeMatrix(matrix) { if (!Array.isArray(matrix) || matrix.length === 0 || !Array.isArray(matrix[0])) { throw new Error("Input must be a non-empty matrix."); } const rows = matrix.length; const cols = matrix[0].length; const transposed = Array(cols).fill(null).map(() => Array(rows).fill(0)); for (let i = 0; i < rows; i++) { for (let j = 0; j < cols; j++) { transposed[j][i] = matrix[i][j]; } } return transposed; } /** * Multiplies two matrices. * @param {number[][]} m1 - The first matrix (rows1 x cols1). * @param {number[][]} m2 - The second matrix (cols1 x cols2). * @returns {number[][]} The resulting matrix (rows1 x cols2). * @throws {Error} If matrices cannot be multiplied (cols1 !== rows2). */ static multiplyMatrices(m1, m2) { if (!Array.isArray(m1) || !Array.isArray(m2) || m1.length === 0 || m2.length === 0 || !Array.isArray(m1[0]) || !Array.isArray(m2[0])) { throw new Error("Inputs must be non-empty matrices."); } const m1Rows = m1.length; const m1Cols = m1[0].length; const m2Rows = m2.length; const m2Cols = m2[0].length; if (m1Cols !== m2Rows) { throw new Error(`Cannot multiply matrices: inner dimensions mismatch (${m1Cols} !== ${m2Rows}).`); } const result = Array(m1Rows).fill(null).map(() => Array(m2Cols).fill(0)); for (let i = 0; i < m1Rows; i++) { for (let j = 0; j < m2Cols; j++) { let sum = 0; for (let k = 0; k < m1Cols; k++) { sum += m1[i][k] * m2[k][j]; } result[i][j] = sum; } } return result; } /** * ===================================================================== * DATA PREPROCESSING - MIN-MAX SCALER * ===================================================================== * Scales features to a given range (default 0 to 1). */ /** * Fits the MinMaxScaler to the provided data, calculating min and max values for each feature. * @param {number[][]} data - An array of feature vectors (each inner array is a sample). * @throws {Error} If data is empty or malformed. */ fitMinMaxScaler(data) { if (!Array.isArray(data) || data.length === 0 || !data.every(d => Array.isArray(d) && d.every(n => typeof n === 'number'))) { throw new TypeError("Data for MinMaxScaler must be a non-empty array of number arrays."); } const numFeatures = data[0].length; if (data.some(d => d.length !== numFeatures)) { throw new Error("All data vectors must have the same dimension for MinMaxScaler."); } const minVals = new Array(numFeatures).fill(Infinity); const maxVals = new Array(numFeatures).fill(-Infinity); for (const sample of data) { for (let i = 0; i < numFeatures; i++) { minVals[i] = Math.min(minVals[i], sample[i]); maxVals[i] = Math.max(maxVals[i], sample[i]); } } this._minMaxScaler.min = minVals; this._minMaxScaler.max = maxVals; this._minMaxScaler.isFitted = true; } /** * Transforms the data using the fitted Min-Max Scaler. * @param {number[][]} data - An array of feature vectors to transform. * @param {number} [minRange=0] - The desired minimum value of the scaled range. * @param {number} [maxRange=1] - The desired maximum value of the scaled range. * @returns {number[][]} The transformed data. * @throws {Error} If the scaler has not been fitted or data dimensions don't match. */ transformMinMaxScaler(data, minRange = 0, maxRange = 1) { if (!this._minMaxScaler.isFitted) { throw new Error("MinMaxScaler has not been fitted. 
Call fitMinMaxScaler() first."); } if (!Array.isArray(data) || data.length === 0 || !data.every(d => Array.isArray(d) && d.every(n => typeof n === 'number'))) { throw new TypeError("Data for MinMaxScaler must be a non-empty array of number arrays."); } const numFeatures = this._minMaxScaler.min.length; if (data.some(d => d.length !== numFeatures)) { throw new Error(`Data dimensions (${data[0].length}) do not match scaler's fitted dimensions (${numFeatures}).`); } return data.map(sample => { return sample.map((value, i) => { const min = this._minMaxScaler.min[i]; const max = this._minMaxScaler.max[i]; if (max - min === 0) { return (minRange + maxRange) / 2; // Return midpoint if range is zero } return minRange + (value - min) * (maxRange - minRange) / (max - min); }); }); } /** * Transforms a single sample using the fitted Min-Max Scaler. * @param {number[]} sample - A single feature vector to transform. * @param {number} [minRange=0] - The desired minimum value of the scaled range. * @param {number} [maxRange=1] - The desired maximum value of the scaled range. * @returns {number[]} The transformed sample. * @throws {Error} If the scaler has not been fitted or sample dimension doesn't match. */ transformSingleSampleMinMaxScaler(sample, minRange = 0, maxRange = 1) { if (!this._minMaxScaler.isFitted) { throw new Error("MinMaxScaler has not been fitted. Call fitMinMaxScaler() first."); } if (!Array.isArray(sample) || !sample.every(n => typeof n === 'number')) { throw new TypeError("Sample for MinMaxScaler must be an array of numbers."); } const numFeatures = this._minMaxScaler.min.length; if (sample.length !== numFeatures) { throw new Error(`Sample dimension (${sample.length}) does not match scaler's fitted dimensions (${numFeatures}).`); } return sample.map((value, i) => { const min = this._minMaxScaler.min[i]; const max = this._minMaxScaler.max[i]; if (max - min === 0) { return (minRange + maxRange) / 2; } return minRange + (value - min) * (maxRange - minRange) / (max - min); }); } /** * Resets the MinMaxScaler state. */ resetMinMaxScaler() { this._minMaxScaler = { min: [], max: [], isFitted: false }; } /** * ===================================================================== * DATA PREPROCESSING - STANDARD SCALER (Z-Score Normalization) * ===================================================================== * Scales features to have a mean of 0 and a standard deviation of 1. */ /** * Fits the StandardScaler to the provided data, calculating mean and standard deviation for each feature. * @param {number[][]} data - An array of feature vectors. * @throws {Error} If data is empty or malformed. 
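 * @example
 * // Hedged usage sketch: fit on training data, then transform new samples.
 * // With this toy data the feature means are [2, 20], so the sample below maps to [0, 0].
 * const ml = new WmbataBotML();
 * ml.fitStandardScaler([[1, 10], [2, 20], [3, 30]]);
 * ml.transformSingleSampleStandardScaler([2, 20]); // -> [0, 0]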
*/ fitStandardScaler(data) { if (!Array.isArray(data) || data.length === 0 || !data.every(d => Array.isArray(d) && d.every(n => typeof n === 'number'))) { throw new TypeError("Data for StandardScaler must be a non-empty array of number arrays."); } const numSamples = data.length; const numFeatures = data[0].length; if (data.some(d => d.length !== numFeatures)) { throw new Error("All data vectors must have the same dimension for StandardScaler."); } const means = new Array(numFeatures).fill(0); const stdDevs = new Array(numFeatures).fill(0); // Calculate means for (const sample of data) { for (let i = 0; i < numFeatures; i++) { means[i] += sample[i]; } } for (let i = 0; i < numFeatures; i++) { means[i] /= numSamples; } // Calculate standard deviations for (const sample of data) { for (let i = 0; i < numFeatures; i++) { stdDevs[i] += Math.pow(sample[i] - means[i], 2); } } for (let i = 0; i < numFeatures; i++) { stdDevs[i] = Math.sqrt(stdDevs[i] / numSamples); // Population standard deviation if (stdDevs[i] === 0) { // Avoid division by zero stdDevs[i] = 1; // Or some small epsilon, or handle as a constant feature. 1 is common. } } this._standardScaler.mean = means; this._standardScaler.stdDev = stdDevs; this._standardScaler.isFitted = true; } /** * Transforms the data using the fitted StandardScaler. * @param {number[][]} data - An array of feature vectors to transform. * @returns {number[][]} The transformed data. * @throws {Error} If the scaler has not been fitted or data dimensions don't match. */ transformStandardScaler(data) { if (!this._standardScaler.isFitted) { throw new Error("StandardScaler has not been fitted. Call fitStandardScaler() first."); } if (!Array.isArray(data) || data.length === 0 || !data.every(d => Array.isArray(d) && d.every(n => typeof n === 'number'))) { throw new TypeError("Data for StandardScaler must be a non-empty array of number arrays."); } const numFeatures = this._standardScaler.mean.length; if (data.some(d => d.length !== numFeatures)) { throw new Error(`Data dimensions (${data[0].length}) do not match scaler's fitted dimensions (${numFeatures}).`); } return data.map(sample => { return sample.map((value, i) => { return (value - this._standardScaler.mean[i]) / this._standardScaler.stdDev[i]; }); }); } /** * Transforms a single sample using the fitted StandardScaler. * @param {number[]} sample - A single feature vector to transform. * @returns {number[]} The transformed sample. * @throws {Error} If the scaler has not been fitted or sample dimension doesn't match. */ transformSingleSampleStandardScaler(sample) { if (!this._standardScaler.isFitted) { throw new Error("StandardScaler has not been fitted. Call fitStandardScaler() first."); } if (!Array.isArray(sample) || !sample.every(n => typeof n === 'number')) { throw new TypeError("Sample for StandardScaler must be an array of numbers."); } const numFeatures = this._standardScaler.mean.length; if (sample.length !== numFeatures) { throw new Error(`Sample dimension (${sample.length}) does not match scaler's fitted dimensions (${numFeatures}).`); } return sample.map((value, i) => { return (value - this._standardScaler.mean[i]) / this._standardScaler.stdDev[i]; }); } /** * Resets the StandardScaler state. 
*/ resetStandardScaler() { this._standardScaler = { mean: [], stdDev: [], isFitted: false }; } /** * ===================================================================== * DATA PREPROCESSING - ONE-HOT ENCODER * ===================================================================== * Converts categorical features into a numerical format. */ /** * Fits the OneHotEncoder to the provided data, learning unique categories for specified feature indices. * @param {Array>} data - An array of samples, where each sample is an array of feature values. * @param {number[]} [featureIndices=null] - An array of indices of categorical features to encode. If null, encodes all features. * @throws {Error} If data is empty or malformed. */ fitOneHotEncoder(data, featureIndices = null) { if (!Array.isArray(data) || data.length === 0 || !Array.isArray(data[0])) { throw new TypeError("Data for OneHotEncoder must be a non-empty array of arrays."); } const numFeatures = data[0].length; const selectedFeatures = featureIndices || Array.from({ length: numFeatures }, (_, i) => i); this._oneHotEncoder.categoryMaps.clear(); this._oneHotEncoder.featureLengths.clear(); for (const featureIndex of selectedFeatures) { const uniqueCategories = new Set(); for (const sample of data) { if (featureIndex < sample.length) { uniqueCategories.add(sample[featureIndex]); } } const categoryMap = new Map(); let oneHotIndex = 0; for (const category of uniqueCategories) { categoryMap.set(category, oneHotIndex++); } this._oneHotEncoder.categoryMaps.set(featureIndex, categoryMap); this._oneHotEncoder.featureLengths.set(featureIndex, uniqueCategories.size); } this._oneHotEncoder.isFitted = true; } /** * Transforms the data using the fitted OneHotEncoder. * Categorical features are replaced by binary (0 or 1) vectors. Non-encoded features are kept as is. * @param {Array>} data - An array of samples to transform. * @returns {number[][]} The transformed data as numerical vectors. * @throws {Error} If the encoder has not been fitted. */ transformOneHotEncoder(data) { if (!this._oneHotEncoder.isFitted) { throw new Error("OneHotEncoder has not been fitted. Call fitOneHotEncoder() first."); } if (!Array.isArray(data) || data.length === 0 || !Array.isArray(data[0])) { throw new TypeError("Data for OneHotEncoder must be a non-empty array of arrays."); } const originalNumFeatures = data[0].length; const transformedData = []; for (const sample of data) { let transformedSample = []; for (let i = 0; i < originalNumFeatures; i++) { if (this._oneHotEncoder.categoryMaps.has(i)) { // This feature is to be one-hot encoded const categoryMap = this._oneHotEncoder.categoryMaps.get(i); const numCategories = this._oneHotEncoder.featureLengths.get(i); const oneHotVector = new Array(numCategories).fill(0); const categoryValue = sample[i]; const hotIndex = categoryMap.get(categoryValue); if (hotIndex !== undefined) { oneHotVector[hotIndex] = 1; } transformedSample = transformedSample.concat(oneHotVector); } else { // This feature is not encoded, keep as is (assuming it's numerical or can be implicitly converted) transformedSample.push(sample[i]); } } transformedData.push(transformedSample); } return transformedData; } /** * Transforms a single sample using the fitted OneHotEncoder. * @param {Array} sample - A single sample (array of feature values) to transform. * @returns {number[]} The transformed sample as a numerical vector. * @throws {Error} If the encoder has not been fitted. 
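 * @example
 * // Hedged sketch: encode only the categorical column at index 0; numeric columns pass through.
 * const ml = new WmbataBotML();
 * ml.fitOneHotEncoder([['red', 1], ['blue', 2]], [0]);
 * ml.transformSingleSampleOneHotEncoder(['blue', 3]); // -> [0, 1, 3]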
*/ transformSingleSampleOneHotEncoder(sample) { if (!this._oneHotEncoder.isFitted) { throw new Error("OneHotEncoder has not been fitted. Call fitOneHotEncoder() first."); } if (!Array.isArray(sample)) { throw new TypeError("Sample for OneHotEncoder must be an array."); } const originalNumFeatures = sample.length; let transformedSample = []; for (let i = 0; i < originalNumFeatures; i++) { if (this._oneHotEncoder.categoryMaps.has(i)) { const categoryMap = this._oneHotEncoder.categoryMaps.get(i); const numCategories = this._oneHotEncoder.featureLengths.get(i); const oneHotVector = new Array(numCategories).fill(0); const categoryValue = sample[i]; const hotIndex = categoryMap.get(categoryValue); if (hotIndex !== undefined) { oneHotVector[hotIndex] = 1; } transformedSample = transformedSample.concat(oneHotVector); } else { transformedSample.push(sample[i]); } } return transformedSample; } /** * Resets the OneHotEncoder state. */ resetOneHotEncoder() { this._oneHotEncoder = { categoryMaps: new Map(), featureLengths: new Map(), isFitted: false }; } /** * ===================================================================== * NAIVE BAYES CLASSIFIER * ===================================================================== * A simple probabilistic classifier based on Bayes' theorem with naive * (strong) independence assumptions between the features. Great for text classification. */ /** * Trains the Naive Bayes classifier model. * Assumes features (X) are count-based vectors (e.g., Bag-of-Words). * @param {number[][]} X - An array of feature vectors (e.g., from WmbataBotNLP.vectorizeBoW). * @param {string[]} y - An array of corresponding class labels. * @param {number} vocabularySize - The total size of the vocabulary (number of unique features). * @param {number} [alpha=1] - Laplace smoothing parameter (prevents zero probabilities). * @throws {Error} If X and y have different lengths, or if vocabularySize is invalid. */ trainNaiveBayes(X, y, vocabularySize, alpha = 1) { if (X.length !== y.length) { throw new Error("Feature vectors (X) and labels (y) must have the same number of samples."); } if (typeof vocabularySize !== 'number' || vocabularySize <= 0) { throw new Error("Vocabulary size must be a positive number."); } const model = this._naiveBayesModel; model.classPriors.clear(); model.featureLikelihoods.clear(); model.classFeatureTotals.clear(); model.classes.clear(); model.vocabularySize = vocabularySize; model.alpha = alpha; // Populate class counts and feature counts for each class for (let i = 0; i < X.length; i++) { const features = X[i]; const label = y[i]; model.classes.add(label); model.classPriors.set(label, (model.classPriors.get(label) || 0) + 1); if (!model.featureLikelihoods.has(label)) { model.featureLikelihoods.set(label, new Map()); } if (!model.classFeatureTotals.has(label)) { model.classFeatureTotals.set(label, 0); } const currentFeatureCounts = model.featureLikelihoods.get(label); let currentClassTotal = model.classFeatureTotals.get(label); for (let j = 0; j < features.length; j++) { const featureCount = features[j]; if (featureCount > 0) { // Only count present features currentFeatureCounts.set(j, (currentFeatureCounts.get(j) || 0) + featureCount); currentClassTotal += featureCount; } } model.classFeatureTotals.set(label, currentClassTotal); } } /** * Predicts the class label for a single feature vector using the trained Naive Bayes model. * @param {number[]} features - The feature vector to classify. * @returns {string} The predicted class label. 
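 * @example
 * // Hedged end-to-end sketch with a toy three-word vocabulary (Bag-of-Words counts per document).
 * const ml = new WmbataBotML();
 * ml.trainNaiveBayes([[2, 0, 1], [0, 3, 0]], ['spam', 'ham'], 3);
 * ml.predictNaiveBayes([1, 0, 1]); // -> 'spam'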
* @throws {Error} If the model has not been trained. */ predictNaiveBayes(features) { const model = this._naiveBayesModel; if (model.classPriors.size === 0) { throw new Error("Naive Bayes model has not been trained. Call trainNaiveBayes() first."); } if (features.length !== model.vocabularySize) { console.warn(`Input feature vector length (${features.length}) does not match trained vocabulary size (${model.vocabularySize}). This might lead to incorrect predictions.`); } let bestClass = null; let maxLogProb = -Infinity; const totalSamples = Array.from(model.classPriors.values()).reduce((sum, count) => sum + count, 0); for (const label of model.classes) { const logPrior = Math.log(model.classPriors.get(label) / totalSamples); let logLikelihood = 0; const currentFeatureCounts = model.featureLikelihoods.get(label) || new Map(); const currentClassTotal = model.classFeatureTotals.get(label) || 0; for (let j = 0; j < features.length; j++) { const featureCount = features[j]; // For Naive Bayes, we usually care about presence/absence or count. // Assuming count-based features (e.g., BoW) if (featureCount > 0) { const termCountInClass = currentFeatureCounts.get(j) || 0; // Apply Laplace smoothing const probability = (termCountInClass + model.alpha) / (currentClassTotal + model.alpha * model.vocabularySize); logLikelihood += Math.log(probability); } } const totalProb = logPrior + logLikelihood; if (totalProb > maxLogProb) { maxLogProb = totalProb; bestClass = label; } } return bestClass; } /** * Saves the current Naive Bayes model state. * @returns {object} A serializable object representing the model state. */ saveNaiveBayesModel() { const model = this._naiveBayesModel; return { classPriors: Array.from(model.classPriors.entries()), featureLikelihoods: Array.from(model.featureLikelihoods.entries()).map(([cls, fMap]) => [cls, Array.from(fMap.entries())]), classFeatureTotals: Array.from(model.classFeatureTotals.entries()), vocabularySize: model.vocabularySize, classes: Array.from(model.classes), alpha: model.alpha }; } /** * Loads a Naive Bayes model state. * @param {object} state - The state object to load. * @throws {TypeError} If the state object is malformed. */ loadNaiveBayesModel(state) { if (typeof state !== 'object' || state === null || !Array.isArray(state.classPriors) || !Array.isArray(state.featureLikelihoods) || !Array.isArray(state.classFeatureTotals) || typeof state.vocabularySize !== 'number' || !Array.isArray(state.classes) || typeof state.alpha !== 'number') { throw new TypeError('Invalid Naive Bayes model state object.'); } const model = this._naiveBayesModel; model.classPriors = new Map(state.classPriors); model.featureLikelihoods = new Map(state.featureLikelihoods.map(([cls, fArray]) => [cls, new Map(fArray)])); model.classFeatureTotals = new Map(state.classFeatureTotals); model.vocabularySize = state.vocabularySize; model.classes = new Set(state.classes); model.alpha = state.alpha; } /** * ===================================================================== * K-NEAREST NEIGHBORS (KNN) CLASSIFIER / REGRESSOR * ===================================================================== * A non-parametric, instance-based learning algorithm. */ /** * Trains the KNN model by simply storing the training data. * @param {number[][]} X - An array of feature vectors. * @param {Array} y - An array of corresponding labels (string for classification, number for regression). * @throws {Error} If X and y have different lengths. 
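 * @example
 * // Hedged sketch: store the training set, then classify by majority vote of k = 3 neighbours.
 * const ml = new WmbataBotML();
 * ml.trainKNN([[0, 0], [0, 1], [5, 5], [6, 5]], ['a', 'a', 'b', 'b']);
 * ml.predictKNN([1, 0], 3); // -> 'a' (two of the three nearest neighbours are 'a')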
*/ trainKNN(X, y) { if (X.length !== y.length) { throw new Error("Feature vectors (X) and labels (y) must have the same number of samples."); } if (X.length === 0) { console.warn("No data provided to train KNN."); return; } this._knnModel.X_train = X; this._knnModel.y_train = y; this._knnModel.classes = [...new Set(y.filter(label => typeof label === 'string'))]; // Store classes if for classification } /** * Predicts the class label for a single feature vector using K-Nearest Neighbors. * @param {number[]} features - The feature vector to classify. * @param {number} k - The number of neighbors to consider. * @returns {string|null} The predicted class label (majority vote), or null if model not trained. * @throws {Error} If model not trained, k is invalid, or feature length mismatch. */ predictKNN(features, k) { const model = this._knnModel; if (model.X_train.length === 0) { throw new Error("KNN model has not been trained. Call trainKNN() first."); } if (k <= 0 || k > model.X_train.length || !Number.isInteger(k)) { throw new Error(`Invalid k (${k}). k must be a positive integer less than or equal to the number of training samples.`); } if (features.length !== model.X_train[0].length) { throw new Error(`Input feature vector length (${features.length}) does not match training data dimensions (${model.X_train[0].length}).`); } // Calculate distances to all training points const distances = []; for (let i = 0; i < model.X_train.length; i++) { const dist = WmbataBotML.euclideanDistance(features, model.X_train[i]); distances.push({ dist, label: model.y_train[i] }); } // Sort by distance and get the k nearest neighbors distances.sort((a, b) => a.dist - b.dist); const nearestNeighbors = distances.slice(0, k); // Count votes for classification const labelCounts = new Map(); for (const neighbor of nearestNeighbors) { labelCounts.set(neighbor.label, (labelCounts.get(neighbor.label) || 0) + 1); } // Find the majority class let bestLabel = null; let maxCount = -1; for (const [label, count] of labelCounts.entries()) { if (count > maxCount) { maxCount = count; bestLabel = label; } } return bestLabel; } /** * Predicts the regression value for a single feature vector using K-Nearest Neighbors. * @param {number[]} features - The feature vector to predict. * @param {number} k - The number of neighbors to consider. * @returns {number|null} The predicted numerical value (average), or null if model not trained. * @throws {Error} If model not trained, k is invalid, or feature length mismatch. */ predictKNNRegression(features, k) { const model = this._knnModel; if (model.X_train.length === 0) { throw new Error("KNN model has not been trained. Call trainKNN() first."); } if (k <= 0 || k > model.X_train.length || !Number.isInteger(k)) { throw new Error(`Invalid k (${k}). k must be a positive integer less than or equal to the number of training samples.`); } if (features.length !== model.X_train[0].length) { throw new Error(`Input feature vector length (${features.length}) does not match training data dimensions (${model.X_train[0].length}).`); } const distances = []; for (let i = 0; i < model.X_train.length; i++) { const dist = WmbataBotML.euclideanDistance(features, model.X_train[i]); distances.push({ dist, value: model.y_train[i] }); } distances.sort((a, b) => a.dist - b.dist); const nearestNeighbors = distances.slice(0, k); const sumValues = nearestNeighbors.reduce((sum, neighbor) => sum + neighbor.value, 0); return sumValues / k; } /** * Saves the current KNN model state. 
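 *
 * A minimal persistence sketch, assuming `ml` is a WmbataBotML instance with a trained
 * KNN model and that plain-JSON storage is acceptable:
 *
 *   const snapshot = JSON.stringify(ml.saveKNNModel());
 *   // ...later, possibly in another process:
 *   ml.loadKNNModel(JSON.parse(snapshot));
 *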
* @returns {object} A serializable object representing the model state. */ saveKNNModel() { const model = this._knnModel; return { X_train: model.X_train, y_train: model.y_train, classes: model.classes // For classification }; } /** * Loads a KNN model state. * @param {object} state - The state object to load. * @throws {TypeError} If the state object is malformed. */ loadKNNModel(state) { if (typeof state !== 'object' || state === null || !Array.isArray(state.X_train) || !Array.isArray(state.y_train) || !Array.isArray(state.classes)) { throw new TypeError('Invalid KNN model state object.'); } this._knnModel.X_train = state.X_train; this._knnModel.y_train = state.y_train; this._knnModel.classes = state.classes; } /** * ===================================================================== * LOGISTIC REGRESSION CLASSIFIER * ===================================================================== * A linear model for binary classification, outputting probabilities via the sigmoid function. * Trained using Gradient Descent. */ /** * The sigmoid (logistic) activation function. * @param {number} z - The input value. * @returns {number} The output of the sigmoid function (between 0 and 1). */ _sigmoid(z) { return 1 / (1 + Math.exp(-z)); } /** * Initializes weights and bias for Logistic Regression. * @param {number} numFeatures - The number of features in the input data. * @private */ _initializeLogisticRegression(numFeatures) { this._logisticRegressionModel.weights = new Array(numFeatures).fill(0); // Initialize weights to zero this._logisticRegressionModel.bias = 0; this._logisticRegressionModel.featureCount = numFeatures; } /** * Trains the Logistic Regression model using Gradient Descent. * @param {number[][]} X - An array of feature vectors. * @param {string[]} y - An array of corresponding class labels (must be binary, e.g., ['positive', 'negative']). * @param {number} learningRate - The step size for gradient descent. * @param {number} numIterations - The number of training iterations. * @throws {Error} If X and y have different lengths or labels are not binary. */ trainLogisticRegression(X, y, learningRate, numIterations) { if (X.length !== y.length) { throw new Error("Feature vectors (X) and labels (y) must have the same number of samples."); } if (X.length === 0) { console.warn("No data provided to train Logistic Regression."); return; } const numSamples = X.length; const numFeatures = X[0].length; // Determine unique classes and map them to 0 and 1 for binary classification const uniqueClasses = [...new Set(y)]; if (uniqueClasses.length !== 2) { throw new Error(`Logistic Regression requires exactly two unique class labels. 
Found: ${uniqueClasses.join(', ')}`); } this._logisticRegressionModel.classes = uniqueClasses; const classMap = new Map(); classMap.set(uniqueClasses[0], 0); // Negative class (0) classMap.set(uniqueClasses[1], 1); // Positive class (1) const y_mapped = y.map(label => classMap.get(label)); this._initializeLogisticRegression(numFeatures); let weights = this._logisticRegressionModel.weights; let bias = this._logisticRegressionModel.bias; for (let iter = 0; iter < numIterations; iter++) { let dw = new Array(numFeatures).fill(0); // Gradient for weights let db = 0; // Gradient for bias for (let i = 0; i < numSamples; i++) { const x_i = X[i]; const y_i = y_mapped[i]; // Calculate prediction (h_theta(x)) const z = WmbataBotML.dotProduct(weights, x_i) + bias; const prediction = this._sigmoid(z); // Calculate error const error = prediction - y_i; // Update gradients for (let j = 0; j < numFeatures; j++) { dw[j] += error * x_i[j]; } db += error; } // Average gradients and update weights and bias for (let j = 0; j < numFeatures; j++) { weights[j] -= learningRate * (dw[j] / numSamples); } bias -= learningRate * (db / numSamples); // Optional: Print loss for debugging (e.g., every 100 iterations) // if (iter % 100 === 0) { // const cost = this._calculateLogisticRegressionCost(X, y_mapped, weights, bias); // console.log(`Iteration ${iter}, Cost: ${cost.toFixed(4)}`); // } } this._logisticRegressionModel.weights = weights; this._logisticRegressionModel.bias = bias; } /** * Predicts the probability of a sample belonging to the positive class (mapped to 1). * @param {number[]} features - The feature vector to predict. * @returns {number} The probability (between 0 and 1) of belonging to the positive class. * @throws {Error} If the model has not been trained. */ predictProbaLogisticRegression(features) { const model = this._logisticRegressionModel; if (model.weights.length === 0) { throw new Error("Logistic Regression model has not been trained. Call trainLogisticRegression() first."); } if (features.length !== model.featureCount) { throw new Error(`Input feature vector length (${features.length}) does not match trained feature count (${model.featureCount}).`); } const z = WmbataBotML.dotProduct(model.weights, features) + model.bias; return this._sigmoid(z); } /** * Predicts the class label for a single feature vector using the trained Logistic Regression model. * @param {number[]} features - The feature vector to classify. * @param {number} [threshold=0.5] - The probability threshold for classification. * @returns {string} The predicted class label. */ predictLogisticRegression(features, threshold = 0.5) { const prob = this.predictProbaLogisticRegression(features); const classes = this._logisticRegressionModel.classes; // [negative_class, positive_class] return prob >= threshold ? classes[1] : classes[0]; } /** * Calculates the binary cross-entropy cost (loss) for Logistic Regression. * @param {number[][]} X - Feature vectors. * @param {number[]} y_mapped - Mapped binary labels (0 or 1). * @param {number[]} weights - Current weights. * @param {number} bias - Current bias. * @returns {number} The computed cost. 
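 *
 * Computed as J = -(1/m) * sum[ y*log(h(x)) + (1 - y)*log(1 - h(x)) ], where h(x) is the
 * sigmoid of (w . x + b) and a small epsilon keeps the logarithms finite.
 *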
* @private */ _calculateLogisticRegressionCost(X, y_mapped, weights, bias) { let cost = 0; const numSamples = X.length; for (let i = 0; i < numSamples; i++) { const x_i = X[i]; const y_i = y_mapped[i]; const z = WmbataBotML.dotProduct(weights, x_i) + bias; const h_x = this._sigmoid(z); // Add a small epsilon to avoid log(0) const epsilon = 1e-10; cost += y_i * Math.log(h_x + epsilon) + (1 - y_i) * Math.log(1 - h_x + epsilon); } return -cost / numSamples; } /** * Saves the current Logistic Regression model state. * @returns {object} A serializable object representing the model state. */ saveLogisticRegressionModel() { const model = this._logisticRegressionModel; return { weights: [...model.weights], bias: model.bias, classes: [...model.classes], featureCount: model.featureCount }; } /** * Loads a Logistic Regression model state. * @param {object} state - The state object to load. * @throws {TypeError} If the state object is malformed. */ loadLogisticRegressionModel(state) { if (typeof state !== 'object' || state === null || !Array.isArray(state.weights) || typeof state.bias !== 'number' || !Array.isArray(state.classes) || typeof state.featureCount !== 'number') { throw new TypeError('Invalid Logistic Regression model state object.'); } this._logisticRegressionModel.weights = [...state.weights]; this._logisticRegressionModel.bias = state.bias; this._logisticRegressionModel.classes = [...state.classes]; this._logisticRegressionModel.featureCount = state.featureCount; } /** * ===================================================================== * LINEAR SUPPORT VECTOR CLASSIFIER (SVC) * ===================================================================== * A linear model for binary classification, trained using Stochastic Gradient Descent. * Note: This is a basic implementation of a linear SVM, without kernel tricks. */ /** * Initializes weights and bias for Linear SVM. * @param {number} numFeatures - The number of features. * @private */ _initializeLinearSVM(numFeatures) { this._linearSVMModel.weights = new Array(numFeatures).fill(0); this._linearSVMModel.bias = 0; this._linearSVMModel.featureCount = numFeatures; } /** * Trains the Linear SVM model using Stochastic Gradient Descent. * @param {number[][]} X - An array of feature vectors. * @param {string[]} y - An array of corresponding class labels (must be binary, e.g., ['negative', 'positive']). * @param {number} learningRate - The step size for gradient descent. * @param {number} numIterations - The number of training iterations. * @param {number} C - Regularization parameter (trade-off between margin maximization and misclassification). * @throws {Error} If X and y have different lengths or labels are not binary. */ trainLinearSVM(X, y, learningRate, numIterations, C = 1.0) { if (X.length !== y.length) { throw new Error("Feature vectors (X) and labels (y) must have the same number of samples."); } if (X.length === 0) { console.warn("No data provided to train Linear SVM."); return; } const numSamples = X.length; const numFeatures = X[0].length; const uniqueClasses = [...new Set(y)]; if (uniqueClasses.length !== 2) { throw new Error(`Linear SVM requires exactly two unique class labels. 
Found: ${uniqueClasses.join(', ')}`); } this._linearSVMModel.classes = uniqueClasses; // Map labels to -1 and 1 for SVM (common convention) const classMap = new Map(); classMap.set(uniqueClasses[0], -1); // Negative class classMap.set(uniqueClasses[1], 1); // Positive class const y_mapped = y.map(label => classMap.get(label)); this._initializeLinearSVM(numFeatures); let weights = this._linearSVMModel.weights; let bias = this._linearSVMModel.bias; for (let iter = 0; iter < numIterations; iter++) { // Shuffle data for Stochastic Gradient Descent const indices = Array.from({ length: numSamples }, (_, i) => i); for (let i = indices.length - 1; i > 0; i--) { const j = Math.floor(Math.random() * (i + 1)); [indices[i], indices[j]] = [indices[j], indices[i]]; } for (const idx of indices) { const x_i = X[idx]; const y_i = y_mapped[idx]; // Predict margin const margin = y_i * (WmbataBotML.dotProduct(weights, x_i) + bias); // Update rule based on hinge loss if (margin >= 1) { // Correctly classified and outside the margin, only regularize weights for (let j = 0; j < numFeatures; j++) { weights[j] -= learningRate * (weights[j] / numSamples); // L2 regularization part } } else { // Misclassified or inside the margin, update weights and bias for (let j = 0; j < numFeatures; j++) { weights[j] -= learningRate * (weights[j] / numSamples - C * y_i * x_i[j]); } bias -= learningRate * (-C * y_i); } } } this._linearSVMModel.weights = weights; this._linearSVMModel.bias = bias; } /** * Predicts the class label for a single feature vector using the trained Linear SVM model. * @param {number[]} features - The feature vector to classify. * @returns {string} The predicted class label. * @throws {Error} If the model has not been trained or input feature length is incorrect. */ predictLinearSVM(features) { const model = this._linearSVMModel; if (model.weights.length === 0) { throw new Error("Linear SVM model has not been trained. Call trainLinearSVM() first."); } if (features.length !== model.featureCount) { throw new Error(`Input feature vector length (${features.length}) does not match trained feature count (${model.featureCount}).`); } const decision = WmbataBotML.dotProduct(model.weights, features) + model.bias; // Map decision to the original class labels // model.classes[0] is mapped to -1, model.classes[1] is mapped to 1 return decision >= 0 ? model.classes[1] : model.classes[0]; } /** * Saves the current Linear SVM model state. * @returns {object} A serializable object representing the model state. */ saveLinearSVMModel() { const model = this._linearSVMModel; return { weights: [...model.weights], bias: model.bias, classes: [...model.classes], featureCount: model.featureCount }; } /** * Loads a Linear SVM model state. * @param {object} state - The state object to load. * @throws {TypeError} If the state object is malformed. 
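 * @example
 * // General usage sketch for the Linear SVM (illustrative; training uses stochastic
 * // updates, so exact weights vary from run to run).
 * const ml = new WmbataBotML();
 * ml.trainLinearSVM([[0, 0], [0, 1], [4, 4], [5, 4]], ['neg', 'neg', 'pos', 'pos'], 0.01, 200);
 * ml.predictLinearSVM([5, 5]); // -> 'pos' on this separable toy data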
*/ loadLinearSVMModel(state) { if (typeof state !== 'object' || state === null || !Array.isArray(state.weights) || typeof state.bias !== 'number' || !Array.isArray(state.classes) || typeof state.featureCount !== 'number') { throw new TypeError('Invalid Linear SVM model state object.'); } this._linearSVMModel.weights = [...state.weights]; this._linearSVMModel.bias = state.bias; this._linearSVMModel.classes = [...state.classes]; this._linearSVMModel.featureCount = state.featureCount; } /** * ===================================================================== * K-MEANS CLUSTERING * ===================================================================== * An unsupervised learning algorithm used to group similar data points * into 'k' clusters. */ /** * Performs K-Means clustering on the input data. * @param {number[][]} data - An array of feature vectors to cluster. * @param {number} k - The number of clusters to form. * @param {number} [maxIterations=300] - Maximum number of iterations for the clustering algorithm. * @returns {object} An object containing `centroids` (array of cluster centroid vectors) * and `assignments` (array where index i is the cluster index for data[i]). * @throws {Error} If k is invalid or data is empty/malformed. */ kmeans(data, k, maxIterations = 300) { if (!Array.isArray(data) || data.length === 0 || !data.every(d => Array.isArray(d) && d.every(n => typeof n === 'number'))) { throw new TypeError("Data for K-Means must be an array of number arrays."); } if (typeof k !== 'number' || k <= 0 || !Number.isInteger(k) || k > data.length) { throw new Error(`Invalid k (${k}). k must be a positive integer less than or equal to the number of data points.`); } const numDimensions = data[0].length; if (data.some(d => d.length !== numDimensions)) { throw new Error("All data vectors must have the same dimension."); } let centroids = []; let assignments = new Array(data.length).fill(-1); // 1. Initialize centroids randomly (choose k data points as initial centroids) const chosenIndices = new Set(); while (chosenIndices.size < k) { const randomIndex = Math.floor(Math.random() * data.length); if (!chosenIndices.has(randomIndex)) { chosenIndices.add(randomIndex); centroids.push([...data[randomIndex]]); // Create a copy } } let iterations = 0; let changedAssignments = true; while (changedAssignments && iterations < maxIterations) { changedAssignments = false; iterations++; // 2. Assign each data point to the nearest centroid for (let i = 0; i < data.length; i++) { const point = data[i]; let minDistance = Infinity; let closestCentroidIndex = -1; for (let j = 0; j < k; j++) { const dist = WmbataBotML.euclideanDistance(point, centroids[j]); if (dist < minDistance) { minDistance = dist; closestCentroidIndex = j; } } if (assignments[i] !== closestCentroidIndex) { assignments[i] = closestCentroidIndex; changedAssignments = true; } } // 3. 
      // Recalculate centroids
      const newCentroids = Array(k).fill(null).map(() => new Array(numDimensions).fill(0));
      const clusterCounts = new Array(k).fill(0);
      for (let i = 0; i < data.length; i++) {
        const clusterIndex = assignments[i];
        if (clusterIndex !== -1) { // Ensure point was assigned
          for (let d = 0; d < numDimensions; d++) {
            newCentroids[clusterIndex][d] += data[i][d];
          }
          clusterCounts[clusterIndex]++;
        }
      }
      for (let j = 0; j < k; j++) {
        if (clusterCounts[j] > 0) {
          for (let d = 0; d < numDimensions; d++) {
            newCentroids[j][d] /= clusterCounts[j];
          }
        } else {
          // Handle empty cluster: reinitialize its centroid to a random data point.
          // This could prolong convergence if data is very sparse or k is too high.
          const randomIndex = Math.floor(Math.random() * data.length);
          newCentroids[j] = [...data[randomIndex]];
          changedAssignments = true; // Force another iteration
        }
      }
      centroids = newCentroids;
    }
    return { centroids, assignments };
  }

  /**
   * =====================================================================
   * PRINCIPAL COMPONENT ANALYSIS (PCA)
   * =====================================================================
   * A dimensionality reduction technique.
   * Note: Full, robust eigenvalue decomposition from scratch is very complex
   * and numerically sensitive. This implementation focuses on mean centering
   * and covariance matrix calculation. The `transformPCA` assumes principal
   * components (eigenvectors) are provided or can be obtained externally.
   * A simple power iteration is included to find the *first* principal component.
   */

  /**
   * Calculates the mean of each feature in the dataset.
   * @param {number[][]} data - Input data.
   * @returns {number[]} Array of means for each feature.
   * @private
   */
  _calculateFeatureMeans(data) {
    const numFeatures = data[0].length;
    const numSamples = data.length;
    const means = new Array(numFeatures).fill(0);
    for (const sample of data) {
      for (let i = 0; i < numFeatures; i++) {
        means[i] += sample[i];
      }
    }
    return means.map(mean => mean / numSamples);
  }

  /**
   * Centers the data by subtracting the mean of each feature.
   * @param {number[][]} data - Input data.
   * @param {number[]} means - Means of each feature.
   * @returns {number[][]} Centered data.
   * @private
   */
  _centerData(data, means) {
    return data.map(sample => WmbataBotML.subtractVectors(sample, means));
  }

  /**
   * Calculates the covariance matrix of the given data.
   * @param {number[][]} data - The input data, typically already mean-centered.
   * @returns {number[][]} The covariance matrix.
   * @private
   */
  _calculateCovarianceMatrix(data) {
    if (data.length === 0 || data[0].length === 0) {
      return [];
    }
    const numSamples = data.length;
    const numFeatures = data[0].length;
    const covarianceMatrix = Array(numFeatures).fill(null).map(() => Array(numFeatures).fill(0));
    for (let i = 0; i < numFeatures; i++) {
      for (let j = 0; j < numFeatures; j++) {
        let sumOfProducts = 0;
        for (let k = 0; k < numSamples; k++) {
          sumOfProducts += data[k][i] * data[k][j];
        }
        covarianceMatrix[i][j] = sumOfProducts / (numSamples - 1); // Sample covariance
      }
    }
    return covarianceMatrix;
  }

  /**
   * Finds the principal components (eigenvectors) and explained variance (eigenvalues).
   *
   * IMPORTANT NOTE: Robust and efficient calculation of eigenvalues and eigenvectors
   * for general matrices is a complex numerical problem typically handled by highly
   * optimized linear algebra libraries (e.g., LAPACK, BLAS, numeric.js in JS, or libraries
   * in Python/R/Julia). Implementing a full robust eigen-decomposition from scratch
   * in pure JS is beyond the practical scope of this library for numerical stability
   * and performance reasons.
   *
   * This method only implements a very basic Power Iteration to find the *largest*
   * eigenvalue and its corresponding eigenvector (the first principal component),
   * and indicates where more advanced methods would be needed.
   *
   * @param {number[][]} covarianceMatrix - The covariance matrix.
   * @param {number} [numComponents=1] - The number of principal components to approximate.
   * @param {number} [iterations=100] - Number of iterations for power iteration.
   * @returns {object} { eigenvalues: [], eigenvectors: [[]] } (approximate for first component)
   * @private
   */
  _approximateEigenDecomposition(covarianceMatrix, numComponents = 1, iterations = 100) {
    if (covarianceMatrix.length === 0 || numComponents === 0) {
      return { eigenvalues: [], eigenvectors: [] };
    }
    const dim = covarianceMatrix.length;
    const eigenvalues = [];
    const eigenvectors = [];
    // Simple Power Iteration for the largest eigenvalue/eigenvector.
    // This is primarily for conceptual demonstration for the first component.
    if (numComponents >= 1) {
      let b_k = new Array(dim).fill(0).map((_, i) => (i === 0 ? 1 : 0)); // Initial guess vector (first standard basis vector)
      b_k = WmbataBotML.normalizeVector(b_k);
      for (let i = 0; i < iterations; i++) {
        let b_k_plus_1 = Array(dim).fill(0);
        for (let row = 0; row < dim; row++) {
          for (let col = 0; col < dim; col++) {
            b_k_plus_1[row] += covarianceMatrix[row][col] * b_k[col];
          }
        }
        const newMag = WmbataBotML.magnitude(b_k_plus_1);
        if (newMag === 0) { break; } // Avoid division by zero
        b_k = WmbataBotML.scalarMultiplyVector(1 / newMag, b_k_plus_1);
      }
      // Approximate the largest eigenvalue via the Rayleigh quotient: lambda = b_k . (A b_k),
      // where b_k is the (unit-length) converged eigenvector estimate.
      const Ab_k = covarianceMatrix.map(row => WmbataBotML.dotProduct(row, b_k));
      const lambda = WmbataBotML.dotProduct(b_k, Ab_k);
      eigenvalues.push(lambda);
      eigenvectors.push(b_k);
    }
    if (numComponents > 1) {
      console.warn("WmbataBotML.PCA: Full eigenvalue decomposition for multiple components from scratch is not robustly implemented. Only the first principal component is approximated using Power Iteration. For more components or high accuracy, consider specialized linear algebra libraries.");
    }
    return { eigenvalues, eigenvectors };
  }

  /**
   * Fits the PCA model to the data.
   * This calculates the mean and covariance matrix. It provides a conceptual
   * means to obtain components but highlights that robust eigen-decomposition
   * is typically external.
   * @param {number[][]} data - The input data.
   * @returns {object} { means: [], covarianceMatrix: [[]], approximateEigen: {eigenvalues: [], eigenvectors: [[]]} }
   * @throws {Error} If data is empty or malformed.
   */
  fitPCA(data) {
    if (!Array.isArray(data) || data.length === 0 || !data.every(d => Array.isArray(d) && d.every(n => typeof n === 'number'))) {
      throw new TypeError("Data for PCA must be a non-empty array of number arrays.");
    }
    const numDimensions = data[0].length;
    if (data.some(d => d.length !== numDimensions)) {
      throw new Error("All data vectors must have the same dimension for PCA.");
    }
    const means = this._calculateFeatureMeans(data);
    const centeredData = this._centerData(data, means);
    const covarianceMatrix = this._calculateCovarianceMatrix(centeredData);
    // This is an approximation/conceptual. Real PCA needs robust eigen-decomposition.
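    // Hedged downstream sketch (assuming `ml` is a WmbataBotML instance): the approximate
    // first component returned below can be fed straight into transformPCA, e.g.
    //   const { means, approximateEigen } = ml.fitPCA(data);
    //   const projected = ml.transformPCA(data, approximateEigen.eigenvectors, means);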
const approximateEigen = this._approximateEigenDecomposition(covarianceMatrix, 1); return { means, covarianceMatrix, approximateEigen }; } /** * Transforms the data by projecting it onto the principal components. * Assumes you have already obtained the principal components (eigenvectors), * typically from `fitPCA` (or an external source if more robust PCA is needed). * @param {number[][]} data - The input data to transform. * @param {number[][]} principalComponents - The eigenvectors (each inner array is a component). * @param {number[]} means - The means of the original features (obtained from fitPCA). * @returns {number[][]} The transformed data in the new PCA space. * @throws {Error} If inputs are invalid or dimensions mismatch. */ transformPCA(data, principalComponents, means) { if (!Array.isArray(data) || data.length === 0 || !data.every(d => Array.isArray(d) && d.every(n => typeof n === 'number'))) { throw new TypeError("Data for PCA transform must be a non-empty array of number arrays."); } if (!Array.isArray(principalComponents) || principalComponents.length === 0 || !principalComponents.every(c => Array.isArray(c))) { throw new TypeError("Principal components must be a non-empty array of arrays (eigenvectors)."); } if (!Array.isArray(means) || means.length === 0 || !means.every(n => typeof n === 'number')) { throw new TypeError("Means must be a non-empty array of numbers."); } const numOriginalFeatures = data[0].length; const numComponents = principalComponents.length; if (principalComponents[0].length !== numOriginalFeatures) { throw new Error(`Principal components (${principalComponents[0].length}) must have the same dimension as original features (${numOriginalFeatures}).`); } if (means.length !== numOriginalFeatures) { throw new Error(`Means length (${means.length}) must match original feature dimension (${numOriginalFeatures}).`); } const centeredData = this._centerData(data, means); const transformedData = []; for (const sample of centeredData) { const transformedSample = new Array(numComponents).fill(0); for (let i = 0; i < numComponents; i++) { // Project sample onto each principal component transformedSample[i] = WmbataBotML.dotProduct(sample, principalComponents[i]); } transformedData.push(transformedSample); } return transformedData; } /** * ===================================================================== * DECISION TREE CLASSIFIER * ===================================================================== * A non-linear classification model that makes decisions based on features * in a tree-like structure. Supports numerical features with binary splits. */ /** * Calculates the Gini impurity for a set of labels. * Gini Impurity = 1 - sum( (proportion_of_class_i)^2 ) * Lower Gini impurity means a more pure (homogeneous) set. * @param {string[]} labels - An array of class labels. * @returns {number} The Gini impurity. * @private */ _calculateGiniImpurity(labels) { if (labels.length === 0) return 0; const counts = new Map(); for (const label of labels) { counts.set(label, (counts.get(label) || 0) + 1); } let impurity = 1; for (const count of counts.values()) { const prob = count / labels.length; impurity -= prob * prob; } return impurity; } /** * Finds the best feature and split point to divide the dataset, minimizing Gini impurity. * @param {number[][]} X - Feature vectors (subset for current node). * @param {string[]} y - Class labels (subset for current node). 
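 *   (Worked example: for labels ['a', 'a', 'b', 'b'] the Gini impurity is
 *   1 - (0.5^2 + 0.5^2) = 0.5; a split that isolates each class perfectly
 *   drives the weighted impurity of the children to 0, so it is preferred.)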
* @param {number[]} [featureIndices=null] - Optional subset of feature indices to consider for splitting (for Random Forest). * @returns {object} An object containing bestFeatureIndex, bestSplitValue, bestGini, leftIndices, rightIndices. * @private */ _findBestSplit(X, y, featureIndices = null) { const numSamples = X.length; if (numSamples <= 1) return null; // Cannot split const currentGini = this._calculateGiniImpurity(y); let bestGini = currentGini; let bestFeatureIndex = -1; let bestSplitValue = null; let bestLeftIndices = null; let bestRightIndices = null; const featuresToConsider = featureIndices || Array.from({ length: X[0].length }, (_, i) => i); for (const featureIdx of featuresToConsider) { const uniqueValues = [...new Set(X.map(row => row[featureIdx]))].sort((a, b) => a - b); // Sort for numerical splits for (let i = 0; i < uniqueValues.length; i++) { const splitValue = uniqueValues[i]; const leftIndices = []; const rightIndices = []; const leftLabels = []; const rightLabels = []; for (let sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) { if (X[sampleIdx][featureIdx] <= splitValue) { leftIndices.push(sampleIdx); leftLabels.push(y[sampleIdx]); } else { rightIndices.push(sampleIdx); rightLabels.push(y[sampleIdx]); } } if (leftLabels.length === 0 || rightLabels.length === 0) { continue; // No meaningful split } const giniLeft = this._calculateGiniImpurity(leftLabels); const giniRight = this._calculateGiniImpurity(rightLabels); const weightedGini = (leftLabels.length / numSamples) * giniLeft + (rightLabels.length / numSamples) * giniRight; if (weightedGini < bestGini) { bestGini = weightedGini; bestFeatureIndex = featureIdx; bestSplitValue = splitValue; bestLeftIndices = leftIndices; bestRightIndices = rightIndices; } } } if (bestFeatureIndex === -1) { return null; // No split improves purity } return { bestFeatureIndex, bestSplitValue, bestGini, bestLeftIndices, bestRightIndices }; } /** * Recursively builds the decision tree. * @param {number[][]} X_subset - Feature vectors for the current node. * @param {string[]} y_subset - Class labels for the current node. * @param {number} currentDepth - Current depth of the tree. * @param {number} maxDepth - Maximum allowed depth of the tree. * @param {number} minSamplesSplit - Minimum number of samples required to split a node. * @param {number[]} [featureIndices=null] - Subset of feature indices to consider for splitting. * @returns {object} The node object (leaf or internal node). * @private */ _buildTree(X_subset, y_subset, currentDepth, maxDepth, minSamplesSplit, featureIndices = null) { const numSamples = X_subset.length; const uniqueLabels = [...new Set(y_subset)]; // Helper to find majority class const getMajorityClass = (labels) => { const counts = new Map(); for (const label of labels) { counts.set(label, (counts.get(label) || 0) + 1); } let majorityClass = null; let maxCount = -1; for (const [label, count] of counts.entries()) { if (count > maxCount) { maxCount = count; majorityClass = label; } } return majorityClass; }; // Base cases for stopping splitting // 1. If all samples in node belong to the same class (pure node) if (uniqueLabels.length === 1) { return { type: 'leaf', prediction: uniqueLabels[0], count: numSamples }; } // 2. If max depth is reached if (currentDepth >= maxDepth) { return { type: 'leaf', prediction: getMajorityClass(y_subset), count: numSamples }; } // 3. 
  /**
   * Recursively builds the decision tree.
   * @param {number[][]} X_subset - Feature vectors for the current node.
   * @param {string[]} y_subset - Class labels for the current node.
   * @param {number} currentDepth - Current depth of the tree.
   * @param {number} maxDepth - Maximum allowed depth of the tree.
   * @param {number} minSamplesSplit - Minimum number of samples required to split a node.
   * @param {number[]} [featureIndices=null] - Subset of feature indices to consider for splitting.
   * @returns {object} The node object (leaf or internal node).
   * @private
   */
  _buildTree(X_subset, y_subset, currentDepth, maxDepth, minSamplesSplit, featureIndices = null) {
    const numSamples = X_subset.length;
    const uniqueLabels = [...new Set(y_subset)];

    // Helper to find the majority class
    const getMajorityClass = (labels) => {
      const counts = new Map();
      for (const label of labels) {
        counts.set(label, (counts.get(label) || 0) + 1);
      }
      let majorityClass = null;
      let maxCount = -1;
      for (const [label, count] of counts.entries()) {
        if (count > maxCount) {
          maxCount = count;
          majorityClass = label;
        }
      }
      return majorityClass;
    };

    // Base cases for stopping splitting
    // 1. All samples in the node belong to the same class (pure node)
    if (uniqueLabels.length === 1) {
      return { type: 'leaf', prediction: uniqueLabels[0], count: numSamples };
    }
    // 2. Maximum depth is reached
    if (currentDepth >= maxDepth) {
      return { type: 'leaf', prediction: getMajorityClass(y_subset), count: numSamples };
    }
    // 3. The number of samples is below minSamplesSplit
    if (numSamples < minSamplesSplit) {
      return { type: 'leaf', prediction: getMajorityClass(y_subset), count: numSamples };
    }

    // Find the best split
    const splitInfo = this._findBestSplit(X_subset, y_subset, featureIndices);
    if (!splitInfo) {
      // No good split found, make this a leaf node
      return { type: 'leaf', prediction: getMajorityClass(y_subset), count: numSamples };
    }

    const { bestFeatureIndex, bestSplitValue, bestLeftIndices, bestRightIndices } = splitInfo;

    // Create subsets for the child nodes
    const X_left = bestLeftIndices.map(idx => X_subset[idx]);
    const y_left = bestLeftIndices.map(idx => y_subset[idx]);
    const X_right = bestRightIndices.map(idx => X_subset[idx]);
    const y_right = bestRightIndices.map(idx => y_subset[idx]);

    // Recursively build the left and right children
    const leftChild = this._buildTree(X_left, y_left, currentDepth + 1, maxDepth, minSamplesSplit, featureIndices);
    const rightChild = this._buildTree(X_right, y_right, currentDepth + 1, maxDepth, minSamplesSplit, featureIndices);

    // Return an internal node
    return {
      type: 'internal',
      featureIndex: bestFeatureIndex,
      splitValue: bestSplitValue,
      leftChild: leftChild,
      rightChild: rightChild,
      count: numSamples
    };
  }

  /**
   * Predicts the class label for a single feature vector using a given Decision Tree node.
   * @param {object} treeNode - The current node in the decision tree.
   * @param {number[]} features - The feature vector to classify.
   * @returns {string|null} The predicted class label.
   * @private
   */
  _predictTree(treeNode, features) {
    if (treeNode.type === 'leaf') {
      return treeNode.prediction;
    }
    const featureValue = features[treeNode.featureIndex];
    if (featureValue <= treeNode.splitValue) {
      return this._predictTree(treeNode.leftChild, features);
    } else {
      return this._predictTree(treeNode.rightChild, features);
    }
  }

  /**
   * Trains the Decision Tree Classifier model.
   * @param {number[][]} X - An array of feature vectors.
   * @param {string[]} y - An array of corresponding class labels.
   * @param {object} [options={}] - Training options.
   * @param {number} [options.maxDepth=10] - Maximum depth of the tree.
   * @param {number} [options.minSamplesSplit=2] - Minimum number of samples required to split an internal node.
   * @param {string[]} [options.featureNames=[]] - Optional names for features (for debugging/visualization).
   * @throws {Error} If X and y have different lengths or data is empty.
   */
  trainDecisionTree(X, y, options = {}) {
    if (X.length !== y.length) {
      throw new Error("Feature vectors (X) and labels (y) must have the same number of samples.");
    }
    if (X.length === 0) {
      console.warn("No data provided to train Decision Tree.");
      return;
    }

    const { maxDepth = 10, minSamplesSplit = 2, featureNames = [] } = options;

    this._decisionTreeModel.classes = [...new Set(y)];
    this._decisionTreeModel.featureNames = featureNames;
    this._decisionTreeModel.tree = this._buildTree(X, y, 0, maxDepth, minSamplesSplit, null);
  }
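  /*
   * Usage sketch for the Decision Tree API (the data and labels are invented for
   * illustration; only trainDecisionTree/predictDecisionTree as defined here are assumed):
   *
   *   const ml = new WmbataBotML();
   *   const X = [[1.0], [2.0], [3.5], [4.0]];
   *   const y = ['low', 'low', 'high', 'high'];
   *   ml.trainDecisionTree(X, y, { maxDepth: 3, minSamplesSplit: 2 });
   *   ml.predictDecisionTree([3.8]); // -> 'high' (falls on the right side of the learned split at 2.0)
   */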
  /**
   * Predicts the class label for a single feature vector using the trained Decision Tree model.
   * @param {number[]} features - The feature vector to classify.
   * @returns {string} The predicted class label.
   * @throws {Error} If the model has not been trained or the input feature length is incorrect.
   */
  predictDecisionTree(features) {
    const model = this._decisionTreeModel;
    if (!model.tree) {
      throw new Error("Decision Tree model has not been trained. Call trainDecisionTree() first.");
    }
    if (model.featureNames.length > 0 && features.length !== model.featureNames.length) {
      throw new Error(`Input feature vector length (${features.length}) does not match trained feature count (${model.featureNames.length}).`);
    }
    return this._predictTree(model.tree, features);
  }

  /**
   * Saves the current Decision Tree model state.
   * @returns {object} A serializable object representing the model state.
   */
  saveDecisionTreeModel() {
    const model = this._decisionTreeModel;
    return {
      tree: model.tree,
      featureNames: [...model.featureNames],
      classes: [...model.classes]
    };
  }

  /**
   * Loads a Decision Tree model state.
   * @param {object} state - The state object to load.
   * @throws {TypeError} If the state object is malformed.
   */
  loadDecisionTreeModel(state) {
    if (typeof state !== 'object' || state === null ||
        typeof state.tree !== 'object' || state.tree === null ||
        !Array.isArray(state.featureNames) || !Array.isArray(state.classes)) {
      throw new TypeError('Invalid Decision Tree model state object.');
    }
    this._decisionTreeModel.tree = state.tree;
    this._decisionTreeModel.featureNames = [...state.featureNames];
    this._decisionTreeModel.classes = [...state.classes];
  }

  /**
   * =====================================================================
   * RANDOM FOREST CLASSIFIER
   * =====================================================================
   * An ensemble learning method for classification that constructs a multitude
   * of decision trees at training time and outputs the class that receives the
   * majority vote of the individual trees.
   */

  /**
   * Creates a bootstrap sample (sampling with replacement) of the original data.
   * @param {number[][]} X - Original feature vectors.
   * @param {string[]} y - Original labels.
   * @returns {object} { X_sample, y_sample }
   * @private
   */
  _bootstrapSample(X, y) {
    const numSamples = X.length;
    const X_sample = [];
    const y_sample = [];
    for (let i = 0; i < numSamples; i++) {
      const randomIndex = Math.floor(Math.random() * numSamples);
      X_sample.push(X[randomIndex]);
      y_sample.push(y[randomIndex]);
    }
    return { X_sample, y_sample };
  }

  /**
   * Selects a random subset of feature indices for a tree.
   * @param {number} totalFeatures - Total number of features.
   * @param {number} maxFeatures - Number of features to select; values strictly between 0 and 1 are interpreted as a fraction of totalFeatures.
   * @returns {number[]} Array of selected feature indices.
   * @private
   */
  _selectRandomFeatures(totalFeatures, maxFeatures) {
    let numFeaturesToSelect;
    if (maxFeatures > 0 && maxFeatures < 1) {
      // Interpret as a fraction of the total feature count
      numFeaturesToSelect = Math.max(1, Math.floor(totalFeatures * maxFeatures));
    } else {
      // Interpret as an absolute count, clamped to [1, totalFeatures]
      numFeaturesToSelect = Math.max(1, Math.min(totalFeatures, Math.floor(maxFeatures)));
    }

    const selected = new Set();
    while (selected.size < numFeaturesToSelect) {
      const randomIndex = Math.floor(Math.random() * totalFeatures);
      selected.add(randomIndex);
    }
    return Array.from(selected);
  }
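  /*
   * Illustrative sketch of the two sources of randomness used by the forest (the outputs
   * below are hypothetical; actual results vary because Math.random() is involved):
   *
   *   const ml = new WmbataBotML();
   *   ml._selectRandomFeatures(10, 0.5); // e.g. [7, 2, 9, 0, 4]  (fraction -> 5 of 10 features)
   *   ml._selectRandomFeatures(10, 3);   // e.g. [1, 8, 5]        (absolute count -> 3 features)
   *
   *   // Bootstrap sampling draws numSamples rows with replacement, so some rows repeat
   *   // and roughly a third are left out of any given tree's training set.
   *   ml._bootstrapSample([[1], [2], [3]], ['a', 'b', 'c']);
   *   // e.g. { X_sample: [[3], [1], [3]], y_sample: ['c', 'a', 'c'] }
   */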
  /**
   * Trains the Random Forest Classifier model.
   * @param {number[][]} X - An array of feature vectors.
   * @param {string[]} y - An array of corresponding class labels.
   * @param {object} [options={}] - Training options.
   * @param {number} [options.numTrees=10] - Number of decision trees in the forest.
   * @param {number} [options.maxDepth=10] - Maximum depth for each tree.
   * @param {number} [options.minSamplesSplit=2] - Minimum samples to split a node for each tree.
   * @param {number} [options.maxFeatures=null] - Number or fraction of features to consider at each split. If null, sqrt(num_features) is used, a common choice for classification.
   * @param {string[]} [options.featureNames=[]] - Optional names for features.
   * @throws {Error} If X and y have different lengths or data is empty.
   */
  trainRandomForest(X, y, options = {}) {
    if (X.length !== y.length) {
      throw new Error("Feature vectors (X) and labels (y) must have the same number of samples.");
    }
    if (X.length === 0) {
      console.warn("No data provided to train Random Forest.");
      return;
    }

    const {
      numTrees = 10,
      maxDepth = 10,
      minSamplesSplit = 2,
      maxFeatures = null,
      featureNames = []
    } = options;

    this._randomForestModel.trees = [];
    this._randomForestModel.classes = [...new Set(y)];
    this._randomForestModel.featureNames = featureNames;

    const totalFeatures = X[0].length;
    // Default to sqrt(num_features), a common heuristic for classification forests.
    const resolvedMaxFeatures = (maxFeatures === null || maxFeatures === undefined)
      ? Math.sqrt(totalFeatures)
      : maxFeatures;

    for (let i = 0; i < numTrees; i++) {
      const { X_sample, y_sample } = this._bootstrapSample(X, y);
      const featureIndices = this._selectRandomFeatures(totalFeatures, resolvedMaxFeatures);

      // Create a temporary ML instance to build a tree
      const tempML = new WmbataBotML();
      const tree = tempML._buildTree(X_sample, y_sample, 0, maxDepth, minSamplesSplit, featureIndices);
      this._randomForestModel.trees.push(tree);
    }
  }

  /**
   * Predicts the class label for a single feature vector using the trained Random Forest model.
   * @param {number[]} features - The feature vector to classify.
   * @returns {string|null} The predicted class label (majority vote).
   * @throws {Error} If the model has not been trained or the input feature length is incorrect.
   */
  predictRandomForest(features) {
    const model = this._randomForestModel;
    if (model.trees.length === 0) {
      throw new Error("Random Forest model has not been trained. Call trainRandomForest() first.");
    }
    if (model.featureNames.length > 0 && features.length !== model.featureNames.length) {
      throw new Error(`Input feature vector length (${features.length}) does not match trained feature count (${model.featureNames.length}).`);
    }

    const predictions = [];
    const tempML = new WmbataBotML(); // Use a temporary instance to call _predictTree
    for (const tree of model.trees) {
      predictions.push(tempML._predictTree(tree, features));
    }

    // Aggregate predictions (majority vote)
    const voteCounts = new Map();
    for (const prediction of predictions) {
      voteCounts.set(prediction, (voteCounts.get(prediction) || 0) + 1);
    }

    let bestPrediction = null;
    let maxVotes = -1;
    for (const [prediction, count] of voteCounts.entries()) {
      if (count > maxVotes) {
        maxVotes = count;
        bestPrediction = prediction;
      }
    }
    return bestPrediction;
  }

  /**
   * Saves the current Random Forest model state.
   * @returns {object} A serializable object representing the model state.
   */
  saveRandomForestModel() {
    const model = this._randomForestModel;
    return {
      trees: model.trees,
      featureNames: [...model.featureNames],
      classes: [...model.classes]
    };
  }
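  /*
   * Persistence sketch (illustrative; the storage mechanism is up to the caller and the
   * JSON round-trip shown here is just one option; the feature vector is made up):
   *
   *   const state = ml.saveRandomForestModel();
   *   const json = JSON.stringify(state);
   *
   *   const restored = new WmbataBotML();
   *   restored.loadRandomForestModel(JSON.parse(json));
   *   restored.predictRandomForest([1.2, 0.4]); // votes with the same trees as the original model
   */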
  /**
   * Loads a Random Forest model state.
   * @param {object} state - The state object to load.
   * @throws {TypeError} If the state object is malformed.
   */
  loadRandomForestModel(state) {
    if (typeof state !== 'object' || state === null ||
        !Array.isArray(state.trees) || !Array.isArray(state.featureNames) || !Array.isArray(state.classes)) {
      throw new TypeError('Invalid Random Forest model state object.');
    }
    this._randomForestModel.trees = state.trees;
    this._randomForestModel.featureNames = [...state.featureNames];
    this._randomForestModel.classes = [...state.classes];
  }

  /**
   * =====================================================================
   * MODEL EVALUATION & SELECTION
   * =====================================================================
   * Functions to evaluate the performance of classification models and tune hyperparameters.
   */

  /**
   * Performs K-Fold Cross-Validation for a given model.
   * @param {object} modelInstance - An instance of WmbataBotML.
   * @param {number[][]} X - Full feature dataset.
   * @param {Array} y - Full label dataset.
   * @param {number} kFolds - Number of folds for cross-validation.
   * @param {string} trainMethodName - Name of the training method on modelInstance (e.g., 'trainLogisticRegression').
   * @param {string} predictMethodName - Name of the prediction method on modelInstance (e.g., 'predictLogisticRegression').
   * @param {string} evaluationMetricName - Name of the static evaluation metric method on WmbataBotML (e.g., 'calculateClassificationMetrics', 'rocAucScore').
   * @param {object} [trainOptions={}] - Options to pass to the training method.
   * @returns {Array