namespace VisualMath.Accord.Statistics
{
    using System;
    using System.Collections.Generic;
    using VisualMath.Accord.Math;

    /// <summary>
    ///   Set of statistics functions.
    /// </summary>
    /// <remarks>
    ///   This class represents a collection of functions used in statistics.
    ///   Every matrix function assumes data is organized in a table-like
    ///   model, where columns represent variables and rows represent
    ///   observations of each variable.
    /// </remarks>
    public static class Tools
    {

        #region Arrays

        /// <summary>Computes the mean of the given values.</summary>
        /// <param name="values">A double array containing the vector members.</param>
        /// <returns>The sum of the elements divided by the number of elements.</returns>
        public static double Mean(this double[] values)
        {
            double total = 0.0;
            foreach (double value in values)
            {
                total += value;
            }

            double count = values.Length;
            return total / count;
        }

        /// <summary>Computes the weighted mean of the given values.</summary>
        /// <param name="values">A double array containing the vector members.</param>
        /// <param name="weights">A unit vector containing the importance of each sample
        ///   in <paramref name="values"/>. The elements of this array should add up to 1.</param>
        /// <returns>The sum of every value multiplied by its weight. The sum is returned
        ///   as-is; it is not divided by the total weight.</returns>
        public static double Mean(this double[] values, double[] weights)
        {
            double weightedSum = 0.0;
            int length = values.Length;
            for (int k = 0; k < length; k++)
            {
                weightedSum += values[k] * weights[k];
            }
            return weightedSum;
        }

        /// <summary>Computes the standard deviation of the given values.</summary>
        /// <param name="values">A double array containing the vector members.</param>
        /// <returns>The standard deviation of the given data, computed around the mean of
        ///   <paramref name="values"/>.</returns>
        public static double StandardDeviation(this double[] values)
        {
            double mean = Mean(values);
            return StandardDeviation(values, mean);
        }

        /// <summary>Computes the standard deviation of the given values.</summary>
        /// <param name="values">A double array containing the vector members.</param>
        /// <param name="mean">The mean of the vector, if already known.</param>
        /// <returns>The square root of the variance of <paramref name="values"/> around
        ///   <paramref name="mean"/>.</returns>
        public static double StandardDeviation(this double[] values, double mean)
        {
            double variance = Variance(values, mean);
            return System.Math.Sqrt(variance);
        }

        /// <summary>Computes the Standard Deviation of the given values.</summary>
        /// <param name="values">A double array containing the vector members.</param>
        /// <param name="mean">The mean of the vector, if already known.</param>
        /// An unit vector containing the importance of each sample
        /// in the values array.
/// The sum of this array elements should add up to 1.
/// <returns>The standard deviation of the given data.</returns>
public static double StandardDeviation(this double[] values, double mean, double[] weights)
{
    return System.Math.Sqrt(Variance(values, mean, weights));
}

/// <summary>
///   Computes the Standard Error for a sample size, which estimates the
///   standard deviation of the sample mean based on the population mean.
/// </summary>
/// <param name="samples">The sample size.</param>
/// <param name="standardDeviation">The sample standard deviation.</param>
/// <returns>The standard deviation divided by the square root of the sample size.</returns>
public static double StandardError(int samples, double standardDeviation)
{
    return standardDeviation / System.Math.Sqrt(samples);
}

/// <summary>
///   Computes the Standard Error for a sample, which estimates the
///   standard deviation of the sample mean based on the population mean.
/// </summary>
/// <param name="values">A double array containing the samples.</param>
/// <returns>The standard error for the sample.</returns>
public static double StandardError(double[] values)
{
    return StandardError(values.Length, StandardDeviation(values));
}

/// <summary>Computes the Median of the given values.</summary>
/// <param name="values">A double array containing the vector members. It is not modified.</param>
/// <returns>The median of the given data, or <see cref="Double.NaN"/> for an empty array.</returns>
public static double Median(double[] values)
{
    return Median(values, false);
}

/// <summary>Computes the Median of the given values.</summary>
/// <param name="values">A double array containing the vector members. It is not modified.</param>
/// <param name="alreadySorted">A boolean parameter informing if the given values have
///   already been sorted. When true the array is read directly, without copying or sorting.</param>
/// <returns>
///   The middle element for an odd number of values, the average of the two middle
///   elements for an even number of values, or <see cref="Double.NaN"/> for an empty array.
/// </returns>
public static double Median(double[] values, bool alreadySorted)
{
    int count = values.Length;

    // An empty vector has no middle element; report NaN (as Mean does for empty
    // input) instead of indexing data[-1].
    if (count == 0)
    {
        return Double.NaN;
    }

    double[] data = values;
    if (!alreadySorted)
    {
        // Sort a private copy so the caller's array keeps its original order.
        data = new double[count];
        values.CopyTo(data, 0);
        Array.Sort(data);
    }

    int middle = count / 2;
    if (count % 2 == 0)
    {
        return (data[middle] + data[middle - 1]) * 0.5; // even number of elements
    }

    return data[middle]; // odd number of elements
}

/// <summary>Computes the Variance of the given values.</summary>
/// <param name="values">A double precision number array containing the vector members.</param>
/// <returns>The variance of the given data.</returns>
public static double Variance(double[] values)
{
    double mean = Mean(values);
    return Variance(values, mean);
}

/// <summary>Computes the sample Variance of the given values.</summary>
/// <param name="values">A number array containing the vector members.</param>
/// <param name="mean">The mean of the array, if already known.</param>
/// <returns>The sum of squared deviations from <paramref name="mean"/> divided by
///   the number of elements minus one.</returns>
public static double Variance(double[] values, double mean)
{
    double squaredDeviations = 0.0;
    foreach (double value in values)
    {
        double deviation = value - mean;
        squaredDeviations += deviation * deviation;
    }

    // Sample variance: normalize by (N - 1)
    double count = values.Length;
    return squaredDeviations / (count - 1);
}

/// <summary>Computes the weighted Variance of the given values.</summary>
/// <param name="values">A number array containing the vector members.</param>
/// <param name="mean">The mean of the array, if already known.</param>
/// <param name="weights">A unit vector containing the importance of each sample
///   in <paramref name="values"/>. The elements of this array should add up to 1.</param>
/// <returns>The weighted sum of squared deviations divided by one minus the sum of
///   the squared weights.</returns>
public static double Variance(double[] values, double mean, double[] weights)
{
    // http://en.wikipedia.org/wiki/Weighted_variance#Weighted_sample_variance
    // http://www.gnu.org/software/gsl/manual/html_node/Weighted-Samples.html

    double weightedSquares = 0.0;
    double squaredWeights = 0.0;

    for (int k = 0; k < values.Length; k++)
    {
        double deviation = values[k] - mean;
        double weight = weights[k];

        weightedSquares += deviation * deviation * weight;
        squaredWeights += weight * weight;
    }

    return weightedSquares / (1 - squaredWeights);
}

/// <summary>Computes the Mode of the given values.</summary>
/// <param name="values">A number array containing the vector values.</param>
/// <returns>The most frequent value in the given data. When several values share the
///   highest count, the one that appears first in <paramref name="values"/> is returned.</returns>
public static double Mode(double[] values)
{
    int length = values.Length;
    double[] distinct = new double[length];   // distinct values, in order of first appearance
    int[] occurrences = new int[length];      // occurrences[k] counts distinct[k]
    int distinctCount = 0;

    foreach (double value in values)
    {
        // Search only the part of the table that has been filled so far.
        int position = Array.IndexOf(distinct, value, 0, distinctCount);
        if (position < 0)
        {
            distinct[distinctCount] = value;
            occurrences[distinctCount] = 1;
            distinctCount++;
        }
        else
        {
            occurrences[position]++;
        }
    }

    // The strict comparison keeps the earliest value on ties.
    int best = 0;
    int bestCount = 0;
    for (int k = 0; k < distinctCount; k++)
    {
        if (occurrences[k] > bestCount)
        {
            bestCount = occurrences[k];
            best = k;
        }
    }

    return distinct[best];
}

/// <summary>Computes the Covariance between two values arrays.</summary>
/// <param name="u">A number array containing the first vector members.</param>
/// <param name="v">A number array containing the second vector members.</param>
/// <returns>
///   The 2x2 sample covariance matrix of the two vectors: the diagonal holds the
///   variances of u and v, the off-diagonal elements hold their covariance.
/// </returns>
/// <exception cref="ArgumentException">The two vectors have different lengths.</exception>
public static double[,] Covariance(double[] u, double[] v)
{
    if (u.Length != v.Length)
    {
        throw new ArgumentException(
            "Both vectors must have the same number of elements.", "v");
    }

    // Each vector becomes one row (one variable); observations run along the columns,
    // which is why the mean and the scatter are both taken along dimension 1.
    double[][] vectors = new double[][] { u, v };

    // Sample covariance divides by the number of observations minus one, matching
    // Covariance(double[][], int). The divisor used to be vectors.Length, which is
    // always 2 no matter how many observations were supplied.
    return Scatter(vectors, Mean(vectors, 1), u.Length - 1, 1);
}

/// <summary>
///   Computes the Skewness for the given values.
/// </summary>
/// <remarks>
///   Skewness characterizes the degree of asymmetry of a distribution
///   around its mean. Positive skewness indicates a distribution with
///   an asymmetric tail extending towards more positive values. Negative
///   skewness indicates a distribution with an asymmetric tail extending
///   towards more negative values.
/// </remarks>
/// <param name="values">A number array containing the vector values.</param>
/// <returns>The skewness of the given data.</returns>
public static double Skewness(double[] values)
{
    double mean = Mean(values);
    return Skewness(values, mean, StandardDeviation(values, mean));
}

/// <summary>
///   Computes the Skewness for the given values.
/// </summary>
/// <remarks>
///   Skewness characterizes the degree of asymmetry of a distribution
///   around its mean. Positive skewness indicates a distribution with
///   an asymmetric tail extending towards more positive values. Negative
///   skewness indicates a distribution with an asymmetric tail extending
///   towards more negative values.
/// </remarks>
/// <param name="values">A number array containing the vector values.</param>
/// <param name="mean">The values' mean, if already known.</param>
/// <param name="standardDeviation">The values' standard deviation, if already known.</param>
/// <returns>The sum of cubed deviations from the mean, divided by the number of values
///   times the cubed standard deviation.</returns>
public static double Skewness(double[] values, double mean, double standardDeviation)
{
    int n = values.Length;

    double sum = 0.0;
    for (int i = 0; i < n; i++)
    {
        // Sum of third moment deviations
        sum += System.Math.Pow(values[i] - mean, 3);
    }

    return sum / ((double)n * System.Math.Pow(standardDeviation, 3));
}

/// <summary>
///   Computes the Kurtosis for the given values.
/// </summary>
/// <param name="values">A number array containing the vector values.</param>
/// <returns>The kurtosis of the given data.</returns>
public static double Kurtosis(double[] values)
{
    double mean = Mean(values);
    double deviation = StandardDeviation(values, mean);
    return Kurtosis(values, mean, deviation);
}

/// <summary>
///   Computes the Kurtosis for the given values.
/// </summary>
/// <param name="values">A number array containing the vector values.</param>
/// <param name="mean">The values' mean, if already known.</param>
/// <param name="standardDeviation">The values' standard deviation, if already known.</param>
/// <returns>The sum of fourth-power deviations from the mean, divided by the number of
///   values times the fourth power of the standard deviation, minus 3.</returns>
public static double Kurtosis(double[] values, double mean, double standardDeviation)
{
    // Sum of fourth moment deviations
    double fourthMoment = 0.0;
    foreach (double value in values)
    {
        fourthMoment += System.Math.Pow(value - mean, 4);
    }

    int count = values.Length;
    return fourthMoment / ((double)count * System.Math.Pow(standardDeviation, 4)) - 3.0;
}

#endregion

// ------------------------------------------------------------

#region Matrix

/// <summary>Calculates the matrix Mean vector.</summary>
/// <param name="matrix">A matrix whose means will be calculated.</param>
/// <returns>Returns a row vector containing the column means of the given matrix.</returns>
/// <example>
///   <code>
///   double[,] matrix =
///   {
///      { 2, -1.0, 5 },
///      { 7,  0.5, 9 },
///   };
///
///   // column means are equal to (4.5, -0.25, 7.0)
///   double[] means = Accord.Statistics.Tools.Mean(matrix);
///   </code>
/// </example>
public static double[] Mean(double[,] matrix)
{
    // Dimension 0 selects the column means.
    const int columnMeans = 0;
    return Mean(matrix, columnMeans);
}

/// <summary>Calculates the matrix Mean vector.</summary>
/// <param name="matrix">A matrix whose means will be calculated.</param>
/// <param name="dimension">
///   The dimension along which the means will be calculated. Pass
///   0 to compute a row vector containing the mean of each column,
///   or 1 to compute a column vector containing the mean of each row.
///   Default value is 0.
/// </param>
/// <returns>Returns a vector containing the means of the given matrix.</returns>
/// <example>
///   <code>
///   double[,] matrix =
///   {
///      { 2, -1.0, 5 },
///      { 7,  0.5, 9 },
///   };
///
///   // column means are equal to (4.5, -0.25, 7.0)
///   double[] colMeans = Accord.Statistics.Tools.Mean(matrix, 0);
///
///   // row means are equal to (2.0, 5.5)
///   double[] rowMeans = Accord.Statistics.Tools.Mean(matrix, 1);
///   </code>
/// </example>
/// <exception cref="ArgumentException">The dimension is neither 0 nor 1.</exception>
public static double[] Mean(double[,] matrix, int dimension)
{
    if (dimension != 0 && dimension != 1)
    {
        throw new ArgumentException("Invalid dimension.", "dimension");
    }

    int rowCount = matrix.GetLength(0);
    int colCount = matrix.GetLength(1);
    double[] mean;

    if (dimension == 0)
    {
        // One mean per column: add up every row of the column, then divide by the row count.
        mean = new double[colCount];
        double divisor = rowCount;

        for (int col = 0; col < colCount; col++)
        {
            double total = 0.0;
            for (int row = 0; row < rowCount; row++)
            {
                total += matrix[row, col];
            }
            mean[col] = total / divisor;
        }
    }
    else
    {
        // One mean per row: add up every column of the row, then divide by the column count.
        mean = new double[rowCount];
        double divisor = colCount;

        for (int row = 0; row < rowCount; row++)
        {
            double total = 0.0;
            for (int col = 0; col < colCount; col++)
            {
                total += matrix[row, col];
            }
            mean[row] = total / divisor;
        }
    }

    return mean;
}

/// <summary>Calculates the matrix Mean vector.</summary>
/// <param name="matrix">A matrix whose means will be calculated.</param>
/// <returns>Returns a row vector containing the column means of the given matrix.</returns>
/// <example>
///   <code>
///   double[][] matrix =
///   {
///       new double[] { 2, -1.0, 5 },
///       new double[] { 7,  0.5, 9 },
///   };
///
///   // column means are equal to (4.5, -0.25, 7.0)
///   double[] means = Accord.Statistics.Tools.Mean(matrix);
///   </code>
/// </example>
public static double[] Mean(double[][] matrix)
{
    // Dimension 0 selects the column means.
    const int columnMeans = 0;
    return Mean(matrix, columnMeans);
}

/// <summary>Calculates the matrix Mean vector.</summary>
/// <param name="matrix">A matrix whose means will be calculated.</param>
/// <param name="dimension">
///   The dimension along which the means will be calculated. Pass
///   0 to compute a row vector containing the mean of each column,
///   or 1 to compute a column vector containing the mean of each row.
///   Default value is 0.
/// </param>
/// <returns>Returns a vector containing the means of the given matrix.</returns>
/// <example>
///   <code>
///   double[][] matrix =
///   {
///       new double[] { 2, -1.0, 5 },
///       new double[] { 7,  0.5, 9 },
///   };
///
///   // column means are equal to (4.5, -0.25, 7.0)
///   double[] colMeans = Accord.Statistics.Tools.Mean(matrix, 0);
///
///   // row means are equal to (2.0, 5.5)
///   double[] rowMeans = Accord.Statistics.Tools.Mean(matrix, 1);
///   </code>
/// </example>
/// <exception cref="ArgumentException">The matrix is not empty and the dimension is
///   neither 0 nor 1.</exception>
public static double[] Mean(double[][] matrix, int dimension)
{
    int rowCount = matrix.Length;

    // An empty matrix yields an empty vector, before the dimension is even looked at.
    if (rowCount == 0)
    {
        return new double[0];
    }

    // The first row decides how many columns are read from every row.
    int colCount = matrix[0].Length;

    switch (dimension)
    {
        case 0:
        {
            // Mean of each column
            double[] columnMeans = new double[colCount];
            double divisor = rowCount;

            for (int col = 0; col < colCount; col++)
            {
                double total = 0.0;
                for (int row = 0; row < rowCount; row++)
                {
                    total += matrix[row][col];
                }
                columnMeans[col] = total / divisor;
            }

            return columnMeans;
        }

        case 1:
        {
            // Mean of each row
            double[] rowMeans = new double[rowCount];
            double divisor = colCount;

            for (int row = 0; row < rowCount; row++)
            {
                double total = 0.0;
                for (int col = 0; col < colCount; col++)
                {
                    total += matrix[row][col];
                }
                rowMeans[row] = total / divisor;
            }

            return rowMeans;
        }

        default:
            throw new ArgumentException("Invalid dimension.", "dimension");
    }
}

/// <summary>Calculates the weighted matrix Mean vector.</summary>
/// <param name="matrix">A matrix whose means will be calculated.</param>
/// <param name="weights">A vector containing the importance of each sample in the matrix.</param>
/// <returns>Returns a vector containing the means of the given matrix.</returns>
public static double[] Mean(double[][] matrix, double[] weights)
{
    // Dimension 0 selects the column means.
    const int columnMeans = 0;
    return Mean(matrix, weights, columnMeans);
}

/// <summary>Calculates the weighted matrix Mean vector.</summary>
/// <param name="matrix">A matrix whose means will be calculated.</param>
/// <param name="weights">A vector containing the importance of each sample in the matrix.</param>
/// <param name="dimension">
///   The dimension along which the means will be calculated. Pass
///   0 to compute a row vector containing the mean of each column,
///   or 1 to compute a column vector containing the mean of each row.
///   Default value is 0.
/// </param>
/// <returns>Returns a vector containing the means of the given matrix.</returns>
/// <remarks>
///   Each weight multiplies one sample. For dimension 0 the samples are the rows, so
///   weights[i] scales row i. For dimension 1 the samples are the columns, so weights[i]
///   scales column i. The weighted sums are returned as-is; they are not divided by the
///   sum of the weights. An empty matrix yields an empty vector.
/// </remarks>
/// <exception cref="ArgumentException">The matrix is not empty and the dimension is
///   neither 0 nor 1.</exception>
public static double[] Mean(double[][] matrix, double[] weights, int dimension)
{
    int rows = matrix.Length;
    if (rows == 0)
    {
        return new double[0];
    }

    // The first row decides how many columns are read from every row.
    int cols = matrix[0].Length;

    if (dimension == 0)
    {
        double[] mean = new double[cols];

        // for each row (sample)
        for (int i = 0; i < rows; i++)
        {
            double[] row = matrix[i];
            double w = weights[i];

            // for each column (variable)
            for (int j = 0; j < cols; j++)
            {
                mean[j] += row[j] * w;
            }
        }

        return mean;
    }
    else if (dimension == 1)
    {
        double[] mean = new double[rows];

        // for each row (variable)
        for (int j = 0; j < rows; j++)
        {
            double[] row = matrix[j];

            // for each column (sample): the weight follows the column index, the same
            // way the weighted Scatter indexes its weights for dimension 1. Indexing
            // by row would only rescale the plain row sum by a single weight.
            for (int i = 0; i < cols; i++)
            {
                mean[j] += row[i] * weights[i];
            }
        }

        return mean;
    }
    else
    {
        throw new ArgumentException("Invalid dimension.", "dimension");
    }
}

/// <summary>Calculates the matrix Mean vector from already calculated column sums.</summary>
/// <param name="matrix">A matrix whose means will be calculated. Only its dimensions are read.</param>
/// <param name="sums">The sum vector containing already calculated sums for each column of the matrix.</param>
/// <returns>Returns a vector containing each column sum divided by the number of rows.</returns>
public static double[] Mean(double[,] matrix, double[] sums)
{
    int cols = matrix.GetLength(1);
    double N = matrix.GetLength(0);

    double[] mean = new double[cols];
    for (int j = 0; j < cols; j++)
    {
        mean[j] = sums[j] / N;
    }

    return mean;
}

/// <summary>Calculates the matrix Standard Deviations vector.</summary>
/// <param name="matrix">A matrix whose deviations will be calculated.</param>
/// <returns>Returns a vector containing the standard deviations of the given matrix.</returns>
public static double[] StandardDeviation(double[,] matrix)
{
    return StandardDeviation(matrix, Mean(matrix));
}

/// <summary>Calculates the matrix Standard Deviations vector.</summary>
/// <param name="matrix">A matrix whose deviations will be calculated.</param>
/// <param name="means">The mean vector containing already calculated means for each column of the matrix.</param>
/// <returns>Returns a vector containing the standard deviations of the given matrix.</returns>
public static double[] StandardDeviation(this double[,] matrix, double[] means)
{
    // Matrix.Sqrt is declared elsewhere; presumably an element-wise square root.
    return Matrix.Sqrt(Variance(matrix, means));
}

/// <summary>Calculates the matrix Standard Deviations vector.</summary>
/// <param name="matrix">A matrix whose deviations will be calculated.</param>
/// <param name="means">The mean vector containing already calculated means for each column of the matrix.</param>
/// <returns>Returns a vector containing the standard deviations of the given matrix.</returns>
public static double[] StandardDeviation(this double[][] matrix, double[] means)
{
    // Matrix.Sqrt is declared elsewhere; presumably an element-wise square root.
    double[] variances = Variance(matrix, means);
    return Matrix.Sqrt(variances);
}

/// <summary>Calculates the matrix Standard Deviations vector.</summary>
/// <param name="matrix">A matrix whose deviations will be calculated.</param>
/// <returns>Returns a vector containing the standard deviations of the given matrix.</returns>
public static double[] StandardDeviation(this double[][] matrix)
{
    double[] means = Mean(matrix);
    return StandardDeviation(matrix, means);
}

/// <summary>Centers an observation, subtracting the empirical mean from the variable.</summary>
/// <param name="observation">The values to center. The array is modified in place.</param>
public static void Center(double[] observation)
{
    double mean = Mean(observation);
    Center(observation, mean);
}

/// <summary>Centers an observation, subtracting the given mean from the variable.</summary>
/// <param name="observation">The values to center. The array is modified in place.</param>
/// <param name="mean">The value subtracted from every element.</param>
public static void Center(double[] observation, double mean)
{
    int length = observation.Length;
    for (int k = 0; k < length; k++)
    {
        observation[k] = observation[k] - mean;
    }
}

/// <summary>Calculates the matrix Variance vector.</summary>
/// <param name="matrix">A matrix whose variances will be calculated.</param>
/// <returns>Returns a vector containing the variances of the given matrix.</returns>
public static double[] Variance(this double[,] matrix)
{
    double[] means = Mean(matrix);
    return Variance(matrix, means);
}

/// <summary>Calculates the matrix Variance vector.</summary>
/// <param name="matrix">A matrix whose variances will be calculated.</param>
/// <param name="means">The mean vector containing already calculated means for each column of the matrix.</param>
/// <returns>Returns a vector containing the variances of the given matrix.</returns>
public static double[] Variance(this double[,] matrix, double[] means)
{
    int rowCount = matrix.GetLength(0);
    int colCount = matrix.GetLength(1);
    double n = rowCount;

    double[] variance = new double[colCount];

    // One variance per column (variable)
    for (int col = 0; col < colCount; col++)
    {
        double deviationSum = 0.0;
        double squaredSum = 0.0;

        // Visit every row (observation of the variable)
        for (int row = 0; row < rowCount; row++)
        {
            double deviation = matrix[row, col] - means[col];
            deviationSum += deviation;
            squaredSum += deviation * deviation;
        }

        // Sample variance. The (deviationSum^2 / n) term removes whatever offset the
        // supplied mean has from the true column mean.
        variance[col] = (squaredSum - ((deviationSum * deviationSum) / n)) / (n - 1);
    }

    return variance;
}

/// <summary>Calculates the matrix Variance vector.</summary>
/// <param name="matrix">A matrix whose variances will be calculated.</param>
/// <returns>Returns a vector containing the variances of the given matrix.</returns>
public static double[] Variance(this double[][] matrix)
{
    double[] means = Mean(matrix);
    return Variance(matrix, means);
}

/// <summary>Calculates the matrix Variance vector.</summary>
/// <param name="matrix">A matrix whose variances will be calculated.</param>
/// <param name="means">The mean vector containing already calculated means for each column of the matrix.</param>
/// <returns>Returns a vector containing the sample variance of each column, or an empty
///   vector when the matrix has no rows.</returns>
public static double[] Variance(this double[][] matrix, double[] means)
{
    int rowCount = matrix.Length;
    if (rowCount == 0)
    {
        return new double[0];
    }

    // The first row decides how many columns are read from every row.
    int colCount = matrix[0].Length;
    double n = rowCount;

    double[] variance = new double[colCount];

    // One variance per column (variable)
    for (int col = 0; col < colCount; col++)
    {
        double deviationSum = 0.0;
        double squaredSum = 0.0;

        // Visit every row (observation of the variable)
        for (int row = 0; row < rowCount; row++)
        {
            double deviation = matrix[row][col] - means[col];
            deviationSum += deviation;
            squaredSum += deviation * deviation;
        }

        // Sample variance. The (deviationSum^2 / n) term removes whatever offset the
        // supplied mean has from the true column mean.
        variance[col] = (squaredSum - ((deviationSum * deviationSum) / n)) / (n - 1);
    }

    return variance;
}

/// <summary>Calculates the matrix Medians vector.</summary>
/// <param name="matrix">A matrix whose medians will be calculated.</param>
/// <returns>Returns a vector containing the medians of the given matrix.</returns>
public static double[] Median(double[,] matrix)
{
    int rows = matrix.GetLength(0);
    int cols = matrix.GetLength(1);
    double[] medians = new double[cols];

    // Scratch buffer reused for every column; it is completely overwritten each time,
    // so the input matrix is never reordered.
    double[] column = new double[rows];

    for (int j = 0; j < cols; j++)
    {
        for (int i = 0; i < rows; i++)
        {
            column[i] = matrix[i, j];
        }

        // With zero rows the helper yields NaN instead of indexing out of range.
        medians[j] = SortAndTakeMedian(column);
    }

    return medians;
}

/// <summary>Calculates the matrix Medians vector.</summary>
/// <param name="matrix">A matrix whose medians will be calculated. It is not modified.</param>
/// <returns>
///   Returns a vector containing the median of each column, or an empty vector when the
///   matrix has no rows (the same convention the jagged Mean and Variance overloads use).
/// </returns>
public static double[] Median(double[][] matrix)
{
    int rows = matrix.Length;
    if (rows == 0)
    {
        return new double[0];
    }

    // The first row decides how many columns are read from every row.
    int cols = matrix[0].Length;
    double[] medians = new double[cols];

    // Scratch buffer reused for every column; completely overwritten each time.
    double[] column = new double[rows];

    for (int j = 0; j < cols; j++)
    {
        for (int i = 0; i < rows; i++)
        {
            column[i] = matrix[i][j];
        }

        medians[j] = SortAndTakeMedian(column);
    }

    return medians;
}

/// <summary>Sorts the buffer in place and returns its median.</summary>
/// <param name="data">A scratch buffer owned by the caller; its order is destroyed.</param>
/// <returns>The middle element for an odd length, the average of the two middle elements
///   for an even length, or <see cref="Double.NaN"/> when the buffer is empty.</returns>
private static double SortAndTakeMedian(double[] data)
{
    int n = data.Length;
    if (n == 0)
    {
        return Double.NaN;
    }

    Array.Sort(data);

    int middle = n / 2;
    if (n % 2 == 0)
    {
        return (data[middle] + data[middle - 1]) * 0.5; // even number of elements
    }

    return data[middle]; // odd number of elements
}

/// <summary>Calculates the matrix Modes vector.</summary>
/// <param name="matrix">A matrix whose modes will be calculated.</param>
/// <returns>Returns a vector containing the modes of the given matrix.</returns>
public static double[] Mode(this double[,] matrix)
{
    int rowCount = matrix.GetLength(0);
    int colCount = matrix.GetLength(1);

    double[] modes = new double[colCount];

    for (int col = 0; col < colCount; col++)
    {
        // Fresh frequency table for this column
        double[] distinct = new double[rowCount];   // distinct values, in order of first appearance
        int[] occurrences = new int[rowCount];      // occurrences[k] counts distinct[k]
        int distinctCount = 0;

        for (int row = 0; row < rowCount; row++)
        {
            double value = matrix[row, col];

            // Search only the part of the table that has been filled so far.
            int position = Array.IndexOf(distinct, value, 0, distinctCount);
            if (position < 0)
            {
                distinct[distinctCount] = value;
                occurrences[distinctCount] = 1;
                distinctCount++;
            }
            else
            {
                occurrences[position]++;
            }
        }

        // The strict comparison keeps the earliest value on ties.
        int best = 0;
        int bestCount = 0;
        for (int k = 0; k < distinctCount; k++)
        {
            if (occurrences[k] > bestCount)
            {
                bestCount = occurrences[k];
                best = k;
            }
        }

        modes[col] = distinct[best];
    }

    return modes;
}

/// <summary>
///   Computes the Skewness vector for the given matrix.
/// </summary>
/// <remarks>
///   Skewness characterizes the degree of asymmetry of a distribution
///   around its mean. Positive skewness indicates a distribution with
///   an asymmetric tail extending towards more positive values. Negative
///   skewness indicates a distribution with an asymmetric tail extending
///   towards more negative values.
/// </remarks>
/// <param name="matrix">A number matrix containing the matrix values.</param>
/// <returns>The skewness of each column of the given data.</returns>
public static double[] Skewness(double[,] matrix)
{
    double[] means = Mean(matrix);
    double[] deviations = StandardDeviation(matrix, means);
    return Skewness(matrix, means, deviations);
}

/// <summary>
///   Computes the Skewness vector for the given matrix.
/// </summary>
/// <remarks>
///   Skewness characterizes the degree of asymmetry of a distribution
///   around its mean. Positive skewness indicates a distribution with
///   an asymmetric tail extending towards more positive values. Negative
///   skewness indicates a distribution with an asymmetric tail extending
///   towards more negative values.
/// </remarks>
/// <param name="matrix">A number matrix containing the matrix values.</param>
/// <param name="means">The values' mean vector, if already known.</param>
/// <param name="standardDeviations">The values' standard deviation vector, if already known.</param>
/// <returns>The skewness of each column of the given data.</returns>
public static double[] Skewness(double[,] matrix, double[] means, double[] standardDeviations)
{
    int n = matrix.GetLength(0);
    double[] skewness = new double[matrix.GetLength(1)];

    for (int j = 0; j < skewness.Length; j++)
    {
        double sum = 0.0;
        for (int i = 0; i < n; i++)
        {
            // Sum of third moment deviations
            sum += System.Math.Pow(matrix[i, j] - means[j], 3);
        }

        // Normalize by n, exactly as Skewness(double[], double, double) and both
        // Kurtosis overloads do, so that a column gets the same skewness whether it
        // is passed as a vector or as part of a matrix. (This used to divide by n - 1.)
        skewness[j] = sum / ((double)n * System.Math.Pow(standardDeviations[j], 3));
    }

    return skewness;
}

/// <summary>
///   Computes the Kurtosis vector for the given matrix.
/// </summary>
/// <param name="matrix">A number multi-dimensional array containing the matrix values.</param>
/// <returns>The kurtosis vector of the given data, one element per column.</returns>
public static double[] Kurtosis(double[,] matrix)
{
    double[] means = Mean(matrix);
    return Kurtosis(matrix, means, StandardDeviation(matrix, means));
}

/// <summary>
///   Computes the Kurtosis vector for the given matrix.
/// </summary>
/// <param name="matrix">A number multi-dimensional array containing the matrix values.</param>
/// <param name="means">The values' mean vector, if already known.</param>
/// <param name="standardDeviations">The values' standard deviation vector, if already known.</param>
/// <returns>
///   For each column, the sum of fourth-power deviations from the column mean, divided
///   by the number of rows times the fourth power of the column's standard deviation,
///   minus 3.
/// </returns>
public static double[] Kurtosis(double[,] matrix, double[] means, double[] standardDeviations)
{
    int n = matrix.GetLength(0);
    double[] kurtosis = new double[matrix.GetLength(1)];

    for (int j = 0; j < kurtosis.Length; j++)
    {
        double sum = 0.0;
        for (int i = 0; i < n; i++)
        {
            // Sum of fourth moment deviations
            sum += System.Math.Pow(matrix[i, j] - means[j], 4);
        }

        kurtosis[j] = sum / (n * System.Math.Pow(standardDeviations[j], 4)) - 3.0;
    }

    return kurtosis;
}

/// <summary>
///   Computes the Standard Error vector for a given matrix.
/// </summary>
/// <param name="matrix">A number multi-dimensional array containing the matrix values.</param>
/// <returns>Returns the standard error vector for the matrix, using the number of rows
///   as the sample count.</returns>
public static double[] StandardError(double[,] matrix)
{
    return StandardError(matrix.GetLength(0), StandardDeviation(matrix));
}

/// <summary>
///   Computes the Standard Error vector for a given matrix.
/// </summary>
/// <param name="samples">The number of samples in the matrix.</param>
/// <param name="standardDeviations">The values' standard deviation vector, if already known.</param>
/// <returns>Returns the standard error vector for the matrix: every standard deviation
///   divided by the square root of the sample count.</returns>
public static double[] StandardError(int samples, double[] standardDeviations)
{
    int length = standardDeviations.Length;
    double[] errors = new double[length];

    // The square root of the sample count is the same for every column.
    double root = System.Math.Sqrt(samples);

    for (int k = 0; k < length; k++)
    {
        errors[k] = standardDeviations[k] / root;
    }

    return errors;
}

/// <summary>
///   Calculates the covariance matrix of a sample matrix.
/// </summary>
/// <remarks>
///   In statistics and probability theory, the covariance matrix is a matrix of
///   covariances between elements of a vector. It is the natural generalization
///   to higher dimensions of the concept of the variance of a scalar-valued
///   random variable.
/// </remarks>
/// <param name="matrix">A number multi-dimensional array containing the matrix values.</param>
/// <returns>The covariance matrix.</returns>
public static double[,] Covariance(this double[,] matrix)
{
    double[] means = Mean(matrix);
    return Covariance(matrix, means);
}

/// <summary>
///   Calculates the covariance matrix of a sample matrix.
/// </summary>
/// <remarks>
///   In statistics and probability theory, the covariance matrix is a matrix of
///   covariances between elements of a vector. It is the natural generalization
///   to higher dimensions of the concept of the variance of a scalar-valued
///   random variable.
/// </remarks>
/// <param name="matrix">A number multi-dimensional array containing the matrix values.</param>
/// <param name="dimension">
///   The dimension of the matrix to consider as observations. Pass 0 if the matrix has
///   observations as rows and variables as columns, pass 1 otherwise. Default is 0.
/// </param>
/// <returns>The covariance matrix.</returns>
public static double[,] Covariance(this double[,] matrix, int dimension)
{
    // The means are computed first, so an invalid dimension is rejected by Mean.
    double[] means = Mean(matrix, dimension);

    // Sample covariance: number of observations minus one
    int divisor = matrix.GetLength(dimension) - 1;

    return Scatter(matrix, means, divisor, dimension);
}

/// <summary>
///   Calculates the covariance matrix of a sample matrix.
/// </summary>
/// In statistics and probability theory, the covariance matrix is a matrix of
/// covariances between elements of a vector.
/// It is the natural generalization
/// to higher dimensions of the concept of the variance of a scalar-valued
/// random variable.
/// <param name="matrix">A number multi-dimensional array containing the matrix values.</param>
/// <param name="means">The values' mean vector, if already known.</param>
/// <returns>The covariance matrix.</returns>
public static double[,] Covariance(this double[,] matrix, double[] means)
{
    // Sample covariance: number of rows (observations) minus one
    int divisor = matrix.GetLength(0) - 1;
    return Scatter(matrix, means, divisor, 0);
}

/// <summary>
///   Calculates the scatter matrix of a sample matrix.
/// </summary>
/// <remarks>
///   By dividing the Scatter matrix by the sample size, we get the population
///   Covariance matrix. By dividing by the sample size minus one, we get the
///   sample Covariance matrix.
/// </remarks>
/// <param name="matrix">A number multi-dimensional array containing the matrix values.</param>
/// <param name="means">The values' mean vector, if already known.</param>
/// <returns>The scatter matrix, with no scaling applied.</returns>
public static double[,] Scatter(double[,] matrix, double[] means)
{
    const double noScaling = 1.0;
    const int meansPerColumn = 0;
    return Scatter(matrix, means, noScaling, meansPerColumn);
}

/// <summary>
///   Calculates the scatter matrix of a sample matrix.
/// </summary>
/// <remarks>
///   By dividing the Scatter matrix by the sample size, we get the population
///   Covariance matrix. By dividing by the sample size minus one, we get the
///   sample Covariance matrix.
/// </remarks>
/// <param name="matrix">A number multi-dimensional array containing the matrix values.</param>
/// <param name="means">The values' mean vector, if already known.</param>
/// <param name="divisor">A real number to divide each member of the matrix.</param>
/// <returns>The scatter matrix divided by <paramref name="divisor"/>.</returns>
public static double[,] Scatter(double[,] matrix, double[] means, double divisor)
{
    const int meansPerColumn = 0;
    return Scatter(matrix, means, divisor, meansPerColumn);
}

/// <summary>
///   Calculates the scatter matrix of a sample matrix.
/// </summary>
/// <remarks>
///   By dividing the Scatter matrix by the sample size, we get the population
///   Covariance matrix. By dividing by the sample size minus one, we get the
///   sample Covariance matrix.
/// </remarks>
/// <param name="matrix">A number multi-dimensional array containing the matrix values.</param>
/// <param name="means">The values' mean vector, if already known.</param>
/// <param name="dimension">
///   Pass 0 if the mean vector is a row vector, 1 otherwise. Default value is 0.
/// </param>
/// <returns>The scatter matrix, with no scaling applied.</returns>
public static double[,] Scatter(double[,] matrix, double[] means, int dimension)
{
    return Scatter(matrix, means, 1.0, dimension);
}

/// <summary>
///   Calculates the scatter matrix of a sample matrix.
/// </summary>
/// <remarks>
///   By dividing the Scatter matrix by the sample size, we get the population
///   Covariance matrix. By dividing by the sample size minus one, we get the
///   sample Covariance matrix.
/// </remarks>
/// <param name="matrix">A number multi-dimensional array containing the matrix values.</param>
/// <param name="means">The values' mean vector, if already known. It must have one
///   element per column for dimension 0, or one element per row for dimension 1.</param>
/// <param name="divisor">A real number to divide each member of the matrix.</param>
/// <param name="dimension">
///   Pass 0 if the mean vector is a row vector, 1 otherwise. Default value is 0.
/// </param>
/// <returns>
///   A symmetric matrix of summed products of deviations, each divided by
///   <paramref name="divisor"/>. It is (columns x columns) for dimension 0 and
///   (rows x rows) for dimension 1.
/// </returns>
/// <exception cref="ArgumentException">
///   The length of <paramref name="means"/> does not match the chosen dimension, or
///   <paramref name="dimension"/> is neither 0 nor 1.
/// </exception>
public static double[,] Scatter(double[,] matrix, double[] means, double divisor, int dimension)
{
    int rows = matrix.GetLength(0);
    int cols = matrix.GetLength(1);

    double[,] cov;

    if (dimension == 0)
    {
        // The reported parameter name now matches the actual parameter ("means").
        if (means.Length != cols)
        {
            throw new ArgumentException(
                "Length of the mean vector should equal the number of columns", "means");
        }

        cov = new double[cols, cols];

        for (int i = 0; i < cols; i++)
        {
            // Only the upper triangle is computed; the result is mirrored below.
            for (int j = i; j < cols; j++)
            {
                double s = 0.0;
                for (int k = 0; k < rows; k++)
                {
                    s += (matrix[k, j] - means[j]) * (matrix[k, i] - means[i]);
                }

                s /= divisor;
                cov[i, j] = s;
                cov[j, i] = s;
            }
        }
    }
    else if (dimension == 1)
    {
        if (means.Length != rows)
        {
            throw new ArgumentException(
                "Length of the mean vector should equal the number of rows", "means");
        }

        cov = new double[rows, rows];

        for (int i = 0; i < rows; i++)
        {
            // Only the upper triangle is computed; the result is mirrored below.
            for (int j = i; j < rows; j++)
            {
                double s = 0.0;
                for (int k = 0; k < cols; k++)
                {
                    s += (matrix[j, k] - means[j]) * (matrix[i, k] - means[i]);
                }

                s /= divisor;
                cov[i, j] = s;
                cov[j, i] = s;
            }
        }
    }
    else
    {
        throw new ArgumentException("Invalid dimension.", "dimension");
    }

    return cov;
}

/// <summary>
///   Calculates the covariance matrix of a sample matrix.
/// </summary>
/// In statistics and probability theory, the covariance matrix is a matrix of
/// covariances between elements of a vector.
/// It is the natural generalization
/// to higher dimensions of the concept of the variance of a scalar-valued
/// random variable.
/// <param name="matrix">A number multi-dimensional array containing the matrix values.</param>
/// <returns>The covariance matrix.</returns>
public static double[,] Covariance(this double[][] matrix)
{
    double[] means = Mean(matrix);
    return Covariance(matrix, means);
}

/// <summary>
///   Calculates the covariance matrix of a sample matrix.
/// </summary>
/// <remarks>
///   In statistics and probability theory, the covariance matrix is a matrix of
///   covariances between elements of a vector. It is the natural generalization
///   to higher dimensions of the concept of the variance of a scalar-valued
///   random variable.
/// </remarks>
/// <param name="matrix">A number multi-dimensional array containing the matrix values.</param>
/// <param name="dimension">
///   The dimension of the matrix to consider as observations. Pass 0 if the matrix has
///   observations as rows and variables as columns, pass 1 otherwise. Default is 0.
/// </param>
/// <returns>The covariance matrix.</returns>
public static double[,] Covariance(this double[][] matrix, int dimension)
{
    // Number of observations: the rows for dimension 0, otherwise the length of the first row.
    int observations;
    if (dimension == 0)
    {
        observations = matrix.Length;
    }
    else
    {
        observations = matrix[0].Length;
    }

    double[] means = Mean(matrix, dimension);

    // Sample covariance: observations minus one
    return Scatter(matrix, means, observations - 1, dimension);
}

/// <summary>
///   Calculates the covariance matrix of a sample matrix.
/// </summary>
/// <remarks>
///   In statistics and probability theory, the covariance matrix is a matrix of
///   covariances between elements of a vector. It is the natural generalization
///   to higher dimensions of the concept of the variance of a scalar-valued
///   random variable.
/// </remarks>
/// <param name="matrix">A number multi-dimensional array containing the matrix values.</param>
/// <param name="means">The values' mean vector, if already known.</param>
/// <returns>The covariance matrix.</returns>
public static double[,] Covariance(this double[][] matrix, double[] means)
{
    // Sample covariance: number of rows (observations) minus one
    int divisor = matrix.Length - 1;
    return Scatter(matrix, means, divisor, 0);
}

/// <summary>
///   Calculates the scatter matrix of a sample matrix.
/// </summary>
/// <remarks>
///   By dividing the Scatter matrix by the sample size, we get the population
///   Covariance matrix. By dividing by the sample size minus one, we get the
///   sample Covariance matrix.
/// </remarks>
/// <param name="matrix">A number multi-dimensional array containing the matrix values.</param>
/// <param name="means">The values' mean vector, if already known.</param>
/// <returns>The scatter matrix, with no scaling applied.</returns>
public static double[,] Scatter(double[][] matrix, double[] means)
{
    const double noScaling = 1.0;
    const int meansPerColumn = 0;
    return Scatter(matrix, means, noScaling, meansPerColumn);
}

/// <summary>
///   Calculates the scatter matrix of a sample matrix.
/// </summary>
/// <remarks>
///   By dividing the Scatter matrix by the sample size, we get the population
///   Covariance matrix. By dividing by the sample size minus one, we get the
///   sample Covariance matrix.
/// </remarks>
/// <param name="matrix">A number multi-dimensional array containing the matrix values.</param>
/// <param name="means">The values' mean vector, if already known.</param>
/// <param name="divisor">A real number to divide each member of the matrix.</param>
/// <returns>The scatter matrix divided by <paramref name="divisor"/>.</returns>
public static double[,] Scatter(double[][] matrix, double[] means, double divisor)
{
    const int meansPerColumn = 0;
    return Scatter(matrix, means, divisor, meansPerColumn);
}

/// <summary>
///   Calculates the scatter matrix of a sample matrix.
/// </summary>
/// <remarks>
///   By dividing the Scatter matrix by the sample size, we get the population
///   Covariance matrix. By dividing by the sample size minus one, we get the
///   sample Covariance matrix.
/// </remarks>
/// <param name="matrix">A number multi-dimensional array containing the matrix values.</param>
/// <param name="means">The values' mean vector, if already known.</param>
/// <param name="dimension">
///   Pass 0 if the mean vector is a row vector, 1 otherwise. Default value is 0.
/// </param>
/// <returns>The scatter matrix, with no scaling applied.</returns>
public static double[,] Scatter(double[][] matrix, double[] means, int dimension)
{
    const double noScaling = 1.0;
    return Scatter(matrix, means, noScaling, dimension);
}

/// <summary>
///   Calculates the weighted covariance matrix of a sample matrix.
/// </summary>
/// <param name="matrix">A number multi-dimensional array containing the matrix values.</param>
/// <param name="means">The values' mean vector, if already known.</param>
/// <param name="weights">A unit vector containing the importance of each sample
///   in <paramref name="matrix"/>. The elements of this array should add up to 1.</param>
/// <returns>The covariance matrix.</returns>
public static double[,] Covariance(double[][] matrix, double[] means, double[] weights)
{
    // Divisor is 1 - sum(w_i^2), accumulated by subtracting each squared weight from 1.
    double sw = 1.0;
    for (int i = 0; i < weights.Length; i++)
        sw -= weights[i] * weights[i];

    return Scatter(matrix, means, sw, 0, weights);
}

/// <summary>
///   Calculates the weighted scatter matrix of a sample matrix.
/// </summary>
/// <remarks>
///   By dividing the Scatter matrix by the sample size, we get the population
///   Covariance matrix. By dividing by the sample size minus one, we get the
///   sample Covariance matrix.
/// </remarks>
/// <param name="matrix">A jagged array containing the matrix values.</param>
/// <param name="means">The values' mean vector, if already known.</param>
/// <param name="divisor">A real number to divide each member of the matrix.</param>
/// <param name="dimension">
///   Pass 0 if the mean vector is a row vector (one mean per column), 1 if it
///   holds one mean per row.
/// </param>
/// <param name="weights">
///   A unit vector containing the importance of each sample in
///   <paramref name="matrix"/>. The elements of this array should add up to 1.
///   It is indexed by row when <paramref name="dimension"/> is 0 and by column
///   when it is 1. A null array is treated as a weight of 1 for every sample.
/// </param>
/// <returns>
///   The symmetric scatter matrix, or an empty 0x0 matrix when
///   <paramref name="matrix"/> has no rows.
/// </returns>
/// <exception cref="ArgumentException">
///   The length of <paramref name="means"/> does not match the selected
///   dimension, or <paramref name="dimension"/> is neither 0 nor 1.
/// </exception>
public static double[,] Scatter(double[][] matrix, double[] means, double divisor, int dimension, double[] weights)
{
    return ScatterCore(matrix, means, divisor, dimension, weights);
}

/// <summary>
///   Calculates the scatter matrix of a sample matrix.
/// </summary>
/// <remarks>
///   By dividing the Scatter matrix by the sample size, we get the population
///   Covariance matrix. By dividing by the sample size minus one, we get the
///   sample Covariance matrix.
/// </remarks>
/// <param name="matrix">A jagged array containing the matrix values.</param>
/// <param name="means">The values' mean vector, if already known.</param>
/// <param name="divisor">A real number to divide each member of the matrix.</param>
/// <param name="dimension">
///   Pass 0 if the mean vector is a row vector (one mean per column), 1 if it
///   holds one mean per row.
/// </param>
/// <returns>
///   The symmetric scatter matrix, or an empty 0x0 matrix when
///   <paramref name="matrix"/> has no rows.
/// </returns>
/// <exception cref="ArgumentException">
///   The length of <paramref name="means"/> does not match the selected
///   dimension, or <paramref name="dimension"/> is neither 0 nor 1.
/// </exception>
public static double[,] Scatter(double[][] matrix, double[] means, double divisor, int dimension)
{
    return ScatterCore(matrix, means, divisor, dimension, null);
}

// Shared implementation of the weighted and unweighted scatter matrix; null weights mean weight 1.
private static double[,] ScatterCore(double[][] matrix, double[] means, double divisor, int dimension, double[] weights)
{
    int rows = matrix.Length;

    // An empty sample has no columns to inspect; return an empty matrix
    // instead of failing on matrix[0].
    if (rows == 0)
        return new double[0, 0];

    int cols = matrix[0].Length;

    if (dimension != 0 && dimension != 1)
        throw new ArgumentException("Invalid dimension.", "dimension");

    double[,] cov;

    if (dimension == 0)
    {
        if (means.Length != cols)
            throw new ArgumentException(
                "Length of the mean vector should equal the number of columns", "means");

        cov = new double[cols, cols];

        // Only the upper triangle is computed; the result is mirrored.
        for (int i = 0; i < cols; i++)
        {
            for (int j = i; j < cols; j++)
            {
                double s = 0.0;
                for (int k = 0; k < rows; k++)
                {
                    // Multiplying by 1.0 is exact, so the unweighted result is unchanged.
                    double w = (weights == null) ? 1.0 : weights[k];
                    s += w * (matrix[k][j] - means[j]) * (matrix[k][i] - means[i]);
                }

                s /= divisor;
                cov[i, j] = s;
                cov[j, i] = s;
            }
        }
    }
    else
    {
        if (means.Length != rows)
            throw new ArgumentException(
                "Length of the mean vector should equal the number of rows", "means");

        cov = new double[rows, rows];

        for (int i = 0; i < rows; i++)
        {
            for (int j = i; j < rows; j++)
            {
                double s = 0.0;
                for (int k = 0; k < cols; k++)
                {
                    double w = (weights == null) ? 1.0 : weights[k];
                    s += w * (matrix[j][k] - means[j]) * (matrix[i][k] - means[i]);
                }

                s /= divisor;
                cov[i, j] = s;
                cov[j, i] = s;
            }
        }
    }

    return cov;
}

/// <summary>
///   Calculates the correlation matrix for a matrix of samples.
/// </summary>
/// <remarks>
///   In statistics and probability theory, the correlation matrix is the same
///   as the covariance matrix of the standardized random variables.
/// </remarks>
/// <param name="matrix">A multi-dimensional array containing the matrix values.</param>
/// <returns>The correlation matrix.</returns>
public static double[,] Correlation(double[,] matrix)
{
    // Estimate the column means and standard deviations, then defer to the full overload.
    double[] columnMeans = Mean(matrix);
    double[] columnDeviations = StandardDeviation(matrix, columnMeans);
    return Correlation(matrix, columnMeans, columnDeviations);
}

/// <summary>
///   Calculates the correlation matrix for a matrix of samples.
/// </summary>
/// <remarks>
///   In statistics and probability theory, the correlation matrix is the same
///   as the covariance matrix of the standardized random variables.
/// </remarks>
/// <param name="matrix">A multi-dimensional array containing the matrix values.</param>
/// <param name="means">The values' mean vector, if already known.</param>
/// <param name="standardDeviations">The values' standard deviation vector, if already known.</param>
/// <returns>The correlation matrix (one row and one column per input column).</returns>
public static double[,] Correlation(double[,] matrix, double[] means, double[] standardDeviations)
{
    double[,] standardized = ZScores(matrix, means, standardDeviations);

    int observations = matrix.GetLength(0);
    int variables = matrix.GetLength(1);

    // Sample (n - 1) normalization of the cross-products of the standardized columns.
    double denominator = (double)observations - 1.0;

    double[,] result = new double[variables, variables];

    for (int a = 0; a < variables; a++)
    {
        // Only the upper triangle is computed; the value is mirrored below the diagonal.
        for (int b = a; b < variables; b++)
        {
            double sum = 0.0;
            for (int r = 0; r < observations; r++)
            {
                sum += standardized[r, b] * standardized[r, a];
            }

            double value = sum / denominator;
            result[a, b] = value;
            result[b, a] = value;
        }
    }

    return result;
}

/// <summary>Generates the Standard Scores, also known as Z-Scores, from the given data.</summary>
/// <param name="matrix">A multi-dimensional array containing the matrix values.</param>
/// <returns>The Z-Scores for the matrix.</returns>
public static double[,] ZScores(double[,] matrix)
{
    double[] columnMeans = Mean(matrix);
    double[] columnDeviations = StandardDeviation(matrix, columnMeans);
    return ZScores(matrix, columnMeans, columnDeviations);
}

/// <summary>Generates the Standard Scores, also known as Z-Scores, from the given data.</summary>
/// <param name="matrix">A multi-dimensional array containing the matrix values.</param>
/// <param name="means">The values' mean vector, if already known.</param>
/// <param name="standardDeviations">The values' standard deviation vector, if already known.</param>
/// <returns>The Z-Scores for the matrix, as a new array; the input is left untouched.</returns>
public static double[,] ZScores(double[,] matrix, double[] means, double[] standardDeviations)
{
    // Work on a copy so the caller's matrix is not modified.
    double[,] scores = (double[,])matrix.Clone();

    Center(scores, means);
    Standardize(scores, standardDeviations);

    return scores;
}

/// <summary>Generates the Standard Scores, also known as Z-Scores, from the given data.</summary>
/// <param name="matrix">A jagged array containing the matrix values.</param>
/// <returns>The Z-Scores for the matrix.</returns>
public static double[][] ZScores(double[][] matrix)
{
    double[] mean = Mean(matrix);
    return ZScores(matrix, mean, StandardDeviation(matrix, mean));
}

/// <summary>Generates the Standard Scores, also known as Z-Scores, from the given data.</summary>
/// <param name="matrix">A jagged array containing the matrix values.</param>
/// <param name="means">The values' mean vector, if already known.</param>
/// <param name="standardDeviations">The values' standard deviation vector, if already known.</param>
/// <returns>The Z-Scores for the matrix, as a new jagged array; the input rows are left untouched.</returns>
public static double[][] ZScores(double[][] matrix, double[] means, double[] standardDeviations)
{
    // Array.Clone on a jagged array copies only the outer array of row
    // references, so centering a clone would overwrite the caller's rows.
    // Copy every row so the input stays intact, as in the double[,] overload.
    double[][] m = new double[matrix.Length][];
    for (int i = 0; i < matrix.Length; i++)
        m[i] = (double[])matrix[i].Clone();

    Center(m, means);
    Standardize(m, standardDeviations);
    return m;
}

/// <summary>Centers column data, subtracting the empirical mean from each variable.</summary>
/// <param name="matrix">A matrix where each column represents a variable and each row represents an observation. Modified in place.</param>
public static void Center(double[,] matrix)
{
    Center(matrix, Mean(matrix));
}

/// <summary>Centers column data, subtracting the empirical mean from each variable.</summary>
/// <param name="matrix">A matrix where each column represents a variable and each row represents an observation. Modified in place.</param>
/// <param name="means">The values' mean vector, if already known.</param>
public static void Center(double[,] matrix, double[] means)
{
    int rows = matrix.GetLength(0);
    int cols = matrix.GetLength(1);

    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            matrix[i, j] -= means[j];
}

/// <summary>Centers column data, subtracting the empirical mean from each variable.</summary>
/// <param name="matrix">A matrix where each column represents a variable and each row represents an observation. Modified in place.</param>
public static void Center(double[][] matrix)
{
    Center(matrix, Mean(matrix));
}

/// <summary>Centers column data, subtracting the empirical mean from each variable.</summary>
/// <param name="matrix">A matrix where each column represents a variable and each row represents an observation. Modified in place.</param>
/// <param name="means">The values' mean vector, if already known.</param>
public static void Center(double[][] matrix, double[] means)
{
    // Subtract the matching column mean from every element, row by row, in place.
    foreach (double[] observation in matrix)
    {
        for (int column = 0; column < observation.Length; column++)
        {
            observation[column] = observation[column] - means[column];
        }
    }
}

/// <summary>Standardizes column data, dividing each variable by its empirical standard deviation.</summary>
/// <param name="matrix">A matrix where each column represents a variable and each row represents an observation. Modified in place.</param>
/// <remarks>This method does not remove the empirical mean prior to execution.</remarks>
public static void Standardize(double[,] matrix)
{
    double[] deviations = StandardDeviation(matrix);
    Standardize(matrix, deviations);
}

/// <summary>Standardizes column data, dividing each variable by its empirical standard deviation.</summary>
/// <param name="matrix">A matrix where each column represents a variable and each row represents an observation. Modified in place.</param>
/// <remarks>This method does not remove the empirical mean prior to execution.</remarks>
/// <param name="standardDeviations">The values' standard deviation vector, if already known.</param>
public static void Standardize(this double[,] matrix, double[] standardDeviations)
{
    int observationCount = matrix.GetLength(0);
    int variableCount = matrix.GetLength(1);

    for (int row = 0; row < observationCount; row++)
    {
        for (int column = 0; column < variableCount; column++)
        {
            matrix[row, column] = matrix[row, column] / standardDeviations[column];
        }
    }
}

/// <summary>Standardizes column data, dividing each variable by its empirical standard deviation.</summary>
/// <param name="matrix">A matrix where each column represents a variable and each row represents an observation. Modified in place.</param>
/// <remarks>This method does not remove the empirical mean prior to execution.</remarks>
public static void Standardize(double[][] matrix)
{
    double[] deviations = StandardDeviation(matrix);
    Standardize(matrix, deviations);
}

/// <summary>Standardizes column data, dividing each variable by its empirical standard deviation.</summary>
/// <param name="matrix">A matrix where each column represents a variable and each row represents an observation. Modified in place.</param>
/// <remarks>This method does not remove the empirical mean prior to execution.</remarks>
/// <param name="standardDeviations">The values' standard deviation vector, if already known.</param>
public static void Standardize(this double[][] matrix, double[] standardDeviations)
{
    // Divide every element by the standard deviation of its column, in place.
    foreach (double[] observation in matrix)
    {
        for (int column = 0; column < observation.Length; column++)
        {
            observation[column] = observation[column] / standardDeviations[column];
        }
    }
}

#endregion

// ------------------------------------------------------------

#region Summarizing, grouping and extending operations

/// <summary>
///   Calculates the prevalence of a class.
/// </summary>
/// <param name="positives">An array of counts detailing the occurrence of the first class.</param>
/// <param name="negatives">An array of counts detailing the occurrence of the second class.</param>
/// <returns>An array containing the proportion of the first class over the total of occurrences.</returns>
public static double[] Proportions(int[] positives, int[] negatives)
{
    int count = positives.Length;
    double[] proportions = new double[count];

    for (int index = 0; index < count; index++)
    {
        int total = positives[index] + negatives[index];
        proportions[index] = (double)positives[index] / total;
    }

    return proportions;
}

/// <summary>
///   Calculates the prevalence of a class.
/// </summary>
/// <param name="data">A matrix containing counted, grouped data.</param>
/// <param name="positiveColumn">The index for the column which contains counts for occurrence of the first class.</param>
/// <param name="negativeColumn">The index for the column which contains counts for occurrence of the second class.</param>
/// <returns>An array containing the proportion of the first class over the total of occurrences.</returns>
public static double[] Proportions(int[][] data, int positiveColumn, int negativeColumn)
{
    int count = data.Length;
    double[] proportions = new double[count];

    for (int index = 0; index < count; index++)
    {
        int[] record = data[index];
        int total = record[positiveColumn] + record[negativeColumn];
        proportions[index] = (double)record[positiveColumn] / total;
    }

    return proportions;
}

/// <summary>
///   Groups the occurrences contained in a data matrix of binary (dichotomous) data.
/// </summary>
/// <param name="data">A data matrix containing at least a column of binary data.</param>
/// <param name="labelColumn">Index of the column which contains the group label name.</param>
/// <param name="dataColumn">Index of the column which contains the binary [0,1] data.</param>
///
/// A matrix containing the group label in the first column, the number of occurrences of the first class
/// in the second column and the number of occurrences of the second class in the third column.
///
public static int[][] Group(int[][] data, int labelColumn, int dataColumn)
{
    // One { label, positives, negatives } row per distinct label, kept in
    // order of first appearance. The dictionary finds the row of a label
    // already seen, so the data is scanned only once.
    var groupings = new List<int[]>();
    var rowByLabel = new Dictionary<int, int[]>();

    for (int i = 0; i < data.Length; i++)
    {
        int group = data[i][labelColumn];

        int[] counts;
        if (!rowByLabel.TryGetValue(group, out counts))
        {
            counts = new int[] { group, 0, 0 };
            rowByLabel.Add(group, counts);
            groupings.Add(counts);
        }

        // A zero counts as a negative; any other value counts as a positive.
        if (data[i][dataColumn] == 0)
            counts[2]++;
        else
            counts[1]++;
    }

    return groupings.ToArray();
}

/// <summary>
///   Extends grouped data into a full observation matrix.
/// </summary>
/// <param name="group">The group labels.</param>
/// <param name="positives">
///   An array containing the occurrence of the positive class
///   for each of the groups.
/// </param>
/// <param name="negatives">
///   An array containing the occurrence of the negative class
///   for each of the groups.
/// </param>
/// <returns>
///   A full sized observation matrix: for each group, one { label, 1 } row per
///   positive followed by one { label, 0 } row per negative.
/// </returns>
public static int[][] Extend(int[] group, int[] positives, int[] negatives)
{
    List<int[]> rows = new List<int[]>();

    for (int i = 0; i < group.Length; i++)
    {
        for (int j = 0; j < positives[i]; j++)
            rows.Add(new int[] { group[i], 1 });

        for (int j = 0; j < negatives[i]; j++)
            rows.Add(new int[] { group[i], 0 });
    }

    return rows.ToArray();
}

/// <summary>
///   Extends grouped data into a full observation matrix.
/// </summary>
/// <param name="data">The grouped data matrix.</param>
/// <param name="labelColumn">Index of the column which contains the labels
///   in the grouped data matrix.</param>
/// <param name="positiveColumn">Index of the column which contains
///   the occurrences for the first class.</param>
/// <param name="negativeColumn">Index of the column which contains
///   the occurrences for the second class.</param>
/// <returns>A full sized observation matrix.</returns>
public static int[][] Extend(int[][] data, int labelColumn, int positiveColumn, int negativeColumn)
{
    // For each grouped row, emit one { label, 1 } row per positive and then
    // one { label, 0 } row per negative.
    List<int[]> rows = new List<int[]>();

    for (int i = 0; i < data.Length; i++)
    {
        for (int j = 0; j < data[i][positiveColumn]; j++)
            rows.Add(new int[] { data[i][labelColumn], 1 });

        for (int j = 0; j < data[i][negativeColumn]; j++)
            rows.Add(new int[] { data[i][labelColumn], 0 });
    }

    return rows.ToArray();
}

#endregion

#region Determination and performance measures

/// <summary>
///   Gets the coefficient of determination, also known as the R-Squared (R²).
/// </summary>
/// <remarks>
///   <para>
///   The coefficient of determination is used in the context of statistical models
///   whose main purpose is the prediction of future outcomes on the basis of other
///   related information. It is the proportion of variability in a data set that
///   is accounted for by the statistical model. It provides a measure of how well
///   future outcomes are likely to be predicted by the model.</para>
///   <para>
///   The R^2 coefficient of determination is a statistical measure of how well the
///   regression approximates the real data points. An R^2 of 1.0 indicates that the
///   regression perfectly fits the data.</para>
/// </remarks>
/// <param name="actual">
///   The values compared against <paramref name="expected"/>; its length
///   determines how many elements are read from both arrays.
/// </param>
/// <param name="expected">
///   The reference values; their mean is the baseline for the total sum of squares.
/// </param>
/// <returns>
///   One minus the ratio between the error sum of squares and the total sum of
///   squares. The result is not finite when every expected value equals the mean
///   (the total sum of squares is zero).
/// </returns>
public static double Determination(double[] actual, double[] expected)
{
    // R-squared = 1 - SS(error) / SS(total)
    int N = actual.Length;

    // Mean of the expected outputs.
    double avg = 0.0;
    for (int i = 0; i < N; i++)
        avg += expected[i];
    avg /= N;

    // SSe: squared differences between expected and actual values.
    // SSt: squared differences between expected values and their mean.
    double SSe = 0.0;
    double SSt = 0.0;
    for (int i = 0; i < N; i++)
    {
        double residual = expected[i] - actual[i];
        SSe += residual * residual;

        double deviation = expected[i] - avg;
        SSt += deviation * deviation;
    }

    return 1.0 - (SSe / SSt);
}

#endregion

#region Permutations and combinatorials

/// <summary>
///   Returns a random sample of size k from a population of size n.
/// </summary>
/// <param name="n">The size of the population.</param>
/// <param name="k">The size of the sample.</param>
/// <returns>
///   The result of applying <c>Submatrix(k)</c> to a random permutation of
///   size <paramref name="n"/>.
/// </returns>
public static int[] Random(int n, int k)
{
    int[] idx = Tools.Random(n);
    return idx.Submatrix(k);
}

///
/// Returns a random permutation of size n.
///
public static int[] Random(int n)
{
    Random generator = Accord.Math.Tools.Random;

    // Draw one random key per index, then sort the indices by their keys.
    double[] keys = new double[n];
    int[] permutation = Matrix.Indexes(0, n); // presumably the indices 0..n-1 — confirm against Matrix.Indexes

    int position = 0;
    while (position < n)
    {
        keys[position] = generator.NextDouble();
        position++;
    }

    Array.Sort(keys, permutation);

    return permutation;
}

/// <summary>
///   Shuffles an array in place.
/// </summary>
/// <param name="array">The array whose elements are rearranged.</param>
public static void Shuffle(int[] array)
{
    Random generator = Accord.Math.Tools.Random;

    // 'remaining' is the number of leading items still to be shuffled.
    int remaining = array.Length;

    while (remaining > 1)
    {
        // Pick one of the remaining items and move it to the end of that range.
        int pick = generator.Next(remaining);
        int last = remaining - 1;

        int held = array[pick];
        array[pick] = array[last];
        array[last] = held;

        remaining = last;
    }
}

#endregion
}
}