diff --git a/config.yaml b/config.yaml index 8662f8e..34bdf1d 100644 --- a/config.yaml +++ b/config.yaml @@ -794,6 +794,74 @@ methods: normalize: True seed: 42 + LMNN: + run: ['metric'] + script: methods/mlpack/lmnn.py + format: [csv, txt] + datasets: + - files: ['datasets/iris_train.csv', + 'datasets/satellite_train.csv', 'datasets/ionosphere.csv', + 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv', + 'datasets/oilspill_train.csv', 'datasets/shuttle_train.csv', + 'datasets/ecoli_train.csv', 'datasets/vehicle.csv'] + options: + num_targets: 3 + passes: 10 + range: 20 + seed: 42 + + - files: ['datasets/letter_recognition.csv', + 'datasets/shuttle_train.csv', 'datasets/isolet_train.csv', + 'datasets/covtype.csv', 'datasets/optdigits_train.csv', + 'datasets/mnist_all.csv', 'datasets/Twitter.csv'] + options: + num_targets: 3 + passes: 3 + range: 100 + seed: 42 + + - files: ['datasets/iris_train.csv', + 'datasets/ecoli_train.csv', 'datasets/vehicle.csv', + 'datasets/balance_scale.csv', 'datasets/ionosphere.csv'] + options: + num_targets: 3 + passes: 5 + optimizer: bbsgd + seed: 42 + + - files: ['datasets/iris_train.csv', + 'datasets/satellite_train.csv', 'datasets/ionosphere.csv', + 'datasets/ecoli_train.csv', 'datasets/vehicle.csv', + 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv'] + options: + num_targets: 3 + passes: 5 + optimizer: sgd + range: 50 + step_size: 1e-07 + seed: 42 + + - files: ['datasets/iris_train.csv', + 'datasets/satellite_train.csv', 'datasets/ionosphere.csv', + 'datasets/ecoli_train.csv', 'datasets/vehicle.csv', + 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv'] + options: + num_targets: 3 + max_iterations: 2000 + optimizer: lbfgs + seed: 42 + range: 50 + + - files: ['datasets/covtype.csv', + 'datasets/shuttle_train.csv', 'datasets/isolet_train.csv', + 'datasets/mnist_all.csv', 'datasets/letter_recognition.csv'] + options: + num_targets: 3 + max_iterations: 2000 + optimizer: lbfgs + seed: 42 + range: 100 + HMMTRAIN: run: ['metric'] script: methods/mlpack/hmm_train.py @@ -884,6 +952,21 @@ methods: new_dimensionality: 2 scaled: True + LMNN: + run: ['metric'] + script: methods/matlab/lmnn.py + format: [csv, txt] + datasets: + - files: ['datasets/iris_train.csv', + 'datasets/satellite_train.csv', 'datasets/ionosphere.csv', + 'datasets/balance_scale.csv', 'datasets/vehicle.csv', + 'datasets/oilspill_train.csv', 'datasets/ecoli_train.csv', + 'datasets/letter_recognition.csv', 'datasets/shuttle_train.csv', + 'datasets/isolet_train.csv', 'datasets/optdigits_train.csv', + 'datasets/covtype.csv', 'datasets/mnist_all.csv'] + options: + k: 3 + PERCEPTRON: run: ['metric'] script: methods/matlab/perceptron.py @@ -2174,6 +2257,26 @@ methods: options: lambda1: 0.01 + LMNN: + run: ['metric'] + script: methods/shogun/lmnn.py + format: [csv, txt] + datasets: + - files: [ ['datasets/iris_train.csv'], + ['datasets/ecoli_train.csv'], + ['datasets/vehicle.csv'], + ['datasets/ionosphere.csv'], + ['datasets/shuttle_train.csv'], + ['datasets/letter_recognition.csv'], + ['datasets/balance_scale.csv'], + ['datasets/oilspill_train.csv'], + ['datasets/mnist_all.csv'], + ['datasets/Twitter.csv'], + ['datasets/isolet_train.csv'], + ['datasets/covtype.csv']] + options: + k: 3 + QDA: run: ['metric','metric'] script: methods/shogun/qda.py @@ -3109,4 +3212,4 @@ methods: ['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'], ['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 
'datasets/abalone7_labels.csv'], ['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'], - ['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ] \ No newline at end of file + ['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ] diff --git a/datasets/dataset-urls.txt b/datasets/dataset-urls.txt index 1d7a0e2..6713bc9 100644 --- a/datasets/dataset-urls.txt +++ b/datasets/dataset-urls.txt @@ -10,6 +10,7 @@ artificial_1DSignal*.csv mlpack.org/datasets/artificial_1DSignal.tar.gz artificial_2DSignal*.csv mlpack.org/datasets/artificial_2DSignal.tar.gz artificial_40D*.csv mlpack.org/datasets/artificial_40D.tar.gz artificial_5DSignal*.csv mlpack.org/datasets/artificial_5DSignal.tar.gz +balance_scale*.csv mlpack.org/datasets/balance_scale.tar.gz bank8FM.csv mlpack.org/datasets/bank8FM.tar.gz cal_housing.csv mlpack.org/datasets/cal_housing.tar.gz circle_data.csv mlpack.org/datasets/circle.tar.gz @@ -25,6 +26,7 @@ faces.csv mlpack.org/datasets/faces.tar.gz ionosphere.csv mlpack.org/datasets/ionosphere.tar.gz iris*.csv mlpack.org/datasets/iris.tar.gz isolet*.csv mlpack.org/datasets/isolet.tar.gz +letter_recognition*.csv http://www.mlpack.org/datasets/letter_recognition.tar.gz madelon*.csv mlpack.org/datasets/madelon.tar.gz mammography*.csv mlpack.org/datasets/mammography.tar.gz mnist*.csv mlpack.org/datasets/mnist.tar.gz diff --git a/methods/matlab/LMNN.m b/methods/matlab/LMNN.m new file mode 100644 index 0000000..69c534e --- /dev/null +++ b/methods/matlab/LMNN.m @@ -0,0 +1,168 @@ +% @file lmnn.m + +function lmnn(cmd) +% LMNN Learns a metric using large-margin nearest neighbor metric learning +% +% The function uses large-margin nearest neighbor (LMNN) metric learning to +% learn a metric on the data set specified by the NxD matrix X and the +% corresponding Nx1 vector labels. The metric is returned in M. +% +% Required options: +% (-i) [string] Input dataset to perform LMNN on. +% Options: +% (-k) [int] Desired number of targets. +% +% +% This file is part of the Matlab Toolbox for Dimensionality Reduction. +% The toolbox can be obtained from http://homepage.tudelft.nl/19j49 +% You are free to use, change, or redistribute this code in any way you +% want for non-commercial purposes. However, it is appreciated if you +% maintain the name of the original author. +% +% (C) Laurens van der Maaten, Delft University of Technology + + inputFile = regexp(cmd, '.*?-i ([^\s]+)', 'tokens', 'once'); + + % Load input dataset. + X = csvread(inputFile{:}); + + % Use the last row of the data as the labels. + labels = X(:,end); + % Remove the label row. 
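+  % (Each benchmark CSV holds one sample per row; the class label sits in the
+  % last column and the remaining columns form the feature matrix X.)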
+ X = X(:,1:end-1); + + % Variable K can't be used + % K = regexp(cmd, '.*?-k ([^\s]+)', 'tokens', 'once'); + % K = str2num(K{1}); + + total_time = tic; + + % Initialize some variables + [N, D] = size(X); + assert(length(labels) == N); + [lablist, ~, labels] = unique(labels); + K = length(lablist); + label_matrix = false(N, K); + label_matrix(sub2ind(size(label_matrix), (1:length(labels))', labels)) = true; + same_label = logical(double(label_matrix) * double(label_matrix')); + M = eye(D); + C = Inf; prev_C = Inf; + + % Set learning parameters + min_iter = 50; % minimum number of iterations + max_iter = 1000; % maximum number of iterations + eta = .1; % learning rate + mu = .5; % weighting of pull and push terms + tol = 1e-3; % tolerance for convergence + best_C = Inf; % best error obtained so far + best_M = M; % best metric found so far + no_targets = 3; % number of target neighbors + + % Select target neighbors + sum_X = sum(X .^ 2, 2); + DD = bsxfun(@plus, sum_X, bsxfun(@plus, sum_X', -2 * (X * X'))); + DD(~same_label) = Inf; DD(1:N + 1:end) = Inf; + [~, targets_ind] = sort(DD, 2, 'ascend'); + targets_ind = targets_ind(:,1:no_targets); + targets = false(N, N); + targets(sub2ind([N N], vec(repmat((1:N)', [1 no_targets])), vec(targets_ind))) = true; + + % Compute pulling term between target neigbhors to initialize gradient + slack = zeros(N, N, no_targets); + G = zeros(D, D); + for i=1:no_targets + G = G + (1 - mu) .* (X - X(targets_ind(:,i),:))' * (X - X(targets_ind(:,i),:)); + end + + % Perform main learning iterations + iter = 0; + while (prev_C - C > tol || iter < min_iter) && iter < max_iter + + % Compute pairwise distances under current metric + XM = X * M; + sum_X = sum(XM .* X, 2); + DD = bsxfun(@plus, sum_X, bsxfun(@plus, sum_X', -2 * (XM * X'))); + + % Compute value of slack variables + old_slack = slack; + for i=1:no_targets + slack(:,:,i) = ~same_label .* max(0, bsxfun(@minus, 1 + DD(sub2ind([N N], (1:N)', targets_ind(:,i))), DD)); + end + + % Compute value of cost function + prev_C = C; + C = (1 - mu) .* sum(DD(targets)) + ... % push terms between target neighbors + mu .* sum(slack(:)); % pull terms between impostors + + % Maintain best solution found so far (subgradient method) + if C < best_C + best_C = C; + best_M = M; + end + + % Perform gradient update + for i=1:no_targets + + % Add terms for new violations + [r, c] = find(slack(:,:,i) > 0 & old_slack(:,:,i) == 0); + G = G + mu .* ((X(r,:) - X(targets_ind(r, i),:))' * ... + (X(r,:) - X(targets_ind(r, i),:)) - ... + (X(r,:) - X(c,:))' * (X(r,:) - X(c,:))); + + % Remove terms for resolved violations + [r, c] = find(slack(:,:,i) == 0 & old_slack(:,:,i) > 0); + G = G - mu .* ((X(r,:) - X(targets_ind(r, i),:))' * ... + (X(r,:) - X(targets_ind(r, i),:)) - ... + (X(r,:) - X(c,:))' * (X(r,:) - X(c,:))); + end + M = M - (eta ./ N) .* G; + + % Project metric back onto the PSD cone + [V, L] = eig(M); + V = real(V); L = real(L); + ind = find(diag(L) > 0); + if isempty(ind) + warning('Projection onto PSD cone failed. All eigenvalues were negative.'); break + end + M = V(:,ind) * L(ind, ind) * V(:,ind)'; + if any(isinf(M(:))) + warning('Projection onto PSD cone failed. Metric contains Inf values.'); break + end + if any(isnan(M(:))) + warning('Projection onto PSD cone failed. 
Metric contains NaN values.'); break + end + + % Update learning rate + if prev_C > C + eta = eta * 1.01; + else + eta = eta * .5; + end + + % Print out progress + iter = iter + 1; + no_slack = sum(slack(:) > 0); + if rem(iter, 10) == 0 + [~, sort_ind] = sort(DD, 2, 'ascend'); + disp(['Iteration ' num2str(iter) ': error is ' num2str(C ./ N) ... + ', nearest neighbor error is ' num2str(sum(labels(sort_ind(:,2)) ~= labels) ./ N) ... + ', number of constraints: ' num2str(no_slack)]); + end + end + + % Return best metric and error + M = best_M; + C = best_C; + + % Compute mapped data + [L, S, ~] = svd(M); + L = bsxfun(@times, sqrt(diag(S)), L); + disp(sprintf('[INFO ] total_time: %fs', toc(total_time))) + + % Save learned distance. + csvwrite('distance.csv', L); +end + +function x = vec(x) + x = x(:); +end diff --git a/methods/matlab/lmnn.py b/methods/matlab/lmnn.py new file mode 100644 index 0000000..cca2054 --- /dev/null +++ b/methods/matlab/lmnn.py @@ -0,0 +1,146 @@ +''' + @file lmnn.py + @author Manish Kumar + + Class to benchmark the matlab Large Margin Nearest Neighbors method. +''' + +import os +import sys +import inspect + +# Import the util path, this method even works if the path contains symlinks to +# modules. +cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join( + os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util"))) +if cmd_subfolder not in sys.path: + sys.path.insert(0, cmd_subfolder) + +#Import the metrics definitions path. +metrics_folder = os.path.realpath(os.path.abspath(os.path.join( + os.path.split(inspect.getfile(inspect.currentframe()))[0], "../metrics"))) +if metrics_folder not in sys.path: + sys.path.insert(0, metrics_folder) + +from log import * +from profiler import * +from definitions import * + +import shlex +import subprocess +import re +import collections + +''' +This class implements the Large Margin Nearest Neighbors benchmark. +''' +class LMNN(object): + + ''' + Create the Large Margin Nearest Neighbors benchmark instance. + + @param dataset - Input dataset to perform Logistic Regression on. + @param timeout - The time until the timeout. Default no timeout. + @param path - Path to the matlab binary. + @param verbose - Display informational messages. + ''' + def __init__(self, dataset, timeout=0, path=os.environ["MATLAB_BIN"], + verbose=True): + self.verbose = verbose + self.dataset = dataset + self.path = path + self.timeout = timeout + self.k = 1 + + ''' + Destructor to clean up at the end. Use this method to remove created file. + ''' + def __del__(self): + Log.Info("Clean up.", self.verbose) + filelist = ["distance.csv"] + for f in filelist: + if os.path.isfile(f): + os.remove(f) + + ''' + Large Margin Nearest Neighbors benchmark instance. If the method has been + successfully completed return the elapsed time in seconds. + + @param options - Extra options for the method. + @return - Elapsed time in seconds or a negative value if the method was not + successful. + ''' + def RunMetrics(self, options): + Log.Info("Perform Large Margin Nearest Neighbors.", self.verbose) + + if "k" in options: + self.k = int(options.pop("k")) + + # No options accepted for this script. + if len(options) > 0: + Log.Fatal("Unknown parameters: " + str(options)) + raise Exception("unknown parameters") + + inputCmd = "-i " + self.dataset + " -k " + str(self.k) + + # Split the command using shell-like syntax. 
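+    # With, for example, dataset 'datasets/iris_train.csv' and k=3, the command
+    # assembled below expands to roughly
+    #   <MATLAB_BIN>matlab -nodisplay -nosplash -r
+    #     "try, LMNN('-i datasets/iris_train.csv -k 3'), catch, exit(1), end, exit(0)"
+    # i.e. LMNN.m receives the whole option string as a single argument.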
+ cmd = shlex.split(self.path + "matlab -nodisplay -nosplash -r \"try, " + + "LMNN('" + inputCmd + "'), catch, exit(1), end, exit(0)\"") + + # Run command with the nessecary arguments and return its output as a byte + # string. We have untrusted input so we disable all shell based features. + try: + s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False, + timeout=self.timeout) + except subprocess.TimeoutExpired as e: + Log.Warn(str(e)) + return -2 + except Exception as e: + Log.Fatal("Could not execute command: " + str(cmd)) + return -1 + + # Datastructure to store the results. + metrics = {} + + # Parse data: runtime. + timer = self.parseTimer(s) + + if timer != -1: + metrics['Runtime'] = timer.total_time + distance = np.genfromtxt("distance.csv", delimiter = ',') + data = np.genfromtxt(self.dataset, delimiter=',') + + dataList = [data[:,:-1], data[:, (data.shape[1] - 1)]] + metrics['Accuracy_1_NN'] = Metrics.KNNAccuracy(distance, dataList, 1, False) + metrics['Accuracy_3_NN'] = Metrics.KNNAccuracy(distance, dataList, 3, False) + metrics['Accuracy_3_NN_DW'] = Metrics.KNNAccuracy(distance, dataList, 3, True) + metrics['Accuracy_5_NN'] = Metrics.KNNAccuracy(distance, dataList, 5, False) + metrics['Accuracy_5_NN_DW'] = Metrics.KNNAccuracy(distance, dataList, 5, True) + + Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose) + + return metrics + + ''' + Parse the timer data form a given string. + + @param data - String to parse timer data from. + @return - Namedtuple that contains the timer data or -1 in case of an error. + ''' + def parseTimer(self, data): + # Compile the regular expression pattern into a regular expression object to + # parse the timer data. + pattern = re.compile(br""" + .*?total_time: (?P.*?)s.*? + """, re.VERBOSE|re.MULTILINE|re.DOTALL) + + match = pattern.match(data) + if not match: + Log.Fatal("Can't parse the data: wrong format") + return -1 + else: + # Create a namedtuple and return the timer data. + timer = collections.namedtuple("timer", ["total_time"]) + + return timer(float(match.group("total_time"))) + diff --git a/methods/metrics/definitions.py b/methods/metrics/definitions.py index 262fc56..4f35bd8 100644 --- a/methods/metrics/definitions.py +++ b/methods/metrics/definitions.py @@ -9,6 +9,10 @@ import numpy as np import math +from modshogun import RealFeatures +from modshogun import MulticlassLabels +from modshogun import KNN, EuclideanDistance + class Metrics(object): ''' @@ -466,3 +470,50 @@ def SimpleMeanSquaredError(truelabels, predictedlabels): simplemse += difference * difference simplemse /= n return simplemse + + ''' + @param distance - Matrix containing learned distance. + @param data - List containing data & true labels. + @param k - Number of targets used calculation. + @param flag - Switch to control whether to use distance weighted KNN or not. + This method computes the accuracy based on the true labels and + predicted labels from knn classifier. + ''' + @staticmethod + def KNNAccuracy(distance, data, k, flag): + transformedData = np.dot(data[0], distance.T) + feat = RealFeatures(transformedData.T) + labels = MulticlassLabels(data[1].astype(np.float64)) + dist = EuclideanDistance(feat, feat) + knn = KNN(k + 1, dist, labels) + knn.train(feat) + # Get nearest neighbors. + nn = knn.nearest_neighbors() + nn = np.delete(nn, 0, 0) + # Compute unique labels. + uniqueLabels = np.unique(labels) + # Keep count correct predictions. 
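+    # At this point nn has shape (k, N): the first row returned by
+    # nearest_neighbors() (each point's own index) has been dropped, so column
+    # i holds the indices of the k nearest neighbors of sample i under the
+    # learned metric.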
+ count = 0 + # Normalize labels + for i in range(data[0].shape[0]): + for j in range(len(uniqueLabels)): + if (labels[i] == uniqueLabels[j]): + labels[i] = j + break + + for i in range(nn.shape[1]): + mapLabels = [0 for x in range(len(uniqueLabels))] + for j in range(nn.shape[0]): + if (flag): + distPoints = np.linalg.norm(data[0][nn[j][i],:] - data[0][i,:]) + # Add constant factor of 1 incase two points overlap + mapLabels[int(labels[nn[j, i]])] += 1 / (distPoints + 1)**2 + else: + # Subtract a variable factor to avoid draw condition without + # affecting actual result. + mapLabels[int(labels[nn[j, i]])] += 1 - j * 1e-8 + maxInd = np.argmax(mapLabels) + if (maxInd == labels[i]): + count += 1 + accuracy = (count / nn.shape[1]) * 100 + return accuracy diff --git a/methods/mlpack/lmnn.py b/methods/mlpack/lmnn.py new file mode 100644 index 0000000..6936740 --- /dev/null +++ b/methods/mlpack/lmnn.py @@ -0,0 +1,244 @@ +''' + @file lmnn.py + @author Manish Kumar + + Class to benchmark the mlpack Large Margin Nearest Neighbors method. +''' + +import os +import sys +import inspect + +# Import the util path, this method even works if the path contains symlinks to +# modules. +cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join( + os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util"))) +if cmd_subfolder not in sys.path: + sys.path.insert(0, cmd_subfolder) + +#Import the metrics definitions path. +metrics_folder = os.path.realpath(os.path.abspath(os.path.join( + os.path.split(inspect.getfile(inspect.currentframe()))[0], "../metrics"))) +if metrics_folder not in sys.path: + sys.path.insert(0, metrics_folder) + +from log import * +from profiler import * +from definitions import * +from misc import * + +import shlex + +try: + import subprocess32 as subprocess +except ImportError: + import subprocess + +import numpy as np +import re +import collections + +''' +This class implements the Large Margin Nearest Neighbors benchmark. +''' +class LMNN(object): + + ''' + Create the Large Margin Nearest Neighbors benchmark instance, show some + informations and return the instance. + + @param dataset - Input dataset to perform LMNN on. + @param timeout - The time until the timeout. Default no timeout. + @param path - Path to the mlpack executable. + @param verbose - Display informational messages. + ''' + def __init__(self, dataset, timeout=0, path=os.environ["BINPATH"], + verbose=True, debug=os.environ["DEBUGBINPATH"]): + self.verbose = verbose + self.dataset = dataset + self.path = path + self.timeout = timeout + self.debug = debug + self.k = 1 + + # Get description from executable. + cmd = shlex.split(self.path + "mlpack_lmnn -h") + try: + s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False) + except Exception as e: + Log.Fatal("Could not execute command: " + str(cmd)) + else: + # Use regular expression pattern to get the description. + pattern = re.compile(br"""(.*?)Optional.*?options:""", + re.VERBOSE|re.MULTILINE|re.DOTALL) + + match = pattern.match(s) + if not match: + Log.Warn("Can't parse description", self.verbose) + description = "" + else: + description = match.group(1) + + self.description = description + + ''' + Destructor to clean up at the end. Use this method to remove created files. + ''' + def __del__(self): + Log.Info("Clean up.", self.verbose) + filelist = ["gmon.out", "distance.csv"] + for f in filelist: + if os.path.isfile(f): + os.remove(f) + + ''' + Given an input dict of options, return an output string that the program can + use. 
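+  For example, the config options {num_targets: 3, passes: 10, range: 20,
+  seed: 42} become roughly " -k 3 -R 20 -p 10 --seed 42", i.e. the option
+  letters expected by the mlpack_lmnn executable.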
+ ''' + def OptionsToStr(self, options): + optionsStr = "" + if "optimizer" in options: + optionsStr = "-O " + str(options.pop("optimizer")) + if "num_targets" in options: + self.k = options.pop("num_targets") + optionsStr = optionsStr + " -k " + str(self.k) + if "regularization" in options: + optionsStr = optionsStr + " -r " + str(options.pop("regularization")) + if "tolerance" in options: + optionsStr = optionsStr + " -t " + str(options.pop("tolerance")) + if "range" in options: + optionsStr = optionsStr + " -R " + str(options.pop("range")) + if "step_size" in options: + optionsStr = optionsStr + " -a " + str(options.pop("step_size")) + if "batch_size" in options: + optionsStr = optionsStr + " -b " + str(options.pop("batch_size")) + if "passes" in options: + optionsStr = optionsStr + " -p " + str(options.pop("passes")) + if "max_iterations" in options: + optionsStr = optionsStr + " -n " + str(options.pop("max_iterations")) + if "rank" in options: + optionsStr = optionsStr + " -A " + str(options.pop("rank")) + if "normalize" in options: + optionsStr = optionsStr + " -N" + options.pop("normalize") + if "linear_scan" in options: + optionsStr = optionsStr + " -L" + options.pop("linear_scan") + if "seed" in options: + optionsStr = optionsStr + " --seed " + str(options.pop("seed")) + + if len(options) > 0: + Log.Fatal("Unknown parameters: " + str(options)) + raise Exception("unknown parameters") + + return optionsStr + + ''' + Run valgrind massif profiler on the Large Margin Nearest Neighbors method. + If the method has been successfully completed the report is saved in the + specified file. + + @param options - Extra options for the method. + @param fileName - The name of the massif output file. + @param massifOptions - Extra massif options. + @return Returns False if the method was not successful, if the method was + successful save the report file in the specified file. + ''' + def RunMemory(self, options, fileName, massifOptions="--depth=2"): + Log.Info("Perform LMNN Memory Profiling.", self.verbose) + + # If the dataset contains two files then the second file is the labels file. + # In this case we add this to the command line. + if len(self.dataset) == 2: + cmd = shlex.split(self.debug + "mlpack_lmnn -i " + self.dataset[0] + " -l " + + self.dataset[1] + " -v -o distance.csv " + + self.OptionsToStr(options)) + else: + cmd = shlex.split(self.debug + "mlpack_lmnn -i " + self.dataset + + " -v -o distance.csv " + self.OptionsToStr(options)) + + return Profiler.MassifMemoryUsage(cmd, fileName, self.timeout, massifOptions) + + ''' + Perform Large Margin Nearest Neighbors. If the method has been + successfully completed return the elapsed time in seconds. + + @param options - Extra options for the method. + @return - Elapsed time in seconds or a negative value if the method was not + successful. + ''' + def RunMetrics(self, options): + Log.Info("Perform Large Margin Nearest Neighbors.", self.verbose) + + # If the dataset contains two files then the second file is the labels file. + # In this case we add this to the command line. + if len(self.dataset) == 2: + cmd = shlex.split(self.path + "mlpack_lmnn -i " + self.dataset[0] + " -l " + + self.dataset[1] + " -v -o distance.csv " + + self.OptionsToStr(options)) + else: + cmd = shlex.split(self.path + "mlpack_lmnn -i " + self.dataset + + " -v -o distance.csv " + self.OptionsToStr(options)) + + # Run command with the nessecary arguments and return its output as a byte + # string. We have untrusted input so we disable all shell based features. 
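+    # ParseTimer() below expects the verbose output to report the
+    # loading_data, saving_data and total_time timers as "<name>: <seconds>s";
+    # the Runtime metric is total_time minus the two I/O timers.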
+ try: + s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False, + timeout=self.timeout) + except subprocess.TimeoutExpired as e: + Log.Warn(str(e)) + return -2 + except Exception as e: + Log.Fatal("Could not execute command: " + str(cmd)) + return -1 + + # Datastructure to store the results. + metrics = {} + + # Parse data: runtime. + timer = self.ParseTimer(s) + + if timer != -1: + metrics['Runtime'] = timer.total_time - timer.saving_data - timer.loading_data + Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose) + + # Get distance. + distance = LoadDataset("distance.csv") + data = np.genfromtxt(self.dataset, delimiter=',') + + dataList = [data[:,:-1], data[:, (data.shape[1] - 1)]] + metrics['Accuracy_1_NN'] = Metrics.KNNAccuracy(distance, dataList, 1, False) + metrics['Accuracy_3_NN'] = Metrics.KNNAccuracy(distance, dataList, 3, False) + metrics['Accuracy_3_NN_DW'] = Metrics.KNNAccuracy(distance, dataList, 3, True) + metrics['Accuracy_5_NN'] = Metrics.KNNAccuracy(distance, dataList, 5, False) + metrics['Accuracy_5_NN_DW'] = Metrics.KNNAccuracy(distance, dataList, 5, True) + + return metrics + + ''' + Parse the timer data form a given string. + + @param data - String to parse timer data from. + @return - Namedtuple that contains the timer data or -1 in case of an error. + ''' + def ParseTimer(self, data): + # Compile the regular expression pattern into a regular expression object to + # parse the timer data. + pattern = re.compile(br""" + .*?loading_data: (?P.*?)s.*? + .*?saving_data: (?P.*?)s.*? + .*?total_time: (?P.*?)s.*? + """, re.VERBOSE|re.MULTILINE|re.DOTALL) + + match = pattern.match(data) + if not match: + Log.Fatal("Can't parse the data: wrong format") + return -1 + else: + # Create a namedtuple and return the timer data. + timer = collections.namedtuple("timer", ["loading_data", "saving_data", + "total_time"]) + + return timer(float(match.group("loading_data")), + float(match.group("saving_data")), + float(match.group("total_time"))) diff --git a/methods/shogun/lmnn.py b/methods/shogun/lmnn.py new file mode 100644 index 0000000..a32fb0e --- /dev/null +++ b/methods/shogun/lmnn.py @@ -0,0 +1,139 @@ +''' + file lmnn.py + @author Manish Kumar + + Large Margin Nearest Neighbors with shogun. +''' + +import os +import sys +import inspect +import timeout_decorator + +# Import the util path, this method even works if the path contains symlinks to +# modules. +cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join( + os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util"))) +if cmd_subfolder not in sys.path: + sys.path.insert(0, cmd_subfolder) + +#Import the metrics definitions path. +metrics_folder = os.path.realpath(os.path.abspath(os.path.join( + os.path.split(inspect.getfile(inspect.currentframe()))[0], "../metrics"))) +if metrics_folder not in sys.path: + sys.path.insert(0, metrics_folder) + +from log import * +from timer import * +from definitions import * +from misc import * + +import numpy as np +from modshogun import RealFeatures +from modshogun import MulticlassLabels +from modshogun import LMNN as ShogunLMNN + +''' +This class implements the Large Margin Nearest Neighbors benchmark. +''' +class LMNN(object): + + ''' + Create the Large Margin Nearest Neighbors instance. + + @param dataset - Input dataset to perform LMNN on. + @param timeout - The time until the timeout. Default no timeout. + @param verbose - Display informational messages. 
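+  Note: the timeout is enforced with timeout_decorator around the shogun call
+  in LMNNShogun() rather than through a subprocess timeout.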
+ ''' + def __init__(self, dataset, timeout=0, verbose=True): + self.verbose = verbose + self.dataset = dataset + self.timeout = timeout + self.k = 1 + + ''' + Use the shogun libary to implement Large Margin Nearest Neighbors. + + @param options - Extra options for the method. + @return - Elapsed time in seconds or a negative value if the method was not + successful. + ''' + def LMNNShogun(self, options): + @timeout_decorator.timeout(self.timeout) + def RunLMNNShogun(): + totalTimer = Timer() + + # Load input dataset. + Log.Info("Loading dataset", self.verbose) + # Use the last row of the training set as the responses. + X, y = SplitTrainData(self.dataset) + try: + feat = RealFeatures(X.T) + labels = MulticlassLabels(y.astype(np.float64)) + + with totalTimer: + # Get the options for running LMNN. + if "k" in options: + self.k = int(options.pop("k")) + + if "maxiter" in options: + n = int(options.pop("maxiter")) + else: + n = 2000 + + if len(options) > 0: + Log.Fatal("Unknown parameters: " + str(options)) + raise Exception("unknown parameters") + + # Perform LMNN. + prep = ShogunLMNN(feat, labels, self.k) + prep.set_maxiter(n) + prep.train() + except Exception as e: + return [-1, -1] + + time = totalTimer.ElapsedTime() + + # Get distance. + distance = prep.get_linear_transform() + dataList = [X, y] + accuracy1NN = Metrics.KNNAccuracy(distance, dataList, 1, False) + accuracy3NN = Metrics.KNNAccuracy(distance, dataList, 3, False) + accuracy3NNDW = Metrics.KNNAccuracy(distance, dataList, 3, True) + accuracy5NN = Metrics.KNNAccuracy(distance, dataList, 5, False) + accuracy5NNDW = Metrics.KNNAccuracy(distance, dataList, 5, True) + + return [time, accuracy1NN, accuracy3NN, accuracy3NNDW, + accuracy5NN, accuracy5NNDW] + + try: + return RunLMNNShogun() + except timeout_decorator.TimeoutError: + return [-1, -1] + + ''' + Perform Large Margin Nearest Neighbors. If the method has been successfully + completed return the elapsed time in seconds. + + @param options - Extra options for the method. + @return - Elapsed time in seconds or a negative value if the method was not + successful. + ''' + def RunMetrics(self, options): + Log.Info("Perform LMNN.", self.verbose) + + results = self.LMNNShogun(options) + if results[0] < 0: + return results[0] + + # Datastructure to store the results. + metrics = {} + metrics['Runtime'] = results[0] + metrics['Accuracy_1_NN'] = results[1] + metrics['Accuracy_3_NN'] = results[2] + metrics['Accuracy_3_NN_DW'] = results[3] + metrics['Accuracy_5_NN'] = results[4] + metrics['Accuracy_5_NN_DW'] = results[5] + + return metrics + diff --git a/tests/benchmark_lmnn.py b/tests/benchmark_lmnn.py new file mode 100644 index 0000000..3b9fcf0 --- /dev/null +++ b/tests/benchmark_lmnn.py @@ -0,0 +1,100 @@ +''' + @file benchmark_lmnn.py + @author Manish Kumar + + Test for the Large Margin Nearest Neighbors scripts. +''' + +import unittest + +import os, sys, inspect + +# Import the util path, this method even works if the path contains +# symlinks to modules. +cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join( + os.path.split(inspect.getfile(inspect.currentframe()))[0], '../util'))) +if cmd_subfolder not in sys.path: + sys.path.insert(0, cmd_subfolder) + +from util.loader import * + +''' +Test the mlpack Large Margin Nearest Neighbors script. +''' +class LMNN_MLPACK_TEST(unittest.TestCase): + + ''' + Test initialization. 
+  '''
+  def setUp(self):
+    self.dataset = 'datasets/iris_train.csv'
+    self.verbose = False
+    self.timeout = 240
+
+    module = Loader.ImportModuleFromPath("methods/mlpack/lmnn.py")
+    obj = getattr(module, "LMNN")
+    self.instance = obj(self.dataset, verbose=self.verbose, timeout=self.timeout)
+
+  '''
+  Test the constructor.
+  '''
+  def test_Constructor(self):
+    self.assertEqual(self.instance.verbose, self.verbose)
+    self.assertEqual(self.instance.timeout, self.timeout)
+    self.assertEqual(self.instance.dataset, self.dataset)
+
+  '''
+  Test the 'RunMetrics' function.
+  '''
+  def test_RunMetrics(self):
+    result = self.instance.RunMetrics({})
+    self.assertTrue(result["Runtime"] > 0)
+
+  '''
+  Test the destructor.
+  '''
+  def test_Destructor(self):
+    del self.instance
+
+    clean = True
+    filelist = ["gmon.out", "distance.csv"]
+    for f in filelist:
+      if os.path.isfile(f):
+        clean = False
+
+    self.assertTrue(clean)
+
+'''
+Test the shogun Large Margin Nearest Neighbors script.
+'''
+class LMNN_SHOGUN_TEST(unittest.TestCase):
+
+  '''
+  Test initialization.
+  '''
+  def setUp(self):
+    self.dataset = "datasets/iris.csv"
+    self.verbose = False
+    self.timeout = 240
+
+    module = Loader.ImportModuleFromPath("methods/shogun/lmnn.py")
+    obj = getattr(module, "LMNN")
+    self.instance = obj(self.dataset, verbose=self.verbose, timeout=self.timeout)
+
+  '''
+  Test the constructor.
+  '''
+  def test_Constructor(self):
+    self.assertEqual(self.instance.verbose, self.verbose)
+    self.assertEqual(self.instance.timeout, self.timeout)
+    self.assertEqual(self.instance.dataset, self.dataset)
+
+  '''
+  Test the 'RunMetrics' function.
+  '''
+  def test_RunMetrics(self):
+    result = self.instance.RunMetrics({})
+    self.assertTrue(result["Runtime"] > 0)
+
+if __name__ == '__main__':
+  unittest.main()
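For reference, a minimal sketch of the pipeline these scripts drive — learn a transform with shogun's LMNN and score nearest-neighbor accuracy in the transformed space, as Metrics.KNNAccuracy does — could look roughly like this. It is illustrative only, not part of the patch, and assumes modshogun is importable, that 'datasets/iris_train.csv' is present, and that the label is stored in the last CSV column (the convention the wrappers above rely on).

import numpy as np
from modshogun import RealFeatures, MulticlassLabels, EuclideanDistance, KNN, LMNN

# Load a benchmark-style CSV: features in all but the last column, label last.
data = np.genfromtxt('datasets/iris_train.csv', delimiter=',')
X, y = data[:, :-1], data[:, -1]

# Learn the linear transform L with shogun's LMNN (3 target neighbors, as in
# the config.yaml entries above).
feat = RealFeatures(X.T)
labels = MulticlassLabels(y.astype(np.float64))
lmnn = LMNN(feat, labels, 3)
lmnn.set_maxiter(2000)
lmnn.train()
L = lmnn.get_linear_transform()

# Score 3-NN accuracy under the learned metric: transform the data, find each
# point's neighbors (dropping the point itself), and take a majority vote.
transformed = np.dot(X, L.T)
tfeat = RealFeatures(transformed.T)
dist = EuclideanDistance(tfeat, tfeat)
knn = KNN(3 + 1, dist, labels)   # k+1 because each point is its own neighbor
knn.train(tfeat)
nn = np.delete(knn.nearest_neighbors(), 0, 0)   # drop the self-neighbor row

correct = 0
for i in range(nn.shape[1]):
  votes = y[nn[:, i]]
  values, counts = np.unique(votes, return_counts=True)
  if values[np.argmax(counts)] == y[i]:   # plain majority vote over 3 neighbors
    correct += 1
print('Accuracy_3_NN: %.2f%%' % (100.0 * correct / nn.shape[1]))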