From 315fc20ceb442bb8e61e684ef2a6afc0edf89d93 Mon Sep 17 00:00:00 2001 From: Manish Date: Fri, 15 Jun 2018 09:58:05 +0530 Subject: [PATCH 01/15] LMNN Benchmarks --- config.yaml | 77 +++++++++++++ datasets/dataset-urls.txt | 2 + methods/mlpack/lmnn.py | 227 ++++++++++++++++++++++++++++++++++++++ methods/shogun/lmnn.py | 114 +++++++++++++++++++ tests/benchmark_lmnn.py | 100 +++++++++++++++++ 5 files changed, 520 insertions(+) create mode 100644 methods/mlpack/lmnn.py create mode 100644 methods/shogun/lmnn.py create mode 100644 tests/benchmark_lmnn.py diff --git a/config.yaml b/config.yaml index 8662f8e..b473edb 100644 --- a/config.yaml +++ b/config.yaml @@ -794,6 +794,70 @@ methods: normalize: True seed: 42 + LMNN: + run: ['metric'] + script: methods/mlpack/lmnn.py + format: [csv, txt] + datasets: + - files: ['datasets/iris_train.csv', + ['datasets/diabetes_X.csv', 'datasets/diabetes_y.csv'], + 'datasets/wine.csv', 'datasets/ionosphere.csv', + 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv'] + options: + passes: 10 + range: 25 + seed: 42 + + - files: ['datasets/letter_recognition.csv', + 'datasets/shuttle_train.csv', 'datasets/isolet.csv', + 'datasets/covtype.csv', 'datasets/corel-histogram.csv', + 'datasets/mnist_all.csv', 'datasets/Twitter.csv'] + options: + passes: 3 + range: 100 + seed: 42 + + - files: ['datasets/iris_train.csv', + ['datasets/diabetes_X.csv', 'datasets/diabetes_y.csv'], + 'datasets/wine.csv', 'datasets/ionosphere.csv', + 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv'] + options: + passes: 5 + optimizer: bbsgd + seed: 42 + + - files: ['datasets/iris_train.csv', + ['datasets/diabetes_X.csv', 'datasets/diabetes_y.csv'], + 'datasets/wine.csv', 'datasets/ionosphere.csv', + 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv'] + options: + passes: 5 + optimizer: sgd + range: 50 + step_size: 1e-07 + seed: 42 + + - files: ['datasets/iris_train.csv', + ['datasets/diabetes_X.csv', 'datasets/diabetes_y.csv'], + 
'datasets/wine.csv', 'datasets/ionosphere.csv', + 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv'] + options: + num_targets: 5 + max_iterations: 2000 + optimizer: lbfgs + seed: 42 + wolfe: 0.5 + range: 50 + + - files: ['datasets/covtype.csv', + 'datasets/shuttle_train.csv', 'datasets/isolet.csv', + 'datasets/mnist_all.csv'] + options: + max_iterations: 2000 + optimizer: lbfgs + seed: 42 + range: 100 + HMMTRAIN: run: ['metric'] script: methods/mlpack/hmm_train.py @@ -2174,6 +2238,19 @@ methods: options: lambda1: 0.01 + LMNN: + run: ['metric'] + script: methods/shogun/lmnn.py + format: [csv, txt] + datasets: + - files: ['datasets/iris_train.csv', + ['datasets/diabetes_X.csv', 'datasets/diabetes_y.csv'], + 'datasets/wine.csv', 'datasets/ionosphere.csv', + 'datasets/shuttle_train.csv', 'datasets/isolet.csv', + 'datasets/covtype.csv', 'datasets/corel-histogram.csv', + 'datasets/mnist_all.csv', 'datasets/Twitter.csv', + 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv'] + QDA: run: ['metric','metric'] script: methods/shogun/qda.py diff --git a/datasets/dataset-urls.txt b/datasets/dataset-urls.txt index 1d7a0e2..6713bc9 100644 --- a/datasets/dataset-urls.txt +++ b/datasets/dataset-urls.txt @@ -10,6 +10,7 @@ artificial_1DSignal*.csv mlpack.org/datasets/artificial_1DSignal.tar.gz artificial_2DSignal*.csv mlpack.org/datasets/artificial_2DSignal.tar.gz artificial_40D*.csv mlpack.org/datasets/artificial_40D.tar.gz artificial_5DSignal*.csv mlpack.org/datasets/artificial_5DSignal.tar.gz +balance_scale*.csv mlpack.org/datasets/balance_scale.tar.gz bank8FM.csv mlpack.org/datasets/bank8FM.tar.gz cal_housing.csv mlpack.org/datasets/cal_housing.tar.gz circle_data.csv mlpack.org/datasets/circle.tar.gz @@ -25,6 +26,7 @@ faces.csv mlpack.org/datasets/faces.tar.gz ionosphere.csv mlpack.org/datasets/ionosphere.tar.gz iris*.csv mlpack.org/datasets/iris.tar.gz isolet*.csv mlpack.org/datasets/isolet.tar.gz +letter_recognition*.csv 
http://www.mlpack.org/datasets/letter_recognition.tar.gz madelon*.csv mlpack.org/datasets/madelon.tar.gz mammography*.csv mlpack.org/datasets/mammography.tar.gz mnist*.csv mlpack.org/datasets/mnist.tar.gz diff --git a/methods/mlpack/lmnn.py b/methods/mlpack/lmnn.py new file mode 100644 index 0000000..00332b7 --- /dev/null +++ b/methods/mlpack/lmnn.py @@ -0,0 +1,227 @@ +''' + @file nca.py + @author Manish Kumar + + Class to benchmark the mlpack Large Margin Nearest Neighbors method. +''' + +import os +import sys +import inspect + +# Import the util path, this method even works if the path contains symlinks to +# modules. +cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join( + os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util"))) +if cmd_subfolder not in sys.path: + sys.path.insert(0, cmd_subfolder) + +from util.log import * +from util.profiler import * + +import shlex + +try: + import subprocess32 as subprocess +except ImportError: + import subprocess + +import re +import collections + +''' +This class implements the Large Margin Nearest Neighbors benchmark. +''' +class LMNN(object): + + ''' + Create the Large Margin Nearest Neighbors benchmark instance, show some + informations and return the instance. + + @param dataset - Input dataset to perform LMNN on. + @param timeout - The time until the timeout. Default no timeout. + @param path - Path to the mlpack executable. + @param verbose - Display informational messages. + ''' + def __init__(self, dataset, timeout=0, path=os.environ["BINPATH"], + verbose=True, debug=os.environ["DEBUGBINPATH"]): + self.verbose = verbose + self.dataset = dataset + self.path = path + self.timeout = timeout + self.debug = debug + + # Get description from executable. 
+ cmd = shlex.split(self.path + "mlpack_LMNN -h") + try: + s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False) + except Exception as e: + Log.Fatal("Could not execute command: " + str(cmd)) + else: + # Use regular expression pattern to get the description. + pattern = re.compile(br"""(.*?)Optional.*?options:""", + re.VERBOSE|re.MULTILINE|re.DOTALL) + + match = pattern.match(s) + if not match: + Log.Warn("Can't parse description", self.verbose) + description = "" + else: + description = match.group(1) + + self.description = description + + ''' + Destructor to clean up at the end. Use this method to remove created files. + ''' + def __del__(self): + Log.Info("Clean up.", self.verbose) + filelist = ["gmon.out", "distance.csv"] + for f in filelist: + if os.path.isfile(f): + os.remove(f) + + ''' + Given an input dict of options, return an output string that the program can + use. + ''' + def OptionsToStr(self, options): + optionsStr = "" + if "optimizer" in options: + optionsStr = "-O " + str(options.pop("optimizer")) + if "num_targets" in options: + optionsStr = optionsStr + " -k " + str(options.pop("num_targets")) + if "regularization" in options: + optionsStr = optionsStr + " -r " + str(options.pop("regularization")) + if "tolerance" in options: + optionsStr = optionsStr + " -t " + str(options.pop("tolerance")) + if "batch_delta" in options: + optionsStr = optionsStr + " -d " + str(options.pop("batch_delta")) + if "range" in options: + optionsStr = optionsStr + " -R " + str(options.pop("range")) + if "step_size" in options: + optionsStr = optionsStr + " -a " + str(options.pop("step_size")) + if "batch_size" in options: + optionsStr = optionsStr + " -b " + str(options.pop("batch_size")) + if "passes" in options: + optionsStr = optionsStr + " -p " + str(options.pop("passes")) + if "max_iterations" in options: + optionsStr = optionsStr + " -n " + str(options.pop("max_iterations")) + if "num_basis" in options: + optionsStr = optionsStr + " -B " + 
str(options.pop("num_basis")) + if "wolfe" in options: + optionsStr = optionsStr + " -w " + str(options.pop("wolfe")) + if "normalize" in options: + optionsStr = optionsStr + " -N" + options.pop("normalize") + if "linear_scan" in options: + optionsStr = optionsStr + " -L" + options.pop("linear_scan") + if "seed" in options: + optionsStr = optionsStr + " --seed " + str(options.pop("seed")) + + if len(options) > 0: + Log.Fatal("Unknown parameters: " + str(options)) + raise Exception("unknown parameters") + + return optionsStr + + ''' + Run valgrind massif profiler on the Large Margin Nearest Neighbors method. + If the method has been successfully completed the report is saved in the + specified file. + + @param options - Extra options for the method. + @param fileName - The name of the massif output file. + @param massifOptions - Extra massif options. + @return Returns False if the method was not successful, if the method was + successful save the report file in the specified file. + ''' + def RunMemory(self, options, fileName, massifOptions="--depth=2"): + Log.Info("Perform LMNN Memory Profiling.", self.verbose) + + # If the dataset contains two files then the second file is the labels file. + # In this case we add this to the command line. + if len(self.dataset) == 2: + cmd = shlex.split(self.debug + "mlpack_lmnn -i " + self.dataset[0] + " -l " + + self.dataset[1] + " -v -o distance.csv " + + self.OptionsToStr(options)) + else: + cmd = shlex.split(self.debug + "mlpack_lmnn -i " + self.dataset + + " -v -o distance.csv " + self.OptionsToStr(options)) + + return Profiler.MassifMemoryUsage(cmd, fileName, self.timeout, massifOptions) + + ''' + Perform Large Margin Nearest Neighbors. If the method has been + successfully completed return the elapsed time in seconds. + + @param options - Extra options for the method. + @return - Elapsed time in seconds or a negative value if the method was not + successful. 
+ ''' + def RunMetrics(self, options): + Log.Info("Perform Large Margin Nearest Neighbors.", self.verbose) + + # If the dataset contains two files then the second file is the labels file. + # In this case we add this to the command line. + if len(self.dataset) == 2: + cmd = shlex.split(self.path + "mlpack_lmnn -i " + self.dataset[0] + " -l " + + self.dataset[1] + " -v -o distance.csv " + + self.OptionsToStr(options)) + else: + cmd = shlex.split(self.path + "mlpack_lmnn -i " + self.dataset + + " -v -o distance.csv " + self.OptionsToStr(options)) + + # Run command with the nessecary arguments and return its output as a byte + # string. We have untrusted input so we disable all shell based features. + try: + s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False, + timeout=self.timeout) + except subprocess.TimeoutExpired as e: + Log.Warn(str(e)) + return -2 + except Exception as e: + Log.Fatal("Could not execute command: " + str(cmd)) + return -1 + + # Datastructure to store the results. + metrics = {} + + # Parse data: runtime. + timer = self.ParseTimer(s) + + if timer != -1: + metrics['Runtime'] = timer.total_time - timer.saving_data - timer.loading_data + + Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose) + + return metrics + + ''' + Parse the timer data form a given string. + + @param data - String to parse timer data from. + @return - Namedtuple that contains the timer data or -1 in case of an error. + ''' + def ParseTimer(self, data): + # Compile the regular expression pattern into a regular expression object to + # parse the timer data. + pattern = re.compile(br""" + .*?loading_data: (?P.*?)s.*? + .*?saving_data: (?P.*?)s.*? + .*?total_time: (?P.*?)s.*? + """, re.VERBOSE|re.MULTILINE|re.DOTALL) + + match = pattern.match(data) + if not match: + Log.Fatal("Can't parse the data: wrong format") + return -1 + else: + # Create a namedtuple and return the timer data. 
+ timer = collections.namedtuple("timer", ["loading_data", "saving_data", + "total_time"]) + + return timer(float(match.group("loading_data")), + float(match.group("saving_data")), + float(match.group("total_time"))) diff --git a/methods/shogun/lmnn.py b/methods/shogun/lmnn.py new file mode 100644 index 0000000..1bbf962 --- /dev/null +++ b/methods/shogun/lmnn.py @@ -0,0 +1,114 @@ +''' + @file lmnn.py + @author Manish Kumar + + Large Margin Nearest Neighbors with shogun. +''' + +import os +import sys +import inspect +import timeout_decorator + +# Import the util path, this method even works if the path contains symlinks to +# modules. +cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join( + os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util"))) +if cmd_subfolder not in sys.path: + sys.path.insert(0, cmd_subfolder) + +from log import * +from timer import * + +import numpy as np +from modshogun import RealFeatures +from modshogun import MulticlassLabels +from modshogun import LMNN as ShogunLMNN + +''' +This class implements the Large Margin Nearest Neighbors benchmark. +''' +class LMNN(object): + + ''' + Create the Large Margin Nearest Neighbors instance. + + @param dataset - Input dataset to perform LMNN on. + @param timeout - The time until the timeout. Default no timeout. + @param verbose - Display informational messages. + ''' + def __init__(self, dataset, timeout=0, verbose=True): + self.verbose = verbose + self.dataset = dataset + self.timeout = timeout + + ''' + Use the shogun libary to implement Large Margin Nearest Neighbors. + + @param options - Extra options for the method. + @return - Elapsed time in seconds or a negative value if the method was not + successful. + ''' + def LMNNShogun(self, options): + @timeout_decorator.timeout(self.timeout) + def RunLMNNShogun(): + totalTimer = Timer() + + # Load input dataset. 
+ Log.Info("Loading dataset", self.verbose) + if len(self.dataset) == 2: + X = self.dataset[0] + y = self.dataset[1] + else: + # Use the last row of the training set as the responses. + X, y = SplitTrainData(self.dataset) + try: + feat = RealFeatures(self.X.T) + labels = MulticlassLabels(y.astype(numpy.float64)) + + with totalTimer: + # Get the options for running LMNN. + if "k" in options: + k = int(options.pop("k")) + else: + k = 1 + + if "maxiter" in options: + n = int(options.pop("maxiter")) + else: + n = 1000 + + if len(options) > 0: + Log.Fatal("Unknown parameters: " + str(options)) + raise Exception("unknown parameters") + + # Perform LMNN. + prep = ShogunLMNN(feat, labels, k) + prep.set_maxiter(n) + prep.train() + except Exception as e: + return -1 + + return totalTimer.ElapsedTime() + + try: + return RunLMNNShogun() + except timeout_decorator.TimeoutError: + return -1 + + ''' + Perform Large Margin Nearest Neighbors. If the method has been successfully + completed return the elapsed time in seconds. + + @param options - Extra options for the method. + @return - Elapsed time in seconds or a negative value if the method was not + successful. + ''' + def RunMetrics(self, options): + Log.Info("Perform LMNN.", self.verbose) + + results = self.LMNNShogun(options) + if results < 0: + return results + + return {'Runtime' : results} diff --git a/tests/benchmark_lmnn.py b/tests/benchmark_lmnn.py new file mode 100644 index 0000000..3b9fcf0 --- /dev/null +++ b/tests/benchmark_lmnn.py @@ -0,0 +1,100 @@ +''' + @file benchmark_lmnn.py + @author Manish Kumar + + Test for the Large Margin Nearest Neighbors scripts. +''' + +import unittest + +import os, sys, inspect + +# Import the util path, this method even works if the path contains +# symlinks to modules. 
+cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join( + os.path.split(inspect.getfile(inspect.currentframe()))[0], '../util'))) +if cmd_subfolder not in sys.path: + sys.path.insert(0, cmd_subfolder) + +from util.loader import * + +''' +Test the mlpack Large Margin Nearest Neighbors script. +''' +class LMNN_MLPACK_TEST(unittest.TestCase): + + ''' + Test initialization. + ''' + def setUp(self): + self.dataset = 'datasets/iris_train.csv' + self.verbose = False + self.timeout = 240 + + module = Loader.ImportModuleFromPath("methods/mlpack/lmnn.py") + obj = getattr(module, "LMNN") + self.instance = obj(self.dataset, verbose=self.verbose, timeout=self.timeout) + + ''' + Test the constructor. + ''' + def test_Constructor(self): + self.assertEqual(self.instance.verbose, self.verbose) + self.assertEqual(self.instance.timeout, self.timeout) + self.assertEqual(self.instance.dataset, self.dataset) + + ''' + Test the 'RunMetrics' function. + ''' + def test_RunMetrics(self): + result = self.instance.RunMetrics({}) + self.assertTrue(result["Runtime"] > 0) + + ''' + Test the destructor. + ''' + def test_Destructor(self): + del self.instance + + clean = True + filelist = ["gmon.out", "distance.csv"] + for f in filelist: + if os.path.isfile(f): + clean = False + + self.assertTrue(clean) + +if __name__ == '__main__': + unittest.main() + +''' +Test the shogun Large Margin Nearest Neighbors script. +''' +class LMNN_SHOGUN_TEST(unittest.TestCase): + + ''' + Test initialization. + ''' + def setUp(self): + self.dataset = "datasets/iris.csv" + self.verbose = False + self.timeout = 240 + + module = Loader.ImportModuleFromPath("methods/shogun/lmnn.py") + obj = getattr(module, "LMNN") + self.instance = obj(self.dataset, verbose=self.verbose, timeout=self.timeout) + + ''' + Test the constructor. 
+ ''' + def test_Constructor(self): + self.assertEqual(self.instance.verbose, self.verbose) + self.assertEqual(self.instance.timeout, self.timeout) + self.assertEqual(self.instance.dataset, self.dataset) + + ''' + Test the 'RunMetrics' function. + ''' + def test_RunMetrics(self): + result = self.instance.RunMetrics({}) + self.assertTrue(result["Runtime"] > 0) From 9902eb5bab85e85aed72ef9d2c77c8bc61eb5019 Mon Sep 17 00:00:00 2001 From: Manish Date: Fri, 15 Jun 2018 19:39:04 +0530 Subject: [PATCH 02/15] update datasets --- config.yaml | 46 +++++++++++++++++++++++------------------- methods/mlpack/lmnn.py | 8 ++++---- methods/shogun/lmnn.py | 10 ++++----- 3 files changed, 33 insertions(+), 31 deletions(-) diff --git a/config.yaml b/config.yaml index b473edb..0d5ca68 100644 --- a/config.yaml +++ b/config.yaml @@ -800,16 +800,16 @@ methods: format: [csv, txt] datasets: - files: ['datasets/iris_train.csv', - ['datasets/diabetes_X.csv', 'datasets/diabetes_y.csv'], - 'datasets/wine.csv', 'datasets/ionosphere.csv', - 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv'] + 'datasets/wine_qual.csv', 'datasets/ionosphere.csv', + 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv', + 'datasets/diabetes.csv'] options: passes: 10 range: 25 seed: 42 - files: ['datasets/letter_recognition.csv', - 'datasets/shuttle_train.csv', 'datasets/isolet.csv', + 'datasets/shuttle_train.csv', 'datasets/isolet_train.csv', 'datasets/covtype.csv', 'datasets/corel-histogram.csv', 'datasets/mnist_all.csv', 'datasets/Twitter.csv'] options: @@ -818,18 +818,18 @@ methods: seed: 42 - files: ['datasets/iris_train.csv', - ['datasets/diabetes_X.csv', 'datasets/diabetes_y.csv'], - 'datasets/wine.csv', 'datasets/ionosphere.csv', - 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv'] + 'datasets/wine_qual.csv', 'datasets/ionosphere.csv', + 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv', + 'datasets/diabetes.csv'] options: passes: 5 optimizer: bbsgd seed: 42 
- files: ['datasets/iris_train.csv', - ['datasets/diabetes_X.csv', 'datasets/diabetes_y.csv'], - 'datasets/wine.csv', 'datasets/ionosphere.csv', - 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv'] + 'datasets/wine_qual.csv', 'datasets/ionosphere.csv', + 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv', + 'datasets/diabetes.csv'] options: passes: 5 optimizer: sgd @@ -838,8 +838,7 @@ methods: seed: 42 - files: ['datasets/iris_train.csv', - ['datasets/diabetes_X.csv', 'datasets/diabetes_y.csv'], - 'datasets/wine.csv', 'datasets/ionosphere.csv', + 'datasets/wine_qual.csv', 'datasets/ionosphere.csv', 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv'] options: num_targets: 5 @@ -850,8 +849,8 @@ methods: range: 50 - files: ['datasets/covtype.csv', - 'datasets/shuttle_train.csv', 'datasets/isolet.csv', - 'datasets/mnist_all.csv'] + 'datasets/shuttle_train.csv', 'datasets/isolet_train.csv', + 'datasets/mnist_all.csv', 'datasets/diabetes.csv'] options: max_iterations: 2000 optimizer: lbfgs @@ -2243,13 +2242,18 @@ methods: script: methods/shogun/lmnn.py format: [csv, txt] datasets: - - files: ['datasets/iris_train.csv', - ['datasets/diabetes_X.csv', 'datasets/diabetes_y.csv'], - 'datasets/wine.csv', 'datasets/ionosphere.csv', - 'datasets/shuttle_train.csv', 'datasets/isolet.csv', - 'datasets/covtype.csv', 'datasets/corel-histogram.csv', - 'datasets/mnist_all.csv', 'datasets/Twitter.csv', - 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv'] + - files: [ ['datasets/iris_train.csv'], + ['datasets/diabetes.csv'], + ['datasets/isolet_train.csv'], + ['datasets/wine_qual.csv'], + ['datasets/ionosphere.csv'], + ['datasets/shuttle_train.csv'], + ['datasets/covtype.csv'], + ['datasets/corel-histogram.csv'], + ['datasets/mnist_all.csv'], + ['datasets/Twitter.csv'], + ['datasets/balance_scale.csv'], + ['datasets/letter_recognition.csv']] QDA: run: ['metric','metric'] diff --git a/methods/mlpack/lmnn.py b/methods/mlpack/lmnn.py 
index 00332b7..23c2702 100644 --- a/methods/mlpack/lmnn.py +++ b/methods/mlpack/lmnn.py @@ -1,5 +1,5 @@ ''' - @file nca.py + @file lmnn.py @author Manish Kumar Class to benchmark the mlpack Large Margin Nearest Neighbors method. @@ -16,8 +16,8 @@ if cmd_subfolder not in sys.path: sys.path.insert(0, cmd_subfolder) -from util.log import * -from util.profiler import * +from log import * +from profiler import * import shlex @@ -52,7 +52,7 @@ def __init__(self, dataset, timeout=0, path=os.environ["BINPATH"], self.debug = debug # Get description from executable. - cmd = shlex.split(self.path + "mlpack_LMNN -h") + cmd = shlex.split(self.path + "mlpack_lmnn -h") try: s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False) except Exception as e: diff --git a/methods/shogun/lmnn.py b/methods/shogun/lmnn.py index 1bbf962..4f52ba3 100644 --- a/methods/shogun/lmnn.py +++ b/methods/shogun/lmnn.py @@ -19,6 +19,8 @@ from log import * from timer import * +from definitions import * +from misc import * import numpy as np from modshogun import RealFeatures @@ -56,12 +58,8 @@ def RunLMNNShogun(): # Load input dataset. Log.Info("Loading dataset", self.verbose) - if len(self.dataset) == 2: - X = self.dataset[0] - y = self.dataset[1] - else: - # Use the last row of the training set as the responses. - X, y = SplitTrainData(self.dataset) + # Use the last row of the training set as the responses. 
+ X, y = SplitTrainData(self.dataset) try: feat = RealFeatures(self.X.T) labels = MulticlassLabels(y.astype(numpy.float64)) From 45f6e1abedb2de0ccaf67cc22456c66583724537 Mon Sep 17 00:00:00 2001 From: Manish Date: Fri, 15 Jun 2018 22:42:50 +0530 Subject: [PATCH 03/15] update shogun lmnn --- methods/shogun/lmnn.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/methods/shogun/lmnn.py b/methods/shogun/lmnn.py index 4f52ba3..b72b899 100644 --- a/methods/shogun/lmnn.py +++ b/methods/shogun/lmnn.py @@ -17,6 +17,12 @@ if cmd_subfolder not in sys.path: sys.path.insert(0, cmd_subfolder) +#Import the metrics definitions path. +metrics_folder = os.path.realpath(os.path.abspath(os.path.join( + os.path.split(inspect.getfile(inspect.currentframe()))[0], "../metrics"))) +if metrics_folder not in sys.path: + sys.path.insert(0, metrics_folder) + from log import * from timer import * from definitions import * @@ -61,8 +67,8 @@ def RunLMNNShogun(): # Use the last row of the training set as the responses. X, y = SplitTrainData(self.dataset) try: - feat = RealFeatures(self.X.T) - labels = MulticlassLabels(y.astype(numpy.float64)) + feat = RealFeatures(X.T) + labels = MulticlassLabels(y.astype(np.float64)) with totalTimer: # Get the options for running LMNN. 
@@ -74,7 +80,7 @@ def RunLMNNShogun(): if "maxiter" in options: n = int(options.pop("maxiter")) else: - n = 1000 + n = 2000 if len(options) > 0: Log.Fatal("Unknown parameters: " + str(options)) From 5e0c6b2657b30ad2751331efe176d5f35b81ad91 Mon Sep 17 00:00:00 2001 From: Manish Date: Sat, 16 Jun 2018 18:53:42 +0530 Subject: [PATCH 04/15] Added metrics --- config.yaml | 1 - methods/mlpack/lmnn.py | 29 ++++++++++++++++++++++++++++- methods/shogun/lmnn.py | 29 ++++++++++++++++++++++++++++- 3 files changed, 56 insertions(+), 3 deletions(-) diff --git a/config.yaml b/config.yaml index 0d5ca68..445ccc7 100644 --- a/config.yaml +++ b/config.yaml @@ -2243,7 +2243,6 @@ methods: format: [csv, txt] datasets: - files: [ ['datasets/iris_train.csv'], - ['datasets/diabetes.csv'], ['datasets/isolet_train.csv'], ['datasets/wine_qual.csv'], ['datasets/ionosphere.csv'], diff --git a/methods/mlpack/lmnn.py b/methods/mlpack/lmnn.py index 23c2702..fa674cd 100644 --- a/methods/mlpack/lmnn.py +++ b/methods/mlpack/lmnn.py @@ -16,16 +16,27 @@ if cmd_subfolder not in sys.path: sys.path.insert(0, cmd_subfolder) +#Import the metrics definitions path. +metrics_folder = os.path.realpath(os.path.abspath(os.path.join( + os.path.split(inspect.getfile(inspect.currentframe()))[0], "../metrics"))) +if metrics_folder not in sys.path: + sys.path.insert(0, metrics_folder) + from log import * from profiler import * +from definitions import * +from misc import * import shlex +from modshogun import MulticlassLabels, RealFeatures +from modshogun import KNN, KNN_COVER_TREE, EuclideanDistance try: import subprocess32 as subprocess except ImportError: import subprocess +import numpy as np import re import collections @@ -193,9 +204,25 @@ def RunMetrics(self, options): if timer != -1: metrics['Runtime'] = timer.total_time - timer.saving_data - timer.loading_data - Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose) + # Predict labels. 
+ distance = LoadDataset("distance.csv") + feat = RealFeatures(np.dot(distance, self.dataset[0])) + labels = MulticlassLabels(self.dataset[1]) + dist = EuclideanDistance() + knn = KNN(1, dist, labels) + if "num_targets" in options: + knn.set_k(options.pop("num_targets")) + + knn.train(feat) + knn.set_knn_solver_type(KNN_COVER_TREE) + pred = knn.apply_multiclass(feat) + + predictions = pred.get_int_labels() + confusionMatrix = Metrics.ConfusionMatrix(self.dataset[1], predictions) + metrics['Avg Accuracy'] = Metrics.AverageAccuracy(confusionMatrix) + return metrics ''' diff --git a/methods/shogun/lmnn.py b/methods/shogun/lmnn.py index b72b899..5df0485 100644 --- a/methods/shogun/lmnn.py +++ b/methods/shogun/lmnn.py @@ -32,6 +32,7 @@ from modshogun import RealFeatures from modshogun import MulticlassLabels from modshogun import LMNN as ShogunLMNN +from modshogun import KNN, KNN_COVER_TREE, EuclideanDistance ''' This class implements the Large Margin Nearest Neighbors benchmark. @@ -49,6 +50,7 @@ def __init__(self, dataset, timeout=0, verbose=True): self.verbose = verbose self.dataset = dataset self.timeout = timeout + self.predictions = None ''' Use the shogun libary to implement Large Margin Nearest Neighbors. @@ -93,7 +95,24 @@ def RunLMNNShogun(): except Exception as e: return -1 - return totalTimer.ElapsedTime() + time = totalTimer.ElapsedTime() + + # Predict labels. + distance = prep.get_linear_transform() + feat = RealFeatures(np.dot(distance, X)) + labels = MulticlassLabels(y) + dist = EuclideanDistance() + knn = KNN(1, dist, labels) + if "k" in options: + knn.set_k(options.pop("k")) + + knn.train(feat) + knn.set_knn_solver_type(KNN_COVER_TREE) + pred = knn.apply_multiclass(feat) + + self.predictions = pred.get_int_labels() + + return [time, self.predictions] try: return RunLMNNShogun() @@ -115,4 +134,12 @@ def RunMetrics(self, options): if results < 0: return results + # Datastructure to store the results. 
+ metrics = {} + + X, y = SplitTrainData(self.dataset) + confusionMatrix = Metrics.ConfusionMatrix(y, self.predictions) + metrics['Runtime'] = results + metrics['Avg Accuracy'] = Metrics.AverageAccuracy(confusionMatrix) + return {'Runtime' : results} From 9027de2dd3bae2f501d9057c8ff56e7178e2993b Mon Sep 17 00:00:00 2001 From: Manish Date: Sun, 17 Jun 2018 00:14:16 +0530 Subject: [PATCH 05/15] update metrics --- methods/mlpack/lmnn.py | 15 ++++++++------- methods/shogun/lmnn.py | 30 ++++++++++++++---------------- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/methods/mlpack/lmnn.py b/methods/mlpack/lmnn.py index fa674cd..9f27796 100644 --- a/methods/mlpack/lmnn.py +++ b/methods/mlpack/lmnn.py @@ -28,7 +28,7 @@ from misc import * import shlex -from modshogun import MulticlassLabels, RealFeatures +from modshogun import MulticlassLabels, RealFeatures, MulticlassAccuracy from modshogun import KNN, KNN_COVER_TREE, EuclideanDistance try: @@ -208,8 +208,10 @@ def RunMetrics(self, options): # Predict labels. 
distance = LoadDataset("distance.csv") - feat = RealFeatures(np.dot(distance, self.dataset[0])) - labels = MulticlassLabels(self.dataset[1]) + data = np.genfromtxt(self.dataset, delimiter=',') + transformedData = np.dot(data[:,:-1], distance.T) + feat = RealFeatures(transformedData.T) + labels = MulticlassLabels(data[:, (data.shape[1] - 1)].astype(np.float64)) dist = EuclideanDistance() knn = KNN(1, dist, labels) if "num_targets" in options: @@ -218,10 +220,9 @@ def RunMetrics(self, options): knn.train(feat) knn.set_knn_solver_type(KNN_COVER_TREE) pred = knn.apply_multiclass(feat) - - predictions = pred.get_int_labels() - confusionMatrix = Metrics.ConfusionMatrix(self.dataset[1], predictions) - metrics['Avg Accuracy'] = Metrics.AverageAccuracy(confusionMatrix) + evaluator = MulticlassAccuracy() + accuracy = evaluator.evaluate(pred, labels) + metrics['Avg Accuracy'] = accuracy return metrics diff --git a/methods/shogun/lmnn.py b/methods/shogun/lmnn.py index 5df0485..d065e14 100644 --- a/methods/shogun/lmnn.py +++ b/methods/shogun/lmnn.py @@ -1,5 +1,5 @@ ''' - @file lmnn.py + file lmnn.py @author Manish Kumar Large Margin Nearest Neighbors with shogun. @@ -30,7 +30,7 @@ import numpy as np from modshogun import RealFeatures -from modshogun import MulticlassLabels +from modshogun import MulticlassLabels, MulticlassAccuracy from modshogun import LMNN as ShogunLMNN from modshogun import KNN, KNN_COVER_TREE, EuclideanDistance @@ -50,7 +50,6 @@ def __init__(self, dataset, timeout=0, verbose=True): self.verbose = verbose self.dataset = dataset self.timeout = timeout - self.predictions = None ''' Use the shogun libary to implement Large Margin Nearest Neighbors. @@ -99,8 +98,9 @@ def RunLMNNShogun(): # Predict labels. 
distance = prep.get_linear_transform() - feat = RealFeatures(np.dot(distance, X)) - labels = MulticlassLabels(y) + transformedData = np.dot(X, distance.T) + feat = RealFeatures(transformedData.T) + labels = MulticlassLabels(y.astype(np.float64)) dist = EuclideanDistance() knn = KNN(1, dist, labels) if "k" in options: @@ -109,10 +109,10 @@ def RunLMNNShogun(): knn.train(feat) knn.set_knn_solver_type(KNN_COVER_TREE) pred = knn.apply_multiclass(feat) - - self.predictions = pred.get_int_labels() - - return [time, self.predictions] + evaluator = MulticlassAccuracy() + accuracy = evaluator.evaluate(pred, labels) + print(accuracy) + return [time, accuracy] try: return RunLMNNShogun() @@ -131,15 +131,13 @@ def RunMetrics(self, options): Log.Info("Perform LMNN.", self.verbose) results = self.LMNNShogun(options) - if results < 0: - return results + if results[0] < 0: + return results[0] # Datastructure to store the results. metrics = {} + metrics['Runtime'] = results[0] + metrics['Avg Accuracy'] = results[1] - X, y = SplitTrainData(self.dataset) - confusionMatrix = Metrics.ConfusionMatrix(y, self.predictions) - metrics['Runtime'] = results - metrics['Avg Accuracy'] = Metrics.AverageAccuracy(confusionMatrix) + return metrics - return {'Runtime' : results} From 874e862f889ea18b7150ffdbe24c6f45535220e3 Mon Sep 17 00:00:00 2001 From: Manish Date: Mon, 18 Jun 2018 09:41:01 +0530 Subject: [PATCH 06/15] solve num_targets issue --- config.yaml | 28 ++++++++++++++++++---------- methods/mlpack/lmnn.py | 11 +++++------ methods/shogun/lmnn.py | 15 +++++---------- 3 files changed, 28 insertions(+), 26 deletions(-) diff --git a/config.yaml b/config.yaml index 445ccc7..f4a5070 100644 --- a/config.yaml +++ b/config.yaml @@ -800,37 +800,41 @@ methods: format: [csv, txt] datasets: - files: ['datasets/iris_train.csv', + ['datasets/diabetes_X.csv', 'datasets/diabetes_y.csv'], 'datasets/wine_qual.csv', 'datasets/ionosphere.csv', - 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv', 
- 'datasets/diabetes.csv'] + 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv'] options: + num_targets: 5 passes: 10 range: 25 seed: 42 - files: ['datasets/letter_recognition.csv', + ['datasets/diabetes_X.csv', 'datasets/diabetes_y.csv'], 'datasets/shuttle_train.csv', 'datasets/isolet_train.csv', 'datasets/covtype.csv', 'datasets/corel-histogram.csv', 'datasets/mnist_all.csv', 'datasets/Twitter.csv'] options: + num_targets: 3 passes: 3 range: 100 seed: 42 - files: ['datasets/iris_train.csv', - 'datasets/wine_qual.csv', 'datasets/ionosphere.csv', - 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv', - 'datasets/diabetes.csv'] + ['datasets/diabetes_X.csv', 'datasets/diabetes_y.csv'], + 'datasets/balance_scale.csv', 'datasets/ionosphere.csv'] options: + num_targets: 3 passes: 5 optimizer: bbsgd seed: 42 - files: ['datasets/iris_train.csv', + ['datasets/diabetes_X.csv', 'datasets/diabetes_y.csv'], 'datasets/wine_qual.csv', 'datasets/ionosphere.csv', - 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv', - 'datasets/diabetes.csv'] + 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv'] options: + num_targets: 3 passes: 5 optimizer: sgd range: 50 @@ -838,10 +842,11 @@ methods: seed: 42 - files: ['datasets/iris_train.csv', + ['datasets/diabetes_X.csv', 'datasets/diabetes_y.csv'], 'datasets/wine_qual.csv', 'datasets/ionosphere.csv', 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv'] options: - num_targets: 5 + num_targets: 3 max_iterations: 2000 optimizer: lbfgs seed: 42 @@ -850,8 +855,9 @@ methods: - files: ['datasets/covtype.csv', 'datasets/shuttle_train.csv', 'datasets/isolet_train.csv', - 'datasets/mnist_all.csv', 'datasets/diabetes.csv'] + 'datasets/mnist_all.csv', 'datasets/letter_recognition.csv'] options: + num_targets: 5 max_iterations: 2000 optimizer: lbfgs seed: 42 @@ -2243,8 +2249,8 @@ methods: format: [csv, txt] datasets: - files: [ ['datasets/iris_train.csv'], - ['datasets/isolet_train.csv'], 
['datasets/wine_qual.csv'], + ['datasets/isolet_train.csv'], ['datasets/ionosphere.csv'], ['datasets/shuttle_train.csv'], ['datasets/covtype.csv'], @@ -2253,6 +2259,8 @@ methods: ['datasets/Twitter.csv'], ['datasets/balance_scale.csv'], ['datasets/letter_recognition.csv']] + options: + k: 3 QDA: run: ['metric','metric'] diff --git a/methods/mlpack/lmnn.py b/methods/mlpack/lmnn.py index 9f27796..9bc3ae7 100644 --- a/methods/mlpack/lmnn.py +++ b/methods/mlpack/lmnn.py @@ -61,6 +61,7 @@ def __init__(self, dataset, timeout=0, path=os.environ["BINPATH"], self.path = path self.timeout = timeout self.debug = debug + self.k = 1 # Get description from executable. cmd = shlex.split(self.path + "mlpack_lmnn -h") @@ -101,7 +102,8 @@ def OptionsToStr(self, options): if "optimizer" in options: optionsStr = "-O " + str(options.pop("optimizer")) if "num_targets" in options: - optionsStr = optionsStr + " -k " + str(options.pop("num_targets")) + self.k = options.pop("num_targets") + optionsStr = optionsStr + " -k " + str(self.k) if "regularization" in options: optionsStr = optionsStr + " -r " + str(options.pop("regularization")) if "tolerance" in options: @@ -213,16 +215,13 @@ def RunMetrics(self, options): feat = RealFeatures(transformedData.T) labels = MulticlassLabels(data[:, (data.shape[1] - 1)].astype(np.float64)) dist = EuclideanDistance() - knn = KNN(1, dist, labels) - if "num_targets" in options: - knn.set_k(options.pop("num_targets")) - + knn = KNN(self.k, dist, labels) knn.train(feat) knn.set_knn_solver_type(KNN_COVER_TREE) pred = knn.apply_multiclass(feat) evaluator = MulticlassAccuracy() accuracy = evaluator.evaluate(pred, labels) - metrics['Avg Accuracy'] = accuracy + metrics['Accuracy'] = accuracy return metrics diff --git a/methods/shogun/lmnn.py b/methods/shogun/lmnn.py index d065e14..cca3d98 100644 --- a/methods/shogun/lmnn.py +++ b/methods/shogun/lmnn.py @@ -50,6 +50,7 @@ def __init__(self, dataset, timeout=0, verbose=True): self.verbose = verbose self.dataset = 
dataset self.timeout = timeout + self.k = 1 ''' Use the shogun libary to implement Large Margin Nearest Neighbors. @@ -74,9 +75,7 @@ def RunLMNNShogun(): with totalTimer: # Get the options for running LMNN. if "k" in options: - k = int(options.pop("k")) - else: - k = 1 + self.k = int(options.pop("k")) if "maxiter" in options: n = int(options.pop("maxiter")) @@ -101,17 +100,13 @@ def RunLMNNShogun(): transformedData = np.dot(X, distance.T) feat = RealFeatures(transformedData.T) labels = MulticlassLabels(y.astype(np.float64)) - dist = EuclideanDistance() - knn = KNN(1, dist, labels) - if "k" in options: - knn.set_k(options.pop("k")) - + dist = EuclideanDistance(feat, feat) + knn = KNN(self.k, dist, labels) knn.train(feat) knn.set_knn_solver_type(KNN_COVER_TREE) pred = knn.apply_multiclass(feat) evaluator = MulticlassAccuracy() accuracy = evaluator.evaluate(pred, labels) - print(accuracy) return [time, accuracy] try: @@ -137,7 +132,7 @@ def RunMetrics(self, options): # Datastructure to store the results. 
metrics = {} metrics['Runtime'] = results[0] - metrics['Avg Accuracy'] = results[1] + metrics['Accuracy'] = results[1] return metrics From f6eb97ddb065700bb931d440b5e02d05520c1492 Mon Sep 17 00:00:00 2001 From: Manish Date: Mon, 18 Jun 2018 10:39:08 +0530 Subject: [PATCH 07/15] Small update --- methods/mlpack/lmnn.py | 5 ++--- methods/shogun/lmnn.py | 7 +++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/methods/mlpack/lmnn.py b/methods/mlpack/lmnn.py index 9bc3ae7..b2e6d79 100644 --- a/methods/mlpack/lmnn.py +++ b/methods/mlpack/lmnn.py @@ -29,7 +29,7 @@ import shlex from modshogun import MulticlassLabels, RealFeatures, MulticlassAccuracy -from modshogun import KNN, KNN_COVER_TREE, EuclideanDistance +from modshogun import KNN, EuclideanDistance try: import subprocess32 as subprocess @@ -214,10 +214,9 @@ def RunMetrics(self, options): transformedData = np.dot(data[:,:-1], distance.T) feat = RealFeatures(transformedData.T) labels = MulticlassLabels(data[:, (data.shape[1] - 1)].astype(np.float64)) - dist = EuclideanDistance() + dist = EuclideanDistance(feat, feat) knn = KNN(self.k, dist, labels) knn.train(feat) - knn.set_knn_solver_type(KNN_COVER_TREE) pred = knn.apply_multiclass(feat) evaluator = MulticlassAccuracy() accuracy = evaluator.evaluate(pred, labels) diff --git a/methods/shogun/lmnn.py b/methods/shogun/lmnn.py index cca3d98..c612b50 100644 --- a/methods/shogun/lmnn.py +++ b/methods/shogun/lmnn.py @@ -32,7 +32,7 @@ from modshogun import RealFeatures from modshogun import MulticlassLabels, MulticlassAccuracy from modshogun import LMNN as ShogunLMNN -from modshogun import KNN, KNN_COVER_TREE, EuclideanDistance +from modshogun import KNN, EuclideanDistance ''' This class implements the Large Margin Nearest Neighbors benchmark. 
@@ -91,7 +91,7 @@ def RunLMNNShogun(): prep.set_maxiter(n) prep.train() except Exception as e: - return -1 + return [-1, -1] time = totalTimer.ElapsedTime() @@ -103,7 +103,6 @@ def RunLMNNShogun(): dist = EuclideanDistance(feat, feat) knn = KNN(self.k, dist, labels) knn.train(feat) - knn.set_knn_solver_type(KNN_COVER_TREE) pred = knn.apply_multiclass(feat) evaluator = MulticlassAccuracy() accuracy = evaluator.evaluate(pred, labels) @@ -112,7 +111,7 @@ def RunLMNNShogun(): try: return RunLMNNShogun() except timeout_decorator.TimeoutError: - return -1 + return [-1, -1] ''' Perform Large Margin Nearest Neighbors. If the method has been successfully From e34d0721585f8bbfc3005c8e8ff0d97d4b97fc7a Mon Sep 17 00:00:00 2001 From: Manish Date: Mon, 18 Jun 2018 21:12:44 +0530 Subject: [PATCH 08/15] Resolve k error --- methods/shogun/lmnn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/methods/shogun/lmnn.py b/methods/shogun/lmnn.py index c612b50..8d3ede4 100644 --- a/methods/shogun/lmnn.py +++ b/methods/shogun/lmnn.py @@ -87,7 +87,7 @@ def RunLMNNShogun(): raise Exception("unknown parameters") # Perform LMNN. 
- prep = ShogunLMNN(feat, labels, k) + prep = ShogunLMNN(feat, labels, self.k) prep.set_maxiter(n) prep.train() except Exception as e: From fa381795a71afef2b4812266353310e957ceb00d Mon Sep 17 00:00:00 2001 From: Manish Date: Tue, 19 Jun 2018 22:19:37 +0530 Subject: [PATCH 09/15] Add KNN accuracy predictor --- methods/mlpack/lmnn.py | 33 +++++++++++++++++++++++++++------ methods/shogun/lmnn.py | 32 +++++++++++++++++++++++++++----- 2 files changed, 54 insertions(+), 11 deletions(-) diff --git a/methods/mlpack/lmnn.py b/methods/mlpack/lmnn.py index b2e6d79..99af5e1 100644 --- a/methods/mlpack/lmnn.py +++ b/methods/mlpack/lmnn.py @@ -28,7 +28,7 @@ from misc import * import shlex -from modshogun import MulticlassLabels, RealFeatures, MulticlassAccuracy +from modshogun import MulticlassLabels, RealFeatures from modshogun import KNN, EuclideanDistance try: @@ -215,12 +215,33 @@ def RunMetrics(self, options): feat = RealFeatures(transformedData.T) labels = MulticlassLabels(data[:, (data.shape[1] - 1)].astype(np.float64)) dist = EuclideanDistance(feat, feat) - knn = KNN(self.k, dist, labels) + knn = KNN(self.k + 1, dist, labels) knn.train(feat) - pred = knn.apply_multiclass(feat) - evaluator = MulticlassAccuracy() - accuracy = evaluator.evaluate(pred, labels) - metrics['Accuracy'] = accuracy + # Get nearest neighbors. + NN = knn.nearest_neighbors() + NN = np.delete(NN, 0, 0) + # Compute unique labels. + uniqueLabels = np.unique(labels) + # Keep count correct predictions. 
+ count = 0 + # Normalize labels + for i in range(data.shape[0]): + for j in range(len(uniqueLabels)): + if (labels[i] == uniqueLabels[j]): + labels[i] = j + break + + for i in range(NN.shape[1]): + Map = [0 for x in range(len(uniqueLabels))] + for j in range(NN.shape[0]): + dist = np.linalg.norm(data[NN[j][i],:] - data[i,:]) + # Add constant factor of 1 incase two points overlap + Map[int(labels[NN[j, i]])] += 1 / (dist + 1)**2 + maxInd = np.argmax(Map) + if (maxInd == labels[i]): + count += 1 + + metrics['Accuracy'] = (count / NN.shape[1]) * 100 return metrics diff --git a/methods/shogun/lmnn.py b/methods/shogun/lmnn.py index 8d3ede4..ff8de99 100644 --- a/methods/shogun/lmnn.py +++ b/methods/shogun/lmnn.py @@ -30,7 +30,7 @@ import numpy as np from modshogun import RealFeatures -from modshogun import MulticlassLabels, MulticlassAccuracy +from modshogun import MulticlassLabels from modshogun import LMNN as ShogunLMNN from modshogun import KNN, EuclideanDistance @@ -101,11 +101,33 @@ def RunLMNNShogun(): feat = RealFeatures(transformedData.T) labels = MulticlassLabels(y.astype(np.float64)) dist = EuclideanDistance(feat, feat) - knn = KNN(self.k, dist, labels) + knn = KNN(self.k + 1, dist, labels) knn.train(feat) - pred = knn.apply_multiclass(feat) - evaluator = MulticlassAccuracy() - accuracy = evaluator.evaluate(pred, labels) + # Get nearest neighbors. + NN = knn.nearest_neighbors() + NN = np.delete(NN, 0, 0) + # Compute unique labels. + uniqueLabels = np.unique(labels) + # Keep count correct predictions. 
+ count = 0 + # Normalize labels + for i in range(X.shape[0]): + for j in range(len(uniqueLabels)): + if (labels[i] == uniqueLabels[j]): + labels[i] = j + break + + for i in range(NN.shape[1]): + Map = [0 for x in range(len(uniqueLabels))] + for j in range(NN.shape[0]): + dist = np.linalg.norm(X[NN[j][i],:] - X[i,:]) + # Add constant factor of 1 incase two points overlap + Map[int(labels[NN[j, i]])] += 1 / (dist + 1)**2 + maxInd = np.argmax(Map) + if (maxInd == labels[i]): + count += 1 + + accuracy = (count / NN.shape[1]) * 100 return [time, accuracy] try: From a488dba429425901e51327b715efa123e5362e8f Mon Sep 17 00:00:00 2001 From: Manish Date: Wed, 20 Jun 2018 10:40:23 +0530 Subject: [PATCH 10/15] Add KNNAccuracy to metrics --- methods/metrics/definitions.py | 51 ++++++++++++++++++++++++++++++++++ methods/mlpack/lmnn.py | 42 ++++++---------------------- methods/shogun/lmnn.py | 50 ++++++++++----------------------- 3 files changed, 74 insertions(+), 69 deletions(-) diff --git a/methods/metrics/definitions.py b/methods/metrics/definitions.py index 262fc56..4f35bd8 100644 --- a/methods/metrics/definitions.py +++ b/methods/metrics/definitions.py @@ -9,6 +9,10 @@ import numpy as np import math +from modshogun import RealFeatures +from modshogun import MulticlassLabels +from modshogun import KNN, EuclideanDistance + class Metrics(object): ''' @@ -466,3 +470,50 @@ def SimpleMeanSquaredError(truelabels, predictedlabels): simplemse += difference * difference simplemse /= n return simplemse + + ''' + @param distance - Matrix containing learned distance. + @param data - List containing data & true labels. + @param k - Number of targets used calculation. + @param flag - Switch to control whether to use distance weighted KNN or not. + This method computes the accuracy based on the true labels and + predicted labels from knn classifier. 
+ ''' + @staticmethod + def KNNAccuracy(distance, data, k, flag): + transformedData = np.dot(data[0], distance.T) + feat = RealFeatures(transformedData.T) + labels = MulticlassLabels(data[1].astype(np.float64)) + dist = EuclideanDistance(feat, feat) + knn = KNN(k + 1, dist, labels) + knn.train(feat) + # Get nearest neighbors. + nn = knn.nearest_neighbors() + nn = np.delete(nn, 0, 0) + # Compute unique labels. + uniqueLabels = np.unique(labels) + # Keep count correct predictions. + count = 0 + # Normalize labels + for i in range(data[0].shape[0]): + for j in range(len(uniqueLabels)): + if (labels[i] == uniqueLabels[j]): + labels[i] = j + break + + for i in range(nn.shape[1]): + mapLabels = [0 for x in range(len(uniqueLabels))] + for j in range(nn.shape[0]): + if (flag): + distPoints = np.linalg.norm(data[0][nn[j][i],:] - data[0][i,:]) + # Add constant factor of 1 incase two points overlap + mapLabels[int(labels[nn[j, i]])] += 1 / (distPoints + 1)**2 + else: + # Subtract a variable factor to avoid draw condition without + # affecting actual result. + mapLabels[int(labels[nn[j, i]])] += 1 - j * 1e-8 + maxInd = np.argmax(mapLabels) + if (maxInd == labels[i]): + count += 1 + accuracy = (count / nn.shape[1]) * 100 + return accuracy diff --git a/methods/mlpack/lmnn.py b/methods/mlpack/lmnn.py index 99af5e1..dbd0b01 100644 --- a/methods/mlpack/lmnn.py +++ b/methods/mlpack/lmnn.py @@ -28,8 +28,6 @@ from misc import * import shlex -from modshogun import MulticlassLabels, RealFeatures -from modshogun import KNN, EuclideanDistance try: import subprocess32 as subprocess @@ -208,40 +206,16 @@ def RunMetrics(self, options): metrics['Runtime'] = timer.total_time - timer.saving_data - timer.loading_data Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose) - # Predict labels. + # Get distance. 
distance = LoadDataset("distance.csv") data = np.genfromtxt(self.dataset, delimiter=',') - transformedData = np.dot(data[:,:-1], distance.T) - feat = RealFeatures(transformedData.T) - labels = MulticlassLabels(data[:, (data.shape[1] - 1)].astype(np.float64)) - dist = EuclideanDistance(feat, feat) - knn = KNN(self.k + 1, dist, labels) - knn.train(feat) - # Get nearest neighbors. - NN = knn.nearest_neighbors() - NN = np.delete(NN, 0, 0) - # Compute unique labels. - uniqueLabels = np.unique(labels) - # Keep count correct predictions. - count = 0 - # Normalize labels - for i in range(data.shape[0]): - for j in range(len(uniqueLabels)): - if (labels[i] == uniqueLabels[j]): - labels[i] = j - break - - for i in range(NN.shape[1]): - Map = [0 for x in range(len(uniqueLabels))] - for j in range(NN.shape[0]): - dist = np.linalg.norm(data[NN[j][i],:] - data[i,:]) - # Add constant factor of 1 incase two points overlap - Map[int(labels[NN[j, i]])] += 1 / (dist + 1)**2 - maxInd = np.argmax(Map) - if (maxInd == labels[i]): - count += 1 - - metrics['Accuracy'] = (count / NN.shape[1]) * 100 + + dataList = [data[:,:-1], data[:, (data.shape[1] - 1)]] + metrics['Accuracy_1_NN'] = Metrics.KNNAccuracy(distance, dataList, 1, False) + metrics['Accuracy_3_NN'] = Metrics.KNNAccuracy(distance, dataList, 3, False) + metrics['Accuracy_3_NN_DW'] = Metrics.KNNAccuracy(distance, dataList, 3, True) + metrics['Accuracy_5_NN'] = Metrics.KNNAccuracy(distance, dataList, 5, False) + metrics['Accuracy_5_NN_DW'] = Metrics.KNNAccuracy(distance, dataList, 5, True) return metrics diff --git a/methods/shogun/lmnn.py b/methods/shogun/lmnn.py index ff8de99..a32fb0e 100644 --- a/methods/shogun/lmnn.py +++ b/methods/shogun/lmnn.py @@ -32,7 +32,6 @@ from modshogun import RealFeatures from modshogun import MulticlassLabels from modshogun import LMNN as ShogunLMNN -from modshogun import KNN, EuclideanDistance ''' This class implements the Large Margin Nearest Neighbors benchmark. 
@@ -95,40 +94,17 @@ def RunLMNNShogun(): time = totalTimer.ElapsedTime() - # Predict labels. + # Get distance. distance = prep.get_linear_transform() - transformedData = np.dot(X, distance.T) - feat = RealFeatures(transformedData.T) - labels = MulticlassLabels(y.astype(np.float64)) - dist = EuclideanDistance(feat, feat) - knn = KNN(self.k + 1, dist, labels) - knn.train(feat) - # Get nearest neighbors. - NN = knn.nearest_neighbors() - NN = np.delete(NN, 0, 0) - # Compute unique labels. - uniqueLabels = np.unique(labels) - # Keep count correct predictions. - count = 0 - # Normalize labels - for i in range(X.shape[0]): - for j in range(len(uniqueLabels)): - if (labels[i] == uniqueLabels[j]): - labels[i] = j - break - - for i in range(NN.shape[1]): - Map = [0 for x in range(len(uniqueLabels))] - for j in range(NN.shape[0]): - dist = np.linalg.norm(X[NN[j][i],:] - X[i,:]) - # Add constant factor of 1 incase two points overlap - Map[int(labels[NN[j, i]])] += 1 / (dist + 1)**2 - maxInd = np.argmax(Map) - if (maxInd == labels[i]): - count += 1 - - accuracy = (count / NN.shape[1]) * 100 - return [time, accuracy] + dataList = [X, y] + accuracy1NN = Metrics.KNNAccuracy(distance, dataList, 1, False) + accuracy3NN = Metrics.KNNAccuracy(distance, dataList, 3, False) + accuracy3NNDW = Metrics.KNNAccuracy(distance, dataList, 3, True) + accuracy5NN = Metrics.KNNAccuracy(distance, dataList, 5, False) + accuracy5NNDW = Metrics.KNNAccuracy(distance, dataList, 5, True) + + return [time, accuracy1NN, accuracy3NN, accuracy3NNDW, + accuracy5NN, accuracy5NNDW] try: return RunLMNNShogun() @@ -153,7 +129,11 @@ def RunMetrics(self, options): # Datastructure to store the results. 
metrics = {} metrics['Runtime'] = results[0] - metrics['Accuracy'] = results[1] + metrics['Accuracy_1_NN'] = results[1] + metrics['Accuracy_3_NN'] = results[2] + metrics['Accuracy_3_NN_DW'] = results[3] + metrics['Accuracy_5_NN'] = results[4] + metrics['Accuracy_5_NN_DW'] = results[5] return metrics From cd6c562095653078cb61d7fe5cdc110b73aa478c Mon Sep 17 00:00:00 2001 From: Manish Date: Wed, 20 Jun 2018 14:22:23 +0530 Subject: [PATCH 11/15] update datasets --- config.yaml | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/config.yaml b/config.yaml index f4a5070..a865532 100644 --- a/config.yaml +++ b/config.yaml @@ -800,19 +800,19 @@ methods: format: [csv, txt] datasets: - files: ['datasets/iris_train.csv', - ['datasets/diabetes_X.csv', 'datasets/diabetes_y.csv'], - 'datasets/wine_qual.csv', 'datasets/ionosphere.csv', - 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv'] + 'datasets/satellite_train.csv', 'datasets/ionosphere.csv', + 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv', + 'datasets/oilspill_train.csv', 'datasets/shuttle_train.csv', + 'datasets/ecoli_train.csv', 'datasets/vehicle.csv'] options: - num_targets: 5 + num_targets: 3 passes: 10 - range: 25 + range: 20 seed: 42 - files: ['datasets/letter_recognition.csv', - ['datasets/diabetes_X.csv', 'datasets/diabetes_y.csv'], 'datasets/shuttle_train.csv', 'datasets/isolet_train.csv', - 'datasets/covtype.csv', 'datasets/corel-histogram.csv', + 'datasets/covtype.csv', 'datasets/optdigits_train.csv', 'datasets/mnist_all.csv', 'datasets/Twitter.csv'] options: num_targets: 3 @@ -821,7 +821,7 @@ methods: seed: 42 - files: ['datasets/iris_train.csv', - ['datasets/diabetes_X.csv', 'datasets/diabetes_y.csv'], + 'datasets/ecoli_train.csv', 'datasets/vehicle.csv', 'datasets/balance_scale.csv', 'datasets/ionosphere.csv'] options: num_targets: 3 @@ -830,8 +830,8 @@ methods: seed: 42 - files: ['datasets/iris_train.csv', - 
['datasets/diabetes_X.csv', 'datasets/diabetes_y.csv'], - 'datasets/wine_qual.csv', 'datasets/ionosphere.csv', + 'datasets/satellite_train.csv', 'datasets/ionosphere.csv', + 'datasets/ecoli_train.csv', 'datasets/vehicle.csv', 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv'] options: num_targets: 3 @@ -842,8 +842,8 @@ methods: seed: 42 - files: ['datasets/iris_train.csv', - ['datasets/diabetes_X.csv', 'datasets/diabetes_y.csv'], - 'datasets/wine_qual.csv', 'datasets/ionosphere.csv', + 'datasets/satellite_train.csv', 'datasets/ionosphere.csv', + 'datasets/ecoli_train.csv', 'datasets/vehicle.csv', 'datasets/balance_scale.csv', 'datasets/letter_recognition.csv'] options: num_targets: 3 @@ -857,7 +857,7 @@ methods: 'datasets/shuttle_train.csv', 'datasets/isolet_train.csv', 'datasets/mnist_all.csv', 'datasets/letter_recognition.csv'] options: - num_targets: 5 + num_targets: 3 max_iterations: 2000 optimizer: lbfgs seed: 42 @@ -2249,16 +2249,17 @@ methods: format: [csv, txt] datasets: - files: [ ['datasets/iris_train.csv'], - ['datasets/wine_qual.csv'], - ['datasets/isolet_train.csv'], + ['datasets/ecoli_train.csv'], + ['datasets/vehicle.csv'], ['datasets/ionosphere.csv'], ['datasets/shuttle_train.csv'], - ['datasets/covtype.csv'], - ['datasets/corel-histogram.csv'], + ['datasets/letter_recognition.csv'], + ['datasets/balance_scale.csv'], + ['datasets/oilspill_train.csv'], ['datasets/mnist_all.csv'], ['datasets/Twitter.csv'], - ['datasets/balance_scale.csv'], - ['datasets/letter_recognition.csv']] + ['datasets/isolet_train.csv'], + ['datasets/covtype.csv']] options: k: 3 From d6584a4f3058d4885cbeaa89dcc7effd6dab22e3 Mon Sep 17 00:00:00 2001 From: Manish Date: Thu, 21 Jun 2018 14:06:50 +0530 Subject: [PATCH 12/15] Added benchmarks for matlab's LMNN --- config.yaml | 17 ++++- methods/matlab/LMNN.m | 168 +++++++++++++++++++++++++++++++++++++++++ methods/matlab/lmnn.py | 146 +++++++++++++++++++++++++++++++++++ 3 files changed, 330 insertions(+), 1 deletion(-) 
create mode 100644 methods/matlab/LMNN.m create mode 100644 methods/matlab/lmnn.py diff --git a/config.yaml b/config.yaml index a865532..2ec3580 100644 --- a/config.yaml +++ b/config.yaml @@ -953,6 +953,21 @@ methods: new_dimensionality: 2 scaled: True + LMNN: + run: ['metric'] + script: methods/matlab/lmnn.py + format: [csv, txt] + datasets: + - files: ['datasets/iris_train.csv', + 'datasets/satellite_train.csv', 'datasets/ionosphere.csv', + 'datasets/balance_scale.csv', 'datasets/vehicle.csv', + 'datasets/oilspill_train.csv', 'datasets/ecoli_train.csv', + 'datasets/letter_recognition.csv', 'datasets/shuttle_train.csv', + 'datasets/isolet_train.csv', 'datasets/optdigits_train.csv', + 'datasets/covtype.csv', 'datasets/mnist_all.csv'] + options: + k: 3 + PERCEPTRON: run: ['metric'] script: methods/matlab/perceptron.py @@ -3198,4 +3213,4 @@ methods: ['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'], ['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'], ['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'], - ['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ] \ No newline at end of file + ['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ] diff --git a/methods/matlab/LMNN.m b/methods/matlab/LMNN.m new file mode 100644 index 0000000..c7f66ba --- /dev/null +++ b/methods/matlab/LMNN.m @@ -0,0 +1,168 @@ +% @file lmnn.m + +function lmnn(cmd) +% LMNN Learns a metric using large-margin nearest neighbor metric learning +% +% The function uses large-margin nearest neighbor (LMNN) metric learning to +% learn a metric on the data set specified by the NxD matrix X and the +% corresponding Nx1 vector labels. The metric is returned in M. +% +% Required options: +% (-i) [string] Input dataset to perform PLMNNCA on. +% Options: +% (-k) [int] Desired number of targets. 
+% +% +% This file is part of the Matlab Toolbox for Dimensionality Reduction. +% The toolbox can be obtained from http://homepage.tudelft.nl/19j49 +% You are free to use, change, or redistribute this code in any way you +% want for non-commercial purposes. However, it is appreciated if you +% maintain the name of the original author. +% +% (C) Laurens van der Maaten, Delft University of Technology + + inputFile = regexp(cmd, '.*?-i ([^\s]+)', 'tokens', 'once'); + + % Load input dataset. + X = csvread(inputFile{:}); + + % Use the last row of the data as the labels. + labels = X(:,end); + % Remove the label row. + X = X(:,1:end-1); + + % Variable K can't be used + %K = regexp(cmd, '.*?-k ([^\s]+)', 'tokens', 'once'); + %K = str2num(K{1}); + + total_time = tic; + + % Initialize some variables + [N, D] = size(X); + assert(length(labels) == N); + [lablist, ~, labels] = unique(labels); + K = length(lablist); + label_matrix = false(N, K); + label_matrix(sub2ind(size(label_matrix), (1:length(labels))', labels)) = true; + same_label = logical(double(label_matrix) * double(label_matrix')); + M = eye(D); + C = Inf; prev_C = Inf; + + % Set learning parameters + min_iter = 50; % minimum number of iterations + max_iter = 1000; % maximum number of iterations + eta = .1; % learning rate + mu = .5; % weighting of pull and push terms + tol = 1e-3; % tolerance for convergence + best_C = Inf; % best error obtained so far + best_M = M; % best metric found so far + no_targets = 3; % number of target neighbors + + % Select target neighbors + sum_X = sum(X .^ 2, 2); + DD = bsxfun(@plus, sum_X, bsxfun(@plus, sum_X', -2 * (X * X'))); + DD(~same_label) = Inf; DD(1:N + 1:end) = Inf; + [~, targets_ind] = sort(DD, 2, 'ascend'); + targets_ind = targets_ind(:,1:no_targets); + targets = false(N, N); + targets(sub2ind([N N], vec(repmat((1:N)', [1 no_targets])), vec(targets_ind))) = true; + + % Compute pulling term between target neigbhors to initialize gradient + slack = zeros(N, N, no_targets); + 
G = zeros(D, D); + for i=1:no_targets + G = G + (1 - mu) .* (X - X(targets_ind(:,i),:))' * (X - X(targets_ind(:,i),:)); + end + + % Perform main learning iterations + iter = 0; + while (prev_C - C > tol || iter < min_iter) && iter < max_iter + + % Compute pairwise distances under current metric + XM = X * M; + sum_X = sum(XM .* X, 2); + DD = bsxfun(@plus, sum_X, bsxfun(@plus, sum_X', -2 * (XM * X'))); + + % Compute value of slack variables + old_slack = slack; + for i=1:no_targets + slack(:,:,i) = ~same_label .* max(0, bsxfun(@minus, 1 + DD(sub2ind([N N], (1:N)', targets_ind(:,i))), DD)); + end + + % Compute value of cost function + prev_C = C; + C = (1 - mu) .* sum(DD(targets)) + ... % push terms between target neighbors + mu .* sum(slack(:)); % pull terms between impostors + + % Maintain best solution found so far (subgradient method) + if C < best_C + best_C = C; + best_M = M; + end + + % Perform gradient update + for i=1:no_targets + + % Add terms for new violations + [r, c] = find(slack(:,:,i) > 0 & old_slack(:,:,i) == 0); + G = G + mu .* ((X(r,:) - X(targets_ind(r, i),:))' * ... + (X(r,:) - X(targets_ind(r, i),:)) - ... + (X(r,:) - X(c,:))' * (X(r,:) - X(c,:))); + + % Remove terms for resolved violations + [r, c] = find(slack(:,:,i) == 0 & old_slack(:,:,i) > 0); + G = G - mu .* ((X(r,:) - X(targets_ind(r, i),:))' * ... + (X(r,:) - X(targets_ind(r, i),:)) - ... + (X(r,:) - X(c,:))' * (X(r,:) - X(c,:))); + end + M = M - (eta ./ N) .* G; + + % Project metric back onto the PSD cone + [V, L] = eig(M); + V = real(V); L = real(L); + ind = find(diag(L) > 0); + if isempty(ind) + warning('Projection onto PSD cone failed. All eigenvalues were negative.'); break + end + M = V(:,ind) * L(ind, ind) * V(:,ind)'; + if any(isinf(M(:))) + warning('Projection onto PSD cone failed. Metric contains Inf values.'); break + end + if any(isnan(M(:))) + warning('Projection onto PSD cone failed. 
Metric contains NaN values.'); break + end + + % Update learning rate + if prev_C > C + eta = eta * 1.01; + else + eta = eta * .5; + end + + % Print out progress + iter = iter + 1; + no_slack = sum(slack(:) > 0); + if rem(iter, 10) == 0 + [~, sort_ind] = sort(DD, 2, 'ascend'); + disp(['Iteration ' num2str(iter) ': error is ' num2str(C ./ N) ... + ', nearest neighbor error is ' num2str(sum(labels(sort_ind(:,2)) ~= labels) ./ N) ... + ', number of constraints: ' num2str(no_slack)]); + end + end + + % Return best metric and error + M = best_M; + C = best_C; + + % Compute mapped data + [L, S, ~] = svd(M); + L = bsxfun(@times, sqrt(diag(S)), L); + disp(sprintf('[INFO ] total_time: %fs', toc(total_time))) + + % Save learned distance. + csvwrite('distance.csv', L); +end + +function x = vec(x) + x = x(:); +end diff --git a/methods/matlab/lmnn.py b/methods/matlab/lmnn.py new file mode 100644 index 0000000..cca2054 --- /dev/null +++ b/methods/matlab/lmnn.py @@ -0,0 +1,146 @@ +''' + @file lmnn.py + @author Manish Kumar + + Class to benchmark the matlab Large Margin Nearest Neighbors method. +''' + +import os +import sys +import inspect + +# Import the util path, this method even works if the path contains symlinks to +# modules. +cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join( + os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util"))) +if cmd_subfolder not in sys.path: + sys.path.insert(0, cmd_subfolder) + +#Import the metrics definitions path. +metrics_folder = os.path.realpath(os.path.abspath(os.path.join( + os.path.split(inspect.getfile(inspect.currentframe()))[0], "../metrics"))) +if metrics_folder not in sys.path: + sys.path.insert(0, metrics_folder) + +from log import * +from profiler import * +from definitions import * + +import shlex +import subprocess +import re +import collections + +''' +This class implements the Large Margin Nearest Neighbors benchmark. 
+''' +class LMNN(object): + + ''' + Create the Large Margin Nearest Neighbors benchmark instance. + + @param dataset - Input dataset to perform Logistic Regression on. + @param timeout - The time until the timeout. Default no timeout. + @param path - Path to the matlab binary. + @param verbose - Display informational messages. + ''' + def __init__(self, dataset, timeout=0, path=os.environ["MATLAB_BIN"], + verbose=True): + self.verbose = verbose + self.dataset = dataset + self.path = path + self.timeout = timeout + self.k = 1 + + ''' + Destructor to clean up at the end. Use this method to remove created file. + ''' + def __del__(self): + Log.Info("Clean up.", self.verbose) + filelist = ["distance.csv"] + for f in filelist: + if os.path.isfile(f): + os.remove(f) + + ''' + Large Margin Nearest Neighbors benchmark instance. If the method has been + successfully completed return the elapsed time in seconds. + + @param options - Extra options for the method. + @return - Elapsed time in seconds or a negative value if the method was not + successful. + ''' + def RunMetrics(self, options): + Log.Info("Perform Large Margin Nearest Neighbors.", self.verbose) + + if "k" in options: + self.k = int(options.pop("k")) + + # No options accepted for this script. + if len(options) > 0: + Log.Fatal("Unknown parameters: " + str(options)) + raise Exception("unknown parameters") + + inputCmd = "-i " + self.dataset + " -k " + str(self.k) + + # Split the command using shell-like syntax. + cmd = shlex.split(self.path + "matlab -nodisplay -nosplash -r \"try, " + + "LMNN('" + inputCmd + "'), catch, exit(1), end, exit(0)\"") + + # Run command with the nessecary arguments and return its output as a byte + # string. We have untrusted input so we disable all shell based features. 
+ try: + s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False, + timeout=self.timeout) + except subprocess.TimeoutExpired as e: + Log.Warn(str(e)) + return -2 + except Exception as e: + Log.Fatal("Could not execute command: " + str(cmd)) + return -1 + + # Datastructure to store the results. + metrics = {} + + # Parse data: runtime. + timer = self.parseTimer(s) + + if timer != -1: + metrics['Runtime'] = timer.total_time + distance = np.genfromtxt("distance.csv", delimiter = ',') + data = np.genfromtxt(self.dataset, delimiter=',') + + dataList = [data[:,:-1], data[:, (data.shape[1] - 1)]] + metrics['Accuracy_1_NN'] = Metrics.KNNAccuracy(distance, dataList, 1, False) + metrics['Accuracy_3_NN'] = Metrics.KNNAccuracy(distance, dataList, 3, False) + metrics['Accuracy_3_NN_DW'] = Metrics.KNNAccuracy(distance, dataList, 3, True) + metrics['Accuracy_5_NN'] = Metrics.KNNAccuracy(distance, dataList, 5, False) + metrics['Accuracy_5_NN_DW'] = Metrics.KNNAccuracy(distance, dataList, 5, True) + + Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose) + + return metrics + + ''' + Parse the timer data form a given string. + + @param data - String to parse timer data from. + @return - Namedtuple that contains the timer data or -1 in case of an error. + ''' + def parseTimer(self, data): + # Compile the regular expression pattern into a regular expression object to + # parse the timer data. + pattern = re.compile(br""" + .*?total_time: (?P<total_time>.*?)s.*? + """, re.VERBOSE|re.MULTILINE|re.DOTALL) + + match = pattern.match(data) + if not match: + Log.Fatal("Can't parse the data: wrong format") + return -1 + else: + # Create a namedtuple and return the timer data. 
+ timer = collections.namedtuple("timer", ["total_time"]) + + return timer(float(match.group("total_time"))) + From 9e47ebae3543e2c9f16e6def392c81e778e7f938 Mon Sep 17 00:00:00 2001 From: Manish Date: Thu, 21 Jun 2018 14:14:14 +0530 Subject: [PATCH 13/15] typo rectify --- methods/matlab/LMNN.m | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/methods/matlab/LMNN.m b/methods/matlab/LMNN.m index c7f66ba..69c534e 100644 --- a/methods/matlab/LMNN.m +++ b/methods/matlab/LMNN.m @@ -8,7 +8,7 @@ function lmnn(cmd) % corresponding Nx1 vector labels. The metric is returned in M. % % Required options: -% (-i) [string] Input dataset to perform PLMNNCA on. +% (-i) [string] Input dataset to perform LMNN on. % Options: % (-k) [int] Desired number of targets. % @@ -31,9 +31,9 @@ function lmnn(cmd) % Remove the label row. X = X(:,1:end-1); - % Variable K can't be used - %K = regexp(cmd, '.*?-k ([^\s]+)', 'tokens', 'once'); - %K = str2num(K{1}); + % Variable K can't be used + % K = regexp(cmd, '.*?-k ([^\s]+)', 'tokens', 'once'); + % K = str2num(K{1}); total_time = tic; From 4d27754099c5808a5aa7bbe80afad71738b7a809 Mon Sep 17 00:00:00 2001 From: Manish Date: Wed, 27 Jun 2018 14:15:40 +0530 Subject: [PATCH 14/15] removed a parameter --- methods/mlpack/lmnn.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/methods/mlpack/lmnn.py b/methods/mlpack/lmnn.py index dbd0b01..a225de5 100644 --- a/methods/mlpack/lmnn.py +++ b/methods/mlpack/lmnn.py @@ -106,8 +106,6 @@ def OptionsToStr(self, options): optionsStr = optionsStr + " -r " + str(options.pop("regularization")) if "tolerance" in options: optionsStr = optionsStr + " -t " + str(options.pop("tolerance")) - if "batch_delta" in options: - optionsStr = optionsStr + " -d " + str(options.pop("batch_delta")) if "range" in options: optionsStr = optionsStr + " -R " + str(options.pop("range")) if "step_size" in options: From e993b25eb43dcc408a7ca6dc57307abfaa30dada Mon Sep 17 00:00:00 2001 From: Manish Date: Thu, 28 Jun 
2018 15:50:45 +0530 Subject: [PATCH 15/15] removed extra L-BFGS related parameters --- config.yaml | 1 - methods/mlpack/lmnn.py | 6 ++---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/config.yaml b/config.yaml index 2ec3580..34bdf1d 100644 --- a/config.yaml +++ b/config.yaml @@ -850,7 +850,6 @@ methods: max_iterations: 2000 optimizer: lbfgs seed: 42 - wolfe: 0.5 range: 50 - files: ['datasets/covtype.csv', diff --git a/methods/mlpack/lmnn.py b/methods/mlpack/lmnn.py index a225de5..6936740 100644 --- a/methods/mlpack/lmnn.py +++ b/methods/mlpack/lmnn.py @@ -116,10 +116,8 @@ def OptionsToStr(self, options): optionsStr = optionsStr + " -p " + str(options.pop("passes")) if "max_iterations" in options: optionsStr = optionsStr + " -n " + str(options.pop("max_iterations")) - if "num_basis" in options: - optionsStr = optionsStr + " -B " + str(options.pop("num_basis")) - if "wolfe" in options: - optionsStr = optionsStr + " -w " + str(options.pop("wolfe")) + if "rank" in options: + optionsStr = optionsStr + " -A " + str(options.pop("rank")) if "normalize" in options: optionsStr = optionsStr + " -N" options.pop("normalize")