From ea60ba610e8c71e84c1715cc4e7a5454cbee6fbf Mon Sep 17 00:00:00 2001 From: LiuYuHui Date: Mon, 15 Jun 2020 11:41:34 +0800 Subject: [PATCH 1/8] add label encoder --- src/shogun/labels/BinaryLabelEncoder.h | 117 ++++++++++++++++++++ src/shogun/labels/LabelEncoder.h | 106 ++++++++++++++++++ src/shogun/labels/MulticlassLabelsEncoder.h | 81 ++++++++++++++ tests/unit/labels/LabelsEncoder_unittest.cc | 66 +++++++++++ 4 files changed, 370 insertions(+) create mode 100644 src/shogun/labels/BinaryLabelEncoder.h create mode 100644 src/shogun/labels/LabelEncoder.h create mode 100644 src/shogun/labels/MulticlassLabelsEncoder.h create mode 100644 tests/unit/labels/LabelsEncoder_unittest.cc diff --git a/src/shogun/labels/BinaryLabelEncoder.h b/src/shogun/labels/BinaryLabelEncoder.h new file mode 100644 index 00000000000..1b332c80d1d --- /dev/null +++ b/src/shogun/labels/BinaryLabelEncoder.h @@ -0,0 +1,117 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Yuhui Liu + * + */ +#ifndef _BINARYLABELENCODER__H__ +#define _BINARYLABELENCODER__H__ + +#include +#include +#include +#include +#include +#include + +namespace shogun +{ + + class BinaryLabelEncoder : public LabelEncoder + { + public: + BinaryLabelEncoder() = default; + + ~BinaryLabelEncoder() = default; + + /** Fit label encoder + * + * @param Target values. + * @return SGVector which contains unique labels. + */ + SGVector fit(const std::shared_ptr& labs) override + { + const auto result_vector = labs->as()->get_labels(); + fit_impl(result_vector); + require( + unique_labels.size() == 2, + "BinaryLabel should contain only two elements"); + + return SGVector( + unique_labels.begin(), unique_labels.end()); + } + /** Transform labels to normalized encoding. + * + * @param Target values to be transformed. + * @return Labels transformed to be normalized. 
+ */ + std::shared_ptr + transform(const std::shared_ptr& labs) override + { + const auto result_vector = labs->as()->get_labels(); + require( + std::set(result_vector.begin(), result_vector.end()) + .size() == 2, + "BinaryLabel should contain only two elements"); + auto transformed_vec = transform_impl(result_vector); + + std::transform( + transformed_vec.begin(), transformed_vec.end(), + transformed_vec.begin(), [](float64_t e) { + if (std::abs(e - 0.0) <= + std::numeric_limits::epsilon()) + return -1.0; + else + return e; + }); + return std::make_shared(transformed_vec); + } + /** Transform labels back to original encoding. + * + * @param normailzed encoding labels + * @return original encoding labels + */ + std::shared_ptr + inverse_transform(const std::shared_ptr& labs) override + { + auto normalized_vector = labs->as()->get_labels(); + + std::transform( + normalized_vector.begin(), normalized_vector.end(), + normalized_vector.begin(), [](float64_t e) { + if (std::abs(e + 1.0) <= + std::numeric_limits::epsilon()) + return 0.0; + else + return e; + }); + require( + std::set( + normalized_vector.begin(), normalized_vector.end()) + .size() == 2, + "BinaryLabel should contain only two elements"); + + return std::make_shared( + inverse_transform_impl(normalized_vector)); + } + /** Fit label encoder and return encoded labels. + * + * @param Target values. + * @return Labels transformed to be normalized. 
+ */ + std::shared_ptr + fit_transform(const std::shared_ptr& labs) override + { + const auto result_vector = labs->as()->get_labels(); + return std::make_shared( + transform_impl(fit_impl(result_vector))); + } + + virtual const char* get_name() const + { + return "BinaryLabelEncoder"; + } + }; +} // namespace shogun + +#endif \ No newline at end of file diff --git a/src/shogun/labels/LabelEncoder.h b/src/shogun/labels/LabelEncoder.h new file mode 100644 index 00000000000..48f937343d2 --- /dev/null +++ b/src/shogun/labels/LabelEncoder.h @@ -0,0 +1,106 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Yuhui Liu + * + */ + +#ifndef _LABELENCODER__H__ +#define _LABELENCODER__H__ + +#include +#include +#include +#include +#include +#include +namespace shogun +{ + + class LabelEncoder : public SGObject + { + public: + LabelEncoder() = default; + + virtual ~LabelEncoder() = default; + + /** Fit label encoder + * + * @param Target values. + * @return SGVector which contains unique labels. + */ + virtual SGVector + fit(const std::shared_ptr& labs) = 0; + /** Transform labels to normalized encoding. + * + * @param Target values to be transformed. + * @return Labels transformed to be normalized. + */ + virtual std::shared_ptr + transform(const std::shared_ptr& labs) = 0; + /** Transform labels back to original encoding. + * + * @param normailzed encoding labels + * @return original encoding labels + */ + virtual std::shared_ptr + inverse_transform(const std::shared_ptr&) = 0; + + /** Fit label encoder and return encoded labels. + * + * @param Target values. + * @return Labels transformed to be normalized. 
+ */ + virtual std::shared_ptr + fit_transform(const std::shared_ptr&) = 0; + + virtual const char* get_name() const + { + return "LabelEncoder"; + } + + protected: + SGVector fit_impl(const SGVector& origin_vector) + { + std::copy( + origin_vector.begin(), origin_vector.end(), + std::inserter(unique_labels, unique_labels.begin())); + return SGVector( + unique_labels.begin(), unique_labels.end()); + } + + SGVector + transform_impl(const SGVector& result_vector) + { + SGVector converted(result_vector.vlen); + std::transform( + result_vector.begin(), result_vector.end(), converted.begin(), + [& unique_labels = unique_labels, + &normalized_to_origin = + normalized_to_origin](const auto& old_label) { + auto new_label = std::distance( + unique_labels.begin(), unique_labels.find(old_label)); + normalized_to_origin[new_label] = old_label; + return new_label; + }); + return converted; + } + + SGVector + inverse_transform_impl(const SGVector& result_vector) + { + SGVector original_vector(result_vector.vlen); + std::transform( + result_vector.begin(), result_vector.end(), + original_vector.begin(), + [& normalized_to_origin = normalized_to_origin](const auto& e) { + return normalized_to_origin[e]; + }); + return original_vector; + } + std::set unique_labels; + std::unordered_map normalized_to_origin; + }; +} // namespace shogun + +#endif \ No newline at end of file diff --git a/src/shogun/labels/MulticlassLabelsEncoder.h b/src/shogun/labels/MulticlassLabelsEncoder.h new file mode 100644 index 00000000000..198b8118bf2 --- /dev/null +++ b/src/shogun/labels/MulticlassLabelsEncoder.h @@ -0,0 +1,81 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). 
+ * + * Authors: Yuhui Liu + * + */ +#ifndef _MulticlassLabelsEncoder__H__ +#define _MulticlassLabelsEncoder__H__ + +#include +#include +#include +#include +#include +#include + +namespace shogun +{ + + class MulticlassLabelsEncoder : public LabelEncoder + { + public: + MulticlassLabelsEncoder() = default; + + ~MulticlassLabelsEncoder() = default; + + /** Fit label encoder + * + * @param Target values. + * @return SGVector which contains unique labels. + */ + SGVector fit(const std::shared_ptr& labs) override + { + const auto result_vector = labs->as()->get_labels(); + return fit_impl(result_vector); + } + /** Transform labels to normalized encoding. + * + * @param Target values to be transformed. + * @return Labels transformed to be normalized. + */ + std::shared_ptr + transform(const std::shared_ptr& labs) override + { + const auto result_vector = labs->as()->get_labels(); + return std::make_shared( + transform_impl(result_vector)); + } + /** Transform labels back to original encoding. + * + * @param normailzed encoding labels + * @return original encoding labels + */ + std::shared_ptr + inverse_transform(const std::shared_ptr& labs) override + { + auto normalized_vector = labs->as()->get_labels(); + return std::make_shared( + inverse_transform_impl(normalized_vector)); + } + /** Fit label encoder and return encoded labels. + * + * @param Target values. + * @return Labels transformed to be normalized. 
+ */ + std::shared_ptr + fit_transform(const std::shared_ptr& labs) override + { + const auto result_vector = labs->as()->get_labels(); + return std::make_shared( + transform_impl(fit_impl(result_vector))); + } + + virtual const char* get_name() const + { + return "MulticlassLabelsEncoder"; + } + }; +} // namespace shogun + +#endif \ No newline at end of file diff --git a/tests/unit/labels/LabelsEncoder_unittest.cc b/tests/unit/labels/LabelsEncoder_unittest.cc new file mode 100644 index 00000000000..46dafb13c03 --- /dev/null +++ b/tests/unit/labels/LabelsEncoder_unittest.cc @@ -0,0 +1,66 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Yuhui Liu + */ + +#include +#include +#include +#include +#include +using namespace shogun; + +TEST(BinaryLabelEncoder, fit_transform) +{ + auto label_encoder = std::make_shared(); + SGVector vec{-1, -1, 1, -1, 1}; + auto origin_labels = std::make_shared(vec); + auto unique_vec = label_encoder->fit(origin_labels); + EXPECT_EQ(-1, unique_vec[0]); + EXPECT_EQ(1, unique_vec[1]); + + auto result_labels = label_encoder->transform(origin_labels); + auto result_vec = result_labels->as()->get_labels(); + EXPECT_EQ(-1, result_vec[0]); + EXPECT_EQ(-1, result_vec[1]); + EXPECT_EQ(1, result_vec[2]); + EXPECT_EQ(-1, result_vec[3]); + EXPECT_EQ(1, result_vec[4]); + + auto inv_result = label_encoder->inverse_transform(result_labels) + ->as() + ->get_labels(); + + for (int i = 0; i < 5; i++) + { + EXPECT_EQ(vec[i], inv_result[i]); + } +} + +TEST(MulticlassLabelsEncoder, fit_transform) +{ + auto label_encoder = std::make_shared(); + SGVector vec{1, 2, 2, 6}; + auto origin_labels = std::make_shared(vec); + auto unique_vec = label_encoder->fit(origin_labels); + EXPECT_EQ(1, unique_vec[0]); + EXPECT_EQ(2, unique_vec[1]); + EXPECT_EQ(6, unique_vec[2]); + + auto result_labels = label_encoder->transform(origin_labels); + auto result_vec = result_labels->as()->get_labels(); + EXPECT_EQ(0, 
result_vec[0]); + EXPECT_EQ(1, result_vec[1]); + EXPECT_EQ(1, result_vec[2]); + EXPECT_EQ(2, result_vec[3]); + + auto inv_result = label_encoder->inverse_transform(result_labels) + ->as() + ->get_labels(); + + for (int i = 0; i < 5; i++) + { + EXPECT_EQ(vec[i], inv_result[i]); + } +} From 53e7e3fe1d66d03624b26f04595b7f49778fcc7e Mon Sep 17 00:00:00 2001 From: LiuYuHui Date: Mon, 15 Jun 2020 20:08:48 +0800 Subject: [PATCH 2/8] add more unit test --- src/shogun/labels/BinaryLabelEncoder.h | 37 +++--- tests/unit/labels/LabelsEncoder_unittest.cc | 121 ++++++++++++++++++-- 2 files changed, 133 insertions(+), 25 deletions(-) diff --git a/src/shogun/labels/BinaryLabelEncoder.h b/src/shogun/labels/BinaryLabelEncoder.h index 1b332c80d1d..804013bd81e 100644 --- a/src/shogun/labels/BinaryLabelEncoder.h +++ b/src/shogun/labels/BinaryLabelEncoder.h @@ -13,7 +13,7 @@ #include #include #include - +#include namespace shogun { @@ -32,13 +32,12 @@ namespace shogun SGVector fit(const std::shared_ptr& labs) override { const auto result_vector = labs->as()->get_labels(); - fit_impl(result_vector); + auto result_labels = fit_impl(result_vector); require( unique_labels.size() == 2, - "BinaryLabel should contain only two elements"); + "Binary Labels should contain only two elements"); - return SGVector( - unique_labels.begin(), unique_labels.end()); + return result_labels; } /** Transform labels to normalized encoding. 
* @@ -50,9 +49,10 @@ namespace shogun { const auto result_vector = labs->as()->get_labels(); require( - std::set(result_vector.begin(), result_vector.end()) + std::unordered_set( + result_vector.begin(), result_vector.end()) .size() == 2, - "BinaryLabel should contain only two elements"); + "Binary Labels should contain only two elements"); auto transformed_vec = transform_impl(result_vector); std::transform( @@ -74,7 +74,14 @@ namespace shogun std::shared_ptr inverse_transform(const std::shared_ptr& labs) override { - auto normalized_vector = labs->as()->get_labels(); + auto normalized_labels = labs->as(); + normalized_labels->ensure_valid(); + auto normalized_vector = normalized_labels->get_labels(); + require( + std::unordered_set( + normalized_vector.begin(), normalized_vector.end()) + .size() == 2, + "Binary Labels should contain only two elements"); std::transform( normalized_vector.begin(), normalized_vector.end(), @@ -85,14 +92,12 @@ namespace shogun else return e; }); - require( - std::set( - normalized_vector.begin(), normalized_vector.end()) - .size() == 2, - "BinaryLabel should contain only two elements"); - - return std::make_shared( - inverse_transform_impl(normalized_vector)); + auto origin_vec = inverse_transform_impl(normalized_vector); + SGVector result_vev(origin_vec.vlen); + std::transform( + origin_vec.begin(), origin_vec.end(), result_vev.begin(), + [](auto&& e) { return static_cast(e); }); + return std::make_shared(result_vev); } /** Fit label encoder and return encoded labels. 
* diff --git a/tests/unit/labels/LabelsEncoder_unittest.cc b/tests/unit/labels/LabelsEncoder_unittest.cc index 46dafb13c03..845a1eff823 100644 --- a/tests/unit/labels/LabelsEncoder_unittest.cc +++ b/tests/unit/labels/LabelsEncoder_unittest.cc @@ -38,22 +38,113 @@ TEST(BinaryLabelEncoder, fit_transform) } } +TEST(BinaryLabelEncoder, labels_not_neg1_or_1) +{ + auto label_encoder = std::make_shared(); + SGVector vec{-100, 200, -100, 200, -100}; + auto origin_labels = std::make_shared(vec); + auto unique_vec = label_encoder->fit(origin_labels); + EXPECT_EQ(-100, unique_vec[0]); + EXPECT_EQ(200, unique_vec[1]); + + auto result_labels = label_encoder->transform(origin_labels); + auto result_vec = result_labels->as()->get_labels(); + EXPECT_EQ(-1, result_vec[0]); + EXPECT_EQ(1, result_vec[1]); + EXPECT_EQ(-1, result_vec[2]); + EXPECT_EQ(1, result_vec[3]); + EXPECT_EQ(-1, result_vec[4]); + + auto inv_result = label_encoder->inverse_transform(result_labels) + ->as() + ->get_labels(); + + for (int i = 0; i < 5; i++) + { + EXPECT_EQ(vec[i], inv_result[i]); + } + + SGVector test_vec{-1, -1, -1, -1, -1, 1}; + auto test_labels = std::make_shared(test_vec); + auto inv_test = label_encoder->inverse_transform(test_labels) + ->as() + ->get_labels(); + EXPECT_EQ(-100, inv_test[0]); + EXPECT_EQ(-100, inv_test[1]); + EXPECT_EQ(-100, inv_test[2]); + EXPECT_EQ(-100, inv_test[3]); + EXPECT_EQ(-100, inv_test[4]); + EXPECT_EQ(200, inv_test[5]); +} + +TEST(BinaryLabelEncoder, more_than_two_labels) +{ + auto label_encoder = std::make_shared(); + SGVector vec{-100, 200, -100, 200, -100, 42}; + auto origin_labels = std::make_shared(vec); + + EXPECT_THROW(label_encoder->fit(origin_labels), ShogunException); + + EXPECT_THROW(label_encoder->transform(origin_labels), ShogunException); + + SGVector vec2{-1, -1, 1, 0}; + auto result_labels = std::make_shared(vec2); + EXPECT_THROW( + label_encoder->inverse_transform(result_labels), ShogunException); + + SGVector vec3{0, 1, 1, 0}; + auto result_labels2 
= std::make_shared(vec3); + EXPECT_THROW( + label_encoder->inverse_transform(result_labels2), ShogunException); +} + TEST(MulticlassLabelsEncoder, fit_transform) { + auto eps = std::numeric_limits::epsilon(); auto label_encoder = std::make_shared(); - SGVector vec{1, 2, 2, 6}; + SGVector vec{1.0, 2.0, 2.0, 6.0}; auto origin_labels = std::make_shared(vec); auto unique_vec = label_encoder->fit(origin_labels); - EXPECT_EQ(1, unique_vec[0]); - EXPECT_EQ(2, unique_vec[1]); - EXPECT_EQ(6, unique_vec[2]); + EXPECT_NEAR(1, unique_vec[0], eps); + EXPECT_NEAR(2, unique_vec[1], eps); + EXPECT_NEAR(6, unique_vec[2], eps); auto result_labels = label_encoder->transform(origin_labels); auto result_vec = result_labels->as()->get_labels(); - EXPECT_EQ(0, result_vec[0]); - EXPECT_EQ(1, result_vec[1]); - EXPECT_EQ(1, result_vec[2]); - EXPECT_EQ(2, result_vec[3]); + EXPECT_NEAR(0, result_vec[0], eps); + EXPECT_NEAR(1, result_vec[1], eps); + EXPECT_NEAR(1, result_vec[2], eps); + EXPECT_NEAR(2, result_vec[3], eps); + + auto inv_result = label_encoder->inverse_transform(result_labels) + ->as() + ->get_labels(); + + for (int i = 0; i < 4; i++) + { + EXPECT_NEAR(vec[i], inv_result[i], eps); + } +} + +TEST(MulticlassLabelsEncoder, negative_labels) +{ + auto eps = std::numeric_limits::epsilon(); + auto label_encoder = std::make_shared(); + SGVector vec{-100, 200, -2, 6, -2}; + auto origin_labels = std::make_shared(vec); + auto unique_vec = label_encoder->fit(origin_labels); + EXPECT_NEAR(-100, unique_vec[0], eps); + EXPECT_NEAR(-2, unique_vec[1], eps); + EXPECT_NEAR(6, unique_vec[2], eps); + EXPECT_NEAR(200, unique_vec[3], eps); + + auto result_labels = label_encoder->transform(origin_labels); + auto result_vec = result_labels->as()->get_labels(); + EXPECT_NEAR(0, result_vec[0], eps); + EXPECT_NEAR(3, result_vec[1], eps); + EXPECT_NEAR(1, result_vec[2], eps); + EXPECT_NEAR(2, result_vec[3], eps); + EXPECT_NEAR(1, result_vec[4], eps); auto inv_result = 
label_encoder->inverse_transform(result_labels) ->as() @@ -61,6 +152,18 @@ TEST(MulticlassLabelsEncoder, fit_transform) for (int i = 0; i < 5; i++) { - EXPECT_EQ(vec[i], inv_result[i]); + EXPECT_NEAR(vec[i], inv_result[i], eps); } + + SGVector test_vec{0, 1, 2, 3, 1, 3}; + auto test_labels = std::make_shared(test_vec); + auto inv_test = label_encoder->inverse_transform(test_labels) + ->as() + ->get_labels(); + EXPECT_NEAR(-100, inv_test[0], eps); + EXPECT_NEAR(-2, inv_test[1], eps); + EXPECT_NEAR(6, inv_test[2], eps); + EXPECT_NEAR(200, inv_test[3], eps); + EXPECT_NEAR(-2, inv_test[4], eps); + EXPECT_NEAR(200, inv_test[5], eps); } From 266cfc4fed3ad74da74c20c8e9d2f0b49f56fd7c Mon Sep 17 00:00:00 2001 From: LiuYuHui Date: Wed, 17 Jun 2020 19:03:08 +0800 Subject: [PATCH 3/8] refine erro message --- src/shogun/labels/BinaryLabelEncoder.h | 50 +++++++++---- tests/unit/labels/LabelsEncoder_unittest.cc | 77 +++++++++++---------- 2 files changed, 74 insertions(+), 53 deletions(-) diff --git a/src/shogun/labels/BinaryLabelEncoder.h b/src/shogun/labels/BinaryLabelEncoder.h index 804013bd81e..b386d8bea89 100644 --- a/src/shogun/labels/BinaryLabelEncoder.h +++ b/src/shogun/labels/BinaryLabelEncoder.h @@ -32,11 +32,14 @@ namespace shogun SGVector fit(const std::shared_ptr& labs) override { const auto result_vector = labs->as()->get_labels(); + check_is_valid(result_vector); + if (is_convert_float_to_int(result_vector)) + { + io::warn( + "({}, {}) have been converted to (-1, 1).", + result_vector[0], result_vector[1]); + } auto result_labels = fit_impl(result_vector); - require( - unique_labels.size() == 2, - "Binary Labels should contain only two elements"); - return result_labels; } /** Transform labels to normalized encoding. 
@@ -48,11 +51,7 @@ namespace shogun transform(const std::shared_ptr& labs) override { const auto result_vector = labs->as()->get_labels(); - require( - std::unordered_set( - result_vector.begin(), result_vector.end()) - .size() == 2, - "Binary Labels should contain only two elements"); + check_is_valid(result_vector); auto transformed_vec = transform_impl(result_vector); std::transform( @@ -77,12 +76,7 @@ namespace shogun auto normalized_labels = labs->as(); normalized_labels->ensure_valid(); auto normalized_vector = normalized_labels->get_labels(); - require( - std::unordered_set( - normalized_vector.begin(), normalized_vector.end()) - .size() == 2, - "Binary Labels should contain only two elements"); - + check_is_valid(normalized_vector); std::transform( normalized_vector.begin(), normalized_vector.end(), normalized_vector.begin(), [](float64_t e) { @@ -116,6 +110,32 @@ namespace shogun { return "BinaryLabelEncoder"; } + + private: + void check_is_valid(const SGVector& vec) + { + const auto unique_set = + std::unordered_set(vec.begin(), vec.end()); + require( + unique_set.size() == 2, + "Binary labels should contain only two elements, ({}) have " + "been detected.", + fmt::join(unique_set, ", ")); + } + + bool is_convert_float_to_int(const SGVector& vec) const + { + SGVector converted(vec.vlen); + std::transform( + vec.begin(), vec.end(), converted.begin(), + [](auto&& e) { return static_cast(e); }); + return std::equal( + vec.begin(), vec.end(), converted.begin(), + [](auto&& e1, auto&& e2) { + return std::abs(e1 - e2) > + std::numeric_limits::epsilon(); + }); + } }; } // namespace shogun diff --git a/tests/unit/labels/LabelsEncoder_unittest.cc b/tests/unit/labels/LabelsEncoder_unittest.cc index 845a1eff823..18813be831f 100644 --- a/tests/unit/labels/LabelsEncoder_unittest.cc +++ b/tests/unit/labels/LabelsEncoder_unittest.cc @@ -22,11 +22,12 @@ TEST(BinaryLabelEncoder, fit_transform) auto result_labels = label_encoder->transform(origin_labels); auto result_vec = 
result_labels->as()->get_labels(); - EXPECT_EQ(-1, result_vec[0]); - EXPECT_EQ(-1, result_vec[1]); - EXPECT_EQ(1, result_vec[2]); - EXPECT_EQ(-1, result_vec[3]); - EXPECT_EQ(1, result_vec[4]); + + SGVector expected_res{-1, -1, 1, -1, 1}; + for (int i = 0; i < 5; i++) + { + EXPECT_EQ(expected_res[i], result_vec[i]); + } auto inv_result = label_encoder->inverse_transform(result_labels) ->as() @@ -49,11 +50,11 @@ TEST(BinaryLabelEncoder, labels_not_neg1_or_1) auto result_labels = label_encoder->transform(origin_labels); auto result_vec = result_labels->as()->get_labels(); - EXPECT_EQ(-1, result_vec[0]); - EXPECT_EQ(1, result_vec[1]); - EXPECT_EQ(-1, result_vec[2]); - EXPECT_EQ(1, result_vec[3]); - EXPECT_EQ(-1, result_vec[4]); + SGVector expected_vec{-1, 1, -1, 1, -1}; + for (int i = 0; i < 5; i++) + { + EXPECT_EQ(expected_vec[i], result_vec[i]); + } auto inv_result = label_encoder->inverse_transform(result_labels) ->as() @@ -69,12 +70,11 @@ TEST(BinaryLabelEncoder, labels_not_neg1_or_1) auto inv_test = label_encoder->inverse_transform(test_labels) ->as() ->get_labels(); - EXPECT_EQ(-100, inv_test[0]); - EXPECT_EQ(-100, inv_test[1]); - EXPECT_EQ(-100, inv_test[2]); - EXPECT_EQ(-100, inv_test[3]); - EXPECT_EQ(-100, inv_test[4]); - EXPECT_EQ(200, inv_test[5]); + SGVector expected_inv_test{-100, -100, -100, -100, -100, 200}; + for (int i = 0; i < 6; i++) + { + EXPECT_EQ(expected_inv_test[i], inv_test[i]); + } } TEST(BinaryLabelEncoder, more_than_two_labels) @@ -100,7 +100,7 @@ TEST(BinaryLabelEncoder, more_than_two_labels) TEST(MulticlassLabelsEncoder, fit_transform) { - auto eps = std::numeric_limits::epsilon(); + auto eps = std::numeric_limits::epsilon(); auto label_encoder = std::make_shared(); SGVector vec{1.0, 2.0, 2.0, 6.0}; auto origin_labels = std::make_shared(vec); @@ -111,10 +111,11 @@ TEST(MulticlassLabelsEncoder, fit_transform) auto result_labels = label_encoder->transform(origin_labels); auto result_vec = result_labels->as()->get_labels(); - EXPECT_NEAR(0, 
result_vec[0], eps); - EXPECT_NEAR(1, result_vec[1], eps); - EXPECT_NEAR(1, result_vec[2], eps); - EXPECT_NEAR(2, result_vec[3], eps); + SGVector expected_res{0, 1, 1, 2}; + for (int i = 0; i < 4; i++) + { + EXPECT_NEAR(expected_res[i], result_vec[i], eps); + } auto inv_result = label_encoder->inverse_transform(result_labels) ->as() @@ -128,23 +129,23 @@ TEST(MulticlassLabelsEncoder, fit_transform) TEST(MulticlassLabelsEncoder, negative_labels) { - auto eps = std::numeric_limits::epsilon(); + auto eps = std::numeric_limits::epsilon(); auto label_encoder = std::make_shared(); - SGVector vec{-100, 200, -2, 6, -2}; + SGVector vec{-100.1, 200.4, -2.868, 6.98, -2.868}; auto origin_labels = std::make_shared(vec); auto unique_vec = label_encoder->fit(origin_labels); - EXPECT_NEAR(-100, unique_vec[0], eps); - EXPECT_NEAR(-2, unique_vec[1], eps); - EXPECT_NEAR(6, unique_vec[2], eps); - EXPECT_NEAR(200, unique_vec[3], eps); + EXPECT_NEAR(-100.1, unique_vec[0], eps); + EXPECT_NEAR(-2.868, unique_vec[1], eps); + EXPECT_NEAR(6.98, unique_vec[2], eps); + EXPECT_NEAR(200.4, unique_vec[3], eps); auto result_labels = label_encoder->transform(origin_labels); auto result_vec = result_labels->as()->get_labels(); - EXPECT_NEAR(0, result_vec[0], eps); - EXPECT_NEAR(3, result_vec[1], eps); - EXPECT_NEAR(1, result_vec[2], eps); - EXPECT_NEAR(2, result_vec[3], eps); - EXPECT_NEAR(1, result_vec[4], eps); + SGVector expected_res{0, 3, 1, 2, 1}; + for (int i = 0; i < 5; i++) + { + EXPECT_NEAR(expected_res[i], result_vec[i], eps); + } auto inv_result = label_encoder->inverse_transform(result_labels) ->as() @@ -160,10 +161,10 @@ TEST(MulticlassLabelsEncoder, negative_labels) auto inv_test = label_encoder->inverse_transform(test_labels) ->as() ->get_labels(); - EXPECT_NEAR(-100, inv_test[0], eps); - EXPECT_NEAR(-2, inv_test[1], eps); - EXPECT_NEAR(6, inv_test[2], eps); - EXPECT_NEAR(200, inv_test[3], eps); - EXPECT_NEAR(-2, inv_test[4], eps); - EXPECT_NEAR(200, inv_test[5], eps); + SGVector 
expected_inv{-100.1, -2.868, 6.98, + 200.4, -2.868, 200.4}; + for (int i = 0; i < 6; i++) + { + EXPECT_NEAR(expected_inv[i], inv_test[i], eps); + } } From aea04e761fe7c63c129c76664f07fc17bf71be67 Mon Sep 17 00:00:00 2001 From: LiuYuHui Date: Mon, 22 Jun 2020 11:15:10 +0800 Subject: [PATCH 4/8] refine function name --- src/shogun/labels/BinaryLabelEncoder.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/shogun/labels/BinaryLabelEncoder.h b/src/shogun/labels/BinaryLabelEncoder.h index b386d8bea89..041e231c1a5 100644 --- a/src/shogun/labels/BinaryLabelEncoder.h +++ b/src/shogun/labels/BinaryLabelEncoder.h @@ -33,7 +33,7 @@ namespace shogun { const auto result_vector = labs->as()->get_labels(); check_is_valid(result_vector); - if (is_convert_float_to_int(result_vector)) + if (!can_convert_float_to_int(result_vector)) { io::warn( "({}, {}) have been converted to (-1, 1).", @@ -118,12 +118,11 @@ namespace shogun std::unordered_set(vec.begin(), vec.end()); require( unique_set.size() == 2, - "Binary labels should contain only two elements, ({}) have " - "been detected.", + "Binary labels should contain only two elements, can not interpret ({}) as binary labels", fmt::join(unique_set, ", ")); } - bool is_convert_float_to_int(const SGVector& vec) const + bool can_convert_float_to_int(const SGVector& vec) const { SGVector converted(vec.vlen); std::transform( @@ -132,7 +131,7 @@ namespace shogun return std::equal( vec.begin(), vec.end(), converted.begin(), [](auto&& e1, auto&& e2) { - return std::abs(e1 - e2) > + return std::abs(e1 - e2) < std::numeric_limits::epsilon(); }); } From 145f0934a2ff662772b5979ade974c65753150a7 Mon Sep 17 00:00:00 2001 From: LiuYuHui Date: Tue, 23 Jun 2020 11:29:19 +0800 Subject: [PATCH 5/8] add covert warning for multiclasslabel encoder --- src/shogun/labels/BinaryLabelEncoder.h | 16 ++-------------- src/shogun/labels/LabelEncoder.h | 19 +++++++++++++++++++ src/shogun/labels/MulticlassLabelsEncoder.h | 8 ++++++++ 3 
files changed, 29 insertions(+), 14 deletions(-) diff --git a/src/shogun/labels/BinaryLabelEncoder.h b/src/shogun/labels/BinaryLabelEncoder.h index 041e231c1a5..27e4b0ccce1 100644 --- a/src/shogun/labels/BinaryLabelEncoder.h +++ b/src/shogun/labels/BinaryLabelEncoder.h @@ -33,7 +33,7 @@ namespace shogun { const auto result_vector = labs->as()->get_labels(); check_is_valid(result_vector); - if (!can_convert_float_to_int(result_vector)) + if (print_warning && !can_convert_float_to_int(result_vector)) { io::warn( "({}, {}) have been converted to (-1, 1).", @@ -122,19 +122,7 @@ namespace shogun fmt::join(unique_set, ", ")); } - bool can_convert_float_to_int(const SGVector& vec) const - { - SGVector converted(vec.vlen); - std::transform( - vec.begin(), vec.end(), converted.begin(), - [](auto&& e) { return static_cast(e); }); - return std::equal( - vec.begin(), vec.end(), converted.begin(), - [](auto&& e1, auto&& e2) { - return std::abs(e1 - e2) < - std::numeric_limits::epsilon(); - }); - } + }; } // namespace shogun diff --git a/src/shogun/labels/LabelEncoder.h b/src/shogun/labels/LabelEncoder.h index 48f937343d2..8f5ef132df5 100644 --- a/src/shogun/labels/LabelEncoder.h +++ b/src/shogun/labels/LabelEncoder.h @@ -59,6 +59,9 @@ namespace shogun return "LabelEncoder"; } + void set_print_warning(bool print_warning){ + print_warning = print_warning; + } protected: SGVector fit_impl(const SGVector& origin_vector) { @@ -98,6 +101,22 @@ namespace shogun }); return original_vector; } + + bool can_convert_float_to_int(const SGVector& vec) const + { + SGVector converted(vec.vlen); + std::transform( + vec.begin(), vec.end(), converted.begin(), + [](auto&& e) { return static_cast(e); }); + return std::equal( + vec.begin(), vec.end(), converted.begin(), + [](auto&& e1, auto&& e2) { + return std::abs(e1 - e2) < + std::numeric_limits::epsilon(); + }); + } + + bool print_warning = true; std::set unique_labels; std::unordered_map normalized_to_origin; }; diff --git 
a/src/shogun/labels/MulticlassLabelsEncoder.h b/src/shogun/labels/MulticlassLabelsEncoder.h index 198b8118bf2..02fc05b269e 100644 --- a/src/shogun/labels/MulticlassLabelsEncoder.h +++ b/src/shogun/labels/MulticlassLabelsEncoder.h @@ -32,6 +32,14 @@ namespace shogun SGVector fit(const std::shared_ptr& labs) override { const auto result_vector = labs->as()->get_labels(); + if (print_warning && !can_convert_float_to_int(result_vector)) + { + std::set s(result_vector.begin(), result_vector.end()); + io::warn( + "{} have been converted to 0...{}", + fmt::join(s, ", "), + result_vector.vlen - 1); + } return fit_impl(result_vector); } /** Transform labels to normalized encoding. From 11f8db6a8153744888a10ec63c0ec134b9b660d0 Mon Sep 17 00:00:00 2001 From: LiuYuHui Date: Wed, 24 Jun 2020 10:19:44 +0800 Subject: [PATCH 6/8] delete print_warning --- src/shogun/labels/BinaryLabelEncoder.h | 2 +- src/shogun/labels/LabelEncoder.h | 4 ---- src/shogun/labels/MulticlassLabelsEncoder.h | 4 ++-- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/shogun/labels/BinaryLabelEncoder.h b/src/shogun/labels/BinaryLabelEncoder.h index 27e4b0ccce1..fd7be6c356e 100644 --- a/src/shogun/labels/BinaryLabelEncoder.h +++ b/src/shogun/labels/BinaryLabelEncoder.h @@ -33,7 +33,7 @@ namespace shogun { const auto result_vector = labs->as()->get_labels(); check_is_valid(result_vector); - if (print_warning && !can_convert_float_to_int(result_vector)) + if (!can_convert_float_to_int(result_vector)) { io::warn( "({}, {}) have been converted to (-1, 1).", diff --git a/src/shogun/labels/LabelEncoder.h b/src/shogun/labels/LabelEncoder.h index 8f5ef132df5..273ee992c53 100644 --- a/src/shogun/labels/LabelEncoder.h +++ b/src/shogun/labels/LabelEncoder.h @@ -59,9 +59,6 @@ namespace shogun return "LabelEncoder"; } - void set_print_warning(bool print_warning){ - print_warning = print_warning; - } protected: SGVector fit_impl(const SGVector& origin_vector) { @@ -116,7 +113,6 @@ namespace shogun }); } - 
bool print_warning = true; std::set unique_labels; std::unordered_map normalized_to_origin; }; diff --git a/src/shogun/labels/MulticlassLabelsEncoder.h b/src/shogun/labels/MulticlassLabelsEncoder.h index 02fc05b269e..ffc6382ec95 100644 --- a/src/shogun/labels/MulticlassLabelsEncoder.h +++ b/src/shogun/labels/MulticlassLabelsEncoder.h @@ -32,11 +32,11 @@ namespace shogun SGVector fit(const std::shared_ptr& labs) override { const auto result_vector = labs->as()->get_labels(); - if (print_warning && !can_convert_float_to_int(result_vector)) + if (!can_convert_float_to_int(result_vector)) { std::set s(result_vector.begin(), result_vector.end()); io::warn( - "{} have been converted to 0...{}", + "({}) have been converted to (0...{})", fmt::join(s, ", "), result_vector.vlen - 1); } From 7c2f7d41589927b3f2249d1ae2505bbaf214fab8 Mon Sep 17 00:00:00 2001 From: LiuYuHui Date: Tue, 30 Jun 2020 11:29:05 +0800 Subject: [PATCH 7/8] add special treatment for contiguous labels --- src/shogun/labels/BinaryLabelEncoder.h | 59 ++++++++------------- src/shogun/labels/LabelEncoder.h | 45 ++++++++++++---- src/shogun/labels/MulticlassLabelsEncoder.h | 56 ++++++++++--------- tests/unit/labels/LabelsEncoder_unittest.cc | 28 ++++++++++ 4 files changed, 115 insertions(+), 73 deletions(-) diff --git a/src/shogun/labels/BinaryLabelEncoder.h b/src/shogun/labels/BinaryLabelEncoder.h index fd7be6c356e..8a51a4c256c 100644 --- a/src/shogun/labels/BinaryLabelEncoder.h +++ b/src/shogun/labels/BinaryLabelEncoder.h @@ -16,7 +16,9 @@ #include namespace shogun { - + /** @brief Implements a reversible mapping from + * any form of labels to binary labels (+1, -1). + */ class BinaryLabelEncoder : public LabelEncoder { public: @@ -24,29 +26,22 @@ namespace shogun ~BinaryLabelEncoder() = default; - /** Fit label encoder - * - * @param Target values. - * @return SGVector which contains unique labels. 
- */ SGVector fit(const std::shared_ptr& labs) override { const auto result_vector = labs->as()->get_labels(); check_is_valid(result_vector); if (!can_convert_float_to_int(result_vector)) { + std::set s( + result_vector.begin(), result_vector.end()); io::warn( - "({}, {}) have been converted to (-1, 1).", - result_vector[0], result_vector[1]); + "({}, {}) have been converted to (-1, 1).", *s.begin(), + *s.end()); } auto result_labels = fit_impl(result_vector); return result_labels; } - /** Transform labels to normalized encoding. - * - * @param Target values to be transformed. - * @return Labels transformed to be normalized. - */ + std::shared_ptr transform(const std::shared_ptr& labs) override { @@ -57,34 +52,29 @@ namespace shogun std::transform( transformed_vec.begin(), transformed_vec.end(), transformed_vec.begin(), [](float64_t e) { - if (std::abs(e - 0.0) <= - std::numeric_limits::epsilon()) - return -1.0; - else - return e; + return Math::fequals( + e, 0.0, + std::numeric_limits::epsilon()) + ? -1.0 + : e; }); return std::make_shared(transformed_vec); } - /** Transform labels back to original encoding. - * - * @param normailzed encoding labels - * @return original encoding labels - */ + std::shared_ptr inverse_transform(const std::shared_ptr& labs) override { auto normalized_labels = labs->as(); normalized_labels->ensure_valid(); auto normalized_vector = normalized_labels->get_labels(); - check_is_valid(normalized_vector); std::transform( normalized_vector.begin(), normalized_vector.end(), normalized_vector.begin(), [](float64_t e) { - if (std::abs(e + 1.0) <= - std::numeric_limits::epsilon()) - return 0.0; - else - return e; + return Math::fequals( + e, -1.0, + std::numeric_limits::epsilon()) + ? 
0.0 + : e; }); auto origin_vec = inverse_transform_impl(normalized_vector); SGVector result_vev(origin_vec.vlen); @@ -93,11 +83,7 @@ namespace shogun [](auto&& e) { return static_cast(e); }); return std::make_shared(result_vev); } - /** Fit label encoder and return encoded labels. - * - * @param Target values. - * @return Labels transformed to be normalized. - */ + std::shared_ptr fit_transform(const std::shared_ptr& labs) override { @@ -118,11 +104,10 @@ namespace shogun std::unordered_set(vec.begin(), vec.end()); require( unique_set.size() == 2, - "Binary labels should contain only two elements, can not interpret ({}) as binary labels", + "Cannot interpret ({}) as binary labels, need exactly two " + "classes.", fmt::join(unique_set, ", ")); } - - }; } // namespace shogun diff --git a/src/shogun/labels/LabelEncoder.h b/src/shogun/labels/LabelEncoder.h index 273ee992c53..b1dffe7b320 100644 --- a/src/shogun/labels/LabelEncoder.h +++ b/src/shogun/labels/LabelEncoder.h @@ -16,7 +16,10 @@ #include namespace shogun { - + /** @brief Implements a reversible mapping from any + * form of labels to one of Shogun's target label spaces + * (binary, multi-class, etc). 
+ */ class LabelEncoder : public SGObject { public: @@ -60,11 +63,21 @@ namespace shogun } protected: + virtual bool check_is_contiguous( + const SGVector& vec, const std::set& labels) + { + return false; + } SGVector fit_impl(const SGVector& origin_vector) { + is_fitted = true; std::copy( origin_vector.begin(), origin_vector.end(), std::inserter(unique_labels, unique_labels.begin())); + if (check_is_contiguous(origin_vector, unique_labels)) + { + is_fitted = false; + } return SGVector( unique_labels.begin(), unique_labels.end()); } @@ -72,15 +85,18 @@ namespace shogun SGVector transform_impl(const SGVector& result_vector) { + is_transformed = true; + if (!is_fitted && unique_labels.size()) + return result_vector; + require(is_fitted, "Transform expect to be called after fit."); SGVector converted(result_vector.vlen); std::transform( result_vector.begin(), result_vector.end(), converted.begin(), [& unique_labels = unique_labels, - &normalized_to_origin = - normalized_to_origin](const auto& old_label) { + &inverse_mapping = inverse_mapping](const auto& old_label) { auto new_label = std::distance( unique_labels.begin(), unique_labels.find(old_label)); - normalized_to_origin[new_label] = old_label; + inverse_mapping[new_label] = old_label; return new_label; }); return converted; @@ -89,12 +105,19 @@ namespace shogun SGVector inverse_transform_impl(const SGVector& result_vector) { + require( + is_transformed, + "Inverse transform expect to be called after transform."); + if (!is_fitted && unique_labels.size() && is_transformed) + { + return result_vector; + } SGVector original_vector(result_vector.vlen); std::transform( result_vector.begin(), result_vector.end(), original_vector.begin(), - [& normalized_to_origin = normalized_to_origin](const auto& e) { - return normalized_to_origin[e]; + [& inverse_mapping = inverse_mapping](const auto& e) { + return inverse_mapping[e]; }); return original_vector; } @@ -107,14 +130,16 @@ namespace shogun [](auto&& e) { return 
static_cast(e); }); return std::equal( vec.begin(), vec.end(), converted.begin(), - [](auto&& e1, auto&& e2) { - return std::abs(e1 - e2) < - std::numeric_limits::epsilon(); + [&](auto&& e1, auto&& e2) { + return Math::fequals(e1, static_cast(e2), eps); }); } std::set unique_labels; - std::unordered_map normalized_to_origin; + std::unordered_map inverse_mapping; + const float64_t eps = std::numeric_limits::epsilon(); + bool is_fitted = false; + bool is_transformed = false; }; } // namespace shogun diff --git a/src/shogun/labels/MulticlassLabelsEncoder.h b/src/shogun/labels/MulticlassLabelsEncoder.h index ffc6382ec95..aa8a8e75f3a 100644 --- a/src/shogun/labels/MulticlassLabelsEncoder.h +++ b/src/shogun/labels/MulticlassLabelsEncoder.h @@ -16,7 +16,9 @@ namespace shogun { - + /** @brief Implements a reversible mapping from + * any form of labels to multi-class labels. + */ class MulticlassLabelsEncoder : public LabelEncoder { public: @@ -24,29 +26,20 @@ namespace shogun ~MulticlassLabelsEncoder() = default; - /** Fit label encoder - * - * @param Target values. - * @return SGVector which contains unique labels. - */ SGVector fit(const std::shared_ptr& labs) override { const auto result_vector = labs->as()->get_labels(); if (!can_convert_float_to_int(result_vector)) - { - std::set s(result_vector.begin(), result_vector.end()); + { + std::set s( + result_vector.begin(), result_vector.end()); io::warn( - "({}) have been converted to (0...{})", - fmt::join(s, ", "), - result_vector.vlen - 1); + "({}) have been converted to (0...{})", fmt::join(s, ", "), + result_vector.vlen - 1); } return fit_impl(result_vector); } - /** Transform labels to normalized encoding. - * - * @param Target values to be transformed. - * @return Labels transformed to be normalized. - */ + std::shared_ptr transform(const std::shared_ptr& labs) override { @@ -54,11 +47,7 @@ namespace shogun return std::make_shared( transform_impl(result_vector)); } - /** Transform labels back to original encoding. 
- * - * @param normailzed encoding labels - * @return original encoding labels - */ + std::shared_ptr inverse_transform(const std::shared_ptr& labs) override { @@ -66,11 +55,7 @@ namespace shogun return std::make_shared( inverse_transform_impl(normalized_vector)); } - /** Fit label encoder and return encoded labels. - * - * @param Target values. - * @return Labels transformed to be normalized. - */ + std::shared_ptr fit_transform(const std::shared_ptr& labs) override { @@ -83,6 +68,25 @@ namespace shogun { return "MulticlassLabelsEncoder"; } + + protected: + bool check_is_contiguous( + const SGVector& vec, + const std::set& unique_labels) override + { + if (const auto vlen = unique_labels.size() == vec.size()) + { + const auto [min_v, max_v] = std::minmax_element( + unique_labels.begin(), unique_labels.end()); + if (Math::fequals(*min_v, 0.0, eps) && + Math::fequals( + *max_v, static_cast(vlen - 1), eps)) + { + return true; + } + } + return false; + } }; } // namespace shogun diff --git a/tests/unit/labels/LabelsEncoder_unittest.cc b/tests/unit/labels/LabelsEncoder_unittest.cc index 18813be831f..c2e50a81dd2 100644 --- a/tests/unit/labels/LabelsEncoder_unittest.cc +++ b/tests/unit/labels/LabelsEncoder_unittest.cc @@ -168,3 +168,31 @@ TEST(MulticlassLabelsEncoder, negative_labels) EXPECT_NEAR(expected_inv[i], inv_test[i], eps); } } + +TEST(MulticlassLabelsEncoder, contiguous_labels) +{ + auto eps = std::numeric_limits::epsilon(); + auto label_encoder = std::make_shared(); + SGVector vec{0, 1, 2, 3, 4, 5}; + auto origin_labels = std::make_shared(vec); + auto unique_vec = label_encoder->fit(origin_labels); + for (int i = 0; i < 6; i++) + { + EXPECT_NEAR(vec[i], unique_vec[i], eps); + } + + auto result_labels = label_encoder->transform(origin_labels); + auto result_vec = result_labels->as()->get_labels(); + for (int i = 0; i < 6; i++) + { + EXPECT_NEAR(vec[i], result_vec[i], eps); + } + + auto inv_labels = label_encoder->inverse_transform(result_labels); + auto inv_vec 
= inv_labels->as()->get_labels(); + + for (int i = 0; i < 6; i++) + { + EXPECT_NEAR(vec[i], inv_vec[i], eps); + } +} \ No newline at end of file From b2eb4d58d1c9f54d6424947504883732f25ac233 Mon Sep 17 00:00:00 2001 From: LiuYuHui Date: Wed, 1 Jul 2020 17:01:19 +0800 Subject: [PATCH 8/8] create mapping in fit --- src/shogun/labels/BinaryLabelEncoder.h | 3 +- src/shogun/labels/LabelEncoder.h | 41 +++++++++++++-------- src/shogun/labels/MulticlassLabelsEncoder.h | 4 +- 3 files changed, 27 insertions(+), 21 deletions(-) diff --git a/src/shogun/labels/BinaryLabelEncoder.h b/src/shogun/labels/BinaryLabelEncoder.h index 8a51a4c256c..78cffa107d0 100644 --- a/src/shogun/labels/BinaryLabelEncoder.h +++ b/src/shogun/labels/BinaryLabelEncoder.h @@ -38,8 +38,7 @@ namespace shogun "({}, {}) have been converted to (-1, 1).", *s.begin(), *s.end()); } - auto result_labels = fit_impl(result_vector); - return result_labels; + return fit_impl(result_vector); } std::shared_ptr diff --git a/src/shogun/labels/LabelEncoder.h b/src/shogun/labels/LabelEncoder.h index b1dffe7b320..c5ae4b6a346 100644 --- a/src/shogun/labels/LabelEncoder.h +++ b/src/shogun/labels/LabelEncoder.h @@ -63,21 +63,34 @@ namespace shogun } protected: - virtual bool check_is_contiguous( - const SGVector& vec, const std::set& labels) + virtual bool check_is_contiguous(const SGVector& vec) { return false; } + + void create_mapping(const SGVector& origin_vector) + { + std::for_each( + origin_vector.begin(), origin_vector.end(), + [this](const auto& old_label) { + auto new_label = std::distance( + unique_labels.begin(), unique_labels.find(old_label)); + inverse_mapping[new_label] = old_label; + mapping[old_label] = new_label; + }); + } + SGVector fit_impl(const SGVector& origin_vector) { is_fitted = true; std::copy( origin_vector.begin(), origin_vector.end(), std::inserter(unique_labels, unique_labels.begin())); - if (check_is_contiguous(origin_vector, unique_labels)) + if (check_is_contiguous(origin_vector)) { is_fitted 
= false; } + create_mapping(origin_vector); return SGVector( unique_labels.begin(), unique_labels.end()); } @@ -85,19 +98,14 @@ namespace shogun SGVector transform_impl(const SGVector& result_vector) { - is_transformed = true; if (!is_fitted && unique_labels.size()) return result_vector; require(is_fitted, "Transform expect to be called after fit."); SGVector converted(result_vector.vlen); std::transform( result_vector.begin(), result_vector.end(), converted.begin(), - [& unique_labels = unique_labels, - &inverse_mapping = inverse_mapping](const auto& old_label) { - auto new_label = std::distance( - unique_labels.begin(), unique_labels.find(old_label)); - inverse_mapping[new_label] = old_label; - return new_label; + [& mapping = mapping](const auto& old_label) { + return mapping[old_label]; }); return converted; } @@ -105,13 +113,12 @@ namespace shogun SGVector inverse_transform_impl(const SGVector& result_vector) { - require( - is_transformed, - "Inverse transform expect to be called after transform."); - if (!is_fitted && unique_labels.size() && is_transformed) + if (!is_fitted && unique_labels.size()) { return result_vector; } + require( + is_fitted, "Inverse transform expect to be called after fit."); SGVector original_vector(result_vector.vlen); std::transform( result_vector.begin(), result_vector.end(), @@ -136,10 +143,12 @@ namespace shogun } std::set unique_labels; + + std::unordered_map mapping; std::unordered_map inverse_mapping; - const float64_t eps = std::numeric_limits::epsilon(); + static constexpr float64_t eps = + std::numeric_limits::epsilon(); bool is_fitted = false; - bool is_transformed = false; }; } // namespace shogun diff --git a/src/shogun/labels/MulticlassLabelsEncoder.h b/src/shogun/labels/MulticlassLabelsEncoder.h index aa8a8e75f3a..9948b625cd2 100644 --- a/src/shogun/labels/MulticlassLabelsEncoder.h +++ b/src/shogun/labels/MulticlassLabelsEncoder.h @@ -70,9 +70,7 @@ namespace shogun } protected: - bool check_is_contiguous( - const 
SGVector& vec, - const std::set& unique_labels) override + bool check_is_contiguous(const SGVector& vec) override { if (const auto vlen = unique_labels.size() == vec.size()) {