-
-
Notifications
You must be signed in to change notification settings - Fork 1k
Add label encoder #5067
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add label encoder #5067
Changes from 7 commits
ea60ba6
53e7e3f
266cfc4
aea04e7
145f093
11f8db6
7c2f7d4
b2eb4d5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
/* | ||
* This software is distributed under BSD 3-clause license (see LICENSE file). | ||
* | ||
* Authors: Yuhui Liu | ||
* | ||
*/ | ||
#ifndef _BINARYLABELENCODER__H__ | ||
#define _BINARYLABELENCODER__H__ | ||
|
||
#include <memory> | ||
#include <shogun/base/SGObject.h> | ||
#include <shogun/labels/BinaryLabels.h> | ||
#include <shogun/labels/DenseLabels.h> | ||
#include <shogun/labels/LabelEncoder.h> | ||
#include <shogun/lib/SGVector.h> | ||
#include <unordered_set> | ||
namespace shogun | ||
{ | ||
/** @brief Implements a reversible mapping from | ||
* any form of labels to binary labels (+1, -1). | ||
*/ | ||
class BinaryLabelEncoder : public LabelEncoder | ||
{ | ||
public: | ||
BinaryLabelEncoder() = default; | ||
|
||
~BinaryLabelEncoder() = default; | ||
|
||
SGVector<float64_t> fit(const std::shared_ptr<Labels>& labs) override | ||
{ | ||
const auto result_vector = labs->as<DenseLabels>()->get_labels(); | ||
check_is_valid(result_vector); | ||
if (!can_convert_float_to_int(result_vector)) | ||
{ | ||
std::set<float64_t> s( | ||
result_vector.begin(), result_vector.end()); | ||
io::warn( | ||
"({}, {}) have been converted to (-1, 1).", *s.begin(), | ||
*s.end()); | ||
} | ||
auto result_labels = fit_impl(result_vector); | ||
return result_labels; | ||
LiuYuHui marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
std::shared_ptr<Labels> | ||
transform(const std::shared_ptr<Labels>& labs) override | ||
{ | ||
const auto result_vector = labs->as<DenseLabels>()->get_labels(); | ||
check_is_valid(result_vector); | ||
auto transformed_vec = transform_impl(result_vector); | ||
|
||
std::transform( | ||
transformed_vec.begin(), transformed_vec.end(), | ||
transformed_vec.begin(), [](float64_t e) { | ||
return Math::fequals( | ||
e, 0.0, | ||
std::numeric_limits<float64_t>::epsilon()) | ||
? -1.0 | ||
: e; | ||
}); | ||
return std::make_shared<BinaryLabels>(transformed_vec); | ||
} | ||
|
||
std::shared_ptr<Labels> | ||
inverse_transform(const std::shared_ptr<Labels>& labs) override | ||
{ | ||
auto normalized_labels = labs->as<BinaryLabels>(); | ||
normalized_labels->ensure_valid(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what does this do? The same as the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, I should remove |
||
auto normalized_vector = normalized_labels->get_labels(); | ||
std::transform( | ||
normalized_vector.begin(), normalized_vector.end(), | ||
normalized_vector.begin(), [](float64_t e) { | ||
return Math::fequals( | ||
e, -1.0, | ||
std::numeric_limits<float64_t>::epsilon()) | ||
? 0.0 | ||
: e; | ||
}); | ||
auto origin_vec = inverse_transform_impl(normalized_vector); | ||
SGVector<int32_t> result_vev(origin_vec.vlen); | ||
std::transform( | ||
origin_vec.begin(), origin_vec.end(), result_vev.begin(), | ||
[](auto&& e) { return static_cast<int32_t>(e); }); | ||
return std::make_shared<BinaryLabels>(result_vev); | ||
} | ||
|
||
std::shared_ptr<Labels> | ||
fit_transform(const std::shared_ptr<Labels>& labs) override | ||
LiuYuHui marked this conversation as resolved.
Show resolved
Hide resolved
|
||
{ | ||
const auto result_vector = labs->as<DenseLabels>()->get_labels(); | ||
return std::make_shared<BinaryLabels>( | ||
transform_impl(fit_impl(result_vector))); | ||
} | ||
|
||
virtual const char* get_name() const | ||
{ | ||
return "BinaryLabelEncoder"; | ||
} | ||
|
||
private: | ||
void check_is_valid(const SGVector<float64_t>& vec) | ||
{ | ||
const auto unique_set = | ||
std::unordered_set<float64_t>(vec.begin(), vec.end()); | ||
require( | ||
unique_set.size() == 2, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @gf712 I think it might not be good to assert this as sometimes labels might only contain one class. But I guess this will pop up if a problem and we can change it then :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm, in what situation would there only be one label? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When predicting, although I am not sure this is ever called in that case. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. seems like this is only called in fit and transform, so should be fine There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yep + 1 and if it becomes a problem we just change it |
||
"Cannot interpret ({}) as binary labels, need exactly two " | ||
"classes.", | ||
fmt::join(unique_set, ", ")); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. neat! Didn't know this was a thing :D There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: "Cannot interpret {} as binary labels" There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. typo can not -> cannot |
||
} | ||
}; | ||
} // namespace shogun | ||
|
||
#endif |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,146 @@ | ||||||
/* | ||||||
* This software is distributed under BSD 3-clause license (see LICENSE file). | ||||||
* | ||||||
* Authors: Yuhui Liu | ||||||
* | ||||||
*/ | ||||||
|
||||||
#ifndef _LABELENCODER__H__ | ||||||
#define _LABELENCODER__H__ | ||||||
|
||||||
#include <algorithm>
#include <limits>
#include <map>
#include <memory>
#include <set>
#include <unordered_map>
#include <shogun/base/SGObject.h>
#include <shogun/lib/SGVector.h>
namespace shogun | ||||||
{ | ||||||
/** @brief Implements a reversible mapping from any | ||||||
* form of labels to one of Shogun's target label spaces | ||||||
* (binary, multi-class, etc). | ||||||
*/ | ||||||
class LabelEncoder : public SGObject | ||||||
LiuYuHui marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
{ | ||||||
public: | ||||||
LabelEncoder() = default; | ||||||
|
||||||
virtual ~LabelEncoder() = default; | ||||||
|
||||||
/** Fit label encoder | ||||||
* | ||||||
* @param Target values. | ||||||
* @return SGVector which contains unique labels. | ||||||
*/ | ||||||
virtual SGVector<float64_t> | ||||||
fit(const std::shared_ptr<Labels>& labs) = 0; | ||||||
/** Transform labels to normalized encoding. | ||||||
* | ||||||
* @param Target values to be transformed. | ||||||
* @return Labels transformed to be normalized. | ||||||
*/ | ||||||
virtual std::shared_ptr<Labels> | ||||||
transform(const std::shared_ptr<Labels>& labs) = 0; | ||||||
/** Transform labels back to original encoding. | ||||||
* | ||||||
* @param normailzed encoding labels | ||||||
* @return original encoding labels | ||||||
*/ | ||||||
virtual std::shared_ptr<Labels> | ||||||
inverse_transform(const std::shared_ptr<Labels>&) = 0; | ||||||
|
||||||
/** Fit label encoder and return encoded labels. | ||||||
* | ||||||
* @param Target values. | ||||||
* @return Labels transformed to be normalized. | ||||||
*/ | ||||||
virtual std::shared_ptr<Labels> | ||||||
fit_transform(const std::shared_ptr<Labels>&) = 0; | ||||||
|
||||||
virtual const char* get_name() const | ||||||
{ | ||||||
return "LabelEncoder"; | ||||||
} | ||||||
|
||||||
protected: | ||||||
virtual bool check_is_contiguous( | ||||||
const SGVector<float64_t>& vec, const std::set<float64_t>& labels) | ||||||
{ | ||||||
return false; | ||||||
} | ||||||
SGVector<float64_t> fit_impl(const SGVector<float64_t>& origin_vector) | ||||||
{ | ||||||
is_fitted = true; | ||||||
std::copy( | ||||||
origin_vector.begin(), origin_vector.end(), | ||||||
std::inserter(unique_labels, unique_labels.begin())); | ||||||
if (check_is_contiguous(origin_vector, unique_labels)) | ||||||
{ | ||||||
is_fitted = false; | ||||||
} | ||||||
return SGVector<float64_t>( | ||||||
unique_labels.begin(), unique_labels.end()); | ||||||
} | ||||||
|
||||||
SGVector<float64_t> | ||||||
transform_impl(const SGVector<float64_t>& result_vector) | ||||||
{ | ||||||
is_transformed = true; | ||||||
if (!is_fitted && unique_labels.size()) | ||||||
return result_vector; | ||||||
require(is_fitted, "Transform expect to be called after fit."); | ||||||
SGVector<float64_t> converted(result_vector.vlen); | ||||||
std::transform( | ||||||
result_vector.begin(), result_vector.end(), converted.begin(), | ||||||
[& unique_labels = unique_labels, | ||||||
&inverse_mapping = inverse_mapping](const auto& old_label) { | ||||||
auto new_label = std::distance( | ||||||
unique_labels.begin(), unique_labels.find(old_label)); | ||||||
inverse_mapping[new_label] = old_label; | ||||||
return new_label; | ||||||
}); | ||||||
return converted; | ||||||
} | ||||||
|
||||||
SGVector<float64_t> | ||||||
inverse_transform_impl(const SGVector<float64_t>& result_vector) | ||||||
{ | ||||||
require( | ||||||
is_transformed, | ||||||
"Inverse transform expect to be called after transform."); | ||||||
if (!is_fitted && unique_labels.size() && is_transformed) | ||||||
{ | ||||||
return result_vector; | ||||||
} | ||||||
SGVector<float64_t> original_vector(result_vector.vlen); | ||||||
std::transform( | ||||||
result_vector.begin(), result_vector.end(), | ||||||
original_vector.begin(), | ||||||
[& inverse_mapping = inverse_mapping](const auto& e) { | ||||||
return inverse_mapping[e]; | ||||||
}); | ||||||
return original_vector; | ||||||
} | ||||||
|
||||||
bool can_convert_float_to_int(const SGVector<float64_t>& vec) const | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder whether this should be templated (for both float and int type) and live somewhere where other conversion tools (safe_convert) live...this might be useful elesewhere There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, ideally this would be templated |
||||||
{ | ||||||
SGVector<int32_t> converted(vec.vlen); | ||||||
std::transform( | ||||||
vec.begin(), vec.end(), converted.begin(), | ||||||
[](auto&& e) { return static_cast<int32_t>(e); }); | ||||||
return std::equal( | ||||||
vec.begin(), vec.end(), converted.begin(), | ||||||
[&](auto&& e1, auto&& e2) { | ||||||
return Math::fequals(e1, static_cast<float64_t>(e2), eps); | ||||||
}); | ||||||
} | ||||||
|
||||||
std::set<float64_t> unique_labels; | ||||||
std::unordered_map<float64_t, float64_t> inverse_mapping; | ||||||
const float64_t eps = std::numeric_limits<float64_t>::epsilon(); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. when I changed There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ah yes, it should be |
||||||
bool is_fitted = false; | ||||||
bool is_transformed = false; | ||||||
}; | ||||||
} // namespace shogun | ||||||
|
||||||
#endif |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
/* | ||
* This software is distributed under BSD 3-clause license (see LICENSE file). | ||
* | ||
* Authors: Yuhui Liu | ||
* | ||
*/ | ||
#ifndef _MulticlassLabelsEncoder__H__ | ||
#define _MulticlassLabelsEncoder__H__ | ||
|
||
#include <memory> | ||
#include <shogun/base/SGObject.h> | ||
#include <shogun/labels/DenseLabels.h> | ||
#include <shogun/labels/LabelEncoder.h> | ||
#include <shogun/labels/MulticlassLabels.h> | ||
#include <shogun/lib/SGVector.h> | ||
|
||
namespace shogun | ||
{ | ||
/** @brief Implements a reversible mapping from | ||
* any form of labels to multi-class labels. | ||
*/ | ||
class MulticlassLabelsEncoder : public LabelEncoder | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same comments as for binary labels class |
||
{ | ||
public: | ||
MulticlassLabelsEncoder() = default; | ||
|
||
~MulticlassLabelsEncoder() = default; | ||
|
||
SGVector<float64_t> fit(const std::shared_ptr<Labels>& labs) override | ||
{ | ||
const auto result_vector = labs->as<DenseLabels>()->get_labels(); | ||
if (!can_convert_float_to_int(result_vector)) | ||
{ | ||
std::set<float64_t> s( | ||
result_vector.begin(), result_vector.end()); | ||
io::warn( | ||
"({}) have been converted to (0...{})", fmt::join(s, ", "), | ||
result_vector.vlen - 1); | ||
} | ||
return fit_impl(result_vector); | ||
} | ||
|
||
std::shared_ptr<Labels> | ||
transform(const std::shared_ptr<Labels>& labs) override | ||
{ | ||
const auto result_vector = labs->as<DenseLabels>()->get_labels(); | ||
return std::make_shared<MulticlassLabels>( | ||
transform_impl(result_vector)); | ||
} | ||
|
||
std::shared_ptr<Labels> | ||
inverse_transform(const std::shared_ptr<Labels>& labs) override | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. don't we also need a check valid here? Something that ensures that the labels are contiguous? [0,1,2,3,4, ... ] no gaps. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am thinking whether we need a check valid here, as inverse_transform is to map from internal encoding to origin encoding. for example, {100, 100, 200, 300} -> {0, 0, 1, 2}, {0, 0, 1, 2} are transformed by internal encoding, but it is not continuous There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ah you are right of course :) |
||
{ | ||
auto normalized_vector = labs->as<DenseLabels>()->get_labels(); | ||
return std::make_shared<MulticlassLabels>( | ||
inverse_transform_impl(normalized_vector)); | ||
} | ||
|
||
std::shared_ptr<Labels> | ||
fit_transform(const std::shared_ptr<Labels>& labs) override | ||
{ | ||
const auto result_vector = labs->as<DenseLabels>()->get_labels(); | ||
return std::make_shared<MulticlassLabels>( | ||
transform_impl(fit_impl(result_vector))); | ||
} | ||
|
||
virtual const char* get_name() const | ||
{ | ||
return "MulticlassLabelsEncoder"; | ||
} | ||
|
||
protected: | ||
bool check_is_contiguous( | ||
const SGVector<float64_t>& vec, | ||
const std::set<float64_t>& unique_labels) override | ||
{ | ||
if (const auto vlen = unique_labels.size() == vec.size()) | ||
{ | ||
const auto [min_v, max_v] = std::minmax_element( | ||
unique_labels.begin(), unique_labels.end()); | ||
if (Math::fequals(*min_v, 0.0, eps) && | ||
Math::fequals( | ||
*max_v, static_cast<float64_t>(vlen - 1), eps)) | ||
{ | ||
return true; | ||
} | ||
} | ||
return false; | ||
} | ||
}; | ||
} // namespace shogun | ||
|
||
#endif |
Uh oh!
There was an error while loading. Please reload this page.