Skip to content
This repository was archived by the owner on Feb 26, 2025. It is now read-only.

Commit e2f66a4

Browse files
authored
backport: Allow reading not-quite null-terminated strings. (#1059)
* backport: Allow reading not-quite null-terminated strings. For fixed-length strings that have a padding mode that suggests they're null-terminated are in fact not required to be null-terminated, we (silently) fail to read the last character. Since, HDF5 will create such string, h5dump will display the file and h5py will read the string; we opted to allow it and also read the string into and `std::string` (which is variable length and already not guaranteed to have the same length as the fixed length string). HighFive will continue to not allow writing null-terminated strings that aren't via `write`. * backport: Fix off-by-one in char_buffer_length. Backports: (#1056), ( #1060)
1 parent 0cd16e0 commit e2f66a4

File tree

3 files changed

+51
-19
lines changed

3 files changed

+51
-19
lines changed

include/highfive/H5DataType.hpp

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -153,16 +153,18 @@ class FixedLengthStringType: public StringType {
153153
/// UTF8. In particular, a string with `n` UFT8 characters in general
154154
/// requires `4*n` bytes.
155155
///
156-
/// The string padding is subtle, essentially it's just a hint. A
157-
/// null-terminated string is guaranteed to have one `'\0'` which marks the
158-
/// semantic end of the string. The length of the buffer must be at least
159-
/// `size` bytes regardless. HDF5 will read or write `size` bytes,
160-
/// irrespective of the when the `\0` occurs.
161-
///
162-
/// Note that when writing passing `StringPadding::NullTerminated` is a
156+
/// The string padding is subtle, essentially it's just a hint. While
157+
/// commonly, a null-terminated string is guaranteed to have one `'\0'`
158+
/// which marks the semantic end of the string, this is not enforced by
159+
/// HDF5. In fact, there are HDF5 files that contain strings that claim to
160+
/// be null-terminated but aren't. The length of the buffer must be at
161+
/// least `size` bytes regardless of the padding. HDF5 will read or write
162+
/// `size` bytes, irrespective of when (if at all) the `\0` occurs.
163+
///
164+
/// Note that when writing, passing `StringPadding::NullTerminated` is a
163165
/// guarantee to the reader that it contains a `\0`. Therefore, make sure
164-
/// that the string really is nullterminated. Otherwise prefer a
165-
/// null-padded string which only means states that the buffer is filled up
166+
/// that the string really is null-terminated. Otherwise prefer a
167+
/// null-padded string. This mearly states that the buffer is filled up
166168
/// with 0 or more `\0`.
167169
FixedLengthStringType(size_t size,
168170
StringPadding padding,

include/highfive/bits/H5Converter_misc.hpp

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -103,14 +103,14 @@ enum class BufferMode { Read, Write };
103103
///
104104
/// \brief String length in bytes excluding the `\0`.
105105
///
106-
inline size_t char_buffer_size(char const* const str, size_t max_string_length) {
107-
for (size_t i = 0; i <= max_string_length; ++i) {
106+
inline size_t char_buffer_length(char const* const str, size_t max_string_size) {
107+
for (size_t i = 0; i < max_string_size; ++i) {
108108
if (str[i] == '\0') {
109109
return i;
110110
}
111111
}
112112

113-
return max_string_length;
113+
return max_string_size;
114114
}
115115

116116

@@ -190,7 +190,7 @@ struct StringBuffer {
190190
} else if (buffer.isFixedLengthString()) {
191191
// If the buffer is fixed-length and null-terminated, then
192192
// `buffer.string_length` doesn't include the null-character.
193-
if (length > buffer.string_length) {
193+
if (length > buffer.string_max_length) {
194194
throw std::invalid_argument("String length too big.");
195195
}
196196

@@ -229,9 +229,9 @@ struct StringBuffer {
229229
/// `length() + 1` bytes long.
230230
size_t length() const {
231231
if (buffer.isNullTerminated()) {
232-
return char_buffer_size(data(), buffer.string_length);
232+
return char_buffer_length(data(), buffer.string_size);
233233
} else {
234-
return buffer.string_length;
234+
return buffer.string_max_length;
235235
}
236236
}
237237

@@ -272,7 +272,7 @@ struct StringBuffer {
272272
: file_datatype(_file_datatype.asStringType())
273273
, padding(file_datatype.getPadding())
274274
, string_size(file_datatype.isVariableStr() ? size_t(-1) : file_datatype.getSize())
275-
, string_length(string_size - size_t(isNullTerminated()))
275+
, string_max_length(string_size - size_t(isNullTerminated()))
276276
, dims(_dims) {
277277
if (string_size == 0 && isNullTerminated()) {
278278
throw DataTypeException(
@@ -322,9 +322,11 @@ struct StringBuffer {
322322
private:
323323
StringType file_datatype;
324324
StringPadding padding;
325-
size_t string_size; // Size of buffer required to store the string.
326-
// Meaningful for fixed length strings only.
327-
size_t string_length; // Semantic length of string.
325+
// Size of buffer required to store the string.
326+
// Meaningful for fixed length strings only.
327+
size_t string_size;
328+
// Maximum length of string.
329+
size_t string_max_length;
328330
std::vector<size_t> dims;
329331

330332
std::vector<char> fixed_length_buffer;

tests/unit/tests_high_five_base.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2318,6 +2318,34 @@ void check_multiple_string(Proxy proxy, size_t string_length) {
23182318
}
23192319
}
23202320

2321+
template <class Proxy>
2322+
void check_supposedly_nullterm(Proxy proxy) {
2323+
auto dataspace = HighFive::DataSpace::Scalar();
2324+
auto datatype = HighFive::FixedLengthStringType(5, HighFive::StringPadding::NullTerminated);
2325+
proxy.create("not_null_terminated", dataspace, datatype);
2326+
auto obj = proxy.get("not_null_terminated");
2327+
2328+
// Creates a 5 byte, "null-terminated", fixed-length string. The first five
2329+
// bytes are filled with "GROUP". Clearly, this isn't null-terminated. However,
2330+
// h5py will read it back as "GROUP", HDF5 allows us to create these; and they're
2331+
// found in the wild.
2332+
std::string value = "GROUP";
2333+
obj.write_raw(value.c_str(), datatype);
2334+
2335+
auto actual = obj.template read<std::string>();
2336+
REQUIRE(actual == value);
2337+
}
2338+
2339+
TEST_CASE("HighFiveSTDString (attribute, nullterm cornercase)") {
2340+
auto file = HighFive::File("not_null_terminated_attribute.h5", HighFive::File::Truncate);
2341+
check_supposedly_nullterm(ForwardToAttribute(file));
2342+
}
2343+
2344+
TEST_CASE("HighFiveSTDString (dataset, nullterm cornercase)") {
2345+
auto file = HighFive::File("not_null_terminated_dataset.h5", HighFive::File::Truncate);
2346+
check_supposedly_nullterm(ForwardToDataSet(file));
2347+
}
2348+
23212349
TEST_CASE("HighFiveSTDString (dataset, single, short)") {
23222350
File file("std_string_dataset_single_short.h5", File::Truncate);
23232351
check_single_string(ForwardToDataSet(file), 3);

0 commit comments

Comments
 (0)