Skip to content

Commit c494495

Browse files
committed
UTF8 as default
1 parent 22cac66 commit c494495

File tree

11 files changed

+154
-43
lines changed

11 files changed

+154
-43
lines changed

README.md

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -336,9 +336,9 @@ Serialize has three overloads.
336336

337337
```csharp
338338
// Non-generic APIs are also available; those versions take a Type as the first argument and the value as object?
339-
byte[] Serialize<T>(in T? value)
340-
void Serialize<T, TBufferWriter>(in TBufferWriter bufferWriter, in T? value)
341-
async ValueTask SerializeAsync<T>(Stream stream, T? value, CancellationToken cancellationToken = default)
339+
byte[] Serialize<T>(in T? value, MemoryPackSerializeOptions? options = default)
340+
void Serialize<T, TBufferWriter>(in TBufferWriter bufferWriter, in T? value, MemoryPackSerializeOptions? options = default)
341+
async ValueTask SerializeAsync<T>(Stream stream, T? value, MemoryPackSerializeOptions? options = default, CancellationToken cancellationToken = default)
342342
```
343343

344344
For best performance, the recommended approach is to use `BufferWriter`. This serializes directly into the buffer. It can be applied to `PipeWriter` in `System.IO.Pipelines`, `BodyWriter` in ASP .NET Core, etc.
@@ -349,6 +349,16 @@ Note that `SerializeAsync` for `Stream` is asynchronous only for Flush; it seria
349349

350350
If you want to perform fully streaming writes, see the [Streaming Serialization](#streaming-serialization) section.
351351

352+
### MemoryPackSerializeOptions
353+
354+
`MemoryPackSerializeOptions` configures whether strings are serialized as UTF16 or UTF8. If null is passed, `MemoryPackSerializeOptions.Default` is used, which is the same as `MemoryPackSerializeOptions.Utf8`; in other words, strings are serialized as UTF8. If you want to serialize as UTF16, use `MemoryPackSerializeOptions.Utf16`.
355+
356+
Since C#'s internal string representation is UTF16, UTF16 performs better. However, the payload tends to be larger: in UTF8 an ASCII character is one byte, while in UTF16 it is two bytes. Because this difference in payload size is so large, UTF8 is the default.
357+
358+
If the data is mostly non-ASCII (e.g. Japanese, where a character can take 3 bytes or more in UTF8, making the UTF8 payload larger), or if you compress the payload separately anyway, UTF16 may give better results.
359+
360+
Whether UTF8 or UTF16 is selected during serialization, it is not necessary to specify it during deserialization. It will be automatically detected and deserialized normally.
361+
352362
Deserialize API
353363
---
354364
Deserialize has `ReadOnlySpan<byte>` and `ReadOnlySequence<byte>`, `Stream` overload and `ref` support.
@@ -473,10 +483,10 @@ Payload size depends on the target value; unlike JSON, there are no keys and it
473483

474484
For those with varint encoding, such as MessagePack and Protobuf, MemoryPack tends to be larger if ints are used a lot (in MemoryPack, ints are always 4 bytes due to fixed size encoding, while MsgPack is 1~5 bytes).
475485

476-
Also, strings are usually UTF8 for other formats, but MemoryPack is UTF16 fixed length (2 bytes), so MemoryPack is larger if the string occupies ASCII. Conversely, MemoryPack may be smaller if the string contains many UTF8 characters of 3 bytes or more, such as Japanese.
477-
478486
float and double are 4 bytes and 8 bytes in MemoryPack, but 5 bytes and 9 bytes in MsgPack. So MemoryPack is smaller, for example, for Vector3 (float, float, float) arrays.
479487

488+
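Strings are UTF8 by default, which is similar to other serializers. If the UTF16 option is chosen, the size characteristics differ: every character takes a fixed 2 bytes, so ASCII-heavy strings become larger, while strings containing many characters that need 3 or more bytes in UTF8 (such as Japanese) may become smaller.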
489+
480490
In any case, if the payload size is large, compression should be considered. LZ4, ZStandard and Brotli are recommended. An efficient way to combine compression and serialization will be presented at a later date.
481491

482492
Packages

sandbox/Benchmark/Benchmarks/DeserializeTest.cs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@
1818

1919
namespace Benchmark.Benchmarks;
2020

21-
[GenericTypeArguments(typeof(int))]
22-
[GenericTypeArguments(typeof(Vector3[]))]
23-
[GenericTypeArguments(typeof(JsonResponseModel))]
24-
[GenericTypeArguments(typeof(NeuralNetworkLayerModel))]
21+
//[GenericTypeArguments(typeof(int))]
22+
//[GenericTypeArguments(typeof(Vector3[]))]
23+
//[GenericTypeArguments(typeof(JsonResponseModel))]
24+
//[GenericTypeArguments(typeof(NeuralNetworkLayerModel))]
2525
public class DeserializeTest<T> : SerializerTestBase<T>
2626
{
2727
//SerializerSessionPool pool;
@@ -51,13 +51,13 @@ public DeserializeTest()
5151
payloadJson = JsonSerializer.SerializeToUtf8Bytes(value);
5252
}
5353

54-
[Benchmark(Baseline = true)]
54+
[Benchmark]
5555
public T MessagePackDeserialize()
5656
{
5757
return MessagePackSerializer.Deserialize<T>(payloadMessagePack);
5858
}
5959

60-
[Benchmark]
60+
[Benchmark(Baseline = true)]
6161
public T? MemoryPackDeserialize()
6262
{
6363
return MemoryPackSerializer.Deserialize<T>(payloadMemoryPack);

sandbox/Benchmark/Benchmarks/SerializeTest.cs

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,10 @@ namespace Benchmark.Benchmarks;
3030
//[GenericTypeArguments(typeof(MyClass))]
3131

3232

33-
//[GenericTypeArguments(typeof(int))]
34-
//[GenericTypeArguments(typeof(Vector3[]))]
35-
//[GenericTypeArguments(typeof(JsonResponseModel))]
36-
//[GenericTypeArguments(typeof(NeuralNetworkLayerModel))]
33+
[GenericTypeArguments(typeof(int))]
34+
[GenericTypeArguments(typeof(Vector3[]))]
35+
[GenericTypeArguments(typeof(JsonResponseModel))]
36+
[GenericTypeArguments(typeof(NeuralNetworkLayerModel))]
3737
[CategoriesColumn]
3838
[PayloadColumn]
3939
[GroupBenchmarksBy(BenchmarkLogicalGroupRule.ByCategory)]
@@ -70,18 +70,24 @@ public SerializeTest()
7070
jsonWriter = new Utf8JsonWriter(writer);
7171
}
7272

73-
[Benchmark(Baseline = true), BenchmarkCategory(Categories.Bytes)]
73+
[Benchmark, BenchmarkCategory(Categories.Bytes)]
7474
public byte[] MessagePackSerialize()
7575
{
7676
return MessagePackSerializer.Serialize(value);
7777
}
7878

79-
[Benchmark, BenchmarkCategory(Categories.Bytes)]
79+
[Benchmark(Baseline = true), BenchmarkCategory(Categories.Bytes)]
8080
public byte[] MemoryPackSerialize()
8181
{
8282
return MemoryPackSerializer.Serialize(value, MemoryPackSerializeOptions.Default);
8383
}
8484

85+
[Benchmark, BenchmarkCategory(Categories.Bytes)]
86+
public byte[] MemoryPackSerializeUtf16()
87+
{
88+
return MemoryPackSerializer.Serialize(value, MemoryPackSerializeOptions.Utf16);
89+
}
90+
8591
// requires T:new(), can't test it.
8692
//[Benchmark]
8793
//public byte[] BinaryPackSerialize()
@@ -113,20 +119,27 @@ public byte[] SystemTextJsonSerialize()
113119
// return orleansSerializer.SerializeToArray(value);
114120
//}
115121

116-
[Benchmark(Baseline = true), BenchmarkCategory(Categories.BufferWriter)]
122+
[Benchmark, BenchmarkCategory(Categories.BufferWriter)]
117123
public void MessagePackBufferWriter()
118124
{
119125
MessagePackSerializer.Serialize(writer, value);
120126
writer.Clear();
121127
}
122128

123-
[Benchmark, BenchmarkCategory(Categories.BufferWriter)]
129+
[Benchmark(Baseline = true), BenchmarkCategory(Categories.BufferWriter)]
124130
public void MemoryPackBufferWriter()
125131
{
126132
MemoryPackSerializer.Serialize(writer, value);
127133
writer.Clear();
128134
}
129135

136+
[Benchmark, BenchmarkCategory(Categories.BufferWriter)]
137+
public void MemoryPackBufferWriterUtf16()
138+
{
139+
MemoryPackSerializer.Serialize(writer, value, MemoryPackSerializeOptions.Utf16);
140+
writer.Clear();
141+
}
142+
130143
//[Benchmark]
131144
//public void BinaryPackStream()
132145
//{

sandbox/Benchmark/Benchmarks/Utf16VsUtf8.cs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,26 +23,26 @@ public Utf16VsUtf8()
2323
{
2424
this.japanese = "あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもやゆよらりるれろわをん";
2525
this.ascii = "abcedfghijklmnopqrstuvwxyz0123456789";
26-
this.utf16Jpn = MemoryPackSerializer.Serialize(japanese, MemoryPackSerializeOptions.Default);
26+
this.utf16Jpn = MemoryPackSerializer.Serialize(japanese, MemoryPackSerializeOptions.Utf16);
2727
this.utf8Jpn = MemoryPackSerializer.Serialize(japanese, MemoryPackSerializeOptions.Utf8);
28-
this.utf16Ascii = MemoryPackSerializer.Serialize(ascii, MemoryPackSerializeOptions.Default);
28+
this.utf16Ascii = MemoryPackSerializer.Serialize(ascii, MemoryPackSerializeOptions.Utf16);
2929
this.utf8Ascii = MemoryPackSerializer.Serialize(ascii, MemoryPackSerializeOptions.Utf8);
3030

3131
this.largeAscii = RandomProvider.NextString(600);
32-
this.utf16LargeAscii = MemoryPackSerializer.Serialize(largeAscii, MemoryPackSerializeOptions.Default);
32+
this.utf16LargeAscii = MemoryPackSerializer.Serialize(largeAscii, MemoryPackSerializeOptions.Utf16);
3333
this.utf8LargeAscii = MemoryPackSerializer.Serialize(largeAscii, MemoryPackSerializeOptions.Utf8);
3434
}
3535

3636
[Benchmark]
3737
public byte[] SerializeUtf16Ascii()
3838
{
39-
return MemoryPackSerializer.Serialize(ascii);
39+
return MemoryPackSerializer.Serialize(ascii, MemoryPackSerializeOptions.Utf16);
4040
}
4141

4242
[Benchmark]
4343
public byte[] SerializeUtf16Japanese()
4444
{
45-
return MemoryPackSerializer.Serialize(japanese);
45+
return MemoryPackSerializer.Serialize(japanese, MemoryPackSerializeOptions.Utf16);
4646
}
4747

4848
[Benchmark]
@@ -60,7 +60,7 @@ public byte[] SerializeUtf8Japanese()
6060
[Benchmark]
6161
public byte[] SerializeUtf16LargeAscii()
6262
{
63-
return MemoryPackSerializer.Serialize(largeAscii, MemoryPackSerializeOptions.Default);
63+
return MemoryPackSerializer.Serialize(largeAscii, MemoryPackSerializeOptions.Utf16);
6464
}
6565

6666
[Benchmark]

sandbox/Benchmark/Program.cs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,12 +44,14 @@
4444

4545
//BenchmarkRunner.Run<SerializeTest<JsonResponseModel>>(config, args);
4646

47-
BenchmarkRunner.Run<Utf16VsUtf8>(config, args);
47+
//BenchmarkRunner.Run<Utf16VsUtf8>(config, args);
4848

4949
//BenchmarkRunner.Run<SerializeTest<NeuralNetworkLayerModel>>(config, args);
5050

5151
// BenchmarkRunner.Run<DeserializeTest<NeuralNetworkLayerModel>>(config, args);
52-
//BenchmarkRunner.Run<DeserializeTest<JsonResponseModel>>(config, args);
52+
53+
54+
BenchmarkRunner.Run<DeserializeTest<JsonResponseModel>>(config, args);
5355

5456

5557
//BenchmarkRunner.Run<GetLocalVsStaticField>(config, args);
@@ -67,7 +69,7 @@
6769
Console.WriteLine(foo);
6870

6971
Check<JsonResponseModel>();
70-
//Check<NeuralNetworkLayerModel>();
72+
Check<NeuralNetworkLayerModel>();
7173

7274
void Check<T>()
7375
where T : IInitializable, IEquatable<T>, new()

sandbox/SandboxConsoleApp/Program.cs

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,22 +16,40 @@
1616
using System.Linq;
1717
using System.Numerics;
1818
using System.Runtime.CompilerServices;
19+
using System.Runtime.InteropServices;
1920
using System.Security.Cryptography.X509Certificates;
2021
using System.Text;
2122
using System.Xml.Linq;
2223

2324

2425

25-
var bin = MemoryPackSerializer.Serialize("hogehoge");
26-
var takotako = MemoryPackSerializer.Deserialize<string>(bin);
26+
//var bin = MemoryPackSerializer.Serialize("hogehoge");
27+
//var takotako = MemoryPackSerializer.Deserialize<string>(bin);
2728

28-
Console.WriteLine(takotako);
29+
//Console.WriteLine(takotako);
2930

3031
// ---
3132

33+
var str = "あいうえおかきくけこさしすせそたちつてとなにぬねの";
34+
var bytes = Encoding.UTF8.GetBytes(str);
3235

36+
var encoder = new BrotliEncoder(4, 22);
3337

34-
//var encoder = new BrotliEncoder(4, 22);
38+
39+
40+
41+
42+
var dest = new byte[1024];
43+
44+
//bytes = MemoryMarshal.AsBytes(str.AsSpan()).ToArray();
45+
46+
encoder.Compress(bytes, dest, out var consumed, out var written, true);
47+
48+
49+
var foo = dest.AsSpan(0, written).ToArray();
50+
51+
Console.WriteLine(bytes.Length);
52+
Console.WriteLine(foo.Length);
3553

3654
//// new BrotliDecoder().Decompress(
3755

src/MemoryPack.Core/MemoryPackReader.cs

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
using System.Buffers;
2+
using System.Reflection.Emit;
23
using System.Reflection.Metadata;
34
using System.Runtime.CompilerServices;
45
using System.Runtime.InteropServices;
@@ -234,28 +235,39 @@ string ReadUtf8(int utf8Length)
234235

235236
utf8Length = ~utf8Length;
236237

237-
// TODO:security
238-
239238
ref var spanRef = ref GetSpanReference(utf8Length + 4); // + read utf16 length
240239

241240
string str;
242241
var utf16Length = Unsafe.ReadUnaligned<int>(ref spanRef);
242+
243243
if (utf16Length <= 0)
244244
{
245245
var src = MemoryMarshal.CreateReadOnlySpan(ref Unsafe.Add(ref spanRef, 4), utf8Length);
246246
str = Encoding.UTF8.GetString(src);
247247
}
248248
else
249249
{
250+
// check malformed utf16Length
251+
var max = unchecked((Remaining + 1) * 3);
252+
if (max < 0) max = int.MaxValue;
253+
if (max < utf16Length)
254+
{
255+
MemoryPackSerializationException.ThrowInsufficientBufferUnless(utf8Length);
256+
}
257+
258+
// regular path: knowing the decoded UTF16 length up front gives a faster decode
250259
unsafe
251260
{
252261
fixed (byte* p = &Unsafe.Add(ref spanRef, 4))
253262
{
254263
str = string.Create(utf16Length, ((IntPtr)p, utf8Length), static (dest, state) =>
255264
{
256265
var src = MemoryMarshal.CreateSpan(ref Unsafe.AsRef<byte>((byte*)state.Item1), state.Item2);
257-
var status = Utf8.ToUtf16(src, dest, out var bytesRead, out var charsWritten);
258-
// TODO: throw when status failed
266+
var status = Utf8.ToUtf16(src, dest, out var bytesRead, out var charsWritten, replaceInvalidSequences: false);
267+
if (status != OperationStatus.Done)
268+
{
269+
MemoryPackSerializationException.ThrowFailedEncoding(status);
270+
}
259271
});
260272
}
261273
}

src/MemoryPack.Core/MemoryPackSerializationException.cs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
using System.Diagnostics.CodeAnalysis;
1+
using System.Buffers;
2+
using System.Diagnostics.CodeAnalysis;
23

34
namespace MemoryPack;
45

@@ -103,4 +104,10 @@ public static void ThrowDeserializeObjectIsNull(string target)
103104
{
104105
throw new MemoryPackSerializationException($"Deserialized {target} is null.");
105106
}
107+
108+
[DoesNotReturn]
109+
public static void ThrowFailedEncoding(OperationStatus status)
110+
{
111+
throw new MemoryPackSerializationException($"Failed Utf8 encoding/decoding process, status: {status}.");
112+
}
106113
}

src/MemoryPack.Core/MemoryPackSerializeOptions.cs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@
22

33
public record MemoryPackSerializeOptions
44
{
5-
public static MemoryPackSerializeOptions Default = new MemoryPackSerializeOptions { StringEncoding = StringEncoding.Utf16 };
6-
public static MemoryPackSerializeOptions Utf8 = Default with { StringEncoding = StringEncoding.Utf8 };
5+
// Default is Utf8
6+
public static readonly MemoryPackSerializeOptions Default = new MemoryPackSerializeOptions { StringEncoding = StringEncoding.Utf8 };
7+
8+
public static readonly MemoryPackSerializeOptions Utf8 = Default with { StringEncoding = StringEncoding.Utf8 };
9+
public static readonly MemoryPackSerializeOptions Utf16 = Default with { StringEncoding = StringEncoding.Utf16 };
710

811
public StringEncoding StringEncoding { get; init; }
912
}

src/MemoryPack.Core/MemoryPackWriter.cs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ void WriteUtf16(string value)
208208
Advance(copyByteCount + 4);
209209
}
210210

211-
[MethodImpl(MethodImplOptions.NoInlining)] // non default, no inline
211+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
212212
void WriteUtf8(string value)
213213
{
214214
// [utf8-length, utf16-length, utf8-value]
@@ -220,14 +220,14 @@ void WriteUtf8(string value)
220220

221221
ref var destPointer = ref GetSpanReference(maxByteCount + 8); // header
222222

223-
// write utf8-length is final
223+
// write utf16-length
224224
Unsafe.WriteUnaligned(ref Unsafe.Add(ref destPointer, 4), source.Length);
225225

226226
var dest = MemoryMarshal.CreateSpan(ref Unsafe.Add(ref destPointer, 8), maxByteCount);
227-
var status = Utf8.FromUtf16(source, dest, out var _, out var bytesWritten);
227+
var status = Utf8.FromUtf16(source, dest, out var _, out var bytesWritten, replaceInvalidSequences: false);
228228
if (status != OperationStatus.Done)
229229
{
230-
// TODO: throw when write failed.
230+
MemoryPackSerializationException.ThrowFailedEncoding(status);
231231
}
232232

233233
// write written utf8-length in header, that is ~length

0 commit comments

Comments
 (0)