From 5b6254e853cf6055f220505eb1a7163bf8d0078f Mon Sep 17 00:00:00 2001 From: Rod Vagg Date: Mon, 31 Jan 2022 16:41:11 +1100 Subject: [PATCH 1/2] feat: add dagcbor.EncodedLength(Node) to calculate length without encoding For pre-allocation if it's going to be more efficient to walk the graph twice than have sloppy allocation during encode. --- codec/dagcbor/marshal.go | 117 ++++++++++++++++++++++++++++++++ codec/dagcbor/roundtrip_test.go | 5 ++ 2 files changed, 122 insertions(+) diff --git a/codec/dagcbor/marshal.go b/codec/dagcbor/marshal.go index ee8a8b38..ea2587c4 100644 --- a/codec/dagcbor/marshal.go +++ b/codec/dagcbor/marshal.go @@ -245,3 +245,120 @@ func marshalMap(n datamodel.Node, tk *tok.Token, sink shared.TokenSink, options _, err := sink.Step(tk) return err } + +// EncodedLength will calculate the length in bytes that the encoded form of the +// provided Node will occupy. +// In some circmstances is may be advantageous to be able to pre-allocate the +// bytes and encode into those bytes than require the encoding process to write +// to a flexible byte sink. +// Note that this function requires a full walk of the Node's graph, which may +// not necessarily be a trivial cost. +func EncodedLength(n datamodel.Node) (int64, error) { + switch n.Kind() { + case datamodel.Kind_Invalid: + return 0, fmt.Errorf("cannot traverse a node that is absent") + case datamodel.Kind_Null: + return 1, nil // 0xf6 + case datamodel.Kind_Map: + length := uintLength(n.Length()) // length prefixed major 5 + for itr := n.MapIterator(); !itr.Done(); { + k, v, err := itr.Next() + if err != nil { + return 0, err + } + keyLength, err := EncodedLength(k) + if err != nil { + return 0, err + } + length += keyLength + valueLength, err := EncodedLength(v) + if err != nil { + return 0, err + } + length += valueLength + } + return length, nil + case datamodel.Kind_List: + nl := n.Length() + length := uintLength(nl) // length prefixed major 4 + for i := int64(0); i < nl; i++ { + v, err := n.LookupByIndex(i) + if err != nil { + return 0, err + } + innerLength, err := EncodedLength(v) + if err != nil { + return 0, err + } + length += innerLength + } + return length, nil + case datamodel.Kind_Bool: + return 1, nil // 0xf4 or 0xf5 + case datamodel.Kind_Int: + v, err := n.AsInt() + if err != nil { + return 0, err + } + return uintLength(v), nil // major 0 or 1, as small as possible + case datamodel.Kind_Float: + return 9, nil // always major 7 and 64-bit float + case datamodel.Kind_String: + v, err := n.AsString() + if err != nil { + return 0, err + } + + return uintLength(int64(len(v))) + int64(len(v)), nil // length prefixed major 3 + case datamodel.Kind_Bytes: + v, err := n.AsBytes() + if err != nil { + return 0, err + } + return uintLength(int64(len(v))) + int64(len(v)), nil // length prefixed major 2 + case datamodel.Kind_Link: + v, err := n.AsLink() + if err != nil { + return 0, err + } + switch lnk := v.(type) { + case cidlink.Link: + length := int64(2) // tag,42: 0xd82a + bl := int64(len(lnk.Bytes())) + 1 // additional 0x00 in front of the CID bytes + length += uintLength(bl) + bl // length prefixed major 2 + return length, err + default: + return 0, fmt.Errorf("schemafree link emission only supported by this codec for CID type links") + } + default: + panic("unreachable") + } +} + +// Calculate how many bytes an integer, and therefore also the leading bytes of +// a length-prefixed token. CBOR will pack it up into the smallest possible +// uint representation, even merging it with the major if it's <=23. + +type boundaryLength struct { + upperBound int64 + length int64 +} + +var lengthBoundaries = []boundaryLength{ + {24, 1}, // packed major|minor + {256, 2}, // major, 8-bit length + {65536, 3}, // major, 16-bit length + {4294967296, 5}, // major, 32-bit length + {-1, 9}, // major, 64-bit length +} + +func uintLength(length int64) int64 { + for _, lb := range lengthBoundaries { + if length < lb.upperBound { + return lb.length + } + } + // maximum number of bytes to pack this length + // also improbable, and likely a very bad Node that shouldn't be encoded + return lengthBoundaries[len(lengthBoundaries)-1].length +} diff --git a/codec/dagcbor/roundtrip_test.go b/codec/dagcbor/roundtrip_test.go index fef0f5a6..babce5e5 100644 --- a/codec/dagcbor/roundtrip_test.go +++ b/codec/dagcbor/roundtrip_test.go @@ -56,6 +56,11 @@ func TestRoundtrip(t *testing.T) { qt.Assert(t, err, qt.IsNil) qt.Check(t, buf.String(), qt.Equals, serial) }) + t.Run("length", func(t *testing.T) { + length, err := EncodedLength(n) + qt.Assert(t, err, qt.IsNil) + qt.Check(t, length, qt.Equals, int64(len(serial))) + }) t.Run("decoding", func(t *testing.T) { buf := strings.NewReader(serial) nb := basicnode.Prototype.Map.NewBuilder() From feb8a2a9831cd5c98e7d64425a5b280c52633ed7 Mon Sep 17 00:00:00 2001 From: Rod Vagg Date: Wed, 2 Feb 2022 15:53:25 +1100 Subject: [PATCH 2/2] fix: minor EncodedLength fixes, add tests to fully exercise --- codec/dagcbor/marshal.go | 41 +++++++++++--------- codec/dagcbor/marshal_test.go | 70 +++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 18 deletions(-) create mode 100644 codec/dagcbor/marshal_test.go diff --git a/codec/dagcbor/marshal.go b/codec/dagcbor/marshal.go index ea2587c4..6798d050 100644 --- a/codec/dagcbor/marshal.go +++ b/codec/dagcbor/marshal.go @@ -248,11 +248,11 @@ func marshalMap(n datamodel.Node, tk *tok.Token, sink shared.TokenSink, options // EncodedLength will calculate the length in bytes that the encoded form of the // provided Node will occupy. -// In some circmstances is may be advantageous to be able to pre-allocate the -// bytes and encode into those bytes than require the encoding process to write -// to a flexible byte sink. +// // Note that this function requires a full walk of the Node's graph, which may -// not necessarily be a trivial cost. +// not necessarily be a trivial cost and will incur some allocations. Using this +// method to calculate buffers to pre-allocate may not result in performance +// gains, but rather incur an overall cost. Use with care. func EncodedLength(n datamodel.Node) (int64, error) { switch n.Kind() { case datamodel.Kind_Invalid: @@ -260,7 +260,7 @@ func EncodedLength(n datamodel.Node) (int64, error) { case datamodel.Kind_Null: return 1, nil // 0xf6 case datamodel.Kind_Map: - length := uintLength(n.Length()) // length prefixed major 5 + length := uintLength(uint64(n.Length())) // length prefixed major 5 for itr := n.MapIterator(); !itr.Done(); { k, v, err := itr.Next() if err != nil { @@ -280,7 +280,7 @@ func EncodedLength(n datamodel.Node) (int64, error) { return length, nil case datamodel.Kind_List: nl := n.Length() - length := uintLength(nl) // length prefixed major 4 + length := uintLength(uint64(nl)) // length prefixed major 4 for i := int64(0); i < nl; i++ { v, err := n.LookupByIndex(i) if err != nil { @@ -300,7 +300,10 @@ func EncodedLength(n datamodel.Node) (int64, error) { if err != nil { return 0, err } - return uintLength(v), nil // major 0 or 1, as small as possible + if v < 0 { + v = -v - 1 // negint is stored as one less than actual + } + return uintLength(uint64(v)), nil // major 0 or 1, as small as possible case datamodel.Kind_Float: return 9, nil // always major 7 and 64-bit float case datamodel.Kind_String: @@ -309,13 +312,13 @@ func EncodedLength(n datamodel.Node) (int64, error) { return 0, err } - return uintLength(int64(len(v))) + int64(len(v)), nil // length prefixed major 3 + return uintLength(uint64(len(v))) + int64(len(v)), nil // length prefixed major 3 case datamodel.Kind_Bytes: v, err := n.AsBytes() if err != nil { return 0, err } - return uintLength(int64(len(v))) + int64(len(v)), nil // length prefixed major 2 + return uintLength(uint64(len(v))) + int64(len(v)), nil // length prefixed major 2 case datamodel.Kind_Link: v, err := n.AsLink() if err != nil { @@ -323,9 +326,9 @@ func EncodedLength(n datamodel.Node) (int64, error) { } switch lnk := v.(type) { case cidlink.Link: - length := int64(2) // tag,42: 0xd82a - bl := int64(len(lnk.Bytes())) + 1 // additional 0x00 in front of the CID bytes - length += uintLength(bl) + bl // length prefixed major 2 + length := int64(2) // tag,42: 0xd82a + bl := int64(len(lnk.Bytes())) + 1 // additional 0x00 in front of the CID bytes + length += uintLength(uint64(bl)) + bl // length prefixed major 2 return length, err default: return 0, fmt.Errorf("schemafree link emission only supported by this codec for CID type links") @@ -340,7 +343,7 @@ func EncodedLength(n datamodel.Node) (int64, error) { // uint representation, even merging it with the major if it's <=23. type boundaryLength struct { - upperBound int64 + upperBound uint64 length int64 } @@ -349,16 +352,18 @@ var lengthBoundaries = []boundaryLength{ {256, 2}, // major, 8-bit length {65536, 3}, // major, 16-bit length {4294967296, 5}, // major, 32-bit length - {-1, 9}, // major, 64-bit length + {0, 9}, // major, 64-bit length } -func uintLength(length int64) int64 { +func uintLength(ii uint64) int64 { for _, lb := range lengthBoundaries { - if length < lb.upperBound { + if ii < lb.upperBound { return lb.length } } - // maximum number of bytes to pack this length - // also improbable, and likely a very bad Node that shouldn't be encoded + // maximum number of bytes to pack this int + // if this int is used as a length prefix for a map, list, string or bytes + // then we likely have a very bad Node that shouldn't be encoded, but the + // encoder may raise problems with that if the memory allocator doesn't first. return lengthBoundaries[len(lengthBoundaries)-1].length } diff --git a/codec/dagcbor/marshal_test.go b/codec/dagcbor/marshal_test.go new file mode 100644 index 00000000..50eacf6c --- /dev/null +++ b/codec/dagcbor/marshal_test.go @@ -0,0 +1,70 @@ +package dagcbor + +import ( + "bytes" + "math/rand" + "testing" + "time" + + qt "github.com/frankban/quicktest" + "github.com/ipld/go-ipld-prime/datamodel" + basicnode "github.com/ipld/go-ipld-prime/node/basic" + "github.com/ipld/go-ipld-prime/testutil/garbage" +) + +func calculateActualLength(t *testing.T, n datamodel.Node) int64 { + var buf bytes.Buffer + err := Encode(n, &buf) + qt.Assert(t, err, qt.IsNil) + return int64(buf.Len()) +} + +func verifyEstimatedSize(t *testing.T, n datamodel.Node) { + estimatedLength, err := EncodedLength(n) + qt.Assert(t, err, qt.IsNil) + actualLength := calculateActualLength(t, n) + qt.Assert(t, estimatedLength, qt.Equals, actualLength) +} + +func TestEncodedLength(t *testing.T) { + t.Run("int boundaries", func(t *testing.T) { + for ii := 0; ii < 4; ii++ { + verifyEstimatedSize(t, basicnode.NewInt(int64(lengthBoundaries[ii].upperBound))) + verifyEstimatedSize(t, basicnode.NewInt(int64(lengthBoundaries[ii].upperBound)-1)) + verifyEstimatedSize(t, basicnode.NewInt(int64(lengthBoundaries[ii].upperBound)+1)) + verifyEstimatedSize(t, basicnode.NewInt(-1*int64(lengthBoundaries[ii].upperBound))) + verifyEstimatedSize(t, basicnode.NewInt(-1*int64(lengthBoundaries[ii].upperBound)-1)) + verifyEstimatedSize(t, basicnode.NewInt(-1*int64(lengthBoundaries[ii].upperBound)+1)) + } + }) + + t.Run("small garbage", func(t *testing.T) { + seed := time.Now().Unix() + t.Logf("randomness seed: %v\n", seed) + rnd := rand.New(rand.NewSource(seed)) + for i := 0; i < 1000; i++ { + gbg := garbage.Generate(rnd, garbage.TargetBlockSize(1<<6)) + verifyEstimatedSize(t, gbg) + } + }) + + t.Run("medium garbage", func(t *testing.T) { + seed := time.Now().Unix() + t.Logf("randomness seed: %v\n", seed) + rnd := rand.New(rand.NewSource(seed)) + for i := 0; i < 100; i++ { + gbg := garbage.Generate(rnd, garbage.TargetBlockSize(1<<16)) + verifyEstimatedSize(t, gbg) + } + }) + + t.Run("large garbage", func(t *testing.T) { + seed := time.Now().Unix() + t.Logf("randomness seed: %v\n", seed) + rnd := rand.New(rand.NewSource(seed)) + for i := 0; i < 10; i++ { + gbg := garbage.Generate(rnd, garbage.TargetBlockSize(1<<20)) + verifyEstimatedSize(t, gbg) + } + }) +}