diff --git a/codec/dagcbor/marshal.go b/codec/dagcbor/marshal.go index ee8a8b38..6798d050 100644 --- a/codec/dagcbor/marshal.go +++ b/codec/dagcbor/marshal.go @@ -245,3 +245,125 @@ func marshalMap(n datamodel.Node, tk *tok.Token, sink shared.TokenSink, options _, err := sink.Step(tk) return err } + +// EncodedLength calculates the length in bytes that the encoded form of the +// provided Node will occupy. +// +// It returns an error for an absent Node, for a link that is not a cidlink.Link, +// and for any error raised while reading the Node; an unknown Kind panics. Note +// that this requires a full walk of the Node's graph, which may not be a trivial cost and will incur some allocations. +// Using it to calculate buffers to pre-allocate may not result in performance gains, but rather incur an overall cost. Use with care. +func EncodedLength(n datamodel.Node) (int64, error) { + switch n.Kind() { + case datamodel.Kind_Invalid: + return 0, fmt.Errorf("cannot traverse a node that is absent") + case datamodel.Kind_Null: + return 1, nil // single byte: 0xf6 + case datamodel.Kind_Map: + length := uintLength(uint64(n.Length())) // length prefixed major 5 + for itr := n.MapIterator(); !itr.Done(); { + k, v, err := itr.Next() + if err != nil { + return 0, err + } + keyLength, err := EncodedLength(k) + if err != nil { + return 0, err + } + length += keyLength + valueLength, err := EncodedLength(v) + if err != nil { + return 0, err + } + length += valueLength + } + return length, nil + case datamodel.Kind_List: + nl := n.Length() + length := uintLength(uint64(nl)) // length prefixed major 4 + for i := int64(0); i < nl; i++ { + v, err := n.LookupByIndex(i) + if err != nil { + return 0, err + } + innerLength, err := EncodedLength(v) + if err != nil { + return 0, err + } + length += innerLength + } + return length, nil + case datamodel.Kind_Bool: + return 1, nil // single byte: 0xf4 or 0xf5 + case datamodel.Kind_Int: + v, err := n.AsInt() + if err != nil { + return 0, err + } + if v < 0 { + v = -v - 1 // negint is stored as -1-n, i.e. one less than its absolute value + } + return uintLength(uint64(v)), nil // major 0 or 1, as small as 
possible + case datamodel.Kind_Float: + return 9, nil // always major 7 and 64-bit float + case datamodel.Kind_String: + v, err := n.AsString() + if err != nil { + return 0, err + } + + return uintLength(uint64(len(v))) + int64(len(v)), nil // length prefixed major 3 + case datamodel.Kind_Bytes: + v, err := n.AsBytes() + if err != nil { + return 0, err + } + return uintLength(uint64(len(v))) + int64(len(v)), nil // length prefixed major 2 + case datamodel.Kind_Link: + v, err := n.AsLink() + if err != nil { + return 0, err + } + switch lnk := v.(type) { + case cidlink.Link: + length := int64(2) // tag,42: 0xd82a + bl := int64(len(lnk.Bytes())) + 1 // additional 0x00 in front of the CID bytes + length += uintLength(uint64(bl)) + bl // length prefixed major 2 + return length, err // err is always nil here; it was checked after AsLink above + default: + return 0, fmt.Errorf("schemafree link emission only supported by this codec for CID type links") + } + default: + panic("unreachable") + } +} + +// Calculate how many bytes an integer occupies when encoded, and therefore also +// the leading bytes of a length-prefixed token. CBOR will pack it up into the +// smallest possible uint representation, even merging it with the major if it's <=23. + +type boundaryLength struct { + upperBound uint64 + length int64 +} + +var lengthBoundaries = []boundaryLength{ + {24, 1}, // packed major|minor + {256, 2}, // major, 8-bit length + {65536, 3}, // major, 16-bit length + {4294967296, 5}, // major, 32-bit length + {0, 9}, // major, 64-bit length; an upperBound of 0 never matches, so uintLength reaches this entry only via its fallback return +} + +func uintLength(ii uint64) int64 { + for _, lb := range lengthBoundaries { + if ii < lb.upperBound { + return lb.length + } + } + // no boundary matched, so use the maximum number of bytes to pack this int. + // if this int is used as a length prefix for a map, list, string or bytes + // then we likely have a very bad Node that shouldn't be encoded, but the + // encoder may raise problems with that if the memory allocator doesn't first. 
+ return lengthBoundaries[len(lengthBoundaries)-1].length +} diff --git a/codec/dagcbor/marshal_test.go b/codec/dagcbor/marshal_test.go new file mode 100644 index 00000000..50eacf6c --- /dev/null +++ b/codec/dagcbor/marshal_test.go @@ -0,0 +1,70 @@ +package dagcbor + +import ( + "bytes" + "math/rand" + "testing" + "time" + + qt "github.com/frankban/quicktest" + "github.com/ipld/go-ipld-prime/datamodel" + basicnode "github.com/ipld/go-ipld-prime/node/basic" + "github.com/ipld/go-ipld-prime/testutil/garbage" +) + +func calculateActualLength(t *testing.T, n datamodel.Node) int64 { + var buf bytes.Buffer + err := Encode(n, &buf) + qt.Assert(t, err, qt.IsNil) + return int64(buf.Len()) +} + +func verifyEstimatedSize(t *testing.T, n datamodel.Node) { + estimatedLength, err := EncodedLength(n) + qt.Assert(t, err, qt.IsNil) + actualLength := calculateActualLength(t, n) + qt.Assert(t, estimatedLength, qt.Equals, actualLength) +} + +func TestEncodedLength(t *testing.T) { + t.Run("int boundaries", func(t *testing.T) { + for ii := 0; ii < 4; ii++ { + verifyEstimatedSize(t, basicnode.NewInt(int64(lengthBoundaries[ii].upperBound))) + verifyEstimatedSize(t, basicnode.NewInt(int64(lengthBoundaries[ii].upperBound)-1)) + verifyEstimatedSize(t, basicnode.NewInt(int64(lengthBoundaries[ii].upperBound)+1)) + verifyEstimatedSize(t, basicnode.NewInt(-1*int64(lengthBoundaries[ii].upperBound))) + verifyEstimatedSize(t, basicnode.NewInt(-1*int64(lengthBoundaries[ii].upperBound)-1)) + verifyEstimatedSize(t, basicnode.NewInt(-1*int64(lengthBoundaries[ii].upperBound)+1)) + } + }) + + t.Run("small garbage", func(t *testing.T) { + seed := time.Now().Unix() + t.Logf("randomness seed: %v\n", seed) + rnd := rand.New(rand.NewSource(seed)) + for i := 0; i < 1000; i++ { + gbg := garbage.Generate(rnd, garbage.TargetBlockSize(1<<6)) + verifyEstimatedSize(t, gbg) + } + }) + + t.Run("medium garbage", func(t *testing.T) { + seed := time.Now().Unix() + t.Logf("randomness seed: %v\n", seed) + rnd := 
rand.New(rand.NewSource(seed)) + for i := 0; i < 100; i++ { + gbg := garbage.Generate(rnd, garbage.TargetBlockSize(1<<16)) + verifyEstimatedSize(t, gbg) + } + }) + + t.Run("large garbage", func(t *testing.T) { + seed := time.Now().Unix() + t.Logf("randomness seed: %v\n", seed) + rnd := rand.New(rand.NewSource(seed)) + for i := 0; i < 10; i++ { + gbg := garbage.Generate(rnd, garbage.TargetBlockSize(1<<20)) + verifyEstimatedSize(t, gbg) + } + }) +} diff --git a/codec/dagcbor/roundtrip_test.go b/codec/dagcbor/roundtrip_test.go index fef0f5a6..babce5e5 100644 --- a/codec/dagcbor/roundtrip_test.go +++ b/codec/dagcbor/roundtrip_test.go @@ -56,6 +56,11 @@ func TestRoundtrip(t *testing.T) { qt.Assert(t, err, qt.IsNil) qt.Check(t, buf.String(), qt.Equals, serial) }) + t.Run("length", func(t *testing.T) { + length, err := EncodedLength(n) + qt.Assert(t, err, qt.IsNil) + qt.Check(t, length, qt.Equals, int64(len(serial))) + }) t.Run("decoding", func(t *testing.T) { buf := strings.NewReader(serial) nb := basicnode.Prototype.Map.NewBuilder()