diff --git a/go.mod b/go.mod index fc5b771f19..bacec63d2d 100644 --- a/go.mod +++ b/go.mod @@ -12,9 +12,9 @@ require ( github.com/mikefarah/yq/v4 v4.52.5 go.uber.org/zap v1.28.0 gocloud.dev v0.22.0 - golang.org/x/mod v0.36.0 + golang.org/x/mod v0.37.0 golang.org/x/oauth2 v0.36.0 - golang.org/x/sync v0.20.0 + golang.org/x/sync v0.21.0 google.golang.org/api v0.198.0 istio.io/api v0.0.0-20231206023236-e7cadb36da57 istio.io/client-go v1.18.7 @@ -22,12 +22,12 @@ require ( k8s.io/apimachinery v0.35.6 k8s.io/client-go v0.35.6 k8s.io/code-generator v0.35.6 - knative.dev/caching v0.0.0-20260602144006-dd092fc806ea - knative.dev/eventing v0.49.1-0.20260611075324-69b976f7fd56 + knative.dev/caching v0.0.0-20260615151736-e6e6a89d939a + knative.dev/eventing v0.49.1-0.20260615165644-9a2ae8b2b023 knative.dev/hack v0.0.0-20260428014158-b2a37f1b6e7b - knative.dev/pkg v0.0.0-20260615142336-0f785e385ea0 + knative.dev/pkg v0.0.0-20260615201544-6300c57a9e78 knative.dev/reconciler-test v0.0.0-20260602150814-125bf8d48e1c - knative.dev/serving v0.49.1-0.20260615142436-1683170667da + knative.dev/serving v0.49.1-0.20260615163344-394d3f959991 sigs.k8s.io/cluster-inventory-api v0.1.3 sigs.k8s.io/controller-tools v0.20.1 sigs.k8s.io/yaml v1.6.0 @@ -174,13 +174,13 @@ require ( go.yaml.in/yaml/v2 v2.4.4 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect go.yaml.in/yaml/v4 v4.0.0-rc.3 // indirect - golang.org/x/crypto v0.51.0 // indirect - golang.org/x/net v0.55.0 // indirect - golang.org/x/sys v0.45.0 // indirect - golang.org/x/term v0.43.0 // indirect - golang.org/x/text v0.37.0 // indirect + golang.org/x/crypto v0.53.0 // indirect + golang.org/x/net v0.56.0 // indirect + golang.org/x/sys v0.46.0 // indirect + golang.org/x/term v0.44.0 // indirect + golang.org/x/text v0.38.0 // indirect golang.org/x/time v0.15.0 // indirect - golang.org/x/tools v0.45.0 // indirect + golang.org/x/tools v0.46.0 // indirect golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect google.golang.org/genproto v0.0.0-20240903143218-8af14fe29dc1 // indirect diff --git a/go.sum b/go.sum index 34703f65b8..bd598305df 100644 --- a/go.sum +++ b/go.sum @@ -1114,8 +1114,8 @@ golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf golang.org/x/crypto v0.16.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= golang.org/x/crypto v0.17.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= -golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI= -golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8= +golang.org/x/crypto v0.53.0 h1:QZ4Muo8THX6CizN2vPPd5fBGHyogrdK9fG4wLPFUsto= +golang.org/x/crypto v0.53.0/go.mod h1:DNLU434OwVakk9PzuwV8w62mAJpRJL3vsgcfp4Qnsio= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -1164,8 +1164,8 @@ golang.org/x/mod v0.9.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.10.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.14.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= -golang.org/x/mod v0.36.0 h1:JJjpVx6myfUsUdAzZuOSTTmRE0PfZeNWzzvKrP7amb4= -golang.org/x/mod v0.36.0/go.mod h1:moc6ELqsWcOw5Ef3xVprK5ul/MvtVvkIXLziUOICjUQ= +golang.org/x/mod v0.37.0 h1:vF1DjpVEshcIqoEaauuHebaLk1O1forxjxBaVn884JQ= +golang.org/x/mod v0.37.0/go.mod h1:m8S8VeM9r4dzDwjrKO0a1sZP3YjeMamRRlD+fmR2Q/0= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -1238,8 +1238,8 @@ golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U= -golang.org/x/net v0.55.0 h1:bcvxaJn3e1U6InsFWt1JUq1aSjnRxLzT2rtD2KfkDF8= -golang.org/x/net v0.55.0/go.mod h1:L5U2KuzuOe1lY7Z+aWVIKK6qEeJXnXV9yzGA+WCHJww= +golang.org/x/net v0.56.0 h1:Rw8j/hFzGvJUZwNBXnAtf5sVDVt+65SK2C7IxCxZt5o= +golang.org/x/net v0.56.0/go.mod h1:D3Ku6r+V6JROoZK144D2XfMHFcMq/0zSfLelVTCFKec= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -1280,8 +1280,8 @@ golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.2.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= -golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= -golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= +golang.org/x/sync v0.21.0 h1:HLII4xRRTtCRkxYp4HNFF0Js/Og6q2i++KXbg0gHCwM= +golang.org/x/sync v0.21.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -1385,8 +1385,8 @@ golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.45.0 h1:dO4czNzziLiiXplLQgBCEpCvXQ3dnkn0SdaZSYdQ+FY= -golang.org/x/sys v0.45.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/sys v0.46.0 h1:noSf2Fq6F8DBgS+LysIkx7rIExoNHJsxOAtPp4rthXw= +golang.org/x/sys v0.46.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210220032956-6a3ed077a48d/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= @@ -1403,8 +1403,8 @@ golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U= golang.org/x/term v0.15.0/go.mod h1:BDl952bC7+uMoWR75FIrCDx79TPU9oHkTZ9yRbYOrX0= golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= -golang.org/x/term v0.43.0 h1:S4RLU2sB31O/NCl+zFN9Aru9A/Cq2aqKpTZJ6B+DwT4= -golang.org/x/term v0.43.0/go.mod h1:lrhlHNdQJHO+1qVYiHfFKVuVioJIheAc3fBSMFYEIsk= +golang.org/x/term v0.44.0 h1:0rLvDRCtNj0gZkyIXhCyOb2OAzEhLVqc4B+hrsBhrmc= +golang.org/x/term v0.44.0/go.mod h1:7ze4MdzUzLXpSAoFP1H0bOI9aXDqveSvatT5vKcFh2Y= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -1422,8 +1422,8 @@ golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc= -golang.org/x/text v0.37.0/go.mod h1:a5sjxXGs9hsn/AJVwuElvCAo9v8QYLzvavO5z2PiM38= +golang.org/x/text v0.38.0 h1:sXmwo9DwP3OK9EZ7PqAdaooSGozfl/3a6/xJcbzPRhE= +golang.org/x/text v0.38.0/go.mod h1:YXZt3QhHUKYT53r2lLKFIVi6Ao1jdzrTR/KQ09qyxF4= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -1513,8 +1513,8 @@ golang.org/x/tools v0.7.0/go.mod h1:4pg6aUX35JBAogB10C9AtvVL+qowtN4pT3CGSQex14s= golang.org/x/tools v0.9.1/go.mod h1:owI94Op576fPu3cIGQeHs3joujW/2Oc6MtlxbF5dfNc= golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= golang.org/x/tools v0.16.1/go.mod h1:kYVVN6I1mBNoB1OX+noeBjbRk4IUEPa7JJ+TJMEooJ0= -golang.org/x/tools v0.45.0 h1:18qN3FAooORvApf5XjCXgsuayZOEtXf6JK18I3+ONa8= -golang.org/x/tools v0.45.0/go.mod h1:LuUGqqaXcXMEFEruIVJVm5mgDD8vww/z/SR1gQ4uE/0= +golang.org/x/tools v0.46.0 h1:7jTurBkPZu4moS/Uy4OQT1M+QBlsj3wejyZwsT8Z7rk= +golang.org/x/tools v0.46.0/go.mod h1:FrD85F8l+NWL+9XWBSyVSHO6Ne4jutsfIFba7AWQ5Ys= golang.org/x/tools/go/expect v0.1.1-deprecated h1:jpBZDwmgPhXsKZC6WhL20P4b/wmnpsEAGHaNy0n/rJM= golang.org/x/tools/go/expect v0.1.1-deprecated/go.mod h1:eihoPOH+FgIqa3FpoTwguz/bVUSGBlGQU67vpBeOrBY= golang.org/x/tools/go/packages/packagestest v0.1.1-deprecated h1:1h2MnaIAIXISqTFKdENegdpAgUXz6NrPEsbIeWaBRvM= @@ -1812,20 +1812,20 @@ k8s.io/utils v0.0.0-20210819203725-bdf08cb9a70a/go.mod h1:jPW/WVKK9YHAvNhRxK0md/ k8s.io/utils v0.0.0-20230209194617-a36077c30491/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= k8s.io/utils v0.0.0-20260210185600-b8788abfbbc2 h1:AZYQSJemyQB5eRxqcPky+/7EdBj0xi3g0ZcxxJ7vbWU= k8s.io/utils v0.0.0-20260210185600-b8788abfbbc2/go.mod h1:xDxuJ0whA3d0I4mf/C4ppKHxXynQ+fxnkmQH0vTHnuk= -knative.dev/caching v0.0.0-20260602144006-dd092fc806ea h1:mEaJOrBttRcfb6pzWBsgVG6K18QbcFfAe0yDKOHsHJ4= -knative.dev/caching v0.0.0-20260602144006-dd092fc806ea/go.mod h1:CFF4YV9sYdDj7mLkBomZO7/F0cc0WCOTW5Gbd75YiA8= -knative.dev/eventing v0.49.1-0.20260611075324-69b976f7fd56 h1:AHDySG8Wz6mQuiKAXanlYWLjTM8hRU9P8sBPKcHbOso= -knative.dev/eventing v0.49.1-0.20260611075324-69b976f7fd56/go.mod h1:ibEWNiJvssamCNesm0N1BmB5rhWdAyJHYEIcxI+99sQ= +knative.dev/caching v0.0.0-20260615151736-e6e6a89d939a h1:tclytWjklHF4rgQMH3NtXLyQdXl/fUQvAPnyL2o6S7A= +knative.dev/caching v0.0.0-20260615151736-e6e6a89d939a/go.mod h1:/uMgO2vuHg8jVnmknNAlhjsIG12A8T7Zyl4A4wqbPx0= +knative.dev/eventing v0.49.1-0.20260615165644-9a2ae8b2b023 h1:ummPeOc6N040T4/W0az4eR1fXKv0Vu24m1DVv9jysVw= +knative.dev/eventing v0.49.1-0.20260615165644-9a2ae8b2b023/go.mod h1:1+9UhTLOb6iFQ63L6rC02wZlYlpB8STI+Z7/aG3vodc= knative.dev/hack v0.0.0-20260428014158-b2a37f1b6e7b h1:MvbV2F2BdI8qKrYYUhDwbUZbX0BAYRSIpXM2TOtTvs0= knative.dev/hack v0.0.0-20260428014158-b2a37f1b6e7b/go.mod h1:L5RzHgbvam0u8QFHfzCX6MKxu/a/gIGEdaRBqNiVbl0= knative.dev/networking v0.0.0-20260602144506-c8765a725c2b h1:SlEkmc5m5EAaXomG82qYifOXC+YpLg+jPS4VcDkIz3c= knative.dev/networking v0.0.0-20260602144506-c8765a725c2b/go.mod h1:Dx+BnBfPyYXFP+DiuB9mzLJiJ4gaSo0/dhRGTkU4Tjw= -knative.dev/pkg v0.0.0-20260615142336-0f785e385ea0 h1:/DuVdR9EfTJQh73U7vymbSYwzNrJSaI1turWaOpfxs8= -knative.dev/pkg v0.0.0-20260615142336-0f785e385ea0/go.mod h1:7YlXMU5kaJVKc15ilLYkVhbv9F3UGIPUpIrQuwSQa4E= +knative.dev/pkg v0.0.0-20260615201544-6300c57a9e78 h1:l6cYazykii3EtY+DVcQOBMgGQ9WBnhAKeGRSo+15JiI= +knative.dev/pkg v0.0.0-20260615201544-6300c57a9e78/go.mod h1:Ve19ZYW7DwIfQL4oCT9t9zmPp4egv0KacKVPXUcivDQ= knative.dev/reconciler-test v0.0.0-20260602150814-125bf8d48e1c h1:M8GPqYxAGOLtvqk0fNUjnJp6rzxqpIRCbESh3hfMsVs= knative.dev/reconciler-test v0.0.0-20260602150814-125bf8d48e1c/go.mod h1:QBq7VXOkZ5mbuXn7lK2pGCXLGIJsRyh0hSjoF1wd1Ds= -knative.dev/serving v0.49.1-0.20260615142436-1683170667da h1:NCH/hZJYf7B6cqWBYLxk1ejk27fkECr1lLyRm+jLBPk= -knative.dev/serving v0.49.1-0.20260615142436-1683170667da/go.mod h1:fHHSUO7OjvmJmDIQ2VBsAHUjN24c75FmW2vudgbMU48= +knative.dev/serving v0.49.1-0.20260615163344-394d3f959991 h1:r5GvAQqSlqp2hIYggUG4EqIUaRIa6LnIPAv/vbdwUNk= +knative.dev/serving v0.49.1-0.20260615163344-394d3f959991/go.mod h1:c6/K7I2EwY8LGJw2+9ktka1OOXJPD7Z3XqnjSX07wxw= nhooyr.io/websocket v1.8.6/go.mod h1:B70DZP8IakI65RVQ51MsWP/8jndNma26DVA/nFSCgW0= pgregory.net/rapid v1.1.0 h1:CMa0sjHSru3puNx+J0MIAuiiEV4N0qj8/cMWGBBCsjw= pgregory.net/rapid v1.1.0/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04= diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go index b850e772e1..bfe546b60a 100644 --- a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go @@ -20,7 +20,7 @@ func chacha20Poly1305Open(dst []byte, key []uint32, src, ad []byte) bool func chacha20Poly1305Seal(dst []byte, key []uint32, src, ad []byte) var ( - useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2 + useAVX2 = cpu.X86.HasSSSE3 && cpu.X86.HasAVX2 && cpu.X86.HasBMI2 ) // setupState writes a ChaCha20 input matrix to state. See @@ -47,7 +47,7 @@ func setupState(state *[16]uint32, key *[32]byte, nonce []byte) { } func (c *chacha20poly1305) seal(dst, nonce, plaintext, additionalData []byte) []byte { - if !cpu.X86.HasSSSE3 { + if !useAVX2 { return c.sealGeneric(dst, nonce, plaintext, additionalData) } @@ -66,7 +66,7 @@ func (c *chacha20poly1305) seal(dst, nonce, plaintext, additionalData []byte) [] } func (c *chacha20poly1305) open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) { - if !cpu.X86.HasSSSE3 { + if !useAVX2 { return c.openGeneric(dst, nonce, ciphertext, additionalData) } diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s index fd5ee845f9..c703c13471 100644 --- a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s @@ -6,27 +6,6 @@ // func polyHashADInternal<>() TEXT polyHashADInternal<>(SB), NOSPLIT, $0 - // Hack: Must declare #define macros inside of a function due to Avo constraints - // ROL rotates the uint32s in register R left by N bits, using temporary T. - #define ROL(N, R, T) \ - MOVO R, T; \ - PSLLL $(N), T; \ - PSRLL $(32-(N)), R; \ - PXOR T, R - - // ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. - #ifdef GOAMD64_v2 - #define ROL8(R, T) PSHUFB ·rol8<>(SB), R - #else - #define ROL8(R, T) ROL(8, R, T) - #endif - - // ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. - #ifdef GOAMD64_v2 - #define ROL16(R, T) PSHUFB ·rol16<>(SB), R - #else - #define ROL16(R, T) ROL(16, R, T) - #endif XORQ R10, R10 XORQ R11, R11 XORQ R12, R12 @@ -192,676 +171,112 @@ hashADDone: // Requires: AVX, AVX2, BMI2, CMOV, SSE2 TEXT ·chacha20Poly1305Open(SB), $288-97 // For aligned stack access - MOVQ SP, BP - ADDQ $0x20, BP - ANDQ $-32, BP - MOVQ dst_base+0(FP), DI - MOVQ key_base+24(FP), R8 - MOVQ src_base+48(FP), SI - MOVQ src_len+56(FP), BX - MOVQ ad_base+72(FP), CX - - // Check for AVX2 support - CMPB ·useAVX2+0(SB), $0x01 - JE chacha20Poly1305Open_AVX2 + MOVQ SP, BP + ADDQ $0x20, BP + ANDQ $-32, BP + MOVQ dst_base+0(FP), DI + MOVQ key_base+24(FP), R8 + MOVQ src_base+48(FP), SI + MOVQ src_len+56(FP), BX + MOVQ ad_base+72(FP), CX + VZEROUPPER + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VBROADCASTI128 16(R8), Y14 + VBROADCASTI128 32(R8), Y12 + VBROADCASTI128 48(R8), Y4 + VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 // Special optimization, for very short buffers - CMPQ BX, $0x80 - JBE openSSE128 - - // For long buffers, prepare the poly key first - MOVOU ·chacha20Constants<>+0(SB), X0 - MOVOU 16(R8), X3 - MOVOU 32(R8), X6 - MOVOU 48(R8), X9 - MOVO X9, X13 - - // Store state on stack for future use - MOVO X3, 32(BP) - MOVO X6, 48(BP) - MOVO X9, 128(BP) - MOVQ $0x0000000a, R9 - -openSSEPreparePolyKey: - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - DECQ R9 - JNE openSSEPreparePolyKey + CMPQ BX, $0xc0 + JBE openAVX2192 + CMPQ BX, $0x00000140 + JBE openAVX2320 + + // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream + VMOVDQA Y14, 32(BP) + VMOVDQA Y12, 64(BP) + VMOVDQA Y4, 192(BP) + MOVQ $0x0000000a, R9 + +openAVX2PreparePolyKey: + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + DECQ R9 + JNE openAVX2PreparePolyKey + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD 32(BP), Y14, Y14 + VPADDD 64(BP), Y12, Y12 + VPADDD 192(BP), Y4, Y4 + VPERM2I128 $0x02, Y0, Y14, Y3 - // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL 32(BP), X3 + // Clamp and store poly key + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) - // Clamp and store the key - PAND ·polyClampMask<>+0(SB), X0 - MOVO X0, (BP) - MOVO X3, 16(BP) + // Stream for the first 64 bytes + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 - // Hash AAD + // Hash AD + first 64 bytes MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) + XORQ CX, CX -openSSEMainLoop: - CMPQ BX, $0x00000100 - JB openSSEMainLoopDone - - // Load state, increment counter blocks - MOVO ·chacha20Constants<>+0(SB), X0 - MOVO 32(BP), X3 - MOVO 48(BP), X6 - MOVO 128(BP), X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X1, X2 - MOVO X4, X5 - MOVO X7, X8 - MOVO X10, X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X2, X12 - MOVO X5, X13 - MOVO X8, X14 - MOVO X11, X15 - PADDL ·sseIncMask<>+0(SB), X15 - - // Store counters - MOVO X9, 80(BP) - MOVO X10, 96(BP) - MOVO X11, 112(BP) - MOVO X15, 128(BP) - - // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash - // 2 blocks, and for the remaining 4 only 1 block - for a total of 16 - MOVQ $0x00000004, CX - MOVQ SI, R9 - -openSSEInternalLoop: - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 - ADDQ (R9), R10 - ADCQ 8(R9), R11 - ADCQ $0x01, R12 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x0c - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - LEAQ 16(R9), R9 - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x04 - DECQ CX - JGE openSSEInternalLoop - ADDQ (R9), R10 - ADCQ 8(R9), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(R9), R9 - CMPQ CX, $-6 - JG openSSEInternalLoop - - // Add in the state - PADDD ·chacha20Constants<>+0(SB), X0 - PADDD ·chacha20Constants<>+0(SB), X1 - PADDD ·chacha20Constants<>+0(SB), X2 - PADDD ·chacha20Constants<>+0(SB), X12 - PADDD 32(BP), X3 - PADDD 32(BP), X4 - PADDD 32(BP), X5 - PADDD 32(BP), X13 - PADDD 48(BP), X6 - PADDD 48(BP), X7 - PADDD 48(BP), X8 - PADDD 48(BP), X14 - PADDD 80(BP), X9 - PADDD 96(BP), X10 - PADDD 112(BP), X11 - PADDD 128(BP), X15 - - // Load - xor - store - MOVO X15, 64(BP) - MOVOU (SI), X15 - PXOR X15, X0 - MOVOU X0, (DI) - MOVOU 16(SI), X15 - PXOR X15, X3 - MOVOU X3, 16(DI) - MOVOU 32(SI), X15 - PXOR X15, X6 - MOVOU X6, 32(DI) - MOVOU 48(SI), X15 - PXOR X15, X9 - MOVOU X9, 48(DI) - MOVOU 64(SI), X9 - PXOR X9, X1 - MOVOU X1, 64(DI) - MOVOU 80(SI), X9 - PXOR X9, X4 - MOVOU X4, 80(DI) - MOVOU 96(SI), X9 - PXOR X9, X7 - MOVOU X7, 96(DI) - MOVOU 112(SI), X9 - PXOR X9, X10 - MOVOU X10, 112(DI) - MOVOU 128(SI), X9 - PXOR X9, X2 - MOVOU X2, 128(DI) - MOVOU 144(SI), X9 - PXOR X9, X5 - MOVOU X5, 144(DI) - MOVOU 160(SI), X9 - PXOR X9, X8 - MOVOU X8, 160(DI) - MOVOU 176(SI), X9 - PXOR X9, X11 - MOVOU X11, 176(DI) - MOVOU 192(SI), X9 - PXOR X9, X12 - MOVOU X12, 192(DI) - MOVOU 208(SI), X9 - PXOR X9, X13 - MOVOU X13, 208(DI) - MOVOU 224(SI), X9 - PXOR X9, X14 - MOVOU X14, 224(DI) - MOVOU 240(SI), X9 - PXOR 64(BP), X9 - MOVOU X9, 240(DI) - LEAQ 256(SI), SI - LEAQ 256(DI), DI - SUBQ $0x00000100, BX - JMP openSSEMainLoop - -openSSEMainLoopDone: - // Handle the various tail sizes efficiently - TESTQ BX, BX - JE openSSEFinalize - CMPQ BX, $0x40 - JBE openSSETail64 - CMPQ BX, $0x80 - JBE openSSETail128 - CMPQ BX, $0xc0 - JBE openSSETail192 - JMP openSSETail256 - -openSSEFinalize: - // Hash in the PT, AAD lengths - ADDQ ad_len+80(FP), R10 - ADCQ src_len+56(FP), R11 +openAVX2InitialHash64: + ADDQ (SI)(CX*1), R10 + ADCQ 8(SI)(CX*1), R11 ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 IMULQ R12, R15 + MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 @@ -878,3187 +293,494 @@ openSSEFinalize: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 + ADDQ $0x10, CX + CMPQ CX, $0x40 + JNE openAVX2InitialHash64 - // Final reduce - MOVQ R10, R13 - MOVQ R11, R14 - MOVQ R12, R15 - SUBQ $-5, R10 - SBBQ $-1, R11 - SBBQ $0x03, R12 - CMOVQCS R13, R10 - CMOVQCS R14, R11 - CMOVQCS R15, R12 - - // Add in the "s" part of the key - ADDQ 16(BP), R10 - ADCQ 24(BP), R11 - - // Finally, constant time compare to the tag at the end of the message - XORQ AX, AX - MOVQ $0x00000001, DX - XORQ (SI), R10 - XORQ 8(SI), R11 - ORQ R11, R10 - CMOVQEQ DX, AX + // Decrypt the first 64 bytes + VPXOR (SI), Y0, Y0 + VPXOR 32(SI), Y14, Y14 + VMOVDQU Y0, (DI) + VMOVDQU Y14, 32(DI) + LEAQ 64(SI), SI + LEAQ 64(DI), DI + SUBQ $0x40, BX - // Return true iff tags are equal - MOVB AX, ret+96(FP) - RET +openAVX2MainLoop: + CMPQ BX, $0x00000200 + JB openAVX2MainLoopDone -openSSE128: - MOVOU ·chacha20Constants<>+0(SB), X0 - MOVOU 16(R8), X3 - MOVOU 32(R8), X6 - MOVOU 48(R8), X9 - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X1, X2 - MOVO X4, X5 - MOVO X7, X8 - MOVO X10, X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X3, X13 - MOVO X6, X14 - MOVO X10, X15 - MOVQ $0x0000000a, R9 - -openSSE128InnerCipherLoop: - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - DECQ R9 - JNE openSSE128InnerCipherLoop - - // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL ·chacha20Constants<>+0(SB), X1 - PADDL ·chacha20Constants<>+0(SB), X2 - PADDL X13, X3 - PADDL X13, X4 - PADDL X13, X5 - PADDL X14, X7 - PADDL X14, X8 - PADDL X15, X10 - PADDL ·sseIncMask<>+0(SB), X15 - PADDL X15, X11 - - // Clamp and store the key - PAND ·polyClampMask<>+0(SB), X0 - MOVOU X0, (BP) - MOVOU X3, 16(BP) + // Load state, increment counter blocks, store the incremented counters + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + XORQ CX, CX - // Hash - MOVQ ad_len+80(FP), R9 - CALL polyHashADInternal<>(SB) - -openSSE128Open: - CMPQ BX, $0x10 - JB openSSETail16 - SUBQ $0x10, BX - - // Load for hashing - ADDQ (SI), R10 - ADCQ 8(SI), R11 - ADCQ $0x01, R12 - - // Load for decryption - MOVOU (SI), X12 - PXOR X12, X1 - MOVOU X1, (DI) - LEAQ 16(SI), SI - LEAQ 16(DI), DI - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - - // Shift the stream "left" - MOVO X4, X1 - MOVO X7, X4 - MOVO X10, X7 - MOVO X2, X10 - MOVO X5, X2 - MOVO X8, X5 - MOVO X11, X8 - JMP openSSE128Open - -openSSETail16: - TESTQ BX, BX - JE openSSEFinalize - - // We can safely load the CT from the end, because it is padded with the MAC - MOVQ BX, R9 - SHLQ $0x04, R9 - LEAQ ·andMask<>+0(SB), R13 - MOVOU (SI), X12 - ADDQ BX, SI - PAND -16(R13)(R9*1), X12 - MOVO X12, 64(BP) - MOVQ X12, R13 - MOVQ 72(BP), R14 - PXOR X1, X12 - - // We can only store one byte at a time, since plaintext can be shorter than 16 bytes -openSSETail16Store: - MOVQ X12, R8 - MOVB R8, (DI) - PSRLDQ $0x01, X12 - INCQ DI - DECQ BX - JNE openSSETail16Store - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - JMP openSSEFinalize - -openSSETail64: - MOVO ·chacha20Constants<>+0(SB), X0 - MOVO 32(BP), X3 - MOVO 48(BP), X6 - MOVO 128(BP), X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X9, 80(BP) - XORQ R9, R9 - MOVQ BX, CX - CMPQ CX, $0x10 - JB openSSETail64LoopB - -openSSETail64LoopA: - ADDQ (SI)(R9*1), R10 - ADCQ 8(SI)(R9*1), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - SUBQ $0x10, CX - -openSSETail64LoopB: - ADDQ $0x10, R9 - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - CMPQ CX, $0x10 - JAE openSSETail64LoopA - CMPQ R9, $0xa0 - JNE openSSETail64LoopB - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL 32(BP), X3 - PADDL 48(BP), X6 - PADDL 80(BP), X9 - -openSSETail64DecLoop: - CMPQ BX, $0x10 - JB openSSETail64DecLoopDone - SUBQ $0x10, BX - MOVOU (SI), X12 - PXOR X12, X0 - MOVOU X0, (DI) - LEAQ 16(SI), SI - LEAQ 16(DI), DI - MOVO X3, X0 - MOVO X6, X3 - MOVO X9, X6 - JMP openSSETail64DecLoop - -openSSETail64DecLoopDone: - MOVO X0, X1 - JMP openSSETail16 - -openSSETail128: - MOVO ·chacha20Constants<>+0(SB), X1 - MOVO 32(BP), X4 - MOVO 48(BP), X7 - MOVO 128(BP), X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X10, 80(BP) - MOVO X1, X0 - MOVO X4, X3 - MOVO X7, X6 - MOVO X10, X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X9, 96(BP) - XORQ R9, R9 - MOVQ BX, CX - ANDQ $-16, CX - -openSSETail128LoopA: - ADDQ (SI)(R9*1), R10 - ADCQ 8(SI)(R9*1), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - -openSSETail128LoopB: - ADDQ $0x10, R9 - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - CMPQ R9, CX - JB openSSETail128LoopA - CMPQ R9, $0xa0 - JNE openSSETail128LoopB - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL ·chacha20Constants<>+0(SB), X1 - PADDL 32(BP), X3 - PADDL 32(BP), X4 - PADDL 48(BP), X6 - PADDL 48(BP), X7 - PADDL 96(BP), X9 - PADDL 80(BP), X10 - MOVOU (SI), X12 - MOVOU 16(SI), X13 - MOVOU 32(SI), X14 - MOVOU 48(SI), X15 - PXOR X12, X1 - PXOR X13, X4 - PXOR X14, X7 - PXOR X15, X10 - MOVOU X1, (DI) - MOVOU X4, 16(DI) - MOVOU X7, 32(DI) - MOVOU X10, 48(DI) - SUBQ $0x40, BX - LEAQ 64(SI), SI - LEAQ 64(DI), DI - JMP openSSETail64DecLoop - -openSSETail192: - MOVO ·chacha20Constants<>+0(SB), X2 - MOVO 32(BP), X5 - MOVO 48(BP), X8 - MOVO 128(BP), X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X11, 80(BP) - MOVO X2, X1 - MOVO X5, X4 - MOVO X8, X7 - MOVO X11, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X10, 96(BP) - MOVO X1, X0 - MOVO X4, X3 - MOVO X7, X6 - MOVO X10, X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X9, 112(BP) - MOVQ BX, CX - MOVQ $0x000000a0, R9 - CMPQ CX, $0xa0 - CMOVQGT R9, CX - ANDQ $-16, CX - XORQ R9, R9 - -openSSLTail192LoopA: - ADDQ (SI)(R9*1), R10 - ADCQ 8(SI)(R9*1), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - -openSSLTail192LoopB: - ADDQ $0x10, R9 - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - CMPQ R9, CX - JB openSSLTail192LoopA - CMPQ R9, $0xa0 - JNE openSSLTail192LoopB - CMPQ BX, $0xb0 - JB openSSLTail192Store - ADDQ 160(SI), R10 - ADCQ 168(SI), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - CMPQ BX, $0xc0 - JB openSSLTail192Store - ADDQ 176(SI), R10 - ADCQ 184(SI), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - -openSSLTail192Store: - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL ·chacha20Constants<>+0(SB), X1 - PADDL ·chacha20Constants<>+0(SB), X2 - PADDL 32(BP), X3 - PADDL 32(BP), X4 - PADDL 32(BP), X5 - PADDL 48(BP), X6 - PADDL 48(BP), X7 - PADDL 48(BP), X8 - PADDL 112(BP), X9 - PADDL 96(BP), X10 - PADDL 80(BP), X11 - MOVOU (SI), X12 - MOVOU 16(SI), X13 - MOVOU 32(SI), X14 - MOVOU 48(SI), X15 - PXOR X12, X2 - PXOR X13, X5 - PXOR X14, X8 - PXOR X15, X11 - MOVOU X2, (DI) - MOVOU X5, 16(DI) - MOVOU X8, 32(DI) - MOVOU X11, 48(DI) - MOVOU 64(SI), X12 - MOVOU 80(SI), X13 - MOVOU 96(SI), X14 - MOVOU 112(SI), X15 - PXOR X12, X1 - PXOR X13, X4 - PXOR X14, X7 - PXOR X15, X10 - MOVOU X1, 64(DI) - MOVOU X4, 80(DI) - MOVOU X7, 96(DI) - MOVOU X10, 112(DI) - SUBQ $0x80, BX - LEAQ 128(SI), SI - LEAQ 128(DI), DI - JMP openSSETail64DecLoop - -openSSETail256: - MOVO ·chacha20Constants<>+0(SB), X0 - MOVO 32(BP), X3 - MOVO 48(BP), X6 - MOVO 128(BP), X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X1, X2 - MOVO X4, X5 - MOVO X7, X8 - MOVO X10, X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X2, X12 - MOVO X5, X13 - MOVO X8, X14 - MOVO X11, X15 - PADDL ·sseIncMask<>+0(SB), X15 - - // Store counters - MOVO X9, 80(BP) - MOVO X10, 96(BP) - MOVO X11, 112(BP) - MOVO X15, 128(BP) - XORQ R9, R9 - -openSSETail256Loop: - ADDQ (SI)(R9*1), R10 - ADCQ 8(SI)(R9*1), R11 - ADCQ $0x01, R12 - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x0c - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x04 - ADDQ $0x10, R9 - CMPQ R9, $0xa0 - JB openSSETail256Loop - MOVQ BX, CX - ANDQ $-16, CX - -openSSETail256HashLoop: - ADDQ (SI)(R9*1), R10 - ADCQ 8(SI)(R9*1), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - ADDQ $0x10, R9 - CMPQ R9, CX - JB openSSETail256HashLoop - - // Add in the state - PADDD ·chacha20Constants<>+0(SB), X0 - PADDD ·chacha20Constants<>+0(SB), X1 - PADDD ·chacha20Constants<>+0(SB), X2 - PADDD ·chacha20Constants<>+0(SB), X12 - PADDD 32(BP), X3 - PADDD 32(BP), X4 - PADDD 32(BP), X5 - PADDD 32(BP), X13 - PADDD 48(BP), X6 - PADDD 48(BP), X7 - PADDD 48(BP), X8 - PADDD 48(BP), X14 - PADDD 80(BP), X9 - PADDD 96(BP), X10 - PADDD 112(BP), X11 - PADDD 128(BP), X15 - MOVO X15, 64(BP) - - // Load - xor - store - MOVOU (SI), X15 - PXOR X15, X0 - MOVOU 16(SI), X15 - PXOR X15, X3 - MOVOU 32(SI), X15 - PXOR X15, X6 - MOVOU 48(SI), X15 - PXOR X15, X9 - MOVOU X0, (DI) - MOVOU X3, 16(DI) - MOVOU X6, 32(DI) - MOVOU X9, 48(DI) - MOVOU 64(SI), X0 - MOVOU 80(SI), X3 - MOVOU 96(SI), X6 - MOVOU 112(SI), X9 - PXOR X0, X1 - PXOR X3, X4 - PXOR X6, X7 - PXOR X9, X10 - MOVOU X1, 64(DI) - MOVOU X4, 80(DI) - MOVOU X7, 96(DI) - MOVOU X10, 112(DI) - MOVOU 128(SI), X0 - MOVOU 144(SI), X3 - MOVOU 160(SI), X6 - MOVOU 176(SI), X9 - PXOR X0, X2 - PXOR X3, X5 - PXOR X6, X8 - PXOR X9, X11 - MOVOU X2, 128(DI) - MOVOU X5, 144(DI) - MOVOU X8, 160(DI) - MOVOU X11, 176(DI) - LEAQ 192(SI), SI - LEAQ 192(DI), DI - SUBQ $0xc0, BX - MOVO X12, X0 - MOVO X13, X3 - MOVO X14, X6 - MOVO 64(BP), X9 - JMP openSSETail64DecLoop - -chacha20Poly1305Open_AVX2: - VZEROUPPER - VMOVDQU ·chacha20Constants<>+0(SB), Y0 - BYTE $0xc4 - BYTE $0x42 - BYTE $0x7d - BYTE $0x5a - BYTE $0x70 - BYTE $0x10 - BYTE $0xc4 - BYTE $0x42 - BYTE $0x7d - BYTE $0x5a - BYTE $0x60 - BYTE $0x20 - BYTE $0xc4 - BYTE $0xc2 - BYTE $0x7d - BYTE $0x5a - BYTE $0x60 - BYTE $0x30 - VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 - - // Special optimization, for very short buffers - CMPQ BX, $0xc0 - JBE openAVX2192 - CMPQ BX, $0x00000140 - JBE openAVX2320 - - // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream - VMOVDQA Y14, 32(BP) - VMOVDQA Y12, 64(BP) - VMOVDQA Y4, 192(BP) - MOVQ $0x0000000a, R9 - -openAVX2PreparePolyKey: - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPALIGNR $0x04, Y14, Y14, Y14 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x0c, Y4, Y4, Y4 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPALIGNR $0x0c, Y14, Y14, Y14 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x04, Y4, Y4, Y4 - DECQ R9 - JNE openAVX2PreparePolyKey - VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 - VPADDD 32(BP), Y14, Y14 - VPADDD 64(BP), Y12, Y12 - VPADDD 192(BP), Y4, Y4 - VPERM2I128 $0x02, Y0, Y14, Y3 - - // Clamp and store poly key - VPAND ·polyClampMask<>+0(SB), Y3, Y3 - VMOVDQA Y3, (BP) - - // Stream for the first 64 bytes - VPERM2I128 $0x13, Y0, Y14, Y0 - VPERM2I128 $0x13, Y12, Y4, Y14 - - // Hash AD + first 64 bytes - MOVQ ad_len+80(FP), R9 - CALL polyHashADInternal<>(SB) - XORQ CX, CX - -openAVX2InitialHash64: - ADDQ (SI)(CX*1), R10 - ADCQ 8(SI)(CX*1), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - ADDQ $0x10, CX - CMPQ CX, $0x40 - JNE openAVX2InitialHash64 - - // Decrypt the first 64 bytes - VPXOR (SI), Y0, Y0 - VPXOR 32(SI), Y14, Y14 - VMOVDQU Y0, (DI) - VMOVDQU Y14, 32(DI) - LEAQ 64(SI), SI - LEAQ 64(DI), DI - SUBQ $0x40, BX - -openAVX2MainLoop: - CMPQ BX, $0x00000200 - JB openAVX2MainLoopDone - - // Load state, increment counter blocks, store the incremented counters - VMOVDQU ·chacha20Constants<>+0(SB), Y0 - VMOVDQA Y0, Y5 - VMOVDQA Y0, Y6 - VMOVDQA Y0, Y7 - VMOVDQA 32(BP), Y14 - VMOVDQA Y14, Y9 - VMOVDQA Y14, Y10 - VMOVDQA Y14, Y11 - VMOVDQA 64(BP), Y12 - VMOVDQA Y12, Y13 - VMOVDQA Y12, Y8 - VMOVDQA Y12, Y15 - VMOVDQA 192(BP), Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 - VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 - VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 - VMOVDQA Y4, 96(BP) - VMOVDQA Y1, 128(BP) - VMOVDQA Y2, 160(BP) - VMOVDQA Y3, 192(BP) - XORQ CX, CX - -openAVX2InternalLoop: - ADDQ (SI)(CX*1), R10 - ADCQ 8(SI)(CX*1), R11 - ADCQ $0x01, R12 - VPADDD Y14, Y0, Y0 - VPADDD Y9, Y5, Y5 - VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y3, Y3 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 - VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 - VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 - VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - VMOVDQA Y15, 224(BP) - VPSLLD $0x0c, Y14, Y15 - VPSRLD $0x14, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x0c, Y9, Y15 - VPSRLD $0x14, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x0c, Y10, Y15 - VPSRLD $0x14, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x0c, Y11, Y15 - VPSRLD $0x14, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - VPADDD Y14, Y0, Y0 - VPADDD Y9, Y5, Y5 - VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 - VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y3, Y3 - ADDQ 16(SI)(CX*1), R10 - ADCQ 24(SI)(CX*1), R11 - ADCQ $0x01, R12 - VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 - VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 - VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - VMOVDQA Y15, 224(BP) - VPSLLD $0x07, Y14, Y15 - VPSRLD $0x19, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x07, Y9, Y15 - VPSRLD $0x19, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x07, Y10, Y15 - VPSRLD $0x19, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x07, Y11, Y15 - VPSRLD $0x19, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - VPALIGNR $0x04, Y14, Y14, Y14 - VPALIGNR $0x04, Y9, Y9, Y9 - VPALIGNR $0x04, Y10, Y10, Y10 - VPALIGNR $0x04, Y11, Y11, Y11 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x08, Y15, Y15, Y15 - VPALIGNR $0x0c, Y4, Y4, Y4 - VPALIGNR $0x0c, Y1, Y1, Y1 - VPALIGNR $0x0c, Y2, Y2, Y2 - VPALIGNR $0x0c, Y3, Y3, Y3 - VPADDD Y14, Y0, Y0 - VPADDD Y9, Y5, Y5 - VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y3, Y3 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 - VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 - VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 - VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - ADDQ 32(SI)(CX*1), R10 - ADCQ 40(SI)(CX*1), R11 - ADCQ $0x01, R12 - LEAQ 48(CX), CX - VMOVDQA Y15, 224(BP) - VPSLLD $0x0c, Y14, Y15 - VPSRLD $0x14, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x0c, Y9, Y15 - VPSRLD $0x14, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x0c, Y10, Y15 - VPSRLD $0x14, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x0c, Y11, Y15 - VPSRLD $0x14, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - VPADDD Y14, Y0, Y0 - VPADDD Y9, Y5, Y5 - VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 - VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y3, Y3 - VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 - VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 - VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - VMOVDQA Y15, 224(BP) - VPSLLD $0x07, Y14, Y15 - VPSRLD $0x19, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x07, Y9, Y15 - VPSRLD $0x19, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x07, Y10, Y15 - VPSRLD $0x19, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x07, Y11, Y15 - VPSRLD $0x19, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - VPALIGNR $0x0c, Y14, Y14, Y14 - VPALIGNR $0x0c, Y9, Y9, Y9 - VPALIGNR $0x0c, Y10, Y10, Y10 - VPALIGNR $0x0c, Y11, Y11, Y11 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x08, Y15, Y15, Y15 - VPALIGNR $0x04, Y4, Y4, Y4 - VPALIGNR $0x04, Y1, Y1, Y1 - VPALIGNR $0x04, Y2, Y2, Y2 - VPALIGNR $0x04, Y3, Y3, Y3 - CMPQ CX, $0x000001e0 - JNE openAVX2InternalLoop - VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 - VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 - VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 - VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 - VPADDD 32(BP), Y14, Y14 - VPADDD 32(BP), Y9, Y9 - VPADDD 32(BP), Y10, Y10 - VPADDD 32(BP), Y11, Y11 - VPADDD 64(BP), Y12, Y12 - VPADDD 64(BP), Y13, Y13 - VPADDD 64(BP), Y8, Y8 - VPADDD 64(BP), Y15, Y15 - VPADDD 96(BP), Y4, Y4 - VPADDD 128(BP), Y1, Y1 - VPADDD 160(BP), Y2, Y2 - VPADDD 192(BP), Y3, Y3 - VMOVDQA Y15, 224(BP) - - // We only hashed 480 of the 512 bytes available - hash the remaining 32 here - ADDQ 480(SI), R10 - ADCQ 488(SI), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - VPERM2I128 $0x02, Y0, Y14, Y15 - VPERM2I128 $0x13, Y0, Y14, Y14 - VPERM2I128 $0x02, Y12, Y4, Y0 - VPERM2I128 $0x13, Y12, Y4, Y12 - VPXOR (SI), Y15, Y15 - VPXOR 32(SI), Y0, Y0 - VPXOR 64(SI), Y14, Y14 - VPXOR 96(SI), Y12, Y12 - VMOVDQU Y15, (DI) - VMOVDQU Y0, 32(DI) - VMOVDQU Y14, 64(DI) - VMOVDQU Y12, 96(DI) - VPERM2I128 $0x02, Y5, Y9, Y0 - VPERM2I128 $0x02, Y13, Y1, Y14 - VPERM2I128 $0x13, Y5, Y9, Y12 - VPERM2I128 $0x13, Y13, Y1, Y4 - VPXOR 128(SI), Y0, Y0 - VPXOR 160(SI), Y14, Y14 - VPXOR 192(SI), Y12, Y12 - VPXOR 224(SI), Y4, Y4 - VMOVDQU Y0, 128(DI) - VMOVDQU Y14, 160(DI) - VMOVDQU Y12, 192(DI) - VMOVDQU Y4, 224(DI) - - // and here - ADDQ 496(SI), R10 - ADCQ 504(SI), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - VPERM2I128 $0x02, Y6, Y10, Y0 - VPERM2I128 $0x02, Y8, Y2, Y14 - VPERM2I128 $0x13, Y6, Y10, Y12 - VPERM2I128 $0x13, Y8, Y2, Y4 - VPXOR 256(SI), Y0, Y0 - VPXOR 288(SI), Y14, Y14 - VPXOR 320(SI), Y12, Y12 - VPXOR 352(SI), Y4, Y4 - VMOVDQU Y0, 256(DI) - VMOVDQU Y14, 288(DI) - VMOVDQU Y12, 320(DI) - VMOVDQU Y4, 352(DI) - VPERM2I128 $0x02, Y7, Y11, Y0 - VPERM2I128 $0x02, 224(BP), Y3, Y14 - VPERM2I128 $0x13, Y7, Y11, Y12 - VPERM2I128 $0x13, 224(BP), Y3, Y4 - VPXOR 384(SI), Y0, Y0 - VPXOR 416(SI), Y14, Y14 - VPXOR 448(SI), Y12, Y12 - VPXOR 480(SI), Y4, Y4 - VMOVDQU Y0, 384(DI) - VMOVDQU Y14, 416(DI) - VMOVDQU Y12, 448(DI) - VMOVDQU Y4, 480(DI) - LEAQ 512(SI), SI - LEAQ 512(DI), DI - SUBQ $0x00000200, BX - JMP openAVX2MainLoop - -openAVX2MainLoopDone: - // Handle the various tail sizes efficiently - TESTQ BX, BX - JE openSSEFinalize - CMPQ BX, $0x80 - JBE openAVX2Tail128 - CMPQ BX, $0x00000100 - JBE openAVX2Tail256 - CMPQ BX, $0x00000180 - JBE openAVX2Tail384 - JMP openAVX2Tail512 - -openAVX2192: - VMOVDQA Y0, Y5 - VMOVDQA Y14, Y9 - VMOVDQA Y12, Y13 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 - VMOVDQA Y0, Y6 - VMOVDQA Y14, Y10 - VMOVDQA Y12, Y8 - VMOVDQA Y4, Y2 - VMOVDQA Y1, Y15 - MOVQ $0x0000000a, R9 - -openAVX2192InnerCipherLoop: - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPALIGNR $0x04, Y14, Y14, Y14 - VPALIGNR $0x04, Y9, Y9, Y9 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x0c, Y4, Y4, Y4 - VPALIGNR $0x0c, Y1, Y1, Y1 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPALIGNR $0x0c, Y14, Y14, Y14 - VPALIGNR $0x0c, Y9, Y9, Y9 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x04, Y4, Y4, Y4 - VPALIGNR $0x04, Y1, Y1, Y1 - DECQ R9 - JNE openAVX2192InnerCipherLoop - VPADDD Y6, Y0, Y0 - VPADDD Y6, Y5, Y5 - VPADDD Y10, Y14, Y14 - VPADDD Y10, Y9, Y9 - VPADDD Y8, Y12, Y12 - VPADDD Y8, Y13, Y13 - VPADDD Y2, Y4, Y4 - VPADDD Y15, Y1, Y1 - VPERM2I128 $0x02, Y0, Y14, Y3 - - // Clamp and store poly key - VPAND ·polyClampMask<>+0(SB), Y3, Y3 - VMOVDQA Y3, (BP) - - // Stream for up to 192 bytes - VPERM2I128 $0x13, Y0, Y14, Y0 - VPERM2I128 $0x13, Y12, Y4, Y14 - VPERM2I128 $0x02, Y5, Y9, Y12 - VPERM2I128 $0x02, Y13, Y1, Y4 - VPERM2I128 $0x13, Y5, Y9, Y5 - VPERM2I128 $0x13, Y13, Y1, Y9 - -openAVX2ShortOpen: - // Hash - MOVQ ad_len+80(FP), R9 - CALL polyHashADInternal<>(SB) - -openAVX2ShortOpenLoop: - CMPQ BX, $0x20 - JB openAVX2ShortTail32 - SUBQ $0x20, BX - - // Load for hashing - ADDQ (SI), R10 - ADCQ 8(SI), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - ADDQ 16(SI), R10 - ADCQ 24(SI), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - - // Load for decryption - VPXOR (SI), Y0, Y0 - VMOVDQU Y0, (DI) - LEAQ 32(SI), SI - LEAQ 32(DI), DI - - // Shift stream left - VMOVDQA Y14, Y0 - VMOVDQA Y12, Y14 - VMOVDQA Y4, Y12 - VMOVDQA Y5, Y4 - VMOVDQA Y9, Y5 - VMOVDQA Y13, Y9 - VMOVDQA Y1, Y13 - VMOVDQA Y6, Y1 - VMOVDQA Y10, Y6 - JMP openAVX2ShortOpenLoop - -openAVX2ShortTail32: - CMPQ BX, $0x10 - VMOVDQA X0, X1 - JB openAVX2ShortDone - SUBQ $0x10, BX - - // Load for hashing - ADDQ (SI), R10 - ADCQ 8(SI), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - - // Load for decryption - VPXOR (SI), X0, X12 - VMOVDQU X12, (DI) - LEAQ 16(SI), SI - LEAQ 16(DI), DI - VPERM2I128 $0x11, Y0, Y0, Y0 - VMOVDQA X0, X1 - -openAVX2ShortDone: - VZEROUPPER - JMP openSSETail16 - -openAVX2320: - VMOVDQA Y0, Y5 - VMOVDQA Y14, Y9 - VMOVDQA Y12, Y13 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 - VMOVDQA Y0, Y6 - VMOVDQA Y14, Y10 - VMOVDQA Y12, Y8 - VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 - VMOVDQA Y14, Y7 - VMOVDQA Y12, Y11 - VMOVDQA Y4, Y15 - MOVQ $0x0000000a, R9 - -openAVX2320InnerCipherLoop: - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x0c, Y10, Y3 - VPSRLD $0x14, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x07, Y10, Y3 - VPSRLD $0x19, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPALIGNR $0x04, Y14, Y14, Y14 - VPALIGNR $0x04, Y9, Y9, Y9 - VPALIGNR $0x04, Y10, Y10, Y10 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x0c, Y4, Y4, Y4 - VPALIGNR $0x0c, Y1, Y1, Y1 - VPALIGNR $0x0c, Y2, Y2, Y2 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x0c, Y10, Y3 - VPSRLD $0x14, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x07, Y10, Y3 - VPSRLD $0x19, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPALIGNR $0x0c, Y14, Y14, Y14 - VPALIGNR $0x0c, Y9, Y9, Y9 - VPALIGNR $0x0c, Y10, Y10, Y10 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x04, Y4, Y4, Y4 - VPALIGNR $0x04, Y1, Y1, Y1 - VPALIGNR $0x04, Y2, Y2, Y2 - DECQ R9 - JNE openAVX2320InnerCipherLoop - VMOVDQA ·chacha20Constants<>+0(SB), Y3 - VPADDD Y3, Y0, Y0 - VPADDD Y3, Y5, Y5 - VPADDD Y3, Y6, Y6 - VPADDD Y7, Y14, Y14 - VPADDD Y7, Y9, Y9 - VPADDD Y7, Y10, Y10 - VPADDD Y11, Y12, Y12 - VPADDD Y11, Y13, Y13 - VPADDD Y11, Y8, Y8 - VMOVDQA ·avx2IncMask<>+0(SB), Y3 - VPADDD Y15, Y4, Y4 - VPADDD Y3, Y15, Y15 - VPADDD Y15, Y1, Y1 - VPADDD Y3, Y15, Y15 - VPADDD Y15, Y2, Y2 - - // Clamp and store poly key - VPERM2I128 $0x02, Y0, Y14, Y3 - VPAND ·polyClampMask<>+0(SB), Y3, Y3 - VMOVDQA Y3, (BP) - - // Stream for up to 320 bytes - VPERM2I128 $0x13, Y0, Y14, Y0 - VPERM2I128 $0x13, Y12, Y4, Y14 - VPERM2I128 $0x02, Y5, Y9, Y12 - VPERM2I128 $0x02, Y13, Y1, Y4 - VPERM2I128 $0x13, Y5, Y9, Y5 - VPERM2I128 $0x13, Y13, Y1, Y9 - VPERM2I128 $0x02, Y6, Y10, Y13 - VPERM2I128 $0x02, Y8, Y2, Y1 - VPERM2I128 $0x13, Y6, Y10, Y6 - VPERM2I128 $0x13, Y8, Y2, Y10 - JMP openAVX2ShortOpen - -openAVX2Tail128: - // Need to decrypt up to 128 bytes - prepare two blocks - VMOVDQA ·chacha20Constants<>+0(SB), Y5 - VMOVDQA 32(BP), Y9 - VMOVDQA 64(BP), Y13 - VMOVDQA 192(BP), Y1 - VPADDD ·avx2IncMask<>+0(SB), Y1, Y1 - VMOVDQA Y1, Y4 - XORQ R9, R9 - MOVQ BX, CX - ANDQ $-16, CX - TESTQ CX, CX - JE openAVX2Tail128LoopB - -openAVX2Tail128LoopA: - ADDQ (SI)(R9*1), R10 - ADCQ 8(SI)(R9*1), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - -openAVX2Tail128LoopB: - ADDQ $0x10, R9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPALIGNR $0x04, Y9, Y9, Y9 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x0c, Y1, Y1, Y1 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPALIGNR $0x0c, Y9, Y9, Y9 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x04, Y1, Y1, Y1 - CMPQ R9, CX - JB openAVX2Tail128LoopA - CMPQ R9, $0xa0 - JNE openAVX2Tail128LoopB - VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 - VPADDD 32(BP), Y9, Y9 - VPADDD 64(BP), Y13, Y13 - VPADDD Y4, Y1, Y1 - VPERM2I128 $0x02, Y5, Y9, Y0 - VPERM2I128 $0x02, Y13, Y1, Y14 - VPERM2I128 $0x13, Y5, Y9, Y12 - VPERM2I128 $0x13, Y13, Y1, Y4 - -openAVX2TailLoop: - CMPQ BX, $0x20 - JB openAVX2Tail - SUBQ $0x20, BX - - // Load for decryption - VPXOR (SI), Y0, Y0 - VMOVDQU Y0, (DI) - LEAQ 32(SI), SI - LEAQ 32(DI), DI - VMOVDQA Y14, Y0 - VMOVDQA Y12, Y14 - VMOVDQA Y4, Y12 - JMP openAVX2TailLoop - -openAVX2Tail: - CMPQ BX, $0x10 - VMOVDQA X0, X1 - JB openAVX2TailDone - SUBQ $0x10, BX - - // Load for decryption - VPXOR (SI), X0, X12 - VMOVDQU X12, (DI) - LEAQ 16(SI), SI - LEAQ 16(DI), DI - VPERM2I128 $0x11, Y0, Y0, Y0 - VMOVDQA X0, X1 - -openAVX2TailDone: - VZEROUPPER - JMP openSSETail16 - -openAVX2Tail256: - VMOVDQA ·chacha20Constants<>+0(SB), Y0 - VMOVDQA Y0, Y5 - VMOVDQA 32(BP), Y14 - VMOVDQA Y14, Y9 - VMOVDQA 64(BP), Y12 - VMOVDQA Y12, Y13 - VMOVDQA 192(BP), Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 - VMOVDQA Y4, Y7 - VMOVDQA Y1, Y11 - - // Compute the number of iterations that will hash data - MOVQ BX, 224(BP) - MOVQ BX, CX - SUBQ $0x80, CX - SHRQ $0x04, CX - MOVQ $0x0000000a, R9 - CMPQ CX, $0x0a - CMOVQGT R9, CX - MOVQ SI, BX - XORQ R9, R9 - -openAVX2Tail256LoopA: - ADDQ (BX), R10 - ADCQ 8(BX), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(BX), BX - -openAVX2Tail256LoopB: - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 +openAVX2InternalLoop: + ADDQ (SI)(CX*1), R10 + ADCQ 8(SI)(CX*1), R11 + ADCQ $0x01, R12 VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + ADDQ 16(SI)(CX*1), R10 + ADCQ 24(SI)(CX*1), R11 + ADCQ $0x01, R12 + VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 VPALIGNR $0x04, Y14, Y14, Y14 VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 VPALIGNR $0x0c, Y4, Y4, Y4 VPALIGNR $0x0c, Y1, Y1, Y1 - INCQ R9 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 32(SI)(CX*1), R10 + ADCQ 40(SI)(CX*1), R11 + ADCQ $0x01, R12 + LEAQ 48(CX), CX + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPADDD Y14, Y0, Y0 VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPALIGNR $0x0c, Y14, Y14, Y14 - VPALIGNR $0x0c, Y9, Y9, Y9 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x04, Y4, Y4, Y4 - VPALIGNR $0x04, Y1, Y1, Y1 - CMPQ R9, CX - JB openAVX2Tail256LoopA - CMPQ R9, $0x0a - JNE openAVX2Tail256LoopB - MOVQ BX, R9 - SUBQ SI, BX - MOVQ BX, CX - MOVQ 224(BP), BX - -openAVX2Tail256Hash: - ADDQ $0x10, CX - CMPQ CX, BX - JGT openAVX2Tail256HashEnd - ADDQ (R9), R10 - ADCQ 8(R9), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(R9), R9 - JMP openAVX2Tail256Hash + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + CMPQ CX, $0x000001e0 + JNE openAVX2InternalLoop + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) -openAVX2Tail256HashEnd: - VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 - VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 - VPADDD 32(BP), Y14, Y14 - VPADDD 32(BP), Y9, Y9 - VPADDD 64(BP), Y12, Y12 - VPADDD 64(BP), Y13, Y13 - VPADDD Y7, Y4, Y4 - VPADDD Y11, Y1, Y1 - VPERM2I128 $0x02, Y0, Y14, Y6 - VPERM2I128 $0x02, Y12, Y4, Y10 - VPERM2I128 $0x13, Y0, Y14, Y8 - VPERM2I128 $0x13, Y12, Y4, Y2 + // We only hashed 480 of the 512 bytes available - hash the remaining 32 here + ADDQ 480(SI), R10 + ADCQ 488(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 64(DI) + VMOVDQU Y12, 96(DI) VPERM2I128 $0x02, Y5, Y9, Y0 VPERM2I128 $0x02, Y13, Y1, Y14 VPERM2I128 $0x13, Y5, Y9, Y12 VPERM2I128 $0x13, Y13, Y1, Y4 - VPXOR (SI), Y6, Y6 - VPXOR 32(SI), Y10, Y10 - VPXOR 64(SI), Y8, Y8 - VPXOR 96(SI), Y2, Y2 - VMOVDQU Y6, (DI) - VMOVDQU Y10, 32(DI) - VMOVDQU Y8, 64(DI) - VMOVDQU Y2, 96(DI) - LEAQ 128(SI), SI - LEAQ 128(DI), DI - SUBQ $0x80, BX - JMP openAVX2TailLoop + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) -openAVX2Tail384: - // Need to decrypt up to 384 bytes - prepare six blocks - VMOVDQA ·chacha20Constants<>+0(SB), Y0 - VMOVDQA Y0, Y5 - VMOVDQA Y0, Y6 - VMOVDQA 32(BP), Y14 - VMOVDQA Y14, Y9 - VMOVDQA Y14, Y10 - VMOVDQA 64(BP), Y12 - VMOVDQA Y12, Y13 - VMOVDQA Y12, Y8 - VMOVDQA 192(BP), Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 - VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 - VMOVDQA Y4, 96(BP) - VMOVDQA Y1, 128(BP) - VMOVDQA Y2, 160(BP) + // and here + ADDQ 496(SI), R10 + ADCQ 504(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + VPXOR 384(SI), Y0, Y0 + VPXOR 416(SI), Y14, Y14 + VPXOR 448(SI), Y12, Y12 + VPXOR 480(SI), Y4, Y4 + VMOVDQU Y0, 384(DI) + VMOVDQU Y14, 416(DI) + VMOVDQU Y12, 448(DI) + VMOVDQU Y4, 480(DI) + LEAQ 512(SI), SI + LEAQ 512(DI), DI + SUBQ $0x00000200, BX + JMP openAVX2MainLoop - // Compute the number of iterations that will hash two blocks of data - MOVQ BX, 224(BP) - MOVQ BX, CX - SUBQ $0x00000100, CX - SHRQ $0x04, CX - ADDQ $0x06, CX - MOVQ $0x0000000a, R9 - CMPQ CX, $0x0a - CMOVQGT R9, CX - MOVQ SI, BX - XORQ R9, R9 +openAVX2MainLoopDone: + // Handle the various tail sizes efficiently + TESTQ BX, BX + JE openSSEFinalize + CMPQ BX, $0x80 + JBE openAVX2Tail128 + CMPQ BX, $0x00000100 + JBE openAVX2Tail256 + CMPQ BX, $0x00000180 + JBE openAVX2Tail384 + JMP openAVX2Tail512 -openAVX2Tail384LoopB: - ADDQ (BX), R10 - ADCQ 8(BX), R11 +openSSEFinalize: + // Hash in the PT, AAD lengths + ADDQ ad_len+80(FP), R10 + ADCQ src_len+56(FP), R11 ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 IMULQ R12, R15 - MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 @@ -4075,174 +797,261 @@ openAVX2Tail384LoopB: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - LEAQ 16(BX), BX -openAVX2Tail384LoopA: - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x0c, Y10, Y3 - VPSRLD $0x14, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x07, Y10, Y3 - VPSRLD $0x19, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPALIGNR $0x04, Y14, Y14, Y14 - VPALIGNR $0x04, Y9, Y9, Y9 - VPALIGNR $0x04, Y10, Y10, Y10 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x0c, Y4, Y4, Y4 - VPALIGNR $0x0c, Y1, Y1, Y1 - VPALIGNR $0x0c, Y2, Y2, Y2 - ADDQ (BX), R10 - ADCQ 8(BX), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(BX), BX - INCQ R9 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x0c, Y10, Y3 - VPSRLD $0x14, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x07, Y10, Y3 - VPSRLD $0x19, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPALIGNR $0x0c, Y14, Y14, Y14 - VPALIGNR $0x0c, Y9, Y9, Y9 - VPALIGNR $0x0c, Y10, Y10, Y10 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x04, Y4, Y4, Y4 - VPALIGNR $0x04, Y1, Y1, Y1 - VPALIGNR $0x04, Y2, Y2, Y2 - CMPQ R9, CX - JB openAVX2Tail384LoopB - CMPQ R9, $0x0a - JNE openAVX2Tail384LoopA - MOVQ BX, R9 - SUBQ SI, BX - MOVQ BX, CX - MOVQ 224(BP), BX + // Final reduce + MOVQ R10, R13 + MOVQ R11, R14 + MOVQ R12, R15 + SUBQ $-5, R10 + SBBQ $-1, R11 + SBBQ $0x03, R12 + CMOVQCS R13, R10 + CMOVQCS R14, R11 + CMOVQCS R15, R12 + + // Add in the "s" part of the key + ADDQ 16(BP), R10 + ADCQ 24(BP), R11 + + // Finally, constant time compare to the tag at the end of the message + XORQ AX, AX + MOVQ $0x00000001, DX + XORQ (SI), R10 + XORQ 8(SI), R11 + ORQ R11, R10 + CMOVQEQ DX, AX + + // Return true iff tags are equal + MOVB AX, ret+96(FP) + RET + +openSSETail16: + TESTQ BX, BX + JE openSSEFinalize + + // We can safely load the CT from the end, because it is padded with the MAC + MOVQ BX, R9 + SHLQ $0x04, R9 + LEAQ ·andMask<>+0(SB), R13 + MOVOU (SI), X12 + ADDQ BX, SI + PAND -16(R13)(R9*1), X12 + MOVO X12, 64(BP) + MOVQ X12, R13 + MOVQ 72(BP), R14 + PXOR X1, X12 + + // We can only store one byte at a time, since plaintext can be shorter than 16 bytes +openSSETail16Store: + MOVQ X12, R8 + MOVB R8, (DI) + PSRLDQ $0x01, X12 + INCQ DI + DECQ BX + JNE openSSETail16Store + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + JMP openSSEFinalize -openAVX2Tail384Hash: - ADDQ $0x10, CX - CMPQ CX, BX - JGT openAVX2Tail384HashEnd - ADDQ (R9), R10 - ADCQ 8(R9), R11 +openAVX2192: + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VMOVDQA Y4, Y2 + VMOVDQA Y1, Y15 + MOVQ $0x0000000a, R9 + +openAVX2192InnerCipherLoop: + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + DECQ R9 + JNE openAVX2192InnerCipherLoop + VPADDD Y6, Y0, Y0 + VPADDD Y6, Y5, Y5 + VPADDD Y10, Y14, Y14 + VPADDD Y10, Y9, Y9 + VPADDD Y8, Y12, Y12 + VPADDD Y8, Y13, Y13 + VPADDD Y2, Y4, Y4 + VPADDD Y15, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y3 + + // Clamp and store poly key + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) + + // Stream for up to 192 bytes + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 + +openAVX2ShortOpen: + // Hash + MOVQ ad_len+80(FP), R9 + CALL polyHashADInternal<>(SB) + +openAVX2ShortOpenLoop: + CMPQ BX, $0x20 + JB openAVX2ShortTail32 + SUBQ $0x20, BX + + // Load for hashing + ADDQ (SI), R10 + ADCQ 8(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ 16(SI), R10 + ADCQ 24(SI), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 @@ -4275,83 +1084,34 @@ openAVX2Tail384Hash: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - LEAQ 16(R9), R9 - JMP openAVX2Tail384Hash -openAVX2Tail384HashEnd: - VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 - VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 - VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 - VPADDD 32(BP), Y14, Y14 - VPADDD 32(BP), Y9, Y9 - VPADDD 32(BP), Y10, Y10 - VPADDD 64(BP), Y12, Y12 - VPADDD 64(BP), Y13, Y13 - VPADDD 64(BP), Y8, Y8 - VPADDD 96(BP), Y4, Y4 - VPADDD 128(BP), Y1, Y1 - VPADDD 160(BP), Y2, Y2 - VPERM2I128 $0x02, Y0, Y14, Y3 - VPERM2I128 $0x02, Y12, Y4, Y7 - VPERM2I128 $0x13, Y0, Y14, Y11 - VPERM2I128 $0x13, Y12, Y4, Y15 - VPXOR (SI), Y3, Y3 - VPXOR 32(SI), Y7, Y7 - VPXOR 64(SI), Y11, Y11 - VPXOR 96(SI), Y15, Y15 - VMOVDQU Y3, (DI) - VMOVDQU Y7, 32(DI) - VMOVDQU Y11, 64(DI) - VMOVDQU Y15, 96(DI) - VPERM2I128 $0x02, Y5, Y9, Y3 - VPERM2I128 $0x02, Y13, Y1, Y7 - VPERM2I128 $0x13, Y5, Y9, Y11 - VPERM2I128 $0x13, Y13, Y1, Y15 - VPXOR 128(SI), Y3, Y3 - VPXOR 160(SI), Y7, Y7 - VPXOR 192(SI), Y11, Y11 - VPXOR 224(SI), Y15, Y15 - VMOVDQU Y3, 128(DI) - VMOVDQU Y7, 160(DI) - VMOVDQU Y11, 192(DI) - VMOVDQU Y15, 224(DI) - VPERM2I128 $0x02, Y6, Y10, Y0 - VPERM2I128 $0x02, Y8, Y2, Y14 - VPERM2I128 $0x13, Y6, Y10, Y12 - VPERM2I128 $0x13, Y8, Y2, Y4 - LEAQ 256(SI), SI - LEAQ 256(DI), DI - SUBQ $0x00000100, BX - JMP openAVX2TailLoop + // Load for decryption + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI + LEAQ 32(DI), DI -openAVX2Tail512: - VMOVDQU ·chacha20Constants<>+0(SB), Y0 - VMOVDQA Y0, Y5 - VMOVDQA Y0, Y6 - VMOVDQA Y0, Y7 - VMOVDQA 32(BP), Y14 - VMOVDQA Y14, Y9 - VMOVDQA Y14, Y10 - VMOVDQA Y14, Y11 - VMOVDQA 64(BP), Y12 - VMOVDQA Y12, Y13 - VMOVDQA Y12, Y8 - VMOVDQA Y12, Y15 - VMOVDQA 192(BP), Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 - VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 - VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 - VMOVDQA Y4, 96(BP) - VMOVDQA Y1, 128(BP) - VMOVDQA Y2, 160(BP) - VMOVDQA Y3, 192(BP) - XORQ CX, CX - MOVQ SI, R9 + // Shift stream left + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 + VMOVDQA Y5, Y4 + VMOVDQA Y9, Y5 + VMOVDQA Y13, Y9 + VMOVDQA Y1, Y13 + VMOVDQA Y6, Y1 + VMOVDQA Y10, Y6 + JMP openAVX2ShortOpenLoop + +openAVX2ShortTail32: + CMPQ BX, $0x10 + VMOVDQA X0, X1 + JB openAVX2ShortDone + SUBQ $0x10, BX -openAVX2Tail512LoopB: - ADDQ (R9), R10 - ADCQ 8(R9), R11 + // Load for hashing + ADDQ (SI), R10 + ADCQ 8(SI), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 @@ -4384,252 +1144,202 @@ openAVX2Tail512LoopB: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - LEAQ 16(R9), R9 -openAVX2Tail512LoopA: + // Load for decryption + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, X1 + +openAVX2ShortDone: + VZEROUPPER + JMP openSSETail16 + +openAVX2320: + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y14, Y7 + VMOVDQA Y12, Y11 + VMOVDQA Y4, Y15 + MOVQ $0x0000000a, R9 + +openAVX2320InnerCipherLoop: VPADDD Y14, Y0, Y0 - VPADDD Y9, Y5, Y5 - VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y3, Y3 VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 - VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 - VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - VMOVDQA Y15, 224(BP) - VPSLLD $0x0c, Y14, Y15 + VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x0c, Y9, Y15 - VPSRLD $0x14, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x0c, Y10, Y15 - VPSRLD $0x14, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x0c, Y11, Y15 - VPSRLD $0x14, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 - ADDQ (R9), R10 - ADCQ 8(R9), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 + VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 - VPADDD Y9, Y5, Y5 - VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y3, Y3 VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 - VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 - VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - VMOVDQA Y15, 224(BP) - VPSLLD $0x07, Y14, Y15 + VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x07, Y9, Y15 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 VPSRLD $0x19, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x07, Y10, Y15 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 VPSRLD $0x19, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x07, Y11, Y15 - VPSRLD $0x19, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 + VPXOR Y3, Y10, Y10 VPALIGNR $0x04, Y14, Y14, Y14 VPALIGNR $0x04, Y9, Y9, Y9 VPALIGNR $0x04, Y10, Y10, Y10 - VPALIGNR $0x04, Y11, Y11, Y11 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x08, Y15, Y15, Y15 VPALIGNR $0x0c, Y4, Y4, Y4 VPALIGNR $0x0c, Y1, Y1, Y1 VPALIGNR $0x0c, Y2, Y2, Y2 - VPALIGNR $0x0c, Y3, Y3, Y3 VPADDD Y14, Y0, Y0 - VPADDD Y9, Y5, Y5 - VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y3, Y3 VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 - VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 - VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - ADDQ 16(R9), R10 - ADCQ 24(R9), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 32(R9), R9 - VMOVDQA Y15, 224(BP) - VPSLLD $0x0c, Y14, Y15 + VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x0c, Y9, Y15 - VPSRLD $0x14, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x0c, Y10, Y15 - VPSRLD $0x14, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x0c, Y11, Y15 - VPSRLD $0x14, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 + VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 - VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y3, Y3 - VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 - VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - VMOVDQA Y15, 224(BP) - VPSLLD $0x07, Y14, Y15 - VPSRLD $0x19, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x07, Y9, Y15 - VPSRLD $0x19, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x07, Y10, Y15 + VPSLLD $0x07, Y10, Y3 VPSRLD $0x19, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x07, Y11, Y15 - VPSRLD $0x19, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 + VPXOR Y3, Y10, Y10 VPALIGNR $0x0c, Y14, Y14, Y14 VPALIGNR $0x0c, Y9, Y9, Y9 VPALIGNR $0x0c, Y10, Y10, Y10 - VPALIGNR $0x0c, Y11, Y11, Y11 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x08, Y15, Y15, Y15 VPALIGNR $0x04, Y4, Y4, Y4 VPALIGNR $0x04, Y1, Y1, Y1 VPALIGNR $0x04, Y2, Y2, Y2 - VPALIGNR $0x04, Y3, Y3, Y3 - INCQ CX - CMPQ CX, $0x04 - JLT openAVX2Tail512LoopB - CMPQ CX, $0x0a - JNE openAVX2Tail512LoopA - MOVQ BX, CX - SUBQ $0x00000180, CX - ANDQ $-16, CX + DECQ R9 + JNE openAVX2320InnerCipherLoop + VMOVDQA ·chacha20Constants<>+0(SB), Y3 + VPADDD Y3, Y0, Y0 + VPADDD Y3, Y5, Y5 + VPADDD Y3, Y6, Y6 + VPADDD Y7, Y14, Y14 + VPADDD Y7, Y9, Y9 + VPADDD Y7, Y10, Y10 + VPADDD Y11, Y12, Y12 + VPADDD Y11, Y13, Y13 + VPADDD Y11, Y8, Y8 + VMOVDQA ·avx2IncMask<>+0(SB), Y3 + VPADDD Y15, Y4, Y4 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y1, Y1 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y2, Y2 -openAVX2Tail512HashLoop: - TESTQ CX, CX - JE openAVX2Tail512HashEnd - ADDQ (R9), R10 - ADCQ 8(R9), R11 + // Clamp and store poly key + VPERM2I128 $0x02, Y0, Y14, Y3 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) + + // Stream for up to 320 bytes + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 + VPERM2I128 $0x02, Y6, Y10, Y13 + VPERM2I128 $0x02, Y8, Y2, Y1 + VPERM2I128 $0x13, Y6, Y10, Y6 + VPERM2I128 $0x13, Y8, Y2, Y10 + JMP openAVX2ShortOpen + +openAVX2Tail128: + // Need to decrypt up to 128 bytes - prepare two blocks + VMOVDQA ·chacha20Constants<>+0(SB), Y5 + VMOVDQA 32(BP), Y9 + VMOVDQA 64(BP), Y13 + VMOVDQA 192(BP), Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y1 + VMOVDQA Y1, Y4 + XORQ R9, R9 + MOVQ BX, CX + ANDQ $-16, CX + TESTQ CX, CX + JE openAVX2Tail128LoopB + +openAVX2Tail128LoopA: + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 @@ -4662,884 +1372,137 @@ openAVX2Tail512HashLoop: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - LEAQ 16(R9), R9 - SUBQ $0x10, CX - JMP openAVX2Tail512HashLoop -openAVX2Tail512HashEnd: - VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 +openAVX2Tail128LoopB: + ADDQ $0x10, R9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y1, Y1, Y1 + CMPQ R9, CX + JB openAVX2Tail128LoopA + CMPQ R9, $0xa0 + JNE openAVX2Tail128LoopB VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 - VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 - VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 - VPADDD 32(BP), Y14, Y14 VPADDD 32(BP), Y9, Y9 - VPADDD 32(BP), Y10, Y10 - VPADDD 32(BP), Y11, Y11 - VPADDD 64(BP), Y12, Y12 VPADDD 64(BP), Y13, Y13 - VPADDD 64(BP), Y8, Y8 - VPADDD 64(BP), Y15, Y15 - VPADDD 96(BP), Y4, Y4 - VPADDD 128(BP), Y1, Y1 - VPADDD 160(BP), Y2, Y2 - VPADDD 192(BP), Y3, Y3 - VMOVDQA Y15, 224(BP) - VPERM2I128 $0x02, Y0, Y14, Y15 - VPERM2I128 $0x13, Y0, Y14, Y14 - VPERM2I128 $0x02, Y12, Y4, Y0 - VPERM2I128 $0x13, Y12, Y4, Y12 - VPXOR (SI), Y15, Y15 - VPXOR 32(SI), Y0, Y0 - VPXOR 64(SI), Y14, Y14 - VPXOR 96(SI), Y12, Y12 - VMOVDQU Y15, (DI) - VMOVDQU Y0, 32(DI) - VMOVDQU Y14, 64(DI) - VMOVDQU Y12, 96(DI) - VPERM2I128 $0x02, Y5, Y9, Y0 - VPERM2I128 $0x02, Y13, Y1, Y14 - VPERM2I128 $0x13, Y5, Y9, Y12 - VPERM2I128 $0x13, Y13, Y1, Y4 - VPXOR 128(SI), Y0, Y0 - VPXOR 160(SI), Y14, Y14 - VPXOR 192(SI), Y12, Y12 - VPXOR 224(SI), Y4, Y4 - VMOVDQU Y0, 128(DI) - VMOVDQU Y14, 160(DI) - VMOVDQU Y12, 192(DI) - VMOVDQU Y4, 224(DI) - VPERM2I128 $0x02, Y6, Y10, Y0 - VPERM2I128 $0x02, Y8, Y2, Y14 - VPERM2I128 $0x13, Y6, Y10, Y12 - VPERM2I128 $0x13, Y8, Y2, Y4 - VPXOR 256(SI), Y0, Y0 - VPXOR 288(SI), Y14, Y14 - VPXOR 320(SI), Y12, Y12 - VPXOR 352(SI), Y4, Y4 - VMOVDQU Y0, 256(DI) - VMOVDQU Y14, 288(DI) - VMOVDQU Y12, 320(DI) - VMOVDQU Y4, 352(DI) - VPERM2I128 $0x02, Y7, Y11, Y0 - VPERM2I128 $0x02, 224(BP), Y3, Y14 - VPERM2I128 $0x13, Y7, Y11, Y12 - VPERM2I128 $0x13, 224(BP), Y3, Y4 - LEAQ 384(SI), SI - LEAQ 384(DI), DI - SUBQ $0x00000180, BX - JMP openAVX2TailLoop - -DATA ·chacha20Constants<>+0(SB)/4, $0x61707865 -DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e -DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32 -DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574 -DATA ·chacha20Constants<>+16(SB)/4, $0x61707865 -DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e -DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32 -DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574 -GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32 - -DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff -DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc -DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff -DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff -GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32 - -DATA ·sseIncMask<>+0(SB)/8, $0x0000000000000001 -DATA ·sseIncMask<>+8(SB)/8, $0x0000000000000000 -GLOBL ·sseIncMask<>(SB), RODATA|NOPTR, $16 - -DATA ·andMask<>+0(SB)/8, $0x00000000000000ff -DATA ·andMask<>+8(SB)/8, $0x0000000000000000 -DATA ·andMask<>+16(SB)/8, $0x000000000000ffff -DATA ·andMask<>+24(SB)/8, $0x0000000000000000 -DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff -DATA ·andMask<>+40(SB)/8, $0x0000000000000000 -DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff -DATA ·andMask<>+56(SB)/8, $0x0000000000000000 -DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff -DATA ·andMask<>+72(SB)/8, $0x0000000000000000 -DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff -DATA ·andMask<>+88(SB)/8, $0x0000000000000000 -DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff -DATA ·andMask<>+104(SB)/8, $0x0000000000000000 -DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+120(SB)/8, $0x0000000000000000 -DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+136(SB)/8, $0x00000000000000ff -DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+152(SB)/8, $0x000000000000ffff -DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff -DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff -DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff -DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff -DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff -GLOBL ·andMask<>(SB), RODATA|NOPTR, $240 + VPADDD Y4, Y1, Y1 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 -DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000 -DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000 -DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001 -DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000 -GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32 +openAVX2TailLoop: + CMPQ BX, $0x20 + JB openAVX2Tail + SUBQ $0x20, BX -DATA ·rol16<>+0(SB)/8, $0x0504070601000302 -DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a -DATA ·rol16<>+16(SB)/8, $0x0504070601000302 -DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a -GLOBL ·rol16<>(SB), RODATA|NOPTR, $32 + // Load for decryption + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI + LEAQ 32(DI), DI + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 + JMP openAVX2TailLoop -DATA ·rol8<>+0(SB)/8, $0x0605040702010003 -DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b -DATA ·rol8<>+16(SB)/8, $0x0605040702010003 -DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b -GLOBL ·rol8<>(SB), RODATA|NOPTR, $32 +openAVX2Tail: + CMPQ BX, $0x10 + VMOVDQA X0, X1 + JB openAVX2TailDone + SUBQ $0x10, BX -DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002 -DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000 -DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002 -DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000 -GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32 + // Load for decryption + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, X1 -// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte) -// Requires: AVX, AVX2, BMI2, CMOV, SSE2 -TEXT ·chacha20Poly1305Seal(SB), $288-96 - MOVQ SP, BP - ADDQ $0x20, BP - ANDQ $-32, BP - MOVQ dst_base+0(FP), DI - MOVQ key_base+24(FP), R8 - MOVQ src_base+48(FP), SI - MOVQ src_len+56(FP), BX - MOVQ ad_base+72(FP), CX - CMPB ·useAVX2+0(SB), $0x01 - JE chacha20Poly1305Seal_AVX2 +openAVX2TailDone: + VZEROUPPER + JMP openSSETail16 - // Special optimization, for very short buffers - CMPQ BX, $0x80 - JBE sealSSE128 - - // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration - MOVOU ·chacha20Constants<>+0(SB), X0 - MOVOU 16(R8), X3 - MOVOU 32(R8), X6 - MOVOU 48(R8), X9 - - // Store state on stack for future use - MOVO X3, 32(BP) - MOVO X6, 48(BP) - - // Load state, increment counter blocks - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X1, X2 - MOVO X4, X5 - MOVO X7, X8 - MOVO X10, X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X2, X12 - MOVO X5, X13 - MOVO X8, X14 - MOVO X11, X15 - PADDL ·sseIncMask<>+0(SB), X15 - - // Store counters - MOVO X9, 80(BP) - MOVO X10, 96(BP) - MOVO X11, 112(BP) - MOVO X15, 128(BP) - MOVQ $0x0000000a, R9 - -sealSSEIntroLoop: - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x0c - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x04 - DECQ R9 - JNE sealSSEIntroLoop - - // Add in the state - PADDD ·chacha20Constants<>+0(SB), X0 - PADDD ·chacha20Constants<>+0(SB), X1 - PADDD ·chacha20Constants<>+0(SB), X2 - PADDD ·chacha20Constants<>+0(SB), X12 - PADDD 32(BP), X3 - PADDD 32(BP), X4 - PADDD 32(BP), X5 - PADDD 32(BP), X13 - PADDD 48(BP), X7 - PADDD 48(BP), X8 - PADDD 48(BP), X14 - PADDD 96(BP), X10 - PADDD 112(BP), X11 - PADDD 128(BP), X15 - - // Clamp and store the key - PAND ·polyClampMask<>+0(SB), X0 - MOVO X0, (BP) - MOVO X3, 16(BP) - - // Hash AAD - MOVQ ad_len+80(FP), R9 - CALL polyHashADInternal<>(SB) - MOVOU (SI), X0 - MOVOU 16(SI), X3 - MOVOU 32(SI), X6 - MOVOU 48(SI), X9 - PXOR X0, X1 - PXOR X3, X4 - PXOR X6, X7 - PXOR X9, X10 - MOVOU X1, (DI) - MOVOU X4, 16(DI) - MOVOU X7, 32(DI) - MOVOU X10, 48(DI) - MOVOU 64(SI), X0 - MOVOU 80(SI), X3 - MOVOU 96(SI), X6 - MOVOU 112(SI), X9 - PXOR X0, X2 - PXOR X3, X5 - PXOR X6, X8 - PXOR X9, X11 - MOVOU X2, 64(DI) - MOVOU X5, 80(DI) - MOVOU X8, 96(DI) - MOVOU X11, 112(DI) - MOVQ $0x00000080, CX - SUBQ $0x80, BX - LEAQ 128(SI), SI - MOVO X12, X1 - MOVO X13, X4 - MOVO X14, X7 - MOVO X15, X10 - CMPQ BX, $0x40 - JBE sealSSE128SealHash - MOVOU (SI), X0 - MOVOU 16(SI), X3 - MOVOU 32(SI), X6 - MOVOU 48(SI), X9 - PXOR X0, X12 - PXOR X3, X13 - PXOR X6, X14 - PXOR X9, X15 - MOVOU X12, 128(DI) - MOVOU X13, 144(DI) - MOVOU X14, 160(DI) - MOVOU X15, 176(DI) - ADDQ $0x40, CX - SUBQ $0x40, BX - LEAQ 64(SI), SI - MOVQ $0x00000002, CX - MOVQ $0x00000008, R9 - CMPQ BX, $0x40 - JBE sealSSETail64 - CMPQ BX, $0x80 - JBE sealSSETail128 - CMPQ BX, $0xc0 - JBE sealSSETail192 - -sealSSEMainLoop: - // Load state, increment counter blocks - MOVO ·chacha20Constants<>+0(SB), X0 - MOVO 32(BP), X3 - MOVO 48(BP), X6 - MOVO 128(BP), X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X1, X2 - MOVO X4, X5 - MOVO X7, X8 - MOVO X10, X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X2, X12 - MOVO X5, X13 - MOVO X8, X14 - MOVO X11, X15 - PADDL ·sseIncMask<>+0(SB), X15 - - // Store counters - MOVO X9, 80(BP) - MOVO X10, 96(BP) - MOVO X11, 112(BP) - MOVO X15, 128(BP) - -sealSSEInnerLoop: - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 - ADDQ (DI), R10 - ADCQ 8(DI), R11 +openAVX2Tail256: + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y4, Y7 + VMOVDQA Y1, Y11 + + // Compute the number of iterations that will hash data + MOVQ BX, 224(BP) + MOVQ BX, CX + SUBQ $0x80, CX + SHRQ $0x04, CX + MOVQ $0x0000000a, R9 + CMPQ CX, $0x0a + CMOVQGT R9, CX + MOVQ SI, BX + XORQ R9, R9 + +openAVX2Tail256LoopA: + ADDQ (BX), R10 + ADCQ 8(BX), R11 ADCQ $0x01, R12 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x0c - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 IMULQ R12, R15 + MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX ADDQ AX, R15 - ADCQ $0x00, DX - LEAQ 16(DI), DI - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - IMULQ R12, R8 - ADDQ R10, R15 ADCQ DX, R8 - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 @@ -5555,112 +1518,117 @@ sealSSEInnerLoop: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x04 - DECQ R9 - JGE sealSSEInnerLoop - ADDQ (DI), R10 - ADCQ 8(DI), R11 + LEAQ 16(BX), BX + +openAVX2Tail256LoopB: + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + INCQ R9 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + CMPQ R9, CX + JB openAVX2Tail256LoopA + CMPQ R9, $0x0a + JNE openAVX2Tail256LoopB + MOVQ BX, R9 + SUBQ SI, BX + MOVQ BX, CX + MOVQ 224(BP), BX + +openAVX2Tail256Hash: + ADDQ $0x10, CX + CMPQ CX, BX + JGT openAVX2Tail256HashEnd + ADDQ (R9), R10 + ADCQ 8(R9), R11 ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 IMULQ R12, R15 + MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 @@ -5672,143 +1640,94 @@ sealSSEInnerLoop: SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(DI), DI - DECQ CX - JG sealSSEInnerLoop - - // Add in the state - PADDD ·chacha20Constants<>+0(SB), X0 - PADDD ·chacha20Constants<>+0(SB), X1 - PADDD ·chacha20Constants<>+0(SB), X2 - PADDD ·chacha20Constants<>+0(SB), X12 - PADDD 32(BP), X3 - PADDD 32(BP), X4 - PADDD 32(BP), X5 - PADDD 32(BP), X13 - PADDD 48(BP), X6 - PADDD 48(BP), X7 - PADDD 48(BP), X8 - PADDD 48(BP), X14 - PADDD 80(BP), X9 - PADDD 96(BP), X10 - PADDD 112(BP), X11 - PADDD 128(BP), X15 - MOVO X15, 64(BP) - - // Load - xor - store - MOVOU (SI), X15 - PXOR X15, X0 - MOVOU 16(SI), X15 - PXOR X15, X3 - MOVOU 32(SI), X15 - PXOR X15, X6 - MOVOU 48(SI), X15 - PXOR X15, X9 - MOVOU X0, (DI) - MOVOU X3, 16(DI) - MOVOU X6, 32(DI) - MOVOU X9, 48(DI) - MOVO 64(BP), X15 - MOVOU 64(SI), X0 - MOVOU 80(SI), X3 - MOVOU 96(SI), X6 - MOVOU 112(SI), X9 - PXOR X0, X1 - PXOR X3, X4 - PXOR X6, X7 - PXOR X9, X10 - MOVOU X1, 64(DI) - MOVOU X4, 80(DI) - MOVOU X7, 96(DI) - MOVOU X10, 112(DI) - MOVOU 128(SI), X0 - MOVOU 144(SI), X3 - MOVOU 160(SI), X6 - MOVOU 176(SI), X9 - PXOR X0, X2 - PXOR X3, X5 - PXOR X6, X8 - PXOR X9, X11 - MOVOU X2, 128(DI) - MOVOU X5, 144(DI) - MOVOU X8, 160(DI) - MOVOU X11, 176(DI) - ADDQ $0xc0, SI - MOVQ $0x000000c0, CX - SUBQ $0xc0, BX - MOVO X12, X1 - MOVO X13, X4 - MOVO X14, X7 - MOVO X15, X10 - CMPQ BX, $0x40 - JBE sealSSE128SealHash - MOVOU (SI), X0 - MOVOU 16(SI), X3 - MOVOU 32(SI), X6 - MOVOU 48(SI), X9 - PXOR X0, X12 - PXOR X3, X13 - PXOR X6, X14 - PXOR X9, X15 - MOVOU X12, 192(DI) - MOVOU X13, 208(DI) - MOVOU X14, 224(DI) - MOVOU X15, 240(DI) - LEAQ 64(SI), SI - SUBQ $0x40, BX - MOVQ $0x00000006, CX - MOVQ $0x00000004, R9 - CMPQ BX, $0xc0 - JG sealSSEMainLoop - MOVQ BX, CX - TESTQ BX, BX - JE sealSSE128SealHash - MOVQ $0x00000006, CX - CMPQ BX, $0x40 - JBE sealSSETail64 - CMPQ BX, $0x80 - JBE sealSSETail128 - JMP sealSSETail192 - -sealSSETail64: - MOVO ·chacha20Constants<>+0(SB), X1 - MOVO 32(BP), X4 - MOVO 48(BP), X7 - MOVO 128(BP), X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X10, 80(BP) - -sealSSETail64LoopA: - ADDQ (DI), R10 - ADCQ 8(DI), R11 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + JMP openAVX2Tail256Hash + +openAVX2Tail256HashEnd: + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD Y7, Y4, Y4 + VPADDD Y11, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y6 + VPERM2I128 $0x02, Y12, Y4, Y10 + VPERM2I128 $0x13, Y0, Y14, Y8 + VPERM2I128 $0x13, Y12, Y4, Y2 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR (SI), Y6, Y6 + VPXOR 32(SI), Y10, Y10 + VPXOR 64(SI), Y8, Y8 + VPXOR 96(SI), Y2, Y2 + VMOVDQU Y6, (DI) + VMOVDQU Y10, 32(DI) + VMOVDQU Y8, 64(DI) + VMOVDQU Y2, 96(DI) + LEAQ 128(SI), SI + LEAQ 128(DI), DI + SUBQ $0x80, BX + JMP openAVX2TailLoop + +openAVX2Tail384: + // Need to decrypt up to 384 bytes - prepare six blocks + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + + // Compute the number of iterations that will hash two blocks of data + MOVQ BX, 224(BP) + MOVQ BX, CX + SUBQ $0x00000100, CX + SHRQ $0x04, CX + ADDQ $0x06, CX + MOVQ $0x0000000a, R9 + CMPQ CX, $0x0a + CMOVQGT R9, CX + MOVQ SI, BX + XORQ R9, R9 + +openAVX2Tail384LoopB: + ADDQ (BX), R10 + ADCQ 8(BX), R11 ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 IMULQ R12, R15 + MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 @@ -5825,175 +1744,190 @@ sealSSETail64LoopA: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - LEAQ 16(DI), DI + LEAQ 16(BX), BX + +openAVX2Tail384LoopA: + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + ADDQ (BX), R10 + ADCQ 8(BX), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(BX), BX + INCQ R9 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + CMPQ R9, CX + JB openAVX2Tail384LoopB + CMPQ R9, $0x0a + JNE openAVX2Tail384LoopA + MOVQ BX, R9 + SUBQ SI, BX + MOVQ BX, CX + MOVQ 224(BP), BX -sealSSETail64LoopB: - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X13) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X13 - PSLLL $0x0c, X13 - PSRLL $0x14, X4 - PXOR X13, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X13) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X13 - PSLLL $0x07, X13 - PSRLL $0x19, X4 - PXOR X13, X4 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X13) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X13 - PSLLL $0x0c, X13 - PSRLL $0x14, X4 - PXOR X13, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X13) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X13 - PSLLL $0x07, X13 - PSRLL $0x19, X4 - PXOR X13, X4 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - ADDQ (DI), R10 - ADCQ 8(DI), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(DI), DI - DECQ CX - JG sealSSETail64LoopA - DECQ R9 - JGE sealSSETail64LoopB - PADDL ·chacha20Constants<>+0(SB), X1 - PADDL 32(BP), X4 - PADDL 48(BP), X7 - PADDL 80(BP), X10 - JMP sealSSE128Seal - -sealSSETail128: - MOVO ·chacha20Constants<>+0(SB), X0 - MOVO 32(BP), X3 - MOVO 48(BP), X6 - MOVO 128(BP), X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X9, 80(BP) - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X10, 96(BP) - -sealSSETail128LoopA: - ADDQ (DI), R10 - ADCQ 8(DI), R11 +openAVX2Tail384Hash: + ADDQ $0x10, CX + CMPQ CX, BX + JGT openAVX2Tail384HashEnd + ADDQ (R9), R10 + ADCQ 8(R9), R11 ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 IMULQ R12, R15 + MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 @@ -6010,430 +1944,99 @@ sealSSETail128LoopA: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - LEAQ 16(DI), DI + LEAQ 16(R9), R9 + JMP openAVX2Tail384Hash -sealSSETail128LoopB: - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - ADDQ (DI), R10 - ADCQ 8(DI), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(DI), DI - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - DECQ CX - JG sealSSETail128LoopA - DECQ R9 - JGE sealSSETail128LoopB - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL ·chacha20Constants<>+0(SB), X1 - PADDL 32(BP), X3 - PADDL 32(BP), X4 - PADDL 48(BP), X6 - PADDL 48(BP), X7 - PADDL 80(BP), X9 - PADDL 96(BP), X10 - MOVOU (SI), X12 - MOVOU 16(SI), X13 - MOVOU 32(SI), X14 - MOVOU 48(SI), X15 - PXOR X12, X0 - PXOR X13, X3 - PXOR X14, X6 - PXOR X15, X9 - MOVOU X0, (DI) - MOVOU X3, 16(DI) - MOVOU X6, 32(DI) - MOVOU X9, 48(DI) - MOVQ $0x00000040, CX - LEAQ 64(SI), SI - SUBQ $0x40, BX - JMP sealSSE128SealHash - -sealSSETail192: - MOVO ·chacha20Constants<>+0(SB), X0 - MOVO 32(BP), X3 - MOVO 48(BP), X6 - MOVO 128(BP), X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X9, 80(BP) - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X10, 96(BP) - MOVO X1, X2 - MOVO X4, X5 - MOVO X7, X8 - MOVO X10, X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X11, 112(BP) - -sealSSETail192LoopA: - ADDQ (DI), R10 - ADCQ 8(DI), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(DI), DI +openAVX2Tail384HashEnd: + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPERM2I128 $0x02, Y12, Y4, Y7 + VPERM2I128 $0x13, Y0, Y14, Y11 + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR (SI), Y3, Y3 + VPXOR 32(SI), Y7, Y7 + VPXOR 64(SI), Y11, Y11 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y3, (DI) + VMOVDQU Y7, 32(DI) + VMOVDQU Y11, 64(DI) + VMOVDQU Y15, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y3 + VPERM2I128 $0x02, Y13, Y1, Y7 + VPERM2I128 $0x13, Y5, Y9, Y11 + VPERM2I128 $0x13, Y13, Y1, Y15 + VPXOR 128(SI), Y3, Y3 + VPXOR 160(SI), Y7, Y7 + VPXOR 192(SI), Y11, Y11 + VPXOR 224(SI), Y15, Y15 + VMOVDQU Y3, 128(DI) + VMOVDQU Y7, 160(DI) + VMOVDQU Y11, 192(DI) + VMOVDQU Y15, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + LEAQ 256(SI), SI + LEAQ 256(DI), DI + SUBQ $0x00000100, BX + JMP openAVX2TailLoop -sealSSETail192LoopB: - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - ADDQ (DI), R10 - ADCQ 8(DI), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 +openAVX2Tail512: + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + XORQ CX, CX + MOVQ SI, R9 + +openAVX2Tail512LoopB: + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 IMULQ R12, R15 + MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 @@ -6450,465 +2053,268 @@ sealSSETail192LoopB: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - LEAQ 16(DI), DI - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - DECQ CX - JG sealSSETail192LoopA - DECQ R9 - JGE sealSSETail192LoopB - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL ·chacha20Constants<>+0(SB), X1 - PADDL ·chacha20Constants<>+0(SB), X2 - PADDL 32(BP), X3 - PADDL 32(BP), X4 - PADDL 32(BP), X5 - PADDL 48(BP), X6 - PADDL 48(BP), X7 - PADDL 48(BP), X8 - PADDL 80(BP), X9 - PADDL 96(BP), X10 - PADDL 112(BP), X11 - MOVOU (SI), X12 - MOVOU 16(SI), X13 - MOVOU 32(SI), X14 - MOVOU 48(SI), X15 - PXOR X12, X0 - PXOR X13, X3 - PXOR X14, X6 - PXOR X15, X9 - MOVOU X0, (DI) - MOVOU X3, 16(DI) - MOVOU X6, 32(DI) - MOVOU X9, 48(DI) - MOVOU 64(SI), X12 - MOVOU 80(SI), X13 - MOVOU 96(SI), X14 - MOVOU 112(SI), X15 - PXOR X12, X1 - PXOR X13, X4 - PXOR X14, X7 - PXOR X15, X10 - MOVOU X1, 64(DI) - MOVOU X4, 80(DI) - MOVOU X7, 96(DI) - MOVOU X10, 112(DI) - MOVO X2, X1 - MOVO X5, X4 - MOVO X8, X7 - MOVO X11, X10 - MOVQ $0x00000080, CX - LEAQ 128(SI), SI - SUBQ $0x80, BX - JMP sealSSE128SealHash - -sealSSE128: - MOVOU ·chacha20Constants<>+0(SB), X0 - MOVOU 16(R8), X3 - MOVOU 32(R8), X6 - MOVOU 48(R8), X9 - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X1, X2 - MOVO X4, X5 - MOVO X7, X8 - MOVO X10, X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X3, X13 - MOVO X6, X14 - MOVO X10, X15 - MOVQ $0x0000000a, R9 - -sealSSE128InnerCipherLoop: - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - DECQ R9 - JNE sealSSE128InnerCipherLoop - - // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL ·chacha20Constants<>+0(SB), X1 - PADDL ·chacha20Constants<>+0(SB), X2 - PADDL X13, X3 - PADDL X13, X4 - PADDL X13, X5 - PADDL X14, X7 - PADDL X14, X8 - PADDL X15, X10 - PADDL ·sseIncMask<>+0(SB), X15 - PADDL X15, X11 - PAND ·polyClampMask<>+0(SB), X0 - MOVOU X0, (BP) - MOVOU X3, 16(BP) + LEAQ 16(R9), R9 - // Hash - MOVQ ad_len+80(FP), R9 - CALL polyHashADInternal<>(SB) - XORQ CX, CX +openAVX2Tail512LoopA: + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 16(R9), R10 + ADCQ 24(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(R9), R9 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + INCQ CX + CMPQ CX, $0x04 + JLT openAVX2Tail512LoopB + CMPQ CX, $0x0a + JNE openAVX2Tail512LoopA + MOVQ BX, CX + SUBQ $0x00000180, CX + ANDQ $-16, CX -sealSSE128SealHash: - CMPQ CX, $0x10 - JB sealSSE128Seal - ADDQ (DI), R10 - ADCQ 8(DI), R11 +openAVX2Tail512HashLoop: + TESTQ CX, CX + JE openAVX2Tail512HashEnd + ADDQ (R9), R10 + ADCQ 8(R9), R11 ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 IMULQ R12, R15 + MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 @@ -6925,238 +2331,162 @@ sealSSE128SealHash: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 + LEAQ 16(R9), R9 SUBQ $0x10, CX - ADDQ $0x10, DI - JMP sealSSE128SealHash - -sealSSE128Seal: - CMPQ BX, $0x10 - JB sealSSETail - SUBQ $0x10, BX - - // Load for decryption - MOVOU (SI), X12 - PXOR X12, X1 - MOVOU X1, (DI) - LEAQ 16(SI), SI - LEAQ 16(DI), DI - - // Extract for hashing - MOVQ X1, R13 - PSRLDQ $0x08, X1 - MOVQ X1, R14 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - - // Shift the stream "left" - MOVO X4, X1 - MOVO X7, X4 - MOVO X10, X7 - MOVO X2, X10 - MOVO X5, X2 - MOVO X8, X5 - MOVO X11, X8 - JMP sealSSE128Seal + JMP openAVX2Tail512HashLoop -sealSSETail: - TESTQ BX, BX - JE sealSSEFinalize +openAVX2Tail512HashEnd: + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 64(DI) + VMOVDQU Y12, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + LEAQ 384(SI), SI + LEAQ 384(DI), DI + SUBQ $0x00000180, BX + JMP openAVX2TailLoop - // We can only load the PT one byte at a time to avoid read after end of buffer - MOVQ BX, R9 - SHLQ $0x04, R9 - LEAQ ·andMask<>+0(SB), R13 - MOVQ BX, CX - LEAQ -1(SI)(BX*1), SI - XORQ R15, R15 - XORQ R8, R8 - XORQ AX, AX +DATA ·chacha20Constants<>+0(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574 +DATA ·chacha20Constants<>+16(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574 +GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32 -sealSSETailLoadLoop: - SHLQ $0x08, R15, R8 - SHLQ $0x08, R15 - MOVB (SI), AX - XORQ AX, R15 - LEAQ -1(SI), SI - DECQ CX - JNE sealSSETailLoadLoop - MOVQ R15, 64(BP) - MOVQ R8, 72(BP) - PXOR 64(BP), X1 - MOVOU X1, (DI) - MOVOU -16(R13)(R9*1), X12 - PAND X12, X1 - MOVQ X1, R13 - PSRLDQ $0x08, X1 - MOVQ X1, R14 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - ADDQ BX, DI +DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000 +DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000 +DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001 +DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000 +GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32 -sealSSEFinalize: - // Hash in the buffer lengths - ADDQ ad_len+80(FP), R10 - ADCQ src_len+56(FP), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 +DATA ·rol16<>+0(SB)/8, $0x0504070601000302 +DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a +DATA ·rol16<>+16(SB)/8, $0x0504070601000302 +DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a +GLOBL ·rol16<>(SB), RODATA|NOPTR, $32 - // Final reduce - MOVQ R10, R13 - MOVQ R11, R14 - MOVQ R12, R15 - SUBQ $-5, R10 - SBBQ $-1, R11 - SBBQ $0x03, R12 - CMOVQCS R13, R10 - CMOVQCS R14, R11 - CMOVQCS R15, R12 +DATA ·rol8<>+0(SB)/8, $0x0605040702010003 +DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b +DATA ·rol8<>+16(SB)/8, $0x0605040702010003 +DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b +GLOBL ·rol8<>(SB), RODATA|NOPTR, $32 - // Add in the "s" part of the key - ADDQ 16(BP), R10 - ADCQ 24(BP), R11 +DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff +DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc +DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff +DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff +GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32 - // Finally store the tag at the end of the message - MOVQ R10, (DI) - MOVQ R11, 8(DI) - RET +DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002 +DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000 +DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002 +DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000 +GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32 + +DATA ·andMask<>+0(SB)/8, $0x00000000000000ff +DATA ·andMask<>+8(SB)/8, $0x0000000000000000 +DATA ·andMask<>+16(SB)/8, $0x000000000000ffff +DATA ·andMask<>+24(SB)/8, $0x0000000000000000 +DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+40(SB)/8, $0x0000000000000000 +DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+56(SB)/8, $0x0000000000000000 +DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+72(SB)/8, $0x0000000000000000 +DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+88(SB)/8, $0x0000000000000000 +DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff +DATA ·andMask<>+104(SB)/8, $0x0000000000000000 +DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+120(SB)/8, $0x0000000000000000 +DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+136(SB)/8, $0x00000000000000ff +DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+152(SB)/8, $0x000000000000ffff +DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff +GLOBL ·andMask<>(SB), RODATA|NOPTR, $240 -chacha20Poly1305Seal_AVX2: +// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte) +// Requires: AVX, AVX2, BMI2, CMOV, SSE2 +TEXT ·chacha20Poly1305Seal(SB), $288-96 + MOVQ SP, BP + ADDQ $0x20, BP + ANDQ $-32, BP + MOVQ dst_base+0(FP), DI + MOVQ key_base+24(FP), R8 + MOVQ src_base+48(FP), SI + MOVQ src_len+56(FP), BX + MOVQ ad_base+72(FP), CX VZEROUPPER - VMOVDQU ·chacha20Constants<>+0(SB), Y0 - BYTE $0xc4 - BYTE $0x42 - BYTE $0x7d - BYTE $0x5a - BYTE $0x70 - BYTE $0x10 - BYTE $0xc4 - BYTE $0x42 - BYTE $0x7d - BYTE $0x5a - BYTE $0x60 - BYTE $0x20 - BYTE $0xc4 - BYTE $0xc2 - BYTE $0x7d - BYTE $0x5a - BYTE $0x60 - BYTE $0x30 - VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VBROADCASTI128 16(R8), Y14 + VBROADCASTI128 32(R8), Y12 + VBROADCASTI128 48(R8), Y4 + VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 // Special optimizations, for very short buffers CMPQ BX, $0x000000c0 @@ -8170,6 +3500,144 @@ sealAVX2InternalLoopStart: JBE sealAVX2Tail384 JMP sealAVX2Tail512 +sealSSETail: + TESTQ BX, BX + JE sealSSEFinalize + + // We can only load the PT one byte at a time to avoid read after end of buffer + MOVQ BX, R9 + SHLQ $0x04, R9 + LEAQ ·andMask<>+0(SB), R13 + MOVQ BX, CX + LEAQ -1(SI)(BX*1), SI + XORQ R15, R15 + XORQ R8, R8 + XORQ AX, AX + +sealSSETailLoadLoop: + SHLQ $0x08, R15, R8 + SHLQ $0x08, R15 + MOVB (SI), AX + XORQ AX, R15 + LEAQ -1(SI), SI + DECQ CX + JNE sealSSETailLoadLoop + MOVQ R15, 64(BP) + MOVQ R8, 72(BP) + PXOR 64(BP), X1 + MOVOU X1, (DI) + MOVOU -16(R13)(R9*1), X12 + PAND X12, X1 + MOVQ X1, R13 + PSRLDQ $0x08, X1 + MOVQ X1, R14 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ BX, DI + +sealSSEFinalize: + // Hash in the buffer lengths + ADDQ ad_len+80(FP), R10 + ADCQ src_len+56(FP), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + + // Final reduce + MOVQ R10, R13 + MOVQ R11, R14 + MOVQ R12, R15 + SUBQ $-5, R10 + SBBQ $-1, R11 + SBBQ $0x03, R12 + CMOVQCS R13, R10 + CMOVQCS R14, R11 + CMOVQCS R15, R12 + + // Add in the "s" part of the key + ADDQ 16(BP), R10 + ADCQ 24(BP), R11 + + // Finally store the tag at the end of the message + MOVQ R10, (DI) + MOVQ R11, 8(DI) + RET + seal192AVX2: VMOVDQA Y0, Y5 VMOVDQA Y14, Y9 diff --git a/vendor/golang.org/x/crypto/pkcs12/crypto.go b/vendor/golang.org/x/crypto/pkcs12/crypto.go index 212538cb5a..3f307323cc 100644 --- a/vendor/golang.org/x/crypto/pkcs12/crypto.go +++ b/vendor/golang.org/x/crypto/pkcs12/crypto.go @@ -80,6 +80,10 @@ func pbDecrypterFor(algorithm pkix.AlgorithmIdentifier, password []byte) (cipher return nil, 0, err } + if params.Iterations < 0 || params.Iterations > maxIterations { + return nil, 0, NotImplementedError("iteration count is invalid or too high") + } + key := cipherType.deriveKey(params.Salt, password, params.Iterations) iv := cipherType.deriveIV(params.Salt, password, params.Iterations) diff --git a/vendor/golang.org/x/crypto/pkcs12/mac.go b/vendor/golang.org/x/crypto/pkcs12/mac.go index 5f38aa7de8..84039cee1e 100644 --- a/vendor/golang.org/x/crypto/pkcs12/mac.go +++ b/vendor/golang.org/x/crypto/pkcs12/mac.go @@ -27,11 +27,19 @@ var ( oidSHA1 = asn1.ObjectIdentifier([]int{1, 3, 14, 3, 2, 26}) ) +// maxIterations is a safety limit to prevent CPU exhaustion from +// crafted PKCS#12 files with unreasonable iteration counts. +const maxIterations = 1 << 20 // ~1 million + func verifyMac(macData *macData, message, password []byte) error { if !macData.Mac.Algorithm.Algorithm.Equal(oidSHA1) { return NotImplementedError("unknown digest algorithm: " + macData.Mac.Algorithm.Algorithm.String()) } + if macData.Iterations < 0 || macData.Iterations > maxIterations { + return NotImplementedError("iteration count is invalid or too high") + } + key := pbkdf(sha1Sum, 20, 64, macData.MacSalt, password, macData.Iterations, 3, 20) mac := hmac.New(sha1.New, key) diff --git a/vendor/golang.org/x/net/html/entity.go b/vendor/golang.org/x/net/html/entity.go index b628880a01..4e8d5d55f2 100644 --- a/vendor/golang.org/x/net/html/entity.go +++ b/vendor/golang.org/x/net/html/entity.go @@ -2156,9 +2156,8 @@ var entity = map[string]rune{ // HTML entities that are two unicode codepoints. var entity2 = map[string][2]rune{ - // TODO(nigeltao): Handle replacements that are wider than their names. - // "nLt;": {'\u226A', '\u20D2'}, - // "nGt;": {'\u226B', '\u20D2'}, + "nLt;": {'\u226A', '\u20D2'}, + "nGt;": {'\u226B', '\u20D2'}, "NotEqualTilde;": {'\u2242', '\u0338'}, "NotGreaterFullEqual;": {'\u2267', '\u0338'}, "NotGreaterGreater;": {'\u226B', '\u0338'}, diff --git a/vendor/golang.org/x/net/html/escape.go b/vendor/golang.org/x/net/html/escape.go index 12f2273706..df3edc5b12 100644 --- a/vendor/golang.org/x/net/html/escape.go +++ b/vendor/golang.org/x/net/html/escape.go @@ -6,6 +6,7 @@ package html import ( "bytes" + "slices" "strings" "unicode/utf8" ) @@ -50,25 +51,24 @@ var replacementTable = [...]rune{ // 0x0D->'\u000D' is a no-op. } -// unescapeEntity reads an entity like "<" from b[src:] and writes the -// corresponding "<" to b[dst:], returning the incremented dst and src cursors. -// Precondition: b[src] == '&' && dst <= src. -// attribute should be true if parsing an attribute value. -func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) { +// unescapeEntity attempts to consume a character reference from s[src:], +// returning the rune, potential second rune, and number of bytes consumed +// (which indicates the length of the character reference). It is assumed that +// the first byte of s is '&'. attribute should be true if parsing an attribute +// value. +func unescapeEntity(s []byte, attribute bool) (rune, rune, int) { // https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference // i starts at 1 because we already know that s[0] == '&'. - i, s := 1, b[src:] + i := 1 if len(s) <= 1 { - b[dst] = b[src] - return dst + 1, src + 1 + return '&', 0, 1 } if s[i] == '#' { - if len(s) <= 3 { // We need to have at least "&#.". - b[dst] = b[src] - return dst + 1, src + 1 + if len(s) <= 2 { // We need to have at least "&#". + return '&', 0, 1 } i++ c := s[i] @@ -78,34 +78,43 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) { i++ } + i0 := i x := '\x00' for i < len(s) { c = s[i] - i++ + var d rune + var mult rune if hex { + mult = 16 if '0' <= c && c <= '9' { - x = 16*x + rune(c) - '0' - continue + d = rune(c) - '0' } else if 'a' <= c && c <= 'f' { - x = 16*x + rune(c) - 'a' + 10 - continue + d = rune(c) - 'a' + 10 } else if 'A' <= c && c <= 'F' { - x = 16*x + rune(c) - 'A' + 10 - continue + d = rune(c) - 'A' + 10 + } else { + break + } + } else { + mult = 10 + if '0' <= c && c <= '9' { + d = rune(c) - '0' + } else { + break } - } else if '0' <= c && c <= '9' { - x = 10*x + rune(c) - '0' - continue } - if c != ';' { - i-- + if x <= 0x10FFFF { + x = mult*x + d } - break + i++ + } + + if i == i0 { // No characters matched. + return '&', 0, 1 } - if i <= 3 { // No characters matched. - b[dst] = b[src] - return dst + 1, src + 1 + if i < len(s) && s[i] == ';' { + i++ } if 0x80 <= x && x <= 0x9F { @@ -116,7 +125,7 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) { x = '\uFFFD' } - return dst + utf8.EncodeRune(b[dst:], x), src + i + return x, 0, i } // Consume the maximum number of characters possible, with the @@ -141,10 +150,9 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) { } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' { // No-op. } else if x := entity[entityName]; x != 0 { - return dst + utf8.EncodeRune(b[dst:], x), src + i + return x, 0, i } else if x := entity2[entityName]; x[0] != 0 { - dst1 := dst + utf8.EncodeRune(b[dst:], x[0]) - return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i + return x[0], x[1], i } else if !attribute { maxLen := len(entityName) - 1 if maxLen > longestEntityWithoutSemicolon { @@ -152,35 +160,67 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) { } for j := maxLen; j > 1; j-- { if x := entity[entityName[:j]]; x != 0 { - return dst + utf8.EncodeRune(b[dst:], x), src + j + 1 + return x, 0, j + 1 } } } - dst1, src1 = dst+i, src+i - copy(b[dst:dst1], b[src:src1]) - return dst1, src1 + return '&', 0, 1 } -// unescape unescapes b's entities in-place, so that "a<b" becomes "a entityNameLen { + if reusingB { + out = slices.Clone(out) + reusingB = false } - return b[0:dst] + out = slices.Grow(out, replLen) + } + out = utf8.AppendRune(out, r1) + if r2 != 0 { + out = utf8.AppendRune(out, r2) } + + src += entityNameLen } - return b + + return out } // lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc". diff --git a/vendor/golang.org/x/net/html/foreign.go b/vendor/golang.org/x/net/html/foreign.go index e8515d8e88..65d01d1ed9 100644 --- a/vendor/golang.org/x/net/html/foreign.go +++ b/vendor/golang.org/x/net/html/foreign.go @@ -23,7 +23,7 @@ func adjustForeignAttributes(aa []Attribute) { } switch a.Key { case "xlink:actuate", "xlink:arcrole", "xlink:href", "xlink:role", "xlink:show", - "xlink:title", "xlink:type", "xml:base", "xml:lang", "xml:space", "xmlns:xlink": + "xlink:title", "xlink:type", "xml:lang", "xml:space", "xmlns:xlink": j := strings.Index(a.Key, ":") aa[i].Namespace = a.Key[:j] aa[i].Key = a.Key[j+1:] diff --git a/vendor/golang.org/x/net/html/parse.go b/vendor/golang.org/x/net/html/parse.go index b3d2a25581..165b6108d4 100644 --- a/vendor/golang.org/x/net/html/parse.go +++ b/vendor/golang.org/x/net/html/parse.go @@ -63,7 +63,7 @@ func (p *parser) top() *Node { // Stop tags for use in popUntil. These come from section 12.2.4.2. var ( defaultScopeStopTags = map[string][]a.Atom{ - "": {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template}, + "": {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template, a.Select}, "math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext}, "svg": {a.Desc, a.ForeignObject, a.Title}, } @@ -78,7 +78,6 @@ const ( tableScope tableRowScope tableBodyScope - selectScope ) // popUntil pops the stack of open elements at the highest element whose tag @@ -133,10 +132,6 @@ func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int { if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template { return -1 } - case selectScope: - if tagAtom != a.Optgroup && tagAtom != a.Option { - return -1 - } default: panic(fmt.Sprintf("html: internal error: indexOfElementInScope unknown scope: %d", s)) } @@ -460,21 +455,6 @@ func (p *parser) resetInsertionMode() { } switch n.DataAtom { - case a.Select: - if !last { - for ancestor, first := n, p.oe[0]; ancestor != first; { - ancestor = p.oe[p.oe.index(ancestor)-1] - switch ancestor.DataAtom { - case a.Template: - p.im = inSelectIM - return - case a.Table: - p.im = inSelectInTableIM - return - } - } - } - p.im = inSelectIM case a.Td, a.Th: // TODO: remove this divergence from the HTML5 spec. // @@ -1002,7 +982,10 @@ func inBodyIM(p *parser) bool { p.popUntil(buttonScope, a.P) p.addElement() case a.Button: - p.popUntil(defaultScope, a.Button) + if p.elementInScope(defaultScope, a.Button) { + p.generateImpliedEndTags() + p.popUntil(defaultScope, a.Button) + } p.reconstructActiveFormattingElements() p.addElement() p.framesetOK = false @@ -1040,7 +1023,18 @@ func inBodyIM(p *parser) bool { p.framesetOK = false p.im = inTableIM return true - case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr: + case a.Area, a.Br, a.Embed, a.Img, a.Keygen, a.Wbr: + p.reconstructActiveFormattingElements() + p.addElement() + p.oe.pop() + p.acknowledgeSelfClosingTag() + p.framesetOK = false + case a.Input: + if p.fragment && p.context.DataAtom == a.Select { + // Ignore the token. + return true + } + p.popUntil(defaultScope, a.Select) p.reconstructActiveFormattingElements() p.addElement() p.oe.pop() @@ -1061,7 +1055,13 @@ func inBodyIM(p *parser) bool { p.oe.pop() p.acknowledgeSelfClosingTag() case a.Hr: - p.popUntil(buttonScope, a.P) + if p.elementInScope(buttonScope, a.P) { + p.generateImpliedEndTags("p") + p.popUntil(defaultScope, a.P) + } + if p.elementInScope(defaultScope, a.Select) { + p.generateImpliedEndTags() + } p.addElement() p.oe.pop() p.acknowledgeSelfClosingTag() @@ -1095,13 +1095,30 @@ func inBodyIM(p *parser) bool { // Don't let the tokenizer go into raw text mode when scripting is disabled. p.tokenizer.NextIsNotRawText() case a.Select: + if p.fragment && p.context.DataAtom == a.Select { + // Ignore the token. + return true + } else if p.popUntil(defaultScope, a.Select) { + return true + } p.reconstructActiveFormattingElements() p.addElement() p.framesetOK = false - p.im = inSelectIM return true - case a.Optgroup, a.Option: - if p.top().DataAtom == a.Option { + case a.Option: + if p.elementInScope(defaultScope, a.Select) { + p.generateImpliedEndTags("optgroup") + // If oe has option element in scope, parse error? + } else if p.top().DataAtom == a.Option { + p.oe.pop() + } + p.reconstructActiveFormattingElements() + p.addElement() + case a.Optgroup: + if p.elementInScope(defaultScope, a.Select) { + p.generateImpliedEndTags() + // If oe has option or optgroup element in scope, parse error? + } else if p.top().DataAtom == a.Option { p.oe.pop() } p.reconstructActiveFormattingElements() @@ -1149,7 +1166,12 @@ func inBodyIM(p *parser) bool { return false } return true - case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Main, a.Menu, a.Nav, a.Ol, a.Pre, a.Search, a.Section, a.Summary, a.Ul: + case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Main, a.Menu, a.Nav, a.Ol, a.Pre, a.Search, a.Section, a.Select, a.Summary, a.Ul: + if !p.elementInScope(defaultScope, p.tok.DataAtom) { + // Ignore the token. + return true + } + p.generateImpliedEndTags() p.popUntil(defaultScope, p.tok.DataAtom) case a.Form: if p.oe.contains(a.Template) { @@ -1488,17 +1510,6 @@ func inTableIM(p *parser) bool { } p.addElement() p.form = p.oe.pop() - case a.Select: - p.reconstructActiveFormattingElements() - switch p.top().DataAtom { - case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: - p.fosterParenting = true - } - p.addElement() - p.fosterParenting = false - p.framesetOK = false - p.im = inSelectInTableIM - return true } case EndTagToken: switch p.tok.DataAtom { @@ -1547,12 +1558,6 @@ func inCaptionIM(p *parser) bool { p.clearActiveFormattingElements() p.im = inTableIM return false - case a.Select: - p.reconstructActiveFormattingElements() - p.addElement() - p.framesetOK = false - p.im = inSelectInTableIM - return true } case EndTagToken: switch p.tok.DataAtom { @@ -1762,12 +1767,6 @@ func inCellIM(p *parser) bool { } // Ignore the token. return true - case a.Select: - p.reconstructActiveFormattingElements() - p.addElement() - p.framesetOK = false - p.im = inSelectInTableIM - return true } case EndTagToken: switch p.tok.DataAtom { @@ -1798,118 +1797,6 @@ func inCellIM(p *parser) bool { return inBodyIM(p) } -// Section 12.2.6.4.16. -func inSelectIM(p *parser) bool { - switch p.tok.Type { - case TextToken: - p.addText(strings.Replace(p.tok.Data, "\x00", "", -1)) - case StartTagToken: - switch p.tok.DataAtom { - case a.Html: - return inBodyIM(p) - case a.Option: - if p.top().DataAtom == a.Option { - p.oe.pop() - } - p.addElement() - case a.Optgroup: - if p.top().DataAtom == a.Option { - p.oe.pop() - } - if p.top().DataAtom == a.Optgroup { - p.oe.pop() - } - p.addElement() - case a.Select: - if !p.popUntil(selectScope, a.Select) { - // Ignore the token. - return true - } - p.resetInsertionMode() - case a.Input, a.Keygen, a.Textarea: - if p.elementInScope(selectScope, a.Select) { - p.parseImpliedToken(EndTagToken, a.Select, a.Select.String()) - return false - } - // In order to properly ignore