4
0
mirror of https://github.com/cwinfo/matterbridge.git synced 2025-07-04 13:37:44 +00:00

Update vendor

This commit is contained in:
Wim
2021-10-16 23:11:32 +02:00
parent 57fce93af7
commit 20f6c05ec5
588 changed files with 119386 additions and 3424 deletions

View File

@ -57,6 +57,32 @@ var (
ErrNoAccount = errors.New("acme: account does not exist")
)
// A Subproblem describes an ACME subproblem as reported in an Error.
type Subproblem struct {
// Type is a URI reference that identifies the problem type,
// typically in a "urn:acme:error:xxx" form.
Type string
// Detail is a human-readable explanation specific to this occurrence of the problem.
Detail string
// Instance indicates a URL that the client should direct a human user to visit
// in order for instructions on how to agree to the updated Terms of Service.
// In such an event CA sets StatusCode to 403, Type to
// "urn:ietf:params:acme:error:userActionRequired", and adds a Link header with relation
// "terms-of-service" containing the latest TOS URL.
Instance string
// Identifier may contain the ACME identifier that the error is for.
Identifier *AuthzID
}
func (sp Subproblem) String() string {
str := fmt.Sprintf("%s: ", sp.Type)
if sp.Identifier != nil {
str += fmt.Sprintf("[%s: %s] ", sp.Identifier.Type, sp.Identifier.Value)
}
str += sp.Detail
return str
}
// Error is an ACME error, defined in Problem Details for HTTP APIs doc
// http://tools.ietf.org/html/draft-ietf-appsawg-http-problem.
type Error struct {
@ -76,10 +102,21 @@ type Error struct {
// Header is the original server error response headers.
// It may be nil.
Header http.Header
// Subproblems may contain more detailed information about the individual problems
// that caused the error. This field is only sent by RFC 8555 compatible ACME
// servers. Defined in RFC 8555 Section 6.7.1.
Subproblems []Subproblem
}
func (e *Error) Error() string {
return fmt.Sprintf("%d %s: %s", e.StatusCode, e.ProblemType, e.Detail)
str := fmt.Sprintf("%d %s: %s", e.StatusCode, e.ProblemType, e.Detail)
if len(e.Subproblems) > 0 {
str += fmt.Sprintf("; subproblems:")
for _, sp := range e.Subproblems {
str += fmt.Sprintf("\n\t%s", sp)
}
}
return str
}
// AuthorizationError indicates that an authorization for an identifier
@ -533,20 +570,23 @@ func (c *wireChallenge) challenge() *Challenge {
// wireError is a subset of fields of the Problem Details object
// as described in https://tools.ietf.org/html/rfc7807#section-3.1.
type wireError struct {
Status int
Type string
Detail string
Instance string
Status int
Type string
Detail string
Instance string
Subproblems []Subproblem
}
func (e *wireError) error(h http.Header) *Error {
return &Error{
err := &Error{
StatusCode: e.Status,
ProblemType: e.Type,
Detail: e.Detail,
Instance: e.Instance,
Header: h,
Subproblems: e.Subproblems,
}
return err
}
// CertOption is an optional argument type for the TLS ChallengeCert methods for

285
vendor/golang.org/x/crypto/argon2/argon2.go generated vendored Normal file
View File

@ -0,0 +1,285 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package argon2 implements the key derivation function Argon2.
// Argon2 was selected as the winner of the Password Hashing Competition and can
// be used to derive cryptographic keys from passwords.
//
// For a detailed specification of Argon2 see [1].
//
// If you aren't sure which function you need, use Argon2id (IDKey) and
// the parameter recommendations for your scenario.
//
//
// Argon2i
//
// Argon2i (implemented by Key) is the side-channel resistant version of Argon2.
// It uses data-independent memory access, which is preferred for password
// hashing and password-based key derivation. Argon2i requires more passes over
// memory than Argon2id to protect from trade-off attacks. The recommended
// parameters (taken from [2]) for non-interactive operations are time=3 and to
// use the maximum available memory.
//
//
// Argon2id
//
// Argon2id (implemented by IDKey) is a hybrid version of Argon2 combining
// Argon2i and Argon2d. It uses data-independent memory access for the first
// half of the first iteration over the memory and data-dependent memory access
// for the rest. Argon2id is side-channel resistant and provides better brute-
// force cost savings due to time-memory tradeoffs than Argon2i. The recommended
// parameters for non-interactive operations (taken from [2]) are time=1 and to
// use the maximum available memory.
//
// [1] https://github.com/P-H-C/phc-winner-argon2/blob/master/argon2-specs.pdf
// [2] https://tools.ietf.org/html/draft-irtf-cfrg-argon2-03#section-9.3
package argon2
import (
"encoding/binary"
"sync"
"golang.org/x/crypto/blake2b"
)
// The Argon2 version implemented by this package.
const Version = 0x13
const (
argon2d = iota
argon2i
argon2id
)
// Key derives a key from the password, salt, and cost parameters using Argon2i
// returning a byte slice of length keyLen that can be used as cryptographic
// key. The CPU cost and parallelism degree must be greater than zero.
//
// For example, you can get a derived key for e.g. AES-256 (which needs a
// 32-byte key) by doing:
//
// key := argon2.Key([]byte("some password"), salt, 3, 32*1024, 4, 32)
//
// The draft RFC recommends[2] time=3, and memory=32*1024 is a sensible number.
// If using that amount of memory (32 MB) is not possible in some contexts then
// the time parameter can be increased to compensate.
//
// The time parameter specifies the number of passes over the memory and the
// memory parameter specifies the size of the memory in KiB. For example
// memory=32*1024 sets the memory cost to ~32 MB. The number of threads can be
// adjusted to the number of available CPUs. The cost parameters should be
// increased as memory latency and CPU parallelism increases. Remember to get a
// good random salt.
func Key(password, salt []byte, time, memory uint32, threads uint8, keyLen uint32) []byte {
return deriveKey(argon2i, password, salt, nil, nil, time, memory, threads, keyLen)
}
// IDKey derives a key from the password, salt, and cost parameters using
// Argon2id returning a byte slice of length keyLen that can be used as
// cryptographic key. The CPU cost and parallelism degree must be greater than
// zero.
//
// For example, you can get a derived key for e.g. AES-256 (which needs a
// 32-byte key) by doing:
//
// key := argon2.IDKey([]byte("some password"), salt, 1, 64*1024, 4, 32)
//
// The draft RFC recommends[2] time=1, and memory=64*1024 is a sensible number.
// If using that amount of memory (64 MB) is not possible in some contexts then
// the time parameter can be increased to compensate.
//
// The time parameter specifies the number of passes over the memory and the
// memory parameter specifies the size of the memory in KiB. For example
// memory=64*1024 sets the memory cost to ~64 MB. The number of threads can be
// adjusted to the numbers of available CPUs. The cost parameters should be
// increased as memory latency and CPU parallelism increases. Remember to get a
// good random salt.
func IDKey(password, salt []byte, time, memory uint32, threads uint8, keyLen uint32) []byte {
return deriveKey(argon2id, password, salt, nil, nil, time, memory, threads, keyLen)
}
func deriveKey(mode int, password, salt, secret, data []byte, time, memory uint32, threads uint8, keyLen uint32) []byte {
if time < 1 {
panic("argon2: number of rounds too small")
}
if threads < 1 {
panic("argon2: parallelism degree too low")
}
h0 := initHash(password, salt, secret, data, time, memory, uint32(threads), keyLen, mode)
memory = memory / (syncPoints * uint32(threads)) * (syncPoints * uint32(threads))
if memory < 2*syncPoints*uint32(threads) {
memory = 2 * syncPoints * uint32(threads)
}
B := initBlocks(&h0, memory, uint32(threads))
processBlocks(B, time, memory, uint32(threads), mode)
return extractKey(B, memory, uint32(threads), keyLen)
}
const (
blockLength = 128
syncPoints = 4
)
type block [blockLength]uint64
func initHash(password, salt, key, data []byte, time, memory, threads, keyLen uint32, mode int) [blake2b.Size + 8]byte {
var (
h0 [blake2b.Size + 8]byte
params [24]byte
tmp [4]byte
)
b2, _ := blake2b.New512(nil)
binary.LittleEndian.PutUint32(params[0:4], threads)
binary.LittleEndian.PutUint32(params[4:8], keyLen)
binary.LittleEndian.PutUint32(params[8:12], memory)
binary.LittleEndian.PutUint32(params[12:16], time)
binary.LittleEndian.PutUint32(params[16:20], uint32(Version))
binary.LittleEndian.PutUint32(params[20:24], uint32(mode))
b2.Write(params[:])
binary.LittleEndian.PutUint32(tmp[:], uint32(len(password)))
b2.Write(tmp[:])
b2.Write(password)
binary.LittleEndian.PutUint32(tmp[:], uint32(len(salt)))
b2.Write(tmp[:])
b2.Write(salt)
binary.LittleEndian.PutUint32(tmp[:], uint32(len(key)))
b2.Write(tmp[:])
b2.Write(key)
binary.LittleEndian.PutUint32(tmp[:], uint32(len(data)))
b2.Write(tmp[:])
b2.Write(data)
b2.Sum(h0[:0])
return h0
}
func initBlocks(h0 *[blake2b.Size + 8]byte, memory, threads uint32) []block {
var block0 [1024]byte
B := make([]block, memory)
for lane := uint32(0); lane < threads; lane++ {
j := lane * (memory / threads)
binary.LittleEndian.PutUint32(h0[blake2b.Size+4:], lane)
binary.LittleEndian.PutUint32(h0[blake2b.Size:], 0)
blake2bHash(block0[:], h0[:])
for i := range B[j+0] {
B[j+0][i] = binary.LittleEndian.Uint64(block0[i*8:])
}
binary.LittleEndian.PutUint32(h0[blake2b.Size:], 1)
blake2bHash(block0[:], h0[:])
for i := range B[j+1] {
B[j+1][i] = binary.LittleEndian.Uint64(block0[i*8:])
}
}
return B
}
func processBlocks(B []block, time, memory, threads uint32, mode int) {
lanes := memory / threads
segments := lanes / syncPoints
processSegment := func(n, slice, lane uint32, wg *sync.WaitGroup) {
var addresses, in, zero block
if mode == argon2i || (mode == argon2id && n == 0 && slice < syncPoints/2) {
in[0] = uint64(n)
in[1] = uint64(lane)
in[2] = uint64(slice)
in[3] = uint64(memory)
in[4] = uint64(time)
in[5] = uint64(mode)
}
index := uint32(0)
if n == 0 && slice == 0 {
index = 2 // we have already generated the first two blocks
if mode == argon2i || mode == argon2id {
in[6]++
processBlock(&addresses, &in, &zero)
processBlock(&addresses, &addresses, &zero)
}
}
offset := lane*lanes + slice*segments + index
var random uint64
for index < segments {
prev := offset - 1
if index == 0 && slice == 0 {
prev += lanes // last block in lane
}
if mode == argon2i || (mode == argon2id && n == 0 && slice < syncPoints/2) {
if index%blockLength == 0 {
in[6]++
processBlock(&addresses, &in, &zero)
processBlock(&addresses, &addresses, &zero)
}
random = addresses[index%blockLength]
} else {
random = B[prev][0]
}
newOffset := indexAlpha(random, lanes, segments, threads, n, slice, lane, index)
processBlockXOR(&B[offset], &B[prev], &B[newOffset])
index, offset = index+1, offset+1
}
wg.Done()
}
for n := uint32(0); n < time; n++ {
for slice := uint32(0); slice < syncPoints; slice++ {
var wg sync.WaitGroup
for lane := uint32(0); lane < threads; lane++ {
wg.Add(1)
go processSegment(n, slice, lane, &wg)
}
wg.Wait()
}
}
}
func extractKey(B []block, memory, threads, keyLen uint32) []byte {
lanes := memory / threads
for lane := uint32(0); lane < threads-1; lane++ {
for i, v := range B[(lane*lanes)+lanes-1] {
B[memory-1][i] ^= v
}
}
var block [1024]byte
for i, v := range B[memory-1] {
binary.LittleEndian.PutUint64(block[i*8:], v)
}
key := make([]byte, keyLen)
blake2bHash(key, block[:])
return key
}
func indexAlpha(rand uint64, lanes, segments, threads, n, slice, lane, index uint32) uint32 {
refLane := uint32(rand>>32) % threads
if n == 0 && slice == 0 {
refLane = lane
}
m, s := 3*segments, ((slice+1)%syncPoints)*segments
if lane == refLane {
m += index
}
if n == 0 {
m, s = slice*segments, 0
if slice == 0 || lane == refLane {
m += index
}
}
if index == 0 || lane == refLane {
m--
}
return phi(rand, uint64(m), uint64(s), refLane, lanes)
}
func phi(rand, m, s uint64, lane, lanes uint32) uint32 {
p := rand & 0xFFFFFFFF
p = (p * p) >> 32
p = (p * m) >> 32
return lane*lanes + uint32((s+m-(p+1))%uint64(lanes))
}

53
vendor/golang.org/x/crypto/argon2/blake2b.go generated vendored Normal file
View File

@ -0,0 +1,53 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package argon2
import (
"encoding/binary"
"hash"
"golang.org/x/crypto/blake2b"
)
// blake2bHash computes an arbitrary long hash value of in
// and writes the hash to out.
func blake2bHash(out []byte, in []byte) {
var b2 hash.Hash
if n := len(out); n < blake2b.Size {
b2, _ = blake2b.New(n, nil)
} else {
b2, _ = blake2b.New512(nil)
}
var buffer [blake2b.Size]byte
binary.LittleEndian.PutUint32(buffer[:4], uint32(len(out)))
b2.Write(buffer[:4])
b2.Write(in)
if len(out) <= blake2b.Size {
b2.Sum(out[:0])
return
}
outLen := len(out)
b2.Sum(buffer[:0])
b2.Reset()
copy(out, buffer[:32])
out = out[32:]
for len(out) > blake2b.Size {
b2.Write(buffer[:])
b2.Sum(buffer[:0])
copy(out, buffer[:32])
out = out[32:]
b2.Reset()
}
if outLen%blake2b.Size > 0 { // outLen > 64
r := ((outLen + 31) / 32) - 2 // ⌈τ /32⌉-2
b2, _ = blake2b.New(outLen-32*r, nil)
}
b2.Write(buffer[:])
b2.Sum(out[:0])
}

61
vendor/golang.org/x/crypto/argon2/blamka_amd64.go generated vendored Normal file
View File

@ -0,0 +1,61 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && gc && !purego
// +build amd64,gc,!purego
package argon2
import "golang.org/x/sys/cpu"
func init() {
useSSE4 = cpu.X86.HasSSE41
}
//go:noescape
func mixBlocksSSE2(out, a, b, c *block)
//go:noescape
func xorBlocksSSE2(out, a, b, c *block)
//go:noescape
func blamkaSSE4(b *block)
func processBlockSSE(out, in1, in2 *block, xor bool) {
var t block
mixBlocksSSE2(&t, in1, in2, &t)
if useSSE4 {
blamkaSSE4(&t)
} else {
for i := 0; i < blockLength; i += 16 {
blamkaGeneric(
&t[i+0], &t[i+1], &t[i+2], &t[i+3],
&t[i+4], &t[i+5], &t[i+6], &t[i+7],
&t[i+8], &t[i+9], &t[i+10], &t[i+11],
&t[i+12], &t[i+13], &t[i+14], &t[i+15],
)
}
for i := 0; i < blockLength/8; i += 2 {
blamkaGeneric(
&t[i], &t[i+1], &t[16+i], &t[16+i+1],
&t[32+i], &t[32+i+1], &t[48+i], &t[48+i+1],
&t[64+i], &t[64+i+1], &t[80+i], &t[80+i+1],
&t[96+i], &t[96+i+1], &t[112+i], &t[112+i+1],
)
}
}
if xor {
xorBlocksSSE2(out, in1, in2, &t)
} else {
mixBlocksSSE2(out, in1, in2, &t)
}
}
func processBlock(out, in1, in2 *block) {
processBlockSSE(out, in1, in2, false)
}
func processBlockXOR(out, in1, in2 *block) {
processBlockSSE(out, in1, in2, true)
}

244
vendor/golang.org/x/crypto/argon2/blamka_amd64.s generated vendored Normal file
View File

@ -0,0 +1,244 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && gc && !purego
// +build amd64,gc,!purego
#include "textflag.h"
DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·c40<>(SB), (NOPTR+RODATA), $16
DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
MOVO v4, t1; \
MOVO v5, v4; \
MOVO t1, v5; \
MOVO v6, t1; \
PUNPCKLQDQ v6, t2; \
PUNPCKHQDQ v7, v6; \
PUNPCKHQDQ t2, v6; \
PUNPCKLQDQ v7, t2; \
MOVO t1, v7; \
MOVO v2, t1; \
PUNPCKHQDQ t2, v7; \
PUNPCKLQDQ v3, t2; \
PUNPCKHQDQ t2, v2; \
PUNPCKLQDQ t1, t2; \
PUNPCKHQDQ t2, v3
#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
MOVO v4, t1; \
MOVO v5, v4; \
MOVO t1, v5; \
MOVO v2, t1; \
PUNPCKLQDQ v2, t2; \
PUNPCKHQDQ v3, v2; \
PUNPCKHQDQ t2, v2; \
PUNPCKLQDQ v3, t2; \
MOVO t1, v3; \
MOVO v6, t1; \
PUNPCKHQDQ t2, v3; \
PUNPCKLQDQ v7, t2; \
PUNPCKHQDQ t2, v6; \
PUNPCKLQDQ t1, t2; \
PUNPCKHQDQ t2, v7
#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48) \
MOVO v0, t0; \
PMULULQ v2, t0; \
PADDQ v2, v0; \
PADDQ t0, v0; \
PADDQ t0, v0; \
PXOR v0, v6; \
PSHUFD $0xB1, v6, v6; \
MOVO v4, t0; \
PMULULQ v6, t0; \
PADDQ v6, v4; \
PADDQ t0, v4; \
PADDQ t0, v4; \
PXOR v4, v2; \
PSHUFB c40, v2; \
MOVO v0, t0; \
PMULULQ v2, t0; \
PADDQ v2, v0; \
PADDQ t0, v0; \
PADDQ t0, v0; \
PXOR v0, v6; \
PSHUFB c48, v6; \
MOVO v4, t0; \
PMULULQ v6, t0; \
PADDQ v6, v4; \
PADDQ t0, v4; \
PADDQ t0, v4; \
PXOR v4, v2; \
MOVO v2, t0; \
PADDQ v2, t0; \
PSRLQ $63, v2; \
PXOR t0, v2; \
MOVO v1, t0; \
PMULULQ v3, t0; \
PADDQ v3, v1; \
PADDQ t0, v1; \
PADDQ t0, v1; \
PXOR v1, v7; \
PSHUFD $0xB1, v7, v7; \
MOVO v5, t0; \
PMULULQ v7, t0; \
PADDQ v7, v5; \
PADDQ t0, v5; \
PADDQ t0, v5; \
PXOR v5, v3; \
PSHUFB c40, v3; \
MOVO v1, t0; \
PMULULQ v3, t0; \
PADDQ v3, v1; \
PADDQ t0, v1; \
PADDQ t0, v1; \
PXOR v1, v7; \
PSHUFB c48, v7; \
MOVO v5, t0; \
PMULULQ v7, t0; \
PADDQ v7, v5; \
PADDQ t0, v5; \
PADDQ t0, v5; \
PXOR v5, v3; \
MOVO v3, t0; \
PADDQ v3, t0; \
PSRLQ $63, v3; \
PXOR t0, v3
#define LOAD_MSG_0(block, off) \
MOVOU 8*(off+0)(block), X0; \
MOVOU 8*(off+2)(block), X1; \
MOVOU 8*(off+4)(block), X2; \
MOVOU 8*(off+6)(block), X3; \
MOVOU 8*(off+8)(block), X4; \
MOVOU 8*(off+10)(block), X5; \
MOVOU 8*(off+12)(block), X6; \
MOVOU 8*(off+14)(block), X7
#define STORE_MSG_0(block, off) \
MOVOU X0, 8*(off+0)(block); \
MOVOU X1, 8*(off+2)(block); \
MOVOU X2, 8*(off+4)(block); \
MOVOU X3, 8*(off+6)(block); \
MOVOU X4, 8*(off+8)(block); \
MOVOU X5, 8*(off+10)(block); \
MOVOU X6, 8*(off+12)(block); \
MOVOU X7, 8*(off+14)(block)
#define LOAD_MSG_1(block, off) \
MOVOU 8*off+0*8(block), X0; \
MOVOU 8*off+16*8(block), X1; \
MOVOU 8*off+32*8(block), X2; \
MOVOU 8*off+48*8(block), X3; \
MOVOU 8*off+64*8(block), X4; \
MOVOU 8*off+80*8(block), X5; \
MOVOU 8*off+96*8(block), X6; \
MOVOU 8*off+112*8(block), X7
#define STORE_MSG_1(block, off) \
MOVOU X0, 8*off+0*8(block); \
MOVOU X1, 8*off+16*8(block); \
MOVOU X2, 8*off+32*8(block); \
MOVOU X3, 8*off+48*8(block); \
MOVOU X4, 8*off+64*8(block); \
MOVOU X5, 8*off+80*8(block); \
MOVOU X6, 8*off+96*8(block); \
MOVOU X7, 8*off+112*8(block)
#define BLAMKA_ROUND_0(block, off, t0, t1, c40, c48) \
LOAD_MSG_0(block, off); \
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1); \
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1); \
STORE_MSG_0(block, off)
#define BLAMKA_ROUND_1(block, off, t0, t1, c40, c48) \
LOAD_MSG_1(block, off); \
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1); \
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1); \
STORE_MSG_1(block, off)
// func blamkaSSE4(b *block)
TEXT ·blamkaSSE4(SB), 4, $0-8
MOVQ b+0(FP), AX
MOVOU ·c40<>(SB), X10
MOVOU ·c48<>(SB), X11
BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11)
BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11)
BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11)
BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11)
BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11)
BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11)
BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11)
BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11)
BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11)
BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11)
BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11)
BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11)
BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11)
BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11)
BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11)
BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11)
RET
// func mixBlocksSSE2(out, a, b, c *block)
TEXT ·mixBlocksSSE2(SB), 4, $0-32
MOVQ out+0(FP), DX
MOVQ a+8(FP), AX
MOVQ b+16(FP), BX
MOVQ a+24(FP), CX
MOVQ $128, BP
loop:
MOVOU 0(AX), X0
MOVOU 0(BX), X1
MOVOU 0(CX), X2
PXOR X1, X0
PXOR X2, X0
MOVOU X0, 0(DX)
ADDQ $16, AX
ADDQ $16, BX
ADDQ $16, CX
ADDQ $16, DX
SUBQ $2, BP
JA loop
RET
// func xorBlocksSSE2(out, a, b, c *block)
TEXT ·xorBlocksSSE2(SB), 4, $0-32
MOVQ out+0(FP), DX
MOVQ a+8(FP), AX
MOVQ b+16(FP), BX
MOVQ a+24(FP), CX
MOVQ $128, BP
loop:
MOVOU 0(AX), X0
MOVOU 0(BX), X1
MOVOU 0(CX), X2
MOVOU 0(DX), X3
PXOR X1, X0
PXOR X2, X0
PXOR X3, X0
MOVOU X0, 0(DX)
ADDQ $16, AX
ADDQ $16, BX
ADDQ $16, CX
ADDQ $16, DX
SUBQ $2, BP
JA loop
RET

163
vendor/golang.org/x/crypto/argon2/blamka_generic.go generated vendored Normal file
View File

@ -0,0 +1,163 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package argon2
var useSSE4 bool
func processBlockGeneric(out, in1, in2 *block, xor bool) {
var t block
for i := range t {
t[i] = in1[i] ^ in2[i]
}
for i := 0; i < blockLength; i += 16 {
blamkaGeneric(
&t[i+0], &t[i+1], &t[i+2], &t[i+3],
&t[i+4], &t[i+5], &t[i+6], &t[i+7],
&t[i+8], &t[i+9], &t[i+10], &t[i+11],
&t[i+12], &t[i+13], &t[i+14], &t[i+15],
)
}
for i := 0; i < blockLength/8; i += 2 {
blamkaGeneric(
&t[i], &t[i+1], &t[16+i], &t[16+i+1],
&t[32+i], &t[32+i+1], &t[48+i], &t[48+i+1],
&t[64+i], &t[64+i+1], &t[80+i], &t[80+i+1],
&t[96+i], &t[96+i+1], &t[112+i], &t[112+i+1],
)
}
if xor {
for i := range t {
out[i] ^= in1[i] ^ in2[i] ^ t[i]
}
} else {
for i := range t {
out[i] = in1[i] ^ in2[i] ^ t[i]
}
}
}
func blamkaGeneric(t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, t14, t15 *uint64) {
v00, v01, v02, v03 := *t00, *t01, *t02, *t03
v04, v05, v06, v07 := *t04, *t05, *t06, *t07
v08, v09, v10, v11 := *t08, *t09, *t10, *t11
v12, v13, v14, v15 := *t12, *t13, *t14, *t15
v00 += v04 + 2*uint64(uint32(v00))*uint64(uint32(v04))
v12 ^= v00
v12 = v12>>32 | v12<<32
v08 += v12 + 2*uint64(uint32(v08))*uint64(uint32(v12))
v04 ^= v08
v04 = v04>>24 | v04<<40
v00 += v04 + 2*uint64(uint32(v00))*uint64(uint32(v04))
v12 ^= v00
v12 = v12>>16 | v12<<48
v08 += v12 + 2*uint64(uint32(v08))*uint64(uint32(v12))
v04 ^= v08
v04 = v04>>63 | v04<<1
v01 += v05 + 2*uint64(uint32(v01))*uint64(uint32(v05))
v13 ^= v01
v13 = v13>>32 | v13<<32
v09 += v13 + 2*uint64(uint32(v09))*uint64(uint32(v13))
v05 ^= v09
v05 = v05>>24 | v05<<40
v01 += v05 + 2*uint64(uint32(v01))*uint64(uint32(v05))
v13 ^= v01
v13 = v13>>16 | v13<<48
v09 += v13 + 2*uint64(uint32(v09))*uint64(uint32(v13))
v05 ^= v09
v05 = v05>>63 | v05<<1
v02 += v06 + 2*uint64(uint32(v02))*uint64(uint32(v06))
v14 ^= v02
v14 = v14>>32 | v14<<32
v10 += v14 + 2*uint64(uint32(v10))*uint64(uint32(v14))
v06 ^= v10
v06 = v06>>24 | v06<<40
v02 += v06 + 2*uint64(uint32(v02))*uint64(uint32(v06))
v14 ^= v02
v14 = v14>>16 | v14<<48
v10 += v14 + 2*uint64(uint32(v10))*uint64(uint32(v14))
v06 ^= v10
v06 = v06>>63 | v06<<1
v03 += v07 + 2*uint64(uint32(v03))*uint64(uint32(v07))
v15 ^= v03
v15 = v15>>32 | v15<<32
v11 += v15 + 2*uint64(uint32(v11))*uint64(uint32(v15))
v07 ^= v11
v07 = v07>>24 | v07<<40
v03 += v07 + 2*uint64(uint32(v03))*uint64(uint32(v07))
v15 ^= v03
v15 = v15>>16 | v15<<48
v11 += v15 + 2*uint64(uint32(v11))*uint64(uint32(v15))
v07 ^= v11
v07 = v07>>63 | v07<<1
v00 += v05 + 2*uint64(uint32(v00))*uint64(uint32(v05))
v15 ^= v00
v15 = v15>>32 | v15<<32
v10 += v15 + 2*uint64(uint32(v10))*uint64(uint32(v15))
v05 ^= v10
v05 = v05>>24 | v05<<40
v00 += v05 + 2*uint64(uint32(v00))*uint64(uint32(v05))
v15 ^= v00
v15 = v15>>16 | v15<<48
v10 += v15 + 2*uint64(uint32(v10))*uint64(uint32(v15))
v05 ^= v10
v05 = v05>>63 | v05<<1
v01 += v06 + 2*uint64(uint32(v01))*uint64(uint32(v06))
v12 ^= v01
v12 = v12>>32 | v12<<32
v11 += v12 + 2*uint64(uint32(v11))*uint64(uint32(v12))
v06 ^= v11
v06 = v06>>24 | v06<<40
v01 += v06 + 2*uint64(uint32(v01))*uint64(uint32(v06))
v12 ^= v01
v12 = v12>>16 | v12<<48
v11 += v12 + 2*uint64(uint32(v11))*uint64(uint32(v12))
v06 ^= v11
v06 = v06>>63 | v06<<1
v02 += v07 + 2*uint64(uint32(v02))*uint64(uint32(v07))
v13 ^= v02
v13 = v13>>32 | v13<<32
v08 += v13 + 2*uint64(uint32(v08))*uint64(uint32(v13))
v07 ^= v08
v07 = v07>>24 | v07<<40
v02 += v07 + 2*uint64(uint32(v02))*uint64(uint32(v07))
v13 ^= v02
v13 = v13>>16 | v13<<48
v08 += v13 + 2*uint64(uint32(v08))*uint64(uint32(v13))
v07 ^= v08
v07 = v07>>63 | v07<<1
v03 += v04 + 2*uint64(uint32(v03))*uint64(uint32(v04))
v14 ^= v03
v14 = v14>>32 | v14<<32
v09 += v14 + 2*uint64(uint32(v09))*uint64(uint32(v14))
v04 ^= v09
v04 = v04>>24 | v04<<40
v03 += v04 + 2*uint64(uint32(v03))*uint64(uint32(v04))
v14 ^= v03
v14 = v14>>16 | v14<<48
v09 += v14 + 2*uint64(uint32(v09))*uint64(uint32(v14))
v04 ^= v09
v04 = v04>>63 | v04<<1
*t00, *t01, *t02, *t03 = v00, v01, v02, v03
*t04, *t05, *t06, *t07 = v04, v05, v06, v07
*t08, *t09, *t10, *t11 = v08, v09, v10, v11
*t12, *t13, *t14, *t15 = v12, v13, v14, v15
}

16
vendor/golang.org/x/crypto/argon2/blamka_ref.go generated vendored Normal file
View File

@ -0,0 +1,16 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64 || purego || !gc
// +build !amd64 purego !gc
package argon2
func processBlock(out, in1, in2 *block) {
processBlockGeneric(out, in1, in2, false)
}
func processBlockXOR(out, in1, in2 *block) {
processBlockGeneric(out, in1, in2, true)
}

291
vendor/golang.org/x/crypto/blake2b/blake2b.go generated vendored Normal file
View File

@ -0,0 +1,291 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package blake2b implements the BLAKE2b hash algorithm defined by RFC 7693
// and the extendable output function (XOF) BLAKE2Xb.
//
// BLAKE2b is optimized for 64-bit platforms—including NEON-enabled ARMs—and
// produces digests of any size between 1 and 64 bytes.
// For a detailed specification of BLAKE2b see https://blake2.net/blake2.pdf
// and for BLAKE2Xb see https://blake2.net/blake2x.pdf
//
// If you aren't sure which function you need, use BLAKE2b (Sum512 or New512).
// If you need a secret-key MAC (message authentication code), use the New512
// function with a non-nil key.
//
// BLAKE2X is a construction to compute hash values larger than 64 bytes. It
// can produce hash values between 0 and 4 GiB.
package blake2b
import (
"encoding/binary"
"errors"
"hash"
)
const (
// The blocksize of BLAKE2b in bytes.
BlockSize = 128
// The hash size of BLAKE2b-512 in bytes.
Size = 64
// The hash size of BLAKE2b-384 in bytes.
Size384 = 48
// The hash size of BLAKE2b-256 in bytes.
Size256 = 32
)
var (
useAVX2 bool
useAVX bool
useSSE4 bool
)
var (
errKeySize = errors.New("blake2b: invalid key size")
errHashSize = errors.New("blake2b: invalid hash size")
)
var iv = [8]uint64{
0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
}
// Sum512 returns the BLAKE2b-512 checksum of the data.
func Sum512(data []byte) [Size]byte {
var sum [Size]byte
checkSum(&sum, Size, data)
return sum
}
// Sum384 returns the BLAKE2b-384 checksum of the data.
func Sum384(data []byte) [Size384]byte {
var sum [Size]byte
var sum384 [Size384]byte
checkSum(&sum, Size384, data)
copy(sum384[:], sum[:Size384])
return sum384
}
// Sum256 returns the BLAKE2b-256 checksum of the data.
func Sum256(data []byte) [Size256]byte {
var sum [Size]byte
var sum256 [Size256]byte
checkSum(&sum, Size256, data)
copy(sum256[:], sum[:Size256])
return sum256
}
// New512 returns a new hash.Hash computing the BLAKE2b-512 checksum. A non-nil
// key turns the hash into a MAC. The key must be between zero and 64 bytes long.
func New512(key []byte) (hash.Hash, error) { return newDigest(Size, key) }
// New384 returns a new hash.Hash computing the BLAKE2b-384 checksum. A non-nil
// key turns the hash into a MAC. The key must be between zero and 64 bytes long.
func New384(key []byte) (hash.Hash, error) { return newDigest(Size384, key) }
// New256 returns a new hash.Hash computing the BLAKE2b-256 checksum. A non-nil
// key turns the hash into a MAC. The key must be between zero and 64 bytes long.
func New256(key []byte) (hash.Hash, error) { return newDigest(Size256, key) }
// New returns a new hash.Hash computing the BLAKE2b checksum with a custom length.
// A non-nil key turns the hash into a MAC. The key must be between zero and 64 bytes long.
// The hash size can be a value between 1 and 64 but it is highly recommended to use
// values equal or greater than:
// - 32 if BLAKE2b is used as a hash function (The key is zero bytes long).
// - 16 if BLAKE2b is used as a MAC function (The key is at least 16 bytes long).
// When the key is nil, the returned hash.Hash implements BinaryMarshaler
// and BinaryUnmarshaler for state (de)serialization as documented by hash.Hash.
func New(size int, key []byte) (hash.Hash, error) { return newDigest(size, key) }
func newDigest(hashSize int, key []byte) (*digest, error) {
if hashSize < 1 || hashSize > Size {
return nil, errHashSize
}
if len(key) > Size {
return nil, errKeySize
}
d := &digest{
size: hashSize,
keyLen: len(key),
}
copy(d.key[:], key)
d.Reset()
return d, nil
}
func checkSum(sum *[Size]byte, hashSize int, data []byte) {
h := iv
h[0] ^= uint64(hashSize) | (1 << 16) | (1 << 24)
var c [2]uint64
if length := len(data); length > BlockSize {
n := length &^ (BlockSize - 1)
if length == n {
n -= BlockSize
}
hashBlocks(&h, &c, 0, data[:n])
data = data[n:]
}
var block [BlockSize]byte
offset := copy(block[:], data)
remaining := uint64(BlockSize - offset)
if c[0] < remaining {
c[1]--
}
c[0] -= remaining
hashBlocks(&h, &c, 0xFFFFFFFFFFFFFFFF, block[:])
for i, v := range h[:(hashSize+7)/8] {
binary.LittleEndian.PutUint64(sum[8*i:], v)
}
}
type digest struct {
h [8]uint64
c [2]uint64
size int
block [BlockSize]byte
offset int
key [BlockSize]byte
keyLen int
}
const (
magic = "b2b"
marshaledSize = len(magic) + 8*8 + 2*8 + 1 + BlockSize + 1
)
func (d *digest) MarshalBinary() ([]byte, error) {
if d.keyLen != 0 {
return nil, errors.New("crypto/blake2b: cannot marshal MACs")
}
b := make([]byte, 0, marshaledSize)
b = append(b, magic...)
for i := 0; i < 8; i++ {
b = appendUint64(b, d.h[i])
}
b = appendUint64(b, d.c[0])
b = appendUint64(b, d.c[1])
// Maximum value for size is 64
b = append(b, byte(d.size))
b = append(b, d.block[:]...)
b = append(b, byte(d.offset))
return b, nil
}
func (d *digest) UnmarshalBinary(b []byte) error {
if len(b) < len(magic) || string(b[:len(magic)]) != magic {
return errors.New("crypto/blake2b: invalid hash state identifier")
}
if len(b) != marshaledSize {
return errors.New("crypto/blake2b: invalid hash state size")
}
b = b[len(magic):]
for i := 0; i < 8; i++ {
b, d.h[i] = consumeUint64(b)
}
b, d.c[0] = consumeUint64(b)
b, d.c[1] = consumeUint64(b)
d.size = int(b[0])
b = b[1:]
copy(d.block[:], b[:BlockSize])
b = b[BlockSize:]
d.offset = int(b[0])
return nil
}
func (d *digest) BlockSize() int { return BlockSize }
func (d *digest) Size() int { return d.size }
func (d *digest) Reset() {
d.h = iv
d.h[0] ^= uint64(d.size) | (uint64(d.keyLen) << 8) | (1 << 16) | (1 << 24)
d.offset, d.c[0], d.c[1] = 0, 0, 0
if d.keyLen > 0 {
d.block = d.key
d.offset = BlockSize
}
}
func (d *digest) Write(p []byte) (n int, err error) {
n = len(p)
if d.offset > 0 {
remaining := BlockSize - d.offset
if n <= remaining {
d.offset += copy(d.block[d.offset:], p)
return
}
copy(d.block[d.offset:], p[:remaining])
hashBlocks(&d.h, &d.c, 0, d.block[:])
d.offset = 0
p = p[remaining:]
}
if length := len(p); length > BlockSize {
nn := length &^ (BlockSize - 1)
if length == nn {
nn -= BlockSize
}
hashBlocks(&d.h, &d.c, 0, p[:nn])
p = p[nn:]
}
if len(p) > 0 {
d.offset += copy(d.block[:], p)
}
return
}
func (d *digest) Sum(sum []byte) []byte {
var hash [Size]byte
d.finalize(&hash)
return append(sum, hash[:d.size]...)
}
func (d *digest) finalize(hash *[Size]byte) {
var block [BlockSize]byte
copy(block[:], d.block[:d.offset])
remaining := uint64(BlockSize - d.offset)
c := d.c
if c[0] < remaining {
c[1]--
}
c[0] -= remaining
h := d.h
hashBlocks(&h, &c, 0xFFFFFFFFFFFFFFFF, block[:])
for i, v := range h {
binary.LittleEndian.PutUint64(hash[8*i:], v)
}
}
func appendUint64(b []byte, x uint64) []byte {
var a [8]byte
binary.BigEndian.PutUint64(a[:], x)
return append(b, a[:]...)
}
func appendUint32(b []byte, x uint32) []byte {
var a [4]byte
binary.BigEndian.PutUint32(a[:], x)
return append(b, a[:]...)
}
func consumeUint64(b []byte) ([]byte, uint64) {
x := binary.BigEndian.Uint64(b)
return b[8:], x
}
func consumeUint32(b []byte) ([]byte, uint32) {
x := binary.BigEndian.Uint32(b)
return b[4:], x
}

View File

@ -0,0 +1,38 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build go1.7 && amd64 && gc && !purego
// +build go1.7,amd64,gc,!purego
package blake2b
import "golang.org/x/sys/cpu"
func init() {
useAVX2 = cpu.X86.HasAVX2
useAVX = cpu.X86.HasAVX
useSSE4 = cpu.X86.HasSSE41
}
//go:noescape
func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
//go:noescape
func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
//go:noescape
func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
func hashBlocks(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) {
switch {
case useAVX2:
hashBlocksAVX2(h, c, flag, blocks)
case useAVX:
hashBlocksAVX(h, c, flag, blocks)
case useSSE4:
hashBlocksSSE4(h, c, flag, blocks)
default:
hashBlocksGeneric(h, c, flag, blocks)
}
}

745
vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s generated vendored Normal file
View File

@ -0,0 +1,745 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build go1.7 && amd64 && gc && !purego
// +build go1.7,amd64,gc,!purego
#include "textflag.h"
DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b
DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32
DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b
DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179
GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32
DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403
DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32
DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302
DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32
DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16
DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16
DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16
DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16
DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16
DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16
#define VPERMQ_0x39_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39
#define VPERMQ_0x93_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93
#define VPERMQ_0x4E_Y2_Y2 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e
#define VPERMQ_0x93_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93
#define VPERMQ_0x39_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39
#define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \
VPADDQ m0, Y0, Y0; \
VPADDQ Y1, Y0, Y0; \
VPXOR Y0, Y3, Y3; \
VPSHUFD $-79, Y3, Y3; \
VPADDQ Y3, Y2, Y2; \
VPXOR Y2, Y1, Y1; \
VPSHUFB c40, Y1, Y1; \
VPADDQ m1, Y0, Y0; \
VPADDQ Y1, Y0, Y0; \
VPXOR Y0, Y3, Y3; \
VPSHUFB c48, Y3, Y3; \
VPADDQ Y3, Y2, Y2; \
VPXOR Y2, Y1, Y1; \
VPADDQ Y1, Y1, t; \
VPSRLQ $63, Y1, Y1; \
VPXOR t, Y1, Y1; \
VPERMQ_0x39_Y1_Y1; \
VPERMQ_0x4E_Y2_Y2; \
VPERMQ_0x93_Y3_Y3; \
VPADDQ m2, Y0, Y0; \
VPADDQ Y1, Y0, Y0; \
VPXOR Y0, Y3, Y3; \
VPSHUFD $-79, Y3, Y3; \
VPADDQ Y3, Y2, Y2; \
VPXOR Y2, Y1, Y1; \
VPSHUFB c40, Y1, Y1; \
VPADDQ m3, Y0, Y0; \
VPADDQ Y1, Y0, Y0; \
VPXOR Y0, Y3, Y3; \
VPSHUFB c48, Y3, Y3; \
VPADDQ Y3, Y2, Y2; \
VPXOR Y2, Y1, Y1; \
VPADDQ Y1, Y1, t; \
VPSRLQ $63, Y1, Y1; \
VPXOR t, Y1, Y1; \
VPERMQ_0x39_Y3_Y3; \
VPERMQ_0x4E_Y2_Y2; \
VPERMQ_0x93_Y1_Y1
#define VMOVQ_SI_X11_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x1E
#define VMOVQ_SI_X12_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x26
#define VMOVQ_SI_X13_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x2E
#define VMOVQ_SI_X14_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x36
#define VMOVQ_SI_X15_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x3E
#define VMOVQ_SI_X11(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x5E; BYTE $n
#define VMOVQ_SI_X12(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x66; BYTE $n
#define VMOVQ_SI_X13(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x6E; BYTE $n
#define VMOVQ_SI_X14(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x76; BYTE $n
#define VMOVQ_SI_X15(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x7E; BYTE $n
#define VPINSRQ_1_SI_X11_0 BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x1E; BYTE $0x01
#define VPINSRQ_1_SI_X12_0 BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x26; BYTE $0x01
#define VPINSRQ_1_SI_X13_0 BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x2E; BYTE $0x01
#define VPINSRQ_1_SI_X14_0 BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x36; BYTE $0x01
#define VPINSRQ_1_SI_X15_0 BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x3E; BYTE $0x01
#define VPINSRQ_1_SI_X11(n) BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x5E; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X12(n) BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x66; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X13(n) BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x6E; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X14(n) BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x76; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X15(n) BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x7E; BYTE $n; BYTE $0x01
#define VMOVQ_R8_X15 BYTE $0xC4; BYTE $0x41; BYTE $0xF9; BYTE $0x6E; BYTE $0xF8
#define VPINSRQ_1_R9_X15 BYTE $0xC4; BYTE $0x43; BYTE $0x81; BYTE $0x22; BYTE $0xF9; BYTE $0x01
// load msg: Y12 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y12(i0, i1, i2, i3) \
VMOVQ_SI_X12(i0*8); \
VMOVQ_SI_X11(i2*8); \
VPINSRQ_1_SI_X12(i1*8); \
VPINSRQ_1_SI_X11(i3*8); \
VINSERTI128 $1, X11, Y12, Y12
// load msg: Y13 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y13(i0, i1, i2, i3) \
VMOVQ_SI_X13(i0*8); \
VMOVQ_SI_X11(i2*8); \
VPINSRQ_1_SI_X13(i1*8); \
VPINSRQ_1_SI_X11(i3*8); \
VINSERTI128 $1, X11, Y13, Y13
// load msg: Y14 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y14(i0, i1, i2, i3) \
VMOVQ_SI_X14(i0*8); \
VMOVQ_SI_X11(i2*8); \
VPINSRQ_1_SI_X14(i1*8); \
VPINSRQ_1_SI_X11(i3*8); \
VINSERTI128 $1, X11, Y14, Y14
// load msg: Y15 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y15(i0, i1, i2, i3) \
VMOVQ_SI_X15(i0*8); \
VMOVQ_SI_X11(i2*8); \
VPINSRQ_1_SI_X15(i1*8); \
VPINSRQ_1_SI_X11(i3*8); \
VINSERTI128 $1, X11, Y15, Y15
#define LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() \
VMOVQ_SI_X12_0; \
VMOVQ_SI_X11(4*8); \
VPINSRQ_1_SI_X12(2*8); \
VPINSRQ_1_SI_X11(6*8); \
VINSERTI128 $1, X11, Y12, Y12; \
LOAD_MSG_AVX2_Y13(1, 3, 5, 7); \
LOAD_MSG_AVX2_Y14(8, 10, 12, 14); \
LOAD_MSG_AVX2_Y15(9, 11, 13, 15)
#define LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() \
LOAD_MSG_AVX2_Y12(14, 4, 9, 13); \
LOAD_MSG_AVX2_Y13(10, 8, 15, 6); \
VMOVQ_SI_X11(11*8); \
VPSHUFD $0x4E, 0*8(SI), X14; \
VPINSRQ_1_SI_X11(5*8); \
VINSERTI128 $1, X11, Y14, Y14; \
LOAD_MSG_AVX2_Y15(12, 2, 7, 3)
#define LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() \
VMOVQ_SI_X11(5*8); \
VMOVDQU 11*8(SI), X12; \
VPINSRQ_1_SI_X11(15*8); \
VINSERTI128 $1, X11, Y12, Y12; \
VMOVQ_SI_X13(8*8); \
VMOVQ_SI_X11(2*8); \
VPINSRQ_1_SI_X13_0; \
VPINSRQ_1_SI_X11(13*8); \
VINSERTI128 $1, X11, Y13, Y13; \
LOAD_MSG_AVX2_Y14(10, 3, 7, 9); \
LOAD_MSG_AVX2_Y15(14, 6, 1, 4)
#define LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() \
LOAD_MSG_AVX2_Y12(7, 3, 13, 11); \
LOAD_MSG_AVX2_Y13(9, 1, 12, 14); \
LOAD_MSG_AVX2_Y14(2, 5, 4, 15); \
VMOVQ_SI_X15(6*8); \
VMOVQ_SI_X11_0; \
VPINSRQ_1_SI_X15(10*8); \
VPINSRQ_1_SI_X11(8*8); \
VINSERTI128 $1, X11, Y15, Y15
#define LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() \
LOAD_MSG_AVX2_Y12(9, 5, 2, 10); \
VMOVQ_SI_X13_0; \
VMOVQ_SI_X11(4*8); \
VPINSRQ_1_SI_X13(7*8); \
VPINSRQ_1_SI_X11(15*8); \
VINSERTI128 $1, X11, Y13, Y13; \
LOAD_MSG_AVX2_Y14(14, 11, 6, 3); \
LOAD_MSG_AVX2_Y15(1, 12, 8, 13)
#define LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() \
VMOVQ_SI_X12(2*8); \
VMOVQ_SI_X11_0; \
VPINSRQ_1_SI_X12(6*8); \
VPINSRQ_1_SI_X11(8*8); \
VINSERTI128 $1, X11, Y12, Y12; \
LOAD_MSG_AVX2_Y13(12, 10, 11, 3); \
LOAD_MSG_AVX2_Y14(4, 7, 15, 1); \
LOAD_MSG_AVX2_Y15(13, 5, 14, 9)
#define LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() \
LOAD_MSG_AVX2_Y12(12, 1, 14, 4); \
LOAD_MSG_AVX2_Y13(5, 15, 13, 10); \
VMOVQ_SI_X14_0; \
VPSHUFD $0x4E, 8*8(SI), X11; \
VPINSRQ_1_SI_X14(6*8); \
VINSERTI128 $1, X11, Y14, Y14; \
LOAD_MSG_AVX2_Y15(7, 3, 2, 11)
#define LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() \
LOAD_MSG_AVX2_Y12(13, 7, 12, 3); \
LOAD_MSG_AVX2_Y13(11, 14, 1, 9); \
LOAD_MSG_AVX2_Y14(5, 15, 8, 2); \
VMOVQ_SI_X15_0; \
VMOVQ_SI_X11(6*8); \
VPINSRQ_1_SI_X15(4*8); \
VPINSRQ_1_SI_X11(10*8); \
VINSERTI128 $1, X11, Y15, Y15
#define LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() \
VMOVQ_SI_X12(6*8); \
VMOVQ_SI_X11(11*8); \
VPINSRQ_1_SI_X12(14*8); \
VPINSRQ_1_SI_X11_0; \
VINSERTI128 $1, X11, Y12, Y12; \
LOAD_MSG_AVX2_Y13(15, 9, 3, 8); \
VMOVQ_SI_X11(1*8); \
VMOVDQU 12*8(SI), X14; \
VPINSRQ_1_SI_X11(10*8); \
VINSERTI128 $1, X11, Y14, Y14; \
VMOVQ_SI_X15(2*8); \
VMOVDQU 4*8(SI), X11; \
VPINSRQ_1_SI_X15(7*8); \
VINSERTI128 $1, X11, Y15, Y15
#define LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() \
LOAD_MSG_AVX2_Y12(10, 8, 7, 1); \
VMOVQ_SI_X13(2*8); \
VPSHUFD $0x4E, 5*8(SI), X11; \
VPINSRQ_1_SI_X13(4*8); \
VINSERTI128 $1, X11, Y13, Y13; \
LOAD_MSG_AVX2_Y14(15, 9, 3, 13); \
VMOVQ_SI_X15(11*8); \
VMOVQ_SI_X11(12*8); \
VPINSRQ_1_SI_X15(14*8); \
VPINSRQ_1_SI_X11_0; \
VINSERTI128 $1, X11, Y15, Y15
// func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment
MOVQ h+0(FP), AX
MOVQ c+8(FP), BX
MOVQ flag+16(FP), CX
MOVQ blocks_base+24(FP), SI
MOVQ blocks_len+32(FP), DI
MOVQ SP, DX
ADDQ $31, DX
ANDQ $~31, DX
MOVQ CX, 16(DX)
XORQ CX, CX
MOVQ CX, 24(DX)
VMOVDQU ·AVX2_c40<>(SB), Y4
VMOVDQU ·AVX2_c48<>(SB), Y5
VMOVDQU 0(AX), Y8
VMOVDQU 32(AX), Y9
VMOVDQU ·AVX2_iv0<>(SB), Y6
VMOVDQU ·AVX2_iv1<>(SB), Y7
MOVQ 0(BX), R8
MOVQ 8(BX), R9
MOVQ R9, 8(DX)
loop:
ADDQ $128, R8
MOVQ R8, 0(DX)
CMPQ R8, $128
JGE noinc
INCQ R9
MOVQ R9, 8(DX)
noinc:
VMOVDQA Y8, Y0
VMOVDQA Y9, Y1
VMOVDQA Y6, Y2
VPXOR 0(DX), Y7, Y3
LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15()
VMOVDQA Y12, 32(DX)
VMOVDQA Y13, 64(DX)
VMOVDQA Y14, 96(DX)
VMOVDQA Y15, 128(DX)
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3()
VMOVDQA Y12, 160(DX)
VMOVDQA Y13, 192(DX)
VMOVDQA Y14, 224(DX)
VMOVDQA Y15, 256(DX)
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
ROUND_AVX2(32(DX), 64(DX), 96(DX), 128(DX), Y10, Y4, Y5)
ROUND_AVX2(160(DX), 192(DX), 224(DX), 256(DX), Y10, Y4, Y5)
VPXOR Y0, Y8, Y8
VPXOR Y1, Y9, Y9
VPXOR Y2, Y8, Y8
VPXOR Y3, Y9, Y9
LEAQ 128(SI), SI
SUBQ $128, DI
JNE loop
MOVQ R8, 0(BX)
MOVQ R9, 8(BX)
VMOVDQU Y8, 0(AX)
VMOVDQU Y9, 32(AX)
VZEROUPPER
RET
#define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA
#define VPUNPCKLQDQ_X3_X3_X15 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xFB
#define VPUNPCKLQDQ_X7_X7_X15 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xFF
#define VPUNPCKLQDQ_X13_X13_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x11; BYTE $0x6C; BYTE $0xFD
#define VPUNPCKLQDQ_X14_X14_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x09; BYTE $0x6C; BYTE $0xFE
#define VPUNPCKHQDQ_X15_X2_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD7
#define VPUNPCKHQDQ_X15_X3_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDF
#define VPUNPCKHQDQ_X15_X6_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF7
#define VPUNPCKHQDQ_X15_X7_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFF
#define VPUNPCKHQDQ_X15_X3_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD7
#define VPUNPCKHQDQ_X15_X7_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF7
#define VPUNPCKHQDQ_X15_X13_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xDF
#define VPUNPCKHQDQ_X15_X13_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xFF
#define SHUFFLE_AVX() \
VMOVDQA X6, X13; \
VMOVDQA X2, X14; \
VMOVDQA X4, X6; \
VPUNPCKLQDQ_X13_X13_X15; \
VMOVDQA X5, X4; \
VMOVDQA X6, X5; \
VPUNPCKHQDQ_X15_X7_X6; \
VPUNPCKLQDQ_X7_X7_X15; \
VPUNPCKHQDQ_X15_X13_X7; \
VPUNPCKLQDQ_X3_X3_X15; \
VPUNPCKHQDQ_X15_X2_X2; \
VPUNPCKLQDQ_X14_X14_X15; \
VPUNPCKHQDQ_X15_X3_X3; \
#define SHUFFLE_AVX_INV() \
VMOVDQA X2, X13; \
VMOVDQA X4, X14; \
VPUNPCKLQDQ_X2_X2_X15; \
VMOVDQA X5, X4; \
VPUNPCKHQDQ_X15_X3_X2; \
VMOVDQA X14, X5; \
VPUNPCKLQDQ_X3_X3_X15; \
VMOVDQA X6, X14; \
VPUNPCKHQDQ_X15_X13_X3; \
VPUNPCKLQDQ_X7_X7_X15; \
VPUNPCKHQDQ_X15_X6_X6; \
VPUNPCKLQDQ_X14_X14_X15; \
VPUNPCKHQDQ_X15_X7_X7; \
#define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
VPADDQ m0, v0, v0; \
VPADDQ v2, v0, v0; \
VPADDQ m1, v1, v1; \
VPADDQ v3, v1, v1; \
VPXOR v0, v6, v6; \
VPXOR v1, v7, v7; \
VPSHUFD $-79, v6, v6; \
VPSHUFD $-79, v7, v7; \
VPADDQ v6, v4, v4; \
VPADDQ v7, v5, v5; \
VPXOR v4, v2, v2; \
VPXOR v5, v3, v3; \
VPSHUFB c40, v2, v2; \
VPSHUFB c40, v3, v3; \
VPADDQ m2, v0, v0; \
VPADDQ v2, v0, v0; \
VPADDQ m3, v1, v1; \
VPADDQ v3, v1, v1; \
VPXOR v0, v6, v6; \
VPXOR v1, v7, v7; \
VPSHUFB c48, v6, v6; \
VPSHUFB c48, v7, v7; \
VPADDQ v6, v4, v4; \
VPADDQ v7, v5, v5; \
VPXOR v4, v2, v2; \
VPXOR v5, v3, v3; \
VPADDQ v2, v2, t0; \
VPSRLQ $63, v2, v2; \
VPXOR t0, v2, v2; \
VPADDQ v3, v3, t0; \
VPSRLQ $63, v3, v3; \
VPXOR t0, v3, v3
// load msg: X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7)
// i0, i1, i2, i3, i4, i5, i6, i7 must not be 0
#define LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7) \
VMOVQ_SI_X12(i0*8); \
VMOVQ_SI_X13(i2*8); \
VMOVQ_SI_X14(i4*8); \
VMOVQ_SI_X15(i6*8); \
VPINSRQ_1_SI_X12(i1*8); \
VPINSRQ_1_SI_X13(i3*8); \
VPINSRQ_1_SI_X14(i5*8); \
VPINSRQ_1_SI_X15(i7*8)
// load msg: X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7)
#define LOAD_MSG_AVX_0_2_4_6_1_3_5_7() \
VMOVQ_SI_X12_0; \
VMOVQ_SI_X13(4*8); \
VMOVQ_SI_X14(1*8); \
VMOVQ_SI_X15(5*8); \
VPINSRQ_1_SI_X12(2*8); \
VPINSRQ_1_SI_X13(6*8); \
VPINSRQ_1_SI_X14(3*8); \
VPINSRQ_1_SI_X15(7*8)
// load msg: X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3)
#define LOAD_MSG_AVX_1_0_11_5_12_2_7_3() \
VPSHUFD $0x4E, 0*8(SI), X12; \
VMOVQ_SI_X13(11*8); \
VMOVQ_SI_X14(12*8); \
VMOVQ_SI_X15(7*8); \
VPINSRQ_1_SI_X13(5*8); \
VPINSRQ_1_SI_X14(2*8); \
VPINSRQ_1_SI_X15(3*8)
// load msg: X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13)
#define LOAD_MSG_AVX_11_12_5_15_8_0_2_13() \
VMOVDQU 11*8(SI), X12; \
VMOVQ_SI_X13(5*8); \
VMOVQ_SI_X14(8*8); \
VMOVQ_SI_X15(2*8); \
VPINSRQ_1_SI_X13(15*8); \
VPINSRQ_1_SI_X14_0; \
VPINSRQ_1_SI_X15(13*8)
// load msg: X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8)
#define LOAD_MSG_AVX_2_5_4_15_6_10_0_8() \
VMOVQ_SI_X12(2*8); \
VMOVQ_SI_X13(4*8); \
VMOVQ_SI_X14(6*8); \
VMOVQ_SI_X15_0; \
VPINSRQ_1_SI_X12(5*8); \
VPINSRQ_1_SI_X13(15*8); \
VPINSRQ_1_SI_X14(10*8); \
VPINSRQ_1_SI_X15(8*8)
// load msg: X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15)
#define LOAD_MSG_AVX_9_5_2_10_0_7_4_15() \
VMOVQ_SI_X12(9*8); \
VMOVQ_SI_X13(2*8); \
VMOVQ_SI_X14_0; \
VMOVQ_SI_X15(4*8); \
VPINSRQ_1_SI_X12(5*8); \
VPINSRQ_1_SI_X13(10*8); \
VPINSRQ_1_SI_X14(7*8); \
VPINSRQ_1_SI_X15(15*8)
// load msg: X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3)
#define LOAD_MSG_AVX_2_6_0_8_12_10_11_3() \
VMOVQ_SI_X12(2*8); \
VMOVQ_SI_X13_0; \
VMOVQ_SI_X14(12*8); \
VMOVQ_SI_X15(11*8); \
VPINSRQ_1_SI_X12(6*8); \
VPINSRQ_1_SI_X13(8*8); \
VPINSRQ_1_SI_X14(10*8); \
VPINSRQ_1_SI_X15(3*8)
// load msg: X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11)
#define LOAD_MSG_AVX_0_6_9_8_7_3_2_11() \
MOVQ 0*8(SI), X12; \
VPSHUFD $0x4E, 8*8(SI), X13; \
MOVQ 7*8(SI), X14; \
MOVQ 2*8(SI), X15; \
VPINSRQ_1_SI_X12(6*8); \
VPINSRQ_1_SI_X14(3*8); \
VPINSRQ_1_SI_X15(11*8)
// load msg: X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8)
#define LOAD_MSG_AVX_6_14_11_0_15_9_3_8() \
MOVQ 6*8(SI), X12; \
MOVQ 11*8(SI), X13; \
MOVQ 15*8(SI), X14; \
MOVQ 3*8(SI), X15; \
VPINSRQ_1_SI_X12(14*8); \
VPINSRQ_1_SI_X13_0; \
VPINSRQ_1_SI_X14(9*8); \
VPINSRQ_1_SI_X15(8*8)
// load msg: X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10)
#define LOAD_MSG_AVX_5_15_8_2_0_4_6_10() \
MOVQ 5*8(SI), X12; \
MOVQ 8*8(SI), X13; \
MOVQ 0*8(SI), X14; \
MOVQ 6*8(SI), X15; \
VPINSRQ_1_SI_X12(15*8); \
VPINSRQ_1_SI_X13(2*8); \
VPINSRQ_1_SI_X14(4*8); \
VPINSRQ_1_SI_X15(10*8)
// load msg: X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5)
#define LOAD_MSG_AVX_12_13_1_10_2_7_4_5() \
VMOVDQU 12*8(SI), X12; \
MOVQ 1*8(SI), X13; \
MOVQ 2*8(SI), X14; \
VPINSRQ_1_SI_X13(10*8); \
VPINSRQ_1_SI_X14(7*8); \
VMOVDQU 4*8(SI), X15
// load msg: X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0)
#define LOAD_MSG_AVX_15_9_3_13_11_14_12_0() \
MOVQ 15*8(SI), X12; \
MOVQ 3*8(SI), X13; \
MOVQ 11*8(SI), X14; \
MOVQ 12*8(SI), X15; \
VPINSRQ_1_SI_X12(9*8); \
VPINSRQ_1_SI_X13(13*8); \
VPINSRQ_1_SI_X14(14*8); \
VPINSRQ_1_SI_X15_0
// func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
MOVQ h+0(FP), AX
MOVQ c+8(FP), BX
MOVQ flag+16(FP), CX
MOVQ blocks_base+24(FP), SI
MOVQ blocks_len+32(FP), DI
MOVQ SP, R10
ADDQ $15, R10
ANDQ $~15, R10
VMOVDQU ·AVX_c40<>(SB), X0
VMOVDQU ·AVX_c48<>(SB), X1
VMOVDQA X0, X8
VMOVDQA X1, X9
VMOVDQU ·AVX_iv3<>(SB), X0
VMOVDQA X0, 0(R10)
XORQ CX, 0(R10) // 0(R10) = ·AVX_iv3 ^ (CX || 0)
VMOVDQU 0(AX), X10
VMOVDQU 16(AX), X11
VMOVDQU 32(AX), X2
VMOVDQU 48(AX), X3
MOVQ 0(BX), R8
MOVQ 8(BX), R9
loop:
ADDQ $128, R8
CMPQ R8, $128
JGE noinc
INCQ R9
noinc:
VMOVQ_R8_X15
VPINSRQ_1_R9_X15
VMOVDQA X10, X0
VMOVDQA X11, X1
VMOVDQU ·AVX_iv0<>(SB), X4
VMOVDQU ·AVX_iv1<>(SB), X5
VMOVDQU ·AVX_iv2<>(SB), X6
VPXOR X15, X6, X6
VMOVDQA 0(R10), X7
LOAD_MSG_AVX_0_2_4_6_1_3_5_7()
VMOVDQA X12, 16(R10)
VMOVDQA X13, 32(R10)
VMOVDQA X14, 48(R10)
VMOVDQA X15, 64(R10)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15)
VMOVDQA X12, 80(R10)
VMOVDQA X13, 96(R10)
VMOVDQA X14, 112(R10)
VMOVDQA X15, 128(R10)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6)
VMOVDQA X12, 144(R10)
VMOVDQA X13, 160(R10)
VMOVDQA X14, 176(R10)
VMOVDQA X15, 192(R10)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX_1_0_11_5_12_2_7_3()
VMOVDQA X12, 208(R10)
VMOVDQA X13, 224(R10)
VMOVDQA X14, 240(R10)
VMOVDQA X15, 256(R10)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX_11_12_5_15_8_0_2_13()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX_2_5_4_15_6_10_0_8()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX_9_5_2_10_0_7_4_15()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX_2_6_0_8_12_10_11_3()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX_0_6_9_8_7_3_2_11()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX_5_15_8_2_0_4_6_10()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX_6_14_11_0_15_9_3_8()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX_12_13_1_10_2_7_4_5()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX_15_9_3_13_11_14_12_0()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X15, X8, X9)
SHUFFLE_AVX()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X15, X8, X9)
SHUFFLE_AVX_INV()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X15, X8, X9)
SHUFFLE_AVX()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X15, X8, X9)
SHUFFLE_AVX_INV()
VMOVDQU 32(AX), X14
VMOVDQU 48(AX), X15
VPXOR X0, X10, X10
VPXOR X1, X11, X11
VPXOR X2, X14, X14
VPXOR X3, X15, X15
VPXOR X4, X10, X10
VPXOR X5, X11, X11
VPXOR X6, X14, X2
VPXOR X7, X15, X3
VMOVDQU X2, 32(AX)
VMOVDQU X3, 48(AX)
LEAQ 128(SI), SI
SUBQ $128, DI
JNE loop
VMOVDQU X10, 0(AX)
VMOVDQU X11, 16(AX)
MOVQ R8, 0(BX)
MOVQ R9, 8(BX)
VZEROUPPER
RET

25
vendor/golang.org/x/crypto/blake2b/blake2b_amd64.go generated vendored Normal file
View File

@ -0,0 +1,25 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !go1.7 && amd64 && gc && !purego
// +build !go1.7,amd64,gc,!purego
package blake2b
import "golang.org/x/sys/cpu"
func init() {
useSSE4 = cpu.X86.HasSSE41
}
//go:noescape
func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
func hashBlocks(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) {
if useSSE4 {
hashBlocksSSE4(h, c, flag, blocks)
} else {
hashBlocksGeneric(h, c, flag, blocks)
}
}

279
vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s generated vendored Normal file
View File

@ -0,0 +1,279 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && gc && !purego
// +build amd64,gc,!purego
#include "textflag.h"
DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16
DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16
DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16
DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
GLOBL ·iv3<>(SB), (NOPTR+RODATA), $16
DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·c40<>(SB), (NOPTR+RODATA), $16
DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
MOVO v4, t1; \
MOVO v5, v4; \
MOVO t1, v5; \
MOVO v6, t1; \
PUNPCKLQDQ v6, t2; \
PUNPCKHQDQ v7, v6; \
PUNPCKHQDQ t2, v6; \
PUNPCKLQDQ v7, t2; \
MOVO t1, v7; \
MOVO v2, t1; \
PUNPCKHQDQ t2, v7; \
PUNPCKLQDQ v3, t2; \
PUNPCKHQDQ t2, v2; \
PUNPCKLQDQ t1, t2; \
PUNPCKHQDQ t2, v3
#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
MOVO v4, t1; \
MOVO v5, v4; \
MOVO t1, v5; \
MOVO v2, t1; \
PUNPCKLQDQ v2, t2; \
PUNPCKHQDQ v3, v2; \
PUNPCKHQDQ t2, v2; \
PUNPCKLQDQ v3, t2; \
MOVO t1, v3; \
MOVO v6, t1; \
PUNPCKHQDQ t2, v3; \
PUNPCKLQDQ v7, t2; \
PUNPCKHQDQ t2, v6; \
PUNPCKLQDQ t1, t2; \
PUNPCKHQDQ t2, v7
#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
PADDQ m0, v0; \
PADDQ m1, v1; \
PADDQ v2, v0; \
PADDQ v3, v1; \
PXOR v0, v6; \
PXOR v1, v7; \
PSHUFD $0xB1, v6, v6; \
PSHUFD $0xB1, v7, v7; \
PADDQ v6, v4; \
PADDQ v7, v5; \
PXOR v4, v2; \
PXOR v5, v3; \
PSHUFB c40, v2; \
PSHUFB c40, v3; \
PADDQ m2, v0; \
PADDQ m3, v1; \
PADDQ v2, v0; \
PADDQ v3, v1; \
PXOR v0, v6; \
PXOR v1, v7; \
PSHUFB c48, v6; \
PSHUFB c48, v7; \
PADDQ v6, v4; \
PADDQ v7, v5; \
PXOR v4, v2; \
PXOR v5, v3; \
MOVOU v2, t0; \
PADDQ v2, t0; \
PSRLQ $63, v2; \
PXOR t0, v2; \
MOVOU v3, t0; \
PADDQ v3, t0; \
PSRLQ $63, v3; \
PXOR t0, v3
#define LOAD_MSG(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7) \
MOVQ i0*8(src), m0; \
PINSRQ $1, i1*8(src), m0; \
MOVQ i2*8(src), m1; \
PINSRQ $1, i3*8(src), m1; \
MOVQ i4*8(src), m2; \
PINSRQ $1, i5*8(src), m2; \
MOVQ i6*8(src), m3; \
PINSRQ $1, i7*8(src), m3
// func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
TEXT ·hashBlocksSSE4(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
MOVQ h+0(FP), AX
MOVQ c+8(FP), BX
MOVQ flag+16(FP), CX
MOVQ blocks_base+24(FP), SI
MOVQ blocks_len+32(FP), DI
MOVQ SP, R10
ADDQ $15, R10
ANDQ $~15, R10
MOVOU ·iv3<>(SB), X0
MOVO X0, 0(R10)
XORQ CX, 0(R10) // 0(R10) = ·iv3 ^ (CX || 0)
MOVOU ·c40<>(SB), X13
MOVOU ·c48<>(SB), X14
MOVOU 0(AX), X12
MOVOU 16(AX), X15
MOVQ 0(BX), R8
MOVQ 8(BX), R9
loop:
ADDQ $128, R8
CMPQ R8, $128
JGE noinc
INCQ R9
noinc:
MOVQ R8, X8
PINSRQ $1, R9, X8
MOVO X12, X0
MOVO X15, X1
MOVOU 32(AX), X2
MOVOU 48(AX), X3
MOVOU ·iv0<>(SB), X4
MOVOU ·iv1<>(SB), X5
MOVOU ·iv2<>(SB), X6
PXOR X8, X6
MOVO 0(R10), X7
LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7)
MOVO X8, 16(R10)
MOVO X9, 32(R10)
MOVO X10, 48(R10)
MOVO X11, 64(R10)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15)
MOVO X8, 80(R10)
MOVO X9, 96(R10)
MOVO X10, 112(R10)
MOVO X11, 128(R10)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6)
MOVO X8, 144(R10)
MOVO X9, 160(R10)
MOVO X10, 176(R10)
MOVO X11, 192(R10)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3)
MOVO X8, 208(R10)
MOVO X9, 224(R10)
MOVO X10, 240(R10)
MOVO X11, 256(R10)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
MOVOU 32(AX), X10
MOVOU 48(AX), X11
PXOR X0, X12
PXOR X1, X15
PXOR X2, X10
PXOR X3, X11
PXOR X4, X12
PXOR X5, X15
PXOR X6, X10
PXOR X7, X11
MOVOU X10, 32(AX)
MOVOU X11, 48(AX)
LEAQ 128(SI), SI
SUBQ $128, DI
JNE loop
MOVOU X12, 0(AX)
MOVOU X15, 16(AX)
MOVQ R8, 0(BX)
MOVQ R9, 8(BX)
RET

182
vendor/golang.org/x/crypto/blake2b/blake2b_generic.go generated vendored Normal file
View File

@ -0,0 +1,182 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package blake2b
import (
"encoding/binary"
"math/bits"
)
// the precomputed values for BLAKE2b
// there are 12 16-byte arrays - one for each round
// the entries are calculated from the sigma constants.
var precomputed = [12][16]byte{
{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
{11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4},
{7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8},
{9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13},
{2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9},
{12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11},
{13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10},
{6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5},
{10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0},
{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}, // equal to the first
{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3}, // equal to the second
}
func hashBlocksGeneric(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) {
var m [16]uint64
c0, c1 := c[0], c[1]
for i := 0; i < len(blocks); {
c0 += BlockSize
if c0 < BlockSize {
c1++
}
v0, v1, v2, v3, v4, v5, v6, v7 := h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
v8, v9, v10, v11, v12, v13, v14, v15 := iv[0], iv[1], iv[2], iv[3], iv[4], iv[5], iv[6], iv[7]
v12 ^= c0
v13 ^= c1
v14 ^= flag
for j := range m {
m[j] = binary.LittleEndian.Uint64(blocks[i:])
i += 8
}
for j := range precomputed {
s := &(precomputed[j])
v0 += m[s[0]]
v0 += v4
v12 ^= v0
v12 = bits.RotateLeft64(v12, -32)
v8 += v12
v4 ^= v8
v4 = bits.RotateLeft64(v4, -24)
v1 += m[s[1]]
v1 += v5
v13 ^= v1
v13 = bits.RotateLeft64(v13, -32)
v9 += v13
v5 ^= v9
v5 = bits.RotateLeft64(v5, -24)
v2 += m[s[2]]
v2 += v6
v14 ^= v2
v14 = bits.RotateLeft64(v14, -32)
v10 += v14
v6 ^= v10
v6 = bits.RotateLeft64(v6, -24)
v3 += m[s[3]]
v3 += v7
v15 ^= v3
v15 = bits.RotateLeft64(v15, -32)
v11 += v15
v7 ^= v11
v7 = bits.RotateLeft64(v7, -24)
v0 += m[s[4]]
v0 += v4
v12 ^= v0
v12 = bits.RotateLeft64(v12, -16)
v8 += v12
v4 ^= v8
v4 = bits.RotateLeft64(v4, -63)
v1 += m[s[5]]
v1 += v5
v13 ^= v1
v13 = bits.RotateLeft64(v13, -16)
v9 += v13
v5 ^= v9
v5 = bits.RotateLeft64(v5, -63)
v2 += m[s[6]]
v2 += v6
v14 ^= v2
v14 = bits.RotateLeft64(v14, -16)
v10 += v14
v6 ^= v10
v6 = bits.RotateLeft64(v6, -63)
v3 += m[s[7]]
v3 += v7
v15 ^= v3
v15 = bits.RotateLeft64(v15, -16)
v11 += v15
v7 ^= v11
v7 = bits.RotateLeft64(v7, -63)
v0 += m[s[8]]
v0 += v5
v15 ^= v0
v15 = bits.RotateLeft64(v15, -32)
v10 += v15
v5 ^= v10
v5 = bits.RotateLeft64(v5, -24)
v1 += m[s[9]]
v1 += v6
v12 ^= v1
v12 = bits.RotateLeft64(v12, -32)
v11 += v12
v6 ^= v11
v6 = bits.RotateLeft64(v6, -24)
v2 += m[s[10]]
v2 += v7
v13 ^= v2
v13 = bits.RotateLeft64(v13, -32)
v8 += v13
v7 ^= v8
v7 = bits.RotateLeft64(v7, -24)
v3 += m[s[11]]
v3 += v4
v14 ^= v3
v14 = bits.RotateLeft64(v14, -32)
v9 += v14
v4 ^= v9
v4 = bits.RotateLeft64(v4, -24)
v0 += m[s[12]]
v0 += v5
v15 ^= v0
v15 = bits.RotateLeft64(v15, -16)
v10 += v15
v5 ^= v10
v5 = bits.RotateLeft64(v5, -63)
v1 += m[s[13]]
v1 += v6
v12 ^= v1
v12 = bits.RotateLeft64(v12, -16)
v11 += v12
v6 ^= v11
v6 = bits.RotateLeft64(v6, -63)
v2 += m[s[14]]
v2 += v7
v13 ^= v2
v13 = bits.RotateLeft64(v13, -16)
v8 += v13
v7 ^= v8
v7 = bits.RotateLeft64(v7, -63)
v3 += m[s[15]]
v3 += v4
v14 ^= v3
v14 = bits.RotateLeft64(v14, -16)
v9 += v14
v4 ^= v9
v4 = bits.RotateLeft64(v4, -63)
}
h[0] ^= v0 ^ v8
h[1] ^= v1 ^ v9
h[2] ^= v2 ^ v10
h[3] ^= v3 ^ v11
h[4] ^= v4 ^ v12
h[5] ^= v5 ^ v13
h[6] ^= v6 ^ v14
h[7] ^= v7 ^ v15
}
c[0], c[1] = c0, c1
}

12
vendor/golang.org/x/crypto/blake2b/blake2b_ref.go generated vendored Normal file
View File

@ -0,0 +1,12 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64 || purego || !gc
// +build !amd64 purego !gc
package blake2b
func hashBlocks(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) {
hashBlocksGeneric(h, c, flag, blocks)
}

177
vendor/golang.org/x/crypto/blake2b/blake2x.go generated vendored Normal file
View File

@ -0,0 +1,177 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package blake2b
import (
"encoding/binary"
"errors"
"io"
)
// XOF defines the interface to hash functions that
// support arbitrary-length output.
type XOF interface {
// Write absorbs more data into the hash's state. It panics if called
// after Read.
io.Writer
// Read reads more output from the hash. It returns io.EOF if the limit
// has been reached.
io.Reader
// Clone returns a copy of the XOF in its current state.
Clone() XOF
// Reset resets the XOF to its initial state.
Reset()
}
// OutputLengthUnknown can be used as the size argument to NewXOF to indicate
// the length of the output is not known in advance.
const OutputLengthUnknown = 0
// magicUnknownOutputLength is a magic value for the output size that indicates
// an unknown number of output bytes.
const magicUnknownOutputLength = (1 << 32) - 1
// maxOutputLength is the absolute maximum number of bytes to produce when the
// number of output bytes is unknown.
const maxOutputLength = (1 << 32) * 64
// NewXOF creates a new variable-output-length hash. The hash either produce a
// known number of bytes (1 <= size < 2**32-1), or an unknown number of bytes
// (size == OutputLengthUnknown). In the latter case, an absolute limit of
// 256GiB applies.
//
// A non-nil key turns the hash into a MAC. The key must between
// zero and 32 bytes long.
func NewXOF(size uint32, key []byte) (XOF, error) {
if len(key) > Size {
return nil, errKeySize
}
if size == magicUnknownOutputLength {
// 2^32-1 indicates an unknown number of bytes and thus isn't a
// valid length.
return nil, errors.New("blake2b: XOF length too large")
}
if size == OutputLengthUnknown {
size = magicUnknownOutputLength
}
x := &xof{
d: digest{
size: Size,
keyLen: len(key),
},
length: size,
}
copy(x.d.key[:], key)
x.Reset()
return x, nil
}
type xof struct {
d digest
length uint32
remaining uint64
cfg, root, block [Size]byte
offset int
nodeOffset uint32
readMode bool
}
func (x *xof) Write(p []byte) (n int, err error) {
if x.readMode {
panic("blake2b: write to XOF after read")
}
return x.d.Write(p)
}
func (x *xof) Clone() XOF {
clone := *x
return &clone
}
func (x *xof) Reset() {
x.cfg[0] = byte(Size)
binary.LittleEndian.PutUint32(x.cfg[4:], uint32(Size)) // leaf length
binary.LittleEndian.PutUint32(x.cfg[12:], x.length) // XOF length
x.cfg[17] = byte(Size) // inner hash size
x.d.Reset()
x.d.h[1] ^= uint64(x.length) << 32
x.remaining = uint64(x.length)
if x.remaining == magicUnknownOutputLength {
x.remaining = maxOutputLength
}
x.offset, x.nodeOffset = 0, 0
x.readMode = false
}
func (x *xof) Read(p []byte) (n int, err error) {
if !x.readMode {
x.d.finalize(&x.root)
x.readMode = true
}
if x.remaining == 0 {
return 0, io.EOF
}
n = len(p)
if uint64(n) > x.remaining {
n = int(x.remaining)
p = p[:n]
}
if x.offset > 0 {
blockRemaining := Size - x.offset
if n < blockRemaining {
x.offset += copy(p, x.block[x.offset:])
x.remaining -= uint64(n)
return
}
copy(p, x.block[x.offset:])
p = p[blockRemaining:]
x.offset = 0
x.remaining -= uint64(blockRemaining)
}
for len(p) >= Size {
binary.LittleEndian.PutUint32(x.cfg[8:], x.nodeOffset)
x.nodeOffset++
x.d.initConfig(&x.cfg)
x.d.Write(x.root[:])
x.d.finalize(&x.block)
copy(p, x.block[:])
p = p[Size:]
x.remaining -= uint64(Size)
}
if todo := len(p); todo > 0 {
if x.remaining < uint64(Size) {
x.cfg[0] = byte(x.remaining)
}
binary.LittleEndian.PutUint32(x.cfg[8:], x.nodeOffset)
x.nodeOffset++
x.d.initConfig(&x.cfg)
x.d.Write(x.root[:])
x.d.finalize(&x.block)
x.offset = copy(p, x.block[:todo])
x.remaining -= uint64(todo)
}
return
}
func (d *digest) initConfig(cfg *[Size]byte) {
d.offset, d.c[0], d.c[1] = 0, 0, 0
for i := range d.h {
d.h[i] = iv[i] ^ binary.LittleEndian.Uint64(cfg[i*8:])
}
}

33
vendor/golang.org/x/crypto/blake2b/register.go generated vendored Normal file
View File

@ -0,0 +1,33 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build go1.9
// +build go1.9
package blake2b
import (
"crypto"
"hash"
)
func init() {
newHash256 := func() hash.Hash {
h, _ := New256(nil)
return h
}
newHash384 := func() hash.Hash {
h, _ := New384(nil)
return h
}
newHash512 := func() hash.Hash {
h, _ := New512(nil)
return h
}
crypto.RegisterHash(crypto.BLAKE2b_256, newHash256)
crypto.RegisterHash(crypto.BLAKE2b_384, newHash384)
crypto.RegisterHash(crypto.BLAKE2b_512, newHash512)
}

View File

@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build go1.11 && gc && !purego
// +build go1.11,gc,!purego
#include "textflag.h"

View File

@ -19,6 +19,7 @@
// The differences in this and the original implementation are
// due to the calling conventions and initialization of constants.
//go:build gc && !purego
// +build gc,!purego
#include "textflag.h"

View File

@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
// +build gc,!purego
#include "go_asm.h"

View File

@ -10,6 +10,8 @@ package curve25519 // import "golang.org/x/crypto/curve25519"
import (
"crypto/subtle"
"fmt"
"golang.org/x/crypto/curve25519/internal/field"
)
// ScalarMult sets dst to the product scalar * point.
@ -18,7 +20,55 @@ import (
// zeroes, irrespective of the scalar. Instead, use the X25519 function, which
// will return an error.
func ScalarMult(dst, scalar, point *[32]byte) {
scalarMult(dst, scalar, point)
var e [32]byte
copy(e[:], scalar[:])
e[0] &= 248
e[31] &= 127
e[31] |= 64
var x1, x2, z2, x3, z3, tmp0, tmp1 field.Element
x1.SetBytes(point[:])
x2.One()
x3.Set(&x1)
z3.One()
swap := 0
for pos := 254; pos >= 0; pos-- {
b := e[pos/8] >> uint(pos&7)
b &= 1
swap ^= int(b)
x2.Swap(&x3, swap)
z2.Swap(&z3, swap)
swap = int(b)
tmp0.Subtract(&x3, &z3)
tmp1.Subtract(&x2, &z2)
x2.Add(&x2, &z2)
z2.Add(&x3, &z3)
z3.Multiply(&tmp0, &x2)
z2.Multiply(&z2, &tmp1)
tmp0.Square(&tmp1)
tmp1.Square(&x2)
x3.Add(&z3, &z2)
z2.Subtract(&z3, &z2)
x2.Multiply(&tmp1, &tmp0)
tmp1.Subtract(&tmp1, &tmp0)
z2.Square(&z2)
z3.Mult32(&tmp1, 121666)
x3.Square(&x3)
tmp0.Add(&tmp0, &z3)
z3.Multiply(&x1, &z2)
z2.Multiply(&tmp1, &tmp0)
}
x2.Swap(&x3, swap)
z2.Swap(&z3, swap)
z2.Invert(&z2)
x2.Multiply(&x2, &z2)
copy(dst[:], x2.Bytes())
}
// ScalarBaseMult sets dst to the product scalar * base where base is the

View File

@ -1,241 +0,0 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && gc && !purego
// +build amd64,gc,!purego
package curve25519
// These functions are implemented in the .s files. The names of the functions
// in the rest of the file are also taken from the SUPERCOP sources to help
// people following along.
//go:noescape
func cswap(inout *[5]uint64, v uint64)
//go:noescape
func ladderstep(inout *[5][5]uint64)
//go:noescape
func freeze(inout *[5]uint64)
//go:noescape
func mul(dest, a, b *[5]uint64)
//go:noescape
func square(out, in *[5]uint64)
// mladder uses a Montgomery ladder to calculate (xr/zr) *= s.
func mladder(xr, zr *[5]uint64, s *[32]byte) {
var work [5][5]uint64
work[0] = *xr
setint(&work[1], 1)
setint(&work[2], 0)
work[3] = *xr
setint(&work[4], 1)
j := uint(6)
var prevbit byte
for i := 31; i >= 0; i-- {
for j < 8 {
bit := ((*s)[i] >> j) & 1
swap := bit ^ prevbit
prevbit = bit
cswap(&work[1], uint64(swap))
ladderstep(&work)
j--
}
j = 7
}
*xr = work[1]
*zr = work[2]
}
func scalarMult(out, in, base *[32]byte) {
var e [32]byte
copy(e[:], (*in)[:])
e[0] &= 248
e[31] &= 127
e[31] |= 64
var t, z [5]uint64
unpack(&t, base)
mladder(&t, &z, &e)
invert(&z, &z)
mul(&t, &t, &z)
pack(out, &t)
}
func setint(r *[5]uint64, v uint64) {
r[0] = v
r[1] = 0
r[2] = 0
r[3] = 0
r[4] = 0
}
// unpack sets r = x where r consists of 5, 51-bit limbs in little-endian
// order.
func unpack(r *[5]uint64, x *[32]byte) {
r[0] = uint64(x[0]) |
uint64(x[1])<<8 |
uint64(x[2])<<16 |
uint64(x[3])<<24 |
uint64(x[4])<<32 |
uint64(x[5])<<40 |
uint64(x[6]&7)<<48
r[1] = uint64(x[6])>>3 |
uint64(x[7])<<5 |
uint64(x[8])<<13 |
uint64(x[9])<<21 |
uint64(x[10])<<29 |
uint64(x[11])<<37 |
uint64(x[12]&63)<<45
r[2] = uint64(x[12])>>6 |
uint64(x[13])<<2 |
uint64(x[14])<<10 |
uint64(x[15])<<18 |
uint64(x[16])<<26 |
uint64(x[17])<<34 |
uint64(x[18])<<42 |
uint64(x[19]&1)<<50
r[3] = uint64(x[19])>>1 |
uint64(x[20])<<7 |
uint64(x[21])<<15 |
uint64(x[22])<<23 |
uint64(x[23])<<31 |
uint64(x[24])<<39 |
uint64(x[25]&15)<<47
r[4] = uint64(x[25])>>4 |
uint64(x[26])<<4 |
uint64(x[27])<<12 |
uint64(x[28])<<20 |
uint64(x[29])<<28 |
uint64(x[30])<<36 |
uint64(x[31]&127)<<44
}
// pack sets out = x where out is the usual, little-endian form of the 5,
// 51-bit limbs in x.
func pack(out *[32]byte, x *[5]uint64) {
t := *x
freeze(&t)
out[0] = byte(t[0])
out[1] = byte(t[0] >> 8)
out[2] = byte(t[0] >> 16)
out[3] = byte(t[0] >> 24)
out[4] = byte(t[0] >> 32)
out[5] = byte(t[0] >> 40)
out[6] = byte(t[0] >> 48)
out[6] ^= byte(t[1]<<3) & 0xf8
out[7] = byte(t[1] >> 5)
out[8] = byte(t[1] >> 13)
out[9] = byte(t[1] >> 21)
out[10] = byte(t[1] >> 29)
out[11] = byte(t[1] >> 37)
out[12] = byte(t[1] >> 45)
out[12] ^= byte(t[2]<<6) & 0xc0
out[13] = byte(t[2] >> 2)
out[14] = byte(t[2] >> 10)
out[15] = byte(t[2] >> 18)
out[16] = byte(t[2] >> 26)
out[17] = byte(t[2] >> 34)
out[18] = byte(t[2] >> 42)
out[19] = byte(t[2] >> 50)
out[19] ^= byte(t[3]<<1) & 0xfe
out[20] = byte(t[3] >> 7)
out[21] = byte(t[3] >> 15)
out[22] = byte(t[3] >> 23)
out[23] = byte(t[3] >> 31)
out[24] = byte(t[3] >> 39)
out[25] = byte(t[3] >> 47)
out[25] ^= byte(t[4]<<4) & 0xf0
out[26] = byte(t[4] >> 4)
out[27] = byte(t[4] >> 12)
out[28] = byte(t[4] >> 20)
out[29] = byte(t[4] >> 28)
out[30] = byte(t[4] >> 36)
out[31] = byte(t[4] >> 44)
}
// invert calculates r = x^-1 mod p using Fermat's little theorem.
func invert(r *[5]uint64, x *[5]uint64) {
var z2, z9, z11, z2_5_0, z2_10_0, z2_20_0, z2_50_0, z2_100_0, t [5]uint64
square(&z2, x) /* 2 */
square(&t, &z2) /* 4 */
square(&t, &t) /* 8 */
mul(&z9, &t, x) /* 9 */
mul(&z11, &z9, &z2) /* 11 */
square(&t, &z11) /* 22 */
mul(&z2_5_0, &t, &z9) /* 2^5 - 2^0 = 31 */
square(&t, &z2_5_0) /* 2^6 - 2^1 */
for i := 1; i < 5; i++ { /* 2^20 - 2^10 */
square(&t, &t)
}
mul(&z2_10_0, &t, &z2_5_0) /* 2^10 - 2^0 */
square(&t, &z2_10_0) /* 2^11 - 2^1 */
for i := 1; i < 10; i++ { /* 2^20 - 2^10 */
square(&t, &t)
}
mul(&z2_20_0, &t, &z2_10_0) /* 2^20 - 2^0 */
square(&t, &z2_20_0) /* 2^21 - 2^1 */
for i := 1; i < 20; i++ { /* 2^40 - 2^20 */
square(&t, &t)
}
mul(&t, &t, &z2_20_0) /* 2^40 - 2^0 */
square(&t, &t) /* 2^41 - 2^1 */
for i := 1; i < 10; i++ { /* 2^50 - 2^10 */
square(&t, &t)
}
mul(&z2_50_0, &t, &z2_10_0) /* 2^50 - 2^0 */
square(&t, &z2_50_0) /* 2^51 - 2^1 */
for i := 1; i < 50; i++ { /* 2^100 - 2^50 */
square(&t, &t)
}
mul(&z2_100_0, &t, &z2_50_0) /* 2^100 - 2^0 */
square(&t, &z2_100_0) /* 2^101 - 2^1 */
for i := 1; i < 100; i++ { /* 2^200 - 2^100 */
square(&t, &t)
}
mul(&t, &t, &z2_100_0) /* 2^200 - 2^0 */
square(&t, &t) /* 2^201 - 2^1 */
for i := 1; i < 50; i++ { /* 2^250 - 2^50 */
square(&t, &t)
}
mul(&t, &t, &z2_50_0) /* 2^250 - 2^0 */
square(&t, &t) /* 2^251 - 2^1 */
square(&t, &t) /* 2^252 - 2^2 */
square(&t, &t) /* 2^253 - 2^3 */
square(&t, &t) /* 2^254 - 2^4 */
square(&t, &t) /* 2^255 - 2^5 */
mul(r, &t, &z11) /* 2^255 - 21 */
}

File diff suppressed because it is too large Load Diff

View File

@ -1,828 +0,0 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package curve25519
import "encoding/binary"
// This code is a port of the public domain, "ref10" implementation of
// curve25519 from SUPERCOP 20130419 by D. J. Bernstein.
// fieldElement represents an element of the field GF(2^255 - 19). An element
// t, entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77
// t[3]+2^102 t[4]+...+2^230 t[9]. Bounds on each t[i] vary depending on
// context.
type fieldElement [10]int32
func feZero(fe *fieldElement) {
for i := range fe {
fe[i] = 0
}
}
func feOne(fe *fieldElement) {
feZero(fe)
fe[0] = 1
}
func feAdd(dst, a, b *fieldElement) {
for i := range dst {
dst[i] = a[i] + b[i]
}
}
func feSub(dst, a, b *fieldElement) {
for i := range dst {
dst[i] = a[i] - b[i]
}
}
func feCopy(dst, src *fieldElement) {
for i := range dst {
dst[i] = src[i]
}
}
// feCSwap replaces (f,g) with (g,f) if b == 1; replaces (f,g) with (f,g) if b == 0.
//
// Preconditions: b in {0,1}.
func feCSwap(f, g *fieldElement, b int32) {
b = -b
for i := range f {
t := b & (f[i] ^ g[i])
f[i] ^= t
g[i] ^= t
}
}
// load3 reads a 24-bit, little-endian value from in.
func load3(in []byte) int64 {
var r int64
r = int64(in[0])
r |= int64(in[1]) << 8
r |= int64(in[2]) << 16
return r
}
// load4 reads a 32-bit, little-endian value from in.
func load4(in []byte) int64 {
return int64(binary.LittleEndian.Uint32(in))
}
func feFromBytes(dst *fieldElement, src *[32]byte) {
h0 := load4(src[:])
h1 := load3(src[4:]) << 6
h2 := load3(src[7:]) << 5
h3 := load3(src[10:]) << 3
h4 := load3(src[13:]) << 2
h5 := load4(src[16:])
h6 := load3(src[20:]) << 7
h7 := load3(src[23:]) << 5
h8 := load3(src[26:]) << 4
h9 := (load3(src[29:]) & 0x7fffff) << 2
var carry [10]int64
carry[9] = (h9 + 1<<24) >> 25
h0 += carry[9] * 19
h9 -= carry[9] << 25
carry[1] = (h1 + 1<<24) >> 25
h2 += carry[1]
h1 -= carry[1] << 25
carry[3] = (h3 + 1<<24) >> 25
h4 += carry[3]
h3 -= carry[3] << 25
carry[5] = (h5 + 1<<24) >> 25
h6 += carry[5]
h5 -= carry[5] << 25
carry[7] = (h7 + 1<<24) >> 25
h8 += carry[7]
h7 -= carry[7] << 25
carry[0] = (h0 + 1<<25) >> 26
h1 += carry[0]
h0 -= carry[0] << 26
carry[2] = (h2 + 1<<25) >> 26
h3 += carry[2]
h2 -= carry[2] << 26
carry[4] = (h4 + 1<<25) >> 26
h5 += carry[4]
h4 -= carry[4] << 26
carry[6] = (h6 + 1<<25) >> 26
h7 += carry[6]
h6 -= carry[6] << 26
carry[8] = (h8 + 1<<25) >> 26
h9 += carry[8]
h8 -= carry[8] << 26
dst[0] = int32(h0)
dst[1] = int32(h1)
dst[2] = int32(h2)
dst[3] = int32(h3)
dst[4] = int32(h4)
dst[5] = int32(h5)
dst[6] = int32(h6)
dst[7] = int32(h7)
dst[8] = int32(h8)
dst[9] = int32(h9)
}
// feToBytes marshals h to s.
// Preconditions:
// |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
//
// Write p=2^255-19; q=floor(h/p).
// Basic claim: q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))).
//
// Proof:
// Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4.
// Also have |h-2^230 h9|<2^230 so |19 2^(-255)(h-2^230 h9)|<1/4.
//
// Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9).
// Then 0<y<1.
//
// Write r=h-pq.
// Have 0<=r<=p-1=2^255-20.
// Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1.
//
// Write x=r+19(2^-255)r+y.
// Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q.
//
// Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1))
// so floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q.
func feToBytes(s *[32]byte, h *fieldElement) {
var carry [10]int32
q := (19*h[9] + (1 << 24)) >> 25
q = (h[0] + q) >> 26
q = (h[1] + q) >> 25
q = (h[2] + q) >> 26
q = (h[3] + q) >> 25
q = (h[4] + q) >> 26
q = (h[5] + q) >> 25
q = (h[6] + q) >> 26
q = (h[7] + q) >> 25
q = (h[8] + q) >> 26
q = (h[9] + q) >> 25
// Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20.
h[0] += 19 * q
// Goal: Output h-2^255 q, which is between 0 and 2^255-20.
carry[0] = h[0] >> 26
h[1] += carry[0]
h[0] -= carry[0] << 26
carry[1] = h[1] >> 25
h[2] += carry[1]
h[1] -= carry[1] << 25
carry[2] = h[2] >> 26
h[3] += carry[2]
h[2] -= carry[2] << 26
carry[3] = h[3] >> 25
h[4] += carry[3]
h[3] -= carry[3] << 25
carry[4] = h[4] >> 26
h[5] += carry[4]
h[4] -= carry[4] << 26
carry[5] = h[5] >> 25
h[6] += carry[5]
h[5] -= carry[5] << 25
carry[6] = h[6] >> 26
h[7] += carry[6]
h[6] -= carry[6] << 26
carry[7] = h[7] >> 25
h[8] += carry[7]
h[7] -= carry[7] << 25
carry[8] = h[8] >> 26
h[9] += carry[8]
h[8] -= carry[8] << 26
carry[9] = h[9] >> 25
h[9] -= carry[9] << 25
// h10 = carry9
// Goal: Output h[0]+...+2^255 h10-2^255 q, which is between 0 and 2^255-20.
// Have h[0]+...+2^230 h[9] between 0 and 2^255-1;
// evidently 2^255 h10-2^255 q = 0.
// Goal: Output h[0]+...+2^230 h[9].
s[0] = byte(h[0] >> 0)
s[1] = byte(h[0] >> 8)
s[2] = byte(h[0] >> 16)
s[3] = byte((h[0] >> 24) | (h[1] << 2))
s[4] = byte(h[1] >> 6)
s[5] = byte(h[1] >> 14)
s[6] = byte((h[1] >> 22) | (h[2] << 3))
s[7] = byte(h[2] >> 5)
s[8] = byte(h[2] >> 13)
s[9] = byte((h[2] >> 21) | (h[3] << 5))
s[10] = byte(h[3] >> 3)
s[11] = byte(h[3] >> 11)
s[12] = byte((h[3] >> 19) | (h[4] << 6))
s[13] = byte(h[4] >> 2)
s[14] = byte(h[4] >> 10)
s[15] = byte(h[4] >> 18)
s[16] = byte(h[5] >> 0)
s[17] = byte(h[5] >> 8)
s[18] = byte(h[5] >> 16)
s[19] = byte((h[5] >> 24) | (h[6] << 1))
s[20] = byte(h[6] >> 7)
s[21] = byte(h[6] >> 15)
s[22] = byte((h[6] >> 23) | (h[7] << 3))
s[23] = byte(h[7] >> 5)
s[24] = byte(h[7] >> 13)
s[25] = byte((h[7] >> 21) | (h[8] << 4))
s[26] = byte(h[8] >> 4)
s[27] = byte(h[8] >> 12)
s[28] = byte((h[8] >> 20) | (h[9] << 6))
s[29] = byte(h[9] >> 2)
s[30] = byte(h[9] >> 10)
s[31] = byte(h[9] >> 18)
}
// feMul calculates h = f * g
// Can overlap h with f or g.
//
// Preconditions:
// |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
// |g| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
//
// Postconditions:
// |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
//
// Notes on implementation strategy:
//
// Using schoolbook multiplication.
// Karatsuba would save a little in some cost models.
//
// Most multiplications by 2 and 19 are 32-bit precomputations;
// cheaper than 64-bit postcomputations.
//
// There is one remaining multiplication by 19 in the carry chain;
// one *19 precomputation can be merged into this,
// but the resulting data flow is considerably less clean.
//
// There are 12 carries below.
// 10 of them are 2-way parallelizable and vectorizable.
// Can get away with 11 carries, but then data flow is much deeper.
//
// With tighter constraints on inputs can squeeze carries into int32.
func feMul(h, f, g *fieldElement) {
f0 := f[0]
f1 := f[1]
f2 := f[2]
f3 := f[3]
f4 := f[4]
f5 := f[5]
f6 := f[6]
f7 := f[7]
f8 := f[8]
f9 := f[9]
g0 := g[0]
g1 := g[1]
g2 := g[2]
g3 := g[3]
g4 := g[4]
g5 := g[5]
g6 := g[6]
g7 := g[7]
g8 := g[8]
g9 := g[9]
g1_19 := 19 * g1 // 1.4*2^29
g2_19 := 19 * g2 // 1.4*2^30; still ok
g3_19 := 19 * g3
g4_19 := 19 * g4
g5_19 := 19 * g5
g6_19 := 19 * g6
g7_19 := 19 * g7
g8_19 := 19 * g8
g9_19 := 19 * g9
f1_2 := 2 * f1
f3_2 := 2 * f3
f5_2 := 2 * f5
f7_2 := 2 * f7
f9_2 := 2 * f9
f0g0 := int64(f0) * int64(g0)
f0g1 := int64(f0) * int64(g1)
f0g2 := int64(f0) * int64(g2)
f0g3 := int64(f0) * int64(g3)
f0g4 := int64(f0) * int64(g4)
f0g5 := int64(f0) * int64(g5)
f0g6 := int64(f0) * int64(g6)
f0g7 := int64(f0) * int64(g7)
f0g8 := int64(f0) * int64(g8)
f0g9 := int64(f0) * int64(g9)
f1g0 := int64(f1) * int64(g0)
f1g1_2 := int64(f1_2) * int64(g1)
f1g2 := int64(f1) * int64(g2)
f1g3_2 := int64(f1_2) * int64(g3)
f1g4 := int64(f1) * int64(g4)
f1g5_2 := int64(f1_2) * int64(g5)
f1g6 := int64(f1) * int64(g6)
f1g7_2 := int64(f1_2) * int64(g7)
f1g8 := int64(f1) * int64(g8)
f1g9_38 := int64(f1_2) * int64(g9_19)
f2g0 := int64(f2) * int64(g0)
f2g1 := int64(f2) * int64(g1)
f2g2 := int64(f2) * int64(g2)
f2g3 := int64(f2) * int64(g3)
f2g4 := int64(f2) * int64(g4)
f2g5 := int64(f2) * int64(g5)
f2g6 := int64(f2) * int64(g6)
f2g7 := int64(f2) * int64(g7)
f2g8_19 := int64(f2) * int64(g8_19)
f2g9_19 := int64(f2) * int64(g9_19)
f3g0 := int64(f3) * int64(g0)
f3g1_2 := int64(f3_2) * int64(g1)
f3g2 := int64(f3) * int64(g2)
f3g3_2 := int64(f3_2) * int64(g3)
f3g4 := int64(f3) * int64(g4)
f3g5_2 := int64(f3_2) * int64(g5)
f3g6 := int64(f3) * int64(g6)
f3g7_38 := int64(f3_2) * int64(g7_19)
f3g8_19 := int64(f3) * int64(g8_19)
f3g9_38 := int64(f3_2) * int64(g9_19)
f4g0 := int64(f4) * int64(g0)
f4g1 := int64(f4) * int64(g1)
f4g2 := int64(f4) * int64(g2)
f4g3 := int64(f4) * int64(g3)
f4g4 := int64(f4) * int64(g4)
f4g5 := int64(f4) * int64(g5)
f4g6_19 := int64(f4) * int64(g6_19)
f4g7_19 := int64(f4) * int64(g7_19)
f4g8_19 := int64(f4) * int64(g8_19)
f4g9_19 := int64(f4) * int64(g9_19)
f5g0 := int64(f5) * int64(g0)
f5g1_2 := int64(f5_2) * int64(g1)
f5g2 := int64(f5) * int64(g2)
f5g3_2 := int64(f5_2) * int64(g3)
f5g4 := int64(f5) * int64(g4)
f5g5_38 := int64(f5_2) * int64(g5_19)
f5g6_19 := int64(f5) * int64(g6_19)
f5g7_38 := int64(f5_2) * int64(g7_19)
f5g8_19 := int64(f5) * int64(g8_19)
f5g9_38 := int64(f5_2) * int64(g9_19)
f6g0 := int64(f6) * int64(g0)
f6g1 := int64(f6) * int64(g1)
f6g2 := int64(f6) * int64(g2)
f6g3 := int64(f6) * int64(g3)
f6g4_19 := int64(f6) * int64(g4_19)
f6g5_19 := int64(f6) * int64(g5_19)
f6g6_19 := int64(f6) * int64(g6_19)
f6g7_19 := int64(f6) * int64(g7_19)
f6g8_19 := int64(f6) * int64(g8_19)
f6g9_19 := int64(f6) * int64(g9_19)
f7g0 := int64(f7) * int64(g0)
f7g1_2 := int64(f7_2) * int64(g1)
f7g2 := int64(f7) * int64(g2)
f7g3_38 := int64(f7_2) * int64(g3_19)
f7g4_19 := int64(f7) * int64(g4_19)
f7g5_38 := int64(f7_2) * int64(g5_19)
f7g6_19 := int64(f7) * int64(g6_19)
f7g7_38 := int64(f7_2) * int64(g7_19)
f7g8_19 := int64(f7) * int64(g8_19)
f7g9_38 := int64(f7_2) * int64(g9_19)
f8g0 := int64(f8) * int64(g0)
f8g1 := int64(f8) * int64(g1)
f8g2_19 := int64(f8) * int64(g2_19)
f8g3_19 := int64(f8) * int64(g3_19)
f8g4_19 := int64(f8) * int64(g4_19)
f8g5_19 := int64(f8) * int64(g5_19)
f8g6_19 := int64(f8) * int64(g6_19)
f8g7_19 := int64(f8) * int64(g7_19)
f8g8_19 := int64(f8) * int64(g8_19)
f8g9_19 := int64(f8) * int64(g9_19)
f9g0 := int64(f9) * int64(g0)
f9g1_38 := int64(f9_2) * int64(g1_19)
f9g2_19 := int64(f9) * int64(g2_19)
f9g3_38 := int64(f9_2) * int64(g3_19)
f9g4_19 := int64(f9) * int64(g4_19)
f9g5_38 := int64(f9_2) * int64(g5_19)
f9g6_19 := int64(f9) * int64(g6_19)
f9g7_38 := int64(f9_2) * int64(g7_19)
f9g8_19 := int64(f9) * int64(g8_19)
f9g9_38 := int64(f9_2) * int64(g9_19)
h0 := f0g0 + f1g9_38 + f2g8_19 + f3g7_38 + f4g6_19 + f5g5_38 + f6g4_19 + f7g3_38 + f8g2_19 + f9g1_38
h1 := f0g1 + f1g0 + f2g9_19 + f3g8_19 + f4g7_19 + f5g6_19 + f6g5_19 + f7g4_19 + f8g3_19 + f9g2_19
h2 := f0g2 + f1g1_2 + f2g0 + f3g9_38 + f4g8_19 + f5g7_38 + f6g6_19 + f7g5_38 + f8g4_19 + f9g3_38
h3 := f0g3 + f1g2 + f2g1 + f3g0 + f4g9_19 + f5g8_19 + f6g7_19 + f7g6_19 + f8g5_19 + f9g4_19
h4 := f0g4 + f1g3_2 + f2g2 + f3g1_2 + f4g0 + f5g9_38 + f6g8_19 + f7g7_38 + f8g6_19 + f9g5_38
h5 := f0g5 + f1g4 + f2g3 + f3g2 + f4g1 + f5g0 + f6g9_19 + f7g8_19 + f8g7_19 + f9g6_19
h6 := f0g6 + f1g5_2 + f2g4 + f3g3_2 + f4g2 + f5g1_2 + f6g0 + f7g9_38 + f8g8_19 + f9g7_38
h7 := f0g7 + f1g6 + f2g5 + f3g4 + f4g3 + f5g2 + f6g1 + f7g0 + f8g9_19 + f9g8_19
h8 := f0g8 + f1g7_2 + f2g6 + f3g5_2 + f4g4 + f5g3_2 + f6g2 + f7g1_2 + f8g0 + f9g9_38
h9 := f0g9 + f1g8 + f2g7 + f3g6 + f4g5 + f5g4 + f6g3 + f7g2 + f8g1 + f9g0
var carry [10]int64
// |h0| <= (1.1*1.1*2^52*(1+19+19+19+19)+1.1*1.1*2^50*(38+38+38+38+38))
// i.e. |h0| <= 1.2*2^59; narrower ranges for h2, h4, h6, h8
// |h1| <= (1.1*1.1*2^51*(1+1+19+19+19+19+19+19+19+19))
// i.e. |h1| <= 1.5*2^58; narrower ranges for h3, h5, h7, h9
carry[0] = (h0 + (1 << 25)) >> 26
h1 += carry[0]
h0 -= carry[0] << 26
carry[4] = (h4 + (1 << 25)) >> 26
h5 += carry[4]
h4 -= carry[4] << 26
// |h0| <= 2^25
// |h4| <= 2^25
// |h1| <= 1.51*2^58
// |h5| <= 1.51*2^58
carry[1] = (h1 + (1 << 24)) >> 25
h2 += carry[1]
h1 -= carry[1] << 25
carry[5] = (h5 + (1 << 24)) >> 25
h6 += carry[5]
h5 -= carry[5] << 25
// |h1| <= 2^24; from now on fits into int32
// |h5| <= 2^24; from now on fits into int32
// |h2| <= 1.21*2^59
// |h6| <= 1.21*2^59
carry[2] = (h2 + (1 << 25)) >> 26
h3 += carry[2]
h2 -= carry[2] << 26
carry[6] = (h6 + (1 << 25)) >> 26
h7 += carry[6]
h6 -= carry[6] << 26
// |h2| <= 2^25; from now on fits into int32 unchanged
// |h6| <= 2^25; from now on fits into int32 unchanged
// |h3| <= 1.51*2^58
// |h7| <= 1.51*2^58
carry[3] = (h3 + (1 << 24)) >> 25
h4 += carry[3]
h3 -= carry[3] << 25
carry[7] = (h7 + (1 << 24)) >> 25
h8 += carry[7]
h7 -= carry[7] << 25
// |h3| <= 2^24; from now on fits into int32 unchanged
// |h7| <= 2^24; from now on fits into int32 unchanged
// |h4| <= 1.52*2^33
// |h8| <= 1.52*2^33
carry[4] = (h4 + (1 << 25)) >> 26
h5 += carry[4]
h4 -= carry[4] << 26
carry[8] = (h8 + (1 << 25)) >> 26
h9 += carry[8]
h8 -= carry[8] << 26
// |h4| <= 2^25; from now on fits into int32 unchanged
// |h8| <= 2^25; from now on fits into int32 unchanged
// |h5| <= 1.01*2^24
// |h9| <= 1.51*2^58
carry[9] = (h9 + (1 << 24)) >> 25
h0 += carry[9] * 19
h9 -= carry[9] << 25
// |h9| <= 2^24; from now on fits into int32 unchanged
// |h0| <= 1.8*2^37
carry[0] = (h0 + (1 << 25)) >> 26
h1 += carry[0]
h0 -= carry[0] << 26
// |h0| <= 2^25; from now on fits into int32 unchanged
// |h1| <= 1.01*2^24
h[0] = int32(h0)
h[1] = int32(h1)
h[2] = int32(h2)
h[3] = int32(h3)
h[4] = int32(h4)
h[5] = int32(h5)
h[6] = int32(h6)
h[7] = int32(h7)
h[8] = int32(h8)
h[9] = int32(h9)
}
// feSquare calculates h = f*f. Can overlap h with f.
//
// Preconditions:
// |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
//
// Postconditions:
// |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
func feSquare(h, f *fieldElement) {
f0 := f[0]
f1 := f[1]
f2 := f[2]
f3 := f[3]
f4 := f[4]
f5 := f[5]
f6 := f[6]
f7 := f[7]
f8 := f[8]
f9 := f[9]
f0_2 := 2 * f0
f1_2 := 2 * f1
f2_2 := 2 * f2
f3_2 := 2 * f3
f4_2 := 2 * f4
f5_2 := 2 * f5
f6_2 := 2 * f6
f7_2 := 2 * f7
f5_38 := 38 * f5 // 1.31*2^30
f6_19 := 19 * f6 // 1.31*2^30
f7_38 := 38 * f7 // 1.31*2^30
f8_19 := 19 * f8 // 1.31*2^30
f9_38 := 38 * f9 // 1.31*2^30
f0f0 := int64(f0) * int64(f0)
f0f1_2 := int64(f0_2) * int64(f1)
f0f2_2 := int64(f0_2) * int64(f2)
f0f3_2 := int64(f0_2) * int64(f3)
f0f4_2 := int64(f0_2) * int64(f4)
f0f5_2 := int64(f0_2) * int64(f5)
f0f6_2 := int64(f0_2) * int64(f6)
f0f7_2 := int64(f0_2) * int64(f7)
f0f8_2 := int64(f0_2) * int64(f8)
f0f9_2 := int64(f0_2) * int64(f9)
f1f1_2 := int64(f1_2) * int64(f1)
f1f2_2 := int64(f1_2) * int64(f2)
f1f3_4 := int64(f1_2) * int64(f3_2)
f1f4_2 := int64(f1_2) * int64(f4)
f1f5_4 := int64(f1_2) * int64(f5_2)
f1f6_2 := int64(f1_2) * int64(f6)
f1f7_4 := int64(f1_2) * int64(f7_2)
f1f8_2 := int64(f1_2) * int64(f8)
f1f9_76 := int64(f1_2) * int64(f9_38)
f2f2 := int64(f2) * int64(f2)
f2f3_2 := int64(f2_2) * int64(f3)
f2f4_2 := int64(f2_2) * int64(f4)
f2f5_2 := int64(f2_2) * int64(f5)
f2f6_2 := int64(f2_2) * int64(f6)
f2f7_2 := int64(f2_2) * int64(f7)
f2f8_38 := int64(f2_2) * int64(f8_19)
f2f9_38 := int64(f2) * int64(f9_38)
f3f3_2 := int64(f3_2) * int64(f3)
f3f4_2 := int64(f3_2) * int64(f4)
f3f5_4 := int64(f3_2) * int64(f5_2)
f3f6_2 := int64(f3_2) * int64(f6)
f3f7_76 := int64(f3_2) * int64(f7_38)
f3f8_38 := int64(f3_2) * int64(f8_19)
f3f9_76 := int64(f3_2) * int64(f9_38)
f4f4 := int64(f4) * int64(f4)
f4f5_2 := int64(f4_2) * int64(f5)
f4f6_38 := int64(f4_2) * int64(f6_19)
f4f7_38 := int64(f4) * int64(f7_38)
f4f8_38 := int64(f4_2) * int64(f8_19)
f4f9_38 := int64(f4) * int64(f9_38)
f5f5_38 := int64(f5) * int64(f5_38)
f5f6_38 := int64(f5_2) * int64(f6_19)
f5f7_76 := int64(f5_2) * int64(f7_38)
f5f8_38 := int64(f5_2) * int64(f8_19)
f5f9_76 := int64(f5_2) * int64(f9_38)
f6f6_19 := int64(f6) * int64(f6_19)
f6f7_38 := int64(f6) * int64(f7_38)
f6f8_38 := int64(f6_2) * int64(f8_19)
f6f9_38 := int64(f6) * int64(f9_38)
f7f7_38 := int64(f7) * int64(f7_38)
f7f8_38 := int64(f7_2) * int64(f8_19)
f7f9_76 := int64(f7_2) * int64(f9_38)
f8f8_19 := int64(f8) * int64(f8_19)
f8f9_38 := int64(f8) * int64(f9_38)
f9f9_38 := int64(f9) * int64(f9_38)
h0 := f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38
h1 := f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38
h2 := f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19
h3 := f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38
h4 := f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38
h5 := f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38
h6 := f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19
h7 := f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38
h8 := f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38
h9 := f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2
var carry [10]int64
carry[0] = (h0 + (1 << 25)) >> 26
h1 += carry[0]
h0 -= carry[0] << 26
carry[4] = (h4 + (1 << 25)) >> 26
h5 += carry[4]
h4 -= carry[4] << 26
carry[1] = (h1 + (1 << 24)) >> 25
h2 += carry[1]
h1 -= carry[1] << 25
carry[5] = (h5 + (1 << 24)) >> 25
h6 += carry[5]
h5 -= carry[5] << 25
carry[2] = (h2 + (1 << 25)) >> 26
h3 += carry[2]
h2 -= carry[2] << 26
carry[6] = (h6 + (1 << 25)) >> 26
h7 += carry[6]
h6 -= carry[6] << 26
carry[3] = (h3 + (1 << 24)) >> 25
h4 += carry[3]
h3 -= carry[3] << 25
carry[7] = (h7 + (1 << 24)) >> 25
h8 += carry[7]
h7 -= carry[7] << 25
carry[4] = (h4 + (1 << 25)) >> 26
h5 += carry[4]
h4 -= carry[4] << 26
carry[8] = (h8 + (1 << 25)) >> 26
h9 += carry[8]
h8 -= carry[8] << 26
carry[9] = (h9 + (1 << 24)) >> 25
h0 += carry[9] * 19
h9 -= carry[9] << 25
carry[0] = (h0 + (1 << 25)) >> 26
h1 += carry[0]
h0 -= carry[0] << 26
h[0] = int32(h0)
h[1] = int32(h1)
h[2] = int32(h2)
h[3] = int32(h3)
h[4] = int32(h4)
h[5] = int32(h5)
h[6] = int32(h6)
h[7] = int32(h7)
h[8] = int32(h8)
h[9] = int32(h9)
}
// feMul121666 calculates h = f * 121666. Can overlap h with f.
//
// Preconditions:
// |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
//
// Postconditions:
// |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
func feMul121666(h, f *fieldElement) {
h0 := int64(f[0]) * 121666
h1 := int64(f[1]) * 121666
h2 := int64(f[2]) * 121666
h3 := int64(f[3]) * 121666
h4 := int64(f[4]) * 121666
h5 := int64(f[5]) * 121666
h6 := int64(f[6]) * 121666
h7 := int64(f[7]) * 121666
h8 := int64(f[8]) * 121666
h9 := int64(f[9]) * 121666
var carry [10]int64
carry[9] = (h9 + (1 << 24)) >> 25
h0 += carry[9] * 19
h9 -= carry[9] << 25
carry[1] = (h1 + (1 << 24)) >> 25
h2 += carry[1]
h1 -= carry[1] << 25
carry[3] = (h3 + (1 << 24)) >> 25
h4 += carry[3]
h3 -= carry[3] << 25
carry[5] = (h5 + (1 << 24)) >> 25
h6 += carry[5]
h5 -= carry[5] << 25
carry[7] = (h7 + (1 << 24)) >> 25
h8 += carry[7]
h7 -= carry[7] << 25
carry[0] = (h0 + (1 << 25)) >> 26
h1 += carry[0]
h0 -= carry[0] << 26
carry[2] = (h2 + (1 << 25)) >> 26
h3 += carry[2]
h2 -= carry[2] << 26
carry[4] = (h4 + (1 << 25)) >> 26
h5 += carry[4]
h4 -= carry[4] << 26
carry[6] = (h6 + (1 << 25)) >> 26
h7 += carry[6]
h6 -= carry[6] << 26
carry[8] = (h8 + (1 << 25)) >> 26
h9 += carry[8]
h8 -= carry[8] << 26
h[0] = int32(h0)
h[1] = int32(h1)
h[2] = int32(h2)
h[3] = int32(h3)
h[4] = int32(h4)
h[5] = int32(h5)
h[6] = int32(h6)
h[7] = int32(h7)
h[8] = int32(h8)
h[9] = int32(h9)
}
// feInvert sets out = z^-1.
func feInvert(out, z *fieldElement) {
var t0, t1, t2, t3 fieldElement
var i int
feSquare(&t0, z)
for i = 1; i < 1; i++ {
feSquare(&t0, &t0)
}
feSquare(&t1, &t0)
for i = 1; i < 2; i++ {
feSquare(&t1, &t1)
}
feMul(&t1, z, &t1)
feMul(&t0, &t0, &t1)
feSquare(&t2, &t0)
for i = 1; i < 1; i++ {
feSquare(&t2, &t2)
}
feMul(&t1, &t1, &t2)
feSquare(&t2, &t1)
for i = 1; i < 5; i++ {
feSquare(&t2, &t2)
}
feMul(&t1, &t2, &t1)
feSquare(&t2, &t1)
for i = 1; i < 10; i++ {
feSquare(&t2, &t2)
}
feMul(&t2, &t2, &t1)
feSquare(&t3, &t2)
for i = 1; i < 20; i++ {
feSquare(&t3, &t3)
}
feMul(&t2, &t3, &t2)
feSquare(&t2, &t2)
for i = 1; i < 10; i++ {
feSquare(&t2, &t2)
}
feMul(&t1, &t2, &t1)
feSquare(&t2, &t1)
for i = 1; i < 50; i++ {
feSquare(&t2, &t2)
}
feMul(&t2, &t2, &t1)
feSquare(&t3, &t2)
for i = 1; i < 100; i++ {
feSquare(&t3, &t3)
}
feMul(&t2, &t3, &t2)
feSquare(&t2, &t2)
for i = 1; i < 50; i++ {
feSquare(&t2, &t2)
}
feMul(&t1, &t2, &t1)
feSquare(&t1, &t1)
for i = 1; i < 5; i++ {
feSquare(&t1, &t1)
}
feMul(out, &t1, &t0)
}
func scalarMultGeneric(out, in, base *[32]byte) {
var e [32]byte
copy(e[:], in[:])
e[0] &= 248
e[31] &= 127
e[31] |= 64
var x1, x2, z2, x3, z3, tmp0, tmp1 fieldElement
feFromBytes(&x1, base)
feOne(&x2)
feCopy(&x3, &x1)
feOne(&z3)
swap := int32(0)
for pos := 254; pos >= 0; pos-- {
b := e[pos/8] >> uint(pos&7)
b &= 1
swap ^= int32(b)
feCSwap(&x2, &x3, swap)
feCSwap(&z2, &z3, swap)
swap = int32(b)
feSub(&tmp0, &x3, &z3)
feSub(&tmp1, &x2, &z2)
feAdd(&x2, &x2, &z2)
feAdd(&z2, &x3, &z3)
feMul(&z3, &tmp0, &x2)
feMul(&z2, &z2, &tmp1)
feSquare(&tmp0, &tmp1)
feSquare(&tmp1, &x2)
feAdd(&x3, &z3, &z2)
feSub(&z2, &z3, &z2)
feMul(&x2, &tmp1, &tmp0)
feSub(&tmp1, &tmp1, &tmp0)
feSquare(&z2, &z2)
feMul121666(&z3, &tmp1)
feSquare(&x3, &x3)
feAdd(&tmp0, &tmp0, &z3)
feMul(&z3, &x1, &z2)
feMul(&z2, &tmp1, &tmp0)
}
feCSwap(&x2, &x3, swap)
feCSwap(&z2, &z3, swap)
feInvert(&z2, &z2)
feMul(&x2, &x2, &z2)
feToBytes(out, &x2)
}

View File

@ -1,12 +0,0 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64 || !gc || purego
// +build !amd64 !gc purego
package curve25519
func scalarMult(out, in, base *[32]byte) {
scalarMultGeneric(out, in, base)
}

View File

@ -0,0 +1,7 @@
This package is kept in sync with crypto/ed25519/internal/edwards25519/field in
the standard library.
If there are any changes in the standard library that need to be synced to this
package, run sync.sh. It will not overwrite any local changes made since the
previous sync, so it's ok to land changes in this package first, and then sync
to the standard library later.

View File

@ -0,0 +1,416 @@
// Copyright (c) 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package field implements fast arithmetic modulo 2^255-19.
package field
import (
"crypto/subtle"
"encoding/binary"
"math/bits"
)
// Element represents an element of the field GF(2^255-19). Note that this
// is not a cryptographically secure group, and should only be used to interact
// with edwards25519.Point coordinates.
//
// This type works similarly to math/big.Int, and all arguments and receivers
// are allowed to alias.
//
// The zero value is a valid zero element.
type Element struct {
// An element t represents the integer
// t.l0 + t.l1*2^51 + t.l2*2^102 + t.l3*2^153 + t.l4*2^204
//
// Between operations, all limbs are expected to be lower than 2^52.
l0 uint64
l1 uint64
l2 uint64
l3 uint64
l4 uint64
}
const maskLow51Bits uint64 = (1 << 51) - 1
var feZero = &Element{0, 0, 0, 0, 0}
// Zero sets v = 0, and returns v.
func (v *Element) Zero() *Element {
*v = *feZero
return v
}
var feOne = &Element{1, 0, 0, 0, 0}
// One sets v = 1, and returns v.
func (v *Element) One() *Element {
*v = *feOne
return v
}
// reduce reduces v modulo 2^255 - 19 and returns it.
func (v *Element) reduce() *Element {
v.carryPropagate()
// After the light reduction we now have a field element representation
// v < 2^255 + 2^13 * 19, but need v < 2^255 - 19.
// If v >= 2^255 - 19, then v + 19 >= 2^255, which would overflow 2^255 - 1,
// generating a carry. That is, c will be 0 if v < 2^255 - 19, and 1 otherwise.
c := (v.l0 + 19) >> 51
c = (v.l1 + c) >> 51
c = (v.l2 + c) >> 51
c = (v.l3 + c) >> 51
c = (v.l4 + c) >> 51
// If v < 2^255 - 19 and c = 0, this will be a no-op. Otherwise, it's
// effectively applying the reduction identity to the carry.
v.l0 += 19 * c
v.l1 += v.l0 >> 51
v.l0 = v.l0 & maskLow51Bits
v.l2 += v.l1 >> 51
v.l1 = v.l1 & maskLow51Bits
v.l3 += v.l2 >> 51
v.l2 = v.l2 & maskLow51Bits
v.l4 += v.l3 >> 51
v.l3 = v.l3 & maskLow51Bits
// no additional carry
v.l4 = v.l4 & maskLow51Bits
return v
}
// Add sets v = a + b, and returns v.
func (v *Element) Add(a, b *Element) *Element {
v.l0 = a.l0 + b.l0
v.l1 = a.l1 + b.l1
v.l2 = a.l2 + b.l2
v.l3 = a.l3 + b.l3
v.l4 = a.l4 + b.l4
// Using the generic implementation here is actually faster than the
// assembly. Probably because the body of this function is so simple that
// the compiler can figure out better optimizations by inlining the carry
// propagation. TODO
return v.carryPropagateGeneric()
}
// Subtract sets v = a - b, and returns v.
func (v *Element) Subtract(a, b *Element) *Element {
// We first add 2 * p, to guarantee the subtraction won't underflow, and
// then subtract b (which can be up to 2^255 + 2^13 * 19).
v.l0 = (a.l0 + 0xFFFFFFFFFFFDA) - b.l0
v.l1 = (a.l1 + 0xFFFFFFFFFFFFE) - b.l1
v.l2 = (a.l2 + 0xFFFFFFFFFFFFE) - b.l2
v.l3 = (a.l3 + 0xFFFFFFFFFFFFE) - b.l3
v.l4 = (a.l4 + 0xFFFFFFFFFFFFE) - b.l4
return v.carryPropagate()
}
// Negate sets v = -a, and returns v.
func (v *Element) Negate(a *Element) *Element {
return v.Subtract(feZero, a)
}
// Invert sets v = 1/z mod p, and returns v.
//
// If z == 0, Invert returns v = 0.
func (v *Element) Invert(z *Element) *Element {
// Inversion is implemented as exponentiation with exponent p 2. It uses the
// same sequence of 255 squarings and 11 multiplications as [Curve25519].
var z2, z9, z11, z2_5_0, z2_10_0, z2_20_0, z2_50_0, z2_100_0, t Element
z2.Square(z) // 2
t.Square(&z2) // 4
t.Square(&t) // 8
z9.Multiply(&t, z) // 9
z11.Multiply(&z9, &z2) // 11
t.Square(&z11) // 22
z2_5_0.Multiply(&t, &z9) // 31 = 2^5 - 2^0
t.Square(&z2_5_0) // 2^6 - 2^1
for i := 0; i < 4; i++ {
t.Square(&t) // 2^10 - 2^5
}
z2_10_0.Multiply(&t, &z2_5_0) // 2^10 - 2^0
t.Square(&z2_10_0) // 2^11 - 2^1
for i := 0; i < 9; i++ {
t.Square(&t) // 2^20 - 2^10
}
z2_20_0.Multiply(&t, &z2_10_0) // 2^20 - 2^0
t.Square(&z2_20_0) // 2^21 - 2^1
for i := 0; i < 19; i++ {
t.Square(&t) // 2^40 - 2^20
}
t.Multiply(&t, &z2_20_0) // 2^40 - 2^0
t.Square(&t) // 2^41 - 2^1
for i := 0; i < 9; i++ {
t.Square(&t) // 2^50 - 2^10
}
z2_50_0.Multiply(&t, &z2_10_0) // 2^50 - 2^0
t.Square(&z2_50_0) // 2^51 - 2^1
for i := 0; i < 49; i++ {
t.Square(&t) // 2^100 - 2^50
}
z2_100_0.Multiply(&t, &z2_50_0) // 2^100 - 2^0
t.Square(&z2_100_0) // 2^101 - 2^1
for i := 0; i < 99; i++ {
t.Square(&t) // 2^200 - 2^100
}
t.Multiply(&t, &z2_100_0) // 2^200 - 2^0
t.Square(&t) // 2^201 - 2^1
for i := 0; i < 49; i++ {
t.Square(&t) // 2^250 - 2^50
}
t.Multiply(&t, &z2_50_0) // 2^250 - 2^0
t.Square(&t) // 2^251 - 2^1
t.Square(&t) // 2^252 - 2^2
t.Square(&t) // 2^253 - 2^3
t.Square(&t) // 2^254 - 2^4
t.Square(&t) // 2^255 - 2^5
return v.Multiply(&t, &z11) // 2^255 - 21
}
// Set sets v = a, and returns v.
func (v *Element) Set(a *Element) *Element {
*v = *a
return v
}
// SetBytes sets v to x, which must be a 32-byte little-endian encoding.
//
// Consistent with RFC 7748, the most significant bit (the high bit of the
// last byte) is ignored, and non-canonical values (2^255-19 through 2^255-1)
// are accepted. Note that this is laxer than specified by RFC 8032.
func (v *Element) SetBytes(x []byte) *Element {
if len(x) != 32 {
panic("edwards25519: invalid field element input size")
}
// Bits 0:51 (bytes 0:8, bits 0:64, shift 0, mask 51).
v.l0 = binary.LittleEndian.Uint64(x[0:8])
v.l0 &= maskLow51Bits
// Bits 51:102 (bytes 6:14, bits 48:112, shift 3, mask 51).
v.l1 = binary.LittleEndian.Uint64(x[6:14]) >> 3
v.l1 &= maskLow51Bits
// Bits 102:153 (bytes 12:20, bits 96:160, shift 6, mask 51).
v.l2 = binary.LittleEndian.Uint64(x[12:20]) >> 6
v.l2 &= maskLow51Bits
// Bits 153:204 (bytes 19:27, bits 152:216, shift 1, mask 51).
v.l3 = binary.LittleEndian.Uint64(x[19:27]) >> 1
v.l3 &= maskLow51Bits
// Bits 204:251 (bytes 24:32, bits 192:256, shift 12, mask 51).
// Note: not bytes 25:33, shift 4, to avoid overread.
v.l4 = binary.LittleEndian.Uint64(x[24:32]) >> 12
v.l4 &= maskLow51Bits
return v
}
// Bytes returns the canonical 32-byte little-endian encoding of v.
func (v *Element) Bytes() []byte {
// This function is outlined to make the allocations inline in the caller
// rather than happen on the heap.
var out [32]byte
return v.bytes(&out)
}
func (v *Element) bytes(out *[32]byte) []byte {
t := *v
t.reduce()
var buf [8]byte
for i, l := range [5]uint64{t.l0, t.l1, t.l2, t.l3, t.l4} {
bitsOffset := i * 51
binary.LittleEndian.PutUint64(buf[:], l<<uint(bitsOffset%8))
for i, bb := range buf {
off := bitsOffset/8 + i
if off >= len(out) {
break
}
out[off] |= bb
}
}
return out[:]
}
// Equal returns 1 if v and u are equal, and 0 otherwise.
func (v *Element) Equal(u *Element) int {
sa, sv := u.Bytes(), v.Bytes()
return subtle.ConstantTimeCompare(sa, sv)
}
// mask64Bits returns 0xffffffff if cond is 1, and 0 otherwise.
func mask64Bits(cond int) uint64 { return ^(uint64(cond) - 1) }
// Select sets v to a if cond == 1, and to b if cond == 0.
func (v *Element) Select(a, b *Element, cond int) *Element {
m := mask64Bits(cond)
v.l0 = (m & a.l0) | (^m & b.l0)
v.l1 = (m & a.l1) | (^m & b.l1)
v.l2 = (m & a.l2) | (^m & b.l2)
v.l3 = (m & a.l3) | (^m & b.l3)
v.l4 = (m & a.l4) | (^m & b.l4)
return v
}
// Swap swaps v and u if cond == 1 or leaves them unchanged if cond == 0, and returns v.
func (v *Element) Swap(u *Element, cond int) {
m := mask64Bits(cond)
t := m & (v.l0 ^ u.l0)
v.l0 ^= t
u.l0 ^= t
t = m & (v.l1 ^ u.l1)
v.l1 ^= t
u.l1 ^= t
t = m & (v.l2 ^ u.l2)
v.l2 ^= t
u.l2 ^= t
t = m & (v.l3 ^ u.l3)
v.l3 ^= t
u.l3 ^= t
t = m & (v.l4 ^ u.l4)
v.l4 ^= t
u.l4 ^= t
}
// IsNegative returns 1 if v is negative, and 0 otherwise.
func (v *Element) IsNegative() int {
return int(v.Bytes()[0] & 1)
}
// Absolute sets v to |u|, and returns v.
func (v *Element) Absolute(u *Element) *Element {
return v.Select(new(Element).Negate(u), u, u.IsNegative())
}
// Multiply sets v = x * y, and returns v.
func (v *Element) Multiply(x, y *Element) *Element {
feMul(v, x, y)
return v
}
// Square sets v = x * x, and returns v.
func (v *Element) Square(x *Element) *Element {
feSquare(v, x)
return v
}
// Mult32 sets v = x * y, and returns v.
func (v *Element) Mult32(x *Element, y uint32) *Element {
x0lo, x0hi := mul51(x.l0, y)
x1lo, x1hi := mul51(x.l1, y)
x2lo, x2hi := mul51(x.l2, y)
x3lo, x3hi := mul51(x.l3, y)
x4lo, x4hi := mul51(x.l4, y)
v.l0 = x0lo + 19*x4hi // carried over per the reduction identity
v.l1 = x1lo + x0hi
v.l2 = x2lo + x1hi
v.l3 = x3lo + x2hi
v.l4 = x4lo + x3hi
// The hi portions are going to be only 32 bits, plus any previous excess,
// so we can skip the carry propagation.
return v
}
// mul51 returns lo + hi * 2⁵¹ = a * b.
func mul51(a uint64, b uint32) (lo uint64, hi uint64) {
mh, ml := bits.Mul64(a, uint64(b))
lo = ml & maskLow51Bits
hi = (mh << 13) | (ml >> 51)
return
}
// Pow22523 set v = x^((p-5)/8), and returns v. (p-5)/8 is 2^252-3.
func (v *Element) Pow22523(x *Element) *Element {
var t0, t1, t2 Element
t0.Square(x) // x^2
t1.Square(&t0) // x^4
t1.Square(&t1) // x^8
t1.Multiply(x, &t1) // x^9
t0.Multiply(&t0, &t1) // x^11
t0.Square(&t0) // x^22
t0.Multiply(&t1, &t0) // x^31
t1.Square(&t0) // x^62
for i := 1; i < 5; i++ { // x^992
t1.Square(&t1)
}
t0.Multiply(&t1, &t0) // x^1023 -> 1023 = 2^10 - 1
t1.Square(&t0) // 2^11 - 2
for i := 1; i < 10; i++ { // 2^20 - 2^10
t1.Square(&t1)
}
t1.Multiply(&t1, &t0) // 2^20 - 1
t2.Square(&t1) // 2^21 - 2
for i := 1; i < 20; i++ { // 2^40 - 2^20
t2.Square(&t2)
}
t1.Multiply(&t2, &t1) // 2^40 - 1
t1.Square(&t1) // 2^41 - 2
for i := 1; i < 10; i++ { // 2^50 - 2^10
t1.Square(&t1)
}
t0.Multiply(&t1, &t0) // 2^50 - 1
t1.Square(&t0) // 2^51 - 2
for i := 1; i < 50; i++ { // 2^100 - 2^50
t1.Square(&t1)
}
t1.Multiply(&t1, &t0) // 2^100 - 1
t2.Square(&t1) // 2^101 - 2
for i := 1; i < 100; i++ { // 2^200 - 2^100
t2.Square(&t2)
}
t1.Multiply(&t2, &t1) // 2^200 - 1
t1.Square(&t1) // 2^201 - 2
for i := 1; i < 50; i++ { // 2^250 - 2^50
t1.Square(&t1)
}
t0.Multiply(&t1, &t0) // 2^250 - 1
t0.Square(&t0) // 2^251 - 2
t0.Square(&t0) // 2^252 - 4
return v.Multiply(&t0, x) // 2^252 - 3 -> x^(2^252-3)
}
// sqrtM1 is 2^((p-1)/4), which squared is equal to -1 by Euler's Criterion.
var sqrtM1 = &Element{1718705420411056, 234908883556509,
2233514472574048, 2117202627021982, 765476049583133}
// SqrtRatio sets r to the non-negative square root of the ratio of u and v.
//
// If u/v is square, SqrtRatio returns r and 1. If u/v is not square, SqrtRatio
// sets r according to Section 4.3 of draft-irtf-cfrg-ristretto255-decaf448-00,
// and returns r and 0.
func (r *Element) SqrtRatio(u, v *Element) (rr *Element, wasSquare int) {
var a, b Element
// r = (u * v3) * (u * v7)^((p-5)/8)
v2 := a.Square(v)
uv3 := b.Multiply(u, b.Multiply(v2, v))
uv7 := a.Multiply(uv3, a.Square(v2))
r.Multiply(uv3, r.Pow22523(uv7))
check := a.Multiply(v, a.Square(r)) // check = v * r^2
uNeg := b.Negate(u)
correctSignSqrt := check.Equal(u)
flippedSignSqrt := check.Equal(uNeg)
flippedSignSqrtI := check.Equal(uNeg.Multiply(uNeg, sqrtM1))
rPrime := b.Multiply(r, sqrtM1) // r_prime = SQRT_M1 * r
// r = CT_SELECT(r_prime IF flipped_sign_sqrt | flipped_sign_sqrt_i ELSE r)
r.Select(rPrime, r, flippedSignSqrt|flippedSignSqrtI)
r.Absolute(r) // Choose the nonnegative square root.
return r, correctSignSqrt | flippedSignSqrt
}

View File

@ -0,0 +1,13 @@
// Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg field. DO NOT EDIT.
// +build amd64,gc,!purego
package field
// feMul sets out = a * b. It works like feMulGeneric.
//go:noescape
func feMul(out *Element, a *Element, b *Element)
// feSquare sets out = a * a. It works like feSquareGeneric.
//go:noescape
func feSquare(out *Element, a *Element)

View File

@ -0,0 +1,379 @@
// Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg field. DO NOT EDIT.
//go:build amd64 && gc && !purego
// +build amd64,gc,!purego
#include "textflag.h"
// func feMul(out *Element, a *Element, b *Element)
TEXT ·feMul(SB), NOSPLIT, $0-24
MOVQ a+8(FP), CX
MOVQ b+16(FP), BX
// r0 = a0×b0
MOVQ (CX), AX
MULQ (BX)
MOVQ AX, DI
MOVQ DX, SI
// r0 += 19×a1×b4
MOVQ 8(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 32(BX)
ADDQ AX, DI
ADCQ DX, SI
// r0 += 19×a2×b3
MOVQ 16(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 24(BX)
ADDQ AX, DI
ADCQ DX, SI
// r0 += 19×a3×b2
MOVQ 24(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 16(BX)
ADDQ AX, DI
ADCQ DX, SI
// r0 += 19×a4×b1
MOVQ 32(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 8(BX)
ADDQ AX, DI
ADCQ DX, SI
// r1 = a0×b1
MOVQ (CX), AX
MULQ 8(BX)
MOVQ AX, R9
MOVQ DX, R8
// r1 += a1×b0
MOVQ 8(CX), AX
MULQ (BX)
ADDQ AX, R9
ADCQ DX, R8
// r1 += 19×a2×b4
MOVQ 16(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 32(BX)
ADDQ AX, R9
ADCQ DX, R8
// r1 += 19×a3×b3
MOVQ 24(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 24(BX)
ADDQ AX, R9
ADCQ DX, R8
// r1 += 19×a4×b2
MOVQ 32(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 16(BX)
ADDQ AX, R9
ADCQ DX, R8
// r2 = a0×b2
MOVQ (CX), AX
MULQ 16(BX)
MOVQ AX, R11
MOVQ DX, R10
// r2 += a1×b1
MOVQ 8(CX), AX
MULQ 8(BX)
ADDQ AX, R11
ADCQ DX, R10
// r2 += a2×b0
MOVQ 16(CX), AX
MULQ (BX)
ADDQ AX, R11
ADCQ DX, R10
// r2 += 19×a3×b4
MOVQ 24(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 32(BX)
ADDQ AX, R11
ADCQ DX, R10
// r2 += 19×a4×b3
MOVQ 32(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 24(BX)
ADDQ AX, R11
ADCQ DX, R10
// r3 = a0×b3
MOVQ (CX), AX
MULQ 24(BX)
MOVQ AX, R13
MOVQ DX, R12
// r3 += a1×b2
MOVQ 8(CX), AX
MULQ 16(BX)
ADDQ AX, R13
ADCQ DX, R12
// r3 += a2×b1
MOVQ 16(CX), AX
MULQ 8(BX)
ADDQ AX, R13
ADCQ DX, R12
// r3 += a3×b0
MOVQ 24(CX), AX
MULQ (BX)
ADDQ AX, R13
ADCQ DX, R12
// r3 += 19×a4×b4
MOVQ 32(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 32(BX)
ADDQ AX, R13
ADCQ DX, R12
// r4 = a0×b4
MOVQ (CX), AX
MULQ 32(BX)
MOVQ AX, R15
MOVQ DX, R14
// r4 += a1×b3
MOVQ 8(CX), AX
MULQ 24(BX)
ADDQ AX, R15
ADCQ DX, R14
// r4 += a2×b2
MOVQ 16(CX), AX
MULQ 16(BX)
ADDQ AX, R15
ADCQ DX, R14
// r4 += a3×b1
MOVQ 24(CX), AX
MULQ 8(BX)
ADDQ AX, R15
ADCQ DX, R14
// r4 += a4×b0
MOVQ 32(CX), AX
MULQ (BX)
ADDQ AX, R15
ADCQ DX, R14
// First reduction chain
MOVQ $0x0007ffffffffffff, AX
SHLQ $0x0d, DI, SI
SHLQ $0x0d, R9, R8
SHLQ $0x0d, R11, R10
SHLQ $0x0d, R13, R12
SHLQ $0x0d, R15, R14
ANDQ AX, DI
IMUL3Q $0x13, R14, R14
ADDQ R14, DI
ANDQ AX, R9
ADDQ SI, R9
ANDQ AX, R11
ADDQ R8, R11
ANDQ AX, R13
ADDQ R10, R13
ANDQ AX, R15
ADDQ R12, R15
// Second reduction chain (carryPropagate)
MOVQ DI, SI
SHRQ $0x33, SI
MOVQ R9, R8
SHRQ $0x33, R8
MOVQ R11, R10
SHRQ $0x33, R10
MOVQ R13, R12
SHRQ $0x33, R12
MOVQ R15, R14
SHRQ $0x33, R14
ANDQ AX, DI
IMUL3Q $0x13, R14, R14
ADDQ R14, DI
ANDQ AX, R9
ADDQ SI, R9
ANDQ AX, R11
ADDQ R8, R11
ANDQ AX, R13
ADDQ R10, R13
ANDQ AX, R15
ADDQ R12, R15
// Store output
MOVQ out+0(FP), AX
MOVQ DI, (AX)
MOVQ R9, 8(AX)
MOVQ R11, 16(AX)
MOVQ R13, 24(AX)
MOVQ R15, 32(AX)
RET
// func feSquare(out *Element, a *Element)
TEXT ·feSquare(SB), NOSPLIT, $0-16
MOVQ a+8(FP), CX
// r0 = l0×l0
MOVQ (CX), AX
MULQ (CX)
MOVQ AX, SI
MOVQ DX, BX
// r0 += 38×l1×l4
MOVQ 8(CX), AX
IMUL3Q $0x26, AX, AX
MULQ 32(CX)
ADDQ AX, SI
ADCQ DX, BX
// r0 += 38×l2×l3
MOVQ 16(CX), AX
IMUL3Q $0x26, AX, AX
MULQ 24(CX)
ADDQ AX, SI
ADCQ DX, BX
// r1 = 2×l0×l1
MOVQ (CX), AX
SHLQ $0x01, AX
MULQ 8(CX)
MOVQ AX, R8
MOVQ DX, DI
// r1 += 38×l2×l4
MOVQ 16(CX), AX
IMUL3Q $0x26, AX, AX
MULQ 32(CX)
ADDQ AX, R8
ADCQ DX, DI
// r1 += 19×l3×l3
MOVQ 24(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 24(CX)
ADDQ AX, R8
ADCQ DX, DI
// r2 = 2×l0×l2
MOVQ (CX), AX
SHLQ $0x01, AX
MULQ 16(CX)
MOVQ AX, R10
MOVQ DX, R9
// r2 += l1×l1
MOVQ 8(CX), AX
MULQ 8(CX)
ADDQ AX, R10
ADCQ DX, R9
// r2 += 38×l3×l4
MOVQ 24(CX), AX
IMUL3Q $0x26, AX, AX
MULQ 32(CX)
ADDQ AX, R10
ADCQ DX, R9
// r3 = 2×l0×l3
MOVQ (CX), AX
SHLQ $0x01, AX
MULQ 24(CX)
MOVQ AX, R12
MOVQ DX, R11
// r3 += 2×l1×l2
MOVQ 8(CX), AX
IMUL3Q $0x02, AX, AX
MULQ 16(CX)
ADDQ AX, R12
ADCQ DX, R11
// r3 += 19×l4×l4
MOVQ 32(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 32(CX)
ADDQ AX, R12
ADCQ DX, R11
// r4 = 2×l0×l4
MOVQ (CX), AX
SHLQ $0x01, AX
MULQ 32(CX)
MOVQ AX, R14
MOVQ DX, R13
// r4 += 2×l1×l3
MOVQ 8(CX), AX
IMUL3Q $0x02, AX, AX
MULQ 24(CX)
ADDQ AX, R14
ADCQ DX, R13
// r4 += l2×l2
MOVQ 16(CX), AX
MULQ 16(CX)
ADDQ AX, R14
ADCQ DX, R13
// First reduction chain
MOVQ $0x0007ffffffffffff, AX
SHLQ $0x0d, SI, BX
SHLQ $0x0d, R8, DI
SHLQ $0x0d, R10, R9
SHLQ $0x0d, R12, R11
SHLQ $0x0d, R14, R13
ANDQ AX, SI
IMUL3Q $0x13, R13, R13
ADDQ R13, SI
ANDQ AX, R8
ADDQ BX, R8
ANDQ AX, R10
ADDQ DI, R10
ANDQ AX, R12
ADDQ R9, R12
ANDQ AX, R14
ADDQ R11, R14
// Second reduction chain (carryPropagate)
MOVQ SI, BX
SHRQ $0x33, BX
MOVQ R8, DI
SHRQ $0x33, DI
MOVQ R10, R9
SHRQ $0x33, R9
MOVQ R12, R11
SHRQ $0x33, R11
MOVQ R14, R13
SHRQ $0x33, R13
ANDQ AX, SI
IMUL3Q $0x13, R13, R13
ADDQ R13, SI
ANDQ AX, R8
ADDQ BX, R8
ANDQ AX, R10
ADDQ DI, R10
ANDQ AX, R12
ADDQ R9, R12
ANDQ AX, R14
ADDQ R11, R14
// Store output
MOVQ out+0(FP), AX
MOVQ SI, (AX)
MOVQ R8, 8(AX)
MOVQ R10, 16(AX)
MOVQ R12, 24(AX)
MOVQ R14, 32(AX)
RET

View File

@ -0,0 +1,12 @@
// Copyright (c) 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64 || !gc || purego
// +build !amd64 !gc purego
package field
func feMul(v, x, y *Element) { feMulGeneric(v, x, y) }
func feSquare(v, x *Element) { feSquareGeneric(v, x) }

View File

@ -0,0 +1,16 @@
// Copyright (c) 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build arm64 && gc && !purego
// +build arm64,gc,!purego
package field
//go:noescape
func carryPropagate(v *Element)
func (v *Element) carryPropagate() *Element {
carryPropagate(v)
return v
}

View File

@ -0,0 +1,43 @@
// Copyright (c) 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build arm64 && gc && !purego
// +build arm64,gc,!purego
#include "textflag.h"
// carryPropagate works exactly like carryPropagateGeneric and uses the
// same AND, ADD, and LSR+MADD instructions emitted by the compiler, but
// avoids loading R0-R4 twice and uses LDP and STP.
//
// See https://golang.org/issues/43145 for the main compiler issue.
//
// func carryPropagate(v *Element)
TEXT ·carryPropagate(SB),NOFRAME|NOSPLIT,$0-8
MOVD v+0(FP), R20
LDP 0(R20), (R0, R1)
LDP 16(R20), (R2, R3)
MOVD 32(R20), R4
AND $0x7ffffffffffff, R0, R10
AND $0x7ffffffffffff, R1, R11
AND $0x7ffffffffffff, R2, R12
AND $0x7ffffffffffff, R3, R13
AND $0x7ffffffffffff, R4, R14
ADD R0>>51, R11, R11
ADD R1>>51, R12, R12
ADD R2>>51, R13, R13
ADD R3>>51, R14, R14
// R4>>51 * 19 + R10 -> R10
LSR $51, R4, R21
MOVD $19, R22
MADD R22, R10, R21, R10
STP (R10, R11), 0(R20)
STP (R12, R13), 16(R20)
MOVD R14, 32(R20)
RET

View File

@ -0,0 +1,12 @@
// Copyright (c) 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !arm64 || !gc || purego
// +build !arm64 !gc purego
package field
func (v *Element) carryPropagate() *Element {
return v.carryPropagateGeneric()
}

View File

@ -0,0 +1,264 @@
// Copyright (c) 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package field
import "math/bits"
// uint128 holds a 128-bit number as two 64-bit limbs, for use with the
// bits.Mul64 and bits.Add64 intrinsics.
type uint128 struct {
lo, hi uint64
}
// mul64 returns a * b.
func mul64(a, b uint64) uint128 {
hi, lo := bits.Mul64(a, b)
return uint128{lo, hi}
}
// addMul64 returns v + a * b.
func addMul64(v uint128, a, b uint64) uint128 {
hi, lo := bits.Mul64(a, b)
lo, c := bits.Add64(lo, v.lo, 0)
hi, _ = bits.Add64(hi, v.hi, c)
return uint128{lo, hi}
}
// shiftRightBy51 returns a >> 51. a is assumed to be at most 115 bits.
func shiftRightBy51(a uint128) uint64 {
return (a.hi << (64 - 51)) | (a.lo >> 51)
}
func feMulGeneric(v, a, b *Element) {
a0 := a.l0
a1 := a.l1
a2 := a.l2
a3 := a.l3
a4 := a.l4
b0 := b.l0
b1 := b.l1
b2 := b.l2
b3 := b.l3
b4 := b.l4
// Limb multiplication works like pen-and-paper columnar multiplication, but
// with 51-bit limbs instead of digits.
//
// a4 a3 a2 a1 a0 x
// b4 b3 b2 b1 b0 =
// ------------------------
// a4b0 a3b0 a2b0 a1b0 a0b0 +
// a4b1 a3b1 a2b1 a1b1 a0b1 +
// a4b2 a3b2 a2b2 a1b2 a0b2 +
// a4b3 a3b3 a2b3 a1b3 a0b3 +
// a4b4 a3b4 a2b4 a1b4 a0b4 =
// ----------------------------------------------
// r8 r7 r6 r5 r4 r3 r2 r1 r0
//
// We can then use the reduction identity (a * 2²⁵⁵ + b = a * 19 + b) to
// reduce the limbs that would overflow 255 bits. r5 * 2²⁵⁵ becomes 19 * r5,
// r6 * 2³⁰⁶ becomes 19 * r6 * 2⁵¹, etc.
//
// Reduction can be carried out simultaneously to multiplication. For
// example, we do not compute r5: whenever the result of a multiplication
// belongs to r5, like a1b4, we multiply it by 19 and add the result to r0.
//
// a4b0 a3b0 a2b0 a1b0 a0b0 +
// a3b1 a2b1 a1b1 a0b1 19×a4b1 +
// a2b2 a1b2 a0b2 19×a4b2 19×a3b2 +
// a1b3 a0b3 19×a4b3 19×a3b3 19×a2b3 +
// a0b4 19×a4b4 19×a3b4 19×a2b4 19×a1b4 =
// --------------------------------------
// r4 r3 r2 r1 r0
//
// Finally we add up the columns into wide, overlapping limbs.
a1_19 := a1 * 19
a2_19 := a2 * 19
a3_19 := a3 * 19
a4_19 := a4 * 19
// r0 = a0×b0 + 19×(a1×b4 + a2×b3 + a3×b2 + a4×b1)
r0 := mul64(a0, b0)
r0 = addMul64(r0, a1_19, b4)
r0 = addMul64(r0, a2_19, b3)
r0 = addMul64(r0, a3_19, b2)
r0 = addMul64(r0, a4_19, b1)
// r1 = a0×b1 + a1×b0 + 19×(a2×b4 + a3×b3 + a4×b2)
r1 := mul64(a0, b1)
r1 = addMul64(r1, a1, b0)
r1 = addMul64(r1, a2_19, b4)
r1 = addMul64(r1, a3_19, b3)
r1 = addMul64(r1, a4_19, b2)
// r2 = a0×b2 + a1×b1 + a2×b0 + 19×(a3×b4 + a4×b3)
r2 := mul64(a0, b2)
r2 = addMul64(r2, a1, b1)
r2 = addMul64(r2, a2, b0)
r2 = addMul64(r2, a3_19, b4)
r2 = addMul64(r2, a4_19, b3)
// r3 = a0×b3 + a1×b2 + a2×b1 + a3×b0 + 19×a4×b4
r3 := mul64(a0, b3)
r3 = addMul64(r3, a1, b2)
r3 = addMul64(r3, a2, b1)
r3 = addMul64(r3, a3, b0)
r3 = addMul64(r3, a4_19, b4)
// r4 = a0×b4 + a1×b3 + a2×b2 + a3×b1 + a4×b0
r4 := mul64(a0, b4)
r4 = addMul64(r4, a1, b3)
r4 = addMul64(r4, a2, b2)
r4 = addMul64(r4, a3, b1)
r4 = addMul64(r4, a4, b0)
// After the multiplication, we need to reduce (carry) the five coefficients
// to obtain a result with limbs that are at most slightly larger than 2⁵¹,
// to respect the Element invariant.
//
// Overall, the reduction works the same as carryPropagate, except with
// wider inputs: we take the carry for each coefficient by shifting it right
// by 51, and add it to the limb above it. The top carry is multiplied by 19
// according to the reduction identity and added to the lowest limb.
//
// The largest coefficient (r0) will be at most 111 bits, which guarantees
// that all carries are at most 111 - 51 = 60 bits, which fits in a uint64.
//
// r0 = a0×b0 + 19×(a1×b4 + a2×b3 + a3×b2 + a4×b1)
// r0 < 2⁵²×2⁵² + 19×(2⁵²×2⁵² + 2⁵²×2⁵² + 2⁵²×2⁵² + 2⁵²×2⁵²)
// r0 < (1 + 19 × 4) × 2⁵² × 2⁵²
// r0 < 2⁷ × 2⁵² × 2⁵²
// r0 < 2¹¹¹
//
// Moreover, the top coefficient (r4) is at most 107 bits, so c4 is at most
// 56 bits, and c4 * 19 is at most 61 bits, which again fits in a uint64 and
// allows us to easily apply the reduction identity.
//
// r4 = a0×b4 + a1×b3 + a2×b2 + a3×b1 + a4×b0
// r4 < 5 × 2⁵² × 2⁵²
// r4 < 2¹⁰⁷
//
c0 := shiftRightBy51(r0)
c1 := shiftRightBy51(r1)
c2 := shiftRightBy51(r2)
c3 := shiftRightBy51(r3)
c4 := shiftRightBy51(r4)
rr0 := r0.lo&maskLow51Bits + c4*19
rr1 := r1.lo&maskLow51Bits + c0
rr2 := r2.lo&maskLow51Bits + c1
rr3 := r3.lo&maskLow51Bits + c2
rr4 := r4.lo&maskLow51Bits + c3
// Now all coefficients fit into 64-bit registers but are still too large to
// be passed around as a Element. We therefore do one last carry chain,
// where the carries will be small enough to fit in the wiggle room above 2⁵¹.
*v = Element{rr0, rr1, rr2, rr3, rr4}
v.carryPropagate()
}
func feSquareGeneric(v, a *Element) {
l0 := a.l0
l1 := a.l1
l2 := a.l2
l3 := a.l3
l4 := a.l4
// Squaring works precisely like multiplication above, but thanks to its
// symmetry we get to group a few terms together.
//
// l4 l3 l2 l1 l0 x
// l4 l3 l2 l1 l0 =
// ------------------------
// l4l0 l3l0 l2l0 l1l0 l0l0 +
// l4l1 l3l1 l2l1 l1l1 l0l1 +
// l4l2 l3l2 l2l2 l1l2 l0l2 +
// l4l3 l3l3 l2l3 l1l3 l0l3 +
// l4l4 l3l4 l2l4 l1l4 l0l4 =
// ----------------------------------------------
// r8 r7 r6 r5 r4 r3 r2 r1 r0
//
// l4l0 l3l0 l2l0 l1l0 l0l0 +
// l3l1 l2l1 l1l1 l0l1 19×l4l1 +
// l2l2 l1l2 l0l2 19×l4l2 19×l3l2 +
// l1l3 l0l3 19×l4l3 19×l3l3 19×l2l3 +
// l0l4 19×l4l4 19×l3l4 19×l2l4 19×l1l4 =
// --------------------------------------
// r4 r3 r2 r1 r0
//
// With precomputed 2×, 19×, and 2×19× terms, we can compute each limb with
// only three Mul64 and four Add64, instead of five and eight.
l0_2 := l0 * 2
l1_2 := l1 * 2
l1_38 := l1 * 38
l2_38 := l2 * 38
l3_38 := l3 * 38
l3_19 := l3 * 19
l4_19 := l4 * 19
// r0 = l0×l0 + 19×(l1×l4 + l2×l3 + l3×l2 + l4×l1) = l0×l0 + 19×2×(l1×l4 + l2×l3)
r0 := mul64(l0, l0)
r0 = addMul64(r0, l1_38, l4)
r0 = addMul64(r0, l2_38, l3)
// r1 = l0×l1 + l1×l0 + 19×(l2×l4 + l3×l3 + l4×l2) = 2×l0×l1 + 19×2×l2×l4 + 19×l3×l3
r1 := mul64(l0_2, l1)
r1 = addMul64(r1, l2_38, l4)
r1 = addMul64(r1, l3_19, l3)
// r2 = l0×l2 + l1×l1 + l2×l0 + 19×(l3×l4 + l4×l3) = 2×l0×l2 + l1×l1 + 19×2×l3×l4
r2 := mul64(l0_2, l2)
r2 = addMul64(r2, l1, l1)
r2 = addMul64(r2, l3_38, l4)
// r3 = l0×l3 + l1×l2 + l2×l1 + l3×l0 + 19×l4×l4 = 2×l0×l3 + 2×l1×l2 + 19×l4×l4
r3 := mul64(l0_2, l3)
r3 = addMul64(r3, l1_2, l2)
r3 = addMul64(r3, l4_19, l4)
// r4 = l0×l4 + l1×l3 + l2×l2 + l3×l1 + l4×l0 = 2×l0×l4 + 2×l1×l3 + l2×l2
r4 := mul64(l0_2, l4)
r4 = addMul64(r4, l1_2, l3)
r4 = addMul64(r4, l2, l2)
c0 := shiftRightBy51(r0)
c1 := shiftRightBy51(r1)
c2 := shiftRightBy51(r2)
c3 := shiftRightBy51(r3)
c4 := shiftRightBy51(r4)
rr0 := r0.lo&maskLow51Bits + c4*19
rr1 := r1.lo&maskLow51Bits + c0
rr2 := r2.lo&maskLow51Bits + c1
rr3 := r3.lo&maskLow51Bits + c2
rr4 := r4.lo&maskLow51Bits + c3
*v = Element{rr0, rr1, rr2, rr3, rr4}
v.carryPropagate()
}
// carryPropagate brings the limbs below 52 bits by applying the reduction
// identity (a * 2²⁵⁵ + b = a * 19 + b) to the l4 carry. TODO inline
func (v *Element) carryPropagateGeneric() *Element {
c0 := v.l0 >> 51
c1 := v.l1 >> 51
c2 := v.l2 >> 51
c3 := v.l3 >> 51
c4 := v.l4 >> 51
v.l0 = v.l0&maskLow51Bits + c4*19
v.l1 = v.l1&maskLow51Bits + c0
v.l2 = v.l2&maskLow51Bits + c1
v.l3 = v.l3&maskLow51Bits + c2
v.l4 = v.l4&maskLow51Bits + c3
return v
}

View File

@ -0,0 +1 @@
b0c49ae9f59d233526f8934262c5bbbe14d4358d

View File

@ -0,0 +1,19 @@
#! /bin/bash
set -euo pipefail
cd "$(git rev-parse --show-toplevel)"
STD_PATH=src/crypto/ed25519/internal/edwards25519/field
LOCAL_PATH=curve25519/internal/field
LAST_SYNC_REF=$(cat $LOCAL_PATH/sync.checkpoint)
git fetch https://go.googlesource.com/go master
if git diff --quiet $LAST_SYNC_REF:$STD_PATH FETCH_HEAD:$STD_PATH; then
echo "No changes."
else
NEW_REF=$(git rev-parse FETCH_HEAD | tee $LOCAL_PATH/sync.checkpoint)
echo "Applying changes from $LAST_SYNC_REF to $NEW_REF..."
git diff $LAST_SYNC_REF:$STD_PATH FETCH_HEAD:$STD_PATH | \
git apply -3 --directory=$LOCAL_PATH
fi

77
vendor/golang.org/x/crypto/pbkdf2/pbkdf2.go generated vendored Normal file
View File

@ -0,0 +1,77 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
/*
Package pbkdf2 implements the key derivation function PBKDF2 as defined in RFC
2898 / PKCS #5 v2.0.
A key derivation function is useful when encrypting data based on a password
or any other not-fully-random data. It uses a pseudorandom function to derive
a secure encryption key based on the password.
While v2.0 of the standard defines only one pseudorandom function to use,
HMAC-SHA1, the drafted v2.1 specification allows use of all five FIPS Approved
Hash Functions SHA-1, SHA-224, SHA-256, SHA-384 and SHA-512 for HMAC. To
choose, you can pass the `New` functions from the different SHA packages to
pbkdf2.Key.
*/
package pbkdf2 // import "golang.org/x/crypto/pbkdf2"
import (
"crypto/hmac"
"hash"
)
// Key derives a key from the password, salt and iteration count, returning a
// []byte of length keylen that can be used as cryptographic key. The key is
// derived based on the method described as PBKDF2 with the HMAC variant using
// the supplied hash function.
//
// For example, to use a HMAC-SHA-1 based PBKDF2 key derivation function, you
// can get a derived key for e.g. AES-256 (which needs a 32-byte key) by
// doing:
//
// dk := pbkdf2.Key([]byte("some password"), salt, 4096, 32, sha1.New)
//
// Remember to get a good random salt. At least 8 bytes is recommended by the
// RFC.
//
// Using a higher iteration count will increase the cost of an exhaustive
// search but will also make derivation proportionally slower.
func Key(password, salt []byte, iter, keyLen int, h func() hash.Hash) []byte {
prf := hmac.New(h, password)
hashLen := prf.Size()
numBlocks := (keyLen + hashLen - 1) / hashLen
var buf [4]byte
dk := make([]byte, 0, numBlocks*hashLen)
U := make([]byte, hashLen)
for block := 1; block <= numBlocks; block++ {
// N.B.: || means concatenation, ^ means XOR
// for each block T_i = U_1 ^ U_2 ^ ... ^ U_iter
// U_1 = PRF(password, salt || uint(i))
prf.Reset()
prf.Write(salt)
buf[0] = byte(block >> 24)
buf[1] = byte(block >> 16)
buf[2] = byte(block >> 8)
buf[3] = byte(block)
prf.Write(buf[:4])
dk = prf.Sum(dk)
T := dk[len(dk)-hashLen:]
copy(U, T)
// U_n = PRF(password, U_(n-1))
for n := 2; n <= iter; n++ {
prf.Reset()
prf.Write(U)
U = U[:0]
U = prf.Sum(U)
for x := range U {
T[x] ^= U[x]
}
}
}
return dk[:keyLen]
}

View File

@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
// +build gc,!purego
#include "textflag.h"

View File

@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
// +build gc,!purego
#include "textflag.h"
@ -82,7 +83,7 @@ multiply:
BGE loop
bytes_between_0_and_15:
CMP $0, R5
CMP R5, $0
BEQ done
MOVD $0, R16 // h0
MOVD $0, R17 // h1
@ -122,7 +123,7 @@ just1:
// Exactly 8
MOVD (R4), R16
CMP $0, R17
CMP R17, $0
// Check if we've already set R17; if not
// set 1 to indicate end of msg.
@ -151,7 +152,7 @@ less4:
ADD $2, R4
less2:
CMP $0, R5
CMP R5, $0
BEQ insert1
MOVBZ (R4), R21
SLD R22, R21, R21
@ -166,12 +167,12 @@ insert1:
carry:
// Add new values to h0, h1, h2
ADDC R16, R8
ADDE R17, R9
ADDE $0, R10
MOVD $16, R5
ADD R5, R4
BR multiply
ADDC R16, R8
ADDE R17, R9
ADDZE R10, R10
MOVD $16, R5
ADD R5, R4
BR multiply
done:
// Save h0, h1, h2 in state

View File

@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
// +build gc,!purego
#include "textflag.h"

View File

@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && !purego && gc
// +build amd64,!purego,gc
// This code was translated into a form compatible with 6a from the public

212
vendor/golang.org/x/crypto/scrypt/scrypt.go generated vendored Normal file
View File

@ -0,0 +1,212 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package scrypt implements the scrypt key derivation function as defined in
// Colin Percival's paper "Stronger Key Derivation via Sequential Memory-Hard
// Functions" (https://www.tarsnap.com/scrypt/scrypt.pdf).
package scrypt // import "golang.org/x/crypto/scrypt"
import (
"crypto/sha256"
"encoding/binary"
"errors"
"math/bits"
"golang.org/x/crypto/pbkdf2"
)
const maxInt = int(^uint(0) >> 1)
// blockCopy copies n numbers from src into dst.
func blockCopy(dst, src []uint32, n int) {
copy(dst, src[:n])
}
// blockXOR XORs numbers from dst with n numbers from src.
func blockXOR(dst, src []uint32, n int) {
for i, v := range src[:n] {
dst[i] ^= v
}
}
// salsaXOR applies Salsa20/8 to the XOR of 16 numbers from tmp and in,
// and puts the result into both tmp and out.
func salsaXOR(tmp *[16]uint32, in, out []uint32) {
w0 := tmp[0] ^ in[0]
w1 := tmp[1] ^ in[1]
w2 := tmp[2] ^ in[2]
w3 := tmp[3] ^ in[3]
w4 := tmp[4] ^ in[4]
w5 := tmp[5] ^ in[5]
w6 := tmp[6] ^ in[6]
w7 := tmp[7] ^ in[7]
w8 := tmp[8] ^ in[8]
w9 := tmp[9] ^ in[9]
w10 := tmp[10] ^ in[10]
w11 := tmp[11] ^ in[11]
w12 := tmp[12] ^ in[12]
w13 := tmp[13] ^ in[13]
w14 := tmp[14] ^ in[14]
w15 := tmp[15] ^ in[15]
x0, x1, x2, x3, x4, x5, x6, x7, x8 := w0, w1, w2, w3, w4, w5, w6, w7, w8
x9, x10, x11, x12, x13, x14, x15 := w9, w10, w11, w12, w13, w14, w15
for i := 0; i < 8; i += 2 {
x4 ^= bits.RotateLeft32(x0+x12, 7)
x8 ^= bits.RotateLeft32(x4+x0, 9)
x12 ^= bits.RotateLeft32(x8+x4, 13)
x0 ^= bits.RotateLeft32(x12+x8, 18)
x9 ^= bits.RotateLeft32(x5+x1, 7)
x13 ^= bits.RotateLeft32(x9+x5, 9)
x1 ^= bits.RotateLeft32(x13+x9, 13)
x5 ^= bits.RotateLeft32(x1+x13, 18)
x14 ^= bits.RotateLeft32(x10+x6, 7)
x2 ^= bits.RotateLeft32(x14+x10, 9)
x6 ^= bits.RotateLeft32(x2+x14, 13)
x10 ^= bits.RotateLeft32(x6+x2, 18)
x3 ^= bits.RotateLeft32(x15+x11, 7)
x7 ^= bits.RotateLeft32(x3+x15, 9)
x11 ^= bits.RotateLeft32(x7+x3, 13)
x15 ^= bits.RotateLeft32(x11+x7, 18)
x1 ^= bits.RotateLeft32(x0+x3, 7)
x2 ^= bits.RotateLeft32(x1+x0, 9)
x3 ^= bits.RotateLeft32(x2+x1, 13)
x0 ^= bits.RotateLeft32(x3+x2, 18)
x6 ^= bits.RotateLeft32(x5+x4, 7)
x7 ^= bits.RotateLeft32(x6+x5, 9)
x4 ^= bits.RotateLeft32(x7+x6, 13)
x5 ^= bits.RotateLeft32(x4+x7, 18)
x11 ^= bits.RotateLeft32(x10+x9, 7)
x8 ^= bits.RotateLeft32(x11+x10, 9)
x9 ^= bits.RotateLeft32(x8+x11, 13)
x10 ^= bits.RotateLeft32(x9+x8, 18)
x12 ^= bits.RotateLeft32(x15+x14, 7)
x13 ^= bits.RotateLeft32(x12+x15, 9)
x14 ^= bits.RotateLeft32(x13+x12, 13)
x15 ^= bits.RotateLeft32(x14+x13, 18)
}
x0 += w0
x1 += w1
x2 += w2
x3 += w3
x4 += w4
x5 += w5
x6 += w6
x7 += w7
x8 += w8
x9 += w9
x10 += w10
x11 += w11
x12 += w12
x13 += w13
x14 += w14
x15 += w15
out[0], tmp[0] = x0, x0
out[1], tmp[1] = x1, x1
out[2], tmp[2] = x2, x2
out[3], tmp[3] = x3, x3
out[4], tmp[4] = x4, x4
out[5], tmp[5] = x5, x5
out[6], tmp[6] = x6, x6
out[7], tmp[7] = x7, x7
out[8], tmp[8] = x8, x8
out[9], tmp[9] = x9, x9
out[10], tmp[10] = x10, x10
out[11], tmp[11] = x11, x11
out[12], tmp[12] = x12, x12
out[13], tmp[13] = x13, x13
out[14], tmp[14] = x14, x14
out[15], tmp[15] = x15, x15
}
func blockMix(tmp *[16]uint32, in, out []uint32, r int) {
blockCopy(tmp[:], in[(2*r-1)*16:], 16)
for i := 0; i < 2*r; i += 2 {
salsaXOR(tmp, in[i*16:], out[i*8:])
salsaXOR(tmp, in[i*16+16:], out[i*8+r*16:])
}
}
func integer(b []uint32, r int) uint64 {
j := (2*r - 1) * 16
return uint64(b[j]) | uint64(b[j+1])<<32
}
func smix(b []byte, r, N int, v, xy []uint32) {
var tmp [16]uint32
R := 32 * r
x := xy
y := xy[R:]
j := 0
for i := 0; i < R; i++ {
x[i] = binary.LittleEndian.Uint32(b[j:])
j += 4
}
for i := 0; i < N; i += 2 {
blockCopy(v[i*R:], x, R)
blockMix(&tmp, x, y, r)
blockCopy(v[(i+1)*R:], y, R)
blockMix(&tmp, y, x, r)
}
for i := 0; i < N; i += 2 {
j := int(integer(x, r) & uint64(N-1))
blockXOR(x, v[j*R:], R)
blockMix(&tmp, x, y, r)
j = int(integer(y, r) & uint64(N-1))
blockXOR(y, v[j*R:], R)
blockMix(&tmp, y, x, r)
}
j = 0
for _, v := range x[:R] {
binary.LittleEndian.PutUint32(b[j:], v)
j += 4
}
}
// Key derives a key from the password, salt, and cost parameters, returning
// a byte slice of length keyLen that can be used as cryptographic key.
//
// N is a CPU/memory cost parameter, which must be a power of two greater than 1.
// r and p must satisfy r * p < 2³⁰. If the parameters do not satisfy the
// limits, the function returns a nil byte slice and an error.
//
// For example, you can get a derived key for e.g. AES-256 (which needs a
// 32-byte key) by doing:
//
// dk, err := scrypt.Key([]byte("some password"), salt, 32768, 8, 1, 32)
//
// The recommended parameters for interactive logins as of 2017 are N=32768, r=8
// and p=1. The parameters N, r, and p should be increased as memory latency and
// CPU parallelism increases; consider setting N to the highest power of 2 you
// can derive within 100 milliseconds. Remember to get a good random salt.
func Key(password, salt []byte, N, r, p, keyLen int) ([]byte, error) {
if N <= 1 || N&(N-1) != 0 {
return nil, errors.New("scrypt: N must be > 1 and a power of 2")
}
if uint64(r)*uint64(p) >= 1<<30 || r > maxInt/128/p || r > maxInt/256 || N > maxInt/128/r {
return nil, errors.New("scrypt: parameters are too large")
}
xy := make([]uint32, 64*r)
v := make([]uint32, 32*N*r)
b := pbkdf2.Key(password, salt, 1, p*128*r, sha256.New)
for i := 0; i < p; i++ {
smix(b[i*128*r:], r, N, v, xy)
}
return pbkdf2.Key(password, b, 1, keyLen, sha256.New), nil
}

View File

@ -77,7 +77,7 @@ func NewClientConn(c net.Conn, addr string, config *ClientConfig) (Conn, <-chan
}
conn := &connection{
sshConn: sshConn{conn: c},
sshConn: sshConn{conn: c, user: fullConf.User},
}
if err := conn.clientHandshake(addr, &fullConf); err != nil {