Upgrading dependencies to include logrus.

Renan DelValle 2018-11-09 15:58:49 -08:00
parent bc28198c2d
commit c03901c0f1
No known key found for this signature in database
GPG key ID: C240AD6D6F443EC9
379 changed files with 90030 additions and 47 deletions

33
vendor/golang.org/x/crypto/poly1305/poly1305.go generated vendored Normal file

@@ -0,0 +1,33 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
/*
Package poly1305 implements the Poly1305 one-time message authentication code as
specified in https://cr.yp.to/mac/poly1305-20050329.pdf.
Poly1305 is a fast, one-time authentication function. It is infeasible for an
attacker to generate an authenticator for a message without the key. However, a
key must only be used for a single message. Authenticating two different
messages with the same key allows an attacker to forge authenticators for other
messages with the same key.
Poly1305 was originally coupled with AES in order to make Poly1305-AES. AES was
used with a fixed key in order to generate one-time keys from a nonce.
However, in this package AES isn't used and the one-time key is specified
directly.
*/
package poly1305 // import "golang.org/x/crypto/poly1305"
import "crypto/subtle"
// TagSize is the size, in bytes, of a poly1305 authenticator.
const TagSize = 16
// Verify returns true if mac is a valid authenticator for m with the given
// key.
func Verify(mac *[16]byte, m []byte, key *[32]byte) bool {
var tmp [16]byte
Sum(&tmp, m, key)
return subtle.ConstantTimeCompare(tmp[:], mac[:]) == 1
}
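
As a quick orientation for reviewers, here is a minimal usage sketch (not part of this commit) exercising the Sum and Verify API above against the Poly1305 test vector from RFC 7539, section 2.5.2; it assumes the package is imported from its upstream path golang.org/x/crypto/poly1305:

package main

import (
	"encoding/hex"
	"fmt"

	"golang.org/x/crypto/poly1305"
)

func main() {
	// Key from RFC 7539 §2.5.2: the r part followed by the s part.
	var key [32]byte
	k, err := hex.DecodeString("85d6be7857556d337f4452fe42d506a80103808afb0db2fd4abff6af4149f51b")
	if err != nil {
		panic(err)
	}
	copy(key[:], k)

	msg := []byte("Cryptographic Forum Research Group")

	// Compute the 16-byte tag, then check it in constant time.
	var tag [16]byte
	poly1305.Sum(&tag, msg, &key)
	fmt.Printf("%x\n", tag)                       // a8061dc1305136c6c22b8baf0c0127a9
	fmt.Println(poly1305.Verify(&tag, msg, &key)) // true
}

Remember the caveat from the package comment: each key must authenticate at most one message.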

132
vendor/golang.org/x/crypto/poly1305/poly1305_test.go generated vendored Normal file

@@ -0,0 +1,132 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package poly1305
import (
"encoding/hex"
"flag"
"testing"
"unsafe"
)
var stressFlag = flag.Bool("stress", false, "run slow stress tests")
type test struct {
in string
key string
tag string
}
func (t *test) Input() []byte {
in, err := hex.DecodeString(t.in)
if err != nil {
panic(err)
}
return in
}
func (t *test) Key() [32]byte {
buf, err := hex.DecodeString(t.key)
if err != nil {
panic(err)
}
var key [32]byte
copy(key[:], buf[:32])
return key
}
func (t *test) Tag() [16]byte {
buf, err := hex.DecodeString(t.tag)
if err != nil {
panic(err)
}
var tag [16]byte
copy(tag[:], buf[:16])
return tag
}
func testSum(t *testing.T, unaligned bool, sumImpl func(tag *[TagSize]byte, msg []byte, key *[32]byte)) {
var tag [16]byte
for i, v := range testData {
in := v.Input()
if unaligned {
in = unalignBytes(in)
}
key := v.Key()
sumImpl(&tag, in, &key)
if tag != v.Tag() {
t.Errorf("%d: expected %x, got %x", i, v.Tag(), tag[:])
}
}
}
func TestBurnin(t *testing.T) {
// This test can be used to sanity-check significant changes. It can
// take many minutes to run, even on fast machines. It's disabled
// by default.
if !*stressFlag {
t.Skip("skipping without -stress")
}
var key [32]byte
var input [25]byte
var output [16]byte
for i := range key {
key[i] = 1
}
for i := range input {
input[i] = 2
}
for i := uint64(0); i < 1e10; i++ {
Sum(&output, input[:], &key)
copy(key[0:], output[:])
copy(key[16:], output[:])
copy(input[:], output[:])
copy(input[16:], output[:])
}
const expected = "5e3b866aea0b636d240c83c428f84bfa"
if got := hex.EncodeToString(output[:]); got != expected {
t.Errorf("expected %s, got %s", expected, got)
}
}
func TestSum(t *testing.T) { testSum(t, false, Sum) }
func TestSumUnaligned(t *testing.T) { testSum(t, true, Sum) }
func TestSumGeneric(t *testing.T) { testSum(t, false, sumGeneric) }
func TestSumGenericUnaligned(t *testing.T) { testSum(t, true, sumGeneric) }
func benchmark(b *testing.B, size int, unaligned bool) {
var out [16]byte
var key [32]byte
in := make([]byte, size)
if unaligned {
in = unalignBytes(in)
}
b.SetBytes(int64(len(in)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
Sum(&out, in, &key)
}
}
func Benchmark64(b *testing.B) { benchmark(b, 64, false) }
func Benchmark1K(b *testing.B) { benchmark(b, 1024, false) }
func Benchmark64Unaligned(b *testing.B) { benchmark(b, 64, true) }
func Benchmark1KUnaligned(b *testing.B) { benchmark(b, 1024, true) }
func Benchmark2M(b *testing.B) { benchmark(b, 2097152, true) }
func unalignBytes(in []byte) []byte {
out := make([]byte, len(in)+1)
if uintptr(unsafe.Pointer(&out[0]))&(unsafe.Alignof(uint32(0))-1) == 0 {
out = out[1:]
} else {
out = out[:len(in)]
}
copy(out, in)
return out
}
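
Two practical notes on these tests (not part of the diff itself): TestBurnin only runs when the -stress flag defined above reaches the test binary (for example via go test's -args passthrough), and unalignBytes deliberately returns a buffer whose base address is not 4-byte aligned, so the *Unaligned tests and benchmarks exercise the assembly paths' handling of unaligned input.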

22
vendor/golang.org/x/crypto/poly1305/sum_amd64.go generated vendored Normal file

@@ -0,0 +1,22 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build amd64,!gccgo,!appengine
package poly1305
// This function is implemented in sum_amd64.s
//go:noescape
func poly1305(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
// Sum generates an authenticator for m using a one-time key and puts the
// 16-byte result into out. Authenticating two different messages with the same
// key allows an attacker to forge messages at will.
func Sum(out *[16]byte, m []byte, key *[32]byte) {
var mPtr *byte
if len(m) > 0 {
mPtr = &m[0]
}
poly1305(out, mPtr, uint64(len(m)), key)
}

125
vendor/golang.org/x/crypto/poly1305/sum_amd64.s generated vendored Normal file

@@ -0,0 +1,125 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build amd64,!gccgo,!appengine
#include "textflag.h"
#define POLY1305_ADD(msg, h0, h1, h2) \
ADDQ 0(msg), h0; \
ADCQ 8(msg), h1; \
ADCQ $1, h2; \
LEAQ 16(msg), msg
#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \
MOVQ r0, AX; \
MULQ h0; \
MOVQ AX, t0; \
MOVQ DX, t1; \
MOVQ r0, AX; \
MULQ h1; \
ADDQ AX, t1; \
ADCQ $0, DX; \
MOVQ r0, t2; \
IMULQ h2, t2; \
ADDQ DX, t2; \
\
MOVQ r1, AX; \
MULQ h0; \
ADDQ AX, t1; \
ADCQ $0, DX; \
MOVQ DX, h0; \
MOVQ r1, t3; \
IMULQ h2, t3; \
MOVQ r1, AX; \
MULQ h1; \
ADDQ AX, t2; \
ADCQ DX, t3; \
ADDQ h0, t2; \
ADCQ $0, t3; \
\
MOVQ t0, h0; \
MOVQ t1, h1; \
MOVQ t2, h2; \
ANDQ $3, h2; \
MOVQ t2, t0; \
ANDQ $0xFFFFFFFFFFFFFFFC, t0; \
ADDQ t0, h0; \
ADCQ t3, h1; \
ADCQ $0, h2; \
SHRQ $2, t3, t2; \
SHRQ $2, t3; \
ADDQ t2, h0; \
ADCQ t3, h1; \
ADCQ $0, h2
DATA ·poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
DATA ·poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
GLOBL ·poly1305Mask<>(SB), RODATA, $16
// func poly1305(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
TEXT ·poly1305(SB), $0-32
MOVQ out+0(FP), DI
MOVQ m+8(FP), SI
MOVQ mlen+16(FP), R15
MOVQ key+24(FP), AX
MOVQ 0(AX), R11
MOVQ 8(AX), R12
ANDQ ·poly1305Mask<>(SB), R11 // r0
ANDQ ·poly1305Mask<>+8(SB), R12 // r1
XORQ R8, R8 // h0
XORQ R9, R9 // h1
XORQ R10, R10 // h2
CMPQ R15, $16
JB bytes_between_0_and_15
loop:
POLY1305_ADD(SI, R8, R9, R10)
multiply:
POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14)
SUBQ $16, R15
CMPQ R15, $16
JAE loop
bytes_between_0_and_15:
TESTQ R15, R15
JZ done
MOVQ $1, BX
XORQ CX, CX
XORQ R13, R13
ADDQ R15, SI
flush_buffer:
SHLQ $8, BX, CX
SHLQ $8, BX
MOVB -1(SI), R13
XORQ R13, BX
DECQ SI
DECQ R15
JNZ flush_buffer
ADDQ BX, R8
ADCQ CX, R9
ADCQ $0, R10
MOVQ $16, R15
JMP multiply
done:
MOVQ R8, AX
MOVQ R9, BX
SUBQ $0xFFFFFFFFFFFFFFFB, AX
SBBQ $0xFFFFFFFFFFFFFFFF, BX
SBBQ $3, R10
CMOVQCS R8, AX
CMOVQCS R9, BX
MOVQ key+24(FP), R8
ADDQ 16(R8), AX
ADCQ 24(R8), BX
MOVQ AX, 0(DI)
MOVQ BX, 8(DI)
RET
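
The done: block above performs the final reduction branchlessly: it subtracts p = 2^130 - 5 (limbs 0xFFFFFFFFFFFFFFFB, 0xFFFFFFFFFFFFFFFF, 3) with a borrow chain and uses CMOVQCS to keep the original h when the subtraction underflows. A standalone Go sketch of the same idea (illustrative only, not part of this commit):

package main

import (
	"fmt"
	"math/bits"
)

func main() {
	// h = h2·2^128 + h1·2^64 + h0, guaranteed h < 2·(2^130-5) on entry.
	// Here h is set to exactly 2^130-5, so the reduced value must be 0.
	h0, h1, h2 := uint64(0xFFFFFFFFFFFFFFFB), uint64(0xFFFFFFFFFFFFFFFF), uint64(3)

	// Subtract p = 2^130-5 limb by limb, tracking the borrow (SUBQ/SBBQ/SBBQ).
	t0, b := bits.Sub64(h0, 0xFFFFFFFFFFFFFFFB, 0)
	t1, b := bits.Sub64(h1, 0xFFFFFFFFFFFFFFFF, b)
	_, b = bits.Sub64(h2, 3, b)

	// Branchless select, mirroring CMOVQCS: keep h if the subtraction
	// borrowed (h < p), otherwise take h-p. Only the low 128 bits survive.
	mask := -b // all ones when b == 1, zero when b == 0
	h0 = h0&mask | t0&^mask
	h1 = h1&mask | t1&^mask
	fmt.Println(h0, h1) // 0 0
}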

22
vendor/golang.org/x/crypto/poly1305/sum_arm.go generated vendored Normal file

@@ -0,0 +1,22 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build arm,!gccgo,!appengine,!nacl
package poly1305
// This function is implemented in sum_arm.s
//go:noescape
func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]byte)
// Sum generates an authenticator for m using a one-time key and puts the
// 16-byte result into out. Authenticating two different messages with the same
// key allows an attacker to forge messages at will.
func Sum(out *[16]byte, m []byte, key *[32]byte) {
var mPtr *byte
if len(m) > 0 {
mPtr = &m[0]
}
poly1305_auth_armv6(out, mPtr, uint32(len(m)), key)
}

427
vendor/golang.org/x/crypto/poly1305/sum_arm.s generated vendored Normal file

@@ -0,0 +1,427 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build arm,!gccgo,!appengine,!nacl
#include "textflag.h"
// This code was translated into a form compatible with 5a from the public
// domain source by Andrew Moon: github.com/floodyberry/poly1305-opt/blob/master/app/extensions/poly1305.
DATA ·poly1305_init_constants_armv6<>+0x00(SB)/4, $0x3ffffff
DATA ·poly1305_init_constants_armv6<>+0x04(SB)/4, $0x3ffff03
DATA ·poly1305_init_constants_armv6<>+0x08(SB)/4, $0x3ffc0ff
DATA ·poly1305_init_constants_armv6<>+0x0c(SB)/4, $0x3f03fff
DATA ·poly1305_init_constants_armv6<>+0x10(SB)/4, $0x00fffff
GLOBL ·poly1305_init_constants_armv6<>(SB), 8, $20
// Warning: the linker may use R11 to synthesize certain instructions. Please
// take care and verify that no synthetic instructions use it.
TEXT poly1305_init_ext_armv6<>(SB), NOSPLIT, $0
// Needs 16 bytes of stack and 64 bytes of space pointed to by R0. (It
// might look like it's only 60 bytes of space but the final four bytes
// will be written by another function.) We need to skip over four
// bytes of stack because that's saving the value of 'g'.
ADD $4, R13, R8
MOVM.IB [R4-R7], (R8)
MOVM.IA.W (R1), [R2-R5]
MOVW $·poly1305_init_constants_armv6<>(SB), R7
MOVW R2, R8
MOVW R2>>26, R9
MOVW R3>>20, g
MOVW R4>>14, R11
MOVW R5>>8, R12
ORR R3<<6, R9, R9
ORR R4<<12, g, g
ORR R5<<18, R11, R11
MOVM.IA (R7), [R2-R6]
AND R8, R2, R2
AND R9, R3, R3
AND g, R4, R4
AND R11, R5, R5
AND R12, R6, R6
MOVM.IA.W [R2-R6], (R0)
EOR R2, R2, R2
EOR R3, R3, R3
EOR R4, R4, R4
EOR R5, R5, R5
EOR R6, R6, R6
MOVM.IA.W [R2-R6], (R0)
MOVM.IA.W (R1), [R2-R5]
MOVM.IA [R2-R6], (R0)
ADD $20, R13, R0
MOVM.DA (R0), [R4-R7]
RET
#define MOVW_UNALIGNED(Rsrc, Rdst, Rtmp, offset) \
MOVBU (offset+0)(Rsrc), Rtmp; \
MOVBU Rtmp, (offset+0)(Rdst); \
MOVBU (offset+1)(Rsrc), Rtmp; \
MOVBU Rtmp, (offset+1)(Rdst); \
MOVBU (offset+2)(Rsrc), Rtmp; \
MOVBU Rtmp, (offset+2)(Rdst); \
MOVBU (offset+3)(Rsrc), Rtmp; \
MOVBU Rtmp, (offset+3)(Rdst)
TEXT poly1305_blocks_armv6<>(SB), NOSPLIT, $0
// Needs 24 bytes of stack for saved registers and then 88 bytes of
// scratch space after that. We assume that 24 bytes at (R13) have
// already been used: four bytes for the link register saved in the
// prelude of poly1305_auth_armv6, four bytes for saving the value of g
// in that function and 16 bytes of scratch space used around
// poly1305_finish_ext_armv6_skip1.
ADD $24, R13, R12
MOVM.IB [R4-R8, R14], (R12)
MOVW R0, 88(R13)
MOVW R1, 92(R13)
MOVW R2, 96(R13)
MOVW R1, R14
MOVW R2, R12
MOVW 56(R0), R8
WORD $0xe1180008 // TST R8, R8 not working see issue 5921
EOR R6, R6, R6
MOVW.EQ $(1<<24), R6
MOVW R6, 84(R13)
ADD $116, R13, g
MOVM.IA (R0), [R0-R9]
MOVM.IA [R0-R4], (g)
CMP $16, R12
BLO poly1305_blocks_armv6_done
poly1305_blocks_armv6_mainloop:
WORD $0xe31e0003 // TST R14, #3 not working see issue 5921
BEQ poly1305_blocks_armv6_mainloop_aligned
ADD $100, R13, g
MOVW_UNALIGNED(R14, g, R0, 0)
MOVW_UNALIGNED(R14, g, R0, 4)
MOVW_UNALIGNED(R14, g, R0, 8)
MOVW_UNALIGNED(R14, g, R0, 12)
MOVM.IA (g), [R0-R3]
ADD $16, R14
B poly1305_blocks_armv6_mainloop_loaded
poly1305_blocks_armv6_mainloop_aligned:
MOVM.IA.W (R14), [R0-R3]
poly1305_blocks_armv6_mainloop_loaded:
MOVW R0>>26, g
MOVW R1>>20, R11
MOVW R2>>14, R12
MOVW R14, 92(R13)
MOVW R3>>8, R4
ORR R1<<6, g, g
ORR R2<<12, R11, R11
ORR R3<<18, R12, R12
BIC $0xfc000000, R0, R0
BIC $0xfc000000, g, g
MOVW 84(R13), R3
BIC $0xfc000000, R11, R11
BIC $0xfc000000, R12, R12
ADD R0, R5, R5
ADD g, R6, R6
ORR R3, R4, R4
ADD R11, R7, R7
ADD $116, R13, R14
ADD R12, R8, R8
ADD R4, R9, R9
MOVM.IA (R14), [R0-R4]
MULLU R4, R5, (R11, g)
MULLU R3, R5, (R14, R12)
MULALU R3, R6, (R11, g)
MULALU R2, R6, (R14, R12)
MULALU R2, R7, (R11, g)
MULALU R1, R7, (R14, R12)
ADD R4<<2, R4, R4
ADD R3<<2, R3, R3
MULALU R1, R8, (R11, g)
MULALU R0, R8, (R14, R12)
MULALU R0, R9, (R11, g)
MULALU R4, R9, (R14, R12)
MOVW g, 76(R13)
MOVW R11, 80(R13)
MOVW R12, 68(R13)
MOVW R14, 72(R13)
MULLU R2, R5, (R11, g)
MULLU R1, R5, (R14, R12)
MULALU R1, R6, (R11, g)
MULALU R0, R6, (R14, R12)
MULALU R0, R7, (R11, g)
MULALU R4, R7, (R14, R12)
ADD R2<<2, R2, R2
ADD R1<<2, R1, R1
MULALU R4, R8, (R11, g)
MULALU R3, R8, (R14, R12)
MULALU R3, R9, (R11, g)
MULALU R2, R9, (R14, R12)
MOVW g, 60(R13)
MOVW R11, 64(R13)
MOVW R12, 52(R13)
MOVW R14, 56(R13)
MULLU R0, R5, (R11, g)
MULALU R4, R6, (R11, g)
MULALU R3, R7, (R11, g)
MULALU R2, R8, (R11, g)
MULALU R1, R9, (R11, g)
ADD $52, R13, R0
MOVM.IA (R0), [R0-R7]
MOVW g>>26, R12
MOVW R4>>26, R14
ORR R11<<6, R12, R12
ORR R5<<6, R14, R14
BIC $0xfc000000, g, g
BIC $0xfc000000, R4, R4
ADD.S R12, R0, R0
ADC $0, R1, R1
ADD.S R14, R6, R6
ADC $0, R7, R7
MOVW R0>>26, R12
MOVW R6>>26, R14
ORR R1<<6, R12, R12
ORR R7<<6, R14, R14
BIC $0xfc000000, R0, R0
BIC $0xfc000000, R6, R6
ADD R14<<2, R14, R14
ADD.S R12, R2, R2
ADC $0, R3, R3
ADD R14, g, g
MOVW R2>>26, R12
MOVW g>>26, R14
ORR R3<<6, R12, R12
BIC $0xfc000000, g, R5
BIC $0xfc000000, R2, R7
ADD R12, R4, R4
ADD R14, R0, R0
MOVW R4>>26, R12
BIC $0xfc000000, R4, R8
ADD R12, R6, R9
MOVW 96(R13), R12
MOVW 92(R13), R14
MOVW R0, R6
CMP $32, R12
SUB $16, R12, R12
MOVW R12, 96(R13)
BHS poly1305_blocks_armv6_mainloop
poly1305_blocks_armv6_done:
MOVW 88(R13), R12
MOVW R5, 20(R12)
MOVW R6, 24(R12)
MOVW R7, 28(R12)
MOVW R8, 32(R12)
MOVW R9, 36(R12)
ADD $48, R13, R0
MOVM.DA (R0), [R4-R8, R14]
RET
#define MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp) \
MOVBU.P 1(Rsrc), Rtmp; \
MOVBU.P Rtmp, 1(Rdst); \
MOVBU.P 1(Rsrc), Rtmp; \
MOVBU.P Rtmp, 1(Rdst)
#define MOVWP_UNALIGNED(Rsrc, Rdst, Rtmp) \
MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp); \
MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp)
// func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]byte)
TEXT ·poly1305_auth_armv6(SB), $196-16
// The value 196, just above, is the sum of 64 (the size of the context
// structure) and 132 (the amount of stack needed).
//
// At this point, the stack pointer (R13) has been moved down. It
// points to the saved link register and there's 196 bytes of free
// space above it.
//
// The stack for this function looks like:
//
// +---------------------
// |
// | 64 bytes of context structure
// |
// +---------------------
// |
// | 112 bytes for poly1305_blocks_armv6
// |
// +---------------------
// | 16 bytes of final block, constructed at
// | poly1305_finish_ext_armv6_skip8
// +---------------------
// | four bytes of saved 'g'
// +---------------------
// | lr, saved by prelude <- R13 points here
// +---------------------
MOVW g, 4(R13)
MOVW out+0(FP), R4
MOVW m+4(FP), R5
MOVW mlen+8(FP), R6
MOVW key+12(FP), R7
ADD $136, R13, R0 // 136 = 4 + 4 + 16 + 112
MOVW R7, R1
// poly1305_init_ext_armv6 will write to the stack from R13+4, but
// that's ok because none of the other values have been written yet.
BL poly1305_init_ext_armv6<>(SB)
BIC.S $15, R6, R2
BEQ poly1305_auth_armv6_noblocks
ADD $136, R13, R0
MOVW R5, R1
ADD R2, R5, R5
SUB R2, R6, R6
BL poly1305_blocks_armv6<>(SB)
poly1305_auth_armv6_noblocks:
ADD $136, R13, R0
MOVW R5, R1
MOVW R6, R2
MOVW R4, R3
MOVW R0, R5
MOVW R1, R6
MOVW R2, R7
MOVW R3, R8
AND.S R2, R2, R2
BEQ poly1305_finish_ext_armv6_noremaining
EOR R0, R0
ADD $8, R13, R9 // 8 = offset to 16 byte scratch space
MOVW R0, (R9)
MOVW R0, 4(R9)
MOVW R0, 8(R9)
MOVW R0, 12(R9)
WORD $0xe3110003 // TST R1, #3 not working see issue 5921
BEQ poly1305_finish_ext_armv6_aligned
WORD $0xe3120008 // TST R2, #8 not working see issue 5921
BEQ poly1305_finish_ext_armv6_skip8
MOVWP_UNALIGNED(R1, R9, g)
MOVWP_UNALIGNED(R1, R9, g)
poly1305_finish_ext_armv6_skip8:
WORD $0xe3120004 // TST $4, R2 not working see issue 5921
BEQ poly1305_finish_ext_armv6_skip4
MOVWP_UNALIGNED(R1, R9, g)
poly1305_finish_ext_armv6_skip4:
WORD $0xe3120002 // TST $2, R2 not working see issue 5921
BEQ poly1305_finish_ext_armv6_skip2
MOVHUP_UNALIGNED(R1, R9, g)
B poly1305_finish_ext_armv6_skip2
poly1305_finish_ext_armv6_aligned:
WORD $0xe3120008 // TST R2, #8 not working see issue 5921
BEQ poly1305_finish_ext_armv6_skip8_aligned
MOVM.IA.W (R1), [g-R11]
MOVM.IA.W [g-R11], (R9)
poly1305_finish_ext_armv6_skip8_aligned:
WORD $0xe3120004 // TST $4, R2 not working see issue 5921
BEQ poly1305_finish_ext_armv6_skip4_aligned
MOVW.P 4(R1), g
MOVW.P g, 4(R9)
poly1305_finish_ext_armv6_skip4_aligned:
WORD $0xe3120002 // TST $2, R2 not working see issue 5921
BEQ poly1305_finish_ext_armv6_skip2
MOVHU.P 2(R1), g
MOVH.P g, 2(R9)
poly1305_finish_ext_armv6_skip2:
WORD $0xe3120001 // TST $1, R2 not working see issue 5921
BEQ poly1305_finish_ext_armv6_skip1
MOVBU.P 1(R1), g
MOVBU.P g, 1(R9)
poly1305_finish_ext_armv6_skip1:
MOVW $1, R11
MOVBU R11, 0(R9)
MOVW R11, 56(R5)
MOVW R5, R0
ADD $8, R13, R1
MOVW $16, R2
BL poly1305_blocks_armv6<>(SB)
poly1305_finish_ext_armv6_noremaining:
MOVW 20(R5), R0
MOVW 24(R5), R1
MOVW 28(R5), R2
MOVW 32(R5), R3
MOVW 36(R5), R4
MOVW R4>>26, R12
BIC $0xfc000000, R4, R4
ADD R12<<2, R12, R12
ADD R12, R0, R0
MOVW R0>>26, R12
BIC $0xfc000000, R0, R0
ADD R12, R1, R1
MOVW R1>>26, R12
BIC $0xfc000000, R1, R1
ADD R12, R2, R2
MOVW R2>>26, R12
BIC $0xfc000000, R2, R2
ADD R12, R3, R3
MOVW R3>>26, R12
BIC $0xfc000000, R3, R3
ADD R12, R4, R4
ADD $5, R0, R6
MOVW R6>>26, R12
BIC $0xfc000000, R6, R6
ADD R12, R1, R7
MOVW R7>>26, R12
BIC $0xfc000000, R7, R7
ADD R12, R2, g
MOVW g>>26, R12
BIC $0xfc000000, g, g
ADD R12, R3, R11
MOVW $-(1<<26), R12
ADD R11>>26, R12, R12
BIC $0xfc000000, R11, R11
ADD R12, R4, R9
MOVW R9>>31, R12
SUB $1, R12
AND R12, R6, R6
AND R12, R7, R7
AND R12, g, g
AND R12, R11, R11
AND R12, R9, R9
MVN R12, R12
AND R12, R0, R0
AND R12, R1, R1
AND R12, R2, R2
AND R12, R3, R3
AND R12, R4, R4
ORR R6, R0, R0
ORR R7, R1, R1
ORR g, R2, R2
ORR R11, R3, R3
ORR R9, R4, R4
ORR R1<<26, R0, R0
MOVW R1>>6, R1
ORR R2<<20, R1, R1
MOVW R2>>12, R2
ORR R3<<14, R2, R2
MOVW R3>>18, R3
ORR R4<<8, R3, R3
MOVW 40(R5), R6
MOVW 44(R5), R7
MOVW 48(R5), g
MOVW 52(R5), R11
ADD.S R6, R0, R0
ADC.S R7, R1, R1
ADC.S g, R2, R2
ADC.S R11, R3, R3
MOVM.IA [R0-R3], (R8)
MOVW R5, R12
EOR R0, R0, R0
EOR R1, R1, R1
EOR R2, R2, R2
EOR R3, R3, R3
EOR R4, R4, R4
EOR R5, R5, R5
EOR R6, R6, R6
EOR R7, R7, R7
MOVM.IA.W [R0-R7], (R12)
MOVM.IA [R0-R7], (R12)
MOVW 4(R13), g
RET

14
vendor/golang.org/x/crypto/poly1305/sum_noasm.go generated vendored Normal file

@@ -0,0 +1,14 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build s390x,!go1.11 !arm,!amd64,!s390x gccgo appengine nacl
package poly1305
// Sum generates an authenticator for msg using a one-time key and puts the
// 16-byte result into out. Authenticating two different messages with the same
// key allows an attacker to forge messages at will.
func Sum(out *[TagSize]byte, msg []byte, key *[32]byte) {
sumGeneric(out, msg, key)
}
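
For readers unfamiliar with build-tag syntax (comma = AND, space = OR, ! = NOT): the constraint above selects this generic fallback when building for s390x with a Go release older than 1.11, for any architecture other than arm/amd64/s390x, or under gccgo, App Engine, or NaCl — in other words, whenever none of the assembly implementations in this directory applies.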

139
vendor/golang.org/x/crypto/poly1305/sum_ref.go generated vendored Normal file

@@ -0,0 +1,139 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package poly1305
import "encoding/binary"
// sumGeneric generates an authenticator for msg using a one-time key and
// puts the 16-byte result into out. This is the generic implementation of
// Sum and should be called if no assembly implementation is available.
func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) {
var (
h0, h1, h2, h3, h4 uint32 // the hash accumulators
r0, r1, r2, r3, r4 uint64 // the r part of the key
)
r0 = uint64(binary.LittleEndian.Uint32(key[0:]) & 0x3ffffff)
r1 = uint64((binary.LittleEndian.Uint32(key[3:]) >> 2) & 0x3ffff03)
r2 = uint64((binary.LittleEndian.Uint32(key[6:]) >> 4) & 0x3ffc0ff)
r3 = uint64((binary.LittleEndian.Uint32(key[9:]) >> 6) & 0x3f03fff)
r4 = uint64((binary.LittleEndian.Uint32(key[12:]) >> 8) & 0x00fffff)
R1, R2, R3, R4 := r1*5, r2*5, r3*5, r4*5
for len(msg) >= TagSize {
// h += msg
h0 += binary.LittleEndian.Uint32(msg[0:]) & 0x3ffffff
h1 += (binary.LittleEndian.Uint32(msg[3:]) >> 2) & 0x3ffffff
h2 += (binary.LittleEndian.Uint32(msg[6:]) >> 4) & 0x3ffffff
h3 += (binary.LittleEndian.Uint32(msg[9:]) >> 6) & 0x3ffffff
h4 += (binary.LittleEndian.Uint32(msg[12:]) >> 8) | (1 << 24)
// h *= r
d0 := (uint64(h0) * r0) + (uint64(h1) * R4) + (uint64(h2) * R3) + (uint64(h3) * R2) + (uint64(h4) * R1)
d1 := (d0 >> 26) + (uint64(h0) * r1) + (uint64(h1) * r0) + (uint64(h2) * R4) + (uint64(h3) * R3) + (uint64(h4) * R2)
d2 := (d1 >> 26) + (uint64(h0) * r2) + (uint64(h1) * r1) + (uint64(h2) * r0) + (uint64(h3) * R4) + (uint64(h4) * R3)
d3 := (d2 >> 26) + (uint64(h0) * r3) + (uint64(h1) * r2) + (uint64(h2) * r1) + (uint64(h3) * r0) + (uint64(h4) * R4)
d4 := (d3 >> 26) + (uint64(h0) * r4) + (uint64(h1) * r3) + (uint64(h2) * r2) + (uint64(h3) * r1) + (uint64(h4) * r0)
// h %= p
h0 = uint32(d0) & 0x3ffffff
h1 = uint32(d1) & 0x3ffffff
h2 = uint32(d2) & 0x3ffffff
h3 = uint32(d3) & 0x3ffffff
h4 = uint32(d4) & 0x3ffffff
h0 += uint32(d4>>26) * 5
h1 += h0 >> 26
h0 = h0 & 0x3ffffff
msg = msg[TagSize:]
}
if len(msg) > 0 {
var block [TagSize]byte
off := copy(block[:], msg)
block[off] = 0x01
// h += msg
h0 += binary.LittleEndian.Uint32(block[0:]) & 0x3ffffff
h1 += (binary.LittleEndian.Uint32(block[3:]) >> 2) & 0x3ffffff
h2 += (binary.LittleEndian.Uint32(block[6:]) >> 4) & 0x3ffffff
h3 += (binary.LittleEndian.Uint32(block[9:]) >> 6) & 0x3ffffff
h4 += (binary.LittleEndian.Uint32(block[12:]) >> 8)
// h *= r
d0 := (uint64(h0) * r0) + (uint64(h1) * R4) + (uint64(h2) * R3) + (uint64(h3) * R2) + (uint64(h4) * R1)
d1 := (d0 >> 26) + (uint64(h0) * r1) + (uint64(h1) * r0) + (uint64(h2) * R4) + (uint64(h3) * R3) + (uint64(h4) * R2)
d2 := (d1 >> 26) + (uint64(h0) * r2) + (uint64(h1) * r1) + (uint64(h2) * r0) + (uint64(h3) * R4) + (uint64(h4) * R3)
d3 := (d2 >> 26) + (uint64(h0) * r3) + (uint64(h1) * r2) + (uint64(h2) * r1) + (uint64(h3) * r0) + (uint64(h4) * R4)
d4 := (d3 >> 26) + (uint64(h0) * r4) + (uint64(h1) * r3) + (uint64(h2) * r2) + (uint64(h3) * r1) + (uint64(h4) * r0)
// h %= p
h0 = uint32(d0) & 0x3ffffff
h1 = uint32(d1) & 0x3ffffff
h2 = uint32(d2) & 0x3ffffff
h3 = uint32(d3) & 0x3ffffff
h4 = uint32(d4) & 0x3ffffff
h0 += uint32(d4>>26) * 5
h1 += h0 >> 26
h0 = h0 & 0x3ffffff
}
// h %= p reduction
h2 += h1 >> 26
h1 &= 0x3ffffff
h3 += h2 >> 26
h2 &= 0x3ffffff
h4 += h3 >> 26
h3 &= 0x3ffffff
h0 += 5 * (h4 >> 26)
h4 &= 0x3ffffff
h1 += h0 >> 26
h0 &= 0x3ffffff
// h - p
t0 := h0 + 5
t1 := h1 + (t0 >> 26)
t2 := h2 + (t1 >> 26)
t3 := h3 + (t2 >> 26)
t4 := h4 + (t3 >> 26) - (1 << 26)
t0 &= 0x3ffffff
t1 &= 0x3ffffff
t2 &= 0x3ffffff
t3 &= 0x3ffffff
// select h if h < p else h - p
t_mask := (t4 >> 31) - 1
h_mask := ^t_mask
h0 = (h0 & h_mask) | (t0 & t_mask)
h1 = (h1 & h_mask) | (t1 & t_mask)
h2 = (h2 & h_mask) | (t2 & t_mask)
h3 = (h3 & h_mask) | (t3 & t_mask)
h4 = (h4 & h_mask) | (t4 & t_mask)
// h %= 2^128
h0 |= h1 << 26
h1 = ((h1 >> 6) | (h2 << 20))
h2 = ((h2 >> 12) | (h3 << 14))
h3 = ((h3 >> 18) | (h4 << 8))
// s: the s part of the key
// tag = (h + s) % (2^128)
t := uint64(h0) + uint64(binary.LittleEndian.Uint32(key[16:]))
h0 = uint32(t)
t = uint64(h1) + uint64(binary.LittleEndian.Uint32(key[20:])) + (t >> 32)
h1 = uint32(t)
t = uint64(h2) + uint64(binary.LittleEndian.Uint32(key[24:])) + (t >> 32)
h2 = uint32(t)
t = uint64(h3) + uint64(binary.LittleEndian.Uint32(key[28:])) + (t >> 32)
h3 = uint32(t)
binary.LittleEndian.PutUint32(out[0:], h0)
binary.LittleEndian.PutUint32(out[4:], h1)
binary.LittleEndian.PutUint32(out[8:], h2)
binary.LittleEndian.PutUint32(out[12:], h3)
}
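
The repeated h0 += uint32(d4>>26) * 5 folds above (and the precomputed R1..R4 = r1*5..r4*5) rely on the identity 2^130 ≡ 5 (mod 2^130-5): with 26-bit limbs, bit 26 of the top limb is bit 130 of h, so anything carried past it can be wrapped back in by multiplying the carry by 5. A quick standalone check of that identity (illustrative, not part of this commit):

package main

import (
	"fmt"
	"math/big"
)

func main() {
	one := big.NewInt(1)
	// p = 2^130 - 5, the Poly1305 prime.
	p := new(big.Int).Sub(new(big.Int).Lsh(one, 130), big.NewInt(5))
	// 2^130 mod p == 5, which justifies the "carry * 5" folds.
	fmt.Println(new(big.Int).Mod(new(big.Int).Lsh(one, 130), p)) // 5
}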

49
vendor/golang.org/x/crypto/poly1305/sum_s390x.go generated vendored Normal file

@@ -0,0 +1,49 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build s390x,go1.11,!gccgo,!appengine
package poly1305
// hasVectorFacility reports whether the machine supports
// the vector facility (vx).
func hasVectorFacility() bool
// hasVMSLFacility reports whether the machine supports
// Vector Multiply Sum Logical (VMSL).
func hasVMSLFacility() bool
var hasVX = hasVectorFacility()
var hasVMSL = hasVMSLFacility()
// poly1305vx is an assembly implementation of Poly1305 that uses vector
// instructions. It must only be called if the vector facility (vx) is
// available.
//go:noescape
func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
// poly1305vmsl is an assembly implementation of Poly1305 that uses vector
// instructions, including VMSL. It must only be called if the vector facility (vx) is
// available and if VMSL is supported.
//go:noescape
func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
// Sum generates an authenticator for m using a one-time key and puts the
// 16-byte result into out. Authenticating two different messages with the same
// key allows an attacker to forge messages at will.
func Sum(out *[16]byte, m []byte, key *[32]byte) {
if hasVX {
var mPtr *byte
if len(m) > 0 {
mPtr = &m[0]
}
if hasVMSL && len(m) > 256 {
poly1305vmsl(out, mPtr, uint64(len(m)), key)
} else {
poly1305vx(out, mPtr, uint64(len(m)), key)
}
} else {
sumGeneric(out, m, key)
}
}
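
Dispatch note (not in the diff itself): as the code above shows, the VMSL path is only taken for messages longer than 256 bytes, where its deeper per-iteration pipeline pays off; shorter messages use the plain vx path, and machines without the vector facility fall back to sumGeneric from sum_ref.go.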

400
vendor/golang.org/x/crypto/poly1305/sum_s390x.s generated vendored Normal file

@@ -0,0 +1,400 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build s390x,go1.11,!gccgo,!appengine
#include "textflag.h"
// Implementation of Poly1305 using the vector facility (vx).
// constants
#define MOD26 V0
#define EX0 V1
#define EX1 V2
#define EX2 V3
// temporaries
#define T_0 V4
#define T_1 V5
#define T_2 V6
#define T_3 V7
#define T_4 V8
// key (r)
#define R_0 V9
#define R_1 V10
#define R_2 V11
#define R_3 V12
#define R_4 V13
#define R5_1 V14
#define R5_2 V15
#define R5_3 V16
#define R5_4 V17
#define RSAVE_0 R5
#define RSAVE_1 R6
#define RSAVE_2 R7
#define RSAVE_3 R8
#define RSAVE_4 R9
#define R5SAVE_1 V28
#define R5SAVE_2 V29
#define R5SAVE_3 V30
#define R5SAVE_4 V31
// message block
#define F_0 V18
#define F_1 V19
#define F_2 V20
#define F_3 V21
#define F_4 V22
// accumulator
#define H_0 V23
#define H_1 V24
#define H_2 V25
#define H_3 V26
#define H_4 V27
GLOBL ·keyMask<>(SB), RODATA, $16
DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
GLOBL ·bswapMask<>(SB), RODATA, $16
DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
GLOBL ·constants<>(SB), RODATA, $64
// MOD26
DATA ·constants<>+0(SB)/8, $0x3ffffff
DATA ·constants<>+8(SB)/8, $0x3ffffff
// EX0
DATA ·constants<>+16(SB)/8, $0x0006050403020100
DATA ·constants<>+24(SB)/8, $0x1016151413121110
// EX1
DATA ·constants<>+32(SB)/8, $0x060c0b0a09080706
DATA ·constants<>+40(SB)/8, $0x161c1b1a19181716
// EX2
DATA ·constants<>+48(SB)/8, $0x0d0d0d0d0d0f0e0d
DATA ·constants<>+56(SB)/8, $0x1d1d1d1d1d1f1e1d
// h = (f*g) % (2**130-5) [partial reduction]
#define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \
VMLOF f0, g0, h0 \
VMLOF f0, g1, h1 \
VMLOF f0, g2, h2 \
VMLOF f0, g3, h3 \
VMLOF f0, g4, h4 \
VMLOF f1, g54, T_0 \
VMLOF f1, g0, T_1 \
VMLOF f1, g1, T_2 \
VMLOF f1, g2, T_3 \
VMLOF f1, g3, T_4 \
VMALOF f2, g53, h0, h0 \
VMALOF f2, g54, h1, h1 \
VMALOF f2, g0, h2, h2 \
VMALOF f2, g1, h3, h3 \
VMALOF f2, g2, h4, h4 \
VMALOF f3, g52, T_0, T_0 \
VMALOF f3, g53, T_1, T_1 \
VMALOF f3, g54, T_2, T_2 \
VMALOF f3, g0, T_3, T_3 \
VMALOF f3, g1, T_4, T_4 \
VMALOF f4, g51, h0, h0 \
VMALOF f4, g52, h1, h1 \
VMALOF f4, g53, h2, h2 \
VMALOF f4, g54, h3, h3 \
VMALOF f4, g0, h4, h4 \
VAG T_0, h0, h0 \
VAG T_1, h1, h1 \
VAG T_2, h2, h2 \
VAG T_3, h3, h3 \
VAG T_4, h4, h4
// carry h0->h1 h3->h4, h1->h2 h4->h0, h0->h1 h2->h3, h3->h4
#define REDUCE(h0, h1, h2, h3, h4) \
VESRLG $26, h0, T_0 \
VESRLG $26, h3, T_1 \
VN MOD26, h0, h0 \
VN MOD26, h3, h3 \
VAG T_0, h1, h1 \
VAG T_1, h4, h4 \
VESRLG $26, h1, T_2 \
VESRLG $26, h4, T_3 \
VN MOD26, h1, h1 \
VN MOD26, h4, h4 \
VESLG $2, T_3, T_4 \
VAG T_3, T_4, T_4 \
VAG T_2, h2, h2 \
VAG T_4, h0, h0 \
VESRLG $26, h2, T_0 \
VESRLG $26, h0, T_1 \
VN MOD26, h2, h2 \
VN MOD26, h0, h0 \
VAG T_0, h3, h3 \
VAG T_1, h1, h1 \
VESRLG $26, h3, T_2 \
VN MOD26, h3, h3 \
VAG T_2, h4, h4
// expand in0 into d[0] and in1 into d[1]
#define EXPAND(in0, in1, d0, d1, d2, d3, d4) \
VGBM $0x0707, d1 \ // d1=tmp
VPERM in0, in1, EX2, d4 \
VPERM in0, in1, EX0, d0 \
VPERM in0, in1, EX1, d2 \
VN d1, d4, d4 \
VESRLG $26, d0, d1 \
VESRLG $30, d2, d3 \
VESRLG $4, d2, d2 \
VN MOD26, d0, d0 \
VN MOD26, d1, d1 \
VN MOD26, d2, d2 \
VN MOD26, d3, d3
// pack h4:h0 into h1:h0 (no carry)
#define PACK(h0, h1, h2, h3, h4) \
VESLG $26, h1, h1 \
VESLG $26, h3, h3 \
VO h0, h1, h0 \
VO h2, h3, h2 \
VESLG $4, h2, h2 \
VLEIB $7, $48, h1 \
VSLB h1, h2, h2 \
VO h0, h2, h0 \
VLEIB $7, $104, h1 \
VSLB h1, h4, h3 \
VO h3, h0, h0 \
VLEIB $7, $24, h1 \
VSRLB h1, h4, h1
// if h > 2**130-5 then h -= 2**130-5
#define MOD(h0, h1, t0, t1, t2) \
VZERO t0 \
VLEIG $1, $5, t0 \
VACCQ h0, t0, t1 \
VAQ h0, t0, t0 \
VONE t2 \
VLEIG $1, $-4, t2 \
VAQ t2, t1, t1 \
VACCQ h1, t1, t1 \
VONE t2 \
VAQ t2, t1, t1 \
VN h0, t1, t2 \
VNC t0, t1, t1 \
VO t1, t2, h0
// func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
TEXT ·poly1305vx(SB), $0-32
// This code processes up to 2 blocks (32 bytes) per iteration
// using the algorithm described in:
// NEON crypto, Daniel J. Bernstein & Peter Schwabe
// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
// load MOD26, EX0, EX1 and EX2
MOVD $·constants<>(SB), R5
VLM (R5), MOD26, EX2
// setup r
VL (R4), T_0
MOVD $·keyMask<>(SB), R6
VL (R6), T_1
VN T_0, T_1, T_0
EXPAND(T_0, T_0, R_0, R_1, R_2, R_3, R_4)
// setup r*5
VLEIG $0, $5, T_0
VLEIG $1, $5, T_0
// store r (for final block)
VMLOF T_0, R_1, R5SAVE_1
VMLOF T_0, R_2, R5SAVE_2
VMLOF T_0, R_3, R5SAVE_3
VMLOF T_0, R_4, R5SAVE_4
VLGVG $0, R_0, RSAVE_0
VLGVG $0, R_1, RSAVE_1
VLGVG $0, R_2, RSAVE_2
VLGVG $0, R_3, RSAVE_3
VLGVG $0, R_4, RSAVE_4
// skip r**2 calculation
CMPBLE R3, $16, skip
// calculate r**2
MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5SAVE_1, R5SAVE_2, R5SAVE_3, R5SAVE_4, H_0, H_1, H_2, H_3, H_4)
REDUCE(H_0, H_1, H_2, H_3, H_4)
VLEIG $0, $5, T_0
VLEIG $1, $5, T_0
VMLOF T_0, H_1, R5_1
VMLOF T_0, H_2, R5_2
VMLOF T_0, H_3, R5_3
VMLOF T_0, H_4, R5_4
VLR H_0, R_0
VLR H_1, R_1
VLR H_2, R_2
VLR H_3, R_3
VLR H_4, R_4
// initialize h
VZERO H_0
VZERO H_1
VZERO H_2
VZERO H_3
VZERO H_4
loop:
CMPBLE R3, $32, b2
VLM (R2), T_0, T_1
SUB $32, R3
MOVD $32(R2), R2
EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
VLEIB $4, $1, F_4
VLEIB $12, $1, F_4
multiply:
VAG H_0, F_0, F_0
VAG H_1, F_1, F_1
VAG H_2, F_2, F_2
VAG H_3, F_3, F_3
VAG H_4, F_4, F_4
MULTIPLY(F_0, F_1, F_2, F_3, F_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4)
REDUCE(H_0, H_1, H_2, H_3, H_4)
CMPBNE R3, $0, loop
finish:
// sum vectors
VZERO T_0
VSUMQG H_0, T_0, H_0
VSUMQG H_1, T_0, H_1
VSUMQG H_2, T_0, H_2
VSUMQG H_3, T_0, H_3
VSUMQG H_4, T_0, H_4
// h may be >= 2*(2**130-5) so we need to reduce it again
REDUCE(H_0, H_1, H_2, H_3, H_4)
// carry h1->h4
VESRLG $26, H_1, T_1
VN MOD26, H_1, H_1
VAQ T_1, H_2, H_2
VESRLG $26, H_2, T_2
VN MOD26, H_2, H_2
VAQ T_2, H_3, H_3
VESRLG $26, H_3, T_3
VN MOD26, H_3, H_3
VAQ T_3, H_4, H_4
// h is now < 2*(2**130-5)
// pack h into h1 (hi) and h0 (lo)
PACK(H_0, H_1, H_2, H_3, H_4)
// if h > 2**130-5 then h -= 2**130-5
MOD(H_0, H_1, T_0, T_1, T_2)
// h += s
MOVD $·bswapMask<>(SB), R5
VL (R5), T_1
VL 16(R4), T_0
VPERM T_0, T_0, T_1, T_0 // reverse bytes (to big)
VAQ T_0, H_0, H_0
VPERM H_0, H_0, T_1, H_0 // reverse bytes (to little)
VST H_0, (R1)
RET
b2:
CMPBLE R3, $16, b1
// 2 blocks remaining
SUB $17, R3
VL (R2), T_0
VLL R3, 16(R2), T_1
ADD $1, R3
MOVBZ $1, R0
CMPBEQ R3, $16, 2(PC)
VLVGB R3, R0, T_1
EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
CMPBNE R3, $16, 2(PC)
VLEIB $12, $1, F_4
VLEIB $4, $1, F_4
// setup [r²,r]
VLVGG $1, RSAVE_0, R_0
VLVGG $1, RSAVE_1, R_1
VLVGG $1, RSAVE_2, R_2
VLVGG $1, RSAVE_3, R_3
VLVGG $1, RSAVE_4, R_4
VPDI $0, R5_1, R5SAVE_1, R5_1
VPDI $0, R5_2, R5SAVE_2, R5_2
VPDI $0, R5_3, R5SAVE_3, R5_3
VPDI $0, R5_4, R5SAVE_4, R5_4
MOVD $0, R3
BR multiply
skip:
VZERO H_0
VZERO H_1
VZERO H_2
VZERO H_3
VZERO H_4
CMPBEQ R3, $0, finish
b1:
// 1 block remaining
SUB $1, R3
VLL R3, (R2), T_0
ADD $1, R3
MOVBZ $1, R0
CMPBEQ R3, $16, 2(PC)
VLVGB R3, R0, T_0
VZERO T_1
EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
CMPBNE R3, $16, 2(PC)
VLEIB $4, $1, F_4
VLEIG $1, $1, R_0
VZERO R_1
VZERO R_2
VZERO R_3
VZERO R_4
VZERO R5_1
VZERO R5_2
VZERO R5_3
VZERO R5_4
// setup [r, 1]
VLVGG $0, RSAVE_0, R_0
VLVGG $0, RSAVE_1, R_1
VLVGG $0, RSAVE_2, R_2
VLVGG $0, RSAVE_3, R_3
VLVGG $0, RSAVE_4, R_4
VPDI $0, R5SAVE_1, R5_1, R5_1
VPDI $0, R5SAVE_2, R5_2, R5_2
VPDI $0, R5SAVE_3, R5_3, R5_3
VPDI $0, R5SAVE_4, R5_4, R5_4
MOVD $0, R3
BR multiply
TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
MOVD $x-24(SP), R1
XC $24, 0(R1), 0(R1) // clear the storage
MOVD $2, R0 // R0 is the number of double words stored -1
WORD $0xB2B01000 // STFLE 0(R1)
XOR R0, R0 // reset the value of R0
MOVBZ z-8(SP), R1
AND $0x40, R1
BEQ novector
vectorinstalled:
// check if the vector instruction has been enabled
VLEIB $0, $0xF, V16
VLGVB $0, V16, R1
CMPBNE R1, $0xF, novector
MOVB $1, ret+0(FP) // have vx
RET
novector:
MOVB $0, ret+0(FP) // no vx
RET

931
vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s generated vendored Normal file

@@ -0,0 +1,931 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build s390x,go1.11,!gccgo,!appengine
#include "textflag.h"
// Implementation of Poly1305 using the vector facility (vx) and the VMSL instruction.
// constants
#define EX0 V1
#define EX1 V2
#define EX2 V3
// temporaries
#define T_0 V4
#define T_1 V5
#define T_2 V6
#define T_3 V7
#define T_4 V8
#define T_5 V9
#define T_6 V10
#define T_7 V11
#define T_8 V12
#define T_9 V13
#define T_10 V14
// r**2 & r**4
#define R_0 V15
#define R_1 V16
#define R_2 V17
#define R5_1 V18
#define R5_2 V19
// key (r)
#define RSAVE_0 R7
#define RSAVE_1 R8
#define RSAVE_2 R9
#define R5SAVE_1 R10
#define R5SAVE_2 R11
// message block
#define M0 V20
#define M1 V21
#define M2 V22
#define M3 V23
#define M4 V24
#define M5 V25
// accumulator
#define H0_0 V26
#define H1_0 V27
#define H2_0 V28
#define H0_1 V29
#define H1_1 V30
#define H2_1 V31
GLOBL ·keyMask<>(SB), RODATA, $16
DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
GLOBL ·bswapMask<>(SB), RODATA, $16
DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
GLOBL ·constants<>(SB), RODATA, $48
// EX0
DATA ·constants<>+0(SB)/8, $0x18191a1b1c1d1e1f
DATA ·constants<>+8(SB)/8, $0x0000050403020100
// EX1
DATA ·constants<>+16(SB)/8, $0x18191a1b1c1d1e1f
DATA ·constants<>+24(SB)/8, $0x00000a0908070605
// EX2
DATA ·constants<>+32(SB)/8, $0x18191a1b1c1d1e1f
DATA ·constants<>+40(SB)/8, $0x0000000f0e0d0c0b
GLOBL ·c<>(SB), RODATA, $48
// EX0
DATA ·c<>+0(SB)/8, $0x0000050403020100
DATA ·c<>+8(SB)/8, $0x0000151413121110
// EX1
DATA ·c<>+16(SB)/8, $0x00000a0908070605
DATA ·c<>+24(SB)/8, $0x00001a1918171615
// EX2
DATA ·c<>+32(SB)/8, $0x0000000f0e0d0c0b
DATA ·c<>+40(SB)/8, $0x0000001f1e1d1c1b
GLOBL ·reduce<>(SB), RODATA, $32
// 44 bit
DATA ·reduce<>+0(SB)/8, $0x0
DATA ·reduce<>+8(SB)/8, $0xfffffffffff
// 42 bit
DATA ·reduce<>+16(SB)/8, $0x0
DATA ·reduce<>+24(SB)/8, $0x3ffffffffff
// h = (f*g) % (2**130-5) [partial reduction]
// uses T_0...T_9 temporary registers
// input: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2
// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9
// output: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2
#define MULTIPLY(m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9) \
\ // Eliminate the dependency for the last 2 VMSLs
VMSLG m02_0, r_2, m4_2, m4_2 \
VMSLG m13_0, r_2, m5_2, m5_2 \ // 8 VMSLs pipelined
VMSLG m02_0, r_0, m4_0, m4_0 \
VMSLG m02_1, r5_2, V0, T_0 \
VMSLG m02_0, r_1, m4_1, m4_1 \
VMSLG m02_1, r_0, V0, T_1 \
VMSLG m02_1, r_1, V0, T_2 \
VMSLG m02_2, r5_1, V0, T_3 \
VMSLG m02_2, r5_2, V0, T_4 \
VMSLG m13_0, r_0, m5_0, m5_0 \
VMSLG m13_1, r5_2, V0, T_5 \
VMSLG m13_0, r_1, m5_1, m5_1 \
VMSLG m13_1, r_0, V0, T_6 \
VMSLG m13_1, r_1, V0, T_7 \
VMSLG m13_2, r5_1, V0, T_8 \
VMSLG m13_2, r5_2, V0, T_9 \
VMSLG m02_2, r_0, m4_2, m4_2 \
VMSLG m13_2, r_0, m5_2, m5_2 \
VAQ m4_0, T_0, m02_0 \
VAQ m4_1, T_1, m02_1 \
VAQ m5_0, T_5, m13_0 \
VAQ m5_1, T_6, m13_1 \
VAQ m02_0, T_3, m02_0 \
VAQ m02_1, T_4, m02_1 \
VAQ m13_0, T_8, m13_0 \
VAQ m13_1, T_9, m13_1 \
VAQ m4_2, T_2, m02_2 \
VAQ m5_2, T_7, m13_2 \
// SQUARE uses three limbs of r and r_2*5 to output square of r
// uses T_1, T_5 and T_7 temporary registers
// input: r_0, r_1, r_2, r5_2
// temp: TEMP0, TEMP1, TEMP2
// output: p0, p1, p2
#define SQUARE(r_0, r_1, r_2, r5_2, p0, p1, p2, TEMP0, TEMP1, TEMP2) \
VMSLG r_0, r_0, p0, p0 \
VMSLG r_1, r5_2, V0, TEMP0 \
VMSLG r_2, r5_2, p1, p1 \
VMSLG r_0, r_1, V0, TEMP1 \
VMSLG r_1, r_1, p2, p2 \
VMSLG r_0, r_2, V0, TEMP2 \
VAQ TEMP0, p0, p0 \
VAQ TEMP1, p1, p1 \
VAQ TEMP2, p2, p2 \
VAQ TEMP0, p0, p0 \
VAQ TEMP1, p1, p1 \
VAQ TEMP2, p2, p2 \
// carry h0->h1->h2->h0 || h3->h4->h5->h3
// uses T_2, T_4, T_5, T_7, T_8, T_9
// t6, t7, t8, t9, t10, t11
// input: h0, h1, h2, h3, h4, h5
// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11
// output: h0, h1, h2, h3, h4, h5
#define REDUCE(h0, h1, h2, h3, h4, h5, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) \
VLM (R12), t6, t7 \ // 44 and 42 bit clear mask
VLEIB $7, $0x28, t10 \ // 5 byte shift mask
VREPIB $4, t8 \ // 4 bit shift mask
VREPIB $2, t11 \ // 2 bit shift mask
VSRLB t10, h0, t0 \ // h0 byte shift
VSRLB t10, h1, t1 \ // h1 byte shift
VSRLB t10, h2, t2 \ // h2 byte shift
VSRLB t10, h3, t3 \ // h3 byte shift
VSRLB t10, h4, t4 \ // h4 byte shift
VSRLB t10, h5, t5 \ // h5 byte shift
VSRL t8, t0, t0 \ // h0 bit shift
VSRL t8, t1, t1 \ // h1 bit shift
VSRL t11, t2, t2 \ // h2 bit shift
VSRL t8, t3, t3 \ // h3 bit shift
VSRL t8, t4, t4 \ // h4 bit shift
VESLG $2, t2, t9 \ // h2 carry x5
VSRL t11, t5, t5 \ // h5 bit shift
VN t6, h0, h0 \ // h0 clear carry
VAQ t2, t9, t2 \ // h2 carry x5
VESLG $2, t5, t9 \ // h5 carry x5
VN t6, h1, h1 \ // h1 clear carry
VN t7, h2, h2 \ // h2 clear carry
VAQ t5, t9, t5 \ // h5 carry x5
VN t6, h3, h3 \ // h3 clear carry
VN t6, h4, h4 \ // h4 clear carry
VN t7, h5, h5 \ // h5 clear carry
VAQ t0, h1, h1 \ // h0->h1
VAQ t3, h4, h4 \ // h3->h4
VAQ t1, h2, h2 \ // h1->h2
VAQ t4, h5, h5 \ // h4->h5
VAQ t2, h0, h0 \ // h2->h0
VAQ t5, h3, h3 \ // h5->h3
VREPG $1, t6, t6 \ // 44 and 42 bit masks across both halves
VREPG $1, t7, t7 \
VSLDB $8, h0, h0, h0 \ // set up [h0/1/2, h3/4/5]
VSLDB $8, h1, h1, h1 \
VSLDB $8, h2, h2, h2 \
VO h0, h3, h3 \
VO h1, h4, h4 \
VO h2, h5, h5 \
VESRLG $44, h3, t0 \ // 44 bit shift right
VESRLG $44, h4, t1 \
VESRLG $42, h5, t2 \
VN t6, h3, h3 \ // clear carry bits
VN t6, h4, h4 \
VN t7, h5, h5 \
VESLG $2, t2, t9 \ // multiply carry by 5
VAQ t9, t2, t2 \
VAQ t0, h4, h4 \
VAQ t1, h5, h5 \
VAQ t2, h3, h3 \
// carry h0->h1->h2->h0
// input: h0, h1, h2
// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8
// output: h0, h1, h2
#define REDUCE2(h0, h1, h2, t0, t1, t2, t3, t4, t5, t6, t7, t8) \
VLEIB $7, $0x28, t3 \ // 5 byte shift mask
VREPIB $4, t4 \ // 4 bit shift mask
VREPIB $2, t7 \ // 2 bit shift mask
VGBM $0x003F, t5 \ // mask to clear carry bits
VSRLB t3, h0, t0 \
VSRLB t3, h1, t1 \
VSRLB t3, h2, t2 \
VESRLG $4, t5, t5 \ // 44 bit clear mask
VSRL t4, t0, t0 \
VSRL t4, t1, t1 \
VSRL t7, t2, t2 \
VESRLG $2, t5, t6 \ // 42 bit clear mask
VESLG $2, t2, t8 \
VAQ t8, t2, t2 \
VN t5, h0, h0 \
VN t5, h1, h1 \
VN t6, h2, h2 \
VAQ t0, h1, h1 \
VAQ t1, h2, h2 \
VAQ t2, h0, h0 \
VSRLB t3, h0, t0 \
VSRLB t3, h1, t1 \
VSRLB t3, h2, t2 \
VSRL t4, t0, t0 \
VSRL t4, t1, t1 \
VSRL t7, t2, t2 \
VN t5, h0, h0 \
VN t5, h1, h1 \
VESLG $2, t2, t8 \
VN t6, h2, h2 \
VAQ t0, h1, h1 \
VAQ t8, t2, t2 \
VAQ t1, h2, h2 \
VAQ t2, h0, h0 \
// expands two message blocks into the lower halves of the d registers
// moves the contents of the d registers into upper halves
// input: in1, in2, d0, d1, d2, d3, d4, d5
// temp: TEMP0, TEMP1, TEMP2, TEMP3
// output: d0, d1, d2, d3, d4, d5
#define EXPACC(in1, in2, d0, d1, d2, d3, d4, d5, TEMP0, TEMP1, TEMP2, TEMP3) \
VGBM $0xff3f, TEMP0 \
VGBM $0xff1f, TEMP1 \
VESLG $4, d1, TEMP2 \
VESLG $4, d4, TEMP3 \
VESRLG $4, TEMP0, TEMP0 \
VPERM in1, d0, EX0, d0 \
VPERM in2, d3, EX0, d3 \
VPERM in1, d2, EX2, d2 \
VPERM in2, d5, EX2, d5 \
VPERM in1, TEMP2, EX1, d1 \
VPERM in2, TEMP3, EX1, d4 \
VN TEMP0, d0, d0 \
VN TEMP0, d3, d3 \
VESRLG $4, d1, d1 \
VESRLG $4, d4, d4 \
VN TEMP1, d2, d2 \
VN TEMP1, d5, d5 \
VN TEMP0, d1, d1 \
VN TEMP0, d4, d4 \
// expands one message block into the lower halves of the d registers
// moves the contents of the d registers into upper halves
// input: in, d0, d1, d2
// temp: TEMP0, TEMP1, TEMP2
// output: d0, d1, d2
#define EXPACC2(in, d0, d1, d2, TEMP0, TEMP1, TEMP2) \
VGBM $0xff3f, TEMP0 \
VESLG $4, d1, TEMP2 \
VGBM $0xff1f, TEMP1 \
VPERM in, d0, EX0, d0 \
VESRLG $4, TEMP0, TEMP0 \
VPERM in, d2, EX2, d2 \
VPERM in, TEMP2, EX1, d1 \
VN TEMP0, d0, d0 \
VN TEMP1, d2, d2 \
VESRLG $4, d1, d1 \
VN TEMP0, d1, d1 \
// pack h2:h0 into h1:h0 (no carry)
// input: h0, h1, h2
// output: h0, h1, h2
#define PACK(h0, h1, h2) \
VMRLG h1, h2, h2 \ // copy h1 to upper half h2
VESLG $44, h1, h1 \ // shift limb 1 44 bits, leaving 20
VO h0, h1, h0 \ // combine h0 with 20 bits from limb 1
VESRLG $20, h2, h1 \ // put top 24 bits of limb 1 into h1
VLEIG $1, $0, h1 \ // clear h2 stuff from lower half of h1
VO h0, h1, h0 \ // h0 now has 88 bits (limb 0 and 1)
VLEIG $0, $0, h2 \ // clear upper half of h2
VESRLG $40, h2, h1 \ // h1 now has upper two bits of result
VLEIB $7, $88, h1 \ // for byte shift (11 bytes)
VSLB h1, h2, h2 \ // shift h2 11 bytes to the left
VO h0, h2, h0 \ // combine h0 with 20 bits from limb 1
VLEIG $0, $0, h1 \ // clear upper half of h1
// if h > 2**130-5 then h -= 2**130-5
// input: h0, h1
// temp: t0, t1, t2
// output: h0
#define MOD(h0, h1, t0, t1, t2) \
VZERO t0 \
VLEIG $1, $5, t0 \
VACCQ h0, t0, t1 \
VAQ h0, t0, t0 \
VONE t2 \
VLEIG $1, $-4, t2 \
VAQ t2, t1, t1 \
VACCQ h1, t1, t1 \
VONE t2 \
VAQ t2, t1, t1 \
VN h0, t1, t2 \
VNC t0, t1, t1 \
VO t1, t2, h0 \
// func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
TEXT ·poly1305vmsl(SB), $0-32
// This code processes 6 blocks (96 bytes) up front and then up to 4 blocks (64 bytes) per iteration
// using the algorithm described in:
// NEON crypto, Daniel J. Bernstein & Peter Schwabe
// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
// And as modified for VMSL as described in
// Accelerating Poly1305 Cryptographic Message Authentication on the z14
// O'Farrell et al, CASCON 2017, p48-55
// https://ibm.ent.box.com/s/jf9gedj0e9d2vjctfyh186shaztavnht
LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
VZERO V0 // c
// load EX0, EX1 and EX2
MOVD $·constants<>(SB), R5
VLM (R5), EX0, EX2 // c
// setup r
VL (R4), T_0
MOVD $·keyMask<>(SB), R6
VL (R6), T_1
VN T_0, T_1, T_0
VZERO T_2 // limbs for r
VZERO T_3
VZERO T_4
EXPACC2(T_0, T_2, T_3, T_4, T_1, T_5, T_7)
// T_2, T_3, T_4: [0, r]
// setup r*20
VLEIG $0, $0, T_0
VLEIG $1, $20, T_0 // T_0: [0, 20]
VZERO T_5
VZERO T_6
VMSLG T_0, T_3, T_5, T_5
VMSLG T_0, T_4, T_6, T_6
// store r for final block in GR
VLGVG $1, T_2, RSAVE_0 // c
VLGVG $1, T_3, RSAVE_1 // c
VLGVG $1, T_4, RSAVE_2 // c
VLGVG $1, T_5, R5SAVE_1 // c
VLGVG $1, T_6, R5SAVE_2 // c
// initialize h
VZERO H0_0
VZERO H1_0
VZERO H2_0
VZERO H0_1
VZERO H1_1
VZERO H2_1
// initialize pointer for reduce constants
MOVD $·reduce<>(SB), R12
// calculate r**2 and 20*(r**2)
VZERO R_0
VZERO R_1
VZERO R_2
SQUARE(T_2, T_3, T_4, T_6, R_0, R_1, R_2, T_1, T_5, T_7)
REDUCE2(R_0, R_1, R_2, M0, M1, M2, M3, M4, R5_1, R5_2, M5, T_1)
VZERO R5_1
VZERO R5_2
VMSLG T_0, R_1, R5_1, R5_1
VMSLG T_0, R_2, R5_2, R5_2
// skip r**4 calculation if 3 blocks or less
CMPBLE R3, $48, b4
// calculate r**4 and 20*(r**4)
VZERO T_8
VZERO T_9
VZERO T_10
SQUARE(R_0, R_1, R_2, R5_2, T_8, T_9, T_10, T_1, T_5, T_7)
REDUCE2(T_8, T_9, T_10, M0, M1, M2, M3, M4, T_2, T_3, M5, T_1)
VZERO T_2
VZERO T_3
VMSLG T_0, T_9, T_2, T_2
VMSLG T_0, T_10, T_3, T_3
// put r**2 to the right and r**4 to the left of R_0, R_1, R_2
VSLDB $8, T_8, T_8, T_8
VSLDB $8, T_9, T_9, T_9
VSLDB $8, T_10, T_10, T_10
VSLDB $8, T_2, T_2, T_2
VSLDB $8, T_3, T_3, T_3
VO T_8, R_0, R_0
VO T_9, R_1, R_1
VO T_10, R_2, R_2
VO T_2, R5_1, R5_1
VO T_3, R5_2, R5_2
CMPBLE R3, $80, load // less than or equal to 5 blocks in message
// 6(or 5+1) blocks
SUB $81, R3
VLM (R2), M0, M4
VLL R3, 80(R2), M5
ADD $1, R3
MOVBZ $1, R0
CMPBGE R3, $16, 2(PC)
VLVGB R3, R0, M5
MOVD $96(R2), R2
EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
EXPACC(M2, M3, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
VLEIB $2, $1, H2_0
VLEIB $2, $1, H2_1
VLEIB $10, $1, H2_0
VLEIB $10, $1, H2_1
VZERO M0
VZERO M1
VZERO M2
VZERO M3
VZERO T_4
VZERO T_10
EXPACC(M4, M5, M0, M1, M2, M3, T_4, T_10, T_0, T_1, T_2, T_3)
VLR T_4, M4
VLEIB $10, $1, M2
CMPBLT R3, $16, 2(PC)
VLEIB $10, $1, T_10
MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
VMRHG V0, H0_1, H0_0
VMRHG V0, H1_1, H1_0
VMRHG V0, H2_1, H2_0
VMRLG V0, H0_1, H0_1
VMRLG V0, H1_1, H1_1
VMRLG V0, H2_1, H2_1
SUB $16, R3
CMPBLE R3, $0, square
load:
// load EX0, EX1 and EX2
MOVD $·c<>(SB), R5
VLM (R5), EX0, EX2
loop:
CMPBLE R3, $64, add // b4 // last 4 or less blocks left
// next 4 full blocks
VLM (R2), M2, M5
SUB $64, R3
MOVD $64(R2), R2
REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, T_0, T_1, T_3, T_4, T_5, T_2, T_7, T_8, T_9)
// expacc in-lined to create [m2, m3] limbs
VGBM $0x3f3f, T_0 // 44 bit clear mask
VGBM $0x1f1f, T_1 // 40 bit clear mask
VPERM M2, M3, EX0, T_3
VESRLG $4, T_0, T_0 // 44 bit clear mask ready
VPERM M2, M3, EX1, T_4
VPERM M2, M3, EX2, T_5
VN T_0, T_3, T_3
VESRLG $4, T_4, T_4
VN T_1, T_5, T_5
VN T_0, T_4, T_4
VMRHG H0_1, T_3, H0_0
VMRHG H1_1, T_4, H1_0
VMRHG H2_1, T_5, H2_0
VMRLG H0_1, T_3, H0_1
VMRLG H1_1, T_4, H1_1
VMRLG H2_1, T_5, H2_1
VLEIB $10, $1, H2_0
VLEIB $10, $1, H2_1
VPERM M4, M5, EX0, T_3
VPERM M4, M5, EX1, T_4
VPERM M4, M5, EX2, T_5
VN T_0, T_3, T_3
VESRLG $4, T_4, T_4
VN T_1, T_5, T_5
VN T_0, T_4, T_4
VMRHG V0, T_3, M0
VMRHG V0, T_4, M1
VMRHG V0, T_5, M2
VMRLG V0, T_3, M3
VMRLG V0, T_4, M4
VMRLG V0, T_5, M5
VLEIB $10, $1, M2
VLEIB $10, $1, M5
MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
CMPBNE R3, $0, loop
REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
VMRHG V0, H0_1, H0_0
VMRHG V0, H1_1, H1_0
VMRHG V0, H2_1, H2_0
VMRLG V0, H0_1, H0_1
VMRLG V0, H1_1, H1_1
VMRLG V0, H2_1, H2_1
// load EX0, EX1, EX2
MOVD $·constants<>(SB), R5
VLM (R5), EX0, EX2
// sum vectors
VAQ H0_0, H0_1, H0_0
VAQ H1_0, H1_1, H1_0
VAQ H2_0, H2_1, H2_0
// h may be >= 2*(2**130-5) so we need to reduce it again
// M0...M4 are used as temps here
REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
next: // carry h1->h2
VLEIB $7, $0x28, T_1
VREPIB $4, T_2
VGBM $0x003F, T_3
VESRLG $4, T_3
// byte shift
VSRLB T_1, H1_0, T_4
// bit shift
VSRL T_2, T_4, T_4
// clear h1 carry bits
VN T_3, H1_0, H1_0
// add carry
VAQ T_4, H2_0, H2_0
// h is now < 2*(2**130-5)
// pack h into h1 (hi) and h0 (lo)
PACK(H0_0, H1_0, H2_0)
// if h > 2**130-5 then h -= 2**130-5
MOD(H0_0, H1_0, T_0, T_1, T_2)
// h += s
MOVD $·bswapMask<>(SB), R5
VL (R5), T_1
VL 16(R4), T_0
VPERM T_0, T_0, T_1, T_0 // reverse bytes (to big)
VAQ T_0, H0_0, H0_0
VPERM H0_0, H0_0, T_1, H0_0 // reverse bytes (to little)
VST H0_0, (R1)
RET
add:
// load EX0, EX1, EX2
MOVD $·constants<>(SB), R5
VLM (R5), EX0, EX2
REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
VMRHG V0, H0_1, H0_0
VMRHG V0, H1_1, H1_0
VMRHG V0, H2_1, H2_0
VMRLG V0, H0_1, H0_1
VMRLG V0, H1_1, H1_1
VMRLG V0, H2_1, H2_1
CMPBLE R3, $64, b4
b4:
CMPBLE R3, $48, b3 // 3 blocks or less
// 4(3+1) blocks remaining
SUB $49, R3
VLM (R2), M0, M2
VLL R3, 48(R2), M3
ADD $1, R3
MOVBZ $1, R0
CMPBEQ R3, $16, 2(PC)
VLVGB R3, R0, M3
MOVD $64(R2), R2
EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
VLEIB $10, $1, H2_0
VLEIB $10, $1, H2_1
VZERO M0
VZERO M1
VZERO M4
VZERO M5
VZERO T_4
VZERO T_10
EXPACC(M2, M3, M0, M1, M4, M5, T_4, T_10, T_0, T_1, T_2, T_3)
VLR T_4, M2
VLEIB $10, $1, M4
CMPBNE R3, $16, 2(PC)
VLEIB $10, $1, T_10
MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M4, M5, M2, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
VMRHG V0, H0_1, H0_0
VMRHG V0, H1_1, H1_0
VMRHG V0, H2_1, H2_0
VMRLG V0, H0_1, H0_1
VMRLG V0, H1_1, H1_1
VMRLG V0, H2_1, H2_1
SUB $16, R3
CMPBLE R3, $0, square // this condition must always hold true!
b3:
CMPBLE R3, $32, b2
// 3 blocks remaining
// setup [r²,r]
VSLDB $8, R_0, R_0, R_0
VSLDB $8, R_1, R_1, R_1
VSLDB $8, R_2, R_2, R_2
VSLDB $8, R5_1, R5_1, R5_1
VSLDB $8, R5_2, R5_2, R5_2
VLVGG $1, RSAVE_0, R_0
VLVGG $1, RSAVE_1, R_1
VLVGG $1, RSAVE_2, R_2
VLVGG $1, R5SAVE_1, R5_1
VLVGG $1, R5SAVE_2, R5_2
// setup [h0, h1]
VSLDB $8, H0_0, H0_0, H0_0
VSLDB $8, H1_0, H1_0, H1_0
VSLDB $8, H2_0, H2_0, H2_0
VO H0_1, H0_0, H0_0
VO H1_1, H1_0, H1_0
VO H2_1, H2_0, H2_0
VZERO H0_1
VZERO H1_1
VZERO H2_1
VZERO M0
VZERO M1
VZERO M2
VZERO M3
VZERO M4
VZERO M5
// H*[r**2, r]
MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, T_10, M5)
SUB $33, R3
VLM (R2), M0, M1
VLL R3, 32(R2), M2
ADD $1, R3
MOVBZ $1, R0
CMPBEQ R3, $16, 2(PC)
VLVGB R3, R0, M2
// H += m0
VZERO T_1
VZERO T_2
VZERO T_3
EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6)
VLEIB $10, $1, T_3
VAG H0_0, T_1, H0_0
VAG H1_0, T_2, H1_0
VAG H2_0, T_3, H2_0
VZERO M0
VZERO M3
VZERO M4
VZERO M5
VZERO T_10
// (H+m0)*r
MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M3, M4, M5, V0, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_10, H0_1, H1_1, H2_1, T_9)
// H += m1
VZERO V0
VZERO T_1
VZERO T_2
VZERO T_3
EXPACC2(M1, T_1, T_2, T_3, T_4, T_5, T_6)
VLEIB $10, $1, T_3
VAQ H0_0, T_1, H0_0
VAQ H1_0, T_2, H1_0
VAQ H2_0, T_3, H2_0
REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)
// [H, m2] * [r**2, r]
EXPACC2(M2, H0_0, H1_0, H2_0, T_1, T_2, T_3)
CMPBNE R3, $16, 2(PC)
VLEIB $10, $1, H2_0
VZERO M0
VZERO M1
VZERO M2
VZERO M3
VZERO M4
VZERO M5
MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, M5, T_10)
SUB $16, R3
CMPBLE R3, $0, next // this condition must always hold true!
b2:
CMPBLE R3, $16, b1
// 2 blocks remaining
// setup [r²,r]
VSLDB $8, R_0, R_0, R_0
VSLDB $8, R_1, R_1, R_1
VSLDB $8, R_2, R_2, R_2
VSLDB $8, R5_1, R5_1, R5_1
VSLDB $8, R5_2, R5_2, R5_2
VLVGG $1, RSAVE_0, R_0
VLVGG $1, RSAVE_1, R_1
VLVGG $1, RSAVE_2, R_2
VLVGG $1, R5SAVE_1, R5_1
VLVGG $1, R5SAVE_2, R5_2
// setup [h0, h1]
VSLDB $8, H0_0, H0_0, H0_0
VSLDB $8, H1_0, H1_0, H1_0
VSLDB $8, H2_0, H2_0, H2_0
VO H0_1, H0_0, H0_0
VO H1_1, H1_0, H1_0
VO H2_1, H2_0, H2_0
VZERO H0_1
VZERO H1_1
VZERO H2_1
VZERO M0
VZERO M1
VZERO M2
VZERO M3
VZERO M4
VZERO M5
// H*[r**2, r]
MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
VMRHG V0, H0_1, H0_0
VMRHG V0, H1_1, H1_0
VMRHG V0, H2_1, H2_0
VMRLG V0, H0_1, H0_1
VMRLG V0, H1_1, H1_1
VMRLG V0, H2_1, H2_1
// move h to the left and 0s at the right
VSLDB $8, H0_0, H0_0, H0_0
VSLDB $8, H1_0, H1_0, H1_0
VSLDB $8, H2_0, H2_0, H2_0
// get message blocks and append 1 to start
SUB $17, R3
VL (R2), M0
VLL R3, 16(R2), M1
ADD $1, R3
MOVBZ $1, R0
CMPBEQ R3, $16, 2(PC)
VLVGB R3, R0, M1
VZERO T_6
VZERO T_7
VZERO T_8
EXPACC2(M0, T_6, T_7, T_8, T_1, T_2, T_3)
EXPACC2(M1, T_6, T_7, T_8, T_1, T_2, T_3)
VLEIB $2, $1, T_8
CMPBNE R3, $16, 2(PC)
VLEIB $10, $1, T_8
// add [m0, m1] to h
VAG H0_0, T_6, H0_0
VAG H1_0, T_7, H1_0
VAG H2_0, T_8, H2_0
VZERO M2
VZERO M3
VZERO M4
VZERO M5
VZERO T_10
VZERO M0
// at this point R_0 .. R5_2 look like [r**2, r]
MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M2, M3, M4, M5, T_10, M0, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
REDUCE2(H0_0, H1_0, H2_0, M2, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)
SUB $16, R3, R3
CMPBLE R3, $0, next
b1:
CMPBLE R3, $0, next
// 1 block remaining
// setup [r²,r]
VSLDB $8, R_0, R_0, R_0
VSLDB $8, R_1, R_1, R_1
VSLDB $8, R_2, R_2, R_2
VSLDB $8, R5_1, R5_1, R5_1
VSLDB $8, R5_2, R5_2, R5_2
VLVGG $1, RSAVE_0, R_0
VLVGG $1, RSAVE_1, R_1
VLVGG $1, RSAVE_2, R_2
VLVGG $1, R5SAVE_1, R5_1
VLVGG $1, R5SAVE_2, R5_2
// setup [h0, h1]
VSLDB $8, H0_0, H0_0, H0_0
VSLDB $8, H1_0, H1_0, H1_0
VSLDB $8, H2_0, H2_0, H2_0
VO H0_1, H0_0, H0_0
VO H1_1, H1_0, H1_0
VO H2_1, H2_0, H2_0
VZERO H0_1
VZERO H1_1
VZERO H2_1
VZERO M0
VZERO M1
VZERO M2
VZERO M3
VZERO M4
VZERO M5
// H*[r**2, r]
MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
// set up [0, m0] limbs
SUB $1, R3
VLL R3, (R2), M0
ADD $1, R3
MOVBZ $1, R0
CMPBEQ R3, $16, 2(PC)
VLVGB R3, R0, M0
VZERO T_1
VZERO T_2
VZERO T_3
EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6)// limbs: [0, m]
CMPBNE R3, $16, 2(PC)
VLEIB $10, $1, T_3
// h+m0
VAQ H0_0, T_1, H0_0
VAQ H1_0, T_2, H1_0
VAQ H2_0, T_3, H2_0
VZERO M0
VZERO M1
VZERO M2
VZERO M3
VZERO M4
VZERO M5
MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
BR next
square:
// setup [r²,r]
VSLDB $8, R_0, R_0, R_0
VSLDB $8, R_1, R_1, R_1
VSLDB $8, R_2, R_2, R_2
VSLDB $8, R5_1, R5_1, R5_1
VSLDB $8, R5_2, R5_2, R5_2
VLVGG $1, RSAVE_0, R_0
VLVGG $1, RSAVE_1, R_1
VLVGG $1, RSAVE_2, R_2
VLVGG $1, R5SAVE_1, R5_1
VLVGG $1, R5SAVE_2, R5_2
// setup [h0, h1]
VSLDB $8, H0_0, H0_0, H0_0
VSLDB $8, H1_0, H1_0, H1_0
VSLDB $8, H2_0, H2_0, H2_0
VO H0_1, H0_0, H0_0
VO H1_1, H1_0, H1_0
VO H2_1, H2_0, H2_0
VZERO H0_1
VZERO H1_1
VZERO H2_1
VZERO M0
VZERO M1
VZERO M2
VZERO M3
VZERO M4
VZERO M5
// (h0*r**2) + (h1*r)
MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
BR next
TEXT ·hasVMSLFacility(SB), NOSPLIT, $24-1
MOVD $x-24(SP), R1
XC $24, 0(R1), 0(R1) // clear the storage
MOVD $2, R0 // R0 is the number of double words stored -1
WORD $0xB2B01000 // STFLE 0(R1)
XOR R0, R0 // reset the value of R0
MOVBZ z-8(SP), R1
AND $0x01, R1
BEQ novmsl
vectorinstalled:
// check if the vector instruction has been enabled
VLEIB $0, $0xF, V16
VLGVB $0, V16, R1
CMPBNE R1, $0xF, novmsl
MOVB $1, ret+0(FP) // have vmsl
RET
novmsl:
MOVB $0, ret+0(FP) // no vmsl
RET

2943
vendor/golang.org/x/crypto/poly1305/vectors_test.go generated vendored Normal file

File diff suppressed because it is too large