Coverage Report

Created: 2026-05-06 07:53

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/tmp/bitcoin/src/crypto/sha256_arm_shani.cpp
Line
Count
Source
1
// Copyright (c) 2022-present The Bitcoin Core developers
2
// Distributed under the MIT software license, see the accompanying
3
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4
//
5
// Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-arm.c,
6
// Written and placed in public domain by Jeffrey Walton.
7
// Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and
8
// Barry O'Rourke for the mbedTLS project.
9
// Variant specialized for 64-byte inputs added by Pieter Wuille.
10
11
#ifdef ENABLE_ARM_SHANI
12
13
#include <array>
14
#include <cstdint>
15
#include <cstddef>
16
#include <arm_neon.h>
17
18
namespace {
// SHA-256 round constants (64 32-bit words), file-local.
// The transforms in this file read them four at a time via vld1q_u32(&K[i]),
// with i a multiple of 4; the array is aligned to the alignment of uint32x4_t.
19
alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> K =
20
{
21
    0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
22
    0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
23
    0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
24
    0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
25
    0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
26
    0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
27
    0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
28
    0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
29
    0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
30
    0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
31
    0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
32
    0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
33
    0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
34
    0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
35
    0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
36
    0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
37
};
38
}
39
40
namespace sha256_arm_shani {
// Run the SHA-256 compression function over `blocks` consecutive 64-byte
// chunks using the ARMv8 SHA-256 intrinsics, updating the state in place.
//
// s      - 8-word hash state; s[0..3] and s[4..7] are read on entry and
//          written back once after the last block.
// chunk  - input data; 64 bytes are consumed per block, so at least
//          64 * blocks bytes must be readable.
// blocks - number of 64-byte chunks to process; with 0 the loop body is
//          skipped and the state is stored back unchanged.
41
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
42
51.5M
{
43
51.5M
    uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
44
51.5M
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
45
51.5M
    uint32x4_t TMP0, TMP2;
46
47
    // Load state
    // STATE0 holds s[0..3] and STATE1 holds s[4..7].
48
51.5M
    STATE0 = vld1q_u32(&s[0]);
49
51.5M
    STATE1 = vld1q_u32(&s[4]);
50
51
223M
    while (blocks--)
52
171M
    {
53
        // Save state
        // ABEF_SAVE / CDGH_SAVE are copies of STATE0 / STATE1 as they were on
        // entry to this block; they are added back in after round 64.
54
171M
        ABEF_SAVE = STATE0;
55
171M
        CDGH_SAVE = STATE1;
56
57
        // Load and convert input chunk to Big Endian
        // (vrev32q_u8 reverses the byte order within each 32-bit word.)
58
171M
        MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 0)));
59
171M
        MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 16)));
60
171M
        MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 32)));
61
171M
        MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 48)));
62
171M
        chunk += 64;
63
64
        // The original implementation preloaded the message + constant addition, which was 1-3% slower.
65
        // It is now the first step of each quad round, which saves one Q NEON register:
66
        // "TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));"
67
68
        // Rounds 1-4
        // Every quad round below follows the same shape: TMP0 = message words + K,
        // TMP2 = copy of STATE0 taken before vsha256hq_u32 overwrites it (it is the
        // second operand of vsha256h2q_u32), then the message register just consumed
        // is updated via vsha256su0q_u32 / vsha256su1q_u32 for use 16 rounds later.
69
171M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));
70
171M
        TMP2 = STATE0;
71
171M
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
72
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
73
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
74
171M
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
75
76
        // Rounds 5-8
77
171M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[4]));
78
171M
        TMP2 = STATE0;
79
171M
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
80
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
81
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
82
171M
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
83
84
        // Rounds 9-12
85
171M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[8]));
86
171M
        TMP2 = STATE0;
87
171M
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
88
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
89
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
90
171M
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
91
92
        // Rounds 13-16
93
171M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[12]));
94
171M
        TMP2 = STATE0;
95
171M
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
96
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
97
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
98
171M
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
99
100
        // Rounds 17-20
101
171M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[16]));
102
171M
        TMP2 = STATE0;
103
171M
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
104
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
105
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
106
171M
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
107
108
        // Rounds 21-24
109
171M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[20]));
110
171M
        TMP2 = STATE0;
111
171M
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
112
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
113
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
114
171M
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
115
116
        // Rounds 25-28
117
171M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[24]));
118
171M
        TMP2 = STATE0;
119
171M
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
120
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
121
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
122
171M
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
123
124
        // Rounds 29-32
125
171M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[28]));
126
171M
        TMP2 = STATE0;
127
171M
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
128
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
129
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
130
171M
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
131
132
        // Rounds 33-36
133
171M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[32]));
134
171M
        TMP2 = STATE0;
135
171M
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
136
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
137
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
138
171M
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
139
140
        // Rounds 37-40
141
171M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[36]));
142
171M
        TMP2 = STATE0;
143
171M
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
144
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
145
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
146
171M
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
147
148
        // Rounds 41-44
149
171M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[40]));
150
171M
        TMP2 = STATE0;
151
171M
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
152
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
153
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
154
171M
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
155
156
        // Rounds 45-48
157
171M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[44]));
158
171M
        TMP2 = STATE0;
159
171M
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
160
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
161
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
162
171M
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
163
164
        // Rounds 49-52
        // From here to round 64 the message registers are only consumed; no
        // further vsha256su0q_u32 / vsha256su1q_u32 updates are performed.
165
171M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[48]));
166
171M
        TMP2 = STATE0;
167
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
168
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
169
170
        // Rounds 53-56
171
171M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[52]));
172
171M
        TMP2 = STATE0;
173
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
174
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
175
176
        // Rounds 57-60
177
171M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[56]));
178
171M
        TMP2 = STATE0;
179
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
180
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
181
182
        // Rounds 61-64
183
171M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[60]));
184
171M
        TMP2 = STATE0;
185
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
186
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
187
188
        // Update state
        // Add the values saved at the top of this iteration to the round output.
189
171M
        STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
190
171M
        STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
191
171M
    }
192
193
    // Save final state
    // Written once, after all blocks have been processed.
194
51.5M
    vst1q_u32(&s[0], STATE0);
195
51.5M
    vst1q_u32(&s[4], STATE1);
196
51.5M
}
197
}
198
199
namespace sha256d64_arm_shani {
200
void Transform_2way(unsigned char* output, const unsigned char* input)
201
151k
{
202
    /* Initial state. */
203
151k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> INIT = {
204
151k
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
205
151k
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
206
151k
    };
207
208
    /* Precomputed message schedule for the 2nd transform. */
209
151k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> MIDS = {
210
151k
        0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
211
151k
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
212
151k
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
213
151k
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374,
214
151k
        0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254,
215
151k
        0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa,
216
151k
        0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7,
217
151k
        0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0,
218
151k
        0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd,
219
151k
        0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16,
220
151k
        0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537,
221
151k
        0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37,
222
151k
        0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7,
223
151k
        0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890,
224
151k
        0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c,
225
151k
        0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76
226
151k
    };
227
228
    /* A few precomputed message schedule values for the 3rd transform. */
229
151k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 12> FINS = {
230
151k
        0x5807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
231
151k
        0x80000000, 0x00000000, 0x00000000, 0x00000000,
232
151k
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf274
233
151k
    };
234
235
    /* Padding processed in the 3rd transform (byteswapped). */
236
151k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> FINAL = {0x80000000, 0, 0, 0, 0, 0, 0, 0x100};
237
238
151k
    uint32x4_t STATE0A, STATE0B, STATE1A, STATE1B, ABEF_SAVEA, ABEF_SAVEB, CDGH_SAVEA, CDGH_SAVEB;
239
151k
    uint32x4_t MSG0A, MSG0B, MSG1A, MSG1B, MSG2A, MSG2B, MSG3A, MSG3B;
240
151k
    uint32x4_t TMP0A, TMP0B, TMP2A, TMP2B, TMP;
241
242
    // Transform 1: Load state
243
151k
    STATE0A = vld1q_u32(&INIT[0]);
244
151k
    STATE0B = STATE0A;
245
151k
    STATE1A = vld1q_u32(&INIT[4]);
246
151k
    STATE1B = STATE1A;
247
248
    // Transform 1: Load and convert input chunk to Big Endian
249
151k
    MSG0A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 0)));
250
151k
    MSG1A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16)));
251
151k
    MSG2A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32)));
252
151k
    MSG3A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48)));
253
151k
    MSG0B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 64)));
254
151k
    MSG1B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 80)));
255
151k
    MSG2B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 96)));
256
151k
    MSG3B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 112)));
257
258
    // Transform 1: Rounds 1-4
259
151k
    TMP = vld1q_u32(&K[0]);
260
151k
    TMP0A = vaddq_u32(MSG0A, TMP);
261
151k
    TMP0B = vaddq_u32(MSG0B, TMP);
262
151k
    TMP2A = STATE0A;
263
151k
    TMP2B = STATE0B;
264
151k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
265
151k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
266
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
267
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
268
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
269
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
270
151k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
271
151k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
272
273
    // Transform 1: Rounds 5-8
274
151k
    TMP = vld1q_u32(&K[4]);
275
151k
    TMP0A = vaddq_u32(MSG1A, TMP);
276
151k
    TMP0B = vaddq_u32(MSG1B, TMP);
277
151k
    TMP2A = STATE0A;
278
151k
    TMP2B = STATE0B;
279
151k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
280
151k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
281
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
282
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
283
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
284
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
285
151k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
286
151k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
287
288
    // Transform 1: Rounds 9-12
289
151k
    TMP = vld1q_u32(&K[8]);
290
151k
    TMP0A = vaddq_u32(MSG2A, TMP);
291
151k
    TMP0B = vaddq_u32(MSG2B, TMP);
292
151k
    TMP2A = STATE0A;
293
151k
    TMP2B = STATE0B;
294
151k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
295
151k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
296
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
297
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
298
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
299
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
300
151k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
301
151k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
302
303
    // Transform 1: Rounds 13-16
304
151k
    TMP = vld1q_u32(&K[12]);
305
151k
    TMP0A = vaddq_u32(MSG3A, TMP);
306
151k
    TMP0B = vaddq_u32(MSG3B, TMP);
307
151k
    TMP2A = STATE0A;
308
151k
    TMP2B = STATE0B;
309
151k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
310
151k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
311
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
312
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
313
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
314
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
315
151k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
316
151k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
317
318
    // Transform 1: Rounds 17-20
319
151k
    TMP = vld1q_u32(&K[16]);
320
151k
    TMP0A = vaddq_u32(MSG0A, TMP);
321
151k
    TMP0B = vaddq_u32(MSG0B, TMP);
322
151k
    TMP2A = STATE0A;
323
151k
    TMP2B = STATE0B;
324
151k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
325
151k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
326
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
327
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
328
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
329
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
330
151k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
331
151k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
332
333
    // Transform 1: Rounds 21-24
334
151k
    TMP = vld1q_u32(&K[20]);
335
151k
    TMP0A = vaddq_u32(MSG1A, TMP);
336
151k
    TMP0B = vaddq_u32(MSG1B, TMP);
337
151k
    TMP2A = STATE0A;
338
151k
    TMP2B = STATE0B;
339
151k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
340
151k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
341
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
342
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
343
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
344
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
345
151k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
346
151k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
347
348
    // Transform 1: Rounds 25-28
349
151k
    TMP = vld1q_u32(&K[24]);
350
151k
    TMP0A = vaddq_u32(MSG2A, TMP);
351
151k
    TMP0B = vaddq_u32(MSG2B, TMP);
352
151k
    TMP2A = STATE0A;
353
151k
    TMP2B = STATE0B;
354
151k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
355
151k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
356
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
357
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
358
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
359
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
360
151k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
361
151k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
362
363
    // Transform 1: Rounds 29-32
364
151k
    TMP = vld1q_u32(&K[28]);
365
151k
    TMP0A = vaddq_u32(MSG3A, TMP);
366
151k
    TMP0B = vaddq_u32(MSG3B, TMP);
367
151k
    TMP2A = STATE0A;
368
151k
    TMP2B = STATE0B;
369
151k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
370
151k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
371
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
372
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
373
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
374
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
375
151k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
376
151k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
377
378
    // Transform 1: Rounds 33-36
379
151k
    TMP = vld1q_u32(&K[32]);
380
151k
    TMP0A = vaddq_u32(MSG0A, TMP);
381
151k
    TMP0B = vaddq_u32(MSG0B, TMP);
382
151k
    TMP2A = STATE0A;
383
151k
    TMP2B = STATE0B;
384
151k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
385
151k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
386
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
387
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
388
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
389
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
390
151k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
391
151k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
392
393
    // Transform 1: Rounds 37-40
394
151k
    TMP = vld1q_u32(&K[36]);
395
151k
    TMP0A = vaddq_u32(MSG1A, TMP);
396
151k
    TMP0B = vaddq_u32(MSG1B, TMP);
397
151k
    TMP2A = STATE0A;
398
151k
    TMP2B = STATE0B;
399
151k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
400
151k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
401
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
402
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
403
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
404
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
405
151k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
406
151k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
407
408
    // Transform 1: Rounds 41-44
409
151k
    TMP = vld1q_u32(&K[40]);
410
151k
    TMP0A = vaddq_u32(MSG2A, TMP);
411
151k
    TMP0B = vaddq_u32(MSG2B, TMP);
412
151k
    TMP2A = STATE0A;
413
151k
    TMP2B = STATE0B;
414
151k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
415
151k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
416
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
417
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
418
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
419
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
420
151k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
421
151k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
422
423
    // Transform 1: Rounds 45-48
424
151k
    TMP = vld1q_u32(&K[44]);
425
151k
    TMP0A = vaddq_u32(MSG3A, TMP);
426
151k
    TMP0B = vaddq_u32(MSG3B, TMP);
427
151k
    TMP2A = STATE0A;
428
151k
    TMP2B = STATE0B;
429
151k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
430
151k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
431
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
432
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
433
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
434
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
435
151k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
436
151k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
437
438
    // Transform 1: Rounds 49-52
439
151k
    TMP = vld1q_u32(&K[48]);
440
151k
    TMP0A = vaddq_u32(MSG0A, TMP);
441
151k
    TMP0B = vaddq_u32(MSG0B, TMP);
442
151k
    TMP2A = STATE0A;
443
151k
    TMP2B = STATE0B;
444
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
445
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
446
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
447
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
448
449
    // Transform 1: Rounds 53-56
450
151k
    TMP = vld1q_u32(&K[52]);
451
151k
    TMP0A = vaddq_u32(MSG1A, TMP);
452
151k
    TMP0B = vaddq_u32(MSG1B, TMP);
453
151k
    TMP2A = STATE0A;
454
151k
    TMP2B = STATE0B;
455
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
456
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
457
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
458
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
459
460
    // Transform 1: Rounds 57-60
461
151k
    TMP = vld1q_u32(&K[56]);
462
151k
    TMP0A = vaddq_u32(MSG2A, TMP);
463
151k
    TMP0B = vaddq_u32(MSG2B, TMP);
464
151k
    TMP2A = STATE0A;
465
151k
    TMP2B = STATE0B;
466
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
467
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
468
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
469
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
470
471
    // Transform 1: Rounds 61-64
472
151k
    TMP = vld1q_u32(&K[60]);
473
151k
    TMP0A = vaddq_u32(MSG3A, TMP);
474
151k
    TMP0B = vaddq_u32(MSG3B, TMP);
475
151k
    TMP2A = STATE0A;
476
151k
    TMP2B = STATE0B;
477
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
478
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
479
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
480
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
481
482
    // Transform 1: Update state
483
151k
    TMP = vld1q_u32(&INIT[0]);
484
151k
    STATE0A = vaddq_u32(STATE0A, TMP);
485
151k
    STATE0B = vaddq_u32(STATE0B, TMP);
486
151k
    TMP = vld1q_u32(&INIT[4]);
487
151k
    STATE1A = vaddq_u32(STATE1A, TMP);
488
151k
    STATE1B = vaddq_u32(STATE1B, TMP);
489
490
    // Transform 2: Save state
491
151k
    ABEF_SAVEA = STATE0A;
492
151k
    ABEF_SAVEB = STATE0B;
493
151k
    CDGH_SAVEA = STATE1A;
494
151k
    CDGH_SAVEB = STATE1B;
495
496
    // Transform 2: Rounds 1-4
497
151k
    TMP = vld1q_u32(&MIDS[0]);
498
151k
    TMP2A = STATE0A;
499
151k
    TMP2B = STATE0B;
500
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
501
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
502
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
503
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
504
505
    // Transform 2: Rounds 5-8
506
151k
    TMP = vld1q_u32(&MIDS[4]);
507
151k
    TMP2A = STATE0A;
508
151k
    TMP2B = STATE0B;
509
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
510
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
511
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
512
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
513
514
    // Transform 2: Rounds 9-12
515
151k
    TMP = vld1q_u32(&MIDS[8]);
516
151k
    TMP2A = STATE0A;
517
151k
    TMP2B = STATE0B;
518
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
519
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
520
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
521
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
522
523
    // Transform 2: Rounds 13-16
524
151k
    TMP = vld1q_u32(&MIDS[12]);
525
151k
    TMP2A = STATE0A;
526
151k
    TMP2B = STATE0B;
527
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
528
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
529
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
530
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
531
532
    // Transform 2: Rounds 17-20
533
151k
    TMP = vld1q_u32(&MIDS[16]);
534
151k
    TMP2A = STATE0A;
535
151k
    TMP2B = STATE0B;
536
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
537
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
538
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
539
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
540
541
    // Transform 2: Rounds 21-24
542
151k
    TMP = vld1q_u32(&MIDS[20]);
543
151k
    TMP2A = STATE0A;
544
151k
    TMP2B = STATE0B;
545
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
546
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
547
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
548
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
549
550
    // Transform 2: Rounds 25-28
551
151k
    TMP = vld1q_u32(&MIDS[24]);
552
151k
    TMP2A = STATE0A;
553
151k
    TMP2B = STATE0B;
554
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
555
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
556
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
557
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
558
559
    // Transform 2: Rounds 29-32
560
151k
    TMP = vld1q_u32(&MIDS[28]);
561
151k
    TMP2A = STATE0A;
562
151k
    TMP2B = STATE0B;
563
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
564
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
565
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
566
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
567
568
    // Transform 2: Rounds 33-36
569
151k
    TMP = vld1q_u32(&MIDS[32]);
570
151k
    TMP2A = STATE0A;
571
151k
    TMP2B = STATE0B;
572
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
573
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
574
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
575
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
576
577
    // Transform 2: Rounds 37-40
578
151k
    TMP = vld1q_u32(&MIDS[36]);
579
151k
    TMP2A = STATE0A;
580
151k
    TMP2B = STATE0B;
581
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
582
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
583
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
584
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
585
586
    // Transform 2: Rounds 41-44
587
151k
    TMP = vld1q_u32(&MIDS[40]);
588
151k
    TMP2A = STATE0A;
589
151k
    TMP2B = STATE0B;
590
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
591
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
592
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
593
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
594
595
    // Transform 2: Rounds 45-48
596
151k
    TMP = vld1q_u32(&MIDS[44]);
597
151k
    TMP2A = STATE0A;
598
151k
    TMP2B = STATE0B;
599
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
600
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
601
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
602
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
603
604
    // Transform 2: Rounds 49-52
605
151k
    TMP = vld1q_u32(&MIDS[48]);
606
151k
    TMP2A = STATE0A;
607
151k
    TMP2B = STATE0B;
608
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
609
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
610
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
611
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
612
613
    // Transform 2: Rounds 53-56
614
151k
    TMP = vld1q_u32(&MIDS[52]);
615
151k
    TMP2A = STATE0A;
616
151k
    TMP2B = STATE0B;
617
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
618
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
619
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
620
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
621
622
    // Transform 2: Rounds 57-60
623
151k
    TMP = vld1q_u32(&MIDS[56]);
624
151k
    TMP2A = STATE0A;
625
151k
    TMP2B = STATE0B;
626
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
627
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
628
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
629
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
630
631
    // Transform 2: Rounds 61-64
632
151k
    TMP = vld1q_u32(&MIDS[60]);
633
151k
    TMP2A = STATE0A;
634
151k
    TMP2B = STATE0B;
635
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
636
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
637
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
638
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
639
640
    // Transform 2: Update state
641
151k
    STATE0A = vaddq_u32(STATE0A, ABEF_SAVEA);
642
151k
    STATE0B = vaddq_u32(STATE0B, ABEF_SAVEB);
643
151k
    STATE1A = vaddq_u32(STATE1A, CDGH_SAVEA);
644
151k
    STATE1B = vaddq_u32(STATE1B, CDGH_SAVEB);
645
646
    // Transform 3: Pad previous output
647
151k
    MSG0A = STATE0A;
648
151k
    MSG0B = STATE0B;
649
151k
    MSG1A = STATE1A;
650
151k
    MSG1B = STATE1B;
651
151k
    MSG2A = vld1q_u32(&FINAL[0]);
652
151k
    MSG2B = MSG2A;
653
151k
    MSG3A = vld1q_u32(&FINAL[4]);
654
151k
    MSG3B = MSG3A;
655
656
    // Transform 3: Load state
657
151k
    STATE0A = vld1q_u32(&INIT[0]);
658
151k
    STATE0B = STATE0A;
659
151k
    STATE1A = vld1q_u32(&INIT[4]);
660
151k
    STATE1B = STATE1A;
661
662
    // Transform 3: Rounds 1-4
663
151k
    TMP = vld1q_u32(&K[0]);
664
151k
    TMP0A = vaddq_u32(MSG0A, TMP);
665
151k
    TMP0B = vaddq_u32(MSG0B, TMP);
666
151k
    TMP2A = STATE0A;
667
151k
    TMP2B = STATE0B;
668
151k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
669
151k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
670
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
671
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
672
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
673
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
674
151k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
675
151k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
676
677
    // Transform 3: Rounds 5-8
678
151k
    TMP = vld1q_u32(&K[4]);
679
151k
    TMP0A = vaddq_u32(MSG1A, TMP);
680
151k
    TMP0B = vaddq_u32(MSG1B, TMP);
681
151k
    TMP2A = STATE0A;
682
151k
    TMP2B = STATE0B;
683
151k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
684
151k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
685
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
686
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
687
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
688
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
689
151k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
690
151k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
691
692
    // Transform 3: Rounds 9-12
693
151k
    TMP = vld1q_u32(&FINS[0]);
694
151k
    TMP2A = STATE0A;
695
151k
    TMP2B = STATE0B;
696
151k
    MSG2A = vld1q_u32(&FINS[4]);
697
151k
    MSG2B = MSG2A;
698
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
699
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
700
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
701
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
702
151k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
703
151k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
704
705
    // Transform 3: Rounds 13-16
706
151k
    TMP = vld1q_u32(&FINS[8]);
707
151k
    TMP2A = STATE0A;
708
151k
    TMP2B = STATE0B;
709
151k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
710
151k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
711
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
712
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
713
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
714
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
715
151k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
716
151k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
717
718
    // Transform 3: Rounds 17-20
719
151k
    TMP = vld1q_u32(&K[16]);
720
151k
    TMP0A = vaddq_u32(MSG0A, TMP);
721
151k
    TMP0B = vaddq_u32(MSG0B, TMP);
722
151k
    TMP2A = STATE0A;
723
151k
    TMP2B = STATE0B;
724
151k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
725
151k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
726
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
727
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
728
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
729
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
730
151k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
731
151k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
732
733
    // Transform 3: Rounds 21-24
734
151k
    TMP = vld1q_u32(&K[20]);
735
151k
    TMP0A = vaddq_u32(MSG1A, TMP);
736
151k
    TMP0B = vaddq_u32(MSG1B, TMP);
737
151k
    TMP2A = STATE0A;
738
151k
    TMP2B = STATE0B;
739
151k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
740
151k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
741
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
742
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
743
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
744
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
745
151k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
746
151k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
747
748
    // Transform 3: Rounds 25-28
749
151k
    TMP = vld1q_u32(&K[24]);
750
151k
    TMP0A = vaddq_u32(MSG2A, TMP);
751
151k
    TMP0B = vaddq_u32(MSG2B, TMP);
752
151k
    TMP2A = STATE0A;
753
151k
    TMP2B = STATE0B;
754
151k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
755
151k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
756
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
757
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
758
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
759
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
760
151k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
761
151k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
762
763
    // Transform 3: Rounds 29-32
764
151k
    TMP = vld1q_u32(&K[28]);
765
151k
    TMP0A = vaddq_u32(MSG3A, TMP);
766
151k
    TMP0B = vaddq_u32(MSG3B, TMP);
767
151k
    TMP2A = STATE0A;
768
151k
    TMP2B = STATE0B;
769
151k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
770
151k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
771
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
772
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
773
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
774
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
775
151k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
776
151k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
777
778
    // Transform 3: Rounds 33-36
779
151k
    TMP = vld1q_u32(&K[32]);
780
151k
    TMP0A = vaddq_u32(MSG0A, TMP);
781
151k
    TMP0B = vaddq_u32(MSG0B, TMP);
782
151k
    TMP2A = STATE0A;
783
151k
    TMP2B = STATE0B;
784
151k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
785
151k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
786
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
787
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
788
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
789
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
790
151k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
791
151k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
792
793
    // Transform 3: Rounds 37-40
794
151k
    TMP = vld1q_u32(&K[36]);
795
151k
    TMP0A = vaddq_u32(MSG1A, TMP);
796
151k
    TMP0B = vaddq_u32(MSG1B, TMP);
797
151k
    TMP2A = STATE0A;
798
151k
    TMP2B = STATE0B;
799
151k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
800
151k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
801
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
802
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
803
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
804
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
805
151k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
806
151k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
807
808
    // Transform 3: Rounds 41-44
809
151k
    TMP = vld1q_u32(&K[40]);
810
151k
    TMP0A = vaddq_u32(MSG2A, TMP);
811
151k
    TMP0B = vaddq_u32(MSG2B, TMP);
812
151k
    TMP2A = STATE0A;
813
151k
    TMP2B = STATE0B;
814
151k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
815
151k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
816
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
817
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
818
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
819
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
820
151k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
821
151k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
822
823
    // Transform 3: Rounds 45-48
824
151k
    TMP = vld1q_u32(&K[44]);
825
151k
    TMP0A = vaddq_u32(MSG3A, TMP);
826
151k
    TMP0B = vaddq_u32(MSG3B, TMP);
827
151k
    TMP2A = STATE0A;
828
151k
    TMP2B = STATE0B;
829
151k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
830
151k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
831
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
832
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
833
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
834
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
835
151k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
836
151k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
837
838
    // Transform 3: Rounds 49-52
839
151k
    TMP = vld1q_u32(&K[48]);
840
151k
    TMP0A = vaddq_u32(MSG0A, TMP);
841
151k
    TMP0B = vaddq_u32(MSG0B, TMP);
842
151k
    TMP2A = STATE0A;
843
151k
    TMP2B = STATE0B;
844
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
845
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
846
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
847
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
848
849
    // Transform 3: Rounds 53-56
850
151k
    TMP = vld1q_u32(&K[52]);
851
151k
    TMP0A = vaddq_u32(MSG1A, TMP);
852
151k
    TMP0B = vaddq_u32(MSG1B, TMP);
853
151k
    TMP2A = STATE0A;
854
151k
    TMP2B = STATE0B;
855
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
856
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
857
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
858
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
859
860
    // Transform 3: Rounds 57-60
861
151k
    TMP = vld1q_u32(&K[56]);
862
151k
    TMP0A = vaddq_u32(MSG2A, TMP);
863
151k
    TMP0B = vaddq_u32(MSG2B, TMP);
864
151k
    TMP2A = STATE0A;
865
151k
    TMP2B = STATE0B;
866
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
867
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
868
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
869
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
870
871
    // Transform 3: Rounds 61-64
872
151k
    TMP = vld1q_u32(&K[60]);
873
151k
    TMP0A = vaddq_u32(MSG3A, TMP);
874
151k
    TMP0B = vaddq_u32(MSG3B, TMP);
875
151k
    TMP2A = STATE0A;
876
151k
    TMP2B = STATE0B;
877
151k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
878
151k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
879
151k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
880
151k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
881
882
    // Transform 3: Update state
883
151k
    TMP = vld1q_u32(&INIT[0]);
884
151k
    STATE0A = vaddq_u32(STATE0A, TMP);
885
151k
    STATE0B = vaddq_u32(STATE0B, TMP);
886
151k
    TMP = vld1q_u32(&INIT[4]);
887
151k
    STATE1A = vaddq_u32(STATE1A, TMP);
888
151k
    STATE1B = vaddq_u32(STATE1B, TMP);
889
890
    // Store result
891
151k
    vst1q_u8(output, vrev32q_u8(vreinterpretq_u8_u32(STATE0A)));
892
151k
    vst1q_u8(output + 16, vrev32q_u8(vreinterpretq_u8_u32(STATE1A)));
893
151k
    vst1q_u8(output + 32, vrev32q_u8(vreinterpretq_u8_u32(STATE0B)));
894
151k
    vst1q_u8(output + 48, vrev32q_u8(vreinterpretq_u8_u32(STATE1B)));
895
151k
}
896
}
897
898
#endif