Coverage Report

Created: 2026-06-16 16:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/tmp/bitcoin/src/crypto/sha256_arm_shani.cpp
Line
Count
Source
1
// Copyright (c) 2022-present The Bitcoin Core developers
2
// Distributed under the MIT software license, see the accompanying
3
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4
//
5
// Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-arm.c,
6
// Written and placed in public domain by Jeffrey Walton.
7
// Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and
8
// Barry O'Rourke for the mbedTLS project.
9
// Variant specialized for 64-byte inputs added by Pieter Wuille.
10
11
#ifdef ENABLE_ARM_SHANI
12
13
#include <array>
14
#include <cstdint>
15
#include <cstddef>
16
#include <arm_neon.h>
17
18
namespace {
19
alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> K =
20
{
21
    0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
22
    0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
23
    0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
24
    0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
25
    0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
26
    0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
27
    0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
28
    0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
29
    0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
30
    0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
31
    0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
32
    0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
33
    0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
34
    0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
35
    0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
36
    0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
37
};
38
}
39
40
namespace sha256_arm_shani {
41
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
42
52.7M
{
43
52.7M
    uint32x4_t STATE0, STATE1, ABCD_SAVE, EFGH_SAVE;
44
52.7M
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
45
52.7M
    uint32x4_t TMP0, TMP2;
46
47
    // Load state
48
52.7M
    STATE0 = vld1q_u32(&s[0]);
49
52.7M
    STATE1 = vld1q_u32(&s[4]);
50
51
225M
    while (blocks--)
52
172M
    {
53
        // Save state
54
172M
        ABCD_SAVE = STATE0;
55
172M
        EFGH_SAVE = STATE1;
56
57
        // Load and convert input chunk to Big Endian
58
172M
        MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 0)));
59
172M
        MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 16)));
60
172M
        MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 32)));
61
172M
        MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 48)));
62
172M
        chunk += 64;
63
64
        // Original implementation preloaded message and constant addition which was 1-3% slower.
65
        // Now included as first step in quad round code saving one Q Neon register
66
        // "TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));"
67
68
        // Rounds 1-4
69
172M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));
70
172M
        TMP2 = STATE0;
71
172M
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
72
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
73
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
74
172M
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
75
76
        // Rounds 5-8
77
172M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[4]));
78
172M
        TMP2 = STATE0;
79
172M
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
80
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
81
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
82
172M
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
83
84
        // Rounds 9-12
85
172M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[8]));
86
172M
        TMP2 = STATE0;
87
172M
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
88
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
89
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
90
172M
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
91
92
        // Rounds 13-16
93
172M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[12]));
94
172M
        TMP2 = STATE0;
95
172M
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
96
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
97
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
98
172M
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
99
100
        // Rounds 17-20
101
172M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[16]));
102
172M
        TMP2 = STATE0;
103
172M
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
104
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
105
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
106
172M
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
107
108
        // Rounds 21-24
109
172M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[20]));
110
172M
        TMP2 = STATE0;
111
172M
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
112
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
113
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
114
172M
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
115
116
        // Rounds 25-28
117
172M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[24]));
118
172M
        TMP2 = STATE0;
119
172M
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
120
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
121
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
122
172M
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
123
124
        // Rounds 29-32
125
172M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[28]));
126
172M
        TMP2 = STATE0;
127
172M
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
128
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
129
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
130
172M
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
131
132
        // Rounds 33-36
133
172M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[32]));
134
172M
        TMP2 = STATE0;
135
172M
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
136
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
137
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
138
172M
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
139
140
        // Rounds 37-40
141
172M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[36]));
142
172M
        TMP2 = STATE0;
143
172M
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
144
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
145
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
146
172M
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
147
148
        // Rounds 41-44
149
172M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[40]));
150
172M
        TMP2 = STATE0;
151
172M
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
152
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
153
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
154
172M
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
155
156
        // Rounds 45-48
157
172M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[44]));
158
172M
        TMP2 = STATE0;
159
172M
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
160
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
161
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
162
172M
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
163
164
        // Rounds 49-52
165
172M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[48]));
166
172M
        TMP2 = STATE0;
167
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
168
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
169
170
        // Rounds 53-56
171
172M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[52]));
172
172M
        TMP2 = STATE0;
173
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
174
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
175
176
        // Rounds 57-60
177
172M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[56]));
178
172M
        TMP2 = STATE0;
179
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
180
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
181
182
        // Rounds 61-64
183
172M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[60]));
184
172M
        TMP2 = STATE0;
185
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
186
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
187
188
        // Update state
189
172M
        STATE0 = vaddq_u32(STATE0, ABCD_SAVE);
190
172M
        STATE1 = vaddq_u32(STATE1, EFGH_SAVE);
191
172M
    }
192
193
    // Save final state
194
52.7M
    vst1q_u32(&s[0], STATE0);
195
52.7M
    vst1q_u32(&s[4], STATE1);
196
52.7M
}
197
}
198
199
namespace sha256d64_arm_shani {
200
void Transform_2way(unsigned char* output, const unsigned char* input)
201
199k
{
202
    /* Initial state. */
203
199k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> INIT = {
204
199k
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
205
199k
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
206
199k
    };
207
208
    /* Precomputed message schedule for the 2nd transform. */
209
199k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> MIDS = {
210
199k
        0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
211
199k
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
212
199k
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
213
199k
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374,
214
199k
        0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254,
215
199k
        0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa,
216
199k
        0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7,
217
199k
        0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0,
218
199k
        0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd,
219
199k
        0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16,
220
199k
        0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537,
221
199k
        0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37,
222
199k
        0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7,
223
199k
        0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890,
224
199k
        0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c,
225
199k
        0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76
226
199k
    };
227
228
    /* A few precomputed message schedule values for the 3rd transform. */
229
199k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 12> FINS = {
230
199k
        0x5807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
231
199k
        0x80000000, 0x00000000, 0x00000000, 0x00000000,
232
199k
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf274
233
199k
    };
234
235
    /* Padding processed in the 3rd transform (byteswapped). */
236
199k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> FINAL = {0x80000000, 0, 0, 0, 0, 0, 0, 0x100};
237
238
199k
    uint32x4_t STATE0A, STATE0B, STATE1A, STATE1B, ABCD_SAVEA, ABCD_SAVEB, EFGH_SAVEA, EFGH_SAVEB;
239
199k
    uint32x4_t MSG0A, MSG0B, MSG1A, MSG1B, MSG2A, MSG2B, MSG3A, MSG3B;
240
199k
    uint32x4_t TMP0A, TMP0B, TMP2A, TMP2B, TMP;
241
242
    // Transform 1: Load state
243
199k
    STATE0A = vld1q_u32(&INIT[0]);
244
199k
    STATE0B = STATE0A;
245
199k
    STATE1A = vld1q_u32(&INIT[4]);
246
199k
    STATE1B = STATE1A;
247
248
    // Transform 1: Load and convert input chunk to Big Endian
249
199k
    MSG0A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 0)));
250
199k
    MSG1A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16)));
251
199k
    MSG2A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32)));
252
199k
    MSG3A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48)));
253
199k
    MSG0B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 64)));
254
199k
    MSG1B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 80)));
255
199k
    MSG2B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 96)));
256
199k
    MSG3B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 112)));
257
258
    // Transform 1: Rounds 1-4
259
199k
    TMP = vld1q_u32(&K[0]);
260
199k
    TMP0A = vaddq_u32(MSG0A, TMP);
261
199k
    TMP0B = vaddq_u32(MSG0B, TMP);
262
199k
    TMP2A = STATE0A;
263
199k
    TMP2B = STATE0B;
264
199k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
265
199k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
266
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
267
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
268
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
269
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
270
199k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
271
199k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
272
273
    // Transform 1: Rounds 5-8
274
199k
    TMP = vld1q_u32(&K[4]);
275
199k
    TMP0A = vaddq_u32(MSG1A, TMP);
276
199k
    TMP0B = vaddq_u32(MSG1B, TMP);
277
199k
    TMP2A = STATE0A;
278
199k
    TMP2B = STATE0B;
279
199k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
280
199k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
281
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
282
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
283
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
284
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
285
199k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
286
199k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
287
288
    // Transform 1: Rounds 9-12
289
199k
    TMP = vld1q_u32(&K[8]);
290
199k
    TMP0A = vaddq_u32(MSG2A, TMP);
291
199k
    TMP0B = vaddq_u32(MSG2B, TMP);
292
199k
    TMP2A = STATE0A;
293
199k
    TMP2B = STATE0B;
294
199k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
295
199k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
296
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
297
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
298
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
299
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
300
199k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
301
199k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
302
303
    // Transform 1: Rounds 13-16
304
199k
    TMP = vld1q_u32(&K[12]);
305
199k
    TMP0A = vaddq_u32(MSG3A, TMP);
306
199k
    TMP0B = vaddq_u32(MSG3B, TMP);
307
199k
    TMP2A = STATE0A;
308
199k
    TMP2B = STATE0B;
309
199k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
310
199k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
311
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
312
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
313
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
314
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
315
199k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
316
199k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
317
318
    // Transform 1: Rounds 17-20
319
199k
    TMP = vld1q_u32(&K[16]);
320
199k
    TMP0A = vaddq_u32(MSG0A, TMP);
321
199k
    TMP0B = vaddq_u32(MSG0B, TMP);
322
199k
    TMP2A = STATE0A;
323
199k
    TMP2B = STATE0B;
324
199k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
325
199k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
326
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
327
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
328
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
329
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
330
199k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
331
199k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
332
333
    // Transform 1: Rounds 21-24
334
199k
    TMP = vld1q_u32(&K[20]);
335
199k
    TMP0A = vaddq_u32(MSG1A, TMP);
336
199k
    TMP0B = vaddq_u32(MSG1B, TMP);
337
199k
    TMP2A = STATE0A;
338
199k
    TMP2B = STATE0B;
339
199k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
340
199k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
341
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
342
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
343
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
344
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
345
199k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
346
199k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
347
348
    // Transform 1: Rounds 25-28
349
199k
    TMP = vld1q_u32(&K[24]);
350
199k
    TMP0A = vaddq_u32(MSG2A, TMP);
351
199k
    TMP0B = vaddq_u32(MSG2B, TMP);
352
199k
    TMP2A = STATE0A;
353
199k
    TMP2B = STATE0B;
354
199k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
355
199k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
356
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
357
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
358
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
359
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
360
199k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
361
199k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
362
363
    // Transform 1: Rounds 29-32
364
199k
    TMP = vld1q_u32(&K[28]);
365
199k
    TMP0A = vaddq_u32(MSG3A, TMP);
366
199k
    TMP0B = vaddq_u32(MSG3B, TMP);
367
199k
    TMP2A = STATE0A;
368
199k
    TMP2B = STATE0B;
369
199k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
370
199k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
371
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
372
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
373
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
374
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
375
199k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
376
199k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
377
378
    // Transform 1: Rounds 33-36
379
199k
    TMP = vld1q_u32(&K[32]);
380
199k
    TMP0A = vaddq_u32(MSG0A, TMP);
381
199k
    TMP0B = vaddq_u32(MSG0B, TMP);
382
199k
    TMP2A = STATE0A;
383
199k
    TMP2B = STATE0B;
384
199k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
385
199k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
386
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
387
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
388
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
389
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
390
199k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
391
199k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
392
393
    // Transform 1: Rounds 37-40
394
199k
    TMP = vld1q_u32(&K[36]);
395
199k
    TMP0A = vaddq_u32(MSG1A, TMP);
396
199k
    TMP0B = vaddq_u32(MSG1B, TMP);
397
199k
    TMP2A = STATE0A;
398
199k
    TMP2B = STATE0B;
399
199k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
400
199k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
401
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
402
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
403
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
404
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
405
199k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
406
199k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
407
408
    // Transform 1: Rounds 41-44
409
199k
    TMP = vld1q_u32(&K[40]);
410
199k
    TMP0A = vaddq_u32(MSG2A, TMP);
411
199k
    TMP0B = vaddq_u32(MSG2B, TMP);
412
199k
    TMP2A = STATE0A;
413
199k
    TMP2B = STATE0B;
414
199k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
415
199k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
416
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
417
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
418
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
419
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
420
199k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
421
199k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
422
423
    // Transform 1: Rounds 45-48
424
199k
    TMP = vld1q_u32(&K[44]);
425
199k
    TMP0A = vaddq_u32(MSG3A, TMP);
426
199k
    TMP0B = vaddq_u32(MSG3B, TMP);
427
199k
    TMP2A = STATE0A;
428
199k
    TMP2B = STATE0B;
429
199k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
430
199k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
431
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
432
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
433
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
434
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
435
199k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
436
199k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
437
438
    // Transform 1: Rounds 49-52
439
199k
    TMP = vld1q_u32(&K[48]);
440
199k
    TMP0A = vaddq_u32(MSG0A, TMP);
441
199k
    TMP0B = vaddq_u32(MSG0B, TMP);
442
199k
    TMP2A = STATE0A;
443
199k
    TMP2B = STATE0B;
444
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
445
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
446
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
447
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
448
449
    // Transform 1: Rounds 53-56
450
199k
    TMP = vld1q_u32(&K[52]);
451
199k
    TMP0A = vaddq_u32(MSG1A, TMP);
452
199k
    TMP0B = vaddq_u32(MSG1B, TMP);
453
199k
    TMP2A = STATE0A;
454
199k
    TMP2B = STATE0B;
455
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
456
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
457
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
458
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
459
460
    // Transform 1: Rounds 57-60
461
199k
    TMP = vld1q_u32(&K[56]);
462
199k
    TMP0A = vaddq_u32(MSG2A, TMP);
463
199k
    TMP0B = vaddq_u32(MSG2B, TMP);
464
199k
    TMP2A = STATE0A;
465
199k
    TMP2B = STATE0B;
466
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
467
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
468
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
469
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
470
471
    // Transform 1: Rounds 61-64
472
199k
    TMP = vld1q_u32(&K[60]);
473
199k
    TMP0A = vaddq_u32(MSG3A, TMP);
474
199k
    TMP0B = vaddq_u32(MSG3B, TMP);
475
199k
    TMP2A = STATE0A;
476
199k
    TMP2B = STATE0B;
477
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
478
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
479
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
480
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
481
482
    // Transform 1: Update state
483
199k
    TMP = vld1q_u32(&INIT[0]);
484
199k
    STATE0A = vaddq_u32(STATE0A, TMP);
485
199k
    STATE0B = vaddq_u32(STATE0B, TMP);
486
199k
    TMP = vld1q_u32(&INIT[4]);
487
199k
    STATE1A = vaddq_u32(STATE1A, TMP);
488
199k
    STATE1B = vaddq_u32(STATE1B, TMP);
489
490
    // Transform 2: Save state
491
199k
    ABCD_SAVEA = STATE0A;
492
199k
    ABCD_SAVEB = STATE0B;
493
199k
    EFGH_SAVEA = STATE1A;
494
199k
    EFGH_SAVEB = STATE1B;
495
496
    // Transform 2: Rounds 1-4
497
199k
    TMP = vld1q_u32(&MIDS[0]);
498
199k
    TMP2A = STATE0A;
499
199k
    TMP2B = STATE0B;
500
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
501
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
502
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
503
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
504
505
    // Transform 2: Rounds 5-8
506
199k
    TMP = vld1q_u32(&MIDS[4]);
507
199k
    TMP2A = STATE0A;
508
199k
    TMP2B = STATE0B;
509
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
510
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
511
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
512
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
513
514
    // Transform 2: Rounds 9-12
515
199k
    TMP = vld1q_u32(&MIDS[8]);
516
199k
    TMP2A = STATE0A;
517
199k
    TMP2B = STATE0B;
518
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
519
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
520
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
521
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
522
523
    // Transform 2: Rounds 13-16
524
199k
    TMP = vld1q_u32(&MIDS[12]);
525
199k
    TMP2A = STATE0A;
526
199k
    TMP2B = STATE0B;
527
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
528
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
529
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
530
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
531
532
    // Transform 2: Rounds 17-20
533
199k
    TMP = vld1q_u32(&MIDS[16]);
534
199k
    TMP2A = STATE0A;
535
199k
    TMP2B = STATE0B;
536
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
537
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
538
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
539
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
540
541
    // Transform 2: Rounds 21-24
542
199k
    TMP = vld1q_u32(&MIDS[20]);
543
199k
    TMP2A = STATE0A;
544
199k
    TMP2B = STATE0B;
545
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
546
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
547
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
548
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
549
550
    // Transform 2: Rounds 25-28
551
199k
    TMP = vld1q_u32(&MIDS[24]);
552
199k
    TMP2A = STATE0A;
553
199k
    TMP2B = STATE0B;
554
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
555
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
556
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
557
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
558
559
    // Transform 2: Rounds 29-32
560
199k
    TMP = vld1q_u32(&MIDS[28]);
561
199k
    TMP2A = STATE0A;
562
199k
    TMP2B = STATE0B;
563
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
564
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
565
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
566
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
567
568
    // Transform 2: Rounds 33-36
569
199k
    TMP = vld1q_u32(&MIDS[32]);
570
199k
    TMP2A = STATE0A;
571
199k
    TMP2B = STATE0B;
572
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
573
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
574
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
575
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
576
577
    // Transform 2: Rounds 37-40
578
199k
    TMP = vld1q_u32(&MIDS[36]);
579
199k
    TMP2A = STATE0A;
580
199k
    TMP2B = STATE0B;
581
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
582
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
583
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
584
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
585
586
    // Transform 2: Rounds 41-44
587
199k
    TMP = vld1q_u32(&MIDS[40]);
588
199k
    TMP2A = STATE0A;
589
199k
    TMP2B = STATE0B;
590
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
591
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
592
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
593
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
594
595
    // Transform 2: Rounds 45-48
596
199k
    TMP = vld1q_u32(&MIDS[44]);
597
199k
    TMP2A = STATE0A;
598
199k
    TMP2B = STATE0B;
599
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
600
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
601
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
602
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
603
604
    // Transform 2: Rounds 49-52
605
199k
    TMP = vld1q_u32(&MIDS[48]);
606
199k
    TMP2A = STATE0A;
607
199k
    TMP2B = STATE0B;
608
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
609
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
610
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
611
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
612
613
    // Transform 2: Rounds 53-56
614
199k
    TMP = vld1q_u32(&MIDS[52]);
615
199k
    TMP2A = STATE0A;
616
199k
    TMP2B = STATE0B;
617
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
618
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
619
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
620
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
621
622
    // Transform 2: Rounds 57-60
623
199k
    TMP = vld1q_u32(&MIDS[56]);
624
199k
    TMP2A = STATE0A;
625
199k
    TMP2B = STATE0B;
626
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
627
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
628
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
629
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
630
631
    // Transform 2: Rounds 61-64
632
199k
    TMP = vld1q_u32(&MIDS[60]);
633
199k
    TMP2A = STATE0A;
634
199k
    TMP2B = STATE0B;
635
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
636
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
637
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
638
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
639
640
    // Transform 2: Update state
641
199k
    STATE0A = vaddq_u32(STATE0A, ABCD_SAVEA);
642
199k
    STATE0B = vaddq_u32(STATE0B, ABCD_SAVEB);
643
199k
    STATE1A = vaddq_u32(STATE1A, EFGH_SAVEA);
644
199k
    STATE1B = vaddq_u32(STATE1B, EFGH_SAVEB);
645
646
    // Transform 3: Pad previous output
647
199k
    MSG0A = STATE0A;
648
199k
    MSG0B = STATE0B;
649
199k
    MSG1A = STATE1A;
650
199k
    MSG1B = STATE1B;
651
199k
    MSG2A = vld1q_u32(&FINAL[0]);
652
199k
    MSG2B = MSG2A;
653
199k
    MSG3A = vld1q_u32(&FINAL[4]);
654
199k
    MSG3B = MSG3A;
655
656
    // Transform 3: Load state
657
199k
    STATE0A = vld1q_u32(&INIT[0]);
658
199k
    STATE0B = STATE0A;
659
199k
    STATE1A = vld1q_u32(&INIT[4]);
660
199k
    STATE1B = STATE1A;
661
662
    // Transform 3: Rounds 1-4
663
199k
    TMP = vld1q_u32(&K[0]);
664
199k
    TMP0A = vaddq_u32(MSG0A, TMP);
665
199k
    TMP0B = vaddq_u32(MSG0B, TMP);
666
199k
    TMP2A = STATE0A;
667
199k
    TMP2B = STATE0B;
668
199k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
669
199k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
670
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
671
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
672
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
673
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
674
199k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
675
199k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
676
677
    // Transform 3: Rounds 5-8
678
199k
    TMP = vld1q_u32(&K[4]);
679
199k
    TMP0A = vaddq_u32(MSG1A, TMP);
680
199k
    TMP0B = vaddq_u32(MSG1B, TMP);
681
199k
    TMP2A = STATE0A;
682
199k
    TMP2B = STATE0B;
683
199k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
684
199k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
685
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
686
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
687
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
688
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
689
199k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
690
199k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
691
692
    // Transform 3: Rounds 9-12
693
199k
    TMP = vld1q_u32(&FINS[0]);
694
199k
    TMP2A = STATE0A;
695
199k
    TMP2B = STATE0B;
696
199k
    MSG2A = vld1q_u32(&FINS[4]);
697
199k
    MSG2B = MSG2A;
698
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
699
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
700
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
701
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
702
199k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
703
199k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
704
705
    // Transform 3: Rounds 13-16
706
199k
    TMP = vld1q_u32(&FINS[8]);
707
199k
    TMP2A = STATE0A;
708
199k
    TMP2B = STATE0B;
709
199k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
710
199k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
711
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
712
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
713
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
714
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
715
199k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
716
199k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
717
718
    // Transform 3: Rounds 17-20
719
199k
    TMP = vld1q_u32(&K[16]);
720
199k
    TMP0A = vaddq_u32(MSG0A, TMP);
721
199k
    TMP0B = vaddq_u32(MSG0B, TMP);
722
199k
    TMP2A = STATE0A;
723
199k
    TMP2B = STATE0B;
724
199k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
725
199k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
726
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
727
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
728
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
729
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
730
199k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
731
199k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
732
733
    // Transform 3: Rounds 21-24
734
199k
    TMP = vld1q_u32(&K[20]);
735
199k
    TMP0A = vaddq_u32(MSG1A, TMP);
736
199k
    TMP0B = vaddq_u32(MSG1B, TMP);
737
199k
    TMP2A = STATE0A;
738
199k
    TMP2B = STATE0B;
739
199k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
740
199k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
741
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
742
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
743
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
744
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
745
199k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
746
199k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
747
748
    // Transform 3: Rounds 25-28
749
199k
    TMP = vld1q_u32(&K[24]);
750
199k
    TMP0A = vaddq_u32(MSG2A, TMP);
751
199k
    TMP0B = vaddq_u32(MSG2B, TMP);
752
199k
    TMP2A = STATE0A;
753
199k
    TMP2B = STATE0B;
754
199k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
755
199k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
756
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
757
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
758
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
759
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
760
199k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
761
199k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
762
763
    // Transform 3: Rounds 29-32
764
199k
    TMP = vld1q_u32(&K[28]);
765
199k
    TMP0A = vaddq_u32(MSG3A, TMP);
766
199k
    TMP0B = vaddq_u32(MSG3B, TMP);
767
199k
    TMP2A = STATE0A;
768
199k
    TMP2B = STATE0B;
769
199k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
770
199k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
771
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
772
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
773
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
774
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
775
199k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
776
199k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
777
778
    // Transform 3: Rounds 33-36
779
199k
    TMP = vld1q_u32(&K[32]);
780
199k
    TMP0A = vaddq_u32(MSG0A, TMP);
781
199k
    TMP0B = vaddq_u32(MSG0B, TMP);
782
199k
    TMP2A = STATE0A;
783
199k
    TMP2B = STATE0B;
784
199k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
785
199k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
786
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
787
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
788
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
789
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
790
199k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
791
199k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
792
793
    // Transform 3: Rounds 37-40
794
199k
    TMP = vld1q_u32(&K[36]);
795
199k
    TMP0A = vaddq_u32(MSG1A, TMP);
796
199k
    TMP0B = vaddq_u32(MSG1B, TMP);
797
199k
    TMP2A = STATE0A;
798
199k
    TMP2B = STATE0B;
799
199k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
800
199k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
801
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
802
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
803
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
804
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
805
199k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
806
199k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
807
808
    // Transform 3: Rounds 41-44
809
199k
    TMP = vld1q_u32(&K[40]);
810
199k
    TMP0A = vaddq_u32(MSG2A, TMP);
811
199k
    TMP0B = vaddq_u32(MSG2B, TMP);
812
199k
    TMP2A = STATE0A;
813
199k
    TMP2B = STATE0B;
814
199k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
815
199k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
816
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
817
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
818
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
819
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
820
199k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
821
199k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
822
823
    // Transform 3: Rounds 45-48
824
199k
    TMP = vld1q_u32(&K[44]);
825
199k
    TMP0A = vaddq_u32(MSG3A, TMP);
826
199k
    TMP0B = vaddq_u32(MSG3B, TMP);
827
199k
    TMP2A = STATE0A;
828
199k
    TMP2B = STATE0B;
829
199k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
830
199k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
831
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
832
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
833
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
834
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
835
199k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
836
199k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
837
838
    // Transform 3: Rounds 49-52
839
199k
    TMP = vld1q_u32(&K[48]);
840
199k
    TMP0A = vaddq_u32(MSG0A, TMP);
841
199k
    TMP0B = vaddq_u32(MSG0B, TMP);
842
199k
    TMP2A = STATE0A;
843
199k
    TMP2B = STATE0B;
844
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
845
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
846
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
847
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
848
849
    // Transform 3: Rounds 53-56
850
199k
    TMP = vld1q_u32(&K[52]);
851
199k
    TMP0A = vaddq_u32(MSG1A, TMP);
852
199k
    TMP0B = vaddq_u32(MSG1B, TMP);
853
199k
    TMP2A = STATE0A;
854
199k
    TMP2B = STATE0B;
855
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
856
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
857
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
858
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
859
860
    // Transform 3: Rounds 57-60
861
199k
    TMP = vld1q_u32(&K[56]);
862
199k
    TMP0A = vaddq_u32(MSG2A, TMP);
863
199k
    TMP0B = vaddq_u32(MSG2B, TMP);
864
199k
    TMP2A = STATE0A;
865
199k
    TMP2B = STATE0B;
866
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
867
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
868
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
869
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
870
871
    // Transform 3: Rounds 61-64
872
199k
    TMP = vld1q_u32(&K[60]);
873
199k
    TMP0A = vaddq_u32(MSG3A, TMP);
874
199k
    TMP0B = vaddq_u32(MSG3B, TMP);
875
199k
    TMP2A = STATE0A;
876
199k
    TMP2B = STATE0B;
877
199k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
878
199k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
879
199k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
880
199k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
881
882
    // Transform 3: Update state
883
199k
    TMP = vld1q_u32(&INIT[0]);
884
199k
    STATE0A = vaddq_u32(STATE0A, TMP);
885
199k
    STATE0B = vaddq_u32(STATE0B, TMP);
886
199k
    TMP = vld1q_u32(&INIT[4]);
887
199k
    STATE1A = vaddq_u32(STATE1A, TMP);
888
199k
    STATE1B = vaddq_u32(STATE1B, TMP);
889
890
    // Store result
891
199k
    vst1q_u8(output, vrev32q_u8(vreinterpretq_u8_u32(STATE0A)));
892
199k
    vst1q_u8(output + 16, vrev32q_u8(vreinterpretq_u8_u32(STATE1A)));
893
199k
    vst1q_u8(output + 32, vrev32q_u8(vreinterpretq_u8_u32(STATE0B)));
894
199k
    vst1q_u8(output + 48, vrev32q_u8(vreinterpretq_u8_u32(STATE1B)));
895
199k
}
896
}
897
898
#endif