Coverage Report

Created: 2026-04-29 19:21

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/tmp/bitcoin/src/crypto/sha256_arm_shani.cpp
Line
Count
Source
1
// Copyright (c) 2022-present The Bitcoin Core developers
2
// Distributed under the MIT software license, see the accompanying
3
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4
//
5
// Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-arm.c,
6
// Written and placed in public domain by Jeffrey Walton.
7
// Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and
8
// Barry O'Rourke for the mbedTLS project.
9
// Variant specialized for 64-byte inputs added by Pieter Wuille.
10
11
#ifdef ENABLE_ARM_SHANI
12
13
#include <array>
14
#include <cstdint>
15
#include <cstddef>
16
#include <arm_neon.h>
17
18
// File-local (anonymous namespace) table of the 64 SHA-256 round constants.
// The code in this file reads it four words at a time via vld1q_u32(&K[i])
// with i a multiple of 4; the array is aligned to alignof(uint32x4_t).
namespace {
19
alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> K =
20
{
21
    0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
22
    0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
23
    0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
24
    0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
25
    0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
26
    0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
27
    0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
28
    0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
29
    0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
30
    0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
31
    0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
32
    0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
33
    0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
34
    0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
35
    0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
36
    0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
37
};
38
}
39
40
namespace sha256_arm_shani {
41
// Run the SHA-256 compression function over consecutive 64-byte blocks using
// the ARMv8 SHA-2 intrinsics (vsha256hq/vsha256h2q/vsha256su0q/vsha256su1q).
//
// s      - eight 32-bit state words; s[0..3] and s[4..7] are loaded on entry
//          and overwritten with the updated state before returning.
// chunk  - input bytes; 64 bytes are read per block (64 * blocks in total).
// blocks - number of 64-byte blocks to process; with 0 the loop body never
//          runs and the loaded state is stored back unchanged.
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
42
52.1M
{
43
52.1M
    uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
44
52.1M
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
45
52.1M
    uint32x4_t TMP0, TMP2;
46
47
    // Load state
48
52.1M
    STATE0 = vld1q_u32(&s[0]);
49
52.1M
    STATE1 = vld1q_u32(&s[4]);
50
51
224M
    // One iteration per 64-byte block; the state stays in STATE0/STATE1 across iterations.
    while (blocks--)
52
172M
    {
53
        // Save state
54
172M
        ABEF_SAVE = STATE0;
55
172M
        CDGH_SAVE = STATE1;
56
57
        // Load and convert input chunk to Big Endian
        // (vrev32q_u8 reverses the byte order inside each 32-bit lane)
58
172M
        MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 0)));
59
172M
        MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 16)));
60
172M
        MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 32)));
61
172M
        MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 48)));
62
172M
        chunk += 64;
63
64
        // Original implementation preloaded message and constant addition which was 1-3% slower.
65
        // Now included as first step in quad round code saving one Q Neon register
66
        // "TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));"
67
68
        // Rounds 1-4
        // Pattern used by every group of four rounds below:
        //   TMP0 = message words + round constants,
        //   TMP2 = copy of STATE0 taken before vsha256hq_u32 overwrites it
        //          (vsha256h2q_u32 is given that earlier value),
        //   vsha256su0q_u32/vsha256su1q_u32 update the MSGn register just
        //   consumed so it can be used again four groups later.
69
172M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));
70
172M
        TMP2 = STATE0;
71
172M
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
72
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
73
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
74
172M
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
75
76
        // Rounds 5-8
77
172M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[4]));
78
172M
        TMP2 = STATE0;
79
172M
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
80
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
81
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
82
172M
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
83
84
        // Rounds 9-12
85
172M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[8]));
86
172M
        TMP2 = STATE0;
87
172M
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
88
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
89
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
90
172M
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
91
92
        // Rounds 13-16
93
172M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[12]));
94
172M
        TMP2 = STATE0;
95
172M
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
96
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
97
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
98
172M
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
99
100
        // Rounds 17-20
101
172M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[16]));
102
172M
        TMP2 = STATE0;
103
172M
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
104
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
105
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
106
172M
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
107
108
        // Rounds 21-24
109
172M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[20]));
110
172M
        TMP2 = STATE0;
111
172M
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
112
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
113
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
114
172M
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
115
116
        // Rounds 25-28
117
172M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[24]));
118
172M
        TMP2 = STATE0;
119
172M
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
120
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
121
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
122
172M
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
123
124
        // Rounds 29-32
125
172M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[28]));
126
172M
        TMP2 = STATE0;
127
172M
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
128
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
129
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
130
172M
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
131
132
        // Rounds 33-36
133
172M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[32]));
134
172M
        TMP2 = STATE0;
135
172M
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
136
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
137
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
138
172M
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
139
140
        // Rounds 37-40
141
172M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[36]));
142
172M
        TMP2 = STATE0;
143
172M
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
144
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
145
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
146
172M
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
147
148
        // Rounds 41-44
149
172M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[40]));
150
172M
        TMP2 = STATE0;
151
172M
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
152
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
153
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
154
172M
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
155
156
        // Rounds 45-48
157
172M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[44]));
158
172M
        TMP2 = STATE0;
159
172M
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
160
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
161
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
162
172M
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
163
164
        // Rounds 49-52
        // From here on there are no vsha256su0q/vsha256su1q calls: the last
        // four groups only consume MSG0..MSG3 as computed above.
165
172M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[48]));
166
172M
        TMP2 = STATE0;
167
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
168
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
169
170
        // Rounds 53-56
171
172M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[52]));
172
172M
        TMP2 = STATE0;
173
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
174
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
175
176
        // Rounds 57-60
177
172M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[56]));
178
172M
        TMP2 = STATE0;
179
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
180
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
181
182
        // Rounds 61-64
183
172M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[60]));
184
172M
        TMP2 = STATE0;
185
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
186
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
187
188
        // Update state
        // (add the values saved at the top of this iteration to the round output)
189
172M
        STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
190
172M
        STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
191
172M
    }
192
193
    // Save final state
194
52.1M
    vst1q_u32(&s[0], STATE0);
195
52.1M
    vst1q_u32(&s[4], STATE1);
196
52.1M
}
197
}
198
199
namespace sha256d64_arm_shani {
200
void Transform_2way(unsigned char* output, const unsigned char* input)
201
173k
{
202
    /* Initial state. */
203
173k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> INIT = {
204
173k
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
205
173k
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
206
173k
    };
207
208
    /* Precomputed message schedule for the 2nd transform. */
209
173k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> MIDS = {
210
173k
        0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
211
173k
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
212
173k
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
213
173k
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374,
214
173k
        0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254,
215
173k
        0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa,
216
173k
        0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7,
217
173k
        0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0,
218
173k
        0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd,
219
173k
        0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16,
220
173k
        0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537,
221
173k
        0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37,
222
173k
        0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7,
223
173k
        0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890,
224
173k
        0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c,
225
173k
        0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76
226
173k
    };
227
228
    /* A few precomputed message schedule values for the 3rd transform. */
229
173k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 12> FINS = {
230
173k
        0x5807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
231
173k
        0x80000000, 0x00000000, 0x00000000, 0x00000000,
232
173k
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf274
233
173k
    };
234
235
    /* Padding processed in the 3rd transform (byteswapped). */
236
173k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> FINAL = {0x80000000, 0, 0, 0, 0, 0, 0, 0x100};
237
238
173k
    uint32x4_t STATE0A, STATE0B, STATE1A, STATE1B, ABEF_SAVEA, ABEF_SAVEB, CDGH_SAVEA, CDGH_SAVEB;
239
173k
    uint32x4_t MSG0A, MSG0B, MSG1A, MSG1B, MSG2A, MSG2B, MSG3A, MSG3B;
240
173k
    uint32x4_t TMP0A, TMP0B, TMP2A, TMP2B, TMP;
241
242
    // Transform 1: Load state
243
173k
    STATE0A = vld1q_u32(&INIT[0]);
244
173k
    STATE0B = STATE0A;
245
173k
    STATE1A = vld1q_u32(&INIT[4]);
246
173k
    STATE1B = STATE1A;
247
248
    // Transform 1: Load and convert input chunk to Big Endian
249
173k
    MSG0A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 0)));
250
173k
    MSG1A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16)));
251
173k
    MSG2A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32)));
252
173k
    MSG3A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48)));
253
173k
    MSG0B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 64)));
254
173k
    MSG1B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 80)));
255
173k
    MSG2B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 96)));
256
173k
    MSG3B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 112)));
257
258
    // Transform 1: Rounds 1-4
259
173k
    TMP = vld1q_u32(&K[0]);
260
173k
    TMP0A = vaddq_u32(MSG0A, TMP);
261
173k
    TMP0B = vaddq_u32(MSG0B, TMP);
262
173k
    TMP2A = STATE0A;
263
173k
    TMP2B = STATE0B;
264
173k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
265
173k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
266
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
267
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
268
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
269
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
270
173k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
271
173k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
272
273
    // Transform 1: Rounds 5-8
274
173k
    TMP = vld1q_u32(&K[4]);
275
173k
    TMP0A = vaddq_u32(MSG1A, TMP);
276
173k
    TMP0B = vaddq_u32(MSG1B, TMP);
277
173k
    TMP2A = STATE0A;
278
173k
    TMP2B = STATE0B;
279
173k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
280
173k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
281
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
282
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
283
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
284
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
285
173k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
286
173k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
287
288
    // Transform 1: Rounds 9-12
289
173k
    TMP = vld1q_u32(&K[8]);
290
173k
    TMP0A = vaddq_u32(MSG2A, TMP);
291
173k
    TMP0B = vaddq_u32(MSG2B, TMP);
292
173k
    TMP2A = STATE0A;
293
173k
    TMP2B = STATE0B;
294
173k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
295
173k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
296
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
297
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
298
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
299
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
300
173k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
301
173k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
302
303
    // Transform 1: Rounds 13-16
304
173k
    TMP = vld1q_u32(&K[12]);
305
173k
    TMP0A = vaddq_u32(MSG3A, TMP);
306
173k
    TMP0B = vaddq_u32(MSG3B, TMP);
307
173k
    TMP2A = STATE0A;
308
173k
    TMP2B = STATE0B;
309
173k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
310
173k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
311
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
312
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
313
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
314
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
315
173k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
316
173k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
317
318
    // Transform 1: Rounds 17-20
319
173k
    TMP = vld1q_u32(&K[16]);
320
173k
    TMP0A = vaddq_u32(MSG0A, TMP);
321
173k
    TMP0B = vaddq_u32(MSG0B, TMP);
322
173k
    TMP2A = STATE0A;
323
173k
    TMP2B = STATE0B;
324
173k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
325
173k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
326
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
327
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
328
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
329
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
330
173k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
331
173k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
332
333
    // Transform 1: Rounds 21-24
334
173k
    TMP = vld1q_u32(&K[20]);
335
173k
    TMP0A = vaddq_u32(MSG1A, TMP);
336
173k
    TMP0B = vaddq_u32(MSG1B, TMP);
337
173k
    TMP2A = STATE0A;
338
173k
    TMP2B = STATE0B;
339
173k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
340
173k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
341
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
342
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
343
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
344
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
345
173k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
346
173k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
347
348
    // Transform 1: Rounds 25-28
349
173k
    TMP = vld1q_u32(&K[24]);
350
173k
    TMP0A = vaddq_u32(MSG2A, TMP);
351
173k
    TMP0B = vaddq_u32(MSG2B, TMP);
352
173k
    TMP2A = STATE0A;
353
173k
    TMP2B = STATE0B;
354
173k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
355
173k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
356
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
357
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
358
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
359
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
360
173k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
361
173k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
362
363
    // Transform 1: Rounds 29-32
364
173k
    TMP = vld1q_u32(&K[28]);
365
173k
    TMP0A = vaddq_u32(MSG3A, TMP);
366
173k
    TMP0B = vaddq_u32(MSG3B, TMP);
367
173k
    TMP2A = STATE0A;
368
173k
    TMP2B = STATE0B;
369
173k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
370
173k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
371
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
372
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
373
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
374
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
375
173k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
376
173k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
377
378
    // Transform 1: Rounds 33-36
379
173k
    TMP = vld1q_u32(&K[32]);
380
173k
    TMP0A = vaddq_u32(MSG0A, TMP);
381
173k
    TMP0B = vaddq_u32(MSG0B, TMP);
382
173k
    TMP2A = STATE0A;
383
173k
    TMP2B = STATE0B;
384
173k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
385
173k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
386
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
387
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
388
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
389
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
390
173k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
391
173k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
392
393
    // Transform 1: Rounds 37-40
394
173k
    TMP = vld1q_u32(&K[36]);
395
173k
    TMP0A = vaddq_u32(MSG1A, TMP);
396
173k
    TMP0B = vaddq_u32(MSG1B, TMP);
397
173k
    TMP2A = STATE0A;
398
173k
    TMP2B = STATE0B;
399
173k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
400
173k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
401
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
402
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
403
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
404
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
405
173k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
406
173k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
407
408
    // Transform 1: Rounds 41-44
409
173k
    TMP = vld1q_u32(&K[40]);
410
173k
    TMP0A = vaddq_u32(MSG2A, TMP);
411
173k
    TMP0B = vaddq_u32(MSG2B, TMP);
412
173k
    TMP2A = STATE0A;
413
173k
    TMP2B = STATE0B;
414
173k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
415
173k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
416
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
417
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
418
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
419
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
420
173k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
421
173k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
422
423
    // Transform 1: Rounds 45-48
424
173k
    TMP = vld1q_u32(&K[44]);
425
173k
    TMP0A = vaddq_u32(MSG3A, TMP);
426
173k
    TMP0B = vaddq_u32(MSG3B, TMP);
427
173k
    TMP2A = STATE0A;
428
173k
    TMP2B = STATE0B;
429
173k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
430
173k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
431
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
432
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
433
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
434
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
435
173k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
436
173k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
437
438
    // Transform 1: Rounds 49-52
439
173k
    TMP = vld1q_u32(&K[48]);
440
173k
    TMP0A = vaddq_u32(MSG0A, TMP);
441
173k
    TMP0B = vaddq_u32(MSG0B, TMP);
442
173k
    TMP2A = STATE0A;
443
173k
    TMP2B = STATE0B;
444
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
445
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
446
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
447
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
448
449
    // Transform 1: Rounds 53-56
450
173k
    TMP = vld1q_u32(&K[52]);
451
173k
    TMP0A = vaddq_u32(MSG1A, TMP);
452
173k
    TMP0B = vaddq_u32(MSG1B, TMP);
453
173k
    TMP2A = STATE0A;
454
173k
    TMP2B = STATE0B;
455
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
456
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
457
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
458
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
459
460
    // Transform 1: Rounds 57-60
461
173k
    TMP = vld1q_u32(&K[56]);
462
173k
    TMP0A = vaddq_u32(MSG2A, TMP);
463
173k
    TMP0B = vaddq_u32(MSG2B, TMP);
464
173k
    TMP2A = STATE0A;
465
173k
    TMP2B = STATE0B;
466
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
467
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
468
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
469
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
470
471
    // Transform 1: Rounds 61-64
472
173k
    TMP = vld1q_u32(&K[60]);
473
173k
    TMP0A = vaddq_u32(MSG3A, TMP);
474
173k
    TMP0B = vaddq_u32(MSG3B, TMP);
475
173k
    TMP2A = STATE0A;
476
173k
    TMP2B = STATE0B;
477
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
478
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
479
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
480
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
481
482
    // Transform 1: Update state
483
173k
    TMP = vld1q_u32(&INIT[0]);
484
173k
    STATE0A = vaddq_u32(STATE0A, TMP);
485
173k
    STATE0B = vaddq_u32(STATE0B, TMP);
486
173k
    TMP = vld1q_u32(&INIT[4]);
487
173k
    STATE1A = vaddq_u32(STATE1A, TMP);
488
173k
    STATE1B = vaddq_u32(STATE1B, TMP);
489
490
    // Transform 2: Save state
491
173k
    ABEF_SAVEA = STATE0A;
492
173k
    ABEF_SAVEB = STATE0B;
493
173k
    CDGH_SAVEA = STATE1A;
494
173k
    CDGH_SAVEB = STATE1B;
495
496
    // Transform 2: Rounds 1-4
497
173k
    TMP = vld1q_u32(&MIDS[0]);
498
173k
    TMP2A = STATE0A;
499
173k
    TMP2B = STATE0B;
500
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
501
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
502
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
503
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
504
505
    // Transform 2: Rounds 5-8
506
173k
    TMP = vld1q_u32(&MIDS[4]);
507
173k
    TMP2A = STATE0A;
508
173k
    TMP2B = STATE0B;
509
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
510
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
511
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
512
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
513
514
    // Transform 2: Rounds 9-12
515
173k
    TMP = vld1q_u32(&MIDS[8]);
516
173k
    TMP2A = STATE0A;
517
173k
    TMP2B = STATE0B;
518
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
519
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
520
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
521
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
522
523
    // Transform 2: Rounds 13-16
524
173k
    TMP = vld1q_u32(&MIDS[12]);
525
173k
    TMP2A = STATE0A;
526
173k
    TMP2B = STATE0B;
527
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
528
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
529
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
530
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
531
532
    // Transform 2: Rounds 17-20
533
173k
    TMP = vld1q_u32(&MIDS[16]);
534
173k
    TMP2A = STATE0A;
535
173k
    TMP2B = STATE0B;
536
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
537
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
538
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
539
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
540
541
    // Transform 2: Rounds 21-24
542
173k
    TMP = vld1q_u32(&MIDS[20]);
543
173k
    TMP2A = STATE0A;
544
173k
    TMP2B = STATE0B;
545
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
546
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
547
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
548
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
549
550
    // Transform 2: Rounds 25-28
551
173k
    TMP = vld1q_u32(&MIDS[24]);
552
173k
    TMP2A = STATE0A;
553
173k
    TMP2B = STATE0B;
554
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
555
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
556
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
557
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
558
559
    // Transform 2: Rounds 29-32
560
173k
    TMP = vld1q_u32(&MIDS[28]);
561
173k
    TMP2A = STATE0A;
562
173k
    TMP2B = STATE0B;
563
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
564
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
565
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
566
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
567
568
    // Transform 2: Rounds 33-36
569
173k
    TMP = vld1q_u32(&MIDS[32]);
570
173k
    TMP2A = STATE0A;
571
173k
    TMP2B = STATE0B;
572
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
573
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
574
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
575
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
576
577
    // Transform 2: Rounds 37-40
578
173k
    TMP = vld1q_u32(&MIDS[36]);
579
173k
    TMP2A = STATE0A;
580
173k
    TMP2B = STATE0B;
581
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
582
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
583
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
584
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
585
586
    // Transform 2: Rounds 41-44
587
173k
    TMP = vld1q_u32(&MIDS[40]);
588
173k
    TMP2A = STATE0A;
589
173k
    TMP2B = STATE0B;
590
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
591
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
592
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
593
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
594
595
    // Transform 2: Rounds 45-48
596
173k
    TMP = vld1q_u32(&MIDS[44]);
597
173k
    TMP2A = STATE0A;
598
173k
    TMP2B = STATE0B;
599
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
600
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
601
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
602
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
603
604
    // Transform 2: Rounds 49-52
605
173k
    TMP = vld1q_u32(&MIDS[48]);
606
173k
    TMP2A = STATE0A;
607
173k
    TMP2B = STATE0B;
608
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
609
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
610
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
611
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
612
613
    // Transform 2: Rounds 53-56
614
173k
    TMP = vld1q_u32(&MIDS[52]);
615
173k
    TMP2A = STATE0A;
616
173k
    TMP2B = STATE0B;
617
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
618
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
619
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
620
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
621
622
    // Transform 2: Rounds 57-60
623
173k
    TMP = vld1q_u32(&MIDS[56]);
624
173k
    TMP2A = STATE0A;
625
173k
    TMP2B = STATE0B;
626
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
627
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
628
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
629
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
630
631
    // Transform 2: Rounds 61-64
632
173k
    TMP = vld1q_u32(&MIDS[60]);
633
173k
    TMP2A = STATE0A;
634
173k
    TMP2B = STATE0B;
635
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
636
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
637
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
638
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
639
640
    // Transform 2: Update state
641
173k
    STATE0A = vaddq_u32(STATE0A, ABEF_SAVEA);
642
173k
    STATE0B = vaddq_u32(STATE0B, ABEF_SAVEB);
643
173k
    STATE1A = vaddq_u32(STATE1A, CDGH_SAVEA);
644
173k
    STATE1B = vaddq_u32(STATE1B, CDGH_SAVEB);
645
646
    // Transform 3: Pad previous output
647
173k
    MSG0A = STATE0A;
648
173k
    MSG0B = STATE0B;
649
173k
    MSG1A = STATE1A;
650
173k
    MSG1B = STATE1B;
651
173k
    MSG2A = vld1q_u32(&FINAL[0]);
652
173k
    MSG2B = MSG2A;
653
173k
    MSG3A = vld1q_u32(&FINAL[4]);
654
173k
    MSG3B = MSG3A;
655
656
    // Transform 3: Load state
657
173k
    STATE0A = vld1q_u32(&INIT[0]);
658
173k
    STATE0B = STATE0A;
659
173k
    STATE1A = vld1q_u32(&INIT[4]);
660
173k
    STATE1B = STATE1A;
661
662
    // Transform 3: Rounds 1-4
663
173k
    TMP = vld1q_u32(&K[0]);
664
173k
    TMP0A = vaddq_u32(MSG0A, TMP);
665
173k
    TMP0B = vaddq_u32(MSG0B, TMP);
666
173k
    TMP2A = STATE0A;
667
173k
    TMP2B = STATE0B;
668
173k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
669
173k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
670
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
671
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
672
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
673
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
674
173k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
675
173k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
676
677
    // Transform 3: Rounds 5-8
678
173k
    TMP = vld1q_u32(&K[4]);
679
173k
    TMP0A = vaddq_u32(MSG1A, TMP);
680
173k
    TMP0B = vaddq_u32(MSG1B, TMP);
681
173k
    TMP2A = STATE0A;
682
173k
    TMP2B = STATE0B;
683
173k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
684
173k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
685
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
686
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
687
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
688
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
689
173k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
690
173k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
691
692
    // Transform 3: Rounds 9-12
693
173k
    TMP = vld1q_u32(&FINS[0]);
694
173k
    TMP2A = STATE0A;
695
173k
    TMP2B = STATE0B;
696
173k
    MSG2A = vld1q_u32(&FINS[4]);
697
173k
    MSG2B = MSG2A;
698
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
699
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
700
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
701
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
702
173k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
703
173k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
704
705
    // Transform 3: Rounds 13-16
706
173k
    TMP = vld1q_u32(&FINS[8]);
707
173k
    TMP2A = STATE0A;
708
173k
    TMP2B = STATE0B;
709
173k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
710
173k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
711
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
712
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
713
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
714
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
715
173k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
716
173k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
717
718
    // Transform 3: Rounds 17-20
719
173k
    TMP = vld1q_u32(&K[16]);
720
173k
    TMP0A = vaddq_u32(MSG0A, TMP);
721
173k
    TMP0B = vaddq_u32(MSG0B, TMP);
722
173k
    TMP2A = STATE0A;
723
173k
    TMP2B = STATE0B;
724
173k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
725
173k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
726
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
727
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
728
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
729
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
730
173k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
731
173k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
732
733
    // Transform 3: Rounds 21-24
734
173k
    TMP = vld1q_u32(&K[20]);
735
173k
    TMP0A = vaddq_u32(MSG1A, TMP);
736
173k
    TMP0B = vaddq_u32(MSG1B, TMP);
737
173k
    TMP2A = STATE0A;
738
173k
    TMP2B = STATE0B;
739
173k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
740
173k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
741
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
742
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
743
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
744
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
745
173k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
746
173k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
747
748
    // Transform 3: Rounds 25-28
749
173k
    TMP = vld1q_u32(&K[24]);
750
173k
    TMP0A = vaddq_u32(MSG2A, TMP);
751
173k
    TMP0B = vaddq_u32(MSG2B, TMP);
752
173k
    TMP2A = STATE0A;
753
173k
    TMP2B = STATE0B;
754
173k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
755
173k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
756
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
757
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
758
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
759
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
760
173k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
761
173k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
762
763
    // Transform 3: Rounds 29-32
764
173k
    TMP = vld1q_u32(&K[28]);
765
173k
    TMP0A = vaddq_u32(MSG3A, TMP);
766
173k
    TMP0B = vaddq_u32(MSG3B, TMP);
767
173k
    TMP2A = STATE0A;
768
173k
    TMP2B = STATE0B;
769
173k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
770
173k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
771
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
772
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
773
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
774
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
775
173k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
776
173k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
777
778
    // Transform 3: Rounds 33-36
779
173k
    TMP = vld1q_u32(&K[32]);
780
173k
    TMP0A = vaddq_u32(MSG0A, TMP);
781
173k
    TMP0B = vaddq_u32(MSG0B, TMP);
782
173k
    TMP2A = STATE0A;
783
173k
    TMP2B = STATE0B;
784
173k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
785
173k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
786
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
787
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
788
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
789
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
790
173k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
791
173k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
792
793
    // Transform 3: Rounds 37-40
794
173k
    TMP = vld1q_u32(&K[36]);
795
173k
    TMP0A = vaddq_u32(MSG1A, TMP);
796
173k
    TMP0B = vaddq_u32(MSG1B, TMP);
797
173k
    TMP2A = STATE0A;
798
173k
    TMP2B = STATE0B;
799
173k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
800
173k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
801
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
802
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
803
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
804
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
805
173k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
806
173k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
807
808
    // Transform 3: Rounds 41-44
809
173k
    TMP = vld1q_u32(&K[40]);
810
173k
    TMP0A = vaddq_u32(MSG2A, TMP);
811
173k
    TMP0B = vaddq_u32(MSG2B, TMP);
812
173k
    TMP2A = STATE0A;
813
173k
    TMP2B = STATE0B;
814
173k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
815
173k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
816
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
817
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
818
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
819
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
820
173k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
821
173k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
822
823
    // Transform 3: Rounds 45-48
824
173k
    TMP = vld1q_u32(&K[44]);
825
173k
    TMP0A = vaddq_u32(MSG3A, TMP);
826
173k
    TMP0B = vaddq_u32(MSG3B, TMP);
827
173k
    TMP2A = STATE0A;
828
173k
    TMP2B = STATE0B;
829
173k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
830
173k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
831
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
832
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
833
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
834
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
835
173k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
836
173k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
837
838
    // Transform 3: Rounds 49-52
839
173k
    TMP = vld1q_u32(&K[48]);
840
173k
    TMP0A = vaddq_u32(MSG0A, TMP);
841
173k
    TMP0B = vaddq_u32(MSG0B, TMP);
842
173k
    TMP2A = STATE0A;
843
173k
    TMP2B = STATE0B;
844
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
845
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
846
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
847
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
848
849
    // Transform 3: Rounds 53-56
850
173k
    TMP = vld1q_u32(&K[52]);
851
173k
    TMP0A = vaddq_u32(MSG1A, TMP);
852
173k
    TMP0B = vaddq_u32(MSG1B, TMP);
853
173k
    TMP2A = STATE0A;
854
173k
    TMP2B = STATE0B;
855
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
856
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
857
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
858
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
859
860
    // Transform 3: Rounds 57-60
861
173k
    TMP = vld1q_u32(&K[56]);
862
173k
    TMP0A = vaddq_u32(MSG2A, TMP);
863
173k
    TMP0B = vaddq_u32(MSG2B, TMP);
864
173k
    TMP2A = STATE0A;
865
173k
    TMP2B = STATE0B;
866
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
867
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
868
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
869
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
870
871
    // Transform 3: Rounds 61-64
872
173k
    TMP = vld1q_u32(&K[60]);
873
173k
    TMP0A = vaddq_u32(MSG3A, TMP);
874
173k
    TMP0B = vaddq_u32(MSG3B, TMP);
875
173k
    TMP2A = STATE0A;
876
173k
    TMP2B = STATE0B;
877
173k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
878
173k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
879
173k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
880
173k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
881
882
    // Transform 3: Update state
883
173k
    TMP = vld1q_u32(&INIT[0]);
884
173k
    STATE0A = vaddq_u32(STATE0A, TMP);
885
173k
    STATE0B = vaddq_u32(STATE0B, TMP);
886
173k
    TMP = vld1q_u32(&INIT[4]);
887
173k
    STATE1A = vaddq_u32(STATE1A, TMP);
888
173k
    STATE1B = vaddq_u32(STATE1B, TMP);
889
890
    // Store result
891
173k
    vst1q_u8(output, vrev32q_u8(vreinterpretq_u8_u32(STATE0A)));
892
173k
    vst1q_u8(output + 16, vrev32q_u8(vreinterpretq_u8_u32(STATE1A)));
893
173k
    vst1q_u8(output + 32, vrev32q_u8(vreinterpretq_u8_u32(STATE0B)));
894
    vst1q_u8(output + 48, vrev32q_u8(vreinterpretq_u8_u32(STATE1B)));
895
173k
}
896
}
897
898
#endif