Coverage Report

Created: 2026-06-17 15:31

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/tmp/bitcoin/src/crypto/sha256_arm_shani.cpp
Line
Count
Source
1
// Copyright (c) 2022-present The Bitcoin Core developers
2
// Distributed under the MIT software license, see the accompanying
3
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4
//
5
// Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-arm.c,
6
// Written and placed in public domain by Jeffrey Walton.
7
// Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and
8
// Barry O'Rourke for the mbedTLS project.
9
// Variant specialized for 64-byte inputs added by Pieter Wuille.
10
11
#ifdef ENABLE_ARM_SHANI
12
13
#include <array>
14
#include <cstdint>
15
#include <cstddef>
16
#include <arm_neon.h>
17
18
namespace {
19
// The 64 SHA-256 round constants, one 32-bit word per round.
// The code below reads them four at a time with vld1q_u32(&K[4 * i]) and adds
// them lane-wise to the four message-schedule words of each quad-round.
// The table is aligned like a uint32x4_t vector, matching those 4-lane loads.
alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> K =
20
{
21
    0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
22
    0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
23
    0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
24
    0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
25
    0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
26
    0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
27
    0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
28
    0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
29
    0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
30
    0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
31
    0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
32
    0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
33
    0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
34
    0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
35
    0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
36
    0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
37
};
38
}
39
40
namespace sha256_arm_shani {
41
// Run the SHA-256 compression function over consecutive 64-byte blocks using
// the ARM SHA-2 intrinsics, updating the hash state in place.
//
// s      - hash state; eight 32-bit words are read from s[0..7] on entry and
//          written back to s[0..7] on return.
// chunk  - input bytes; 64 bytes are read per block, so blocks * 64 bytes must
//          be readable.
// blocks - number of 64-byte blocks to process. With 0 the loop is skipped and
//          the state is stored back unchanged.
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
42
52.5M
{
43
52.5M
    uint32x4_t STATE0, STATE1, ABCD_SAVE, EFGH_SAVE;
44
52.5M
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
45
52.5M
    uint32x4_t TMP0, TMP2;
46
47
    // Load state
    // STATE0 takes s[0..3] and STATE1 takes s[4..7] (the ABCD / EFGH halves,
    // going by the *_SAVE names below).
48
52.5M
    STATE0 = vld1q_u32(&s[0]);
49
52.5M
    STATE1 = vld1q_u32(&s[4]);
50
51
225M
    while (blocks--)
52
172M
    {
53
        // Save state
        // Kept so the input state can be added back after the 64 rounds.
54
172M
        ABCD_SAVE = STATE0;
55
172M
        EFGH_SAVE = STATE1;
56
57
        // Load and convert input chunk to Big Endian
        // vrev32q_u8 reverses the byte order inside each 32-bit lane; each MSGn
        // then holds four consecutive message words of the current block.
58
172M
        MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 0)));
59
172M
        MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 16)));
60
172M
        MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 32)));
61
172M
        MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 48)));
62
172M
        chunk += 64;
63
64
        // Original implementation preloaded message and constant addition which was 1-3% slower.
65
        // Now included as first step in quad round code saving one Q Neon register
66
        // "TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));"
67
68
        // Rounds 1-4
        // Shape shared by every quad-round up to round 48:
        //   TMP0  = message words + round constants,
        //   TMP2  = copy of STATE0 taken before vsha256hq_u32 overwrites it,
        //           because vsha256h2q_u32 still needs that earlier value,
        //   su0/su1 replace the just-consumed MSGn with schedule words that
        //           are used four quad-rounds later.
69
172M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));
70
172M
        TMP2 = STATE0;
71
172M
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
72
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
73
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
74
172M
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
75
76
        // Rounds 5-8
77
172M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[4]));
78
172M
        TMP2 = STATE0;
79
172M
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
80
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
81
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
82
172M
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
83
84
        // Rounds 9-12
85
172M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[8]));
86
172M
        TMP2 = STATE0;
87
172M
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
88
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
89
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
90
172M
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
91
92
        // Rounds 13-16
93
172M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[12]));
94
172M
        TMP2 = STATE0;
95
172M
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
96
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
97
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
98
172M
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
99
100
        // Rounds 17-20
101
172M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[16]));
102
172M
        TMP2 = STATE0;
103
172M
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
104
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
105
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
106
172M
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
107
108
        // Rounds 21-24
109
172M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[20]));
110
172M
        TMP2 = STATE0;
111
172M
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
112
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
113
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
114
172M
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
115
116
        // Rounds 25-28
117
172M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[24]));
118
172M
        TMP2 = STATE0;
119
172M
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
120
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
121
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
122
172M
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
123
124
        // Rounds 29-32
125
172M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[28]));
126
172M
        TMP2 = STATE0;
127
172M
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
128
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
129
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
130
172M
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
131
132
        // Rounds 33-36
133
172M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[32]));
134
172M
        TMP2 = STATE0;
135
172M
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
136
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
137
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
138
172M
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
139
140
        // Rounds 37-40
141
172M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[36]));
142
172M
        TMP2 = STATE0;
143
172M
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
144
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
145
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
146
172M
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
147
148
        // Rounds 41-44
149
172M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[40]));
150
172M
        TMP2 = STATE0;
151
172M
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
152
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
153
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
154
172M
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
155
156
        // Rounds 45-48
157
172M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[44]));
158
172M
        TMP2 = STATE0;
159
172M
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
160
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
161
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
162
172M
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
163
164
        // Rounds 49-52
        // From here on MSG0..MSG3 already hold the last 16 schedule words, so
        // the remaining quad-rounds skip the su0/su1 schedule updates.
165
172M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[48]));
166
172M
        TMP2 = STATE0;
167
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
168
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
169
170
        // Rounds 53-56
171
172M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[52]));
172
172M
        TMP2 = STATE0;
173
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
174
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
175
176
        // Rounds 57-60
177
172M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[56]));
178
172M
        TMP2 = STATE0;
179
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
180
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
181
182
        // Rounds 61-64
183
172M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[60]));
184
172M
        TMP2 = STATE0;
185
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
186
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
187
188
        // Update state
        // Add back the state saved at the top of this iteration; the sum is
        // the input state for the next block.
189
172M
        STATE0 = vaddq_u32(STATE0, ABCD_SAVE);
190
172M
        STATE1 = vaddq_u32(STATE1, EFGH_SAVE);
191
172M
    }
192
193
    // Save final state
    // Written back in the same layout it was loaded from: s[0..3], then s[4..7].
194
52.5M
    vst1q_u32(&s[0], STATE0);
195
52.5M
    vst1q_u32(&s[4], STATE1);
196
52.5M
}
197
}
198
199
namespace sha256d64_arm_shani {
200
void Transform_2way(unsigned char* output, const unsigned char* input)
201
177k
{
202
    /* Initial state. */
203
177k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> INIT = {
204
177k
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
205
177k
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
206
177k
    };
207
208
    /* Precomputed message schedule for the 2nd transform. */
209
177k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> MIDS = {
210
177k
        0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
211
177k
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
212
177k
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
213
177k
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374,
214
177k
        0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254,
215
177k
        0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa,
216
177k
        0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7,
217
177k
        0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0,
218
177k
        0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd,
219
177k
        0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16,
220
177k
        0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537,
221
177k
        0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37,
222
177k
        0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7,
223
177k
        0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890,
224
177k
        0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c,
225
177k
        0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76
226
177k
    };
227
228
    /* A few precomputed message schedule values for the 3rd transform. */
229
177k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 12> FINS = {
230
177k
        0x5807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
231
177k
        0x80000000, 0x00000000, 0x00000000, 0x00000000,
232
177k
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf274
233
177k
    };
234
235
    /* Padding processed in the 3rd transform (byteswapped). */
236
177k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> FINAL = {0x80000000, 0, 0, 0, 0, 0, 0, 0x100};
237
238
177k
    uint32x4_t STATE0A, STATE0B, STATE1A, STATE1B, ABCD_SAVEA, ABCD_SAVEB, EFGH_SAVEA, EFGH_SAVEB;
239
177k
    uint32x4_t MSG0A, MSG0B, MSG1A, MSG1B, MSG2A, MSG2B, MSG3A, MSG3B;
240
177k
    uint32x4_t TMP0A, TMP0B, TMP2A, TMP2B, TMP;
241
242
    // Transform 1: Load state
243
177k
    STATE0A = vld1q_u32(&INIT[0]);
244
177k
    STATE0B = STATE0A;
245
177k
    STATE1A = vld1q_u32(&INIT[4]);
246
177k
    STATE1B = STATE1A;
247
248
    // Transform 1: Load and convert input chunk to Big Endian
249
177k
    MSG0A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 0)));
250
177k
    MSG1A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16)));
251
177k
    MSG2A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32)));
252
177k
    MSG3A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48)));
253
177k
    MSG0B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 64)));
254
177k
    MSG1B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 80)));
255
177k
    MSG2B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 96)));
256
177k
    MSG3B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 112)));
257
258
    // Transform 1: Rounds 1-4
259
177k
    TMP = vld1q_u32(&K[0]);
260
177k
    TMP0A = vaddq_u32(MSG0A, TMP);
261
177k
    TMP0B = vaddq_u32(MSG0B, TMP);
262
177k
    TMP2A = STATE0A;
263
177k
    TMP2B = STATE0B;
264
177k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
265
177k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
266
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
267
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
268
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
269
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
270
177k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
271
177k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
272
273
    // Transform 1: Rounds 5-8
274
177k
    TMP = vld1q_u32(&K[4]);
275
177k
    TMP0A = vaddq_u32(MSG1A, TMP);
276
177k
    TMP0B = vaddq_u32(MSG1B, TMP);
277
177k
    TMP2A = STATE0A;
278
177k
    TMP2B = STATE0B;
279
177k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
280
177k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
281
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
282
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
283
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
284
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
285
177k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
286
177k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
287
288
    // Transform 1: Rounds 9-12
289
177k
    TMP = vld1q_u32(&K[8]);
290
177k
    TMP0A = vaddq_u32(MSG2A, TMP);
291
177k
    TMP0B = vaddq_u32(MSG2B, TMP);
292
177k
    TMP2A = STATE0A;
293
177k
    TMP2B = STATE0B;
294
177k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
295
177k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
296
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
297
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
298
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
299
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
300
177k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
301
177k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
302
303
    // Transform 1: Rounds 13-16
304
177k
    TMP = vld1q_u32(&K[12]);
305
177k
    TMP0A = vaddq_u32(MSG3A, TMP);
306
177k
    TMP0B = vaddq_u32(MSG3B, TMP);
307
177k
    TMP2A = STATE0A;
308
177k
    TMP2B = STATE0B;
309
177k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
310
177k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
311
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
312
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
313
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
314
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
315
177k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
316
177k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
317
318
    // Transform 1: Rounds 17-20
319
177k
    TMP = vld1q_u32(&K[16]);
320
177k
    TMP0A = vaddq_u32(MSG0A, TMP);
321
177k
    TMP0B = vaddq_u32(MSG0B, TMP);
322
177k
    TMP2A = STATE0A;
323
177k
    TMP2B = STATE0B;
324
177k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
325
177k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
326
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
327
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
328
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
329
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
330
177k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
331
177k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
332
333
    // Transform 1: Rounds 21-24
334
177k
    TMP = vld1q_u32(&K[20]);
335
177k
    TMP0A = vaddq_u32(MSG1A, TMP);
336
177k
    TMP0B = vaddq_u32(MSG1B, TMP);
337
177k
    TMP2A = STATE0A;
338
177k
    TMP2B = STATE0B;
339
177k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
340
177k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
341
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
342
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
343
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
344
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
345
177k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
346
177k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
347
348
    // Transform 1: Rounds 25-28
349
177k
    TMP = vld1q_u32(&K[24]);
350
177k
    TMP0A = vaddq_u32(MSG2A, TMP);
351
177k
    TMP0B = vaddq_u32(MSG2B, TMP);
352
177k
    TMP2A = STATE0A;
353
177k
    TMP2B = STATE0B;
354
177k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
355
177k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
356
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
357
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
358
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
359
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
360
177k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
361
177k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
362
363
    // Transform 1: Rounds 29-32
364
177k
    TMP = vld1q_u32(&K[28]);
365
177k
    TMP0A = vaddq_u32(MSG3A, TMP);
366
177k
    TMP0B = vaddq_u32(MSG3B, TMP);
367
177k
    TMP2A = STATE0A;
368
177k
    TMP2B = STATE0B;
369
177k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
370
177k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
371
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
372
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
373
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
374
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
375
177k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
376
177k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
377
378
    // Transform 1: Rounds 33-36
379
177k
    TMP = vld1q_u32(&K[32]);
380
177k
    TMP0A = vaddq_u32(MSG0A, TMP);
381
177k
    TMP0B = vaddq_u32(MSG0B, TMP);
382
177k
    TMP2A = STATE0A;
383
177k
    TMP2B = STATE0B;
384
177k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
385
177k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
386
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
387
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
388
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
389
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
390
177k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
391
177k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
392
393
    // Transform 1: Rounds 37-40
394
177k
    TMP = vld1q_u32(&K[36]);
395
177k
    TMP0A = vaddq_u32(MSG1A, TMP);
396
177k
    TMP0B = vaddq_u32(MSG1B, TMP);
397
177k
    TMP2A = STATE0A;
398
177k
    TMP2B = STATE0B;
399
177k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
400
177k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
401
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
402
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
403
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
404
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
405
177k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
406
177k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
407
408
    // Transform 1: Rounds 41-44
409
177k
    TMP = vld1q_u32(&K[40]);
410
177k
    TMP0A = vaddq_u32(MSG2A, TMP);
411
177k
    TMP0B = vaddq_u32(MSG2B, TMP);
412
177k
    TMP2A = STATE0A;
413
177k
    TMP2B = STATE0B;
414
177k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
415
177k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
416
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
417
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
418
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
419
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
420
177k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
421
177k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
422
423
    // Transform 1: Rounds 45-48
424
177k
    TMP = vld1q_u32(&K[44]);
425
177k
    TMP0A = vaddq_u32(MSG3A, TMP);
426
177k
    TMP0B = vaddq_u32(MSG3B, TMP);
427
177k
    TMP2A = STATE0A;
428
177k
    TMP2B = STATE0B;
429
177k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
430
177k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
431
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
432
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
433
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
434
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
435
177k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
436
177k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
437
438
    // Transform 1: Rounds 49-52
439
177k
    TMP = vld1q_u32(&K[48]);
440
177k
    TMP0A = vaddq_u32(MSG0A, TMP);
441
177k
    TMP0B = vaddq_u32(MSG0B, TMP);
442
177k
    TMP2A = STATE0A;
443
177k
    TMP2B = STATE0B;
444
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
445
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
446
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
447
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
448
449
    // Transform 1: Rounds 53-56
450
177k
    TMP = vld1q_u32(&K[52]);
451
177k
    TMP0A = vaddq_u32(MSG1A, TMP);
452
177k
    TMP0B = vaddq_u32(MSG1B, TMP);
453
177k
    TMP2A = STATE0A;
454
177k
    TMP2B = STATE0B;
455
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
456
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
457
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
458
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
459
460
    // Transform 1: Rounds 57-60
461
177k
    TMP = vld1q_u32(&K[56]);
462
177k
    TMP0A = vaddq_u32(MSG2A, TMP);
463
177k
    TMP0B = vaddq_u32(MSG2B, TMP);
464
177k
    TMP2A = STATE0A;
465
177k
    TMP2B = STATE0B;
466
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
467
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
468
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
469
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
470
471
    // Transform 1: Rounds 61-64
472
177k
    TMP = vld1q_u32(&K[60]);
473
177k
    TMP0A = vaddq_u32(MSG3A, TMP);
474
177k
    TMP0B = vaddq_u32(MSG3B, TMP);
475
177k
    TMP2A = STATE0A;
476
177k
    TMP2B = STATE0B;
477
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
478
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
479
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
480
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
481
482
    // Transform 1: Update state
483
177k
    TMP = vld1q_u32(&INIT[0]);
484
177k
    STATE0A = vaddq_u32(STATE0A, TMP);
485
177k
    STATE0B = vaddq_u32(STATE0B, TMP);
486
177k
    TMP = vld1q_u32(&INIT[4]);
487
177k
    STATE1A = vaddq_u32(STATE1A, TMP);
488
177k
    STATE1B = vaddq_u32(STATE1B, TMP);
489
490
    // Transform 2: Save state
491
177k
    ABCD_SAVEA = STATE0A;
492
177k
    ABCD_SAVEB = STATE0B;
493
177k
    EFGH_SAVEA = STATE1A;
494
177k
    EFGH_SAVEB = STATE1B;
495
496
    // Transform 2: Rounds 1-4
497
177k
    TMP = vld1q_u32(&MIDS[0]);
498
177k
    TMP2A = STATE0A;
499
177k
    TMP2B = STATE0B;
500
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
501
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
502
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
503
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
504
505
    // Transform 2: Rounds 5-8
506
177k
    TMP = vld1q_u32(&MIDS[4]);
507
177k
    TMP2A = STATE0A;
508
177k
    TMP2B = STATE0B;
509
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
510
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
511
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
512
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
513
514
    // Transform 2: Rounds 9-12
515
177k
    TMP = vld1q_u32(&MIDS[8]);
516
177k
    TMP2A = STATE0A;
517
177k
    TMP2B = STATE0B;
518
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
519
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
520
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
521
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
522
523
    // Transform 2: Rounds 13-16
524
177k
    TMP = vld1q_u32(&MIDS[12]);
525
177k
    TMP2A = STATE0A;
526
177k
    TMP2B = STATE0B;
527
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
528
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
529
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
530
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
531
532
    // Transform 2: Rounds 17-20
533
177k
    TMP = vld1q_u32(&MIDS[16]);
534
177k
    TMP2A = STATE0A;
535
177k
    TMP2B = STATE0B;
536
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
537
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
538
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
539
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
540
541
    // Transform 2: Rounds 21-24
542
177k
    TMP = vld1q_u32(&MIDS[20]);
543
177k
    TMP2A = STATE0A;
544
177k
    TMP2B = STATE0B;
545
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
546
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
547
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
548
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
549
550
    // Transform 2: Rounds 25-28
551
177k
    TMP = vld1q_u32(&MIDS[24]);
552
177k
    TMP2A = STATE0A;
553
177k
    TMP2B = STATE0B;
554
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
555
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
556
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
557
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
558
559
    // Transform 2: Rounds 29-32
560
177k
    TMP = vld1q_u32(&MIDS[28]);
561
177k
    TMP2A = STATE0A;
562
177k
    TMP2B = STATE0B;
563
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
564
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
565
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
566
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
567
568
    // Transform 2: Rounds 33-36
569
177k
    TMP = vld1q_u32(&MIDS[32]);
570
177k
    TMP2A = STATE0A;
571
177k
    TMP2B = STATE0B;
572
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
573
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
574
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
575
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
576
577
    // Transform 2: Rounds 37-40
578
177k
    TMP = vld1q_u32(&MIDS[36]);
579
177k
    TMP2A = STATE0A;
580
177k
    TMP2B = STATE0B;
581
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
582
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
583
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
584
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
585
586
    // Transform 2: Rounds 41-44
587
177k
    TMP = vld1q_u32(&MIDS[40]);
588
177k
    TMP2A = STATE0A;
589
177k
    TMP2B = STATE0B;
590
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
591
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
592
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
593
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
594
595
    // Transform 2: Rounds 45-48
596
177k
    TMP = vld1q_u32(&MIDS[44]);
597
177k
    TMP2A = STATE0A;
598
177k
    TMP2B = STATE0B;
599
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
600
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
601
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
602
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
603
604
    // Transform 2: Rounds 49-52
605
177k
    TMP = vld1q_u32(&MIDS[48]);
606
177k
    TMP2A = STATE0A;
607
177k
    TMP2B = STATE0B;
608
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
609
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
610
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
611
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
612
613
    // Transform 2: Rounds 53-56
614
177k
    TMP = vld1q_u32(&MIDS[52]);
615
177k
    TMP2A = STATE0A;
616
177k
    TMP2B = STATE0B;
617
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
618
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
619
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
620
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
621
622
    // Transform 2: Rounds 57-60
623
177k
    TMP = vld1q_u32(&MIDS[56]);
624
177k
    TMP2A = STATE0A;
625
177k
    TMP2B = STATE0B;
626
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
627
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
628
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
629
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
630
631
    // Transform 2: Rounds 61-64
632
177k
    TMP = vld1q_u32(&MIDS[60]);
633
177k
    TMP2A = STATE0A;
634
177k
    TMP2B = STATE0B;
635
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
636
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
637
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
638
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
639
640
    // Transform 2: Update state
641
177k
    STATE0A = vaddq_u32(STATE0A, ABCD_SAVEA);
642
177k
    STATE0B = vaddq_u32(STATE0B, ABCD_SAVEB);
643
177k
    STATE1A = vaddq_u32(STATE1A, EFGH_SAVEA);
644
177k
    STATE1B = vaddq_u32(STATE1B, EFGH_SAVEB);
645
646
    // Transform 3: Pad previous output
647
177k
    MSG0A = STATE0A;
648
177k
    MSG0B = STATE0B;
649
177k
    MSG1A = STATE1A;
650
177k
    MSG1B = STATE1B;
651
177k
    MSG2A = vld1q_u32(&FINAL[0]);
652
177k
    MSG2B = MSG2A;
653
177k
    MSG3A = vld1q_u32(&FINAL[4]);
654
177k
    MSG3B = MSG3A;
655
656
    // Transform 3: Load state
657
177k
    STATE0A = vld1q_u32(&INIT[0]);
658
177k
    STATE0B = STATE0A;
659
177k
    STATE1A = vld1q_u32(&INIT[4]);
660
177k
    STATE1B = STATE1A;
661
662
    // Transform 3: Rounds 1-4
663
177k
    TMP = vld1q_u32(&K[0]);
664
177k
    TMP0A = vaddq_u32(MSG0A, TMP);
665
177k
    TMP0B = vaddq_u32(MSG0B, TMP);
666
177k
    TMP2A = STATE0A;
667
177k
    TMP2B = STATE0B;
668
177k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
669
177k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
670
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
671
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
672
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
673
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
674
177k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
675
177k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
676
677
    // Transform 3: Rounds 5-8
678
177k
    TMP = vld1q_u32(&K[4]);
679
177k
    TMP0A = vaddq_u32(MSG1A, TMP);
680
177k
    TMP0B = vaddq_u32(MSG1B, TMP);
681
177k
    TMP2A = STATE0A;
682
177k
    TMP2B = STATE0B;
683
177k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
684
177k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
685
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
686
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
687
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
688
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
689
177k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
690
177k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
691
692
    // Transform 3: Rounds 9-12
693
177k
    TMP = vld1q_u32(&FINS[0]);
694
177k
    TMP2A = STATE0A;
695
177k
    TMP2B = STATE0B;
696
177k
    MSG2A = vld1q_u32(&FINS[4]);
697
177k
    MSG2B = MSG2A;
698
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
699
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
700
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
701
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
702
177k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
703
177k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
704
705
    // Transform 3: Rounds 13-16
706
177k
    TMP = vld1q_u32(&FINS[8]);
707
177k
    TMP2A = STATE0A;
708
177k
    TMP2B = STATE0B;
709
177k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
710
177k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
711
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
712
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
713
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
714
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
715
177k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
716
177k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
717
718
    // Transform 3: Rounds 17-20
719
177k
    TMP = vld1q_u32(&K[16]);
720
177k
    TMP0A = vaddq_u32(MSG0A, TMP);
721
177k
    TMP0B = vaddq_u32(MSG0B, TMP);
722
177k
    TMP2A = STATE0A;
723
177k
    TMP2B = STATE0B;
724
177k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
725
177k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
726
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
727
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
728
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
729
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
730
177k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
731
177k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
732
733
    // Transform 3: Rounds 21-24
734
177k
    TMP = vld1q_u32(&K[20]);
735
177k
    TMP0A = vaddq_u32(MSG1A, TMP);
736
177k
    TMP0B = vaddq_u32(MSG1B, TMP);
737
177k
    TMP2A = STATE0A;
738
177k
    TMP2B = STATE0B;
739
177k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
740
177k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
741
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
742
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
743
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
744
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
745
177k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
746
177k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
747
748
    // Transform 3: Rounds 25-28
749
177k
    TMP = vld1q_u32(&K[24]);
750
177k
    TMP0A = vaddq_u32(MSG2A, TMP);
751
177k
    TMP0B = vaddq_u32(MSG2B, TMP);
752
177k
    TMP2A = STATE0A;
753
177k
    TMP2B = STATE0B;
754
177k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
755
177k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
756
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
757
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
758
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
759
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
760
177k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
761
177k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
762
763
    // Transform 3: Rounds 29-32
764
177k
    TMP = vld1q_u32(&K[28]);
765
177k
    TMP0A = vaddq_u32(MSG3A, TMP);
766
177k
    TMP0B = vaddq_u32(MSG3B, TMP);
767
177k
    TMP2A = STATE0A;
768
177k
    TMP2B = STATE0B;
769
177k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
770
177k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
771
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
772
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
773
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
774
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
775
177k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
776
177k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
777
778
    // Transform 3: Rounds 33-36
779
177k
    TMP = vld1q_u32(&K[32]);
780
177k
    TMP0A = vaddq_u32(MSG0A, TMP);
781
177k
    TMP0B = vaddq_u32(MSG0B, TMP);
782
177k
    TMP2A = STATE0A;
783
177k
    TMP2B = STATE0B;
784
177k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
785
177k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
786
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
787
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
788
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
789
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
790
177k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
791
177k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
792
793
    // Transform 3: Rounds 37-40
794
177k
    TMP = vld1q_u32(&K[36]);
795
177k
    TMP0A = vaddq_u32(MSG1A, TMP);
796
177k
    TMP0B = vaddq_u32(MSG1B, TMP);
797
177k
    TMP2A = STATE0A;
798
177k
    TMP2B = STATE0B;
799
177k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
800
177k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
801
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
802
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
803
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
804
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
805
177k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
806
177k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
807
808
    // Transform 3: Rounds 41-44
809
177k
    TMP = vld1q_u32(&K[40]);
810
177k
    TMP0A = vaddq_u32(MSG2A, TMP);
811
177k
    TMP0B = vaddq_u32(MSG2B, TMP);
812
177k
    TMP2A = STATE0A;
813
177k
    TMP2B = STATE0B;
814
177k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
815
177k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
816
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
817
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
818
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
819
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
820
177k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
821
177k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
822
823
    // Transform 3: Rounds 45-48
824
177k
    TMP = vld1q_u32(&K[44]);
825
177k
    TMP0A = vaddq_u32(MSG3A, TMP);
826
177k
    TMP0B = vaddq_u32(MSG3B, TMP);
827
177k
    TMP2A = STATE0A;
828
177k
    TMP2B = STATE0B;
829
177k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
830
177k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
831
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
832
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
833
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
834
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
835
177k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
836
177k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
837
838
    // Transform 3: Rounds 49-52
839
177k
    TMP = vld1q_u32(&K[48]);
840
177k
    TMP0A = vaddq_u32(MSG0A, TMP);
841
177k
    TMP0B = vaddq_u32(MSG0B, TMP);
842
177k
    TMP2A = STATE0A;
843
177k
    TMP2B = STATE0B;
844
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
845
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
846
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
847
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
848
849
    // Transform 3: Rounds 53-56
850
177k
    TMP = vld1q_u32(&K[52]);
851
177k
    TMP0A = vaddq_u32(MSG1A, TMP);
852
177k
    TMP0B = vaddq_u32(MSG1B, TMP);
853
177k
    TMP2A = STATE0A;
854
177k
    TMP2B = STATE0B;
855
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
856
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
857
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
858
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
859
860
    // Transform 3: Rounds 57-60
861
177k
    TMP = vld1q_u32(&K[56]);
862
177k
    TMP0A = vaddq_u32(MSG2A, TMP);
863
177k
    TMP0B = vaddq_u32(MSG2B, TMP);
864
177k
    TMP2A = STATE0A;
865
177k
    TMP2B = STATE0B;
866
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
867
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
868
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
869
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
870
871
    // Transform 3: Rounds 61-64
872
177k
    TMP = vld1q_u32(&K[60]);
873
177k
    TMP0A = vaddq_u32(MSG3A, TMP);
874
177k
    TMP0B = vaddq_u32(MSG3B, TMP);
875
177k
    TMP2A = STATE0A;
876
177k
    TMP2B = STATE0B;
877
177k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
878
177k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
879
177k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
880
177k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
881
882
    // Transform 3: Update state
883
177k
    TMP = vld1q_u32(&INIT[0]);
884
177k
    STATE0A = vaddq_u32(STATE0A, TMP);
885
177k
    STATE0B = vaddq_u32(STATE0B, TMP);
886
177k
    TMP = vld1q_u32(&INIT[4]);
887
177k
    STATE1A = vaddq_u32(STATE1A, TMP);
888
177k
    STATE1B = vaddq_u32(STATE1B, TMP);
889
890
    // Store result
891
177k
    vst1q_u8(output, vrev32q_u8(vreinterpretq_u8_u32(STATE0A)));
892
177k
    vst1q_u8(output + 16, vrev32q_u8(vreinterpretq_u8_u32(STATE1A)));
893
177k
    vst1q_u8(output + 32, vrev32q_u8(vreinterpretq_u8_u32(STATE0B)));
894
    vst1q_u8(output + 48, vrev32q_u8(vreinterpretq_u8_u32(STATE1B)));
895
177k
}
896
}
897
898
#endif