Coverage Report

Created: 2026-06-03 10:44

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/tmp/bitcoin/src/crypto/sha256_arm_shani.cpp
Line
Count
Source
1
// Copyright (c) 2022-present The Bitcoin Core developers
2
// Distributed under the MIT software license, see the accompanying
3
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4
//
5
// Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-arm.c,
6
// Written and placed in public domain by Jeffrey Walton.
7
// Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and
8
// Barry O'Rourke for the mbedTLS project.
9
// Variant specialized for 64-byte inputs added by Pieter Wuille.
10
11
#ifdef ENABLE_ARM_SHANI
12
13
#include <array>
14
#include <cstdint>
15
#include <cstddef>
16
#include <arm_neon.h>
17
18
namespace {
19
// SHA-256 round constants: 64 32-bit words, one per round of the compression
// function. The table is aligned for uint32x4_t, and the code in this file reads
// it four words at a time with vld1q_u32(&K[i]) (i = 0, 4, 8, ..., 60), i.e. one
// vector load per group of four rounds.
alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> K =
20
{
21
    0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
22
    0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
23
    0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
24
    0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
25
    0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
26
    0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
27
    0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
28
    0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
29
    0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
30
    0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
31
    0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
32
    0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
33
    0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
34
    0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
35
    0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
36
    0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
37
};
38
}
39
40
namespace sha256_arm_shani {
41
/**
 * Run the SHA-256 compression function over `blocks` consecutive 64-byte blocks
 * using the ARM SHA-256 NEON intrinsics (vsha256hq_u32, vsha256h2q_u32,
 * vsha256su0q_u32, vsha256su1q_u32).
 *
 * @param s      Eight 32-bit state words. s[0..3] and s[4..7] are loaded once
 *               before the loop and written back in place after it.
 * @param chunk  Input bytes. 64 bytes are read per block and the pointer is
 *               advanced by 64 each iteration, so 64 * blocks bytes are read.
 * @param blocks Number of 64-byte blocks to process. With 0 the loop body never
 *               runs and the loaded state is stored back unchanged.
 */
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
42
51.8M
{
43
51.8M
    uint32x4_t STATE0, STATE1, ABCD_SAVE, EFGH_SAVE;
44
51.8M
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
45
51.8M
    uint32x4_t TMP0, TMP2;
46
47
    // Load state
    // (STATE0 = s[0..3], STATE1 = s[4..7]; they stay in registers for all blocks.)
48
51.8M
    STATE0 = vld1q_u32(&s[0]);
49
51.8M
    STATE1 = vld1q_u32(&s[4]);
50
51
224M
    // One iteration per 64-byte input block.
    while (blocks--)
52
172M
    {
53
        // Save state
        // (kept so it can be added back after the 64 rounds, see "Update state".)
54
172M
        ABCD_SAVE = STATE0;
55
172M
        EFGH_SAVE = STATE1;
56
57
        // Load and convert input chunk to Big Endian
        // (vrev32q_u8 reverses the bytes inside each 32-bit word of the 16-byte load.)
58
172M
        MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 0)));
59
172M
        MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 16)));
60
172M
        MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 32)));
61
172M
        MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 48)));
62
172M
        chunk += 64;
63
64
        // Original implementation preloaded message and constant addition which was 1-3% slower.
65
        // Now included as first step in quad round code saving one Q Neon register
66
        // "TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));"
67
68
        // Rounds 1-4
        // Every group of four rounds below follows the same pattern:
        //   TMP0 = four message words + the matching four round constants from K.
        //   TMP2 = copy of STATE0 taken before vsha256hq_u32 overwrites it, because
        //          vsha256h2q_u32 still takes the pre-update value as an operand.
        //   vsha256su0q_u32 / vsha256su1q_u32 rewrite the MSG register that was just
        //          consumed; that register is next read 16 rounds later.
69
172M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));
70
172M
        TMP2 = STATE0;
71
172M
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
72
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
73
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
74
172M
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
75
76
        // Rounds 5-8
77
172M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[4]));
78
172M
        TMP2 = STATE0;
79
172M
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
80
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
81
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
82
172M
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
83
84
        // Rounds 9-12
85
172M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[8]));
86
172M
        TMP2 = STATE0;
87
172M
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
88
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
89
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
90
172M
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
91
92
        // Rounds 13-16
93
172M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[12]));
94
172M
        TMP2 = STATE0;
95
172M
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
96
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
97
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
98
172M
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
99
100
        // Rounds 17-20
101
172M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[16]));
102
172M
        TMP2 = STATE0;
103
172M
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
104
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
105
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
106
172M
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
107
108
        // Rounds 21-24
109
172M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[20]));
110
172M
        TMP2 = STATE0;
111
172M
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
112
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
113
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
114
172M
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
115
116
        // Rounds 25-28
117
172M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[24]));
118
172M
        TMP2 = STATE0;
119
172M
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
120
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
121
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
122
172M
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
123
124
        // Rounds 29-32
125
172M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[28]));
126
172M
        TMP2 = STATE0;
127
172M
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
128
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
129
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
130
172M
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
131
132
        // Rounds 33-36
133
172M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[32]));
134
172M
        TMP2 = STATE0;
135
172M
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
136
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
137
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
138
172M
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
139
140
        // Rounds 37-40
141
172M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[36]));
142
172M
        TMP2 = STATE0;
143
172M
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
144
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
145
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
146
172M
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
147
148
        // Rounds 41-44
149
172M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[40]));
150
172M
        TMP2 = STATE0;
151
172M
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
152
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
153
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
154
172M
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
155
156
        // Rounds 45-48
157
172M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[44]));
158
172M
        TMP2 = STATE0;
159
172M
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
160
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
161
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
162
172M
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
163
164
        // Rounds 49-52
        // From here to round 64 there are no vsha256su0q_u32 / vsha256su1q_u32 calls:
        // MSG0..MSG3 are each consumed one last time and are not read again in this
        // iteration, so no further schedule words have to be produced.
165
172M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[48]));
166
172M
        TMP2 = STATE0;
167
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
168
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
169
170
        // Rounds 53-56
171
172M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[52]));
172
172M
        TMP2 = STATE0;
173
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
174
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
175
176
        // Rounds 57-60
177
172M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[56]));
178
172M
        TMP2 = STATE0;
179
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
180
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
181
182
        // Rounds 61-64
183
172M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[60]));
184
172M
        TMP2 = STATE0;
185
172M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
186
172M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
187
188
        // Update state
        // (add the values saved at the top of this iteration to the round output;
        // the sums are the starting state for the next block, if any.)
189
172M
        STATE0 = vaddq_u32(STATE0, ABCD_SAVE);
190
172M
        STATE1 = vaddq_u32(STATE1, EFGH_SAVE);
191
172M
    }
192
193
    // Save final state
    // (written back to the same eight words of `s` that were loaded at the start.)
194
51.8M
    vst1q_u32(&s[0], STATE0);
195
51.8M
    vst1q_u32(&s[4], STATE1);
196
51.8M
}
197
}
198
199
namespace sha256d64_arm_shani {
200
void Transform_2way(unsigned char* output, const unsigned char* input)
201
141k
{
202
    /* Initial state. */
203
141k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> INIT = {
204
141k
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
205
141k
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
206
141k
    };
207
208
    /* Precomputed message schedule for the 2nd transform. */
209
141k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> MIDS = {
210
141k
        0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
211
141k
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
212
141k
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
213
141k
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374,
214
141k
        0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254,
215
141k
        0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa,
216
141k
        0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7,
217
141k
        0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0,
218
141k
        0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd,
219
141k
        0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16,
220
141k
        0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537,
221
141k
        0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37,
222
141k
        0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7,
223
141k
        0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890,
224
141k
        0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c,
225
141k
        0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76
226
141k
    };
227
228
    /* A few precomputed message schedule values for the 3rd transform. */
229
141k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 12> FINS = {
230
141k
        0x5807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
231
141k
        0x80000000, 0x00000000, 0x00000000, 0x00000000,
232
141k
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf274
233
141k
    };
234
235
    /* Padding processed in the 3rd transform (byteswapped). */
236
141k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> FINAL = {0x80000000, 0, 0, 0, 0, 0, 0, 0x100};
237
238
141k
    uint32x4_t STATE0A, STATE0B, STATE1A, STATE1B, ABCD_SAVEA, ABCD_SAVEB, EFGH_SAVEA, EFGH_SAVEB;
239
141k
    uint32x4_t MSG0A, MSG0B, MSG1A, MSG1B, MSG2A, MSG2B, MSG3A, MSG3B;
240
141k
    uint32x4_t TMP0A, TMP0B, TMP2A, TMP2B, TMP;
241
242
    // Transform 1: Load state
243
141k
    STATE0A = vld1q_u32(&INIT[0]);
244
141k
    STATE0B = STATE0A;
245
141k
    STATE1A = vld1q_u32(&INIT[4]);
246
141k
    STATE1B = STATE1A;
247
248
    // Transform 1: Load and convert input chunk to Big Endian
249
141k
    MSG0A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 0)));
250
141k
    MSG1A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16)));
251
141k
    MSG2A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32)));
252
141k
    MSG3A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48)));
253
141k
    MSG0B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 64)));
254
141k
    MSG1B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 80)));
255
141k
    MSG2B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 96)));
256
141k
    MSG3B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 112)));
257
258
    // Transform 1: Rounds 1-4
259
141k
    TMP = vld1q_u32(&K[0]);
260
141k
    TMP0A = vaddq_u32(MSG0A, TMP);
261
141k
    TMP0B = vaddq_u32(MSG0B, TMP);
262
141k
    TMP2A = STATE0A;
263
141k
    TMP2B = STATE0B;
264
141k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
265
141k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
266
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
267
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
268
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
269
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
270
141k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
271
141k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
272
273
    // Transform 1: Rounds 5-8
274
141k
    TMP = vld1q_u32(&K[4]);
275
141k
    TMP0A = vaddq_u32(MSG1A, TMP);
276
141k
    TMP0B = vaddq_u32(MSG1B, TMP);
277
141k
    TMP2A = STATE0A;
278
141k
    TMP2B = STATE0B;
279
141k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
280
141k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
281
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
282
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
283
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
284
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
285
141k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
286
141k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
287
288
    // Transform 1: Rounds 9-12
289
141k
    TMP = vld1q_u32(&K[8]);
290
141k
    TMP0A = vaddq_u32(MSG2A, TMP);
291
141k
    TMP0B = vaddq_u32(MSG2B, TMP);
292
141k
    TMP2A = STATE0A;
293
141k
    TMP2B = STATE0B;
294
141k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
295
141k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
296
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
297
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
298
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
299
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
300
141k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
301
141k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
302
303
    // Transform 1: Rounds 13-16
304
141k
    TMP = vld1q_u32(&K[12]);
305
141k
    TMP0A = vaddq_u32(MSG3A, TMP);
306
141k
    TMP0B = vaddq_u32(MSG3B, TMP);
307
141k
    TMP2A = STATE0A;
308
141k
    TMP2B = STATE0B;
309
141k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
310
141k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
311
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
312
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
313
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
314
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
315
141k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
316
141k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
317
318
    // Transform 1: Rounds 17-20
319
141k
    TMP = vld1q_u32(&K[16]);
320
141k
    TMP0A = vaddq_u32(MSG0A, TMP);
321
141k
    TMP0B = vaddq_u32(MSG0B, TMP);
322
141k
    TMP2A = STATE0A;
323
141k
    TMP2B = STATE0B;
324
141k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
325
141k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
326
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
327
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
328
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
329
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
330
141k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
331
141k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
332
333
    // Transform 1: Rounds 21-24
334
141k
    TMP = vld1q_u32(&K[20]);
335
141k
    TMP0A = vaddq_u32(MSG1A, TMP);
336
141k
    TMP0B = vaddq_u32(MSG1B, TMP);
337
141k
    TMP2A = STATE0A;
338
141k
    TMP2B = STATE0B;
339
141k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
340
141k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
341
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
342
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
343
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
344
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
345
141k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
346
141k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
347
348
    // Transform 1: Rounds 25-28
349
141k
    TMP = vld1q_u32(&K[24]);
350
141k
    TMP0A = vaddq_u32(MSG2A, TMP);
351
141k
    TMP0B = vaddq_u32(MSG2B, TMP);
352
141k
    TMP2A = STATE0A;
353
141k
    TMP2B = STATE0B;
354
141k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
355
141k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
356
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
357
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
358
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
359
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
360
141k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
361
141k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
362
363
    // Transform 1: Rounds 29-32
364
141k
    TMP = vld1q_u32(&K[28]);
365
141k
    TMP0A = vaddq_u32(MSG3A, TMP);
366
141k
    TMP0B = vaddq_u32(MSG3B, TMP);
367
141k
    TMP2A = STATE0A;
368
141k
    TMP2B = STATE0B;
369
141k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
370
141k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
371
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
372
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
373
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
374
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
375
141k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
376
141k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
377
378
    // Transform 1: Rounds 33-36
379
141k
    TMP = vld1q_u32(&K[32]);
380
141k
    TMP0A = vaddq_u32(MSG0A, TMP);
381
141k
    TMP0B = vaddq_u32(MSG0B, TMP);
382
141k
    TMP2A = STATE0A;
383
141k
    TMP2B = STATE0B;
384
141k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
385
141k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
386
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
387
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
388
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
389
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
390
141k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
391
141k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
392
393
    // Transform 1: Rounds 37-40
394
141k
    TMP = vld1q_u32(&K[36]);
395
141k
    TMP0A = vaddq_u32(MSG1A, TMP);
396
141k
    TMP0B = vaddq_u32(MSG1B, TMP);
397
141k
    TMP2A = STATE0A;
398
141k
    TMP2B = STATE0B;
399
141k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
400
141k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
401
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
402
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
403
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
404
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
405
141k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
406
141k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
407
408
    // Transform 1: Rounds 41-44
409
141k
    TMP = vld1q_u32(&K[40]);
410
141k
    TMP0A = vaddq_u32(MSG2A, TMP);
411
141k
    TMP0B = vaddq_u32(MSG2B, TMP);
412
141k
    TMP2A = STATE0A;
413
141k
    TMP2B = STATE0B;
414
141k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
415
141k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
416
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
417
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
418
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
419
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
420
141k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
421
141k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
422
423
    // Transform 1: Rounds 45-48
424
141k
    TMP = vld1q_u32(&K[44]);
425
141k
    TMP0A = vaddq_u32(MSG3A, TMP);
426
141k
    TMP0B = vaddq_u32(MSG3B, TMP);
427
141k
    TMP2A = STATE0A;
428
141k
    TMP2B = STATE0B;
429
141k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
430
141k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
431
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
432
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
433
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
434
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
435
141k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
436
141k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
437
438
    // Transform 1: Rounds 49-52
439
141k
    TMP = vld1q_u32(&K[48]);
440
141k
    TMP0A = vaddq_u32(MSG0A, TMP);
441
141k
    TMP0B = vaddq_u32(MSG0B, TMP);
442
141k
    TMP2A = STATE0A;
443
141k
    TMP2B = STATE0B;
444
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
445
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
446
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
447
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
448
449
    // Transform 1: Rounds 53-56
450
141k
    TMP = vld1q_u32(&K[52]);
451
141k
    TMP0A = vaddq_u32(MSG1A, TMP);
452
141k
    TMP0B = vaddq_u32(MSG1B, TMP);
453
141k
    TMP2A = STATE0A;
454
141k
    TMP2B = STATE0B;
455
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
456
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
457
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
458
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
459
460
    // Transform 1: Rounds 57-60
461
141k
    TMP = vld1q_u32(&K[56]);
462
141k
    TMP0A = vaddq_u32(MSG2A, TMP);
463
141k
    TMP0B = vaddq_u32(MSG2B, TMP);
464
141k
    TMP2A = STATE0A;
465
141k
    TMP2B = STATE0B;
466
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
467
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
468
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
469
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
470
471
    // Transform 1: Rounds 61-64
472
141k
    TMP = vld1q_u32(&K[60]);
473
141k
    TMP0A = vaddq_u32(MSG3A, TMP);
474
141k
    TMP0B = vaddq_u32(MSG3B, TMP);
475
141k
    TMP2A = STATE0A;
476
141k
    TMP2B = STATE0B;
477
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
478
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
479
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
480
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
481
482
    // Transform 1: Update state
483
141k
    TMP = vld1q_u32(&INIT[0]);
484
141k
    STATE0A = vaddq_u32(STATE0A, TMP);
485
141k
    STATE0B = vaddq_u32(STATE0B, TMP);
486
141k
    TMP = vld1q_u32(&INIT[4]);
487
141k
    STATE1A = vaddq_u32(STATE1A, TMP);
488
141k
    STATE1B = vaddq_u32(STATE1B, TMP);
489
490
    // Transform 2: Save state
491
141k
    ABCD_SAVEA = STATE0A;
492
141k
    ABCD_SAVEB = STATE0B;
493
141k
    EFGH_SAVEA = STATE1A;
494
141k
    EFGH_SAVEB = STATE1B;
495
496
    // Transform 2: Rounds 1-4
497
141k
    TMP = vld1q_u32(&MIDS[0]);
498
141k
    TMP2A = STATE0A;
499
141k
    TMP2B = STATE0B;
500
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
501
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
502
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
503
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
504
505
    // Transform 2: Rounds 5-8
506
141k
    TMP = vld1q_u32(&MIDS[4]);
507
141k
    TMP2A = STATE0A;
508
141k
    TMP2B = STATE0B;
509
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
510
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
511
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
512
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
513
514
    // Transform 2: Rounds 9-12
515
141k
    TMP = vld1q_u32(&MIDS[8]);
516
141k
    TMP2A = STATE0A;
517
141k
    TMP2B = STATE0B;
518
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
519
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
520
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
521
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
522
523
    // Transform 2: Rounds 13-16
524
141k
    TMP = vld1q_u32(&MIDS[12]);
525
141k
    TMP2A = STATE0A;
526
141k
    TMP2B = STATE0B;
527
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
528
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
529
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
530
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
531
532
    // Transform 2: Rounds 17-20
533
141k
    TMP = vld1q_u32(&MIDS[16]);
534
141k
    TMP2A = STATE0A;
535
141k
    TMP2B = STATE0B;
536
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
537
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
538
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
539
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
540
541
    // Transform 2: Rounds 21-24
542
141k
    TMP = vld1q_u32(&MIDS[20]);
543
141k
    TMP2A = STATE0A;
544
141k
    TMP2B = STATE0B;
545
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
546
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
547
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
548
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
549
550
    // Transform 2: Rounds 25-28
551
141k
    TMP = vld1q_u32(&MIDS[24]);
552
141k
    TMP2A = STATE0A;
553
141k
    TMP2B = STATE0B;
554
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
555
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
556
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
557
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
558
559
    // Transform 2: Rounds 29-32
560
141k
    TMP = vld1q_u32(&MIDS[28]);
561
141k
    TMP2A = STATE0A;
562
141k
    TMP2B = STATE0B;
563
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
564
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
565
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
566
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
567
568
    // Transform 2: Rounds 33-36
569
141k
    TMP = vld1q_u32(&MIDS[32]);
570
141k
    TMP2A = STATE0A;
571
141k
    TMP2B = STATE0B;
572
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
573
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
574
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
575
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
576
577
    // Transform 2: Rounds 37-40
578
141k
    TMP = vld1q_u32(&MIDS[36]);
579
141k
    TMP2A = STATE0A;
580
141k
    TMP2B = STATE0B;
581
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
582
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
583
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
584
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
585
586
    // Transform 2: Rounds 41-44
587
141k
    TMP = vld1q_u32(&MIDS[40]);
588
141k
    TMP2A = STATE0A;
589
141k
    TMP2B = STATE0B;
590
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
591
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
592
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
593
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
594
595
    // Transform 2: Rounds 45-48
596
141k
    TMP = vld1q_u32(&MIDS[44]);
597
141k
    TMP2A = STATE0A;
598
141k
    TMP2B = STATE0B;
599
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
600
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
601
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
602
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
603
604
    // Transform 2: Rounds 49-52
605
141k
    TMP = vld1q_u32(&MIDS[48]);
606
141k
    TMP2A = STATE0A;
607
141k
    TMP2B = STATE0B;
608
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
609
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
610
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
611
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
612
613
    // Transform 2: Rounds 53-56
614
141k
    TMP = vld1q_u32(&MIDS[52]);
615
141k
    TMP2A = STATE0A;
616
141k
    TMP2B = STATE0B;
617
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
618
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
619
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
620
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
621
622
    // Transform 2: Rounds 57-60
623
141k
    TMP = vld1q_u32(&MIDS[56]);
624
141k
    TMP2A = STATE0A;
625
141k
    TMP2B = STATE0B;
626
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
627
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
628
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
629
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
630
631
    // Transform 2: Rounds 61-64
632
141k
    TMP = vld1q_u32(&MIDS[60]);
633
141k
    TMP2A = STATE0A;
634
141k
    TMP2B = STATE0B;
635
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
636
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
637
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
638
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
639
640
    // Transform 2: Update state
641
141k
    STATE0A = vaddq_u32(STATE0A, ABCD_SAVEA);
642
141k
    STATE0B = vaddq_u32(STATE0B, ABCD_SAVEB);
643
141k
    STATE1A = vaddq_u32(STATE1A, EFGH_SAVEA);
644
141k
    STATE1B = vaddq_u32(STATE1B, EFGH_SAVEB);
645
646
    // Transform 3: Pad previous output
647
141k
    MSG0A = STATE0A;
648
141k
    MSG0B = STATE0B;
649
141k
    MSG1A = STATE1A;
650
141k
    MSG1B = STATE1B;
651
141k
    MSG2A = vld1q_u32(&FINAL[0]);
652
141k
    MSG2B = MSG2A;
653
141k
    MSG3A = vld1q_u32(&FINAL[4]);
654
141k
    MSG3B = MSG3A;
655
656
    // Transform 3: Load state
657
141k
    STATE0A = vld1q_u32(&INIT[0]);
658
141k
    STATE0B = STATE0A;
659
141k
    STATE1A = vld1q_u32(&INIT[4]);
660
141k
    STATE1B = STATE1A;
661
662
    // Transform 3: Rounds 1-4
663
141k
    TMP = vld1q_u32(&K[0]);
664
141k
    TMP0A = vaddq_u32(MSG0A, TMP);
665
141k
    TMP0B = vaddq_u32(MSG0B, TMP);
666
141k
    TMP2A = STATE0A;
667
141k
    TMP2B = STATE0B;
668
141k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
669
141k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
670
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
671
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
672
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
673
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
674
141k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
675
141k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
676
677
    // Transform 3: Rounds 5-8
678
141k
    TMP = vld1q_u32(&K[4]);
679
141k
    TMP0A = vaddq_u32(MSG1A, TMP);
680
141k
    TMP0B = vaddq_u32(MSG1B, TMP);
681
141k
    TMP2A = STATE0A;
682
141k
    TMP2B = STATE0B;
683
141k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
684
141k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
685
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
686
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
687
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
688
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
689
141k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
690
141k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
691
692
    // Transform 3: Rounds 9-12
693
141k
    TMP = vld1q_u32(&FINS[0]);
694
141k
    TMP2A = STATE0A;
695
141k
    TMP2B = STATE0B;
696
141k
    MSG2A = vld1q_u32(&FINS[4]);
697
141k
    MSG2B = MSG2A;
698
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
699
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
700
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
701
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
702
141k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
703
141k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
704
705
    // Transform 3: Rounds 13-16
706
141k
    TMP = vld1q_u32(&FINS[8]);
707
141k
    TMP2A = STATE0A;
708
141k
    TMP2B = STATE0B;
709
141k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
710
141k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
711
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
712
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
713
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
714
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
715
141k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
716
141k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
717
718
    // Transform 3: Rounds 17-20
719
141k
    TMP = vld1q_u32(&K[16]);
720
141k
    TMP0A = vaddq_u32(MSG0A, TMP);
721
141k
    TMP0B = vaddq_u32(MSG0B, TMP);
722
141k
    TMP2A = STATE0A;
723
141k
    TMP2B = STATE0B;
724
141k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
725
141k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
726
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
727
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
728
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
729
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
730
141k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
731
141k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
732
733
    // Transform 3: Rounds 21-24
734
141k
    TMP = vld1q_u32(&K[20]);
735
141k
    TMP0A = vaddq_u32(MSG1A, TMP);
736
141k
    TMP0B = vaddq_u32(MSG1B, TMP);
737
141k
    TMP2A = STATE0A;
738
141k
    TMP2B = STATE0B;
739
141k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
740
141k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
741
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
742
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
743
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
744
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
745
141k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
746
141k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
747
748
    // Transform 3: Rounds 25-28
749
141k
    TMP = vld1q_u32(&K[24]);
750
141k
    TMP0A = vaddq_u32(MSG2A, TMP);
751
141k
    TMP0B = vaddq_u32(MSG2B, TMP);
752
141k
    TMP2A = STATE0A;
753
141k
    TMP2B = STATE0B;
754
141k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
755
141k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
756
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
757
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
758
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
759
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
760
141k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
761
141k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
762
763
    // Transform 3: Rounds 29-32
764
141k
    TMP = vld1q_u32(&K[28]);
765
141k
    TMP0A = vaddq_u32(MSG3A, TMP);
766
141k
    TMP0B = vaddq_u32(MSG3B, TMP);
767
141k
    TMP2A = STATE0A;
768
141k
    TMP2B = STATE0B;
769
141k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
770
141k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
771
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
772
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
773
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
774
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
775
141k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
776
141k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
777
778
    // Transform 3: Rounds 33-36
779
141k
    TMP = vld1q_u32(&K[32]);
780
141k
    TMP0A = vaddq_u32(MSG0A, TMP);
781
141k
    TMP0B = vaddq_u32(MSG0B, TMP);
782
141k
    TMP2A = STATE0A;
783
141k
    TMP2B = STATE0B;
784
141k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
785
141k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
786
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
787
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
788
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
789
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
790
141k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
791
141k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
792
793
    // Transform 3: Rounds 37-40
794
141k
    TMP = vld1q_u32(&K[36]);
795
141k
    TMP0A = vaddq_u32(MSG1A, TMP);
796
141k
    TMP0B = vaddq_u32(MSG1B, TMP);
797
141k
    TMP2A = STATE0A;
798
141k
    TMP2B = STATE0B;
799
141k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
800
141k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
801
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
802
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
803
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
804
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
805
141k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
806
141k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
807
808
    // Transform 3: Rounds 41-44
809
141k
    TMP = vld1q_u32(&K[40]);
810
141k
    TMP0A = vaddq_u32(MSG2A, TMP);
811
141k
    TMP0B = vaddq_u32(MSG2B, TMP);
812
141k
    TMP2A = STATE0A;
813
141k
    TMP2B = STATE0B;
814
141k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
815
141k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
816
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
817
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
818
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
819
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
820
141k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
821
141k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
822
823
    // Transform 3: Rounds 45-48
824
141k
    TMP = vld1q_u32(&K[44]);
825
141k
    TMP0A = vaddq_u32(MSG3A, TMP);
826
141k
    TMP0B = vaddq_u32(MSG3B, TMP);
827
141k
    TMP2A = STATE0A;
828
141k
    TMP2B = STATE0B;
829
141k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
830
141k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
831
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
832
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
833
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
834
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
835
141k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
836
141k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
837
838
    // Transform 3: Rounds 49-52
839
141k
    TMP = vld1q_u32(&K[48]);
840
141k
    TMP0A = vaddq_u32(MSG0A, TMP);
841
141k
    TMP0B = vaddq_u32(MSG0B, TMP);
842
141k
    TMP2A = STATE0A;
843
141k
    TMP2B = STATE0B;
844
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
845
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
846
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
847
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
848
849
    // Transform 3: Rounds 53-56
850
141k
    TMP = vld1q_u32(&K[52]);
851
141k
    TMP0A = vaddq_u32(MSG1A, TMP);
852
141k
    TMP0B = vaddq_u32(MSG1B, TMP);
853
141k
    TMP2A = STATE0A;
854
141k
    TMP2B = STATE0B;
855
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
856
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
857
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
858
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
859
860
    // Transform 3: Rounds 57-60
861
141k
    TMP = vld1q_u32(&K[56]);
862
141k
    TMP0A = vaddq_u32(MSG2A, TMP);
863
141k
    TMP0B = vaddq_u32(MSG2B, TMP);
864
141k
    TMP2A = STATE0A;
865
141k
    TMP2B = STATE0B;
866
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
867
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
868
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
869
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
870
871
    // Transform 3: Rounds 61-64
872
141k
    TMP = vld1q_u32(&K[60]);
873
141k
    TMP0A = vaddq_u32(MSG3A, TMP);
874
141k
    TMP0B = vaddq_u32(MSG3B, TMP);
875
141k
    TMP2A = STATE0A;
876
141k
    TMP2B = STATE0B;
877
141k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
878
141k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
879
141k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
880
141k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
881
882
    // Transform 3: Update state
883
141k
    TMP = vld1q_u32(&INIT[0]);
884
141k
    STATE0A = vaddq_u32(STATE0A, TMP);
885
141k
    STATE0B = vaddq_u32(STATE0B, TMP);
886
141k
    TMP = vld1q_u32(&INIT[4]);
887
141k
    STATE1A = vaddq_u32(STATE1A, TMP);
888
141k
    STATE1B = vaddq_u32(STATE1B, TMP);
889
890
    // Store result
891
141k
    vst1q_u8(output, vrev32q_u8(vreinterpretq_u8_u32(STATE0A)));
892
141k
    vst1q_u8(output + 16, vrev32q_u8(vreinterpretq_u8_u32(STATE1A)));
893
141k
    vst1q_u8(output + 32, vrev32q_u8(vreinterpretq_u8_u32(STATE0B)));
894
141k
    vst1q_u8(output + 48, vrev32q_u8(vreinterpretq_u8_u32(STATE1B)));
895
141k
}
896
}
897
898
#endif