Coverage Report

Created: 2026-05-08 10:34

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/tmp/bitcoin/src/crypto/sha256_arm_shani.cpp
Line
Count
Source
1
// Copyright (c) 2022-present The Bitcoin Core developers
2
// Distributed under the MIT software license, see the accompanying
3
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4
//
5
// Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-arm.c,
6
// Written and placed in public domain by Jeffrey Walton.
7
// Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and
8
// Barry O'Rourke for the mbedTLS project.
9
// Variant specialized for 64-byte inputs added by Pieter Wuille.
10
11
#ifdef ENABLE_ARM_SHANI
12
13
#include <array>
14
#include <cstdint>
15
#include <cstddef>
16
#include <arm_neon.h>
17
18
namespace {
// SHA-256 round constants: 64 32-bit words stored in round order.
// The transforms in this file read them four at a time with
// vld1q_u32(&K[4 * i]) and add them to the four message-schedule words of the
// matching group of rounds. alignas(uint32x4_t) gives the table the alignment
// of the 128-bit NEON vector type it is loaded into.
19
alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> K =
20
{
21
    0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
22
    0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
23
    0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
24
    0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
25
    0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
26
    0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
27
    0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
28
    0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
29
    0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
30
    0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
31
    0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
32
    0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
33
    0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
34
    0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
35
    0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
36
    0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
37
};
38
}
39
40
namespace sha256_arm_shani {
// Run the SHA-256 compression function over `blocks` consecutive 64-byte
// chunks using the ARM SHA-256 intrinsics from <arm_neon.h>, updating the
// hash state in place.
//
//   s      - eight 32-bit state words. s[0..3] and s[4..7] are each loaded
//            into one vector on entry and stored back on exit.
//   chunk  - input bytes; 64 bytes are consumed per block and the pointer is
//            advanced locally.
//   blocks - number of 64-byte blocks to process. With 0 the loop body is
//            skipped and `s` is rewritten with the values just loaded.
41
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
42
51.7M
{
43
51.7M
    uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
44
51.7M
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
45
51.7M
    uint32x4_t TMP0, TMP2;
46
47
    // Load state
48
51.7M
    STATE0 = vld1q_u32(&s[0]);
49
51.7M
    STATE1 = vld1q_u32(&s[4]);
50
51
223M
    while (blocks--)
52
171M
    {
53
        // Save state
        // (ABEF_SAVE / CDGH_SAVE are plain copies of STATE0 / STATE1, i.e. of
        // s[0..3] / s[4..7]; they are added back after the 64 rounds.)
54
171M
        ABEF_SAVE = STATE0;
55
171M
        CDGH_SAVE = STATE1;
56
57
        // Load and convert input chunk to Big Endian
        // (vrev32q_u8 reverses the bytes inside every 32-bit lane, so each
        // MSGn holds four consecutive message words.)
58
171M
        MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 0)));
59
171M
        MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 16)));
60
171M
        MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 32)));
61
171M
        MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 48)));
62
171M
        chunk += 64;
63
64
        // Original implementation preloaded message and constant addition which was 1-3% slower.
65
        // Now included as first step in quad round code saving one Q Neon register
66
        // "TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));"
67
68
        // Rounds 1-4
        // Pattern shared by every four-round group below:
        //   TMP0  = message words + round constants for these four rounds
        //   TMP2  = copy of STATE0 taken before it is overwritten, because
        //           vsha256h2q_u32 is passed the pre-update value
        //   su0/su1 replace the just-consumed MSGn with the schedule words
        //           needed 16 rounds later
69
171M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));
70
171M
        TMP2 = STATE0;
71
171M
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
72
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
73
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
74
171M
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
75
76
        // Rounds 5-8
77
171M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[4]));
78
171M
        TMP2 = STATE0;
79
171M
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
80
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
81
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
82
171M
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
83
84
        // Rounds 9-12
85
171M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[8]));
86
171M
        TMP2 = STATE0;
87
171M
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
88
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
89
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
90
171M
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
91
92
        // Rounds 13-16
93
171M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[12]));
94
171M
        TMP2 = STATE0;
95
171M
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
96
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
97
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
98
171M
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
99
100
        // Rounds 17-20
101
171M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[16]));
102
171M
        TMP2 = STATE0;
103
171M
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
104
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
105
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
106
171M
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
107
108
        // Rounds 21-24
109
171M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[20]));
110
171M
        TMP2 = STATE0;
111
171M
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
112
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
113
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
114
171M
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
115
116
        // Rounds 25-28
117
171M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[24]));
118
171M
        TMP2 = STATE0;
119
171M
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
120
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
121
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
122
171M
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
123
124
        // Rounds 29-32
125
171M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[28]));
126
171M
        TMP2 = STATE0;
127
171M
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
128
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
129
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
130
171M
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
131
132
        // Rounds 33-36
133
171M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[32]));
134
171M
        TMP2 = STATE0;
135
171M
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
136
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
137
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
138
171M
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
139
140
        // Rounds 37-40
141
171M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[36]));
142
171M
        TMP2 = STATE0;
143
171M
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
144
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
145
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
146
171M
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
147
148
        // Rounds 41-44
149
171M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[40]));
150
171M
        TMP2 = STATE0;
151
171M
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
152
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
153
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
154
171M
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
155
156
        // Rounds 45-48
157
171M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[44]));
158
171M
        TMP2 = STATE0;
159
171M
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
160
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
161
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
162
171M
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
163
164
        // Rounds 49-52
        // From here on no vsha256su0q/su1q calls are issued: MSG0..MSG3 are
        // consumed as they stand for the final 16 rounds.
165
171M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[48]));
166
171M
        TMP2 = STATE0;
167
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
168
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
169
170
        // Rounds 53-56
171
171M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[52]));
172
171M
        TMP2 = STATE0;
173
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
174
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
175
176
        // Rounds 57-60
177
171M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[56]));
178
171M
        TMP2 = STATE0;
179
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
180
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
181
182
        // Rounds 61-64
183
171M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[60]));
184
171M
        TMP2 = STATE0;
185
171M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
186
171M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
187
188
        // Update state
        // (lane-wise 32-bit add of the values saved at the top of this
        // iteration; the result is the input state for the next block.)
189
171M
        STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
190
171M
        STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
191
171M
    }
192
193
    // Save final state
194
51.7M
    vst1q_u32(&s[0], STATE0);
195
51.7M
    vst1q_u32(&s[4], STATE1);
196
51.7M
}
197
}
198
199
namespace sha256d64_arm_shani {
200
void Transform_2way(unsigned char* output, const unsigned char* input)
201
157k
{
202
    /* Initial state. */
203
157k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> INIT = {
204
157k
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
205
157k
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
206
157k
    };
207
208
    /* Precomputed message schedule for the 2nd transform. */
209
157k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> MIDS = {
210
157k
        0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
211
157k
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
212
157k
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
213
157k
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374,
214
157k
        0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254,
215
157k
        0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa,
216
157k
        0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7,
217
157k
        0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0,
218
157k
        0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd,
219
157k
        0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16,
220
157k
        0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537,
221
157k
        0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37,
222
157k
        0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7,
223
157k
        0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890,
224
157k
        0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c,
225
157k
        0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76
226
157k
    };
227
228
    /* A few precomputed message schedule values for the 3rd transform. */
229
157k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 12> FINS = {
230
157k
        0x5807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
231
157k
        0x80000000, 0x00000000, 0x00000000, 0x00000000,
232
157k
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf274
233
157k
    };
234
235
    /* Padding processed in the 3rd transform (byteswapped). */
236
157k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> FINAL = {0x80000000, 0, 0, 0, 0, 0, 0, 0x100};
237
238
157k
    uint32x4_t STATE0A, STATE0B, STATE1A, STATE1B, ABEF_SAVEA, ABEF_SAVEB, CDGH_SAVEA, CDGH_SAVEB;
239
157k
    uint32x4_t MSG0A, MSG0B, MSG1A, MSG1B, MSG2A, MSG2B, MSG3A, MSG3B;
240
157k
    uint32x4_t TMP0A, TMP0B, TMP2A, TMP2B, TMP;
241
242
    // Transform 1: Load state
243
157k
    STATE0A = vld1q_u32(&INIT[0]);
244
157k
    STATE0B = STATE0A;
245
157k
    STATE1A = vld1q_u32(&INIT[4]);
246
157k
    STATE1B = STATE1A;
247
248
    // Transform 1: Load and convert input chunk to Big Endian
249
157k
    MSG0A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 0)));
250
157k
    MSG1A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16)));
251
157k
    MSG2A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32)));
252
157k
    MSG3A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48)));
253
157k
    MSG0B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 64)));
254
157k
    MSG1B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 80)));
255
157k
    MSG2B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 96)));
256
157k
    MSG3B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 112)));
257
258
    // Transform 1: Rounds 1-4
259
157k
    TMP = vld1q_u32(&K[0]);
260
157k
    TMP0A = vaddq_u32(MSG0A, TMP);
261
157k
    TMP0B = vaddq_u32(MSG0B, TMP);
262
157k
    TMP2A = STATE0A;
263
157k
    TMP2B = STATE0B;
264
157k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
265
157k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
266
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
267
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
268
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
269
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
270
157k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
271
157k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
272
273
    // Transform 1: Rounds 5-8
274
157k
    TMP = vld1q_u32(&K[4]);
275
157k
    TMP0A = vaddq_u32(MSG1A, TMP);
276
157k
    TMP0B = vaddq_u32(MSG1B, TMP);
277
157k
    TMP2A = STATE0A;
278
157k
    TMP2B = STATE0B;
279
157k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
280
157k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
281
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
282
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
283
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
284
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
285
157k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
286
157k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
287
288
    // Transform 1: Rounds 9-12
289
157k
    TMP = vld1q_u32(&K[8]);
290
157k
    TMP0A = vaddq_u32(MSG2A, TMP);
291
157k
    TMP0B = vaddq_u32(MSG2B, TMP);
292
157k
    TMP2A = STATE0A;
293
157k
    TMP2B = STATE0B;
294
157k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
295
157k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
296
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
297
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
298
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
299
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
300
157k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
301
157k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
302
303
    // Transform 1: Rounds 13-16
304
157k
    TMP = vld1q_u32(&K[12]);
305
157k
    TMP0A = vaddq_u32(MSG3A, TMP);
306
157k
    TMP0B = vaddq_u32(MSG3B, TMP);
307
157k
    TMP2A = STATE0A;
308
157k
    TMP2B = STATE0B;
309
157k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
310
157k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
311
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
312
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
313
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
314
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
315
157k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
316
157k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
317
318
    // Transform 1: Rounds 17-20
319
157k
    TMP = vld1q_u32(&K[16]);
320
157k
    TMP0A = vaddq_u32(MSG0A, TMP);
321
157k
    TMP0B = vaddq_u32(MSG0B, TMP);
322
157k
    TMP2A = STATE0A;
323
157k
    TMP2B = STATE0B;
324
157k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
325
157k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
326
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
327
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
328
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
329
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
330
157k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
331
157k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
332
333
    // Transform 1: Rounds 21-24
334
157k
    TMP = vld1q_u32(&K[20]);
335
157k
    TMP0A = vaddq_u32(MSG1A, TMP);
336
157k
    TMP0B = vaddq_u32(MSG1B, TMP);
337
157k
    TMP2A = STATE0A;
338
157k
    TMP2B = STATE0B;
339
157k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
340
157k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
341
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
342
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
343
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
344
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
345
157k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
346
157k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
347
348
    // Transform 1: Rounds 25-28
349
157k
    TMP = vld1q_u32(&K[24]);
350
157k
    TMP0A = vaddq_u32(MSG2A, TMP);
351
157k
    TMP0B = vaddq_u32(MSG2B, TMP);
352
157k
    TMP2A = STATE0A;
353
157k
    TMP2B = STATE0B;
354
157k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
355
157k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
356
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
357
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
358
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
359
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
360
157k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
361
157k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
362
363
    // Transform 1: Rounds 29-32
364
157k
    TMP = vld1q_u32(&K[28]);
365
157k
    TMP0A = vaddq_u32(MSG3A, TMP);
366
157k
    TMP0B = vaddq_u32(MSG3B, TMP);
367
157k
    TMP2A = STATE0A;
368
157k
    TMP2B = STATE0B;
369
157k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
370
157k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
371
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
372
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
373
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
374
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
375
157k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
376
157k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
377
378
    // Transform 1: Rounds 33-36
379
157k
    TMP = vld1q_u32(&K[32]);
380
157k
    TMP0A = vaddq_u32(MSG0A, TMP);
381
157k
    TMP0B = vaddq_u32(MSG0B, TMP);
382
157k
    TMP2A = STATE0A;
383
157k
    TMP2B = STATE0B;
384
157k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
385
157k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
386
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
387
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
388
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
389
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
390
157k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
391
157k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
392
393
    // Transform 1: Rounds 37-40
394
157k
    TMP = vld1q_u32(&K[36]);
395
157k
    TMP0A = vaddq_u32(MSG1A, TMP);
396
157k
    TMP0B = vaddq_u32(MSG1B, TMP);
397
157k
    TMP2A = STATE0A;
398
157k
    TMP2B = STATE0B;
399
157k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
400
157k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
401
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
402
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
403
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
404
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
405
157k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
406
157k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
407
408
    // Transform 1: Rounds 41-44
409
157k
    TMP = vld1q_u32(&K[40]);
410
157k
    TMP0A = vaddq_u32(MSG2A, TMP);
411
157k
    TMP0B = vaddq_u32(MSG2B, TMP);
412
157k
    TMP2A = STATE0A;
413
157k
    TMP2B = STATE0B;
414
157k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
415
157k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
416
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
417
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
418
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
419
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
420
157k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
421
157k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
422
423
    // Transform 1: Rounds 45-48
424
157k
    TMP = vld1q_u32(&K[44]);
425
157k
    TMP0A = vaddq_u32(MSG3A, TMP);
426
157k
    TMP0B = vaddq_u32(MSG3B, TMP);
427
157k
    TMP2A = STATE0A;
428
157k
    TMP2B = STATE0B;
429
157k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
430
157k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
431
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
432
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
433
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
434
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
435
157k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
436
157k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
437
438
    // Transform 1: Rounds 49-52
439
157k
    TMP = vld1q_u32(&K[48]);
440
157k
    TMP0A = vaddq_u32(MSG0A, TMP);
441
157k
    TMP0B = vaddq_u32(MSG0B, TMP);
442
157k
    TMP2A = STATE0A;
443
157k
    TMP2B = STATE0B;
444
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
445
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
446
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
447
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
448
449
    // Transform 1: Rounds 53-56
450
157k
    TMP = vld1q_u32(&K[52]);
451
157k
    TMP0A = vaddq_u32(MSG1A, TMP);
452
157k
    TMP0B = vaddq_u32(MSG1B, TMP);
453
157k
    TMP2A = STATE0A;
454
157k
    TMP2B = STATE0B;
455
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
456
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
457
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
458
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
459
460
    // Transform 1: Rounds 57-60
461
157k
    TMP = vld1q_u32(&K[56]);
462
157k
    TMP0A = vaddq_u32(MSG2A, TMP);
463
157k
    TMP0B = vaddq_u32(MSG2B, TMP);
464
157k
    TMP2A = STATE0A;
465
157k
    TMP2B = STATE0B;
466
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
467
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
468
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
469
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
470
471
    // Transform 1: Rounds 61-64
472
157k
    TMP = vld1q_u32(&K[60]);
473
157k
    TMP0A = vaddq_u32(MSG3A, TMP);
474
157k
    TMP0B = vaddq_u32(MSG3B, TMP);
475
157k
    TMP2A = STATE0A;
476
157k
    TMP2B = STATE0B;
477
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
478
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
479
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
480
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
481
482
    // Transform 1: Update state
483
157k
    TMP = vld1q_u32(&INIT[0]);
484
157k
    STATE0A = vaddq_u32(STATE0A, TMP);
485
157k
    STATE0B = vaddq_u32(STATE0B, TMP);
486
157k
    TMP = vld1q_u32(&INIT[4]);
487
157k
    STATE1A = vaddq_u32(STATE1A, TMP);
488
157k
    STATE1B = vaddq_u32(STATE1B, TMP);
489
490
    // Transform 2: Save state
491
157k
    ABEF_SAVEA = STATE0A;
492
157k
    ABEF_SAVEB = STATE0B;
493
157k
    CDGH_SAVEA = STATE1A;
494
157k
    CDGH_SAVEB = STATE1B;
495
496
    // Transform 2: Rounds 1-4
497
157k
    TMP = vld1q_u32(&MIDS[0]);
498
157k
    TMP2A = STATE0A;
499
157k
    TMP2B = STATE0B;
500
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
501
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
502
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
503
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
504
505
    // Transform 2: Rounds 5-8
506
157k
    TMP = vld1q_u32(&MIDS[4]);
507
157k
    TMP2A = STATE0A;
508
157k
    TMP2B = STATE0B;
509
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
510
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
511
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
512
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
513
514
    // Transform 2: Rounds 9-12
515
157k
    TMP = vld1q_u32(&MIDS[8]);
516
157k
    TMP2A = STATE0A;
517
157k
    TMP2B = STATE0B;
518
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
519
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
520
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
521
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
522
523
    // Transform 2: Rounds 13-16
524
157k
    TMP = vld1q_u32(&MIDS[12]);
525
157k
    TMP2A = STATE0A;
526
157k
    TMP2B = STATE0B;
527
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
528
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
529
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
530
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
531
532
    // Transform 2: Rounds 17-20
533
157k
    TMP = vld1q_u32(&MIDS[16]);
534
157k
    TMP2A = STATE0A;
535
157k
    TMP2B = STATE0B;
536
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
537
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
538
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
539
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
540
541
    // Transform 2: Rounds 21-24
542
157k
    TMP = vld1q_u32(&MIDS[20]);
543
157k
    TMP2A = STATE0A;
544
157k
    TMP2B = STATE0B;
545
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
546
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
547
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
548
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
549
550
    // Transform 2: Rounds 25-28
551
157k
    TMP = vld1q_u32(&MIDS[24]);
552
157k
    TMP2A = STATE0A;
553
157k
    TMP2B = STATE0B;
554
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
555
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
556
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
557
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
558
559
    // Transform 2: Rounds 29-32
560
157k
    TMP = vld1q_u32(&MIDS[28]);
561
157k
    TMP2A = STATE0A;
562
157k
    TMP2B = STATE0B;
563
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
564
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
565
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
566
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
567
568
    // Transform 2: Rounds 33-36
569
157k
    TMP = vld1q_u32(&MIDS[32]);
570
157k
    TMP2A = STATE0A;
571
157k
    TMP2B = STATE0B;
572
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
573
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
574
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
575
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
576
577
    // Transform 2: Rounds 37-40
578
157k
    TMP = vld1q_u32(&MIDS[36]);
579
157k
    TMP2A = STATE0A;
580
157k
    TMP2B = STATE0B;
581
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
582
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
583
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
584
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
585
586
    // Transform 2: Rounds 41-44
587
157k
    TMP = vld1q_u32(&MIDS[40]);
588
157k
    TMP2A = STATE0A;
589
157k
    TMP2B = STATE0B;
590
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
591
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
592
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
593
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
594
595
    // Transform 2: Rounds 45-48
596
157k
    TMP = vld1q_u32(&MIDS[44]);
597
157k
    TMP2A = STATE0A;
598
157k
    TMP2B = STATE0B;
599
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
600
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
601
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
602
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
603
604
    // Transform 2: Rounds 49-52
605
157k
    TMP = vld1q_u32(&MIDS[48]);
606
157k
    TMP2A = STATE0A;
607
157k
    TMP2B = STATE0B;
608
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
609
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
610
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
611
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
612
613
    // Transform 2: Rounds 53-56
614
157k
    TMP = vld1q_u32(&MIDS[52]);
615
157k
    TMP2A = STATE0A;
616
157k
    TMP2B = STATE0B;
617
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
618
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
619
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
620
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
621
622
    // Transform 2: Rounds 57-60
623
157k
    TMP = vld1q_u32(&MIDS[56]);
624
157k
    TMP2A = STATE0A;
625
157k
    TMP2B = STATE0B;
626
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
627
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
628
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
629
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
630
631
    // Transform 2: Rounds 61-64
632
157k
    TMP = vld1q_u32(&MIDS[60]);
633
157k
    TMP2A = STATE0A;
634
157k
    TMP2B = STATE0B;
635
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
636
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
637
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
638
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
639
640
    // Transform 2: Update state
641
157k
    STATE0A = vaddq_u32(STATE0A, ABEF_SAVEA);
642
157k
    STATE0B = vaddq_u32(STATE0B, ABEF_SAVEB);
643
157k
    STATE1A = vaddq_u32(STATE1A, CDGH_SAVEA);
644
157k
    STATE1B = vaddq_u32(STATE1B, CDGH_SAVEB);
645
646
    // Transform 3: Pad previous output
647
157k
    MSG0A = STATE0A;
648
157k
    MSG0B = STATE0B;
649
157k
    MSG1A = STATE1A;
650
157k
    MSG1B = STATE1B;
651
157k
    MSG2A = vld1q_u32(&FINAL[0]);
652
157k
    MSG2B = MSG2A;
653
157k
    MSG3A = vld1q_u32(&FINAL[4]);
654
157k
    MSG3B = MSG3A;
655
656
    // Transform 3: Load state
657
157k
    STATE0A = vld1q_u32(&INIT[0]);
658
157k
    STATE0B = STATE0A;
659
157k
    STATE1A = vld1q_u32(&INIT[4]);
660
157k
    STATE1B = STATE1A;
661
662
    // Transform 3: Rounds 1-4
663
157k
    TMP = vld1q_u32(&K[0]);
664
157k
    TMP0A = vaddq_u32(MSG0A, TMP);
665
157k
    TMP0B = vaddq_u32(MSG0B, TMP);
666
157k
    TMP2A = STATE0A;
667
157k
    TMP2B = STATE0B;
668
157k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
669
157k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
670
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
671
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
672
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
673
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
674
157k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
675
157k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
676
677
    // Transform 3: Rounds 5-8
678
157k
    TMP = vld1q_u32(&K[4]);
679
157k
    TMP0A = vaddq_u32(MSG1A, TMP);
680
157k
    TMP0B = vaddq_u32(MSG1B, TMP);
681
157k
    TMP2A = STATE0A;
682
157k
    TMP2B = STATE0B;
683
157k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
684
157k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
685
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
686
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
687
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
688
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
689
157k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
690
157k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
691
692
    // Transform 3: Rounds 9-12
693
157k
    TMP = vld1q_u32(&FINS[0]);
694
157k
    TMP2A = STATE0A;
695
157k
    TMP2B = STATE0B;
696
157k
    MSG2A = vld1q_u32(&FINS[4]);
697
157k
    MSG2B = MSG2A;
698
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
699
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
700
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
701
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
702
157k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
703
157k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
704
705
    // Transform 3: Rounds 13-16
706
157k
    TMP = vld1q_u32(&FINS[8]);
707
157k
    TMP2A = STATE0A;
708
157k
    TMP2B = STATE0B;
709
157k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
710
157k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
711
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
712
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
713
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
714
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
715
157k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
716
157k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
717
718
    // Transform 3: Rounds 17-20
719
157k
    TMP = vld1q_u32(&K[16]);
720
157k
    TMP0A = vaddq_u32(MSG0A, TMP);
721
157k
    TMP0B = vaddq_u32(MSG0B, TMP);
722
157k
    TMP2A = STATE0A;
723
157k
    TMP2B = STATE0B;
724
157k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
725
157k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
726
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
727
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
728
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
729
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
730
157k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
731
157k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
732
733
    // Transform 3: Rounds 21-24
734
157k
    TMP = vld1q_u32(&K[20]);
735
157k
    TMP0A = vaddq_u32(MSG1A, TMP);
736
157k
    TMP0B = vaddq_u32(MSG1B, TMP);
737
157k
    TMP2A = STATE0A;
738
157k
    TMP2B = STATE0B;
739
157k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
740
157k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
741
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
742
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
743
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
744
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
745
157k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
746
157k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
747
748
    // Transform 3: Rounds 25-28
749
157k
    TMP = vld1q_u32(&K[24]);
750
157k
    TMP0A = vaddq_u32(MSG2A, TMP);
751
157k
    TMP0B = vaddq_u32(MSG2B, TMP);
752
157k
    TMP2A = STATE0A;
753
157k
    TMP2B = STATE0B;
754
157k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
755
157k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
756
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
757
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
758
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
759
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
760
157k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
761
157k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
762
763
    // Transform 3: Rounds 29-32
764
157k
    TMP = vld1q_u32(&K[28]);
765
157k
    TMP0A = vaddq_u32(MSG3A, TMP);
766
157k
    TMP0B = vaddq_u32(MSG3B, TMP);
767
157k
    TMP2A = STATE0A;
768
157k
    TMP2B = STATE0B;
769
157k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
770
157k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
771
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
772
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
773
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
774
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
775
157k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
776
157k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
777
778
    // Transform 3: Rounds 33-36
779
157k
    TMP = vld1q_u32(&K[32]);
780
157k
    TMP0A = vaddq_u32(MSG0A, TMP);
781
157k
    TMP0B = vaddq_u32(MSG0B, TMP);
782
157k
    TMP2A = STATE0A;
783
157k
    TMP2B = STATE0B;
784
157k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
785
157k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
786
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
787
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
788
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
789
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
790
157k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
791
157k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
792
793
    // Transform 3: Rounds 37-40
794
157k
    TMP = vld1q_u32(&K[36]);
795
157k
    TMP0A = vaddq_u32(MSG1A, TMP);
796
157k
    TMP0B = vaddq_u32(MSG1B, TMP);
797
157k
    TMP2A = STATE0A;
798
157k
    TMP2B = STATE0B;
799
157k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
800
157k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
801
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
802
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
803
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
804
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
805
157k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
806
157k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
807
808
    // Transform 3: Rounds 41-44
809
157k
    TMP = vld1q_u32(&K[40]);
810
157k
    TMP0A = vaddq_u32(MSG2A, TMP);
811
157k
    TMP0B = vaddq_u32(MSG2B, TMP);
812
157k
    TMP2A = STATE0A;
813
157k
    TMP2B = STATE0B;
814
157k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
815
157k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
816
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
817
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
818
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
819
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
820
157k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
821
157k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
822
823
    // Transform 3: Rounds 45-48
824
157k
    TMP = vld1q_u32(&K[44]);
825
157k
    TMP0A = vaddq_u32(MSG3A, TMP);
826
157k
    TMP0B = vaddq_u32(MSG3B, TMP);
827
157k
    TMP2A = STATE0A;
828
157k
    TMP2B = STATE0B;
829
157k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
830
157k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
831
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
832
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
833
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
834
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
835
157k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
836
157k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
837
838
    // Transform 3: Rounds 49-52
839
157k
    TMP = vld1q_u32(&K[48]);
840
157k
    TMP0A = vaddq_u32(MSG0A, TMP);
841
157k
    TMP0B = vaddq_u32(MSG0B, TMP);
842
157k
    TMP2A = STATE0A;
843
157k
    TMP2B = STATE0B;
844
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
845
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
846
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
847
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
848
849
    // Transform 3: Rounds 53-56
850
157k
    TMP = vld1q_u32(&K[52]);
851
157k
    TMP0A = vaddq_u32(MSG1A, TMP);
852
157k
    TMP0B = vaddq_u32(MSG1B, TMP);
853
157k
    TMP2A = STATE0A;
854
157k
    TMP2B = STATE0B;
855
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
856
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
857
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
858
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
859
860
    // Transform 3: Rounds 57-60
861
157k
    TMP = vld1q_u32(&K[56]);
862
157k
    TMP0A = vaddq_u32(MSG2A, TMP);
863
157k
    TMP0B = vaddq_u32(MSG2B, TMP);
864
157k
    TMP2A = STATE0A;
865
157k
    TMP2B = STATE0B;
866
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
867
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
868
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
869
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
870
871
    // Transform 3: Rounds 61-64
872
157k
    TMP = vld1q_u32(&K[60]);
873
157k
    TMP0A = vaddq_u32(MSG3A, TMP);
874
157k
    TMP0B = vaddq_u32(MSG3B, TMP);
875
157k
    TMP2A = STATE0A;
876
157k
    TMP2B = STATE0B;
877
157k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
878
157k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
879
157k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
880
157k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
881
882
    // Transform 3: Update state
883
157k
    TMP = vld1q_u32(&INIT[0]);
884
157k
    STATE0A = vaddq_u32(STATE0A, TMP);
885
157k
    STATE0B = vaddq_u32(STATE0B, TMP);
886
157k
    TMP = vld1q_u32(&INIT[4]);
887
157k
    STATE1A = vaddq_u32(STATE1A, TMP);
888
157k
    STATE1B = vaddq_u32(STATE1B, TMP);
889
890
    // Store result
891
157k
    vst1q_u8(output, vrev32q_u8(vreinterpretq_u8_u32(STATE0A)));
892
157k
    vst1q_u8(output + 16, vrev32q_u8(vreinterpretq_u8_u32(STATE1A)));
893
157k
    vst1q_u8(output + 32, vrev32q_u8(vreinterpretq_u8_u32(STATE0B)));
894
    vst1q_u8(output + 48, vrev32q_u8(vreinterpretq_u8_u32(STATE1B)));
895
157k
}
896
}
897
898
#endif