Crypto++
sosemanuk.cpp
1 // sosemanuk.cpp - written and placed in the public domain by Wei Dai
2 
3 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sosemanuk.cpp" to generate MASM code
4 
5 #include "pch.h"
6 
7 #ifndef CRYPTOPP_GENERATE_X64_MASM
8 
9 #include "sosemanuk.h"
10 #include "misc.h"
11 #include "cpu.h"
12 
13 #include "serpentp.h"
14 
15 NAMESPACE_BEGIN(CryptoPP)
16 
17 void SosemanukPolicy::CipherSetKey(const NameValuePairs &params, const byte *userKey, size_t keylen)
18 {
19  Serpent_KeySchedule(m_key, 24, userKey, keylen);
20 }
21 
22 void SosemanukPolicy::CipherResynchronize(byte *keystreamBuffer, const byte *iv, size_t length)
23 {
24  assert(length==16);
25 
26  word32 a, b, c, d, e;
27 
29  Block::Get(iv)(a)(b)(c)(d);
30 
31  const word32 *k = m_key;
32  unsigned int i=1;
33 
34  do
35  {
36  beforeS0(KX); beforeS0(S0); afterS0(LT);
37  afterS0(KX); afterS0(S1); afterS1(LT);
38  if (i == 3) // after 18th round
39  {
40  m_state[4] = b;
41  m_state[5] = e;
42  m_state[10] = c;
43  m_state[11] = a;
44  }
45  afterS1(KX); afterS1(S2); afterS2(LT);
46  afterS2(KX); afterS2(S3); afterS3(LT);
47  if (i == 2) // after 12th round
48  {
49  m_state[6] = c;
50  m_state[7] = d;
51  m_state[8] = b;
52  m_state[9] = e;
53  }
54  afterS3(KX); afterS3(S4); afterS4(LT);
55  afterS4(KX); afterS4(S5); afterS5(LT);
56  afterS5(KX); afterS5(S6); afterS6(LT);
57  afterS6(KX); afterS6(S7); afterS7(LT);
58 
59  if (i == 3)
60  break;
61 
62  ++i;
63  c = b;
64  b = e;
65  e = d;
66  d = a;
67  a = e;
68  k += 32;
69  }
70  while (true);
71 
72  afterS7(KX);
73 
74  m_state[0] = a;
75  m_state[1] = b;
76  m_state[2] = e;
77  m_state[3] = d;
78 
79 #define XMUX(c, x, y) (x ^ (y & (0 - (c & 1))))
80  m_state[11] += XMUX(m_state[10], m_state[1], m_state[8]);
81  m_state[10] = rotlFixed(m_state[10] * 0x54655307, 7);
82 }
83 
// Two 256-entry word32 lookup tables stored back to back (512 entries total):
//   [0..255]   - read by MUL_A(x) in OperateKeystream
//   [256..511] - read by DIV_A(x) as s_sosemanukMulTables[256 + byte(x)]
// The SSE2 assembly in OperateKeystream reads the same two halves at byte
// offsets 0 and 1024 from the table base.
// NOTE(review): going by the macro names, the halves presumably encode
// multiplication and division by the field element alpha of the SOSEMANUK
// specification - confirm against the specification.
// Declared extern "C", presumably so assembly code can refer to the symbol by
// its unmangled name.
84 extern "C" {
85 word32 s_sosemanukMulTables[512] = {
// First table, x86/x64 variant. On these targets MUL_A rotates x left by 8
// instead of shifting, which leaves the old top byte in the low byte of x.
// Each entry below therefore equals the corresponding generic entry XOR its
// own index (e.g. entry 1: 0xE19FCF12 == 0xE19FCF13 ^ 1).
86 #if CRYPTOPP_BOOL_X86 | CRYPTOPP_BOOL_X64
87  0x00000000, 0xE19FCF12, 0x6B973724, 0x8A08F836,
88  0xD6876E48, 0x3718A15A, 0xBD10596C, 0x5C8F967E,
89  0x05A7DC90, 0xE4381382, 0x6E30EBB4, 0x8FAF24A6,
90  0xD320B2D8, 0x32BF7DCA, 0xB8B785FC, 0x59284AEE,
91  0x0AE71189, 0xEB78DE9B, 0x617026AD, 0x80EFE9BF,
92  0xDC607FC1, 0x3DFFB0D3, 0xB7F748E5, 0x566887F7,
93  0x0F40CD19, 0xEEDF020B, 0x64D7FA3D, 0x8548352F,
94  0xD9C7A351, 0x38586C43, 0xB2509475, 0x53CF5B67,
95  0x146722BB, 0xF5F8EDA9, 0x7FF0159F, 0x9E6FDA8D,
96  0xC2E04CF3, 0x237F83E1, 0xA9777BD7, 0x48E8B4C5,
97  0x11C0FE2B, 0xF05F3139, 0x7A57C90F, 0x9BC8061D,
98  0xC7479063, 0x26D85F71, 0xACD0A747, 0x4D4F6855,
99  0x1E803332, 0xFF1FFC20, 0x75170416, 0x9488CB04,
100  0xC8075D7A, 0x29989268, 0xA3906A5E, 0x420FA54C,
101  0x1B27EFA2, 0xFAB820B0, 0x70B0D886, 0x912F1794,
102  0xCDA081EA, 0x2C3F4EF8, 0xA637B6CE, 0x47A879DC,
103  0x28CE44DF, 0xC9518BCD, 0x435973FB, 0xA2C6BCE9,
104  0xFE492A97, 0x1FD6E585, 0x95DE1DB3, 0x7441D2A1,
105  0x2D69984F, 0xCCF6575D, 0x46FEAF6B, 0xA7616079,
106  0xFBEEF607, 0x1A713915, 0x9079C123, 0x71E60E31,
107  0x22295556, 0xC3B69A44, 0x49BE6272, 0xA821AD60,
108  0xF4AE3B1E, 0x1531F40C, 0x9F390C3A, 0x7EA6C328,
109  0x278E89C6, 0xC61146D4, 0x4C19BEE2, 0xAD8671F0,
110  0xF109E78E, 0x1096289C, 0x9A9ED0AA, 0x7B011FB8,
111  0x3CA96664, 0xDD36A976, 0x573E5140, 0xB6A19E52,
112  0xEA2E082C, 0x0BB1C73E, 0x81B93F08, 0x6026F01A,
113  0x390EBAF4, 0xD89175E6, 0x52998DD0, 0xB30642C2,
114  0xEF89D4BC, 0x0E161BAE, 0x841EE398, 0x65812C8A,
115  0x364E77ED, 0xD7D1B8FF, 0x5DD940C9, 0xBC468FDB,
116  0xE0C919A5, 0x0156D6B7, 0x8B5E2E81, 0x6AC1E193,
117  0x33E9AB7D, 0xD276646F, 0x587E9C59, 0xB9E1534B,
118  0xE56EC535, 0x04F10A27, 0x8EF9F211, 0x6F663D03,
119  0x50358817, 0xB1AA4705, 0x3BA2BF33, 0xDA3D7021,
120  0x86B2E65F, 0x672D294D, 0xED25D17B, 0x0CBA1E69,
121  0x55925487, 0xB40D9B95, 0x3E0563A3, 0xDF9AACB1,
122  0x83153ACF, 0x628AF5DD, 0xE8820DEB, 0x091DC2F9,
123  0x5AD2999E, 0xBB4D568C, 0x3145AEBA, 0xD0DA61A8,
124  0x8C55F7D6, 0x6DCA38C4, 0xE7C2C0F2, 0x065D0FE0,
125  0x5F75450E, 0xBEEA8A1C, 0x34E2722A, 0xD57DBD38,
126  0x89F22B46, 0x686DE454, 0xE2651C62, 0x03FAD370,
127  0x4452AAAC, 0xA5CD65BE, 0x2FC59D88, 0xCE5A529A,
128  0x92D5C4E4, 0x734A0BF6, 0xF942F3C0, 0x18DD3CD2,
129  0x41F5763C, 0xA06AB92E, 0x2A624118, 0xCBFD8E0A,
130  0x97721874, 0x76EDD766, 0xFCE52F50, 0x1D7AE042,
131  0x4EB5BB25, 0xAF2A7437, 0x25228C01, 0xC4BD4313,
132  0x9832D56D, 0x79AD1A7F, 0xF3A5E249, 0x123A2D5B,
133  0x4B1267B5, 0xAA8DA8A7, 0x20855091, 0xC11A9F83,
134  0x9D9509FD, 0x7C0AC6EF, 0xF6023ED9, 0x179DF1CB,
135  0x78FBCCC8, 0x996403DA, 0x136CFBEC, 0xF2F334FE,
136  0xAE7CA280, 0x4FE36D92, 0xC5EB95A4, 0x24745AB6,
137  0x7D5C1058, 0x9CC3DF4A, 0x16CB277C, 0xF754E86E,
138  0xABDB7E10, 0x4A44B102, 0xC04C4934, 0x21D38626,
139  0x721CDD41, 0x93831253, 0x198BEA65, 0xF8142577,
140  0xA49BB309, 0x45047C1B, 0xCF0C842D, 0x2E934B3F,
141  0x77BB01D1, 0x9624CEC3, 0x1C2C36F5, 0xFDB3F9E7,
142  0xA13C6F99, 0x40A3A08B, 0xCAAB58BD, 0x2B3497AF,
143  0x6C9CEE73, 0x8D032161, 0x070BD957, 0xE6941645,
144  0xBA1B803B, 0x5B844F29, 0xD18CB71F, 0x3013780D,
145  0x693B32E3, 0x88A4FDF1, 0x02AC05C7, 0xE333CAD5,
146  0xBFBC5CAB, 0x5E2393B9, 0xD42B6B8F, 0x35B4A49D,
147  0x667BFFFA, 0x87E430E8, 0x0DECC8DE, 0xEC7307CC,
148  0xB0FC91B2, 0x51635EA0, 0xDB6BA696, 0x3AF46984,
149  0x63DC236A, 0x8243EC78, 0x084B144E, 0xE9D4DB5C,
150  0xB55B4D22, 0x54C48230, 0xDECC7A06, 0x3F53B514,
// First table, generic variant: indexed by the top byte, (x) >> 24, in MUL_A.
151 #else
152  0x00000000, 0xE19FCF13, 0x6B973726, 0x8A08F835,
153  0xD6876E4C, 0x3718A15F, 0xBD10596A, 0x5C8F9679,
154  0x05A7DC98, 0xE438138B, 0x6E30EBBE, 0x8FAF24AD,
155  0xD320B2D4, 0x32BF7DC7, 0xB8B785F2, 0x59284AE1,
156  0x0AE71199, 0xEB78DE8A, 0x617026BF, 0x80EFE9AC,
157  0xDC607FD5, 0x3DFFB0C6, 0xB7F748F3, 0x566887E0,
158  0x0F40CD01, 0xEEDF0212, 0x64D7FA27, 0x85483534,
159  0xD9C7A34D, 0x38586C5E, 0xB250946B, 0x53CF5B78,
160  0x1467229B, 0xF5F8ED88, 0x7FF015BD, 0x9E6FDAAE,
161  0xC2E04CD7, 0x237F83C4, 0xA9777BF1, 0x48E8B4E2,
162  0x11C0FE03, 0xF05F3110, 0x7A57C925, 0x9BC80636,
163  0xC747904F, 0x26D85F5C, 0xACD0A769, 0x4D4F687A,
164  0x1E803302, 0xFF1FFC11, 0x75170424, 0x9488CB37,
165  0xC8075D4E, 0x2998925D, 0xA3906A68, 0x420FA57B,
166  0x1B27EF9A, 0xFAB82089, 0x70B0D8BC, 0x912F17AF,
167  0xCDA081D6, 0x2C3F4EC5, 0xA637B6F0, 0x47A879E3,
168  0x28CE449F, 0xC9518B8C, 0x435973B9, 0xA2C6BCAA,
169  0xFE492AD3, 0x1FD6E5C0, 0x95DE1DF5, 0x7441D2E6,
170  0x2D699807, 0xCCF65714, 0x46FEAF21, 0xA7616032,
171  0xFBEEF64B, 0x1A713958, 0x9079C16D, 0x71E60E7E,
172  0x22295506, 0xC3B69A15, 0x49BE6220, 0xA821AD33,
173  0xF4AE3B4A, 0x1531F459, 0x9F390C6C, 0x7EA6C37F,
174  0x278E899E, 0xC611468D, 0x4C19BEB8, 0xAD8671AB,
175  0xF109E7D2, 0x109628C1, 0x9A9ED0F4, 0x7B011FE7,
176  0x3CA96604, 0xDD36A917, 0x573E5122, 0xB6A19E31,
177  0xEA2E0848, 0x0BB1C75B, 0x81B93F6E, 0x6026F07D,
178  0x390EBA9C, 0xD891758F, 0x52998DBA, 0xB30642A9,
179  0xEF89D4D0, 0x0E161BC3, 0x841EE3F6, 0x65812CE5,
180  0x364E779D, 0xD7D1B88E, 0x5DD940BB, 0xBC468FA8,
181  0xE0C919D1, 0x0156D6C2, 0x8B5E2EF7, 0x6AC1E1E4,
182  0x33E9AB05, 0xD2766416, 0x587E9C23, 0xB9E15330,
183  0xE56EC549, 0x04F10A5A, 0x8EF9F26F, 0x6F663D7C,
184  0x50358897, 0xB1AA4784, 0x3BA2BFB1, 0xDA3D70A2,
185  0x86B2E6DB, 0x672D29C8, 0xED25D1FD, 0x0CBA1EEE,
186  0x5592540F, 0xB40D9B1C, 0x3E056329, 0xDF9AAC3A,
187  0x83153A43, 0x628AF550, 0xE8820D65, 0x091DC276,
188  0x5AD2990E, 0xBB4D561D, 0x3145AE28, 0xD0DA613B,
189  0x8C55F742, 0x6DCA3851, 0xE7C2C064, 0x065D0F77,
190  0x5F754596, 0xBEEA8A85, 0x34E272B0, 0xD57DBDA3,
191  0x89F22BDA, 0x686DE4C9, 0xE2651CFC, 0x03FAD3EF,
192  0x4452AA0C, 0xA5CD651F, 0x2FC59D2A, 0xCE5A5239,
193  0x92D5C440, 0x734A0B53, 0xF942F366, 0x18DD3C75,
194  0x41F57694, 0xA06AB987, 0x2A6241B2, 0xCBFD8EA1,
195  0x977218D8, 0x76EDD7CB, 0xFCE52FFE, 0x1D7AE0ED,
196  0x4EB5BB95, 0xAF2A7486, 0x25228CB3, 0xC4BD43A0,
197  0x9832D5D9, 0x79AD1ACA, 0xF3A5E2FF, 0x123A2DEC,
198  0x4B12670D, 0xAA8DA81E, 0x2085502B, 0xC11A9F38,
199  0x9D950941, 0x7C0AC652, 0xF6023E67, 0x179DF174,
200  0x78FBCC08, 0x9964031B, 0x136CFB2E, 0xF2F3343D,
201  0xAE7CA244, 0x4FE36D57, 0xC5EB9562, 0x24745A71,
202  0x7D5C1090, 0x9CC3DF83, 0x16CB27B6, 0xF754E8A5,
203  0xABDB7EDC, 0x4A44B1CF, 0xC04C49FA, 0x21D386E9,
204  0x721CDD91, 0x93831282, 0x198BEAB7, 0xF81425A4,
205  0xA49BB3DD, 0x45047CCE, 0xCF0C84FB, 0x2E934BE8,
206  0x77BB0109, 0x9624CE1A, 0x1C2C362F, 0xFDB3F93C,
207  0xA13C6F45, 0x40A3A056, 0xCAAB5863, 0x2B349770,
208  0x6C9CEE93, 0x8D032180, 0x070BD9B5, 0xE69416A6,
209  0xBA1B80DF, 0x5B844FCC, 0xD18CB7F9, 0x301378EA,
210  0x693B320B, 0x88A4FD18, 0x02AC052D, 0xE333CA3E,
211  0xBFBC5C47, 0x5E239354, 0xD42B6B61, 0x35B4A472,
212  0x667BFF0A, 0x87E43019, 0x0DECC82C, 0xEC73073F,
213  0xB0FC9146, 0x51635E55, 0xDB6BA660, 0x3AF46973,
214  0x63DC2392, 0x8243EC81, 0x084B14B4, 0xE9D4DBA7,
215  0xB55B4DDE, 0x54C482CD, 0xDECC7AF8, 0x3F53B5EB,
216 #endif
// Second table (entries 256..511), shared by all targets: indexed by the low
// byte of x in DIV_A.
217  0x00000000, 0x180F40CD, 0x301E8033, 0x2811C0FE,
218  0x603CA966, 0x7833E9AB, 0x50222955, 0x482D6998,
219  0xC078FBCC, 0xD877BB01, 0xF0667BFF, 0xE8693B32,
220  0xA04452AA, 0xB84B1267, 0x905AD299, 0x88559254,
221  0x29F05F31, 0x31FF1FFC, 0x19EEDF02, 0x01E19FCF,
222  0x49CCF657, 0x51C3B69A, 0x79D27664, 0x61DD36A9,
223  0xE988A4FD, 0xF187E430, 0xD99624CE, 0xC1996403,
224  0x89B40D9B, 0x91BB4D56, 0xB9AA8DA8, 0xA1A5CD65,
225  0x5249BE62, 0x4A46FEAF, 0x62573E51, 0x7A587E9C,
226  0x32751704, 0x2A7A57C9, 0x026B9737, 0x1A64D7FA,
227  0x923145AE, 0x8A3E0563, 0xA22FC59D, 0xBA208550,
228  0xF20DECC8, 0xEA02AC05, 0xC2136CFB, 0xDA1C2C36,
229  0x7BB9E153, 0x63B6A19E, 0x4BA76160, 0x53A821AD,
230  0x1B854835, 0x038A08F8, 0x2B9BC806, 0x339488CB,
231  0xBBC11A9F, 0xA3CE5A52, 0x8BDF9AAC, 0x93D0DA61,
232  0xDBFDB3F9, 0xC3F2F334, 0xEBE333CA, 0xF3EC7307,
233  0xA492D5C4, 0xBC9D9509, 0x948C55F7, 0x8C83153A,
234  0xC4AE7CA2, 0xDCA13C6F, 0xF4B0FC91, 0xECBFBC5C,
235  0x64EA2E08, 0x7CE56EC5, 0x54F4AE3B, 0x4CFBEEF6,
236  0x04D6876E, 0x1CD9C7A3, 0x34C8075D, 0x2CC74790,
237  0x8D628AF5, 0x956DCA38, 0xBD7C0AC6, 0xA5734A0B,
238  0xED5E2393, 0xF551635E, 0xDD40A3A0, 0xC54FE36D,
239  0x4D1A7139, 0x551531F4, 0x7D04F10A, 0x650BB1C7,
240  0x2D26D85F, 0x35299892, 0x1D38586C, 0x053718A1,
241  0xF6DB6BA6, 0xEED42B6B, 0xC6C5EB95, 0xDECAAB58,
242  0x96E7C2C0, 0x8EE8820D, 0xA6F942F3, 0xBEF6023E,
243  0x36A3906A, 0x2EACD0A7, 0x06BD1059, 0x1EB25094,
244  0x569F390C, 0x4E9079C1, 0x6681B93F, 0x7E8EF9F2,
245  0xDF2B3497, 0xC724745A, 0xEF35B4A4, 0xF73AF469,
246  0xBF179DF1, 0xA718DD3C, 0x8F091DC2, 0x97065D0F,
247  0x1F53CF5B, 0x075C8F96, 0x2F4D4F68, 0x37420FA5,
248  0x7F6F663D, 0x676026F0, 0x4F71E60E, 0x577EA6C3,
249  0xE18D0321, 0xF98243EC, 0xD1938312, 0xC99CC3DF,
250  0x81B1AA47, 0x99BEEA8A, 0xB1AF2A74, 0xA9A06AB9,
251  0x21F5F8ED, 0x39FAB820, 0x11EB78DE, 0x09E43813,
252  0x41C9518B, 0x59C61146, 0x71D7D1B8, 0x69D89175,
253  0xC87D5C10, 0xD0721CDD, 0xF863DC23, 0xE06C9CEE,
254  0xA841F576, 0xB04EB5BB, 0x985F7545, 0x80503588,
255  0x0805A7DC, 0x100AE711, 0x381B27EF, 0x20146722,
256  0x68390EBA, 0x70364E77, 0x58278E89, 0x4028CE44,
257  0xB3C4BD43, 0xABCBFD8E, 0x83DA3D70, 0x9BD57DBD,
258  0xD3F81425, 0xCBF754E8, 0xE3E69416, 0xFBE9D4DB,
259  0x73BC468F, 0x6BB30642, 0x43A2C6BC, 0x5BAD8671,
260  0x1380EFE9, 0x0B8FAF24, 0x239E6FDA, 0x3B912F17,
261  0x9A34E272, 0x823BA2BF, 0xAA2A6241, 0xB225228C,
262  0xFA084B14, 0xE2070BD9, 0xCA16CB27, 0xD2198BEA,
263  0x5A4C19BE, 0x42435973, 0x6A52998D, 0x725DD940,
264  0x3A70B0D8, 0x227FF015, 0x0A6E30EB, 0x12617026,
265  0x451FD6E5, 0x5D109628, 0x750156D6, 0x6D0E161B,
266  0x25237F83, 0x3D2C3F4E, 0x153DFFB0, 0x0D32BF7D,
267  0x85672D29, 0x9D686DE4, 0xB579AD1A, 0xAD76EDD7,
268  0xE55B844F, 0xFD54C482, 0xD545047C, 0xCD4A44B1,
269  0x6CEF89D4, 0x74E0C919, 0x5CF109E7, 0x44FE492A,
270  0x0CD320B2, 0x14DC607F, 0x3CCDA081, 0x24C2E04C,
271  0xAC977218, 0xB49832D5, 0x9C89F22B, 0x8486B2E6,
272  0xCCABDB7E, 0xD4A49BB3, 0xFCB55B4D, 0xE4BA1B80,
273  0x17566887, 0x0F59284A, 0x2748E8B4, 0x3F47A879,
274  0x776AC1E1, 0x6F65812C, 0x477441D2, 0x5F7B011F,
275  0xD72E934B, 0xCF21D386, 0xE7301378, 0xFF3F53B5,
276  0xB7123A2D, 0xAF1D7AE0, 0x870CBA1E, 0x9F03FAD3,
277  0x3EA637B6, 0x26A9777B, 0x0EB8B785, 0x16B7F748,
278  0x5E9A9ED0, 0x4695DE1D, 0x6E841EE3, 0x768B5E2E,
279  0xFEDECC7A, 0xE6D18CB7, 0xCEC04C49, 0xD6CF0C84,
280  0x9EE2651C, 0x86ED25D1, 0xAEFCE52F, 0xB6F3A5E2
281 };
282 }
283 
284 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
285 unsigned int SosemanukPolicy::GetAlignment() const
286 {
287 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
288 #ifdef __INTEL_COMPILER
289  if (HasSSE2() && !IsP4()) // Intel compiler produces faster code for this algorithm on the P4
290 #else
291  if (HasSSE2())
292 #endif
293  return 16;
294  else
295 #endif
296  return GetAlignmentOf<word32>();
297 }
298 
299 unsigned int SosemanukPolicy::GetOptimalBlockSize() const
300 {
301 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
302 #ifdef __INTEL_COMPILER
303  if (HasSSE2() && !IsP4()) // Intel compiler produces faster code for this algorithm on the P4
304 #else
305  if (HasSSE2())
306 #endif
307  return 4*BYTES_PER_ITERATION;
308  else
309 #endif
310  return BYTES_PER_ITERATION;
311 }
312 #endif
313 
// Prototype of the stand-alone x64 MASM routine. Its body is the assembly
// section of OperateKeystream below, emitted as a MASM procedure when this
// file is preprocessed with CRYPTOPP_GENERATE_X64_MASM defined (see the
// command line in the comment at the top of the file).
// When CRYPTOPP_X64_MASM_AVAILABLE is defined, OperateKeystream simply
// forwards to this routine, passing m_state.data() as `state`.
314 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
315 extern "C" {
316 void Sosemanuk_OperateKeystream(size_t iterationCount, const byte *input, byte *output, word32 *state);
317 }
318 #endif
319 
320 #pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
321 
322 void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
323 {
324 #endif // #ifdef CRYPTOPP_GENERATE_X64_MASM
325 
326 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
327  Sosemanuk_OperateKeystream(iterationCount, input, output, m_state.data());
328  return;
329 #endif
330 
331 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
332 #ifdef CRYPTOPP_GENERATE_X64_MASM
333  ALIGN 8
334  Sosemanuk_OperateKeystream PROC FRAME
335  rex_push_reg rsi
336  push_reg rdi
337  alloc_stack(80*4*2+12*4+8*WORD_SZ + 2*16+8)
338  save_xmm128 xmm6, 02f0h
339  save_xmm128 xmm7, 0300h
340  .endprolog
341  mov rdi, r8
342  mov rax, r9
343 #else
344 #ifdef __INTEL_COMPILER
345  if (HasSSE2() && !IsP4()) // Intel compiler produces faster code for this algorithm on the P4
346 #else
347  if (HasSSE2())
348 #endif
349  {
350 #ifdef __GNUC__
351  #if CRYPTOPP_BOOL_X64
353  #endif
354  __asm__ __volatile__
355  (
356  ".intel_syntax noprefix;"
357  AS_PUSH_IF86( bx)
358 #else
359  word32 *state = m_state;
360  AS2( mov WORD_REG(ax), state)
361  AS2( mov WORD_REG(di), output)
362  AS2( mov WORD_REG(dx), input)
363  AS2( mov WORD_REG(cx), iterationCount)
364 #endif
365 #endif // #ifdef CRYPTOPP_GENERATE_X64_MASM
366 
367 #if defined(__GNUC__) && CRYPTOPP_BOOL_X64
368  #define SSE2_workspace %5
369 #else
370  #define SSE2_workspace WORD_REG(sp)
371 #endif
372 
373 #define SSE2_output WORD_PTR [SSE2_workspace+1*WORD_SZ]
374 #define SSE2_input WORD_PTR [SSE2_workspace+2*WORD_SZ]
375 #define SSE2_wordsLeft WORD_PTR [SSE2_workspace+3*WORD_SZ]
376 #define SSE2_diEnd WORD_PTR [SSE2_workspace+4*WORD_SZ]
377 #define SSE2_pMulTables WORD_PTR [SSE2_workspace+5*WORD_SZ]
378 #define SSE2_state WORD_PTR [SSE2_workspace+6*WORD_SZ]
379 #define SSE2_wordsLeft2 WORD_PTR [SSE2_workspace+7*WORD_SZ]
380 #define SSE2_stateCopy SSE2_workspace + 8*WORD_SZ
381 #define SSE2_uvStart SSE2_stateCopy + 12*4
382 
383 #if CRYPTOPP_BOOL_X86
384  AS_PUSH_IF86( bp)
385  AS2( mov AS_REG_6, esp)
386  AS2( and esp, -16)
387  AS2( sub esp, 80*4*2+12*4+8*WORD_SZ) // 80 v's, 80 u's, 12 state, 8 locals
388  AS2( mov [esp], AS_REG_6)
389 #endif
390  AS2( mov SSE2_output, WORD_REG(di))
391  AS2( mov SSE2_input, WORD_REG(dx))
392  AS2( mov SSE2_state, WORD_REG(ax))
393 #ifndef _MSC_VER
394  AS2( mov SSE2_pMulTables, WORD_REG(si))
395 #endif
396  AS2( lea WORD_REG(cx), [4*WORD_REG(cx)+WORD_REG(cx)])
397  AS2( lea WORD_REG(si), [4*WORD_REG(cx)])
398  AS2( mov SSE2_wordsLeft, WORD_REG(si))
399  AS2( movdqa xmm0, [WORD_REG(ax)+0*16]) // copy state to stack to save a register
400  AS2( movdqa [SSE2_stateCopy+0*16], xmm0)
401  AS2( movdqa xmm0, [WORD_REG(ax)+1*16])
402  AS2( movdqa [SSE2_stateCopy+1*16], xmm0)
403  AS2( movq xmm0, QWORD PTR [WORD_REG(ax)+2*16])
404  AS2( movq QWORD PTR [SSE2_stateCopy+2*16], xmm0)
405  AS2( psrlq xmm0, 32)
406  AS2( movd AS_REG_6d, xmm0) // s(9)
407  AS2( mov ecx, [WORD_REG(ax)+10*4])
408  AS2( mov edx, [WORD_REG(ax)+11*4])
409  AS2( pcmpeqb xmm7, xmm7) // all ones
410 
411 #define s(i) SSE2_stateCopy + ASM_MOD(i,10)*4
412 #define u(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4
413 #define v(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4 + 80*4
414 
415 #define R10 ecx
416 #define R11 edx
417 #define R20 edx
418 #define R21 ecx
419 // workaround bug in GAS 2.15
420 #define R20r WORD_REG(dx)
421 #define R21r WORD_REG(cx)
422 
423 #define SSE2_STEP(i, j) \
424  AS2( mov eax, [s(i+0)])\
425  AS2( mov [v(i)], eax)\
426  AS2( rol eax, 8)\
427  AS2( lea AS_REG_7, [AS_REG_6 + R2##j##r])\
428  AS2( xor AS_REG_7d, R1##j)\
429  AS2( mov [u(i)], AS_REG_7d)\
430  AS2( mov AS_REG_7d, 1)\
431  AS2( and AS_REG_7d, R2##j)\
432  AS1( neg AS_REG_7d)\
433  AS2( and AS_REG_7d, AS_REG_6d)\
434  AS2( xor AS_REG_6d, eax)\
435  AS2( movzx eax, al)\
436  AS2( xor AS_REG_6d, [WORD_REG(si)+WORD_REG(ax)*4])\
437  AS2( mov eax, [s(i+3)])\
438  AS2( xor AS_REG_7d, [s(i+2)])\
439  AS2( add R1##j, AS_REG_7d)\
440  AS2( movzx AS_REG_7d, al)\
441  AS2( shr eax, 8)\
442  AS2( xor AS_REG_6d, [WORD_REG(si)+1024+AS_REG_7*4])\
443  AS2( xor AS_REG_6d, eax)\
444  AS2( imul R2##j, AS_HEX(54655307))\
445  AS2( rol R2##j, 7)\
446  AS2( mov [s(i+0)], AS_REG_6d)\
447 
448  ASL(2) // outer loop, each iteration of this processes 80 words
449  AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u
450  AS2( mov WORD_REG(ax), 80)
451  AS2( cmp WORD_REG(si), 80)
452  AS2( cmovg WORD_REG(si), WORD_REG(ax))
453  AS2( mov SSE2_wordsLeft2, WORD_REG(si))
454  AS2( lea WORD_REG(si), [WORD_REG(di)+WORD_REG(si)]) // use to end first inner loop
455  AS2( mov SSE2_diEnd, WORD_REG(si))
456 #ifdef _MSC_VER
457  AS2( lea WORD_REG(si), s_sosemanukMulTables)
458 #else
459  AS2( mov WORD_REG(si), SSE2_pMulTables)
460 #endif
461 
462  ASL(0) // first inner loop, 20 words each, 4 iterations
463  SSE2_STEP(0, 0)
464  SSE2_STEP(1, 1)
465  SSE2_STEP(2, 0)
466  SSE2_STEP(3, 1)
467  SSE2_STEP(4, 0)
468  SSE2_STEP(5, 1)
469  SSE2_STEP(6, 0)
470  SSE2_STEP(7, 1)
471  SSE2_STEP(8, 0)
472  SSE2_STEP(9, 1)
473  SSE2_STEP(10, 0)
474  SSE2_STEP(11, 1)
475  SSE2_STEP(12, 0)
476  SSE2_STEP(13, 1)
477  SSE2_STEP(14, 0)
478  SSE2_STEP(15, 1)
479  SSE2_STEP(16, 0)
480  SSE2_STEP(17, 1)
481  SSE2_STEP(18, 0)
482  SSE2_STEP(19, 1)
483  // loop
484  AS2( add WORD_REG(di), 5*4)
485  AS2( cmp WORD_REG(di), SSE2_diEnd)
486  ASJ( jne, 0, b)
487 
488  AS2( mov WORD_REG(ax), SSE2_input)
489  AS2( mov AS_REG_7, SSE2_output)
490  AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u
491  AS2( mov WORD_REG(si), SSE2_wordsLeft2)
492 
493  ASL(1) // second inner loop, 16 words each, 5 iterations
494  AS2( movdqa xmm0, [WORD_REG(di)+0*20*4])
495  AS2( movdqa xmm2, [WORD_REG(di)+2*20*4])
496  AS2( movdqa xmm3, [WORD_REG(di)+3*20*4])
497  AS2( movdqa xmm1, [WORD_REG(di)+1*20*4])
498  // S2
499  AS2( movdqa xmm4, xmm0)
500  AS2( pand xmm0, xmm2)
501  AS2( pxor xmm0, xmm3)
502  AS2( pxor xmm2, xmm1)
503  AS2( pxor xmm2, xmm0)
504  AS2( por xmm3, xmm4)
505  AS2( pxor xmm3, xmm1)
506  AS2( pxor xmm4, xmm2)
507  AS2( movdqa xmm1, xmm3)
508  AS2( por xmm3, xmm4)
509  AS2( pxor xmm3, xmm0)
510  AS2( pand xmm0, xmm1)
511  AS2( pxor xmm4, xmm0)
512  AS2( pxor xmm1, xmm3)
513  AS2( pxor xmm1, xmm4)
514  AS2( pxor xmm4, xmm7)
515  // xor with v
516  AS2( pxor xmm2, [WORD_REG(di)+80*4])
517  AS2( pxor xmm3, [WORD_REG(di)+80*5])
518  AS2( pxor xmm1, [WORD_REG(di)+80*6])
519  AS2( pxor xmm4, [WORD_REG(di)+80*7])
520  // exit loop early if less than 16 words left to output
521  // this is necessary because block size is 20 words, and we output 16 words in each iteration of this loop
522  AS2( cmp WORD_REG(si), 16)
523  ASJ( jl, 4, f)
524  // unpack
525  AS2( movdqa xmm6, xmm2)
526  AS2( punpckldq xmm2, xmm3)
527  AS2( movdqa xmm5, xmm1)
528  AS2( punpckldq xmm1, xmm4)
529  AS2( movdqa xmm0, xmm2)
530  AS2( punpcklqdq xmm2, xmm1)
531  AS2( punpckhqdq xmm0, xmm1)
532  AS2( punpckhdq xmm6, xmm3)
533  AS2( punpckhdq xmm5, xmm4)
534  AS2( movdqa xmm3, xmm6)
535  AS2( punpcklqdq xmm6, xmm5)
536  AS2( punpckhqdq xmm3, xmm5)
537  // output keystream
538  AS_XMM_OUTPUT4(SSE2_Sosemanuk_Output, WORD_REG(ax), AS_REG_7, 2,0,6,3, 1, 0,1,2,3, 4)
539 
540  // loop
541  AS2( add WORD_REG(di), 4*4)
542  AS2( sub WORD_REG(si), 16)
543  ASJ( jnz, 1, b)
544 
545  // outer loop
546  AS2( mov WORD_REG(si), SSE2_wordsLeft)
547  AS2( sub WORD_REG(si), 80)
548  ASJ( jz, 6, f)
549  AS2( mov SSE2_wordsLeft, WORD_REG(si))
550  AS2( mov SSE2_input, WORD_REG(ax))
551  AS2( mov SSE2_output, AS_REG_7)
552  ASJ( jmp, 2, b)
553 
554  ASL(4) // final output of less than 16 words
555  AS2( test WORD_REG(ax), WORD_REG(ax))
556  ASJ( jz, 5, f)
557  AS2( movd xmm0, dword ptr [WORD_REG(ax)+0*4])
558  AS2( pxor xmm2, xmm0)
559  AS2( movd xmm0, dword ptr [WORD_REG(ax)+1*4])
560  AS2( pxor xmm3, xmm0)
561  AS2( movd xmm0, dword ptr [WORD_REG(ax)+2*4])
562  AS2( pxor xmm1, xmm0)
563  AS2( movd xmm0, dword ptr [WORD_REG(ax)+3*4])
564  AS2( pxor xmm4, xmm0)
565  AS2( add WORD_REG(ax), 16)
566  ASL(5)
567  AS2( movd dword ptr [AS_REG_7+0*4], xmm2)
568  AS2( movd dword ptr [AS_REG_7+1*4], xmm3)
569  AS2( movd dword ptr [AS_REG_7+2*4], xmm1)
570  AS2( movd dword ptr [AS_REG_7+3*4], xmm4)
571  AS2( sub WORD_REG(si), 4)
572  ASJ( jz, 6, f)
573  AS2( add AS_REG_7, 16)
574  AS2( psrldq xmm2, 4)
575  AS2( psrldq xmm3, 4)
576  AS2( psrldq xmm1, 4)
577  AS2( psrldq xmm4, 4)
578  ASJ( jmp, 4, b)
579 
580  ASL(6) // save state
581  AS2( mov AS_REG_6, SSE2_state)
582  AS2( movdqa xmm0, [SSE2_stateCopy+0*16])
583  AS2( movdqa [AS_REG_6+0*16], xmm0)
584  AS2( movdqa xmm0, [SSE2_stateCopy+1*16])
585  AS2( movdqa [AS_REG_6+1*16], xmm0)
586  AS2( movq xmm0, QWORD PTR [SSE2_stateCopy+2*16])
587  AS2( movq QWORD PTR [AS_REG_6+2*16], xmm0)
588  AS2( mov [AS_REG_6+10*4], ecx)
589  AS2( mov [AS_REG_6+11*4], edx)
590 
591  AS_POP_IF86( sp)
592  AS_POP_IF86( bp)
593 
594 #ifdef __GNUC__
595  AS_POP_IF86( bx)
596  ".att_syntax prefix;"
597  :
598  : "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_sosemanukMulTables), "D" (output), "d" (input)
599  #if CRYPTOPP_BOOL_X64
600  , "r" (workspace.m_ptr)
601  : "memory", "cc", "%r9", "%r10", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
602  #else
603  : "memory", "cc"
604  #endif
605  );
606 #endif
607 #ifdef CRYPTOPP_GENERATE_X64_MASM
608  movdqa xmm6, [rsp + 02f0h]
609  movdqa xmm7, [rsp + 0300h]
610  add rsp, 80*4*2+12*4+8*WORD_SZ + 2*16+8
611  pop rdi
612  pop rsi
613  ret
614  Sosemanuk_OperateKeystream ENDP
615 #else
616  }
617  else
618 #endif
619 #endif
620 #ifndef CRYPTOPP_GENERATE_X64_MASM
621  {
622 #if CRYPTOPP_BOOL_X86 | CRYPTOPP_BOOL_X64
623 #define MUL_A(x) (x = rotlFixed(x, 8), x ^ s_sosemanukMulTables[byte(x)])
624 #else
625 #define MUL_A(x) (((x) << 8) ^ s_sosemanukMulTables[(x) >> 24])
626 #endif
627 
628 #define DIV_A(x) (((x) >> 8) ^ s_sosemanukMulTables[256 + byte(x)])
629 
630 #define r1(i) ((i%2) ? reg2 : reg1)
631 #define r2(i) ((i%2) ? reg1 : reg2)
632 
633 #define STEP(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, v, u) \
634  u = (s##x9 + r2(x0)) ^ r1(x0);\
635  v = s##x0;\
636  s##x0 = MUL_A(s##x0) ^ DIV_A(s##x3) ^ s##x9;\
637  r1(x0) += XMUX(r2(x0), s##x2, s##x9);\
638  r2(x0) = rotlFixed(r2(x0) * 0x54655307, 7);\
639 
640 #define SOSEMANUK_OUTPUT(x) \
641  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, u2 ^ v0);\
642  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, u3 ^ v1);\
643  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, u1 ^ v2);\
644  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, u4 ^ v3);
645 
646 #define OUTPUT4 \
647  S2(0, u0, u1, u2, u3, u4);\
648  CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SOSEMANUK_OUTPUT, 4*4);
649 
650  word32 s0 = m_state[0];
651  word32 s1 = m_state[1];
652  word32 s2 = m_state[2];
653  word32 s3 = m_state[3];
654  word32 s4 = m_state[4];
655  word32 s5 = m_state[5];
656  word32 s6 = m_state[6];
657  word32 s7 = m_state[7];
658  word32 s8 = m_state[8];
659  word32 s9 = m_state[9];
660  word32 reg1 = m_state[10];
661  word32 reg2 = m_state[11];
662  word32 u0, u1, u2, u3, u4, v0, v1, v2, v3;
663 
664  do
665  {
666  STEP(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, v0, u0)
667  STEP(1, 2, 3, 4, 5, 6, 7, 8, 9, 0, v1, u1)
668  STEP(2, 3, 4, 5, 6, 7, 8, 9, 0, 1, v2, u2)
669  STEP(3, 4, 5, 6, 7, 8, 9, 0, 1, 2, v3, u3)
670  OUTPUT4
671  STEP(4, 5, 6, 7, 8, 9, 0, 1, 2, 3, v0, u0)
672  STEP(5, 6, 7, 8, 9, 0, 1, 2, 3, 4, v1, u1)
673  STEP(6, 7, 8, 9, 0, 1, 2, 3, 4, 5, v2, u2)
674  STEP(7, 8, 9, 0, 1, 2, 3, 4, 5, 6, v3, u3)
675  OUTPUT4
676  STEP(8, 9, 0, 1, 2, 3, 4, 5, 6, 7, v0, u0)
677  STEP(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, v1, u1)
678  STEP(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, v2, u2)
679  STEP(1, 2, 3, 4, 5, 6, 7, 8, 9, 0, v3, u3)
680  OUTPUT4
681  STEP(2, 3, 4, 5, 6, 7, 8, 9, 0, 1, v0, u0)
682  STEP(3, 4, 5, 6, 7, 8, 9, 0, 1, 2, v1, u1)
683  STEP(4, 5, 6, 7, 8, 9, 0, 1, 2, 3, v2, u2)
684  STEP(5, 6, 7, 8, 9, 0, 1, 2, 3, 4, v3, u3)
685  OUTPUT4
686  STEP(6, 7, 8, 9, 0, 1, 2, 3, 4, 5, v0, u0)
687  STEP(7, 8, 9, 0, 1, 2, 3, 4, 5, 6, v1, u1)
688  STEP(8, 9, 0, 1, 2, 3, 4, 5, 6, 7, v2, u2)
689  STEP(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, v3, u3)
690  OUTPUT4
691  }
692  while (--iterationCount);
693 
694  m_state[0] = s0;
695  m_state[1] = s1;
696  m_state[2] = s2;
697  m_state[3] = s3;
698  m_state[4] = s4;
699  m_state[5] = s5;
700  m_state[6] = s6;
701  m_state[7] = s7;
702  m_state[8] = s8;
703  m_state[9] = s9;
704  m_state[10] = reg1;
705  m_state[11] = reg2;
706  }
707 }
708 
709 NAMESPACE_END
710 
711 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
interface for retrieving values given their names
Definition: cryptlib.h:225