/* This post describes an implementation of the ChaCha20 stream cipher in x86-64 * assembly, optimized for code size and clarity rather than speed. * It is also an x86-64 assembly source file, so you could assemble it yourself: * * as -o chacha.o chacha.s * * but it is *not* a complete program, so you'd need to write an appropriate * main function. Also, I did not implement the part where you take an * arbitrary-length message and xor it against the blocks of key stream * generated by the chacha block function, since it is not particularly * interesting to write or to read. This file just contains the chacha block * function itself, which is the "guts" of the cipher. * * Internally, chacha20's state is 16 32-bit words, which are mutated over a * series of rounds. The "quarter round" function takes four indexes (a, b, c, * d) into the 16-word state s, and does: * * s[a] += s[b]; s[d] ^= s[a]; s[d] <<<= 16; * s[c] += s[d]; s[b] ^= s[c]; s[b] <<<= 12; * s[a] += s[b]; s[d] ^= s[a]; s[d] <<<= 8; * s[c] += s[d]; s[b] ^= s[c]; s[b] <<<= 7; * * Using that function, which we'll call qr() here, chacha20 runs 20 total * rounds (hence the name), each made up of four quarter rounds. These quarter * rounds operate on varying parts of the 16-word state. Specifically, chacha20 * has an inner block function defined this way, which ends up being applied 10 * times for 20 total rounds: * * void inblock(state) { * column_round(state); * diagonal_round(state); * } * * where: * * void column_round(s) { * qr(s, 0, 4, 8, 12); * qr(s, 1, 5, 9, 13); * qr(s, 2, 6, 10, 14); * qr(s, 3, 7, 11, 15); * } * * void diagonal_round(s) { * qr(s, 0, 5, 10, 15); * qr(s, 1, 6, 11, 12); * qr(s, 2, 7, 8, 13); * qr(s, 3, 4, 9, 14); * } * * Wrapped around the inner block function, there are two steps: setup and * finalization. Setup involves building the state from the block function's * inputs and saving a copy of it, and finalization involves re-adding the * initial state to the output. The full block function therefore looks like * this (not quite C any more): * * byte[64] block(byte key[32], byte counter[4], byte nonce[12]) { * word constants[4] = { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 }; * word state[16] = concat(constants, key, counter, nonce); * word initial_state[16] = state * for (i = 0; i < 10; i++) * inblock(state) * state += initial_state * return serialize(state) * } * * From a given key, nonce, and counter (index into the key stream) this * produces a 64-byte block of pseudorandom data, which can be XORed with * plaintext to produce ciphertext in the usual stream cipher fashion. * * A couple of notes about this construction: * * C1. The block function doesn't depend on the previous block, so you can * seek to any point in the key stream if you want to decrypt only part of * a large ciphertext (although presumably you are going to authenticate * the whole thing first anyway...) * C2. The constants in block() are the ASCII string "expand 32-byte k", which * is a pretty great nothing-up-my-sleeve number. * * Alright, enough front matter - on to the code! * * A couple of *implementation* notes: * * I1. I ignored the usual calling conventions about registers because it was * convenient. * I2. I used the old x86 string/loop instructions to save code size. * I3. I suspect a version of this using SSE would be much faster and maybe * even more compact, but I don't know the SSE extensions at all. */ /* This is the quarter-round function, which we called qr() above. As inputs: * %rsi is a pointer to the state words * %rdi selects which state words to manipulate * specifically, since the quarter-round function affects 4 state words, the * indexes of those 4 words are packed into %rdi as bytes in big-endian * byte order. So for example, for the very first qr() call, which operates on * words 0, 4, 8, and 12, %rdi is 0x0004080c. This little maneuver requires 41 * bytes of setup code to decode %rdi, but saves even more code size elsewhere. */ chacha20_qr: /* xor and move small constant assembles to: * 48 31 c0 xor %rax, %rax * b0 ff mov $0xff, %al * while simply moving to a wider register instead yields: * 48 c7 c0 ff 00 00 00 mov $0xff, %rax * because there are no "short forms" of moving constants into the * wider registers. * * We'll use this $0xff as a mask below to extract bytes from %rdi. */ xor %rax, %rax mov $0xff, %al /* shift and mask the four bytes out of %rdi into %r8, %r9, %r10, %r11 * - blessed x86-64 extra regs! */ mov %rdi, %r8 shr $24, %r8 and %rax, %r8 mov %rdi, %r9 shr $16, %r9 and %rax, %r9 mov %rdi, %r10 shr $8, %r10 and %rax, %r10 mov %rdi, %r11 and %rax, %r11 /* load the a, b, c, d state words */ mov (%rsi,%r8,4), %eax mov (%rsi,%r9,4), %ebx mov (%rsi,%r10,4), %ecx mov (%rsi,%r11,4), %edx /* add-xor-rotate a few times */ add %ebx, %eax xor %eax, %edx rol $16, %edx add %edx, %ecx xor %ecx, %ebx rol $12, %ebx add %ebx, %eax xor %eax, %edx rol $8, %edx add %edx, %ecx xor %ecx, %ebx rol $7, %ebx /* put the state words back */ mov %eax, (%rsi,%r8,4) mov %ebx, (%rsi,%r9,4) mov %ecx, (%rsi,%r10,4) mov %edx, (%rsi,%r11,4) ret /* The inblock function: this loads appropriate values of %rdi for each of the * eight quarter-rounds from roundtab, which is a table of 8 words in rodata. * It calls qr in a loop on each one. Throughout this function, %rsi points * at the state. */ chacha20_inblock: /* %r12 is our index into roundtab, %r13 is the base of it; doing this, * rather than incrementing %r13 directly, simplifies telling whether * the iteration is done. In a performance-focused implementation you * could simply unroll all of this and have no roundtab. */ xor %r12, %r12 lea roundtab, %r13 /* for (i = 0; i < 8; i++) * qr(state, roundtab[i]); */ 1: mov (%r13, %r12, 4), %edi call chacha20_qr inc %r12 cmp $8, %r12 jne 1b ret /* The setup function! This takes the three pieces of per-block state (the key, * the nonce, and the block number) and packs them into the initial block state. * It also handles adding the initial constants to the state. * %rsi = (output) state, %rdi = key, %rbx = nonce, %rdx = block * * The resulting 16-word, 64-byte state looks like: * * cccccccc cccccccc cccccccc cccccccc * kkkkkkkk kkkkkkkk kkkkkkkk kkkkkkkk * kkkkkkkk kkkkkkkk kkkkkkkk kkkkkkkk * bbbbbbbb nnnnnnnn nnnnnnnn nnnnnnnn * where the cs are the constant words, ks are the key words, b is the block * number word, and the ns are the nonce words. */ chacha20_setup: push %rsi push %rdi /* Somewhat awkwardly, we used %rsi for the state, but we want to use * the x86 string instructions here which use %rsi as the source and * %rdi as the destination - and also modify both registers. Solution: * save both, allow them to become mangled, and simply put them back at * the end. */ xchg %rsi, %rdi /* The constants go in first - we're adjusting %rdi as we go. */ movl $0x61707865, (%rdi) movl $0x3320646e, 4(%rdi) movl $0x79622d32, 8(%rdi) movl $0x6b206574, 12(%rdi) add $16, %rdi /* The idiom: * mov $n, %cl * rep movsl * means "copy $n words from %rsi to %rdi, modifying %rsi and %rdi and * counting down in %cl". It compiles into 4 bytes of machine code (!). * This copies the 8 32-bit words of key. */ mov $8, %cl rep movsl /* 1 word of block number */ mov %rdx, (%rdi) add $4, %rdi /* 3 words of nonce, same idea as above */ mov $3, %cl mov %rbx, %rsi rep movsl /* put the registers we trashed back */ pop %rdi pop %rsi ret /* The actual block function itself! * %rsi = state, %rdi = key, %rbx = nonce, %rdx = block * * Set up the state, make a copy of it on the stack, run inblock on it 10 times, * then re-add the copy from the stack. */ chacha20_block: mov %rsi, %r8 call chacha20_setup /* Make a copy */ sub $64, %rsp mov %rsp, %rdi mov $16, %rcx rep movsl mov %r8, %rsi /* 10 iterations of inblock */ mov $10, %cl 1: /* note: the loop instruction depends on %rcx but inblock will trash it, * so we have to save it here */ push %rcx call chacha20_inblock pop %rcx loop 1b /* re-add the copied initial state, bytewise */ mov %rsp, %rdi mov $64, %cl 2: mov (%rsp,%rcx,1), %al add %al, (%rsi,%rcx,1) loop 2b add $64, %rsp ret /* The table of %rdi values for qr(), used by inblock - see its comments :) */ .section rodata roundtab: .long 0x0004080c .long 0x0105090d .long 0x02060a0e .long 0x03070b0f .long 0x00050a0f .long 0x01060b0c .long 0x0207080d .long 0x0304090e /* And that's it! 245 bytes of code, 32 bytes of rodata, and probably pretty bad * performance but you do get what you pay for sometimes. You would probably * need a further hundred bytes or so to handle getting a message, generating * the key blocks, and xoring them together appropriately. * * Thanks for reading! Next up I might do an implementation of SPECK, a * recent-ish lightweight block cipher from the NSA with a somewhat dubious * standardization history... */