## Can the Serpent Cipher fit in the ICE40 FPGA?

The question of whether the Serpent cipher could fit in a ICE40 FPGA was posed recently, and my first thought was: *why not bake what appears to be the heaviest moving part, and see how many gates it requires?* Then it will be possible to estimate whether the entire thing is worth baking. (I have very little interest in using again any FPGA other than ICE40, unless another reversing breakthrough, concerning a larger chip, is made public.)

And so I took the forward S-box of Serpent and came up with the following:

module sbox_forward(clk, fire, box_n, a, b, c, d, w, x, y, z); // clock input wire clk; // trigger input wire fire; // number of sbox input [3:0] box_n; // inputs input [31:0] a; input [31:0] b; input [31:0] c; input [31:0] d; // outputs output reg [31:0] w = 0; output reg [31:0] x = 0; output reg [31:0] y = 0; output reg [31:0] z = 0; always @(posedge clk) if (fire) begin begin case (box_n) ///////////////////////////////// 4'd0: begin w < = ( a & ~b & ~c & d) | (~a & c & d) | (~a & ~c & ~d) | ( b & ~d); x <= ( a & b & ~c & d) | ( a & c & ~d) | (~a & ~b) | (~a & ~c & ~d); y <= ((~b | c) & d) ^ ((a ^ b) & (b | c)); z <= b ^ c ^ (a | d); end ///////////////////////////////// 4'd1: begin w <= ( a & b & ~c & ~d) | ( a & ~b & d) | (~a & b & c) | (~a & ~b & ~c) | (~a & ~b & ~d); x <= ( a & b & ~d) | ( a & ~b & ~c & d) | (~a & b & d) | (~a & c & d) | (~a & ~c & ~d); y <= c ^ d ^ (a | ~b); z <= ( a & b & c & ~d) | ( a & ~b & d) | (~a & ~b & ~d) | ( b & ~c & d) | ( ~b & ~c & ~d); end ///////////////////////////////// 4'd2: begin w <= a ^ b ^ d ^ (a | c); x <= ( a & ~b & ~c & ~d) | ( a & c & d) | (~a & b & ~c) | (~a & c & ~d) | ( b & c & ~d); y <= ( a & ~b & ~d) | ( a & c & ~d) | (~a & b & ~c) | (~a & b & d) | (~a & ~c & d) | ( b & ~c & d); z <= ( a & b & ~d) | ( a & ~b & c) | (~a & ~b & ~c) | (~a & ~c & d) | ( b & c & ~d); end ///////////////////////////////// 4'd3: begin w <= ( a & ~b) | ( a & c & ~d) | (~a & b & c & d) | (~a & b & ~c & ~d) | ( ~b & ~c & d); x <= ( a & b & c) | ( a & ~b & ~c & ~d) | (~a & b & ~c) | ( b & c & ~d) | (~b & c & d); y <= ( a & b & d) | ( a & ~b & ~c & ~d) | ( a & c & d) | (~a & b & c) | (~a & ~b & ~c & d) | (~a & c & ~d); z <= ( a & b & c & d) | ( a & ~b & ~d) | (~a & ~b & d) | ( b & ~c & ~d) | ( ~b & c & ~d); end ///////////////////////////////// 4'd4: begin w <= ( a & ~b & ~c) | ( a & ~c & ~d) | (~a & b & c) | (~a & c & d) | ( b & c & d) | ( ~b & ~c & ~d); x <= ( a & b & ~c) | ( a & ~b & c & d) | ( a & ~c & ~d) | (~a & b & c) | (~a & ~b & ~c & d) | ( b & c & ~d); y <= ( a & b & c) | ( a & ~b & ~c) | ( a & ~b & d) | (~a & b & d) | (~a & ~b & c & ~d); z <= a ^ (d & (a | b)) ^ (b | c); end ///////////////////////////////// 4'd5: begin w <= ( a & ~b & ~c) | ( a & ~c & ~d) | (~a & b & c) | (~a & c & d) | ( b & c & d) | (~b & ~c & ~d); x <= ( a & ~b & c) | ( a & ~b & d) | (~a & b & d) | (~a & ~c & ~d) | ( b & ~c & ~d); y <= ( a & b & c & ~d) | (~a & b & d) | (~a & ~b & c) | ( ~b & c & d) | ( ~b & ~c & ~d); z <= ( a & b & ~c) | ( a & c & ~d) | (~a & ~b & c & d) | (~a & ~b & ~c & ~d) | ( b & c & ~d) | ( b & ~c & d); end ///////////////////////////////// 4'd6: begin w <= ( a & b & ~d) | ( a & ~c & d) | (~a & ~b & ~c & ~d) | ( b & ~c & d) | ( ~b & c & d); x <= ~(b ^ c ^ (a & d)); y <= ( a & b & ~c) | ( a & ~b & c & ~d) | (~a & b & ~d) | (~a & ~b & ~c) | (~a & ~b & d); z <= ( a & b & c & ~d) | ( a & ~c & d) | (~a & b & ~c & ~d) | (~a & ~b & c) | (~a & c & d) | ( ~b & ~c & d); end ///////////////////////////////// 4'd7: begin w <= ( a & b & c & ~d) | (~a & ~b & ~c) | (~a & c & d) | (~a & ~c & ~d) | ( ~b & c & d) | ( ~b & ~c & ~d); x <= ( a & b & c) | ( a & b & d) | ( a & c & d) | (~a & b & ~d) | (~a & ~b & ~c & d) | (~a & c & ~d); y <= ( a & ~b & ~c) | (~a & b & ~c) | (~a & ~b & c & ~d) | (~a & ~c & d) | ( b & c & d); z <= c ^ (a & ~d) ^ (b | (a & c)); end ///////////////////////////////// endcase // case (box_n) end end endmodule

And a simple "smoke test":

`timescale 1ns/100ps `include "sbox.v" module serpent_sbox_tester; reg clk = 0; // firing signal reg f = 0; // box # reg [3:0] box_n = 0; // inputs reg [31:0] a = 0; reg [31:0] b = 0; reg [31:0] c = 0; reg [31:0] d = 0; // outputs wire [31:0] w; wire [31:0] x; wire [31:0] y; wire [31:0] z; // serpent 'forward' sbox sbox_forward sbf(.clk(clk), .fire(f), .box_n(box_n), .a(a), .b(b), .c(c), .d(d), .w(w), .x(x), .y(y), .z(z)); // clockator always begin clk = 1'b1; #1; clk = 1'b0; #1; end initial begin: Init #0 $display ("Started...\n"); #0 a = 32'hDEADF00D; #0 b = 32'hC0DEDBAD; #0 c = 32'hCAFEBABE; #0 d = 32'hBADCAFED; // display inputs $display ("a = %h b = %h c = %h d = %h", a, b, c, d); #1 f = 1; // sbox 0: #1 box_n = 0; $display ("box_n = %h", box_n); #2; $display ("w = %h x = %h y = %h z = %h", w, x, y, z); // sbox 1: #3 box_n = 1; $display ("box_n = %h", box_n); #4; $display ("w = %h x = %h y = %h z = %h", w, x, y, z); // sbox 2: #5 box_n = 2; $display ("box_n = %h", box_n); #6; $display ("w = %h x = %h y = %h z = %h", w, x, y, z); // sbox 3: #7 box_n = 3; $display ("box_n = %h", box_n); #8; $display ("w = %h x = %h y = %h z = %h", w, x, y, z); // sbox 4: #9 box_n = 4; $display ("box_n = %h", box_n); #10; $display ("w = %h x = %h y = %h z = %h", w, x, y, z); // sbox 5: #11 box_n = 5; $display ("box_n = %h", box_n); #12; $display ("w = %h x = %h y = %h z = %h", w, x, y, z); // sbox 6: #13 box_n = 6; $display ("box_n = %h", box_n); #14; $display ("w = %h x = %h y = %h z = %h", w, x, y, z); // sbox 7: #15 box_n = 7; $display ("box_n = %h", box_n); #16; $display ("w = %h x = %h y = %h z = %h", w, x, y, z); $display ("Finish.\n"); $finish; end // block: Init endmodule

# Project Name PROJ=small # Simulator testbench BENCH=tester sim: iverilog -v $(BENCH).v -o $(PROJ)sim

make ./smallsim

... produced the following output:

a = deadf00d b = c0dedbad c = cafebabe d = badcafed box_n = 0 w = 51525aa0 x = 61201453 y = b0ae854c z = f4dd9efe box_n = 1 w = 3b526ef2 x = 51505ba0 y = 8f8fe10c z = 5f013113 box_n = 2 w = 7a507ef2 x = ce8fb11e y = 64711fe1 z = 6b227540 box_n = 3 w = 7e713ee0 x = ce8fb10c y = aedfaeff z = a4adc45e box_n = 4 w = 95dfcaac x = 6e537ee1 y = deddbbbe z = 8e8fa01f box_n = 5 w = 95dfcaac x = 1b706ba0 y = 4f513bb2 z = 41225101 box_n = 6 w = 5b007101 x = 6f533ee1 y = 21224441 z = 70501ef3 box_n = 7 w = 6f513ee0 x = ea8eb45f y = b4dd8ffe z = 44211113 Finish.

Let's quickly verify, using a section of the ancient C source for Serpent:

#define u32 uint32_t #define SBOX0(a, b, c, d, w, x, y, z) \ { \ u32 t02, t03, t05, t06, t07, t08, t09; \ u32 t11, t12, t13, t14, t15, t17, t01; \ t01 = b ^ c ; \ t02 = a | d ; \ t03 = a ^ b ; \ z = t02 ^ t01; \ t05 = c | z ; \ t06 = a ^ d ; \ t07 = b | c ; \ t08 = d & t05; \ t09 = t03 & t07; \ y = t09 ^ t08; \ t11 = t09 & y ; \ t12 = c ^ d ; \ t13 = t07 ^ t11; \ t14 = b & t06; \ t15 = t06 ^ t13; \ w = ~ t15; \ t17 = w ^ t14; \ x = t12 ^ t17; \ } #define SBOX1(a, b, c, d, w, x, y, z) \ { \ u32 t02, t03, t04, t05, t06, t07, t08; \ u32 t10, t11, t12, t13, t16, t17, t01; \ t01 = a | d ; \ t02 = c ^ d ; \ t03 = ~ b ; \ t04 = a ^ c ; \ t05 = a | t03; \ t06 = d & t04; \ t07 = t01 & t02; \ t08 = b | t06; \ y = t02 ^ t05; \ t10 = t07 ^ t08; \ t11 = t01 ^ t10; \ t12 = y ^ t11; \ t13 = b & d ; \ z = ~ t10; \ x = t13 ^ t12; \ t16 = t10 | x ; \ t17 = t05 & t16; \ w = c ^ t17; \ } #define SBOX2(a, b, c, d, w, x, y, z) \ { \ u32 t02, t03, t05, t06, t07, t08; \ u32 t09, t10, t12, t13, t14, t01; \ t01 = a | c ; \ t02 = a ^ b ; \ t03 = d ^ t01; \ w = t02 ^ t03; \ t05 = c ^ w ; \ t06 = b ^ t05; \ t07 = b | t05; \ t08 = t01 & t06; \ t09 = t03 ^ t07; \ t10 = t02 | t09; \ x = t10 ^ t08; \ t12 = a | d ; \ t13 = t09 ^ x ; \ t14 = b ^ t13; \ z = ~ t09; \ y = t12 ^ t14; \ } #define SBOX3(a, b, c, d, w, x, y, z) \ { \ u32 t02, t03, t04, t05, t06, t07, t08; \ u32 t09, t10, t11, t13, t14, t15, t01; \ t01 = a ^ c ; \ t02 = a | d ; \ t03 = a & d ; \ t04 = t01 & t02; \ t05 = b | t03; \ t06 = a & b ; \ t07 = d ^ t04; \ t08 = c | t06; \ t09 = b ^ t07; \ t10 = d & t05; \ t11 = t02 ^ t10; \ z = t08 ^ t09; \ t13 = d | z ; \ t14 = a | t07; \ t15 = b & t13; \ y = t08 ^ t11; \ w = t14 ^ t15; \ x = t05 ^ t04; \ } #define SBOX4(a, b, c, d, w, x, y, z) \ { \ u32 t02, t03, t04, t05, t06, t08, t09; \ u32 t10, t11, t12, t13, t14, t15, t16, t01; \ t01 = a | b ; \ t02 = b | c ; \ t03 = a ^ t02; \ t04 = b ^ d ; \ t05 = d | t03; \ t06 = d & t01; \ z = t03 ^ t06; \ t08 = z & t04; \ t09 = t04 & t05; \ t10 = c ^ t06; \ t11 = b & c ; \ t12 = t04 ^ t08; \ t13 = t11 | t03; \ t14 = t10 ^ t09; \ t15 = a & t05; \ t16 = t11 | t12; \ y = t13 ^ t08; \ x = t15 ^ t16; \ w = ~ t14; \ } #define SBOX5(a, b, c, d, w, x, y, z) \ { \ u32 t02, t03, t04, t05, t07, t08, t09; \ u32 t10, t11, t12, t13, t14, t01; \ t01 = b ^ d ; \ t02 = b | d ; \ t03 = a & t01; \ t04 = c ^ t02; \ t05 = t03 ^ t04; \ w = ~ t05; \ t07 = a ^ t01; \ t08 = d | w ; \ t09 = b | t05; \ t10 = d ^ t08; \ t11 = b | t07; \ t12 = t03 | w ; \ t13 = t07 | t10; \ t14 = t01 ^ t11; \ y = t09 ^ t13; \ x = t07 ^ t08; \ z = t12 ^ t14; \ } #define SBOX6(a, b, c, d, w, x, y, z) \ { \ u32 t02, t03, t04, t05, t07, t08, t09, t10; \ u32 t11, t12, t13, t15, t17, t18, t01; \ t01 = a & d ; \ t02 = b ^ c ; \ t03 = a ^ d ; \ t04 = t01 ^ t02; \ t05 = b | c ; \ x = ~ t04; \ t07 = t03 & t05; \ t08 = b & x ; \ t09 = a | c ; \ t10 = t07 ^ t08; \ t11 = b | d ; \ t12 = c ^ t11; \ t13 = t09 ^ t10; \ y = ~ t13; \ t15 = x & t03; \ z = t12 ^ t07; \ t17 = a ^ b ; \ t18 = y ^ t15; \ w = t17 ^ t18; \ } #define SBOX7(a, b, c, d, w, x, y, z) \ { \ u32 t02, t03, t04, t05, t06, t08, t09, t10; \ u32 t11, t13, t14, t15, t16, t17, t01; \ t01 = a & c ; \ t02 = ~ d ; \ t03 = a & t02; \ t04 = b | t01; \ t05 = a & b ; \ t06 = c ^ t04; \ z = t03 ^ t06; \ t08 = c | z ; \ t09 = d | t05; \ t10 = a ^ t08; \ t11 = t04 & z ; \ x = t09 ^ t10; \ t13 = b ^ x ; \ t14 = t01 ^ x ; \ t15 = c ^ t05; \ t16 = t11 | t13; \ t17 = t02 | t14; \ w = t15 ^ t17; \ y = a ^ t16; \ } uint32_t a = 0xDEADF00D; uint32_t b = 0xC0DEDBAD; uint32_t c = 0xCAFEBABE; uint32_t d = 0xBADCAFED; uint32_t w = 0, x = 0, y = 0, z = 0; void clear() { w = 0; x = 0; y = 0; z = 0; } void dump_ins() { printf("a = 0x%08x b = 0x%08x c = 0x%08x d = 0x%08x\n", a, b, c, d); } void dump_outs() { printf("w = 0x%08x x = 0x%08x y = 0x%08x z = 0x%08x\n", w, x, y, z); } void main() { dump_ins(); printf("SBOX0:\n"); clear(); SBOX0(a, b, c, d, w, x, y, z); dump_outs(); printf("SBOX1:\n"); clear(); SBOX1(a, b, c, d, w, x, y, z); dump_outs(); printf("SBOX2:\n"); clear(); SBOX2(a, b, c, d, w, x, y, z); dump_outs(); printf("SBOX3:\n"); clear(); SBOX3(a, b, c, d, w, x, y, z); dump_outs(); printf("SBOX4:\n"); clear(); SBOX4(a, b, c, d, w, x, y, z); dump_outs(); printf("SBOX5:\n"); clear(); SBOX5(a, b, c, d, w, x, y, z); dump_outs(); printf("SBOX6:\n"); clear(); SBOX6(a, b, c, d, w, x, y, z); dump_outs(); printf("SBOX7:\n"); clear(); SBOX7(a, b, c, d, w, x, y, z); dump_outs(); }

The latter spits out:

a = 0xdeadf00d b = 0xc0dedbad c = 0xcafebabe d = 0xbadcafed SBOX0: w = 0x51525aa0 x = 0x61201453 y = 0xb0ae854c z = 0xf4dd9efe SBOX1: w = 0x3b526ef2 x = 0x51505ba0 y = 0x8f8fe10c z = 0x5f013113 SBOX2: w = 0x7a507ef2 x = 0xce8fb11e y = 0x64711fe1 z = 0x6b227540 SBOX3: w = 0x7e713ee0 x = 0xce8fb10c y = 0xaedfaeff z = 0xa4adc45e SBOX4: w = 0x95dfcaac x = 0x6e537ee1 y = 0xdeddbbbe z = 0x8e8fa01f SBOX5: w = 0x95dfcaac x = 0x1b706ba0 y = 0x4f513bb2 z = 0x41225101 SBOX6: w = 0x5b007101 x = 0x6f533ee1 y = 0x21224441 z = 0x70501ef3 SBOX7: w = 0x6f513ee0 x = 0xea8eb45f y = 0xb4dd8ffe z = 0x44211113

I.e. a matching set, for the particular input.

**Not** a proof of correctness, by any means, but we do learn that there is no obvious mistake.

The next logical step of the experiment would be to put the *forward S-box* component into an ICE40 and see precisely how many gates it occupies. Then can multiply this by two (there is also an *inverse S-Box*, and see whether it makes sense to attempt to produce the rest of the mechanism. The 32kB of *block RAM* available on the largest ICE40, theoretically suffices for Serpent's *key schedule* and other required buffers. But does the gate count?

*~To be continued!~*