```   1 ------------------------------------------------------------------------------
2 ------------------------------------------------------------------------------
3 -- This file is part of 'Finite Field Arithmetic', aka 'FFA'.               --
4 --                                                                          --
5 -- (C) 2018 Stanislav Datskovskiy ( www.loper-os.org )                      --
6 -- http://wot.deedbot.org/17215D118B7239507FAFED98B98228A001ABFFC7.html     --
7 --                                                                          --
8 -- You do not have, nor can you ever acquire the right to use, copy or      --
9 -- distribute this software ; Should you use this software for any purpose, --
10 -- or copy and distribute it to anyone or in any manner, you are breaking   --
11 -- the laws of whatever soi-disant jurisdiction, and you promise to         --
12 -- continue doing so for the indefinite future. In any case, please         --
13 -- always : read and understand any software ; verify any PGP signatures    --
14 -- that you use - for any purpose.                                          --
15 --                                                                          --
17 ------------------------------------------------------------------------------
18 ------------------------------------------------------------------------------
19
20 with Words;    use Words;
21 with Word_Ops; use Word_Ops;
22 with W_Mul;    use W_Mul;
23 with FZ_Arith; use FZ_Arith;
24
25
26 package body FZ_Sqr is
27
28    -- Square case of Comba's multiplier. (CAUTION: UNBUFFERED)
29    procedure FZ_Sqr_Comba(X  : in  FZ;
30                           XX : out FZ) is
31
32       -- Words in each multiplicand
33       L : constant Word_Index := X'Length;
34
35       -- Length of Product, i.e. double the length of X
36       LP : constant Word_Index := 2 * L;
37
38       -- 3-word Accumulator
39       A2, A1, A0 : Word := 0;
40
41       Lo, Hi  : Word; -- Output of WxW multiply/square
42
43       -- Type for referring to a column of XX
44       subtype ColXX is Word_Index range 0 .. LP - 1;
45
46       procedure Accum is
47          C      : WBool; -- Carry for the Accumulator addition
48          Sum    : Word;  -- Sum for Accumulator addition
49       begin
51          -- A0 += Lo; C := Carry
52          Sum := A0 + Lo;
53          C   := W_Carry(A0, Lo, Sum);
54          A0  := Sum;
55          -- A1 += Hi + C; C := Carry
56          Sum := A1 + Hi + C;
57          C   := W_Carry(A1, Hi, Sum);
58          A1  := Sum;
59          -- A2 += A2 + C
60          A2  := A2 + C;
61       end Accum;
62       pragma Inline_Always(Accum);
63
64       procedure SymmDigits(N : in ColXX; From : in ColXX; To : in ColXX) is
65       begin
66          for j in From .. To loop
67             -- Hi:Lo := j-th * (N - j)-th Word, and then,
68             Mul_Word(X(X'First + j),
69                      X(X'First - j + N),
70                      Lo, Hi);
71             Accum;
72             Accum; -- Accum += 2 * (Hi:Lo)
73          end loop;
74       end SymmDigits;
75       pragma Inline_Always(SymmDigits);
76
77       procedure SqrDigit(N : in ColXX) is
78       begin
79          Sqr_Word(X(X'First + N), Lo, Hi); -- Hi:Lo := Square(N-th digit)
80          Accum;                            -- Accum += Hi:Lo
81       end SqrDigit;
82       pragma Inline_Always(SqrDigit);
83
84       procedure HaveDigit(N : in ColXX) is
85       begin
86          -- Save the Nth (indexed from zero) word of XX:
87          XX(XX'First + N) := A0;
88          -- Right-Shift the Accumulator by one Word width:
89          A0 := A1;
90          A1 := A2;
91          A2 := 0;
92       end HaveDigit;
93       pragma Inline_Always(HaveDigit);
94
95       -- Compute the Nth (indexed from zero) column of the Product
96       procedure Col(N : in ColXX; U : in ColXX; V : in ColXX) is
97       begin
98          -- The branch pattern depends only on FZ wordness
99          if N mod 2 = 0 then         -- If we're doing an EVEN-numbered column:
100             SymmDigits(N, U, V - 1); --   Stop before V: it is the square case
101             SqrDigit(V);             --   Compute the square case at V
102          else                        -- If we're doing an ODD-numbered column:
103             SymmDigits(N, U, V);     --   All of them are the symmetric case
104          end if;                     -- After either even or odd column:
105          HaveDigit(N);               --   We have the N-th digit of the result.
106       end Col;
107       pragma Inline_Always(Col);
108
109    begin
110       -- First col always exists:
111       SqrDigit(ColXX'First);
112       HaveDigit(ColXX'First);
113
114       -- Compute the lower half of the Product:
115       for i in 1 .. L - 1 loop
116          Col(i, 0, i / 2);
117       end loop;
118
119       -- Compute the upper half (sans last Word) of the Product:
120       for i in L .. LP - 2 loop
121          Col(i, i - L + 1, i / 2);
122       end loop;
123
124       -- The very last Word of the Product:
125       XX(XX'Last) := A0; -- Equiv. of XX(XX'First + 2*L - 1) := A0;
126    end FZ_Sqr_Comba;
127
128
129    -- Karatsuba's Squaring. (CAUTION: UNBUFFERED)
130    procedure Sqr_Karatsuba(X  : in  FZ;
131                            XX : out FZ) is
132
133       -- L is the wordness of X. Guaranteed to be a power of two.
134       L : constant Word_Count := X'Length;
135
136       -- An 'LSeg' is the same length as X.
137       subtype LSeg is FZ(1 .. L);
138
139       -- K is HALF of the length of X.
140       K : constant Word_Index := L / 2;
141
142       -- A 'KSeg' is the same length as HALF of X.
143       subtype KSeg is FZ(1 .. K);
144
145       -- The three L-sized variables of the product equation, i.e.:
146       -- XX = LL + 2^(K*Bitness)(LL + HH - DD) + 2^(2*K*Bitness)HH
147       LL, DD, HH : LSeg;
148
149       -- K-sized term Dx, Dx := abs(XLo - XHi) to then get DD := Dx^2
150       Dx         : KSeg;
151
152       -- IGNORED Subtraction borrow, sign of (XL - XH)
153       Cx         : WBool; -- given as DD := Dx^2, DD is always positive
154       pragma Unreferenced(Cx);
155
156       -- Bottom and Top K-sized halves of X.
157       XLo        : KSeg  renames    X(    X'First       ..    X'Last - K );
158       XHi        : KSeg  renames    X(    X'First + K   ..    X'Last     );
159
160       -- L-sized middle segment of the product XX (+/- K from the midpoint).
161       XXMid      : LSeg  renames   XX(   XX'First + K   ..   XX'Last - K );
162
163       -- Bottom and Top L-sized halves of the product XX.
164       XXLo       : LSeg  renames   XX(   XX'First       ..   XX'Last - L );
165       XXHi       : LSeg  renames   XX(   XX'First + L   ..   XX'Last     );
166
167       -- Topmost K-sized quarter segment of the product XX, or 'tail'
168       XXHiHi     : KSeg  renames XXHi( XXHi'First + K   .. XXHi'Last     );
169
170       -- Carry from addition of HH and LL terms; borrow from subtraction of DD
171       C          : WBool;
172
173       -- Barring a cosmic ray, 0 <= TC <= 2
174       subtype TailCarry is Word range 0 .. 2;
175
176       -- Tail-Carry accumulator, for the final ripple-out into XXHiHi
177       TC         : TailCarry := 0;
178
179       -- Barring a cosmic ray, the tail ripple will NOT overflow.
180       FinalCarry : WZeroOrDie := 0;
181
182    begin
183
184       -- Recurse: LL := XLo^2
185       FZ_Square_Unbuffered(XLo, LL);
186
187       -- Recurse: HH := XHi^2
188       FZ_Square_Unbuffered(XHi, HH);
189
190       -- Dx := |XLo - XHi| , and we don't care about the borrow
191       FZ_Sub_Abs(X => XLo, Y => XHi, Difference => Dx, Underflow => Cx);
192
193       -- Recurse: DD := Dx^2 (always positive, which is why we don't need Cx)
194       FZ_Square_Unbuffered(Dx, DD);
195
196       -- XX := LL + 2^(2 * K * Bitness) * HH
197       XXLo := LL;
198       XXHi := HH;
199
200       -- XX += 2^(K * Bitness) * HH, carry goes into Tail Carry accumulator.
201       FZ_Add_D(X => XXMid, Y => HH, Overflow => TC);
202
203       -- XX += 2^(K * Bitness) * LL, ...
204       FZ_Add_D(X => XXMid, Y => LL, Overflow => C);
205
206       -- ... add the carry from LL addition into the Tail Carry accumulator.
207       TC := TC + C;
208
209       -- XX -= 2^(K * Bitness) * DD
210       FZ_Sub_D(X         => XXMid, -- Because DD is always positive,
211                Y         => DD,    -- this term is always subtractive.
212                Underflow => C); -- C is the borrow from DD term subtraction
213
214       -- Get final Tail Carry for the ripple by subtracting DD term's borrow
215       TC := TC - C;
216
217       -- Ripple the Tail Carry into the tail.
218       FZ_Add_D_W(X => XXHiHi, W => TC, Overflow => FinalCarry);
219
220    end Sqr_Karatsuba;
221    -- CAUTION: Inlining prohibited for Sqr_Karatsuba !
222
223
224    -- Squaring. (CAUTION: UNBUFFERED)
225    procedure FZ_Square_Unbuffered(X     : in  FZ;
226                                   XX    : out FZ) is
227    begin
228
229       if X'Length <= Sqr_Karatsuba_Thresh then
230
231          -- Base case:
232          FZ_Sqr_Comba(X, XX);
233
234       else
235
236          -- Recursive case:
237          Sqr_Karatsuba(X, XX);
238
239       end if;
240
241    end FZ_Square_Unbuffered;
242
243
244    -- Squaring. Preserves the inputs.
245    procedure FZ_Square_Buffered(X     : in  FZ;
246                                 XX_Lo : out FZ;
247                                 XX_Hi : out FZ) is
248
249       -- Product buffer.
250       P : FZ(1 .. 2 * X'Length);
251
252    begin
253
254       FZ_Square_Unbuffered(X, P);
255
256       XX_Lo := P(P'First            .. P'First + X'Length - 1);
257       XX_Hi := P(P'First + X'Length .. P'Last);
258
259    end FZ_Square_Buffered;
260
261 end FZ_Sqr;
```