\documentclass[slidestop]{beamer}
\usepackage{beamerthemesplit}
\usepackage{graphics}
\usepackage{pstricks}
\graphicspath{{./}}
\title{Big Integer Arithmetic Instruction design}
\author{Luke Kenneth Casson Leighton}
\begin{document}
\frame{
\begin{center}
\huge{Big Integer Arithmetic Instruction design}\\
\vspace{32pt}
\Large{An analysis of big-integer arithmetic instructions}\\
\Large{(why not to put all eggs in Custom Silicon basket)}\\
\vspace{24pt}
\Large{Silicon Salon 2023}\\
\vspace{16pt}
\large{Sponsored by NLnet's Assure Programme}\\
\vspace{6pt}
\large{\today}
\end{center}
}
\frame{\frametitle{Who are we?}
\begin{itemize}
\item LibreSOC: a fully Libre Project with the goal of creating
a Hybrid 3D CPU-VPU-GPU including designing a powerful Vector
Extension (for the Power ISA). https://libresoc.org
\vspace{6pt}
\item RED Semiconductor Ltd: a commercial realisation of LibreSOC
designs. https://redsemiconductor.com
\vspace{6pt}
\item LibreSOC researches and designs instructions that are then
proposed to the OpenPOWER Foundation ISA Technical Workgroup;
RED Semiconductor (as an OPF ISA WG Voting Member) then keeps
an eye on the RFC.
\vspace{6pt}
\item RED Semiconductor Ltd seeks VC funding and commercial business
propositions, LibreSOC covers Research.
\vspace{6pt}
\end{itemize}
}
\frame{\frametitle{What are the challenges faced by Biginteger?}
\begin{itemize}
\item Algorithms especially post-quantum are now fast-moving. This
does not go down well! It typically takes 5-10 years for an
algorithm to become ``trustable''.
\vspace{6pt}
\item Custom Cryptographic Hardware will typically take 3 years from
design concept to first production silicon: Certification even longer.
If a fault is found in the algorithm, the entire investment is wasted.
\vspace{6pt}
\item Performance on 32-bit and 64-bit Embedded Hardware sucks. Algorithms
are roughly O(N\textsuperscript{2}) which wreaks havoc. The
temptation therefore is to add SIMD instructions or dedicated
``custom'' instructions which makes the problem worse.
\vspace{6pt}
\item So how can these polar opposites be solved?
\vspace{6pt}
\end{itemize}
}
\begin{frame}[fragile]\frametitle{Go back to the algorithms.}
\begin{itemize}
\item https://libresoc.org/openpower/sv/biginteger/analysis/
\item Starting with Knuth's Algorithm D and M, if a True-Scalable
Vector ISA can cope with those, chances are good it'll cope
with more (Karatsuba, and so on).
\item SVP64 has ``looping'' as a primary construct: \\
loop i 0..VL-1: GPR(RT+i) = ADD(GPR(RA+i), GPR(RB+i))\\
\vspace{1pt}
\item If however Carry-in and Carry-out are included in that, we
have arbitrary-length Big-Integer Vector Add!
\item For all other operations as long as Scalar-Vector is ok,
it turns out to be possible to do 64-bit carry-in and
64-bit carry-out, without significant hardware disruption.
\item Irony: all relevant Scalar instructions (shift, mul, div)
usually drop 1/2 the result on the floor!
\end{itemize}
\end{frame}
\begin{frame}[fragile]\frametitle{Turning add-with-carry into Vector-Add}
\begin{itemize}
\item Add-with-Carry is the building-block of larger operations
\item Let's simply chain them together.
\item sv.adde (Add-Carry with Vector loop) creates chains
\end{itemize}
\begin{verbatim}
R0,CA = A0+B0+CA     adde r0,a0,b0
              |
      +-------+
      |
R1,CA = A1+B1+CA     adde r1,a1,b1
              |
      +-------+
      |
R2,CA = A2+B2+CA     adde r2,a2,b2
\end{verbatim}
\end{frame}
\begin{frame}[fragile]\frametitle{Vector-Scalar Shift}
\begin{itemize}
\item Shift by 64-bit is just ``pick a register''
\item Add a 2nd input register with what needs to be shifted IN\\
(64-bit carry-in)
\item Add 2nd output saving what normally gets thrown away\\
(64-bit carry-out)
\item Again: a chain of these performs Vector-by-Scalar shift
\end{itemize}
\end{itemize}
\begin{verbatim}
brs(uint64_t s, uint64_t r[], uint64_t un[], int n) {
    for (int i = 0; i < n - 1; i++)
        r[i] = (un[i] >> s) | (un[i + 1] << (64 - s));
    r[n - 1] = un[n - 1] >> s;
}
\end{verbatim}
\end{frame}
\begin{frame}[fragile]\frametitle{Vector-Scalar Multiply}
\begin{itemize}
\item Normally in MAC the top 64-bits is thrown away.
\item What if we stored those 64-bits in a 2nd register?\\
(64-bit carry-out)
\item And what if the next MAC added that ``digit'' on?\\
(64-bit carry-in)
\item Again: a chain of these performs Vector-by-Scalar Multiply
\end{itemize}
\begin{verbatim}
RT0, RC0 = RA0 * RB0 + 0
                  |
          +-------+
          |
RT1, RC1 = RA1 * RB1 + RC0
                  |
          +-------+
          |
RT2, RC2 = RA2 * RB2 + RC1
\end{verbatim}
\end{frame}
\begin{frame}[fragile]\frametitle{Vector-Scalar Divide}
\begin{itemize}
\item Same story. special-case for overflow.
\end{itemize}
\begin{verbatim}
RT0 = (( 0<<64) | RA0) / RB0
RC0 = (( 0<<64) | RA0) % RB0
          |
     +----+
     |
RT1 = ((RC0<<64) | RA1) / RB1
RC1 = ((RC0<<64) | RA1) % RB1
          |
     +----+
     |
RT2 = ((RC1<<64) | RA2) / RB2
RC2 = ((RC1<<64) | RA2) % RB2
\end{verbatim}
\end{frame}
\frame{\frametitle{Summary so far}
\begin{itemize}
\item Extending the usual 1-bit Carry-in Carry-out to 64-bit and
adding a loop-construct inherently turns Scalar operations
into arbitrary-length Vectorised ones
\item Irony: 30 years ago Power ISA actually had a ``Carry SPR'', where
the normally-discarded upper half of multiply would be placed in
that SPR (it was deprecated).
\item Hardware is NOT made more complex because in all shift multiply
and divide operations these bits are discarded in other ISAs,
which is why you end up with complex carry workarounds. This
gives ISAs a ``bad rep'' for doing Bigint
\item The ``complication'' is that you need 3-in 2-out instructions,
but actually in Microcode you can do operand-forwarding.
1st op: 3-in 1-out. chain: 2-in 1-out. Last: 2-in 2-out.
\end{itemize}
}
\frame{\frametitle{OpenTITAN}
\begin{itemize}
\item https://opentitan.org/book/hw/ip/otbn/index.html
\item 256-bit wide data path with 32 256-bit wide registers
\item Zero-Overhead Loop Control would have been better\\
https://ieeexplore.ieee.org/abstract/document/1692906/
\item Formal verification completion time is a factor of the operation
bit-width. 256-bit unlikely to be reasonable time.
\item 256-bit is great for EC25519 but for RSA (etc.) you run
into exactly the same problem as a Scalar ISA, made worse.
\item Opportunities to optimise algorithms not possible (efficient
power-optimised Karatsuba, etc.)
\end{itemize}
}
\begin{frame}[fragile]\frametitle{OpenTITAN shift}
\begin{itemize}
\item Immediate-only. what about shift-by-reg?
\item merges 2 operands, still not chainable.
\item needs a copy of the vector input (double number of regs)
\item needs massive 256-bit shifter! 8 layers of muxes!
\end{itemize}
\begin{verbatim}
a = WDRs[wrs1]
b = WDRs[wrs2]
result = (((a << 256) | b) >> imm) & ((1 << 256) - 1)
WDRs[wrd] = result
\end{verbatim}
\end{frame}
\begin{frame}[fragile]\frametitle{Draft Double-Shift}
\begin{itemize}
\item Remarkably similar to x86's shld/shrd
\item Does not need 128-bit ROT: simple mod to existing hardware
\item Hardware may macro-op fuse Vector-shift for better efficiency
\item Chainable and in-place (no copy of vector needed).
\end{itemize}
\end{itemize}
\begin{verbatim}
n <- (RB)[58:63] # Power ISA MSB0 numbering. sigh
v <- ROTL64((RA), n)
mask <- MASK(0, 63-n)
RT <- (v[0:63] & mask) | ((RC) & ~mask)
RS <- v[0:63] & ~mask
\end{verbatim}
\end{frame}
\frame{\frametitle{Conclusion}
\begin{itemize}
\item We went back to the algorithms (Knuth D and M) and examined
what they are trying to achieve.
\item Turns out they need a 64-bit carry-in and carry-out
\item Keeping to 64-bit maximum hardware means Formal Proofs complete
in reasonable time (less than heat-death of universe)
\item Reasonably straightforward: creates and uses partial results
normally thrown away (needing more instructions)
\item Freaks out pure-RISC proponents (3-in 2-out) but look at the
number of instructions (and temporary registers) needed
otherwise, and the overall algorithm efficiency, and the
case for these instructions is clear.
\item They also speed up \textbf{general-purpose} code
\end{itemize}
}
\frame{
\begin{center}
{\Huge The end\\
Thank you\\
Questions?\\\vspace{5pt}
}
\end{center}
\begin{itemize}
\item https://redsemiconductor.com
\item Discussion: http://lists.libresoc.org
\item Libera.Chat IRC \#libresoc
\item http://libresoc.org/
\item http://nlnet.nl/assure
\item https://libresoc.org/nlnet/\#faq
\end{itemize}
}
\end{document}