Implementation Log

Fri Aug 16 23:38:45 2024

lut3(imm, a, b, c):
    idx = c << 2 | b << 1 | a
    return imm[idx] # idx by LSB0 order

for i in range(64): 
    RT[i] = lut3(imm, RB[i], RA[i], RT[i]) 

lut2(imm, a, b):
    idx = b << 1 | a
    return imm[idx] # idx by LSB0 order

imm = (RC>>(nh*4))&0b1111
for i in range(64): 
    RT[i] = lut2(imm, RB[i], RA[i]) 

for i in range(64): 
    RT[i] = lut2(CRs[BFA], RB[i], RA[i]) 

for i in range(4):
    a,b,c = CRs[BF][i], CRs[BFA][i], CRs[BFB][i]
    if msk[i]: CRs[BF][i] = lut3(imm, a, b, c)

for i in range(4):
    a,b = CRs[BF][i], CRs[BF][i]
    if msk[i]: CRs[BF][i] = lut2(CRs[BFB], a, b)

// Signed minimum: the operands are compared as int_xlen_t and the
// selected operand is returned with its original bit pattern.
uint_xlen_t mins(uint_xlen_t rs1, uint_xlen_t rs2)
{
    int_xlen_t lhs = (int_xlen_t)rs1;
    int_xlen_t rhs = (int_xlen_t)rs2;
    if (lhs < rhs)
        return rs1;
    return rs2;
}
// Signed maximum: the operands are compared as int_xlen_t and the
// selected operand is returned with its original bit pattern.
uint_xlen_t maxs(uint_xlen_t rs1, uint_xlen_t rs2)
{
    int_xlen_t lhs = (int_xlen_t)rs1;
    int_xlen_t rhs = (int_xlen_t)rs2;
    if (lhs > rhs)
        return rs1;
    return rs2;
}
// Unsigned minimum of the two operands (rs2 is returned on a tie).
uint_xlen_t minu(uint_xlen_t rs1, uint_xlen_t rs2)
{
    if (rs1 < rs2)
        return rs1;
    return rs2;
}
// Unsigned maximum of the two operands (rs2 is returned on a tie).
uint_xlen_t maxu(uint_xlen_t rs1, uint_xlen_t rs2)
{
    if (rs1 > rs2)
        return rs1;
    return rs2;
}

// Rounded-up unsigned average of rs1 and rs2: the value of
// (rs1 + rs2 + 1) >> 1 computed without losing the carry out of the
// top bit.
uint_xlen_t intavg(uint_xlen_t rs1, uint_xlen_t rs2) {
     // Halve each operand before adding so the sum cannot wrap; the
     // rounding term is 1 whenever either operand has its low bit set.
     return (rs1 >> 1) + (rs2 >> 1) + ((rs1 | rs2) & 1);
}

// Unsigned absolute difference |rs1 - rs2|.
uint_xlen_t absdu(uint_xlen_t rs1, uint_xlen_t rs2) {
     // Always subtract the smaller operand from the larger one so the
     // unsigned subtraction never wraps.
     return (rs1 > rs2) ? (rs1 - rs2) : (rs2 - rs1);
}

// Unsigned absolute-difference accumulate: returns rs + |ra - rb|,
// with ra and rb compared as unsigned values.
uint_xlen_t uintabsacc(uint_xlen_t rs, uint_xlen_t ra, uint_xlen_t rb) {
     // Compute the difference separately: "rs + (c) ? x : y" would parse
     // as "(rs + c) ? x : y" because ?: binds more loosely than +.
     uint_xlen_t diff = (ra > rb) ? (ra - rb) : (rb - ra);
     return rs + diff;
}
// Signed absolute-difference accumulate: returns rs + |ra - rb|,
// with ra and rb compared as signed values.
uint_xlen_t intabsacc(uint_xlen_t rs, int_xlen_t ra, int_xlen_t rb) {
     // The comparison is signed, but the subtraction is carried out on
     // the unsigned bit patterns so that a large-magnitude difference
     // cannot trigger signed overflow.
     uint_xlen_t ua = (uint_xlen_t)ra;
     uint_xlen_t ub = (uint_xlen_t)rb;
     uint_xlen_t diff = (ra > rb) ? (ua - ub) : (ub - ua);
     // Kept separate from the addition: "rs + (c) ? x : y" would parse
     // as "(rs + c) ? x : y".
     return rs + diff;
}

# 1.6.27 Z23-FORM
    |0     |6     |11    |15 |16     |21 |23    |31 |
    | PO   |  RT  |   RA     |   RB  |sm |   XO |Rc |

n <- (RB)
m <- sm + 1
RT <- (n[m:XLEN-1] || [0]*m) + (RA)

shift <- sm + 1                # Shift is between 1-4
n <- EXTS((RB)[XLEN/2:XLEN-1]) # Only use lower XLEN/2-bits of RB
RT <- (n << shift) + (RA)      # Shift n, add RA

n <- ([0]*(XLEN/2)) || (RB)[XLEN/2:XLEN-1]
m <- sm + 1
RT <- (n[m:XLEN-1] || [0]*m) + (RA)

// Shift-and-add: RB is shifted left by (sm & 3) + 1 bit positions
// (i.e. 1 to 4) and RA is added to the result.
uint_xlen_t shadd(uint_xlen_t RA, uint_xlen_t RB, uint8_t sm) {
    const int shift = (sm & 0x3) + 1;
    uint_xlen_t shifted = RB << shift;
    return shifted + RA;
}

// Shift-and-add on the sign-extended lower half of RB: the low XLEN/2
// bits of RB are sign-extended to XLEN bits, shifted left by
// (sm & 3) + 1, and RA is added.
uint_xlen_t shaddw(uint_xlen_t RA, uint_xlen_t RB, uint8_t sm) {
    // Move the low half to the top, then shift it back down as a signed
    // value so the half-width sign bit is replicated.
    int_xlen_t top_aligned = (int_xlen_t)(RB << (XLEN / 2));
    uint_xlen_t n = top_aligned >> (XLEN / 2);
    const int shift = (sm & 0x3) + 1;
    return (n << shift) + RA;
}

// Shift-and-add on the zero-extended lower half of RB: the low XLEN/2
// bits of RB are kept, shifted left by (sm & 3) + 1, and RA is added.
uint_xlen_t shadduw(uint_xlen_t RA, uint_xlen_t RB, uint8_t sm) {
    // Derive the mask from XLEN (as shaddw does) instead of hard-coding
    // 0xFFFFFFFF; for XLEN == 64 the value is identical.
    uint_xlen_t low_half = (((uint_xlen_t)1) << (XLEN / 2)) - 1;
    uint_xlen_t n = RB & low_half;
    sm = sm & 0x3;
    return (n << (sm+1)) + RA;
}

def MASK(x, y):
    """Build a 64-bit mask from two bit positions.

    * ``x == y``: returns ``1 << x`` (not truncated to 64 bits).
    * ``x < y``: returns the bits strictly between ``x`` and ``y``
      (bits ``x+1`` .. ``y-1``).
    * ``x > y``: returns every bit of the 64-bit word outside the
      range ``y`` .. ``x``.

    NOTE(review): the ``x < y`` case excludes both end points while
    ``x == y`` includes the bit -- confirm this is the intended
    numbering convention.
    """
    word = (1 << 64) - 1
    if x == y:
        return 1 << x
    # bits 0..x
    lower = ((1 << (x + 1)) - 1) & word
    # bits 0..y-1, inverted for the wrap-around (x > y) case
    upper = (1 << y) - 1
    if x > y:
        upper = ~upper
    return lower ^ (upper & word)


uint_xlen_t bmset(RS, RB, sh)
{
    int shamt = RB & (XLEN - 1);
    mask = (2<<sh)-1;
    return RS | (mask << shamt);
}

uint_xlen_t bmclr(RS, RB, sh)
{
    int shamt = RB & (XLEN - 1);
    mask = (2<<sh)-1;
    return RS & ~(mask << shamt);
}

uint_xlen_t bminv(RS, RB, sh)
{
    int shamt = RB & (XLEN - 1);
    mask = (2<<sh)-1;
    return RS ^ (mask << shamt);
}

uint_xlen_t bmext(RS, RB, sh)
{
    int shamt = RB & (XLEN - 1);
    mask = (2<<sh)-1;
    return mask & (RS >> shamt);
}

msb = ra[5:0];
rev[0:msb] = rb[msb:0];
rt = ZE(rev[msb:0]);

uint_xlen_t bmrevi(RA, RB, sh)
{
    int shamt = XLEN-1;
    if (RA != 0) shamt = (GPR(RA) & (XLEN - 1));
    shamt = (XLEN-1)-shamt;  # shift other end
    brb = bitreverse(GPR(RB))     # swap LSB-MSB
    mask = (2<<sh)-1;
    return mask & (brb >> shamt);
}

// Register form of bmrevi: the field width comes from the low six bits
// of GPR(RC) instead of an immediate.
uint_xlen_t bmrev(RA, RB, RC) {
    return bmrevi(RA, RB, GPR(RC) & 0x3F);
}

sv.ori./elwid=8 r10.v, r10.v, 0

def lut2(imm, a, b):
    """Look up one bit of a 2-input truth table.

    ``b`` supplies bit 1 and ``a`` bit 0 of the table index; the
    result is the bit of ``imm`` at that index (LSB = index 0).
    """
    selector = (b << 1) | a
    return (imm >> selector) & 1

def dorow(imm8, step_i, chunk_size):
    """Apply one butterfly row of 2-input lookups across 64 bits.

    For every bit position, the two inputs are the bit at that position
    and the bit at the partner position (position XOR ``chunk_size``).
    Positions whose ``chunk_size`` bit is clear use the low nibble of
    ``imm8`` as the truth table; the others use ``imm8 >> 4``.
    Returns the 64 result bits packed into an integer.
    """
    lut_when_clear = imm8 & 0b1111
    lut_when_set = imm8 >> 4
    out = 0
    for pos in range(64):
        table = lut_when_set if pos & chunk_size else lut_when_clear
        own_bit = (step_i >> pos) & 1
        partner_bit = (step_i >> (pos ^ chunk_size)) & 1
        out |= lut2(table, own_bit, partner_bit) << pos
    return out

def grevlut64(RA, RB, imm, iv):
    """Run the 64-bit generalized-reverse-with-LUT network.

    The starting value is ``RA``, or the constant 0x5555555555555555
    when ``RA`` is None; it is bitwise-inverted first when ``iv`` is
    true.  For each of the six bits set in ``RB & 63`` the matching
    ``dorow`` stage (chunk sizes 1, 2, 4, ... 32, in that order) is
    applied with ``imm`` as the lookup table.  The result is truncated
    to 64 bits.
    """
    value = 0x5555555555555555 if RA is None else RA
    if iv:
        value = ~value
    stages = RB & 63
    for level in range(6):
        stride = 1 << level
        if stages & stride:
            value = dorow(imm, value, stride)
    return value & ((1 << 64) - 1)

// Pseudocode (not compilable C) for the register-driven grevlut
// variant: each stage takes its own 8-bit lookup immediate from
// successive bytes of RB rather than sharing one immediate.
// NOTE(review): dorow is called here with four arguments, whereas the
// Python dorow in this document takes three -- confirm the intended
// signature.
// NOTE(review): whether the upper bound of "0 to (6-is32b)" is
// inclusive is not stated -- confirm the stage count.
uint64_t grevlutr(uint64_t RA, uint64_t RB, bool iv, bool is32b)
{
    // Default starting pattern; replaced by the register value when
    // RA is non-zero.
    uint64_t x = 0x5555_5555_5555_5555;
    if (RA != 0) x = GPR(RA);
    // Optional inversion of the starting value.
    if (iv) x = ~x;
    for i in 0 to (6-is32b)
        step = 1<<i
        // Byte i of RB supplies the lookup immediate for stage i.
        imm = (RB>>(i*8))&0xff
        x = dorow(imm, x, step, is32b)
    return x;
}

// Immediate form of xperm: the 8-bit immediate is replicated into every
// byte of an XLEN-wide index word, which then selects elements of RB
// exactly as xperm does. Elements are (1 << sz_log2) bits wide; an
// index that points past the end of RB yields zero for that element.
uint_xlen_t xpermi(uint8_t imm8, uint_xlen_t RB, int sz_log2)
{
    uint_xlen_t r = 0;
    uint_xlen_t sz = 1LL << sz_log2;
    uint_xlen_t mask = (1LL << sz) - 1;
    // Replicate imm8 into each byte lane; the cast keeps the shift in
    // uint_xlen_t width rather than in promoted int.
    uint_xlen_t RA = 0;
    for (int lane = 0; lane < XLEN; lane += 8)
        RA |= ((uint_xlen_t)imm8) << lane;
    for (int i = 0; i < XLEN; i += sz) {
        // Element index scaled to a bit offset within RB.
        uint_xlen_t pos = ((RA >> i) & mask) << sz_log2;
        if (pos < XLEN)
            r |= ((RB >> pos) & mask) << i;
    }
    return r;
}
// Generalized element permute: RA and RB are treated as vectors of
// (1 << sz_log2)-bit elements. Each element of RA is an index selecting
// which element of RB is copied to that position of the result; an
// index that points past the end of RB yields zero for that element.
uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
{
    uint_xlen_t width = 1LL << sz_log2;
    uint_xlen_t lane_mask = (1LL << width) - 1;
    uint_xlen_t result = 0;
    int i = 0;
    while (i < XLEN) {
        uint_xlen_t index = (RA >> i) & lane_mask;
        // Scale the element index to a bit offset within RB.
        uint_xlen_t src_bit = index << sz_log2;
        if (src_bit < XLEN) {
            uint_xlen_t element = (RB >> src_bit) & lane_mask;
            result |= element << i;
        }
        i += width;
    }
    return result;
}
// xperm on 4-bit elements (sz_log2 = 2).
uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
{
    const int sz_log2 = 2;
    return xperm(RA, RB, sz_log2);
}
// xperm on 8-bit elements (sz_log2 = 3).
uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
{
    const int sz_log2 = 3;
    return xperm(RA, RB, sz_log2);
}
// xperm on 16-bit elements (sz_log2 = 4).
uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
{
    const int sz_log2 = 4;
    return xperm(RA, RB, sz_log2);
}
// xperm on 32-bit elements (sz_log2 = 5).
uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
{
    const int sz_log2 = 5;
    return xperm(RA, RB, sz_log2);
}

// Applies shfl64 with control value 31 three times in succession to RA
// and returns the result. Callers in this document use it to obtain the
// transpose of an 8x8 bit matrix.
uint64_t bmatflip(uint64_t RA)
{
    uint64_t x = RA;
    for (int pass = 0; pass < 3; pass++)
        x = shfl64(x, 31);
    return x;
}

// 8x8 bit-matrix multiply over XOR with an immediate: result bit
// (row r, column c) is the parity of (row r of RS AND column c of RA),
// XORed with bit c of imm.
uint64_t bmatxori(uint64_t RS, uint64_t RA, uint8_t imm) {
    // Transposing RA turns its columns into bytes.
    uint64_t RAt = bmatflip(RA);
    uint8_t rows[8]; // rows of RS
    uint8_t cols[8]; // cols of RA
    for (int k = 0; k < 8; k++) {
        rows[k] = RS >> (k*8);
        cols[k] = RAt >> (k*8);
    }
    uint64_t acc = 0;
    for (int r = 0; r < 8; r++) {
        for (int c = 0; c < 8; c++) {
            uint64_t bit = (imm >> c) & 1;
            bit ^= pcnt(rows[r] & cols[c]) & 1;
            acc |= bit << (r*8 + c);
        }
    }
    return acc;
}

// Register form of the XOR bit-matrix multiply: forwards to bmatxori
// with an immediate of 0xff.
// NOTE(review): with imm = 0xff every result bit is XORed with 1 inside
// bmatxori; confirm that 0xff (rather than 0) is the intended constant.
uint64_t bmatxor(uint64_t RA, uint64_t RB) {
    return bmatxori(RA, RB, 0xff);
}

// 8x8 bit-matrix multiply over OR: result bit (row r, column c) is set
// when row r of RA and column c of RB share at least one set bit.
uint64_t bmator(uint64_t RA, uint64_t RB) {
    // Transposing RB turns its columns into bytes.
    uint64_t RBt = bmatflip(RB);
    uint8_t rows[8]; // rows of RA
    uint8_t cols[8]; // cols of RB
    for (int k = 0; k < 8; k++) {
        rows[k] = RA >> (k*8);
        cols[k] = RBt >> (k*8);
    }
    uint64_t acc = 0;
    for (int r = 0; r < 8; r++) {
        for (int c = 0; c < 8; c++) {
            if ((rows[r] & cols[c]) != 0)
                acc |= 1LL << (r*8 + c);
        }
    }
    return acc;
}

// 8x8 bit-matrix multiply over AND: result bit (row r, column c) is set
// only when the AND of row r of RA with column c of RB is 0xff, i.e.
// all eight bit positions are set in both.
uint64_t bmatand(uint64_t RA, uint64_t RB) {
    // Transposing RB turns its columns into bytes.
    uint64_t RBt = bmatflip(RB);
    uint8_t rows[8]; // rows of RA
    uint8_t cols[8]; // cols of RB
    for (int k = 0; k < 8; k++) {
        rows[k] = RA >> (k*8);
        cols[k] = RBt >> (k*8);
    }
    uint64_t acc = 0;
    for (int r = 0; r < 8; r++) {
        for (int c = 0; c < 8; c++) {
            if ((rows[r] & cols[c]) == 0xff)
                acc |= 1LL << (r*8 + c);
        }
    }
    return acc;
}


clmadd RT, RA, RB, RC

(RT) = clmul((RA), (RB)) ^ (RC)

cltmadd RT, RA, RB, RC

a = (RA)
c = (RC)
# read all inputs before writing to any outputs in case
# an input overlaps with an output register.
(RT) = clmul(a, (RB)) ^ c
(RS) = a ^ c


cldiv RT, RA, RB

n = (RA)
d = (RB)
q, r = cldivrem(n, d, width=XLEN)
(RT) = q

clrem RT, RA, RB

n = (RA)
d = (RB)
q, r = cldivrem(n, d, width=XLEN)
(RT) = r


gfbmul RT, RA, RB


gfbmadd RT, RA, RB, RC


gfbtmadd RT, RA, RB, RC

a = (RA)
c = (RC)
# read all inputs before writing to any outputs in case
# an input overlaps with an output register.
(RT) = gfbmadd(a, (RB), c)
# use gfbmadd again since it reduces the result
(RS) = gfbmadd(a, 1, c) # "a * 1 + c"

gfbinv RT, RA


gfpadd RT, RA, RB


gfpsub RT, RA, RB


gfpmul RT, RA, RB


gfpinv RT, RA


gfpmadd RT, RA, RB, RC


gfpmsub RT, RA, RB, RC


gfpmsubr RT, RA, RB, RC


gfpmaddsubr RT, RA, RB, RC

factor1 = (RA)
factor2 = (RB)
term = (RC)
# read all inputs before writing to any outputs in case
# an input overlaps with an output register.
(RT) = gfpmadd(factor1, factor2, term)
(RS) = gfpmsubr(factor1, factor2, term)

// Bitwise multiplexer: each result bit comes from RA where the
// corresponding bit of RB is 1 and from RC where it is 0.
uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
    uint_xlen_t taken_from_ra = RA & RB;
    uint_xlen_t taken_from_rc = RC & ~RB;
    return taken_from_ra | taken_from_rc;
}

count = 0
do i = 0 to 63
    if((RB)i=1) then do
        if((RS)i=1) then break
        count ← count + 1
    end
end
RA ← EXTZ64(count)

do while(m < 64)
   if VSR[VRB+32].dword[i].bit[63-m]=1 then do
      result = VSR[VRA+32].dword[i].bit[63-k]
      VSR[VRT+32].dword[i].bit[63-m] = result
      k = k + 1
   m = m + 1


// Bit deposit: successive low-order bits of RA are scattered to the
// positions where RB has a 1 bit; every other result bit is 0.
uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t out = 0;
    int src = 0;  // next bit of RA to consume
    for (int dst = 0; dst < XLEN; dst++) {
        if (((RB >> dst) & 1) == 0)
            continue;
        if ((RA >> src) & 1)
            out |= uint_xlen_t(1) << dst;
        src++;
    }
    return out;
}

// Bit extract: the bits of RA at positions where RB has a 1 bit are
// gathered, in order, into the low-order bits of the result.
uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t out = 0;
    int dst = 0;  // next result bit to fill
    for (int src = 0; src < XLEN; src++) {
        if (((RB >> src) & 1) == 0)
            continue;
        if ((RA >> src) & 1)
            out |= uint_xlen_t(1) << dst;
        dst++;
    }
    return out;
}

ptr0 = 0
ptr1 = 0
do i = 0 to 63
    if((RB)i=0) then do
       resultptr0 = (RS)i
       ptr0 = ptr0 + 1
    end
    if((RB)63-i=1) then do
        result63-ptr1 = (RS)63-i
        ptr1 = ptr1 + 1
    end
RA = result

do j = 0 to 7
  do k = 0 to 7
     b = VSR[VRB+32].dword[i].byte[k].bit[j]
     VSR[VRT+32].dword[i].byte[j].bit[k] = b

// Generalized bit reverse (64-bit): for each set bit k of (RB & 63),
// adjacent groups of 2^k bits are swapped. Stages run from the 1-bit
// swap up to the 32-bit swap.
uint64_t grev64(uint64_t RA, uint64_t RB)
{
    // Mask selecting the lower group of each pair, per stage; the upper
    // group is its complement.
    static const uint64_t lower_group[6] = {
        0x5555555555555555ULL, 0x3333333333333333ULL,
        0x0F0F0F0F0F0F0F0FULL, 0x00FF00FF00FF00FFULL,
        0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL,
    };
    uint64_t value = RA;
    const int stages = RB & 63;
    for (int level = 0; level < 6; level++) {
        const int dist = 1 << level;
        if (!(stages & dist))
            continue;
        const uint64_t lo = lower_group[level];
        value = ((value & lo) << dist) | ((value & ~lo) >> dist);
    }
    return value;
}

// Generalized OR-combine (32-bit): for each set bit k of (RB & 31),
// every group of 2^k bits is ORed with its neighbouring group, so set
// bits spread across both halves of each pair.
uint32_t gorc32(uint32_t RA, uint32_t RB)
{
    // Mask selecting the lower group of each pair, per stage; the upper
    // group is its complement.
    static const uint32_t lower_group[5] = {
        0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF,
    };
    uint32_t acc = RA;
    const int stages = RB & 31;
    for (int level = 0; level < 5; level++) {
        const int dist = 1 << level;
        if (stages & dist) {
            const uint32_t lo = lower_group[level];
            acc |= ((acc & lo) << dist) | ((acc & ~lo) >> dist);
        }
    }
    return acc;
}
// Generalized OR-combine (64-bit): for each set bit k of (RB & 63),
// every group of 2^k bits is ORed with its neighbouring group, so set
// bits spread across both halves of each pair.
uint64_t gorc64(uint64_t RA, uint64_t RB)
{
    // Mask selecting the lower group of each pair, per stage; the upper
    // group is its complement.
    static const uint64_t lower_group[6] = {
        0x5555555555555555ULL, 0x3333333333333333ULL,
        0x0F0F0F0F0F0F0F0FULL, 0x00FF00FF00FF00FFULL,
        0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL,
    };
    uint64_t acc = RA;
    const int stages = RB & 63;
    for (int level = 0; level < 6; level++) {
        const int dist = 1 << level;
        if (stages & dist) {
            const uint64_t lo = lower_group[level];
            acc |= ((acc & lo) << dist) | ((acc & ~lo) >> dist);
        }
    }
    return acc;
}

26-28	29.30	31	name	Form
	0 0	Rc	ternlogi	TLI-Form
0 0 0	0 1		crfternlogi	CRB-Form
0 0 1	0 1		rsvd	rsvd
0 1 /	0 1	/	svshape4	SVI2-Form
1 0 n	0 1		rsvd	rsvd
1 1 n	0 1		rsvd	rsvd
	1 iv	1	grevlogi	TLI-Form
0 n n	1 0	0	madd/sub	A-Form
	1 1	0	crternlogi	TLI-Form

0.5	6.10	11.15	16.20	21..25	26....30	name	Form
NN	RT	RA	RB	RC	/ 00 10	maddsubrs	A-Form
NN	RT	RA	RB	RC	/ 01 10	maddrs	A-Form
NN	RT	RA	RB	RC	/ 10 10	msubrs	A-Form

28.30	31	name
-00	0	xpermi
-00	1	binary lut
-01	0	grevlog
-01	1	swizzle mv/fmv
010	Rc	bitmask
011		SVP64
110	Rc	1/2-op
111		bmrevi

0.5	6.10	11.15	16.20	21..25	26....30	31	name	Form
NN	RT	RA	it/im57	im0-4	0 00 00	0	xpermi	TODO-Form
NN					- 10 00	0	svshape3	rsvd
NN					- 11 00	0	svshape4	rsvd
NN	RT	RA	RB	RC	nh 00 00	1	binlut	VA-Form
NN	RT	RA	RB	/BFA/	0 01 00	1	bincrflut	VA-Form
NN					1 01 00	1	svindex	SVI-Form
NN	RT	RA	RB	mode	L 10 00	1	bmask	BM2-Form
NN					0 11 00	1	svshape	SVM-Form
NN					1 11 00	1	svremap	SVRM-Form
NN	RT	RA	RB	im0-4	im5-7 01	0	grevlog	TLI-Form
NN					- -- 01	1	swizzle mv/f	TODO
NN	RT	RA	RB	RC	mode 010	Rc	bitmask*	VA2-Form
NN	FRS	d1	d0	d0	00 011	d2	fmvis	DX-Form
NN	FRS	d1	d0	d0	01 011	d2	fishmv	DX-Form
NN					10 011	Rc	svstep	SVL-Form
NN					11 011	Rc	setvl	SVL-Form
NN					---- 110		1/2 ops	other table [1]
NN	RT	RA	RB	RC	11 110	Rc	bmrev	VA2-Form
NN	RT	RA	RB	sh0-4	sh5 1 111	Rc	bmrevi	MDS-Form

0.5	6.10	11.15	16.20	21..25	26..31	Form
NN	RT	RA	RB	RC	nh 00001	VA-Form
NN	RT	RA	RB	/BFA/	0 01001	VA-Form

dest	src1	src2	subop	op
RT	RA	RB	or	bmatflip
RT	RA	RB	xor	bmatflip
RT	RA	RB		grev
RT	RA	RB		clmul*
RT	RA	RB		gorc
RT	RA	RB	shuf	shuffle
RT	RA	RB	unshuf	shuffle
RT	RA	RB	width	xperm
RT	RA	RB	MMM	`minmax`
RT	RA	RB		av abs avgadd
RT	RA	RB	type	vmask ops
RT	RA	RB	type	abs accumulate (overwrite)

0.5	6.10	11.15	16.20	21	22.23	24....30	31	name	Form
NN	RS	me	sh	SH	ME 0	nn00 110	Rc	bmopsi	BM-Form
NN	RS	RA	sh	SH	0 1	nn00 110	Rc	bmopsi	XB-Form
NN	RS	RA	im04	im5	1 1	im67 00 110	Rc	bmatxori	TODO
NN	RT	RA	RB	1	00	0001 110	Rc	cldiv	X-Form
NN	RT	RA	RB	1	01	0001 110	Rc	clmod	X-Form
NN	RT	RA		1	10	0001 110	Rc	clmulh	X-Form
NN	RT	RA	RB	1	11	0001 110	Rc	clmul	X-Form
NN	RT	RA	RB	0	00	0001 110	Rc	rsvd
NN	RT	RA	RB	0	01	0001 110	Rc	rsvd
NN	RT	RA	RB	0	10	0001 110	Rc	rsvd
NN	RT	RA	RB	0	11	0001 110	Rc	vec cprop	X-Form
NN					00	0101 110	0	crfbinlog	CRB-Form
NN	BT	BA	BFB//	0	00	0101 110	1	crbinlog	X-Form
NN				1	00	0101 110	1	rsvd
NN					10	0101 110	Rc	rsvd
NN	RT	RA	RB	sm0	sm1 1	0101 110	Rc	shaddw	X-Form
NN				0		1001 110	Rc	rsvd
NN	RT	RA	RB	1	00	1001 110	Rc	av abss	X-Form
NN	RT	RA	RB	1	01	1001 110	Rc	av absu	X-Form
NN	RT	RA	RB	1	10	1001 110	Rc	av avgadd	X-Form
NN	RT	RA	RB	1	11	1001 110	Rc	grevlutr	X-Form
NN	RT	RA	RB	sm0	sm1 0	1101 110	Rc	shadd	X-Form
NN	RT	RA	RB	sm0	sm1 1	1101 110	Rc	shadduw	X-Form
NN	RT	RA	RB	0	00	0010 110	Rc	rsvd
NN	RS	RA	sh	SH	00	1010 110	Rc	rsvd
NN	RT	RA	RB	0	00	0110 110	Rc	rsvd
NN	RS	RA	SH	0	00	1110 110	Rc	rsvd
NN	RT	RA	RB	1	00	1110 110	Rc	absds	X-Form
NN	RT	RA	RB	0	01	0010 110	Rc	rsvd
NN	RT	RA	RB	1	01	0010 110	Rc	clmulr	X-Form
NN	RS	RA	sh	SH	01	1010 110	Rc	rsvd
NN	RT	RA	RB	0	01	0110 110	Rc	rsvd
NN	RS	RA	SH	0	01	1110 110	Rc	rsvd
NN	RT	RA	RB	1	01	1110 110	Rc	absdu	X-Form
NN	RS	RA	RB	0	10	0010 110	Rc	bmator	X-Form
NN	RS	RA	RB	0	10	0110 110	Rc	bmatand	X-Form
NN	RS	RA	RB	0	10	1010 110	Rc	bmatxor	X-Form
NN	RS	RA		0	10	1110 110		bmatflip	X-Form
NN	RT	RA	RB	1	10	0010 110	Rc	xpermn	X-Form
NN	RT	RA	RB	1	10	0110 110	Rc	xpermb	X-Form
NN	RT	RA	RB	1	10	1010 110	Rc	xpermh	X-Form
NN	RT	RA	RB	1	10	1110 110	Rc	xpermw	X-Form
NN	RT	RA	RB	0	11	1110 110	Rc	absdacs	X-Form
NN	RT	RA	RB	1	11	1110 110	Rc	absdacu	X-Form
NN						--11 110	Rc	bmrev	VA2-Form

RA=0	RB	imm	iv	result
0x555..	0b10	0b01101100	0	0x111111...
0x555..	0b110	0b01101100	0	0x010101...
0x555..	0b1110	0b01101100	0	0x00010001...
0x555..	0b10	0b11000110	1	0x88888...
0x555..	0b110	0b11000110	1	0x808080...
0x555..	0b1110	0b11000110	1	0x80008000...

Implementation Log

bitmanipulation

Draft Opcode tables

binary and ternary bitops

ternlogi

binlut

crternlogi

crbinlog

int ops

min/max

average

absdu

abs-accumulate

shift-and-add

bitmask set

grevlut

xperm

bitmatrix

Introduction to Carry-less and GF arithmetic

Instructions for Carry-less Operations

Carry-less Multiply Instructions

clmul Carry-less Multiply

clmulh Carry-less Multiply High

clmulr Carry-less Multiply (Reversed)

clmadd Carry-less Multiply-Add

cltmadd Twin Carry-less Multiply-Add (for FFTs)

cldivrem Carry-less Division and Remainder

cldiv Carry-less Division

clrem Carry-less Remainder

Instructions for Binary Galois Fields GF(2^m)

GFBREDPOLY SPR -- Reducing Polynomial

gfbredpoly -- Set the Reducing Polynomial SPR GFBREDPOLY

gfbmul -- Binary Galois Field GF(2^m) Multiplication

gfbmadd -- Binary Galois Field GF(2^m) Multiply-Add

gfbtmadd -- Binary Galois Field GF(2^m) Twin Multiply-Add (for FFT)

gfbinv -- Binary Galois Field GF(2^m) Inverse

Instructions for Prime Galois Fields GF(p)

GFPRIME SPR -- Prime Modulus For gfp* Instructions

gfpadd Prime Galois Field GF(p) Addition

gfpsub Prime Galois Field GF(p) Subtraction

gfpmul Prime Galois Field GF(p) Multiplication

gfpinv Prime Galois Field GF(p) Invert

gfpmadd Prime Galois Field GF(p) Multiply-Add

gfpmsub Prime Galois Field GF(p) Multiply-Subtract

gfpmsubr Prime Galois Field GF(p) Multiply-Subtract-Reversed

gfpmaddsubr Prime Galois Field GF(p) Multiply-Add and Multiply-Sub-Reversed (for FFT)

Already in POWER ISA or subsumed

cmix

count leading/trailing zeros with mask

bit deposit

bit extract

centrifuge

bit to byte permute

grev

gorc

Appendix

`clmul` Carry-less Multiply

`clmulh` Carry-less Multiply High

`clmulr` Carry-less Multiply (Reversed)

`clmadd` Carry-less Multiply-Add

`cltmadd` Twin Carry-less Multiply-Add (for FFTs)

`cldivrem` Carry-less Division and Remainder

`cldiv` Carry-less Division

`clrem` Carry-less Remainder

Instructions for Binary Galois Fields `GF(2^m)`

`GFBREDPOLY` SPR -- Reducing Polynomial

`gfbredpoly` -- Set the Reducing Polynomial SPR `GFBREDPOLY`

`gfbmul` -- Binary Galois Field `GF(2^m)` Multiplication

`gfbmadd` -- Binary Galois Field `GF(2^m)` Multiply-Add

`gfbtmadd` -- Binary Galois Field `GF(2^m)` Twin Multiply-Add (for FFT)

`gfbinv` -- Binary Galois Field `GF(2^m)` Inverse

Instructions for Prime Galois Fields `GF(p)`

`GFPRIME` SPR -- Prime Modulus For `gfp*` Instructions

`gfpadd` Prime Galois Field `GF(p)` Addition

`gfpsub` Prime Galois Field `GF(p)` Subtraction

`gfpmul` Prime Galois Field `GF(p)` Multiplication

`gfpinv` Prime Galois Field `GF(p)` Invert

`gfpmadd` Prime Galois Field `GF(p)` Multiply-Add

`gfpmsub` Prime Galois Field `GF(p)` Multiply-Subtract

`gfpmsubr` Prime Galois Field `GF(p)` Multiply-Subtract-Reversed

`gfpmaddsubr` Prime Galois Field `GF(p)` Multiply-Add and Multiply-Sub-Reversed (for FFT)