summary

1-op and variants

dest src1 subop op
RT RA .. bmatflip
RT RA size crc32
RT RA size crc32c

2-op and variants

dest src1 src2 subop op
RT RA RB or bmatflip
RT RA RB xor bmatflip
RT RA RB bdep dep/ext
RT RA RB bext dep/ext
RT RA RB grev
RT RA RB gorc
RT RA RB shuf shuffle
RT RA RB unshuf shuffle
RT RA RB width xperm
RT RA RB type clmul
RT RA RB type minmax
RT RA RB
RT RA RB
RT RA RB

3 ops

  • bitmask set/extract
  • ternary bitops

ops

0.5 6.10 11.15 16.20 21.22 23 24..30 31 name
NN RA RB 0 0000110 Rc rsvd
NN RA RB RC itype 1 0000110 Rc xperm
NN RA RB RC itype 0 0100110 Rc minmax
NN RA RB 1 0100110 Rc rsvd
NN RA RB sh itype SH 1000110 Rc bmopsi
NN RA RB 1100110 Rc rsvd
NN RA RB RC itype 0 0001110 Rc clmul
NN RA RB sh itype 0 0101110 Rc clmulw
NN RA RB RC 00 0 0010110 Rc gorc
NN RA RB sh 00 SH 1010110 Rc gorci
NN RA RB RC 00 0 0110110 Rc gorcw
NN RA RB sh 00 0 1110110 Rc gorcwi
NN RA RB RC 00 1 1110110 Rc bmator
NN RA RB RC 01 0 0010110 Rc grev
NN RA RB sh 01 SH 1010110 Rc grevi
NN RA RB RC 01 0 0110110 Rc grevw
NN RA RB sh 01 0 1110110 Rc grevwi
NN RA RB RC 01 1 1110110 Rc bmatxor
NN RA RB RC 10 0 0010110 Rc shfl
NN RA RB sh 10 SH 1010110 Rc shfli
NN RA RB RC 10 0 0110110 Rc shflw
NN RA RB RC 10 0 1110110 Rc bdep
NN RA RB RC 10 1 1110110 Rc bext
NN RA RB 11 1110110 Rc rsvd
NN RA RB NN11110 Rc rsvd

bit to byte permute

similar to matrix permute in RV bitmanip, which has XOR and OR variants

do j = 0 to 7
  do k = 0 to 7
     b = VSR[VRB+32].dword[i].byte[k].bit[j]
     VSR[VRT+32].dword[i].byte[j].bit[k] = b

vector bit deposit

vpdepd VRT,VRA,VRB, identical to RV bitmanip bdep

do while(m < 64)
   if VSR[VRB+32].dword[i].bit[63-m]=1 then do
      result = VSR[VRA+32].dword[i].bit[63-k]
      VSR[VRT+32].dword[i].bit[63-m] = result
      k = k + 1
   m = m + 1

// Bit deposit: walk the mask RB from bit 0 upwards and, for every set mask
// bit, copy the next not-yet-consumed low-order bit of RA into that position
// of the result.  Result bits whose mask bit is clear stay 0.
//   RA      - supplies the packed low-order bits
//   RB      - mask selecting the destination bit positions
//   returns - the low bits of RA scattered to the set positions of RB
uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t r = 0;
    // i indexes the mask/result bit, j the next unconsumed bit of RA
    for (int i = 0, j = 0; i < XLEN; i++) {
        if ((RB >> i) & 1) {
            // C-style cast: the functional form uint_xlen_t(1) is C++ only
            if ((RA >> j) & 1)
                r |= (uint_xlen_t)1 << i;
            j++;
        }
    }
    return r;
}

vector bit extract

other way round: identical to RV bext

// Bit extract: walk the mask RB from bit 0 upwards and, for every set mask
// bit, copy the bit of RA at that same position into the next free
// low-order bit of the result.  The remaining high result bits stay 0.
//   RA      - value whose selected bits are gathered
//   RB      - mask selecting which bit positions of RA to take
//   returns - the selected bits of RA packed towards bit 0
uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t r = 0;
    // i indexes the mask/source bit, j the next free bit of the result
    for (int i = 0, j = 0; i < XLEN; i++) {
        if ((RB >> i) & 1) {
            // C-style cast: the functional form uint_xlen_t(1) is C++ only
            if ((RA >> i) & 1)
                r |= (uint_xlen_t)1 << j;
            j++;
        }
    }
    return r;
}

int min/max

Signed and unsigned min/max for integers. This is sort-of partly synthesisable in SVP64 with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned. When the dest is also one of the sources and the mv fails due to the CR bit-test failing, this will only overwrite the dest where the src is greater (or less).

signed/unsigned min/max gives more flexibility.

ternary bitops

Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8bit immediate, or in another register

0.5 6.10 11.15 16.20 21..25 26..30 31
NN RT RA RB im0-4 im5-7 00 Rc
for i in range(64):
    idx = RT[i] << 2 | RA[i] << 1 | RB[i]
    RT[i] = (imm & (1<<idx)) != 0

bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.

a 4 operand variant which becomes more along the lines of an FPGA:

0.5 6.10 11.15 16.20 21.25 26..30 31
NN RT RA RB RC mode 1 1
for i in range(64):
    idx = RT[i] << 2 | RA[i] << 1 | RB[i]
    RT[i] = (RC & (1<<idx)) != 0

mode (2 bit) may be used to do inversion of ordering, similar to carryless mul.

also, another possible variant involving swizzle and vec4:

for i in range(8):
    idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
    RT[i] = (RA.w[i] & (1<<idx)) != 0
0.5 6.10 11.15 16.23 24.27 28.30 31
NN RT RA xyzw mask mode 1 1
for i in range(8):
    idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]   
    res = (RA.w[i] & (1<<idx)) != 0
    for j in range(4):
         if mask[j]: RT[i+j*8] = res
0.5 6.10 11.15 16.23 24.27 28.30 31
NN RT RA imm mask mode 1 1
for i in range(8):
    idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]   
    res = (imm & (1<<idx)) != 0
    for j in range(4):
         if mask[j]: RT[i+j*8] = res

another mode selection would be CRs not Ints.

0.5 6.8 9.11 12.14 15.17 18.20 21..25 26.29 30 31
NN BT BA BB BC im5-7 im0-4 mask 1 0
for i in range(4):
    if not mask[i]: continue
    idx = crregs[BA][i] << 2 |
          crregs[BB][i] << 1 |
          crregs[BC][i]
    crregs[BT][i] = (imm & (1<<idx)) != 0

bitmask set

based on RV bitmanip singlebit set, instruction format similar to shift fixedshift. bmext is actually covered already (shift-with-mask). however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.

0.5 6.10 11.15 16.20 21.25 26..30 31
NN RT RA RB RC mode 010 Rc
// Bitmask set: OR a run of (sh + 1) one-bits into RA, the run starting at
// bit position RB & (XLEN - 1).
//   RA      - value to modify
//   RB      - start position of the run (reduced with XLEN - 1)
//   sh      - run length minus one (sh = 0 affects a single bit)
//   returns - RA with the selected run of bits set
// NOTE(review): the original left the parameters untyped; the types chosen
// here are an assumption to be confirmed against the final encoding.
uint_xlen_t bmset(uint_xlen_t RA, uint_xlen_t RB, int sh)
{
    int shamt = RB & (XLEN - 1);
    // build the mask at register width: a plain int "2 << sh" overflows
    // once sh reaches 30 (assumes uint_xlen_t is an unsigned type)
    uint_xlen_t mask = ((uint_xlen_t)2 << sh) - 1;
    return RA | (mask << shamt);
}

// Bitmask clear: clear a run of (sh + 1) bits in RA, the run starting at
// bit position RB & (XLEN - 1).
//   RA      - value to modify
//   RB      - start position of the run (reduced with XLEN - 1)
//   sh      - run length minus one (sh = 0 affects a single bit)
//   returns - RA with the selected run of bits cleared
// NOTE(review): the original left the parameters untyped; the types chosen
// here are an assumption to be confirmed against the final encoding.
uint_xlen_t bmclr(uint_xlen_t RA, uint_xlen_t RB, int sh)
{
    int shamt = RB & (XLEN - 1);
    // build the mask at register width: a plain int "2 << sh" overflows
    // once sh reaches 30 (assumes uint_xlen_t is an unsigned type)
    uint_xlen_t mask = ((uint_xlen_t)2 << sh) - 1;
    return RA & ~(mask << shamt);
}

// Bitmask invert: flip a run of (sh + 1) bits in RA, the run starting at
// bit position RB & (XLEN - 1).
//   RA      - value to modify
//   RB      - start position of the run (reduced with XLEN - 1)
//   sh      - run length minus one (sh = 0 affects a single bit)
//   returns - RA with the selected run of bits inverted
// NOTE(review): the original left the parameters untyped; the types chosen
// here are an assumption to be confirmed against the final encoding.
uint_xlen_t bminv(uint_xlen_t RA, uint_xlen_t RB, int sh)
{
    int shamt = RB & (XLEN - 1);
    // build the mask at register width: a plain int "2 << sh" overflows
    // once sh reaches 30 (assumes uint_xlen_t is an unsigned type)
    uint_xlen_t mask = ((uint_xlen_t)2 << sh) - 1;
    return RA ^ (mask << shamt);
}

// Bitmask extract: shift RA right by RB & (XLEN - 1) and keep only the low
// (sh + 1) bits of the shifted value.
//   RA      - value to extract from
//   RB      - position of the lowest extracted bit (reduced with XLEN - 1)
//   sh      - field width minus one (sh = 0 extracts a single bit)
//   returns - the extracted field, right-aligned
// NOTE(review): the original left the parameters untyped; the types chosen
// here are an assumption to be confirmed against the final encoding.
uint_xlen_t bmext(uint_xlen_t RA, uint_xlen_t RB, int sh)
{
    int shamt = RB & (XLEN - 1);
    // build the mask at register width: a plain int "2 << sh" overflows
    // once sh reaches 30 (assumes uint_xlen_t is an unsigned type)
    uint_xlen_t mask = ((uint_xlen_t)2 << sh) - 1;
    return mask & (RA >> shamt);
}

grev

based on RV bitmanip

// Generalized bit reverse on 64 bits.  Each of the low six bits of RB
// enables one butterfly stage: stage k swaps adjacent groups of 2^k bits.
//   RA      - value to permute
//   RB      - stage-enable control; only bits 0..5 are used
//   returns - RA after all enabled swap stages, applied narrowest first
uint64_t grev64(uint64_t RA, uint64_t RB)
{
    // low_lanes[k] selects the lower group of every pair of 2^k-bit groups
    static const uint64_t low_lanes[6] = {
        0x5555555555555555ULL, 0x3333333333333333ULL,
        0x0F0F0F0F0F0F0F0FULL, 0x00FF00FF00FF00FFULL,
        0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL,
    };
    uint64_t value = RA;
    unsigned control = (unsigned)(RB & 63);

    for (unsigned stage = 0; stage < 6; stage++) {
        unsigned dist = 1u << stage;
        if ((control & dist) == 0)
            continue;
        uint64_t lo = low_lanes[stage];
        // lower groups move up, upper groups move down
        value = ((value & lo) << dist) | ((value & ~lo) >> dist);
    }
    return value;
}

shuffle / unshuffle

based on RV bitmanip

// Generalized shuffle of a 32-bit word.  Each of the low four bits of RB
// enables one shuffle32_stage() step; steps run from distance 8 down to 1.
// NOTE(review): shuffle32_stage is not defined in the visible text;
// presumably it is the 32-bit counterpart of shuffle64_stage - confirm.
//   RA      - value to permute
//   RB      - stage-enable control; only bits 0..3 are used
//   returns - RA after all enabled stages
uint32_t shfl32(uint32_t RA, uint32_t RB)
{
    // per-stage mask pairs, ordered for distances 8, 4, 2, 1
    static const uint32_t left_masks[4] = {
        0x00ff0000, 0x0f000f00, 0x30303030, 0x44444444,
    };
    static const uint32_t right_masks[4] = {
        0x0000ff00, 0x00f000f0, 0x0c0c0c0c, 0x22222222,
    };
    uint32_t word = RA;
    int control = RB & 15;

    for (int step = 0; step < 4; step++) {
        int dist = 8 >> step;
        if (control & dist)
            word = shuffle32_stage(word, left_masks[step], right_masks[step], dist);
    }
    return word;
}
// Generalized unshuffle of a 32-bit word: the same stages as shfl32 but
// applied in the opposite order, from distance 1 up to 8.
// NOTE(review): shuffle32_stage is not defined in the visible text;
// presumably it is the 32-bit counterpart of shuffle64_stage - confirm.
//   RA      - value to permute
//   RB      - stage-enable control; only bits 0..3 are used
//   returns - RA after all enabled stages
uint32_t unshfl32(uint32_t RA, uint32_t RB)
{
    // per-stage mask pairs, ordered for distances 1, 2, 4, 8
    static const uint32_t left_masks[4] = {
        0x44444444, 0x30303030, 0x0f000f00, 0x00ff0000,
    };
    static const uint32_t right_masks[4] = {
        0x22222222, 0x0c0c0c0c, 0x00f000f0, 0x0000ff00,
    };
    uint32_t word = RA;
    int control = RB & 15;

    for (int step = 0; step < 4; step++) {
        int dist = 1 << step;
        if (control & dist)
            word = shuffle32_stage(word, left_masks[step], right_masks[step], dist);
    }
    return word;
}

// One shuffle step: bits outside both masks stay in place, while the
// positions covered by maskL receive src shifted left by N and the
// positions covered by maskR receive src shifted right by N.
//   src     - value to permute
//   maskL   - destination positions filled from (src << N)
//   maskR   - destination positions filled from (src >> N)
//   N       - shift distance in bits
//   returns - the permuted value
uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
{
    const uint64_t moved_up = (src << N) & maskL;
    const uint64_t moved_down = (src >> N) & maskR;
    const uint64_t untouched = src & ~(maskL | maskR);
    return untouched | moved_up | moved_down;
}
// Generalized shuffle of a 64-bit word.  Each of the low five bits of RB
// enables one shuffle64_stage() step; steps run from distance 16 down to 1.
//   RA      - value to permute
//   RB      - stage-enable control; only bits 0..4 are used
//   returns - RA after all enabled stages
uint64_t shfl64(uint64_t RA, uint64_t RB)
{
    // per-stage mask pairs, ordered for distances 16, 8, 4, 2, 1
    static const uint64_t left_masks[5] = {
        0x0000ffff00000000ULL, 0x00ff000000ff0000ULL, 0x0f000f000f000f00ULL,
        0x3030303030303030ULL, 0x4444444444444444ULL,
    };
    static const uint64_t right_masks[5] = {
        0x00000000ffff0000ULL, 0x0000ff000000ff00ULL, 0x00f000f000f000f0ULL,
        0x0c0c0c0c0c0c0c0cULL, 0x2222222222222222ULL,
    };
    uint64_t word = RA;
    int control = RB & 31;

    for (int step = 0; step < 5; step++) {
        int dist = 16 >> step;
        if (control & dist)
            word = shuffle64_stage(word, left_masks[step], right_masks[step], dist);
    }
    return word;
}
// Generalized unshuffle of a 64-bit word: the same stages as shfl64 but
// applied in the opposite order, from distance 1 up to 16.
//   RA      - value to permute
//   RB      - stage-enable control; only bits 0..4 are used
//   returns - RA after all enabled stages
uint64_t unshfl64(uint64_t RA, uint64_t RB)
{
    // per-stage mask pairs, ordered for distances 1, 2, 4, 8, 16
    static const uint64_t left_masks[5] = {
        0x4444444444444444ULL, 0x3030303030303030ULL, 0x0f000f000f000f00ULL,
        0x00ff000000ff0000ULL, 0x0000ffff00000000ULL,
    };
    static const uint64_t right_masks[5] = {
        0x2222222222222222ULL, 0x0c0c0c0c0c0c0c0cULL, 0x00f000f000f000f0ULL,
        0x0000ff000000ff00ULL, 0x00000000ffff0000ULL,
    };
    uint64_t word = RA;
    int control = RB & 31;

    for (int step = 0; step < 5; step++) {
        int dist = 1 << step;
        if (control & dist)
            word = shuffle64_stage(word, left_masks[step], right_masks[step], dist);
    }
    return word;
}

xperm

based on RV bitmanip

// Crossbar permutation.  RB is treated as a vector of (1 << sz_log2)-bit
// indices; each index selects which element of RA is copied to the
// matching element position of the result.  An index that points past
// XLEN bits leaves its result element zero.
//   RA      - table of source elements
//   RB      - vector of element indices
//   sz_log2 - log2 of the element width in bits
//   returns - the permuted elements
uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
{
    const uint_xlen_t elem_bits = 1LL << sz_log2;
    const uint_xlen_t elem_mask = (1LL << elem_bits) - 1;
    uint_xlen_t result = 0;
    int lane = 0;

    while (lane < XLEN) {
        // convert the element index into the bit offset of that element
        uint_xlen_t index = (RB >> lane) & elem_mask;
        uint_xlen_t src_bit = index << sz_log2;
        if (src_bit < XLEN) {
            result |= ((RA >> src_bit) & elem_mask) << lane;
        }
        lane += elem_bits;
    }
    return result;
}
// xperm on 4-bit elements (sz_log2 = 2).
uint_xlen_t xperm_n(uint_xlen_t RA, uint_xlen_t RB)
{
    const int log2_elem_bits = 2;
    return xperm(RA, RB, log2_elem_bits);
}

// xperm on 8-bit elements (sz_log2 = 3).
uint_xlen_t xperm_b(uint_xlen_t RA, uint_xlen_t RB)
{
    const int log2_elem_bits = 3;
    return xperm(RA, RB, log2_elem_bits);
}

// xperm on 16-bit elements (sz_log2 = 4).
uint_xlen_t xperm_h(uint_xlen_t RA, uint_xlen_t RB)
{
    const int log2_elem_bits = 4;
    return xperm(RA, RB, log2_elem_bits);
}

// xperm on 32-bit elements (sz_log2 = 5).
uint_xlen_t xperm_w(uint_xlen_t RA, uint_xlen_t RB)
{
    const int log2_elem_bits = 5;
    return xperm(RA, RB, log2_elem_bits);
}

gorc

based on RV bitmanip

// Generalized OR-combine on 32 bits.  Each of the low five bits of RB
// enables one stage; stage k ORs every 2^k-bit group with its neighbour in
// the same pair, so set bits spread across the pair.
//   RA      - value to combine
//   RB      - stage-enable control; only bits 0..4 are used
//   returns - RA after all enabled stages, applied narrowest first
uint32_t gorc32(uint32_t RA, uint32_t RB)
{
    // low_lanes[k] selects the lower group of every pair of 2^k-bit groups
    static const uint32_t low_lanes[5] = {
        0x55555555u, 0x33333333u, 0x0F0F0F0Fu, 0x00FF00FFu, 0x0000FFFFu,
    };
    uint32_t acc = RA;
    unsigned control = RB & 31;

    for (unsigned stage = 0; stage < 5; stage++) {
        unsigned dist = 1u << stage;
        if ((control & dist) == 0)
            continue;
        uint32_t lo = low_lanes[stage];
        // unlike a swap, the original bits are kept and the moved copies ORed in
        acc |= ((acc & lo) << dist) | ((acc & ~lo) >> dist);
    }
    return acc;
}
// Generalized OR-combine on 64 bits.  Each of the low six bits of RB
// enables one stage; stage k ORs every 2^k-bit group with its neighbour in
// the same pair, so set bits spread across the pair.
//   RA      - value to combine
//   RB      - stage-enable control; only bits 0..5 are used
//   returns - RA after all enabled stages, applied narrowest first
uint64_t gorc64(uint64_t RA, uint64_t RB)
{
    // low_lanes[k] selects the lower group of every pair of 2^k-bit groups
    static const uint64_t low_lanes[6] = {
        0x5555555555555555ULL, 0x3333333333333333ULL,
        0x0F0F0F0F0F0F0F0FULL, 0x00FF00FF00FF00FFULL,
        0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL,
    };
    uint64_t acc = RA;
    unsigned control = (unsigned)(RB & 63);

    for (unsigned stage = 0; stage < 6; stage++) {
        unsigned dist = 1u << stage;
        if ((control & dist) == 0)
            continue;
        uint64_t lo = low_lanes[stage];
        // unlike a swap, the original bits are kept and the moved copies ORed in
        acc |= ((acc & lo) << dist) | ((acc & ~lo) >> dist);
    }
    return acc;
}

cmix

based on RV bitmanip, covered by ternary bitops

// Conditional mix (bitwise multiplexer): for every bit position, take the
// bit from RA where RB is 1 and from RC where RB is 0.
//   RA      - value chosen where the selector bit is 1
//   RB      - per-bit selector
//   RC      - value chosen where the selector bit is 0
//   returns - the bitwise selection of RA and RC
uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
    const uint_xlen_t from_ra = RA & RB;
    const uint_xlen_t from_rc = RC & ~RB;
    return from_ra | from_rc;
}

carryless mul

based on RV bitmanip see https://en.wikipedia.org/wiki/CLMUL_instruction_set

// Carry-less multiply, low half: XOR together one copy of RA shifted left
// by i for every set bit i of RB.  Bits shifted beyond the width of
// uint_xlen_t are dropped.
//   RA, RB  - the two operands
//   returns - the low XLEN bits of the carry-less product
uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t product = 0;
    int bit = 0;

    while (bit < XLEN) {
        if (((RB >> bit) & 1) != 0)
            product = product ^ (RA << bit);
        bit++;
    }
    return product;
}
// Carry-less multiply, high half: for every set bit i of RB (i >= 1), XOR in
// the bits of RA that a left shift by i would push past bit XLEN-1.
//   RA, RB  - the two operands
//   returns - the upper XLEN bits of the carry-less product
uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t product = 0;
    // bit 0 of RB contributes nothing to the high half, so start at 1
    int bit = 1;

    while (bit < XLEN) {
        if (((RB >> bit) & 1) != 0)
            product = product ^ (RA >> (XLEN - bit));
        bit++;
    }
    return product;
}
// Carry-less multiply, "reversed" variant: for every set bit i of RB, XOR
// in RA shifted right by (XLEN - i - 1).
//   RA, RB  - the two operands
//   returns - the XOR of the shifted copies of RA
uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t product = 0;
    int bit = 0;

    while (bit < XLEN) {
        if (((RB >> bit) & 1) != 0)
            product = product ^ (RA >> (XLEN - bit - 1));
        bit++;
    }
    return product;
}

crc

// Advance a CRC state by nbits single-bit steps using the constant
// 0xEDB88320: each step shifts the state right by one and XORs in the
// constant when the bit shifted out was 1.
//   x       - current state
//   nbits   - number of single-bit steps; values <= 0 leave x unchanged
//   returns - the updated state
uint_xlen_t crc32(uint_xlen_t x, int nbits)
{
    for (int step = nbits; step > 0; step--) {
        uint_xlen_t shifted_out = x & 1;
        x >>= 1;
        if (shifted_out)
            x ^= 0xEDB88320;
    }
    return x;
}
// Advance a CRC state by nbits single-bit steps using the constant
// 0x82F63B78: each step shifts the state right by one and XORs in the
// constant when the bit shifted out was 1.
//   x       - current state
//   nbits   - number of single-bit steps; values <= 0 leave x unchanged
//   returns - the updated state
uint_xlen_t crc32c(uint_xlen_t x, int nbits)
{
    for (int step = nbits; step > 0; step--) {
        uint_xlen_t shifted_out = x & 1;
        x >>= 1;
        if (shifted_out)
            x ^= 0x82F63B78;
    }
    return x;
}
// Fixed-width front ends: run crc32() / crc32c() on RA for 8 (_b), 16 (_h),
// 32 (_w) and, when XLEN > 32, 64 (_d) single-bit steps.
uint_xlen_t crc32_b(uint_xlen_t RA)
{
    return crc32(RA, 8);
}

uint_xlen_t crc32_h(uint_xlen_t RA)
{
    return crc32(RA, 16);
}

uint_xlen_t crc32_w(uint_xlen_t RA)
{
    return crc32(RA, 32);
}

uint_xlen_t crc32c_b(uint_xlen_t RA)
{
    return crc32c(RA, 8);
}

uint_xlen_t crc32c_h(uint_xlen_t RA)
{
    return crc32c(RA, 16);
}

uint_xlen_t crc32c_w(uint_xlen_t RA)
{
    return crc32c(RA, 32);
}

#if XLEN > 32
uint_xlen_t crc32_d(uint_xlen_t RA)
{
    return crc32(RA, 64);
}

uint_xlen_t crc32c_d(uint_xlen_t RA)
{
    return crc32c(RA, 64);
}
#endif

bitmatrix

// Applies the full 64-bit shuffle (shfl64 with control 31) three times in
// a row.  bmatxor/bmator use the result as the transpose of an 8x8 bit
// matrix held in a 64-bit word.
//   RA      - input word
//   returns - RA after three full shuffles
uint64_t bmatflip(uint64_t RA)
{
    uint64_t matrix = RA;
    for (int pass = 0; pass < 3; pass++)
        matrix = shfl64(matrix, 31);
    return matrix;
}
// 8x8 bit-matrix multiply using AND for products and XOR for sums.  Each
// 64-bit operand holds eight 8-bit rows, row i in bits i*8 .. i*8+7.
// NOTE(review): pcnt is not defined in the visible text; presumably a
// population count - confirm.
//   RA      - left matrix (its rows are used)
//   RB      - right matrix (its columns are used)
//   returns - bit i is set when pcnt(row i/8 of RA AND column i%8 of RB)
//             is odd
uint64_t bmatxor(uint64_t RA, uint64_t RB)
{
    // transpose of RB, so that its columns can be read out as bytes
    uint64_t RBt = bmatflip(RB);
    uint8_t u[8]; // rows of RA
    uint8_t v[8]; // cols of RB
    for (int i = 0; i < 8; i++) {
        u[i] = RA >> (i*8);
        v[i] = RBt >> (i*8);
    }
    uint64_t x = 0;
    for (int i = 0; i < 64; i++) {
        // unsigned shift: 1LL << 63 would overflow a signed long long
        if (pcnt(u[i / 8] & v[i % 8]) & 1)
            x |= (uint64_t)1 << i;
    }
    return x;
}
// 8x8 bit-matrix multiply using AND for products and OR for sums.  Each
// 64-bit operand holds eight 8-bit rows, row i in bits i*8 .. i*8+7.
//   RA      - left matrix (its rows are used)
//   RB      - right matrix (its columns are used)
//   returns - bit i is set when row i/8 of RA and column i%8 of RB share
//             at least one set bit
uint64_t bmator(uint64_t RA, uint64_t RB)
{
    // transpose of RB, so that its columns can be read out as bytes
    uint64_t RBt = bmatflip(RB);
    uint8_t u[8]; // rows of RA
    uint8_t v[8]; // cols of RB
    for (int i = 0; i < 8; i++) {
        u[i] = RA >> (i*8);
        v[i] = RBt >> (i*8);
    }
    uint64_t x = 0;
    for (int i = 0; i < 64; i++) {
        // unsigned shift: 1LL << 63 would overflow a signed long long
        if ((u[i / 8] & v[i % 8]) != 0)
            x |= (uint64_t)1 << i;
    }
    return x;
}