Analysis

Fri Nov 17 20:48:06 2023 · This fits naturally with a Scalable Vector ISA such as SVP64

R0,CA = A0+B0+CA  adde r0,a0,b0
    |
    +----------+
               |
R1,CA = A1+B1+CA  adde r1,a1,b1
    |
    +----------+
               |
R2,CA = A2+B2+CA  adde r2,a2,b2

  aptr = A address
  bptr = B address
  rptr = Result address
  li r0, 0        # used to help clear CA
  addic r0, r0, 0 # CA to zero as well
  setmvli 8       # set MAXVL to 8
loop:
  setvl t0, n         # n is the number of digits
  mulli t1, t0, 8     # 8 bytes per digit/element
  sv.ldu a0, aptr, t1 # update advances pointer
  sv.ldu b0, bptr, t1 # likewise
  sv.adde r0, a0, b0  # takes in CA, updates CA
  sv.stu rptr, r0, t1 # pointer advances too
  sub. n, n, t0       # should not alter CA
  bnz loop            # do more digits

C4 C3 C2 C1 C0
         A0xB0
      A1xB0
   A2xB0
A3xB0
R4 R3 R2 R1 R0

      // this becomes the basis for sv.maddedu in RS=RC Mode,
      // where k is RC. k takes the upper half of product
      // and adds it in on the next iteration
      k = 0;
      for (i = 0; i < m; i++) {
         unsigned product = u[i]*v[j] + k;
         k = product>>16;
         plo[i] = product; // & 0xffff
      }
      // this is simply sv.adde where k is XER.CA
      k = 0;
      for (i = 0; i < m; i++) {
         t = plo[i] + w[i + j] + k;
         w[i + j] = t;          // (I.e., t & 0xFFFF).
         k = t >> 16; // carry: should only be 1 bit
      }

product = RA*RB+RC
RT = lowerhalf(product)
RC = upperhalf(product)

RT0, RC0 = RA0 * RB0 + 0
      |
      +----------------+
                       |
RT1, RC1 = RA1 * RB1 + RC0
      |
      +----------------+
                       |
RT2, RC2 = RA2 * RB2 + RC1

li r16, 0                     # zero accumulator
addic r16, r16, 0             # CA to zero as well
sv.maddedu *r0, *r8, r17, r16 # mul vector
sv.adde *r24, *r24, *r0   # big-add row to result

product = RA*RB+RC
RT = lowerhalf(product)
RS=RT+MAXVL = upperhalf(product)

product = RA*RB+RC
RT = lowerhalf(product)
RS=RC = upperhalf(product)

/*
 * Big-integer logical right shift by s bits.
 *
 * un[] holds n 64-bit digits, least-significant digit first.  Each result
 * digit r[i] takes the upper (64 - s) bits of un[i] and is topped up with
 * the low s bits of the next-higher digit un[i + 1]; the top digit is
 * zero-filled.  r may alias un (digits are consumed in ascending order, and
 * un[i + 1] is read before r[i + 1] is written).
 *
 * Precondition: 0 <= s <= 63.
 *
 * s == 0 is handled as a plain copy: the general expression would evaluate
 * un[i + 1] << 64, and shifting a 64-bit value by its full width is
 * undefined behaviour in C (on many targets it would OR the whole next
 * digit into the result).  n <= 0 is a no-op rather than a write to r[-1].
 */
void bigrsh(unsigned s, uint64_t r[], uint64_t un[], int n) {
    if (n <= 0)
        return;
    if (s == 0) {
        for (int i = 0; i < n; i++)
            r[i] = un[i];
        return;
    }
    for (int i = 0; i < n - 1; i++)
        r[i] = (un[i] >> s) | (un[i + 1] << (64 - s));
    r[n - 1] = un[n - 1] >> s; /* nothing above the top digit to shift in */
}

subfic t1, t0, 64     # compute 64-s (s in t0)
sv.srd *r8, *r24, t0  # shift each element of r24 vector right by s: un[i] >> s
sv.sld *r16, *r25, t1 # vector offset by one element (r25): un[i+1] << (64-s)
sv.or  *r8, *r8, *r16 # OR two parts together

    # r[i] = (un[i] >> s) | (un[i + 1] << (64 - s));
    temp <- ROT128(RA || RC, RB[58:63])
    RT <- temp[64:127]
    RS <- temp[0:63]

    n <- (RB)[58:63]
    v <- ROTL64((RA), 64-n)
    mask <- MASK(n, 63)
    RT <- (v[0:63] & mask) | ((RC) & ¬mask)
    RS <- v[0:63] & ¬mask

sv.dsrd *r8, *r24, t1, t0

      // Multiply and subtract.
      k = 0;
      for (i = 0; i < n; i++) {
         p = qhat*vn[i]; // 64-bit product
         t = un[i+j] - k - (p & 0xFFFFFFFFLL);
         un[i+j] = t;
         k = (p >> 32) - (t >> 32);
      }

        uint32_t carry = 0;
        // this is just sv.maddedu again
        for (int i = 0; i <= n; i++) {
            uint64_t value = (uint64_t)vn[i] * (uint64_t)qhat + carry;
            carry = (uint32_t)(value >> 32); // upper half for next loop
            product[i] = (uint32_t)value;    // lower into vector
        }
        bool ca = true;
        // this is simply sv.subfe where ca is XER.CA
        for (int i = 0; i <= n; i++) {
            uint64_t value = (uint64_t)~product[i] + (uint64_t)un_j[i] + ca;
            ca = value >> 32 != 0;
            un_j[i] = value;
        }
        bool need_fixup = !ca; // for phase 3 correction

        // Compute estimate qhat of q[j] from top 2 digits
        uint64_t dig2 = ((uint64_t)un[j + n] << 32) | un[j + n - 1];
        if (un[j+n] >= vn[n-1]) {
            // rhat can be bigger than 32-bit when the division overflows
            qhat = UINT32_MAX;
            rhat = dig2 - (uint64_t)UINT32_MAX * vn[n - 1];
        } else {
            qhat = dig2 / vn[n - 1]; // 64/32 divide
            rhat = dig2 % vn[n - 1]; // 64/32 modulo
        }
        // use 3rd-from-top digit to obtain better accuracy
        // (uint64_t)1 rather than 1UL: unsigned long may be only 32 bits wide
        b = (uint64_t)1 << 32;
        // Both conditions must hold to keep correcting: qhat is still too
        // large by the 3-digit test, AND rhat still fits in one digit.  Once
        // rhat >= b the estimate cannot be too large any more, and b * rhat
        // would no longer fit in 64 bits, so the loop has to stop there.
        while (rhat < b && qhat * vn[n - 2] > b * rhat + un[j + n - 2]) {
            qhat = qhat - 1;
            rhat = rhat + vn[n - 1];
        }

        // The case of a single-digit divisor v[0]: divide u[] digit by digit,
        // most-significant digit first, carrying the remainder k downwards.
        // NOTE(review): k's declaration is not visible here; `k << 32` needs
        // k to be a 64-bit type to be well-defined -- confirm.
        // NOTE(review): `j >= 0` only terminates if j is signed -- confirm.
        k = 0; // no remainder going into the top digit
        for (j = m - 1; j >= 0; j--)
        {                                 // one quotient digit per pass
            uint64_t dig2 = ((k << 32) | u[j]); // previous remainder : digit
            q[j] = dig2 / v[0]; // quotient digit
            k = dig2 % v[0]; // modulo back into next loop
        }

RT0      = ((  0<<64) | RA0) / RB0
     RC0 = ((  0<<64) | RA0) % RB0
      |
      +-------+
              |
RT1      = ((RC0<<64) | RA1) / RB1
     RC1 = ((RC0<<64) | RA1) % RB1
      |
      +-------+
              |
RT2      = ((RC1<<64) | RA2) / RB2
     RC2 = ((RC1<<64) | RA2) % RB2

 dividend = (RC) || (RA)
 divisor = EXTZ128(RB)
 RT = UDIV(dividend, divisor)
 RS = UREM(dividend, divisor)

Analysis

Vector Add and Subtract

Vector Multiply

Vector Shift

Vector Divide

Conclusion