SV Load and Store

Rationale

All Vector ISAs dating back fifty years have extensive and comprehensive Load and Store operations that go far beyond the capabilities of Scalar RISC and most CISC processors, yet at their heart, on an individual element basis, may be found to be no different from RISC Scalar equivalents.

The resource savings from Vector LD/ST are significant and stem from the fact that one single instruction can trigger a dozen (or, in some microarchitectures such as Cray or NEC SX Aurora, hundreds of) element-level Memory accesses.

Additionally, and simply: if the Arithmetic side of an ISA supports Vector Operations, then in order to keep the ALUs 100% occupied the Memory infrastructure (and the ISA itself) correspondingly needs Vector Memory Operations as well.

Vectorised Load and Store also presents an extra dimension (literally) which creates scenarios unique to Vector applications, that a Scalar (and even a SIMD) ISA simply never encounters. SVP64 endeavours to add the modes typically found in all Scalable Vector ISAs, without changing the behaviour of the underlying Base (Scalar) v3.0B operations in any way.

Modes overview

Vectorisation of Load and Store requires the creation, from scalar operations, of a number of different modes:

  • fixed aka "unit" stride - contiguous sequence with no gaps
  • element strided - sequential but regularly offset, with gaps
  • vector indexed - vector of base addresses and vector of offsets
  • Speculative fail-first - where it makes sense to do so
  • Structure Packing - covered in SV by remap and Pack/Unpack Mode.

Despite being constructed from Scalar LD/ST, none of these Modes exist or make sense in any Scalar ISA: they only exist in Vector ISAs.
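
By way of illustration (a non-normative Python sketch, with a hypothetical effective_addresses helper), the per-element Effective Addresses produced by the first three of these modes are:

# illustrative sketch only: hypothetical helper, not the spec pseudocode
def effective_addresses(mode, base, immed, op_width, offsets, VL):
    eas = []
    for i in range(VL):
        if mode == "unitstride":          # contiguous: base + immed + i*op_width
            eas.append(base + immed + i * op_width)
        elif mode == "elementstride":     # regular gaps: base + i*immed
            eas.append(base + i * immed)
        elif mode == "indexed":           # per-element offset from a vector of offsets
            eas.append(base + offsets[i])
    return eas

# example: four 4-byte loads in each mode
print(effective_addresses("unitstride",    0x1000,  0, 4, None,           4))
print(effective_addresses("elementstride", 0x1000, 16, 4, None,           4))
print(effective_addresses("indexed",       0x1000,  0, 4, [0, 64, 8, 32], 4))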

Also included in SVP64 LD/ST is both signed and unsigned Saturation, as well as Element-width overrides and Twin-Predication.

Note also that Indexed remap mode may be applied to both v3.0 LD/ST Immediate instructions and v3.0 LD/ST Indexed instructions. LD/ST-Indexed should not be conflated with Indexed REMAP mode: clarification is provided below.

Determining the LD/ST Modes

A minor complication (caused by the retro-fitting of modern Vector features to a Scalar ISA) is that certain features do not exactly make sense or are considered a security risk. Fail-first on Vector Indexed would allow attackers to probe large numbers of pages from userspace, where strided fail-first (by creating contiguous sequential LDs) does not.

In addition, reduce mode makes no sense for LD/ST. Realistically, an alternative table definition is needed for svp64 RM.MODE. The following modes make sense:

  • saturation
  • predicate-result (mostly for cache-inhibited LD/ST)
  • simple (no augmentation)
  • fail-first (where Vector Indexed is banned)
  • Signed Effective Address computation (Vector Indexed only)
  • Pack/Unpack (on LD/ST immediate operations only)

More than that, however, it is necessary to fit the usual Vector ISA capabilities onto both Power ISA LD/ST with immediate and LD/ST Indexed. These present subtly different Mode tables, which, due to lack of space, have the following quirks:

  • LD/ST Immediate has no individual control over src/dest zeroing, whereas LD/ST Indexed does.
  • LD/ST Immediate has no Saturated Pack/Unpack (Arithmetic Mode does)
  • LD/ST Indexed has no Pack/Unpack (REMAP may be used instead)

Format and fields

Fields used in tables below:

  • sz / dz: if predication is enabled, will put zeros into the dest (or as src in the case of twin predication) when the predicate bit is zero; otherwise the element is ignored or skipped, depending on context.
  • zz: both sz and dz are set equal to this flag.
  • inv CR-bit: just as in branches (BO), these bits allow testing of a CR bit and whether it is set (inv=0) or unset (inv=1).
  • N: sets signed/unsigned saturation.
  • RC1: as if Rc=1, stores CRs but not the result.
  • SEA: Signed Effective Address; if enabled, performs sign-extension on registers that have been reduced due to elwidth overrides.
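
A rough sketch of the sz/dz zeroing behaviour (a Python illustration assuming a simple dense predicate; not the formal definition):

# sketch: dest-zeroing (dz) vs skipping, for a single-source vector op
def apply_predicate(dest, result, pred, dz):
    for i in range(len(dest)):
        if (pred >> i) & 1:
            dest[i] = result[i]      # predicate bit set: element is computed
        elif dz:
            dest[i] = 0              # dz=1: masked-out element is zeroed
        # dz=0: masked-out element is skipped (dest left untouched)
    return dest

print(apply_predicate([7, 7, 7, 7], [1, 2, 3, 4], 0b0101, dz=True))   # [1, 0, 3, 0]
print(apply_predicate([7, 7, 7, 7], [1, 2, 3, 4], 0b0101, dz=False))  # [1, 7, 3, 7]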

LD/ST immediate

The svp64 table for immed(RA), which is RM.MODE (bits 19:23 of RM), is:

0-1  2    3       4     description
00   0    zz      els   simple mode
00   1    /       /     reserved
01   inv  CR-bit        Rc=1: ffirst CR sel
01   inv  els     RC1   Rc=0: ffirst z/nonz
10   N    zz      els   sat mode: N=0/1 u/s
11   inv  CR-bit        Rc=1: pred-result CR sel
11   inv  els     RC1   Rc=0: pred-result z/nonz
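
A hedged sketch of reading that table (the field names and the packing of the two-bit CR-bit field are illustrative assumptions, not the formal decoder):

# illustrative decode of the LD/ST-immediate RM.MODE table above
def decode_ldst_imm_mode(mode2, bit2, bit3, bit4, Rc):
    if mode2 == 0b00:
        if bit2 == 0:
            return dict(kind="simple", zz=bit3, els=bit4)
        return dict(kind="reserved")
    if mode2 == 0b01:
        if Rc:
            return dict(kind="ffirst", inv=bit2, CRbit=(bit3 << 1) | bit4)
        return dict(kind="ffirst", inv=bit2, els=bit3, RC1=bit4)
    if mode2 == 0b10:
        return dict(kind="saturate", signed=bit2, zz=bit3, els=bit4)
    if Rc:
        return dict(kind="pred-result", inv=bit2, CRbit=(bit3 << 1) | bit4)
    return dict(kind="pred-result", inv=bit2, els=bit3, RC1=bit4)

print(decode_ldst_imm_mode(0b10, 1, 1, 0, Rc=0))  # signed saturation, zz=1, els=0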

The els bit is only relevant when RA.isvec is clear: this indicates whether stride is unit or element:

if RA.isvec:
    svctx.ldstmode = indexed
elif els == 0:
    svctx.ldstmode = unitstride
elif immediate != 0:
    svctx.ldstmode = elementstride

An immediate of zero is a safety-valve to allow LD-VSPLAT: in effect the multiplication of the immediate-offset by zero results in reading from the exact same memory location, even with a Vector register. (Normally this type of behaviour is reserved for the mapreduce modes)

For LD-VSPLAT, on non-cache-inhibited Loads, the read can occur just the once and be copied, rather than hitting the Data Cache multiple times with the same memory read at the same location. The benefit of Cache-inhibited LD-splats is that it allows for memory-mapped peripherals to have multiple data values read in quick succession and stored in sequentially numbered registers (but, see Note below).
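
A minimal sketch of the non-cache-inhibited LD-VSPLAT effect (Python, with memory modelled as a simple dict):

# sketch: immediate of zero with scalar RA and Vector RT reads one location,
# splatting the same value into successive destination elements
def ld_vsplat(mem, base, VL):
    value = mem[base]                 # one read (non-cache-inhibited case)
    return [value] * VL               # copied into VL destination elements

mem = {0x2000: 0xDEADBEEF}
print(ld_vsplat(mem, 0x2000, 4))      # [0xDEADBEEF, 0xDEADBEEF, 0xDEADBEEF, 0xDEADBEEF]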

For non-cache-inhibited ST from a vector source onto a scalar destination: with the Vector loop effectively creating multiple memory writes to the same location, we can deduce that the last of these will be the "successful" one. Thus, implementations are free and clear to optimise out the overwriting STs, leaving just the last one as the "winner". Bear in mind that predicate masks will skip some elements (in source non-zeroing mode). Cache-inhibited ST operations on the other hand MUST write out a Vector source multiple successive times to the exact same Scalar destination. Just like Cache-inhibited LDs, multiple values may be written out in quick succession to a memory-mapped peripheral from sequentially-numbered registers.
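
The two ST cases described above can be sketched as follows (non-normative; predication omitted):

# sketch: vector source, scalar destination address
def st_vector_to_scalar(mem, addr, src, cache_inhibited):
    if cache_inhibited:
        for v in src:                 # CI: every element MUST be written, in order
            mem[addr] = v
    else:
        mem[addr] = src[-1]           # non-CI: only the last ("winning") write need occur
    return mem

print(st_vector_to_scalar({}, 0x3000, [1, 2, 3, 4], cache_inhibited=False))  # {0x3000: 4}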

Note that any memory location may be Cache-inhibited (Power ISA v.1, Book III, 1.6.1, p1033)

LD/ST Indexed

The modes for the RA+RB indexed version are slightly different, but use the same RM.MODE bits (19:23 of RM):

0-1  2    3       4     description
00   SEA  dz      sz    simple mode
01   SEA  dz      sz    Strided (scalar only source)
10   N    dz      sz    sat mode: N=0/1 u/s
11   inv  CR-bit        Rc=1: pred-result CR sel
11   inv  zz      RC1   Rc=0: pred-result z/nonz

Vector Indexed Strided Mode is qualified as follows:

if mode = 0b01 and !RA.isvec and !RB.isvec:
    svctx.ldstmode = elementstride

A summary of the effect of Vectorisation of src or dest:

 imm(RA)  RT.v   RA.v   no stride allowed
 imm(RA)  RT.s   RA.v   no stride allowed
 imm(RA)  RT.v   RA.s   stride-select allowed
 imm(RA)  RT.s   RA.s   not vectorised
 RA,RB    RT.v  {RA|RB}.v Standard Indexed
 RA,RB    RT.s  {RA|RB}.v Indexed but single LD (no VSPLAT)
 RA,RB    RT.v  {RA&RB}.s VSPLAT possible. stride selectable
 RA,RB    RT.s  {RA&RB}.s not vectorised (scalar identity)

Signed Effective Address computation is only relevant for Vector Indexed Mode, when elwidth overrides are applied. The source elwidth override applies to RB: if SEA is set, RB is sign-extended from elwidth bits to the full 64 bits before being added to RA to calculate the Effective Address. For other Modes (ffirst, saturate), all EA computation with elwidth overrides is unsigned.
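
A small sketch of the SEA computation, assuming an 8-bit elwidth override on RB:

# sketch: sign-extension of the RB offset prior to EA computation
def sext(value, bits):
    sign = 1 << (bits - 1)
    return (value & (sign - 1)) - (value & sign)

RA = 0x10000
RB_elem = 0xF0                        # 8-bit element, i.e. -16 when signed
print(hex(RA + RB_elem))              # SEA=0: 0x100f0 (unsigned offset)
print(hex(RA + sext(RB_elem, 8)))     # SEA=1: 0xfff0  (signed offset)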

Note that cache-inhibited LD/ST when VSPLAT is activated will perform multiple LD/ST operations, sequentially. Even with a scalar src, a Cache-inhibited LD will read the same memory location multiple times, storing the result in successive Vector destination registers. This is because the cache-inhibit instructions are typically used to read and write memory-mapped peripherals. If a genuine cache-inhibited LD-VSPLAT is required then a single scalar cache-inhibited LD should be performed, followed by a VSPLAT-augmented mv, copying the one scalar value into multiple register destinations.

Note also that cache-inhibited VSPLAT with Predicate-result is possible. This allows, for example, a large batch of memory-mapped peripheral reads to be issued, stopping at the first NULL character and truncating VL to that point. No branch is needed to issue that large burst of LDs, which may be valuable in Embedded scenarios.
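
A rough sketch of that idea (the read_peripheral callback is hypothetical; this is not actual Power ISA semantics):

# sketch: cache-inhibited reads from a peripheral, truncating VL at the first NUL byte
def ci_read_until_nul(read_peripheral, VL):
    out = []
    for i in range(VL):
        ch = read_peripheral()        # each element is a fresh CI read of the same address
        if ch == 0:
            return out, i             # VL truncated at the NUL element
        out.append(ch)
    return out, VL

data = iter([0x48, 0x69, 0x00, 0x99])
print(ci_read_until_nul(lambda: next(data), 4))   # ([0x48, 0x69], 2)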

Vectorisation of Scalar Power ISA v3.0B

Scalar Power ISA Load/Store operations may be seen from fixedload and fixedstore pseudocode to be of the form:

lbux RT, RA, RB
EA <- (RA) + (RB)
RT <- MEM(EA)

and for immediate variants:

lb RT,D(RA)
EA <- RA + EXTS(D)
RT <- MEM(EA)

Thus in the first example, the source registers may each be independently marked as scalar or vector, and likewise the destination; in the second example only the one source and one dest may be marked as scalar or vector.

Thus we can see that Vector Indexed may be covered, and, as demonstrated with the pseudocode below, the immediate can be used to give unit stride or element stride. Since there is no way to tell which from the Power v3.0B Scalar opcode alone, the choice is provided instead by the SV Context.

# LD not VLD!  format - ldop RT, immed(RA)
# op_width: lb=1, lh=2, lw=4, ld=8
op_load(RT, RA, op_width, immed, svctx, RAupdate):
  ps = get_pred_val(FALSE, RA); # predication on src
  pd = get_pred_val(FALSE, RT); # ... AND on dest
  for (i=0, j=0, u=0; i < VL && j < VL;):
    # skip non-predicated elements
    if (RA.isvec) while (!(ps & 1<<i)) i++;
    if (RAupdate.isvec) while (!(ps & 1<<u)) u++;
    if (RT.isvec) while (!(pd & 1<<j)) j++;
    if svctx.ldstmode == elementstride:
      # element stride mode
      srcbase = ireg[RA]
      offs = i * immed              # j*immed for a ST
    elif svctx.ldstmode == unitstride:
      # unit stride mode
      srcbase = ireg[RA]
      offs = immed + (i * op_width) # j*op_width for ST
    elif RA.isvec:
      # quirky Vector indexed mode but with an immediate
      srcbase = ireg[RA+i]
      offs = immed;
    else
      # standard scalar mode (but predicated)
      # no stride multiplier means VSPLAT mode
      srcbase = ireg[RA]
      offs = immed

    # compute EA
    EA = srcbase + offs
    # update RA?
    if RAupdate: ireg[RAupdate+u] = EA;
    # load from memory
    ireg[RT+j] <= MEM[EA];
    if (!RT.isvec)
        break # destination scalar, end now
    if (RA.isvec) i++;
    if (RAupdate.isvec) u++;
    if (RT.isvec) j++;

Indexed LD is:

# format: ldop RT, RA, RB
function op_ldx(RT, RA, RB, RAupdate=False) # LD not VLD!
  ps = get_pred_val(FALSE, RA); # predication on src
  pd = get_pred_val(FALSE, RT); # ... AND on dest
  for (i=0, j=0, k=0, u=0; i < VL && j < VL && k < VL):
    # skip nonpredicated RA, RB and RT
    if (RA.isvec) while (!(ps & 1<<i)) i++;
    if (RAupdate.isvec) while (!(ps & 1<<u)) u++;
    if (RB.isvec) while (!(ps & 1<<k)) k++;
    if (RT.isvec) while (!(pd & 1<<j)) j++;
    if svctx.ldstmode == elementstride:
        EA = ireg[RA] + ireg[RB]*j   # register-strided
    else
        EA = ireg[RA+i] + ireg[RB+k] # indexed address
    if RAupdate: ireg[RAupdate+u] = EA
    ireg[RT+j] <= MEM[EA];
    if (!RT.isvec)
        break # destination scalar, end immediately
    if (RA.isvec) i++;
    if (RAupdate.isvec) u++;
    if (RB.isvec) k++;
    if (RT.isvec) j++;

Note in both cases that svp64 allows RA-as-a-dest in "update" mode (ldux) to be effectively a completely different register from RA-as-a-source. This is because there is room in svp64 to extend RA-as-src as well as RA-as-dest, both independently as scalar or vector and independently extending their range.

Programmer's note: being able to set RA-as-a-source separately from RA-as-a-destination (e.g. keeping the source Scalar) is extremely valuable once it is remembered that Simple-V element operations must be in Program Order; this saves on multiple address computations, especially in loops. Care does have to be taken, however, that RA-as-src is not overwritten by RA-as-dest unless intentionally desired, especially in element-strided Mode.

LD/ST Indexed vs Indexed REMAP

Unfortunately the word "Indexed" is used twice in completely different contexts, potentially causing confusion.

  • Instructions of the form ld RT,RA,RB have existed in the Power ISA since its creation: these are called "LD/ST Indexed" instructions and their name and meaning is well-established.
  • There now exists, in Simple-V, a remap mode called "Indexed" Mode that can be applied to any instruction including those named LD/ST Indexed.

Whilst allowing REMAP Indexed Mode to be applied to any Vectorised LD/ST Indexed operation such as sv.ld *RT,RA,*RB may be costly in terms of register reads, and may even be misleadingly labelled as redundant, firstly the strict application of the RISC Paradigm that Simple-V follows makes it awkward to consider preventing the application of Indexed REMAP to such operations, and secondly the two are not actually the same at all.

Indexed REMAP, as applied to RB in the instruction sv.ld *RT,RA,*RB, effectively performs an in-place re-ordering of the offsets, RB. To achieve the same effect without Indexed REMAP would require taking a copy of the Vector of offsets starting at RB, manually and explicitly reordering them, and finally using the copy of re-ordered offsets in a non-REMAP'ed sv.ld. Using non-strided LD as an example, the pseudocode below shows what actually occurs (the pseudocode for indexed_remap may be found in remap):

# sv.ld *RT,RA,*RB with Index REMAP applied to RB
for i in 0..VL-1:
    if remap.indexed:
        rb_idx = indexed_remap(i) # remap
    else:
        rb_idx = i # use the index as-is
    EA = GPR(RA) + GPR(RB+rb_idx)
    GPR(RT+i) = MEM(EA, 8)

Thus it can be seen that the use of Indexed REMAP saves copying and manual reordering of the Vector of RB offsets.
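
For contrast, a sketch of the manual alternative without Indexed REMAP (explicit copy and reorder of the offsets, followed by plain indexed loads; the register numbers below are arbitrary):

# sketch: what software must do if Indexed REMAP were not available
def manual_indexed(mem, GPR, RA, RB, RT, VL, remap_order):
    reordered = [GPR[RB + remap_order[i]] for i in range(VL)]   # explicit copy + reorder
    for i in range(VL):
        GPR[RT + i] = mem[GPR[RA] + reordered[i]]               # plain sv.ld, no REMAP
    return GPR

GPR = {0: 0x100, 8: 0, 9: 8, 10: 16, 16: 0, 17: 0, 18: 0}
mem = {0x100: 11, 0x108: 22, 0x110: 33}
print(manual_indexed(mem, GPR, 0, 8, 16, 3, [2, 0, 1]))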

LD/ST ffirst

LD/ST ffirst treats the first LD/ST in a vector (element 0 if REMAP is not active) as an ordinary one, with all behaviour with respect to Interrupts, Exceptions, Page Faults and Memory Management being identical in every regard to Scalar v3.0 Power ISA LD/ST. However for elements 1 and above, if an exception would occur, then VL is truncated to the previous element: the exception is not then raised because the LD/ST that would otherwise have caused an exception is required to be cancelled. Additionally an implementor may choose to truncate VL for any arbitrary reason, except at the very first element.
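
A hedged sketch of that rule, with fault detection abstracted into a hypothetical would_fault check:

# sketch: element 0 behaves as an ordinary LD; a fault at element i>0 truncates VL to i
def ffirst_load(mem, base, stride, VL, would_fault):
    result = []
    for i in range(VL):
        ea = base + i * stride
        if would_fault(ea):
            if i == 0:
                raise MemoryError("element 0 faults exactly like a Scalar LD")
            return result, i          # VL truncated; no exception raised
        result.append(mem.get(ea, 0))
    return result, VL

mem = {0x0: 5, 0x8: 6}
print(ffirst_load(mem, 0x0, 8, 4, lambda ea: ea >= 0x10))   # ([5, 6], 2)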

ffirst LD/ST to multiple pages via a Vectorised Index base is considered a security risk due to the abuse of probing multiple pages in rapid succession and getting speculative feedback on which pages would fail. Therefore Vector Indexed LD/ST is prohibited entirely, and the Mode bit is instead used for element-strided LD/ST, of the form shown below. See https://bugs.libre-soc.org/show_bug.cgi?id=561

for(i = 0; i < VL; i++)
    reg[rt + i] = mem[reg[ra] + i * reg[rb]];

High security implementations where any kind of speculative probing of memory pages is considered a risk should take advantage of the fact that implementations may truncate VL at any point, without requiring software to be rewritten and made non-portable. Such implementations may choose to always set VL=1 which will have the effect of terminating any speculative probing (and also adversely affect performance), but will at least not require applications to be rewritten.

Low-performance simpler hardware implementations may also choose to always set VL=1 as the bare minimum compliant implementation of LD/ST Fail-First. It is however critically important to remember that the first element LD/ST MUST be treated as an ordinary LD/ST, i.e. MUST raise exceptions exactly like an ordinary LD/ST.

For ffirst LD/STs, VL may be truncated arbitrarily to a nonzero value for any implementation-specific reason. For example: it is perfectly reasonable for implementations to alter VL when ffirst LD or ST operations are initiated on a nonaligned boundary, such that within a loop the subsequent iteration of that loop begins the following ffirst LD/ST operations on an aligned boundary such as the beginning of a cache line, or beginning of a Virtual Memory page. Likewise, to reduce workloads or balance resources.

Vertical-First Mode is slightly strange in that only one element at a time is ever executed anyway. Given that programmers may legitimately choose to alter srcstep and dststep in non-sequential order as part of explicit loops, it is neither possible nor safe to make speculative assumptions about future LD/STs. Therefore, Fail-First LD/ST in Vertical-First is UNDEFINED. This is very different from Arithmetic (Data-dependent) FFirst where Vertical-First Mode is fully deterministic, not speculative.

LOAD/STORE Elwidths

Loads and Stores are almost unique in that the Power Scalar ISA provides a width for the operation (lb, lh, lw, ld). Only extsb and others like it provide an explicit operation width. There are therefore three widths involved:

  • operation width (lb=8, lh=16, lw=32, ld=64)
  • src element width override (8/16/32/default)
  • destination element width override (8/16/32/default)

Some care is therefore needed to express and make clear the transformations, which are expressly in this order:

  • Calculate the Effective Address from RA at full width but (on Indexed Load) allow srcwidth overrides on RB
  • Load at the operation width (lb/lh/lw/ld) as usual
  • byte-reversal as usual
  • Non-saturated mode:
    • zero-extension or truncation from operation width to dest elwidth
    • place result in destination at dest elwidth
  • Saturated mode:
    • Sign-extension or truncation from operation width to dest width
    • signed/unsigned saturation down to dest elwidth
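
As a worked sketch of those steps, consider a hypothetical sv.lw (32-bit operation width) with an 8-bit destination elwidth override (the value 300 is purely illustrative):

# sketch of the ordering above: load at op width, then adjust to dest elwidth
def clamp_signed(value, bits):
    lo, hi = -(1 << (bits - 1)), (1 << (bits - 1)) - 1
    return max(lo, min(hi, value))

memread = 300                         # lw reads 32 bits; dest elwidth override is 8
print(memread & 0xFF)                 # non-saturated: truncate to 8 bits -> 44
print(clamp_signed(memread, 8))       # saturated signed: clamp to 8-bit range -> 127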

In order to respect Power v3.0B Scalar behaviour the memory side is treated effectively as completely separate and distinct from SV augmentation. This is primarily down to quirks surrounding LE/BE and byte-reversal.

It is rather unfortunately possible to request an elwidth override on the memory side which does not mesh with the overridden operation width: this results in UNDEFINED behaviour. The reason is that the effect of attempting a 64-bit sv.ld operation with a source elwidth override of 8/16/32 would be overlapping memory requests, particularly on unit and element strided operations. Thus it is UNDEFINED when the elwidth is smaller than the memory operation width. Examples include sv.lw/sw=16/els, which requests (overlapping) 4-byte memory reads offset from each other at 2-byte intervals. Store likewise is also UNDEFINED where the dest elwidth override is less than the operation width.
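
The sv.lw/sw=16/els example works out as follows (a sketch purely to show why the requests overlap):

# sketch: element-strided lw (4-byte reads) with a 16-bit source elwidth override
op_width = 4                          # lw reads 4 bytes
elwidth_bytes = 2                     # the sw=16 override shrinks the stride to 2 bytes
for i in range(4):
    offs = i * elwidth_bytes
    print(f"element {i}: reads bytes {offs}..{offs + op_width - 1}")  # ranges overlap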

Note the following regarding the pseudocode to follow:

  • scalar identity behaviour SV Context parameter conditions turn this into a straight absolute fully-compliant Scalar v3.0B LD operation
  • brev selects whether the operation is the byte-reversed variant (ldbrx rather than ld)
  • op_width specifies the operation width (lb, lh, lw, ld) as a "normal" part of Scalar v3.0B LD
  • imm_offs specifies the immediate offset ld r3, imm_offs(r5), again as a "normal" part of Scalar v3.0B LD
  • svctx specifies the SV Context and includes VL as well as source and destination elwidth overrides.

Below is the pseudocode for Unit-Strided LD (which includes Vector capability). Observe in particular that RA, as the base address in both Immediate and Indexed LD/ST, does not have element-width overriding applied to it.

Note that predication, predication-zeroing, and other modes except saturation have all been removed, for clarity and simplicity:

# LD not VLD!
# this covers unit stride mode and a type of vector offset
function op_ld(RT, RA, op_width, imm_offs, svctx)
  for (int i = 0, int j = 0; i < svctx.VL && j < svctx.VL):
    if not svctx.unit/el-strided:
        # strange vector mode, compute 64 bit address which is
        # not polymorphic! elwidth hardcoded to 64 here
        srcbase = get_polymorphed_reg(RA, 64, i)
    else:
        # unit / element stride mode, compute 64 bit address
        srcbase = get_polymorphed_reg(RA, 64, 0)
        # adjust for unit/el-stride
        srcbase += ....

    # read the underlying memory
    memread <= MEM(srcbase + imm_offs, op_width)

    # check saturation.
    if svctx.saturation_mode:
        # ... saturation adjustment...
        memread = clamp(memread, op_width, svctx.dest_elwidth)
    else:
        # truncate/extend to over-ridden dest width.
        memread = adjust_wid(memread, op_width, svctx.dest_elwidth)

    # takes care of inserting memory-read (now correctly byteswapped)
    # into regfile underlying LE-defined order, into the right place
    # within the NEON-like register, respecting destination element
    # bitwidth, and the element index (j)
    set_polymorphed_reg(RT, svctx.dest_elwidth, j, memread)

    # increments both src and dest element indices (no predication here)
    i++;
    j++;

Note above that the source elwidth is not used at all in LD-immediate.

For LD/Indexed, the key is that in the calculation of the Effective Address, RA has no elwidth override but RB does. Pseudocode below is simplified for clarity: predication and all modes except saturation are removed:

# LD not VLD! ld*rx if brev else ld*
function op_ld(RT, RA, RB, op_width, svctx, brev)
  for (int i = 0, int j = 0; i < svctx.VL && j < svctx.VL):
    if not svctx.el-strided:
        # RA not polymorphic! elwidth hardcoded to 64 here
        srcbase = get_polymorphed_reg(RA, 64, i)
    else:
        # element stride mode, again RA not polymorphic
        srcbase = get_polymorphed_reg(RA, 64, 0)
    # RB *is* polymorphic
    offs = get_polymorphed_reg(RB, svctx.src_elwidth, i)
    # sign-extend
    if svctx.SEA: offs = sext(offs, svctx.src_elwidth, 64)

    # takes care of (merges) processor LE/BE and ld/ldbrx
    bytereverse = brev XNOR MSR.LE

    # read the underlying memory
    memread <= MEM(srcbase + offs, op_width)

    # optionally performs byteswap at op width
    if (bytereverse):
        memread = byteswap(memread, op_width)

    if svctx.saturation_mode:
        # ... saturation adjustment...
        memread = clamp(memread, op_width, svctx.dest_elwidth)
    else:
        # truncate/extend to over-ridden dest width.
        memread = adjust_wid(memread, op_width, svctx.dest_elwidth)

    # takes care of inserting memory-read (now correctly byteswapped)
    # into regfile underlying LE-defined order, into the right place
    # within the NEON-like register, respecting destination element
    # bitwidth, and the element index (j)
    set_polymorphed_reg(RT, svctx.dest_elwidth, j, memread)

    # increments both src and dest element indices (no predication here)
    i++;
    j++;

Remapped LD/ST

In the remap page the concept of "Remapping" is described. Whilst it is expensive to set up (2 64-bit opcodes minimum) it provides a way to arbitrarily perform 1D, 2D and 3D "remapping" of up to 64 elements worth of LDs or STs. The usual interest in such re-mapping is, for example, in separating out 24-bit RGB channel data into separate contiguous registers. NEON covers this capability with its dedicated structure load/store instructions.

Remap easily covers this capability, and with dest elwidth overrides and saturation may do so with built-in conversion that would normally require additional width-extension, sign-extension and min/max Vectorised instructions as post-processing stages.

Thus we do not need to provide specialist LD/ST "Structure Packed" opcodes because the generic abstracted concept of "Remapping", when applied to LD/ST, will give that same capability, with far more flexibility.

It is worth noting that Pack/Unpack Modes of SVSTATE, which may be established through sv.setvl, are also an easy way to perform regular Structure Packing, at the vec2/vec3/vec4 granularity level. Beyond that, REMAP will need to be used.
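
As a rough illustration of the underlying index transformation (not the actual REMAP pseudocode, which is defined in remap), de-interleaving packed RGB into three contiguous register groups amounts to:

# sketch: 24-bit RGB structure packing as a 2D index remap (3 channels x N pixels)
def rgb_deinterleave(packed, npixels):
    channels = [[], [], []]
    for i in range(3 * npixels):
        channel, pixel = i % 3, i // 3          # remapped (transposed) element index
        channels[channel].append(packed[pixel * 3 + channel])
    return channels

packed = [10, 20, 30, 11, 21, 31, 12, 22, 32]   # R,G,B interleaved, 3 pixels
print(rgb_deinterleave(packed, 3))              # [[10,11,12], [20,21,22], [30,31,32]]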