svstep

SVL-Form

  • svstep RT,RA,SVi,vf (Rc=0)
  • svstep. RT,RA,SVi,vf (Rc=1)

Pseudo-code:

# svstep: SVi[3:4] selects between recording the pack/unpack bits in
# SVSTATE and querying SVSTATE_NEXT.  RT receives a zero-extended result
# in both cases.
if SVi[3:4] != 0b11 then
    # ordinary request: the value produced by SVSTATE_NEXT is returned
    nextstep <- SVSTATE_NEXT(SVi, vf)
    RT <- [0]*57 || nextstep
else
    # SVi[3:4] = 0b11: SVi[5:6] are stored as the pack and unpack bits
    SVSTATE[53:54] <- SVi[5:6]
    # hand back the two bits just written
    RT <- [0]*62 || SVSTATE[53:54]

Special Registers Altered:

CR0                     (if Rc=1)

setvl

SVL-Form

  • setvl RT,RA,SVi,vf,vs,ms (Rc=0)
  • setvl. RT,RA,SVi,vf,vs,ms (Rc=1)

Pseudo-code:

# setvl: compute MAXVL (MVL) and VL, store both in SVSTATE and, when _RT is
# non-zero, also return VL in that GPR.
# overflow records that a requested VL had to be clamped.
# NOTE(review): overflow is not read again in this pseudo-code; presumably
# it is reported through CR0 when Rc=1 - TODO confirm.
overflow <- 0b0
# the immediate holds the requested length minus one
VLimm <- SVi + 1
# set or get MVL
if ms = 1 then MVL <- VLimm[0:6]
else           MVL <- SVSTATE[0:6]
# set or get VL
# vs=0 keeps the current VL.  Otherwise the source is, in priority order:
# (RA) when _RA is non-zero, the immediate when _RT is zero, and CTR in
# the remaining case.  (RA) and CTR saturate at 0b1111111 and flag overflow.
if vs = 0                then VL <- SVSTATE[7:13]
else if _RA != 0         then
    if (RA) >u 0b1111111 then
        VL <- 0b1111111
        overflow <- 0b1
    else                      VL <- (RA)[57:63]
else if _RT = 0          then VL <- VLimm[0:6]
else if CTR >u 0b1111111 then
    VL <- 0b1111111
    overflow <- 0b1
else                          VL <- CTR[57:63]
# limit VL to within MVL
if VL >u MVL then
    overflow <- 0b1
    VL <- MVL
SVSTATE[0:6] <- MVL
SVSTATE[7:13] <- VL
# VL is only written back to a GPR when _RT is non-zero
if _RT != 0 then
   GPR(_RT) <- [0]*57 || VL
# MAXVL is a static "state-reset".
if ms = 1 then
    SVSTATE[63] <- vf   # set Vertical-First mode
    SVSTATE[62] <- 0b0  # clear persist bit

Special Registers Altered:

CR0                     (if Rc=1)

svremap

SVRM-Form

  • svremap SVme,mi0,mi1,mi2,mo0,mo1,pst

Pseudo-code:

# svremap: load the REMAP control fields of SVSTATE from the operands.
# SVSHAPE0-3 selectors, one 2-bit index each, laid out consecutively for
# registers RA RB RC RT EA/FRS
SVSTATE[32:41] <- mi0 || mi1 || mi2 || mo0 || mo1
# persistence bit (the REMAP applies to more than one instruction)
SVSTATE[62] <- pst
# one enable bit each for RA RB RC RT EA/FRS
SVSTATE[42:46] <- SVme

Special Registers Altered:

None

svshape

SVM-Form

  • svshape SVxd,SVyd,SVzd,SVrm,vf

Pseudo-code:

# svshape preamble: reset the working variables, SVSTATE[0:31] and all four
# SVSHAPEs before the schedule selected by SVrm is built.
# VL is calculated along the way and stored in SVSTATE at the very end
vlen <- [0] * 7
itercount[0:6] <- [0] * 7
mscale[0:5] <- 0b000001 # MAXVL scaling factor, starts at 1
# every SVSHAPE starts out cleared
SVSHAPE0[0:31] <- [0] * 32
SVSHAPE1[0:31] <- [0] * 32
SVSHAPE2[0:31] <- [0] * 32
SVSHAPE3[0:31] <- [0] * 32
SVSTATE[0:31] <- [0] * 32
# REMAP is left untouched while the "persistence" bit is set
if (SVSTATE[62] = 0b0) then
    SVSTATE[32:41] <- [0] * 10   # the five 2-bit SVSHAPE indices
    SVSTATE[42:46] <- 0b00000    # the five enable bits
    SVSTATE[62] <- 0b0
    SVSTATE[63] <- 0b0
# set schedule up for multiply
# SVrm=0b0000: all four SVSHAPEs are derived from one template; SVSHAPE1
# and SVSHAPE2 are then given their own permute/skip settings.
if (SVrm = 0b0000) then
    # VL in Matrix Multiply is xd*yd*zd
    # each dimension is encoded minus one; widening to 7 bits first means
    # the +1 cannot wrap
    xd <- (0b00 || SVxd) + 1
    yd <- (0b00 || SVyd) + 1
    zd <- (0b00 || SVzd) + 1
    n <- xd * yd * zd
    # n[14:20] is taken as the low 7 bits, assuming a 21-bit product
    # - TODO confirm
    vlen[0:6] <- n[14:20]
    # set up template in SVSHAPE0, then copy to 1-3
    SVSHAPE0[0:5] <- (0b0 || SVxd)   # xdim
    SVSHAPE0[6:11] <- (0b0 || SVyd)   # ydim
    SVSHAPE0[12:17] <- (0b0 || SVzd)   # zdim
    # NOTE(review): the skip comments in this block disagree with each
    # other: 0b11 is labelled "skip z" here and "skip y" for SVSHAPE2,
    # while 0b01 is also labelled "skip z" for SVSHAPE1 - confirm against
    # the SVSHAPE skip encoding.
    SVSHAPE0[28:29] <- 0b11           # skip z
    # copy
    SVSHAPE1[0:31] <- SVSHAPE0[0:31]
    SVSHAPE2[0:31] <- SVSHAPE0[0:31]
    SVSHAPE3[0:31] <- SVSHAPE0[0:31]
    # set up FRA
    SVSHAPE1[18:20] <- 0b001          # permute x,z,y
    SVSHAPE1[28:29] <- 0b01           # skip z
    # FRC
    SVSHAPE2[18:20] <- 0b001          # permute x,z,y
    SVSHAPE2[28:29] <- 0b11           # skip y
# set schedule up for FFT butterfly
# SVrm=0b0001: SVSHAPE0-2 are configured; SVSHAPE3 is not written here.
if (SVrm = 0b0001) then
    # calculate O(N log2 N)
    # n counts consecutive 1 bits of SVxd, starting at SVxd[4] and moving
    # towards SVxd[0], stopping at the first 0 bit (at most 5)
    n <- [0] * 3
    do while n < 5
       if SVxd[4-n] = 0 then
           leave
       n <- n + 1
    # VL = (SVxd+1) * n
    # NOTE(review): other products in this pseudo-code are sliced as if
    # the result width is the sum of the operand widths (e.g. n[14:20] of
    # a 7x7x7-bit product); under that reading this 6-bit by 3-bit product
    # is 9 bits and n[1:7] is not its low 7 bits - TODO confirm the slice.
    n <- ((0b0 || SVxd) + 1) * n
    vlen[0:6] <- n[1:7]
    # set up template in SVSHAPE0, then copy to 1-2
    # for FRA and FRT
    SVSHAPE0[0:5] <- (0b0 || SVxd)   # xdim
    SVSHAPE0[12:17] <- (0b0 || SVzd)   # zdim - "striding" (2D FFT)
    # MAXVL is later computed as vlen * mscale
    mscale <- (0b0 || SVzd) + 1
    SVSHAPE0[30:31] <- 0b01          # Butterfly mode
    # copy
    SVSHAPE1[0:31] <- SVSHAPE0[0:31]
    SVSHAPE2[0:31] <- SVSHAPE0[0:31]
    # set up FRB and FRS
    SVSHAPE1[28:29] <- 0b01           # j+halfstep schedule
    # FRC (coefficients)
    SVSHAPE2[28:29] <- 0b10           # k schedule
# set schedule up for (i)DCT Inner butterfly
# SVrm Mode 4 (Mode 12 for iDCT) is for on-the-fly (Vertical-First Mode)
# SVSHAPE0-2 are configured; SVSHAPE3 is not written in these two modes.
if ((SVrm = 0b0100) |
    (SVrm = 0b1100)) then
    # calculate O(N log2 N)
    # n counts consecutive 1 bits of SVxd, starting at SVxd[4] and moving
    # towards SVxd[0], stopping at the first 0 bit (at most 5)
    n <- [0] * 3
    do while n < 5
       if SVxd[4-n] = 0 then
           leave
       n <- n + 1
    # VL = (SVxd+1) * n
    # NOTE(review): other products in this pseudo-code are sliced as if
    # the result width is the sum of the operand widths (e.g. n[14:20] of
    # a 7x7x7-bit product); under that reading this 6-bit by 3-bit product
    # is 9 bits and n[1:7] is not its low 7 bits - TODO confirm the slice.
    n <- ((0b0 || SVxd) + 1) * n
    vlen[0:6] <- n[1:7]
    # set up template in SVSHAPE0, then copy to 1-2
    # set up FRB and FRS
    SVSHAPE0[0:5] <- (0b0 || SVxd)   # xdim
    SVSHAPE0[12:17] <- (0b0 || SVzd)   # zdim - "striding" (2D DCT)
    # MAXVL is later computed as vlen * mscale
    mscale <- (0b0 || SVzd) + 1
    if (SVrm = 0b1100) then
        SVSHAPE0[30:31] <- 0b11          # iDCT mode
        SVSHAPE0[18:20] <- 0b011         # iDCT Inner Butterfly sub-mode
    else
        SVSHAPE0[30:31] <- 0b01          # DCT mode
        SVSHAPE0[18:20] <- 0b001         # DCT Inner Butterfly sub-mode
        SVSHAPE0[21:23] <- 0b001         # "inverse" on outer loop
    SVSHAPE0[6:11] <- 0b000011       # (i)DCT Inner Butterfly mode 4
    # copy
    # (the former SVSHAPE3 copy and "size schedule" assignments were guarded
    # by SVrm != 0b0100 & SVrm != 0b1100, which can never hold inside this
    # block, so they have been removed as unreachable)
    SVSHAPE1[0:31] <- SVSHAPE0[0:31]
    SVSHAPE2[0:31] <- SVSHAPE0[0:31]
    # for FRA and FRT
    SVSHAPE0[28:29] <- 0b01           # j+halfstep schedule
    # for cos coefficient
    SVSHAPE2[28:29] <- 0b10           # ci (k for mode 4) schedule
    SVSHAPE2[12:17] <- 0b000000       # reset costable "striding" to 1
# set schedule up for (i)DCT Outer butterfly
# SVrm=0b0011 (DCT) or 0b1011 (iDCT): SVSHAPE0-2 are configured.
if (SVrm = 0b0011) | (SVrm = 0b1011) then
    # calculate O(N log2 N) number of outer butterfly overlapping adds
    vlen[0:6] <- [0] * 7
    n <- 0b000
    size <- 0b0000001
    # itercount starts at (SVxd+1) shifted right by one bit
    itercount[0:6] <- (0b00 || SVxd) + 0b0000001
    itercount[0:6] <- (0b0 || itercount[0:5])
    # one pass per consecutive 1 bit of SVxd (from SVxd[4] towards SVxd[0]):
    # add (itercount-1)*size to VL, then double size and halve itercount
    do while n < 5
       if SVxd[4-n] = 0 then
           leave
       n <- n + 1
       count <- (itercount - 0b0000001) * size
       # count[7:13] is taken as the low 7 bits, assuming a 14-bit product
       # - TODO confirm
       vlen[0:6] <- vlen + count[7:13]
       size[0:6] <- (size[1:6] || 0b0)
       itercount[0:6] <- (0b0 || itercount[0:5])
    # set up template in SVSHAPE0, then copy to 1-2
    # set up FRB and FRS
    SVSHAPE0[0:5] <- (0b0 || SVxd)   # xdim
    SVSHAPE0[12:17] <- (0b0 || SVzd)   # zdim - "striding" (2D DCT)
    # MAXVL is later computed as vlen * mscale
    mscale <- (0b0 || SVzd) + 1
    if (SVrm = 0b1011) then
        SVSHAPE0[30:31] <- 0b11      # iDCT mode
        SVSHAPE0[18:20] <- 0b011     # iDCT Outer Butterfly sub-mode
        SVSHAPE0[21:23] <- 0b101     # "inverse" on outer and inner loop
    else
        SVSHAPE0[30:31] <- 0b01      # DCT mode
        SVSHAPE0[18:20] <- 0b100     # DCT Outer Butterfly sub-mode
    SVSHAPE0[6:11] <- 0b000010       # DCT Butterfly mode
    # copy
    SVSHAPE1[0:31] <- SVSHAPE0[0:31] # j+halfstep schedule
    SVSHAPE2[0:31] <- SVSHAPE0[0:31] # costable coefficients
    # for FRA and FRT
    SVSHAPE1[28:29] <- 0b01           # j+halfstep schedule
    # reset costable "striding" to 1
    SVSHAPE2[12:17] <- 0b000000
# set schedule up for DCT COS table generation
# SVrm=0b0101 or 0b1101: SVSHAPE0-2 are configured.
if (SVrm = 0b0101) | (SVrm = 0b1101) then
    # calculate O(N log2 N)
    vlen[0:6] <- [0] * 7
    # itercount starts at (SVxd+1) shifted right by one bit
    itercount[0:6] <- (0b00 || SVxd) + 0b0000001
    itercount[0:6] <- (0b0 || itercount[0:5])
    n <- [0] * 3
    # one pass per consecutive 1 bit of SVxd (from SVxd[4] towards SVxd[0]):
    # add itercount to VL, then halve itercount
    do while n < 5
       if SVxd[4-n] = 0 then
           leave
       n <- n + 1
       vlen[0:6] <- vlen + itercount
       itercount[0:6] <- (0b0 || itercount[0:5])
    # set up template in SVSHAPE0, then copy to 1-2
    # set up FRB and FRS
    SVSHAPE0[0:5] <- (0b0 || SVxd)   # xdim
    SVSHAPE0[12:17] <- (0b0 || SVzd)   # zdim - "striding" (2D DCT)
    # MAXVL is later computed as vlen * mscale
    mscale <- (0b0 || SVzd) + 1
    SVSHAPE0[30:31] <- 0b01          # DCT/FFT mode
    SVSHAPE0[6:11] <- 0b000100       # DCT Inner Butterfly COS-gen mode
    # only SVrm=0b0101 sets the "inverse" field; 0b1101 leaves it zero
    if (SVrm = 0b0101) then
        SVSHAPE0[21:23] <- 0b001     # "inverse" on outer loop for DCT
    # copy
    SVSHAPE1[0:31] <- SVSHAPE0[0:31]
    SVSHAPE2[0:31] <- SVSHAPE0[0:31]
    # for cos coefficient
    SVSHAPE1[28:29] <- 0b10           # ci schedule
    SVSHAPE2[28:29] <- 0b11           # size schedule
# set schedule up for iDCT / DCT inverse of half-swapped ordering
# SVrm=0b0110, 0b1110 or 0b1111: only SVSHAPE0 is configured here.
if (SVrm = 0b0110) | (SVrm = 0b1110) | (SVrm = 0b1111) then
    # VL is simply SVxd+1
    vlen[0:6] <- (0b00 || SVxd) + 0b0000001
    # set up template in SVSHAPE0
    SVSHAPE0[0:5] <- (0b0 || SVxd)   # xdim
    SVSHAPE0[12:17] <- (0b0 || SVzd)   # zdim - "striding" (2D DCT)
    # MAXVL is later computed as vlen * mscale
    mscale <- (0b0 || SVzd) + 1
    if (SVrm = 0b1110) then
        SVSHAPE0[18:20] <- 0b001     # DCT opposite half-swap
    # NOTE(review): the mode labels below differ from the other schedules
    # in this pseudo-code, where 0b01 is labelled "DCT mode"/"Butterfly
    # mode" and 0b11 "iDCT mode" - confirm which naming is intended.
    if (SVrm = 0b1111) then
        SVSHAPE0[30:31] <- 0b01          # FFT mode
    else
        SVSHAPE0[30:31] <- 0b11          # DCT mode
    SVSHAPE0[6:11] <- 0b000101       # DCT "half-swap" mode
# set schedule up for parallel reduction or prefix-sum
# SVrm=0b0111: SVyd=2 selects scan/prefix-sum, anything else selects
# parallel reduction.  VL is the operation count, found by running the
# loop structure of the algorithm and counting instead of operating.
if (SVrm = 0b0111) then
    # is scan/prefix-sum
    is_scan <- SVyd = 2
    # calculate the total number of operations (brute-force)
    vlen[0:6] <- [0] * 7
    # number of elements: SVxd+1
    itercount[0:6] <- (0b00 || SVxd) + 0b0000001
    if is_scan then
        # prefix sum algorithm with operations replaced with
        # incrementing vlen
        dist <- 1
        # (vlen was already zeroed above; this repeats that reset)
        vlen[0:6] <- 0
        # first phase: dist = 1, 2, 4, ... while below itercount; count
        # every index i = 2*dist-1, advancing by 2*dist
        do while dist <u itercount
            start <- dist * 2 - 1
            step <- dist * 2
            i <- start
            do while i <u itercount
                vlen[0:6] <- vlen[0:6] + 1
                i <- i + step
            dist <- dist * 2
        # second phase: walk dist back down by halving until it reaches
        # zero; count every index i = 3*dist-1, advancing by 2*dist
        dist <- dist / 2
        do while dist != 0
            i <- dist * 3 - 1
            do while i <u itercount
                vlen[0:6] <- vlen[0:6] + 1
                i <- i + dist * 2
            dist <- dist / 2
    else
        # parallel reduction: step doubles on each outer pass; the inner
        # loop counts one operation for each j (advancing by 2*step) for
        # which j+step is still below itercount
        step <- 0b0000001
        i <- 0b0000000
        do while step <u itercount
            newstep <- step[1:6] || 0b0
            j[0:6] <- 0b0000000
            do while (j+step <u itercount)
                j <- j + newstep
                i <- i + 1
            step <- newstep
        # VL in Parallel-Reduce is the number of operations
        vlen[0:6] <- i
    # set up template in SVSHAPE0, then copy to 1. only 2 needed
    SVSHAPE0[0:5] <- (0b0 || SVxd)   # xdim
    SVSHAPE0[12:17] <- (0b0 || SVzd)   # zdim - "striding" (2D DCT)
    # MAXVL is later computed as vlen * mscale
    mscale <- (0b0 || SVzd) + 1
    SVSHAPE0[30:31] <- 0b10          # parallel reduce/prefix submode
    # copy
    SVSHAPE1[0:31] <- SVSHAPE0[0:31]
    # set up submodes: parallel or prefix
    # the parallel-reduce values are written first and then overridden
    # when is_scan is set
    SVSHAPE0[28:29] <- 0b00   # left operand
    SVSHAPE1[28:29] <- 0b01   # right operand
    if is_scan then
        SVSHAPE0[28:29] <- 0b10   # left operand
        SVSHAPE1[28:29] <- 0b11   # right operand
# commit the results: VL, MAXVL and Vertical-First all go into SVSTATE.
# MAXVL is VL multiplied by the scale factor mscale; bits 6:12 of the
# 13-bit product are kept.
scaled[0:12] <- vlen * mscale
SVSTATE[7:13] <- vlen          # VL
SVSTATE[0:6] <- scaled[6:12]   # MAXVL
SVSTATE[63] <- vf              # Vertical-First

Special Registers Altered:

None

svindex

SVI-Form

  • svindex SVG,rmm,SVd,ew,SVyx,mm,sk

Pseudo-code:

# based on nearest MAXVL compute other dimension
# svindex: build one "indexed" shape from the operands, install it in one
# or more of SVSHAPE0-3 and update the REMAP fields of SVSTATE to match.
MVL <- SVSTATE[0:6]
d <- [0] * 6
dim <- SVd+1
# find the smallest d for which d*dim is no longer below MVL
# NOTE(review): assumes SVd+1 does not wrap to zero when SVd is all ones
# (d*dim would then stay at zero and the loop could not end for a
# non-zero MVL) - TODO confirm the operand width of dim.
do while d*dim <u ([0]*4 || MVL)
   d <- d + 1
# set up template, then copy once location identified
shape <- [0]*32
shape[30:31] <- 0b00            # mode
# NOTE(review): the two branches test sk with opposite polarity when
# choosing ydim (sk = 0 below, sk = 1 in the else branch) - confirm this
# asymmetry is intended.
if SVyx = 0 then
    shape[18:20] <- 0b110       # indexed xd/yd
    shape[0:5] <- (0b0 || SVd)  # xdim
    if sk = 0 then shape[6:11] <- 0 # ydim
    else           shape[6:11] <- 0b111111 # ydim max
else
    shape[18:20] <- 0b111       # indexed yd/xd
    if sk = 1 then shape[6:11] <- 0 # ydim
    else           shape[6:11] <- d-1 # ydim max
    shape[0:5] <- (0b0 || SVd) # ydim
shape[12:17] <- (0b0 || SVG)        # SVGPR
shape[28:29] <- ew                  # element-width override
shape[21] <- sk                     # skip 1st dimension
# select the mode for updating SVSHAPEs
SVSTATE[62] <- mm # set or clear persistence
if mm = 0 then
    # clear out all SVSHAPEs first
    SVSHAPE0[0:31] <- [0] * 32
    SVSHAPE1[0:31] <- [0] * 32
    SVSHAPE2[0:31] <- [0] * 32
    SVSHAPE3[0:31] <- [0] * 32
    SVSTATE[32:41] <- [0] * 10 # clear REMAP.mi/o
    SVSTATE[42:46] <- rmm # rmm exactly REMAP.SVme
    idx <- 0
    # scan rmm from rmm[4] towards rmm[0]; each set bit receives the next
    # SVSHAPE index (0,1,2,3,0), and that SVSHAPE is loaded with shape
    for bit = 0 to 4
        if rmm[4-bit] then
            # activate requested shape
            if idx = 0 then SVSHAPE0 <- shape
            if idx = 1 then SVSHAPE1 <- shape
            if idx = 2 then SVSHAPE2 <- shape
            if idx = 3 then SVSHAPE3 <- shape
            # record which SVSHAPE this slot uses
            SVSTATE[bit*2+32:bit*2+33] <- idx
            # increment shape index, modulo 4
            if idx = 3 then idx <- 0
            else            idx <- idx + 1
else
    # refined SVSHAPE/REMAP update mode
    # rmm is split into a slot number (rmm[0:2]) and an SVSHAPE index
    # (rmm[3:4]); only that one SVSHAPE and that one slot are updated
    # NOTE(review): bit is 3 bits wide; values above 4 would address
    # SVSTATE bits outside 32:41 - presumably reserved, TODO confirm.
    bit <- rmm[0:2]
    idx <- rmm[3:4]
    if idx = 0 then SVSHAPE0 <- shape
    if idx = 1 then SVSHAPE1 <- shape
    if idx = 2 then SVSHAPE2 <- shape
    if idx = 3 then SVSHAPE3 <- shape
    SVSTATE[bit*2+32:bit*2+33] <- idx
    # set the matching enable bit
    SVSTATE[46-bit] <- 1

Special Registers Altered:

None

svshape2

SVM2-Form

  • svshape2 SVo,SVyx,rmm,SVd,sk,mm

Pseudo-code:

# based on nearest MAXVL compute other dimension
# svshape2: build one shape carrying an offset (SVo) from the operands,
# install it in one or more of SVSHAPE0-3 and update the REMAP fields of
# SVSTATE to match.
MVL <- SVSTATE[0:6]
d <- [0] * 6
dim <- SVd+1
# find the smallest d for which d*dim is no longer below MVL
# NOTE(review): assumes SVd+1 does not wrap to zero when SVd is all ones
# (d*dim would then stay at zero and the loop could not end for a
# non-zero MVL) - TODO confirm the operand width of dim.
do while d*dim <u ([0]*4 || MVL)
   d <- d + 1
# set up template, then copy once location identified
shape <- [0]*32
shape[30:31] <- 0b00            # mode
shape[0:5] <- (0b0 || SVd)      # x/ydim
# NOTE(review): the two branches test sk with opposite polarity when
# choosing ydim (sk = 0 below, sk = 1 in the else branch) - confirm this
# asymmetry is intended.
if SVyx = 0 then
    shape[18:20] <- 0b000       # ordering xd/yd(/zd)
    if sk = 0 then shape[6:11] <- 0 # ydim
    else           shape[6:11] <- 0b111111 # ydim max
else
    shape[18:20] <- 0b010       # ordering yd/xd(/zd)
    if sk = 1 then shape[6:11] <- 0 # ydim
    else           shape[6:11] <- d-1 # ydim max
# offset (the prime purpose of this instruction)
shape[24:27] <- SVo         # offset
if sk = 1 then shape[28:29] <- 0b01 # skip 1st dimension
else           shape[28:29] <- 0b00 # no skipping
# select the mode for updating SVSHAPEs
SVSTATE[62] <- mm # set or clear persistence
if mm = 0 then
    # clear out all SVSHAPEs first
    SVSHAPE0[0:31] <- [0] * 32
    SVSHAPE1[0:31] <- [0] * 32
    SVSHAPE2[0:31] <- [0] * 32
    SVSHAPE3[0:31] <- [0] * 32
    SVSTATE[32:41] <- [0] * 10 # clear REMAP.mi/o
    SVSTATE[42:46] <- rmm # rmm exactly REMAP.SVme
    idx <- 0
    # scan rmm from rmm[4] towards rmm[0]; each set bit receives the next
    # SVSHAPE index (0,1,2,3,0), and that SVSHAPE is loaded with shape
    for bit = 0 to 4
        if rmm[4-bit] then
            # activate requested shape
            if idx = 0 then SVSHAPE0 <- shape
            if idx = 1 then SVSHAPE1 <- shape
            if idx = 2 then SVSHAPE2 <- shape
            if idx = 3 then SVSHAPE3 <- shape
            # record which SVSHAPE this slot uses
            SVSTATE[bit*2+32:bit*2+33] <- idx
            # increment shape index, modulo 4
            if idx = 3 then idx <- 0
            else            idx <- idx + 1
else
    # refined SVSHAPE/REMAP update mode
    # rmm is split into a slot number (rmm[0:2]) and an SVSHAPE index
    # (rmm[3:4]); only that one SVSHAPE and that one slot are updated
    # NOTE(review): bit is 3 bits wide; values above 4 would address
    # SVSTATE bits outside 32:41 - presumably reserved, TODO confirm.
    bit <- rmm[0:2]
    idx <- rmm[3:4]
    if idx = 0 then SVSHAPE0 <- shape
    if idx = 1 then SVSHAPE1 <- shape
    if idx = 2 then SVSHAPE2 <- shape
    if idx = 3 then SVSHAPE3 <- shape
    SVSTATE[bit*2+32:bit*2+33] <- idx
    # set the matching enable bit
    SVSTATE[46-bit] <- 1

Special Registers Altered:

None