Decoding variable-length instructions efficiently
We can use a method similar to carry look-ahead to make decoding N
instructions take time O(log N)
:
Using the encoding in: http://lists.libre-soc.org/pipermail/libre-soc-dev/2020-November/001282.html and demo
See following code on Compiler Explorer:
#include <stddef.h>
#include <stdint.h>
#include <assert.h>
enum Mode {
Standard,
Compressed,
StandardOnceThenCompressed,
};
struct SizeAndMode {
size_t size; // size in bytes; always even
Mode mode;
};
struct RangeSummary {
// SizeAndMode::mode fields are the new mode after executing
// the instructions summarized by this RangeSummary
SizeAndMode mode_was_standard;
SizeAndMode mode_was_compressed;
SizeAndMode mode_was_standard_once_then_compressed;
};
// predecodes a single uint16_t, treating it as if it were the first
// uint16_t in an instruction
RangeSummary predecode_one(uint16_t nth, uint16_t n_plus_1th) {
uint16_t po_field = nth >> 10;
uint16_t scm_field = nth & 0x1;
uint16_t scmt_field = nth >> 15;
uint16_t xo_field = n_plus_1th >> 1;
SizeAndMode mode_was_standard = {
.size = 4, // 32-bit
.mode = Standard,
};
SizeAndMode mode_was_standard_once_then_compressed = {
.size = 4, // 32-bit
.mode = Compressed,
};
switch(po_field) {
case 0: // 32/48-bits
// check for bit set by "Support Processor Attention" instruction
if((xo_field & 256) == 0) {
mode_was_standard.size = 6; // 48-bit
mode_was_standard_once_then_compressed.size = 6;
} else {
// SVP32 or "Support Processor Attention"
// leave sizes as 32-bit
}
break;
case 1: // PowerISA v3.1 prefix & SVP64
mode_was_standard.size = 8; // 64-bit
mode_was_standard_once_then_compressed.size = 8;
break;
case 5:
mode_was_standard.size = 2; // 16-bit
mode_was_standard_once_then_compressed.size = 2;
if(scm_field != 0) // swap to compressed mode
mode_was_standard.mode = Compressed;
break;
default: // standard PowerISA instructions
// leave sizes as 32-bit
break;
}
SizeAndMode mode_was_compressed = {
.size = 2, // 16-bit
.mode = Compressed,
};
if(scm_field) { // swap to standard mode
if(scmt_field) // swap to standard mode temporarily
mode_was_compressed.mode = StandardOnceThenCompressed;
else
mode_was_compressed.mode = Standard;
} else
mode_was_compressed.mode = Compressed;
return RangeSummary{
.mode_was_standard = mode_was_standard,
.mode_was_compressed = mode_was_compressed,
.mode_was_standard_once_then_compressed =
mode_was_standard_once_then_compressed,
};
}
SizeAndMode propagate_through_range(SizeAndMode in,
RangeSummary range_summary) {
SizeAndMode picked;
switch(in.mode) {
case Standard:
picked = range_summary.mode_was_standard;
break;
case Compressed:
picked = range_summary.mode_was_compressed;
break;
case StandardOnceThenCompressed:
picked = range_summary.mode_was_standard_once_then_compressed;
break;
}
return SizeAndMode{
.size = in.size + picked.size,
.mode = picked.mode,
};
}
RangeSummary merge(RangeSummary first, RangeSummary second) {
SizeAndMode s = propagate_through_range(
first.mode_was_standard, second);
SizeAndMode c = propagate_through_range(
first.mode_was_compressed, second);
SizeAndMode sc = propagate_through_range(
first.mode_was_standard_once_then_compressed, second);
return RangeSummary{
.mode_was_standard = s,
.mode_was_compressed = c,
.mode_was_standard_once_then_compressed = sc,
};
}
enum Endian {
LittleEndian,
BigEndian,
};
const size_t INSTRUCTION_MEMORY_SIZE = 0x1000000; // just for demo purposes
uint8_t instruction_memory[INSTRUCTION_MEMORY_SIZE];
// get the uint16_t corresponding to the instruction indicated by the PC
uint16_t get_u16(Endian endian, size_t pc) {
assert((pc & 1) == 0); // pc must be aligned to 2 bytes
uint8_t lsb_byte;
uint8_t msb_byte;
switch(endian) {
case LittleEndian:
msb_byte = instruction_memory[(pc ^ 2) + 1];
lsb_byte = instruction_memory[pc ^ 2];
break;
case BigEndian:
msb_byte = instruction_memory[pc];
lsb_byte = instruction_memory[pc + 1];
break;
}
return (uint16_t)lsb_byte | ((uint16_t)msb_byte << 8);
}
struct PredecodedInstruction {
Mode mode; // mode for this instruction
bool is_start; // true if this is the initial uint16_t in an instruction
};
// decode width in uint16_t units
// can replace with whatever value you like
const size_t DECODE_WIDTH = 8;
void predecode_all(Endian endian, size_t pc, Mode starting_mode,
PredecodedInstruction outputs[DECODE_WIDTH]) {
RangeSummary range_summaries[DECODE_WIDTH];
for(size_t i = 0; i < DECODE_WIDTH; i++) {
uint16_t nth = get_u16(endian, pc + i * 2);
uint16_t n_plus_1th = get_u16(endian, pc + (i + 1) * 2);
range_summaries[i] = predecode_one(nth, n_plus_1th);
}
// use a parallel prefix-sum algorithm,
// using `merge()` as the sum's operation.
// use a simple recursive algorithm
for(size_t step = 1; step < DECODE_WIDTH; step *= 2) {
// outer loop loops ceil(log2(DECODE_WIDTH)) times
RangeSummary temp[DECODE_WIDTH];
for(size_t i = 0; i < DECODE_WIDTH; i++) {
if(i < step)
temp[i] = range_summaries[i];
else
temp[i] = merge(range_summaries[i - step],
range_summaries[i]);
}
for(size_t i = 0; i < DECODE_WIDTH; i++) {
range_summaries[i] = temp[i];
}
}
for(size_t i = 0; i < DECODE_WIDTH; i++) {
SizeAndMode size_and_mode{
.size = 0,
.mode = starting_mode,
};
if(i > 0) {
size_and_mode = propagate_through_range(size_and_mode,
range_summaries[i - 1]);
}
outputs[i] = PredecodedInstruction{
.mode = size_and_mode.mode,
.is_start = (size_and_mode.size == pc + i * 2),
};
}
}