Anthropic Performance Challenge

Kernel Code

// Anthropic Performance Challenge
// Kernel Builder - JavaScript Version
//
// Edit the buildKernel function to optimize the cycle count.
// Click "Run" to test your changes.

/**
 * Emit the fully unrolled kernel into the builder, one operation per
 * `kb.add` call, and hand back the resulting instruction list.
 *
 * Sequence: allocate three temporaries, allocate one scratch slot per
 * header field and fill each from memory words 0..6, materialize the
 * constants 0/1/2, pause, then for every (round, item) pair emit the
 * load / XOR / hash / next-index / wrap / store sequence, and pause again.
 *
 * @param {object} kb - Kernel builder. This function uses its
 *   `allocScratch`, `scratchConst`, `add`, `scratch` and `instrs` members.
 * @param {number} forestHeight - Not read by this function.
 * @param {number} nNodes - Not read by this function; the wrap test uses
 *   the `n_nodes` scratch slot instead.
 * @param {number} batchSize - Number of items emitted per round.
 * @param {number} rounds - Number of times the per-item sequence is repeated.
 * @returns {Array} The builder's instruction list (`kb.instrs`).
 */
function buildKernel(kb, forestHeight, nNodes, batchSize, rounds) {
    // Thin emitters: each call becomes exactly one kb.add(engine, [...operands]).
    const alu = (...slot) => kb.add("alu", slot);
    const load = (...slot) => kb.add("load", slot);
    const store = (...slot) => kb.add("store", slot);
    const flow = (...slot) => kb.add("flow", slot);
    // Scratch slot lookup by name, resolved at the point of use.
    const reg = (name) => kb.scratch[name];

    const scratchA = kb.allocScratch("tmp1");
    const scratchB = kb.allocScratch("tmp2");
    const stepReg = kb.allocScratch("tmp3");

    // Header fields: field k is read from memory word k.
    const headerFields = [
        "rounds",
        "n_nodes",
        "batch_size",
        "forest_height",
        "forest_values_p",
        "inp_indices_p",
        "inp_values_p",
    ];
    headerFields.forEach((field) => {
        kb.allocScratch(field, 1);
    });
    headerFields.forEach((field, wordIndex) => {
        load("const", scratchA, wordIndex);
        load("load", reg(field), scratchA);
    });

    const zero = kb.scratchConst(0);
    const one = kb.scratchConst(1);
    const two = kb.scratchConst(2);

    flow("pause");

    // Working registers for a single item.
    const idxReg = kb.allocScratch("tmp_idx");
    const valReg = kb.allocScratch("tmp_val");
    const nodeReg = kb.allocScratch("tmp_node_val");
    const addrReg = kb.allocScratch("tmp_addr");

    // Emits the complete instruction sequence for one batch item.
    const emitItem = (item) => {
        const itemOffset = kb.scratchConst(item);

        // Fetch the item's current index and value.
        alu("+", addrReg, reg("inp_indices_p"), itemOffset);
        load("load", idxReg, addrReg);
        alu("+", addrReg, reg("inp_values_p"), itemOffset);
        load("load", valReg, addrReg);

        // Fetch the tree node addressed by the index and XOR it into the value.
        alu("+", addrReg, reg("forest_values_p"), idxReg);
        load("load", nodeReg, addrReg);
        alu("^", valReg, valReg, nodeReg);

        // Hash: three ALU ops per entry of Simulator.HASH_STAGES.
        for (const stage of Simulator.HASH_STAGES) {
            const [firstOp, firstImm, combineOp, secondOp, secondImm] = stage;
            alu(firstOp, scratchA, valReg, kb.scratchConst(firstImm));
            alu(secondOp, scratchB, valReg, kb.scratchConst(secondImm));
            alu(combineOp, valReg, scratchA, scratchB);
        }

        // Next index: idx * 2 + step, where step is selected between the
        // constants 1 and 2 from the (val % 2 == 0) flag.
        // NOTE(review): presumably "select" takes the first source when the
        // flag is non-zero - confirm against the simulator.
        alu("%", scratchA, valReg, two);
        alu("==", scratchA, scratchA, zero);
        flow("select", stepReg, scratchA, one, two);
        alu("*", idxReg, idxReg, two);
        alu("+", idxReg, idxReg, stepReg);

        // Wrap if past tree bounds: select between idx and 0 on (idx < n_nodes).
        alu("<", scratchA, idxReg, reg("n_nodes"));
        flow("select", idxReg, scratchA, idxReg, zero);

        // Write the updated index and value back to the item's slots.
        alu("+", addrReg, reg("inp_indices_p"), itemOffset);
        store("store", addrReg, idxReg);
        alu("+", addrReg, reg("inp_values_p"), itemOffset);
        store("store", addrReg, valReg);
    };

    // Main loop - fully unrolled over rounds and batch items.
    for (let r = 0; r < rounds; r += 1) {
        for (let item = 0; item < batchSize; item += 1) {
            emitItem(item);
        }
    }

    flow("pause");
    return kb.instrs;
}

Anthropic Performance Challenge

Kernel Code

Benchmarks

SLOT LIMITS

Output

Quick Start

The Goal

The Machine

Key Insight

Operations

Packing Operations

Vectors (SIMD)

Tips