Frontend Development 16 min read

Using WebGPU Compute Pipeline for Large‑Scale Bird Flocking Simulation

This article explains how to leverage WebGPU's compute pipeline and WGSL shaders to simulate the flocking behavior of tens of thousands of birds on the GPU, covering context creation, shader modules, pipeline setup, data buffers, and the rendering‑compute loop with full JavaScript code examples.

Rare Earth Juejin Tech Community
Rare Earth Juejin Tech Community
Rare Earth Juejin Tech Community
Using WebGPU Compute Pipeline for Large‑Scale Bird Flocking Simulation

The article introduces WebGPU as a high‑performance alternative to WebGL, highlighting its added compute pipeline which enables general‑purpose GPU (GPGPU) tasks such as large‑scale simulations.

It starts by showing how to obtain a GPU object, request an adapter, and create a logical device, then configures a WebGPU canvas context:

// Obtain the GPU entry point; undefined when the browser lacks WebGPU support.
const gpu = navigator.gpu;
if ( !gpu ) throw new Error( "GPU is not available" );

// Request a GPU adapter (handle to a physical GPU); this is asynchronous.
const adapter = await gpu.requestAdapter();
if ( !adapter ) throw new Error( "COULD NOT REQUEST GPU ADAPTER" );

// Request a logical device from the adapter; all GPU resources (buffers,
// pipelines, shader modules) are created through this object.
const device = await adapter.requestDevice();
if ( !device ) throw new Error( "COULD NOT REQUEST GPU DEVICE" );

// Create the WebGPU canvas context.
// NOTE(review): assumes a `canvas` element is in scope — declared outside this snippet.
canvas.width = window.innerWidth;
canvas.height = window.innerHeight;
const context = canvas.getContext( "webgpu" );
if ( !context ) throw new Error( "COULD NOT GET GPU CONTEXT" );

// Configure the context: bind the device, use the platform-preferred
// swap-chain format, and premultiplied alpha for compositing with the page.
const presentationFormat = gpu.getPreferredCanvasFormat();
context.configure( {
    device,
    format: presentationFormat,
    alphaMode: "premultiplied"
} );

Next, it explains the creation of WGSL shader modules. The vertex shader outputs position and color, while the fragment shader simply returns the interpolated color. Important WGSL annotations such as @builtin, @location, and @group are described.

// Vertex shader output. `color` is declared at location index 4, so the
// fragment shader retrieves it via @location(4).
struct VertexOutput {
    @builtin(position) position : vec4f,
    @location(4) color : vec4f,
}

// Locations 0-1 are per-instance attributes (position/velocity from the
// particle buffer bound as a vertex buffer); location 2 is the shared
// triangle vertex used by every instance.
@vertex
fn main(
@location(0) a_particlePos : vec2f,
@location(1) a_particleVel : vec2f,
@location(2) a_pos : vec2f,
) -> VertexOutput{
    // Heading angle from the velocity vector, then a 2D rotation of the
    // triangle vertex so the bird points along its direction of travel.
    let angle = -atan2(a_particleVel.x, a_particleVel.y);
    let pos = vec2(
    a_pos.x * cos(angle) - a_pos.y * sin(angle),
    a_pos.x * sin(angle) + a_pos.y * cos(angle)
    );
    var output : VertexOutput;
    // Translate the rotated vertex to the particle's position (z=0, w=1).
    output.position = vec4(pos + a_particlePos, 0.0, 1.0);
    // Decorative color derived from heading and velocity components.
    output.color = vec4(
    1.0 - sin(angle + 1.0) - a_particleVel.y,
    pos.x * 100.0 - a_particleVel.y + 0.1,
    a_particleVel.x + cos(angle + 0.5),
    1.0);
    return output;
}

// Fragment stage: pass the interpolated vertex color straight through.
@fragment
fn frag_main(@location(4) color : vec4f) -> @location(0) vec4f {
    return color;
}

The compute shader implements a classic flocking algorithm. Two particle buffers (A and B) store positions and velocities for each bird; each frame the shader reads from buffer A, computes new velocities based on three rules (cohesion, separation, alignment), writes results to buffer B, and the buffers are swapped each iteration.

// Per-bird state: position and velocity (clip-space coordinates).
struct Particle {
    pos : vec2f,
    vel : vec2f,
}

// Simulation tuning parameters, uploaded from JS as 7 consecutive f32s
// in exactly this declaration order.
struct SimParams {
    deltaT : f32,         // integration time step
    rule1Distance : f32,  // cohesion radius
    rule2Distance : f32,  // separation radius
    rule3Distance : f32,  // alignment radius
    rule1Scale : f32,     // cohesion strength
    rule2Scale : f32,     // separation strength
    rule3Scale : f32,     // alignment strength
}

// Runtime-sized array of particles; a runtime-sized array must be the
// last member of the struct. The element type <Particle> was lost in the
// original listing (HTML extraction stripped the angle brackets).
struct Particles {
    particles : array<Particle>,
}

// Bindings. The address spaces below were also stripped from the original
// listing: module-scope resource `var`s require an explicit address space,
// and the access mode must match the bind group layout (A is read-only
// input, B is the read-write output).
@binding(0) @group(0) var<uniform> params : SimParams;
@binding(1) @group(0) var<storage, read> particlesA : Particles;
@binding(2) @group(0) var<storage, read_write> particlesB : Particles;

@compute @workgroup_size(64)
fn main(
@builtin(global_invocation_id) GlobalInvocationID : vec3u
) {
    let total = arrayLength(&particlesA.particles);
    let index = GlobalInvocationID.x;
    // The dispatch is rounded up to a multiple of 64 invocations, so the
    // trailing invocations can fall past the end of the buffer; bail out.
    if (index >= total) {
        return;
    }

    var vPos = particlesA.particles[index].pos;
    var vVel = particlesA.particles[index].vel;
    var cMass = vec2(0.0);   // accumulated neighbor positions (cohesion)
    var cVel = vec2(0.0);    // accumulated neighbor velocities (alignment)
    var colVel = vec2(0.0);  // accumulated repulsion (separation)
    var cMassCount = 0u;
    var cVelCount = 0u;

    // O(n^2) neighbor scan: every bird inspects every other bird.
    for (var i = 0u; i < total; i++) {
        if (i == index) { continue; }
        let pos = particlesA.particles[i].pos;
        let vel = particlesA.particles[i].vel;
        // Hoisted: the original computed distance(pos, vPos) three times.
        let d = distance(pos, vPos);
        if (d < params.rule1Distance) { cMass += pos; cMassCount++; }
        if (d < params.rule2Distance) { colVel -= pos - vPos; }
        if (d < params.rule3Distance) { cVel += vel; cVelCount++; }
    }
    // Convert accumulators to steering contributions.
    if (cMassCount > 0u) { cMass = (cMass / vec2(f32(cMassCount))) - vPos; }
    if (cVelCount > 0u) { cVel /= f32(cVelCount); }
    vVel += (cMass * params.rule1Scale) + (colVel * params.rule2Scale) + (cVel * params.rule3Scale);
    // Clamp speed to at most 0.1 while preserving the heading.
    vVel = normalize(vVel) * clamp(length(vVel), 0.0, 0.1);
    vPos = vPos + (vVel * params.deltaT);
    // Wrap around the [-1, 1] clip-space box.
    if (vPos.x < -1.0) { vPos.x = 1.0; }
    if (vPos.x > 1.0) { vPos.x = -1.0; }
    if (vPos.y < -1.0) { vPos.y = 1.0; }
    if (vPos.y > 1.0) { vPos.y = -1.0; }
    // Write to the B buffer; JS swaps the A/B bind groups every frame.
    particlesB.particles[index].pos = vPos;
    particlesB.particles[index].vel = vVel;
}

Shader modules are created in JavaScript with device.createShaderModule({code: …}) for both the rendering and compute shaders.

// Compile the two WGSL listings: one module holds the render (vertex +
// fragment) shaders, the other holds the compute (flocking) shader.
const spriteShaderModule = device.createShaderModule( { code: spriteWGSL } );
const spriteUpdateShaderModule = device.createShaderModule( { code: updateSpritesWGSL } );

Two pipelines are then built: a render pipeline that draws each bird as a triangle using the vertex/fragment shaders, and a compute pipeline that runs the flocking logic.

// Render pipeline: draws one rotated triangle per bird instance.
// `entryPoint` is omitted, which is valid only because the module contains
// exactly one @vertex and one @fragment entry point.
const renderPipeline = device.createRenderPipeline({
    layout: 'auto',
    vertex: { module: spriteShaderModule, buffers: [/* vertex & instance buffers */] },
    fragment: { module: spriteShaderModule, targets: [{ format: presentationFormat }] },
    primitive: { topology: 'triangle-list' }
});

// Compute pipeline: runs the flocking update. layout 'auto' derives the
// bind group layout from the shader's @group/@binding declarations.
const computePipeline = device.createComputePipeline({
    layout: 'auto',
    compute: { module: spriteUpdateShaderModule }
});

Data buffers are prepared: a static vertex buffer for the bird triangle, a uniform buffer for simulation parameters, and two storage buffers that hold particle data and are swapped each frame. The buffers are mapped, filled with random initial positions, and then unmapped for GPU use.

// Static geometry: one small triangle (3 x/y vertex pairs), instanced per bird.
// mappedAtCreation lets us fill the buffer before handing it to the GPU.
const vertexBufferData = new Float32Array([ -0.01, -0.02, 0.01, -0.02, 0.0, 0.02 ]);
const spriteVertexBuffer = device.createBuffer({ size: vertexBufferData.byteLength, usage: GPUBufferUsage.VERTEX, mappedAtCreation: true });
new Float32Array(spriteVertexBuffer.getMappedRange()).set(vertexBufferData);
spriteVertexBuffer.unmap();

// Uniform buffer for the flocking parameters: 7 consecutive f32s (28 bytes),
// written in exactly the order the WGSL SimParams struct declares them.
const simParams = { deltaT: 0.04, rule1Distance: 0.1, rule2Distance: 0.025, rule3Distance: 0.025, rule1Scale: 0.02, rule2Scale: 0.05, rule3Scale: 0.005 };
const simParamBufferSize = 7 * Float32Array.BYTES_PER_ELEMENT;
const simParamBuffer = device.createBuffer({ size: simParamBufferSize, usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST });
device.queue.writeBuffer(simParamBuffer, 0, new Float32Array([ simParams.deltaT, simParams.rule1Distance, simParams.rule2Distance, simParams.rule3Distance, simParams.rule1Scale, simParams.rule2Scale, simParams.rule3Scale ]));

const numParticles = 1500;
// 4 floats per particle: pos.x, pos.y, vel.x, vel.y.
const initialParticleData = new Float32Array(numParticles * 4);
for (let i = 0; i < numParticles; ++i) {
    initialParticleData[4*i+0] = 2 * (Math.random() - 0.5);        // x in [-1, 1)
    initialParticleData[4*i+1] = 2 * (Math.random() - 0.5);        // y in [-1, 1)
    initialParticleData[4*i+2] = 2 * (Math.random() - 0.5) * 0.1;  // vx in [-0.1, 0.1)
    initialParticleData[4*i+3] = 2 * (Math.random() - 0.5) * 0.1;  // vy in [-0.1, 0.1)
}
// Two ping-pong particle buffers, both seeded with the same random state.
// Bind group i reads buffer i (binding 1) and writes buffer (i+1)%2
// (binding 2); the compute pass alternates between the two groups.
const particleBuffers = new Array(2);
const particleBindGroups = new Array(2);
for (let i = 0; i < 2; ++i) {
    // VERTEX: consumed as instance data by the render pass; STORAGE:
    // read/written by the compute shader; COPY_SRC: source of the CPU read-back.
    particleBuffers[i] = device.createBuffer({ size: initialParticleData.byteLength, usage: GPUBufferUsage.VERTEX | GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC, mappedAtCreation: true });
    new Float32Array(particleBuffers[i].getMappedRange()).set(initialParticleData);
    particleBuffers[i].unmap();
    particleBindGroups[i] = device.createBindGroup({
        layout: computePipeline.getBindGroupLayout(0),
        entries: [
            { binding: 0, resource: { buffer: simParamBuffer } },
            { binding: 1, resource: { buffer: particleBuffers[i], offset: 0, size: initialParticleData.byteLength } },
            { binding: 2, resource: { buffer: particleBuffers[(i+1)%2], offset: 0, size: initialParticleData.byteLength } }
        ]
    });
}

The main animation loop creates a command encoder, runs the compute pass (dispatching enough workgroups for all particles), then runs the render pass to draw the birds. After rendering, the updated particle buffer is copied to a staging buffer, mapped back to JavaScript, and logged.

// Per-frame loop: compute pass updates the particles, render pass draws
// them, then the updated buffer is copied out for CPU-side inspection.
// NOTE(review): `t` (frame counter), `renderPassDescriptor`, and
// `stagingBuffer` are declared outside this snippet; stagingBuffer must
// have been created with GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST
// for the mapAsync/copy below to be valid — confirm against the full source.
async function frame() {
    // Re-acquire the swap-chain texture every frame; views are not reusable.
    renderPassDescriptor.colorAttachments[0].view = context.getCurrentTexture().createView();
    const commandEncoder = device.createCommandEncoder();
    // Compute pass: bind group t%2 reads buffer A and writes buffer B.
    const computePass = commandEncoder.beginComputePass();
    computePass.setPipeline(computePipeline);
    computePass.setBindGroup(0, particleBindGroups[t % 2]);
    computePass.dispatchWorkgroups(Math.ceil(numParticles / 64));  // 64 = @workgroup_size
    computePass.end();
    // Render pass: draw from the buffer the compute pass just wrote.
    const renderPass = commandEncoder.beginRenderPass(renderPassDescriptor);
    renderPass.setPipeline(renderPipeline);
    renderPass.setVertexBuffer(0, particleBuffers[(t+1)%2]);  // per-instance data
    renderPass.setVertexBuffer(1, spriteVertexBuffer);        // shared triangle
    renderPass.draw(3, numParticles, 0, 0);  // 3 vertices, one instance per bird
    renderPass.end();
    // Copy results back to CPU
    commandEncoder.copyBufferToBuffer(particleBuffers[(t+1)%2], 0, stagingBuffer, 0, initialParticleData.byteLength);
    device.queue.submit([commandEncoder.finish()]);
    // NOTE(review): awaiting the map every frame serializes CPU and GPU —
    // acceptable for this demo's logging, but it caps throughput.
    await stagingBuffer.mapAsync(GPUMapMode.READ, 0, initialParticleData.byteLength);
    const copyArrayBuffer = stagingBuffer.getMappedRange(0, initialParticleData.byteLength);
    // slice() copies the data out; the mapped range becomes invalid on unmap.
    const data = copyArrayBuffer.slice(0);
    stagingBuffer.unmap();
    console.log(new Float32Array(data));
    ++t;
    requestAnimationFrame(frame);
}
frame();

In the conclusion, the author notes that removing the rendering code leaves a pure compute pipeline that can process 30 000 particles at 60 fps on a laptop GPU, achieving roughly 5.4 × 10⁹ operations per second, demonstrating the practicality of GPGPU for big‑data workloads.

Tags: graphics · JavaScript · WebGPU · Flocking Simulation · GPU Compute · WGSL
Rare Earth Juejin Tech Community
Written by

Rare Earth Juejin Tech Community

Juejin, a tech community that helps developers grow.

0 followers
Reader feedback

How this landed with the community

login Sign in to like

Rate this article

Was this worth your time?

Sign in to rate
Discussion

0 Comments

Thoughtful readers leave field notes, pushback, and hard-won operational detail here.