Achieving significant performance gains often doesn't require reaching for different hardware, such as a GPU. Sometimes the most impactful improvements come from optimizing the code you already have to fully exploit your CPU's capabilities, and in many cases this takes surprisingly little extra effort.
Consider a simple yet fundamental operation in numerical computing: the dot product. Here’s a Julia function to compute the dot product of two vectors:
function dot_product(x, y)
    out = zero(promote_type(eltype(x), eltype(y)))
    @inbounds for i in eachindex(x, y)
        out += x[i] * y[i]
    end
    out
end
x, y = randn(256), randn(256);
On a modern computer with the AVX512 instruction set, let's examine the assembly code Julia generates for this seemingly simple function:
julia> @code_native dot_product(x, y)
.text
; Function dot_product {
; Location: REPL[4]:3
; Function eachindex; {
; Location: abstractarray.jl:207
; Function eachindex; {
; Location: abstractarray.jl:217
; Function eachindex; {
; Location: abstractarray.jl:214
; Function axes1; {
; Location: abstractarray.jl:93
; Function axes; {
; Location: abstractarray.jl:75
; Function size; {
; Location: REPL[4]:2
subq $40, %rsp
movq 24(%rdi), %rax
;} ; Function map; {
; Location: tuple.jl:165
; Function Type; {
; Location: range.jl:314
; Function Type; {
; Location: range.jl:305
; Function max; {
; Location: promotion.jl:414
movq %rax, %rcx
sarq $63, %rcx
andnq %rax, %rcx, %r8
;}}}}}}}
; Location: abstractarray.jl:218
; Function _all_match_first; {
; Location: abstractarray.jl:224
; Function #89; {
; Location: abstractarray.jl:218
; Function eachindex; {
; Location: abstractarray.jl:214
; Function axes1; {
; Location: abstractarray.jl:93
; Function axes; {
; Location: abstractarray.jl:75
; Function size; {
; Location: array.jl:155
movq 24(%rsi), %rcx
;} ; Function map; {
; Location: tuple.jl:165
; Function Type; {
; Location: range.jl:314
; Function Type; {
; Location: range.jl:305
; Function max; {
; Location: promotion.jl:414
movq %rcx, %rdx
sarq $63, %rdx
andnq %rcx, %rdx, %rcx
;}}}}}}}}}
; Function _all_match_first; {
; Location: promotion.jl:403
cmpq %rcx, %r8
;}
jne L300
; Location: abstractarray.jl:217
; Function eachindex; {
; Location: abstractarray.jl:214
; Function axes1; {
; Location: abstractarray.jl:93
; Function axes; {
; Location: abstractarray.jl:75
; Function map; {
; Location: tuple.jl:165
; Function Type; {
; Location: range.jl:314
; Function Type; {
; Location: range.jl:305
; Function max; {
; Location: promotion.jl:414
testq %rax, %rax
;}}}}}}}}}
jle L76
movq (%rdi), %rcx
movq (%rsi), %rdx
; Location: REPL[4]:3
cmpq $32, %r8
jae L88
vxorpd %xmm0, %xmm0, %xmm0
movl $1, %esi
jmp L259
L76:
vxorps %xmm0, %xmm0, %xmm0
; Location: REPL[4]:6
addq $40, %rsp
vzeroupper
retq
; Location: REPL[4]:3
L88:
movabsq $9223372036854775776, %rdi # imm = 0x7FFFFFFFFFFFFFE0
andq %r8, %rdi
leaq 1(%rdi), %rsi
vxorpd %xmm0, %xmm0, %xmm0
xorl %eax, %eax
vxorpd %xmm1, %xmm1, %xmm1
vxorpd %xmm2, %xmm2, %xmm2
vxorpd %xmm3, %xmm3, %xmm3
nopl (%rax,%rax)
; Location: REPL[4]:4
; Function getindex; {
; Location: array.jl:739
L128:
vmovupd (%rdx,%rax,8), %zmm4
vmovupd 64(%rdx,%rax,8), %zmm5
vmovupd 128(%rdx,%rax,8), %zmm6
vmovupd 192(%rdx,%rax,8), %zmm7
;} ; Function +; {
; Location: float.jl:395
vfmadd231pd (%rcx,%rax,8), %zmm4, %zmm0
vfmadd231pd 64(%rcx,%rax,8), %zmm5, %zmm1
vfmadd231pd 128(%rcx,%rax,8), %zmm6, %zmm2
vfmadd231pd 192(%rcx,%rax,8), %zmm7, %zmm3
addq $32, %rax
cmpq %rax, %rdi
jne L128
vaddpd %zmm0, %zmm1, %zmm0
vaddpd %zmm0, %zmm2, %zmm0
vaddpd %zmm0, %zmm3, %zmm0
vextractf64x4 $1, %zmm0, %ymm1
vaddpd %zmm1, %zmm0, %zmm0
vextractf128 $1, %ymm0, %xmm1
vaddpd %zmm1, %zmm0, %zmm0
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
vaddpd %zmm1, %zmm0, %zmm0
cmpq %rdi, %r8
;} ; Location: REPL[4]:3
je L292
L259:
addq $-1, %rsi
nopw (%rax,%rax)
; Location: REPL[4]:4
; Function getindex; {
; Location: array.jl:739
L272:
vmovsd (%rdx,%rsi,8), %xmm1 # xmm1 = mem[0],zero
;} ; Function +; {
; Location: float.jl:395
vfmadd231sd (%rcx,%rsi,8), %xmm1, %xmm0
;} ; Function iterate; {
; Location: range.jl:591
; Function ==; {
; Location: promotion.jl:403
addq $1, %rsi
cmpq %rsi, %r8
;}}
jne L272
; Location: REPL[4]:6
L292:
addq $40, %rsp
vzeroupper
retq
; Location: REPL[4]:3
; Function eachindex; {
; Location: abstractarray.jl:207
; Function eachindex; {
; Location: abstractarray.jl:218
L300:
movabsq $jl_system_image_data, %rax
movq %rax, 8(%rsp)
movabsq $jl_system_image_data, %rax
movq %rax, 16(%rsp)
movq %rdi, 24(%rsp)
movq %rsi, 32(%rsp)
movabsq $jl_invoke, %rax
movabsq $140450899053840, %rdi # imm = 0x7FBD45F24D10
leaq 8(%rsp), %rsi
movl $4, %edx
callq *%rax
ud2
nopw %cs:(%rax,%rax)
;}}}
Looking at the assembly code, particularly the loop labeled L128, we can observe the use of zmm registers. These are 512-bit registers, part of the AVX512 instruction set, allowing the processor to operate on 512 bits of data at once. Alongside zmm (512-bit) there are also ymm (256-bit) and xmm (128-bit) registers, offering different widths of Single Instruction, Multiple Data (SIMD) parallelism.
L128:
vmovupd (%rdx,%rax,8), %zmm4
vmovupd 64(%rdx,%rax,8), %zmm5
vmovupd 128(%rdx,%rax,8), %zmm6
vmovupd 192(%rdx,%rax,8), %zmm7
vfmadd231pd (%rcx,%rax,8), %zmm4, %zmm0
vfmadd231pd 64(%rcx,%rax,8), %zmm5, %zmm1
vfmadd231pd 128(%rcx,%rax,8), %zmm6, %zmm2
vfmadd231pd 192(%rcx,%rax,8), %zmm7, %zmm3
addq $32, %rax
cmpq %rax, %rdi
jne L128
This loop loads 4 registers' worth of data (32 double-precision floating-point numbers in total) from one array, then uses fused multiply-add instructions (vfmadd231pd) to multiply those values with the corresponding numbers from the other array and accumulate the products into four separate vector accumulators. Besides dramatically accelerating the dot product, accumulating into many partial sums also improves accuracy compared to naive sequential summation, similar in spirit to pairwise summation.
The performance gains are evident:
julia> using BenchmarkTools
julia> @btime dot_product($x, $y)
12.979 ns (0 allocations: 0 bytes)
4.738430453861962
julia> @btime $x' * $y
25.089 ns (0 allocations: 0 bytes)
4.738430453861962
For comparison, on a similarly clocked Ryzen CPU without AVX512, the minimum execution time is around 34 nanoseconds, highlighting the performance boost from AVX512.
While GPUs often come to mind for heavy computational tasks, optimizing CPU code to leverage SIMD instructions is a crucial first step. Modern CPUs are powerful, and efficient code can extract remarkable performance from them, sometimes closing the gap with, or offering a viable alternative to, GPU computing for certain workloads.
To illustrate a more complex scenario where CPU optimization through vectorization shines, consider a problem involving a mixture of zero-mean tri-variate T distributions. A Gibbs sampler for this problem requires calculating unnormalized conditional probabilities of group membership. Here’s an initial, straightforward Julia implementation:
using Random, BenchmarkTools, SpecialFunctions
function update_individual_probs_v1!(mt::MersenneTwister, probabilities::AbstractMatrix{T}, baseπ, Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
    @inbounds for g ∈ 1:NG
        Li11 = Li[1,g]
        Li21 = Li[2,g]
        Li31 = Li[3,g]
        Li22 = Li[4,g]
        Li32 = Li[5,g]
        Li33 = Li[6,g]
        νfactor = (ν[g] - 2) / ν[g]
        exponent = T(-1.5) - T(0.5) * ν[g]
        base = log(baseπ[g]) + log(Li11) + log(Li22) + log(Li33) + lgamma(-exponent) - lgamma(T(0.5)*ν[g]) - T(1.5)*log(ν[g])
        for i ∈ 1:size(probabilities,1)
            lx₁ = Li11 * x[i,1]
            lx₂ = Li21 * x[i,1] + Li22 * x[i,2]
            lx₃ = Li31 * x[i,1] + Li32 * x[i,2] + Li33 * x[i,3]
            probabilities[i,g] = exp(base + exponent * log(one(T) + νfactor * (lx₁*lx₁ + lx₂*lx₂ + lx₃*lx₃)))
        end
    end
end

function update_individual_probs_v1!(probabilities::AbstractMatrix{T}, baseπ, Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
    update_individual_probs_v1!(Random.GLOBAL_RNG, probabilities, baseπ, Li, ν, x, Val(NG))
end
In this code, Li represents the inverse of the Cholesky factor of the covariance matrix; we are essentially calculating (Li x)'(Li x) for each of the NG groups. Let's create some dummy data for benchmarking:
T = Float32
NG = 6; N = 1024;
X = randn(T, N,3);
probabilities = Matrix{T}(undef, N, NG);
Li = rand(T, 6,NG);
ν = T(N / NG) + 4 .+ rand(T, NG);
baseπ = rand(T, NG);
baseπ ./= sum(baseπ);
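The exact @benchmark invocation isn't shown in the original, but it was presumably something along these lines (treat this call as an assumed reconstruction):
@benchmark update_individual_probs_v1!($probabilities, $baseπ, $Li, $ν, $X, $(Val(NG)))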
Benchmarking the initial version yields:
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
---------------
minimum time: 118.813 μs (0.00% GC)
median time: 121.939 μs (0.00% GC)
mean time: 127.839 μs (0.00% GC)
maximum time: 195.095 μs (0.00% GC)
---------------
samples: 10000
evals/sample: 1
Interestingly, simply adding the @simd macro for loop vectorization doesn't provide a significant improvement in this case. However, by using a more advanced macro from SLEEFwrap, which facilitates vectorization even for loops containing special functions (via SLEEF), we can achieve a substantial speedup.
using SIMDPirates, SLEEFwrap
using SLEEFwrap: @restrict_simd
@generated function update_individual_probs!(mt::MersenneTwister, probabilities::AbstractMatrix{T}, baseπ::AbstractVector{T}, Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
    quote
        @inbounds for g ∈ 1:NG
            Li11 = Li[1,g]
            Li21 = Li[2,g]
            Li31 = Li[3,g]
            Li22 = Li[4,g]
            Li32 = Li[5,g]
            Li33 = Li[6,g]
            νfactor = (ν[g] - 2) / ν[g]
            exponent = T(-1.5) - T(0.5) * ν[g]
            base = log(baseπ[g]) + log(Li11) + log(Li22) + log(Li33) + lgamma(-exponent) - lgamma(T(0.5)*ν[g]) - T(1.5)*log(ν[g])
            @restrict_simd $T for i ∈ 1:size(probabilities,1)
                lx₁ = Li11 * x[i,1]
                lx₂ = Li21 * x[i,1] + Li22 * x[i,2]
                lx₃ = Li31 * x[i,1] + Li32 * x[i,2] + Li33 * x[i,3]
                probabilities[i,g] = exp(base + exponent * log(one(T) + νfactor * (lx₁*lx₁ + lx₂*lx₂ + lx₃*lx₃)))
            end
        end
    end
end
function update_individual_probs!(probabilities::AbstractMatrix{T}, baseπ, Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
    update_individual_probs!(Random.GLOBAL_RNG, probabilities, baseπ, Li, ν, x, Val(NG))
end
Benchmarking the vectorized version:
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
---------------
minimum time: 7.363 μs (0.00% GC)
median time: 7.867 μs (0.00% GC)
mean time: 7.990 μs (0.00% GC)
maximum time: 16.006 μs (0.00% GC)
---------------
samples: 10000
evals/sample: 4
The vectorized version using @restrict_simd achieves a significant performance improvement, reducing the execution time from around 120 microseconds to under 8 microseconds. This demonstrates that even for complex computations involving special functions, effective CPU vectorization can yield substantial speedups.
Before reaching for a GPU, always consider the untapped potential of your CPU code. Languages like Julia make it increasingly accessible to write high-performance code that leverages modern CPU features, and understanding and exploiting these capabilities can often provide remarkable performance gains, making CPU optimization a vital step even when GPUs are under consideration for demanding computational tasks.