Julia CPU Performance: Vectorization Techniques Compared

Often, achieving significant performance gains in Julia requires minimal extra effort. Let’s explore how vectorization can drastically improve the speed of your Julia code on CPUs — not by comparing CPUs against GPUs, but by comparing different levels of optimization of the same CPU code.

Consider a simple dot product function:

"""
    dot_product(x, y)

Return the sum of elementwise products of `x` and `y`, accumulated in the
type promoted from both element types. Bounds checks are skipped; the shared
index set comes from `eachindex(x, y)`, which throws if the shapes disagree.
"""
function dot_product(x, y)
    acc = zero(promote_type(eltype(x), eltype(y)))
    @inbounds for idx in eachindex(x, y)
        acc += x[idx] * y[idx]
    end
    return acc
end
x, y = randn(256), randn(256);

On a computer equipped with AVX512, examining the native assembly code generated by Julia reveals the power of vectorization:

julia> @code_native dot_product(x, y)
    .text
; Function dot_product {
; Location: REPL[4]:3
; Function eachindex; {
; Location: abstractarray.jl:207
; Function eachindex; {
; Location: abstractarray.jl:217
; Function eachindex; {
; Location: abstractarray.jl:214
; Function axes1; {
; Location: abstractarray.jl:93
; Function axes; {
; Location: abstractarray.jl:75
; Function size; {
; Location: REPL[4]:2
    subq    $40, %rsp
    movq    24(%rdi), %rax      ;}
; Function map; {
; Location: tuple.jl:165
; Function Type; {
; Location: range.jl:314
; Function Type; {
; Location: range.jl:305
; Function max; {
; Location: promotion.jl:414
    movq    %rax, %rcx
    sarq    $63, %rcx
    andnq   %rax, %rcx, %r8     ;}}}}}}}
; Location: abstractarray.jl:218
; Function _all_match_first; {
; Location: abstractarray.jl:224
; Function #89; {
; Location: abstractarray.jl:218
; Function eachindex; {
; Location: abstractarray.jl:214
; Function axes1; {
; Location: abstractarray.jl:93
; Function axes; {
; Location: abstractarray.jl:75
; Function size; {
; Location: array.jl:155
    movq    24(%rsi), %rcx      ;}
; Function map; {
; Location: tuple.jl:165
; Function Type; {
; Location: range.jl:314
; Function Type; {
; Location: range.jl:305
; Function max; {
; Location: promotion.jl:414
    movq    %rcx, %rdx
    sarq    $63, %rdx
    andnq   %rcx, %rdx, %rcx        ;}}}}}}}}}
; Function _all_match_first; {
; Location: promotion.jl:403
    cmpq    %rcx, %r8       ;}
    jne L300
; Location: abstractarray.jl:217
; Function eachindex; {
; Location: abstractarray.jl:214
; Function axes1; {
; Location: abstractarray.jl:93
; Function axes; {
; Location: abstractarray.jl:75
; Function map; {
; Location: tuple.jl:165
; Function Type; {
; Location: range.jl:314
; Function Type; {
; Location: range.jl:305
; Function max; {
; Location: promotion.jl:414
    testq   %rax, %rax      ;}}}}}}}}}
    jle L76
    movq    (%rdi), %rcx
    movq    (%rsi), %rdx
; Location: REPL[4]:3
    cmpq    $32, %r8
    jae L88
    vxorpd  %xmm0, %xmm0, %xmm0
    movl    $1, %esi
    jmp L259
L76:
    vxorps  %xmm0, %xmm0, %xmm0
; Location: REPL[4]:6
    addq    $40, %rsp
    vzeroupper
    retq
; Location: REPL[4]:3
L88:
    movabsq $9223372036854775776, %rdi # imm = 0x7FFFFFFFFFFFFFE0
    andq    %r8, %rdi
    leaq    1(%rdi), %rsi
    vxorpd  %xmm0, %xmm0, %xmm0
    xorl    %eax, %eax
    vxorpd  %xmm1, %xmm1, %xmm1
    vxorpd  %xmm2, %xmm2, %xmm2
    vxorpd  %xmm3, %xmm3, %xmm3
    nopl    (%rax,%rax)
; Location: REPL[4]:4
; Function getindex; {
; Location: array.jl:739
L128:
    vmovupd (%rdx,%rax,8), %zmm4
    vmovupd 64(%rdx,%rax,8), %zmm5
    vmovupd 128(%rdx,%rax,8), %zmm6
    vmovupd 192(%rdx,%rax,8), %zmm7     ;}
; Function +; {
; Location: float.jl:395
    vfmadd231pd (%rcx,%rax,8), %zmm4, %zmm0
    vfmadd231pd 64(%rcx,%rax,8), %zmm5, %zmm1
    vfmadd231pd 128(%rcx,%rax,8), %zmm6, %zmm2
    vfmadd231pd 192(%rcx,%rax,8), %zmm7, %zmm3
    addq    $32, %rax
    cmpq    %rax, %rdi
    jne L128
    vaddpd  %zmm0, %zmm1, %zmm0
    vaddpd  %zmm0, %zmm2, %zmm0
    vaddpd  %zmm0, %zmm3, %zmm0
    vextractf64x4   $1, %zmm0, %ymm1
    vaddpd  %zmm1, %zmm0, %zmm0
    vextractf128    $1, %ymm0, %xmm1
    vaddpd  %zmm1, %zmm0, %zmm0
    vpermilpd   $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
    vaddpd  %zmm1, %zmm0, %zmm0
    cmpq    %rdi, %r8       ;}
; Location: REPL[4]:3
    je  L292
L259:
    addq    $-1, %rsi
    nopw    (%rax,%rax)
; Location: REPL[4]:4
; Function getindex; {
; Location: array.jl:739
L272:
    vmovsd  (%rdx,%rsi,8), %xmm1 # xmm1 = mem[0],zero       ;}
; Function +; {
; Location: float.jl:395
    vfmadd231sd (%rcx,%rsi,8), %xmm1, %xmm0     ;}
; Function iterate; {
; Location: range.jl:591
; Function ==; {
; Location: promotion.jl:403
    addq    $1, %rsi
    cmpq    %rsi, %r8       ;}}
    jne L272
; Location: REPL[4]:6
L292:
    addq    $40, %rsp
    vzeroupper
    retq
; Location: REPL[4]:3
; Function eachindex; {
; Location: abstractarray.jl:207
; Function eachindex; {
; Location: abstractarray.jl:218
L300:
    movabsq $jl_system_image_data, %rax
    movq    %rax, 8(%rsp)
    movabsq $jl_system_image_data, %rax
    movq    %rax, 16(%rsp)
    movq    %rdi, 24(%rsp)
    movq    %rsi, 32(%rsp)
    movabsq $jl_invoke, %rax
    movabsq $140450899053840, %rdi # imm = 0x7FBD45F24D10
    leaq    8(%rsp), %rsi
    movl    $4, %edx
    callq   *%rax
    ud2
    nopw    %cs:(%rax,%rax)
;}}}

Looking at the assembly code, particularly the L128 loop, we see the utilization of zmm registers. These are 512-bit registers, alongside ymm (256-bit) and xmm (128-bit). The core loop efficiently processes data in wide vectors:

L128:
    vmovupd (%rdx,%rax,8), %zmm4
    vmovupd 64(%rdx,%rax,8), %zmm5
    vmovupd 128(%rdx,%rax,8), %zmm6
    vmovupd 192(%rdx,%rax,8), %zmm7
;} ; Function +; {
    vfmadd231pd (%rcx,%rax,8), %zmm4, %zmm0
    vfmadd231pd 64(%rcx,%rax,8), %zmm5, %zmm1
    vfmadd231pd 128(%rcx,%rax,8), %zmm6, %zmm2
    vfmadd231pd 192(%rcx,%rax,8), %zmm7, %zmm3
    addq    $32, %rax
    cmpq    %rax, %rdi
    jne L128

This loop loads 4 registers (32 doubles in total) from one array and utilizes fused multiply-add instructions (vfmadd) to perform element-wise multiplication with corresponding elements from the other array, accumulating the results. This vectorized approach not only accelerates the dot product calculation but also enhances accuracy, resembling pairwise summation.

The performance gain is evident in benchmarks:

julia> using BenchmarkTools

julia> @btime dot_product($x, $y)
  12.979 ns (0 allocations: 0 bytes)
4.738430453861962

julia> @btime $x' * $y
  25.089 ns (0 allocations: 0 bytes)
4.738430453861962

For comparison, on a similarly clocked Ryzen CPU, the minimum time for the same operation is around 34 ns — illustrating how the same Julia code performs differently across CPU architectures with different vector widths (AVX512 versus AVX2).

Let’s consider a more complex example involving a mixture of zero-mean tri-variate T distributions. A Gibbs sampler for this problem requires calculating unnormalized conditional probabilities of group membership. A straightforward implementation is as follows:

using Random, BenchmarkTools, SpecialFunctions

"""
    update_individual_probs_v1!(mt, probabilities, baseπ, Li, ν, x, ::Val{NG})

Fill `probabilities[i, g]` with the unnormalized log-density-derived weight of
observation `i` (row of `x`) under group `g`, for `g ∈ 1:NG`. The prose above
states each group is a zero-mean tri-variate T distribution and `Li` holds the
inverse of the Cholesky factor of its covariance; each column of `Li` packs the
six lower-triangular entries as (1,1), (2,1), (3,1), (2,2), (3,2), (3,3).

NOTE(review): `mt` is accepted for signature parity but never used in this body.
"""
function update_individual_probs_v1!(mt::MersenneTwister, probabilities::AbstractMatrix{T}, baseπ, Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
    @inbounds for g ∈ 1:NG
        # Unpack the packed lower-triangular factor for group g (hoisted out
        # of the inner loop over observations).
        Li11 = Li[1,g]
        Li21 = Li[2,g]
        Li31 = Li[3,g]
        Li22 = Li[4,g]
        Li32 = Li[5,g]
        Li33 = Li[6,g]
        # (ν-2)/ν — presumably the T-distribution scale correction so the
        # covariance matches for ν > 2; TODO confirm against the model spec.
        νfactor = (ν[g] - 2) / ν[g]
        # exponent = -(3 + ν[g]) / 2, the multivariate-T kernel power for 3 dims.
        exponent = T(-1.5) - T(0.5) * ν[g]
        # Log of the group weight times the density's normalizing constant;
        # log(Li11)+log(Li22)+log(Li33) is log|det(Li)| for the triangular factor.
        base = log(baseπ[g]) + log(Li11) + log(Li22) + log(Li33) + lgamma(-exponent) - lgamma(T(0.5)*ν[g]) - T(1.5)*log(ν[g])
        for i ∈ 1:size(probabilities,1)
            # lx = Li * x[i, :] via the packed triangular entries.
            lx₁ = Li11 * x[i,1]
            lx₂ = Li21 * x[i,1] + Li22 * x[i,2]
            lx₃ = Li31 * x[i,1] + Li32 * x[i,2] + Li33 * x[i,3]
            # exp(base + exponent * log1p-style term); ‖lx‖² is the squared
            # Mahalanobis distance of observation i under group g.
            probabilities[i,g] = exp(base + exponent * log(one(T) + νfactor * (lx₁*lx₁ + lx₂*lx₂ + lx₃*lx₃)))
        end
    end
end

"""
    update_individual_probs_v1!(probabilities, baseπ, Li, ν, x, ::Val{NG})

Convenience method: forward to the `MersenneTwister`-taking variant using the
global RNG.
"""
update_individual_probs_v1!(probabilities::AbstractMatrix{T}, baseπ, Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG} =
    update_individual_probs_v1!(Random.GLOBAL_RNG, probabilities, baseπ, Li, ν, x, Val(NG))

Here, Li represents the inverse of the Cholesky factor of the covariance matrix. We are essentially calculating (Li * x)'(Li * x) for each of NG groups. Let’s create some sample data:

# Sample data for the mixture benchmark; all arrays use Float32.
T = Float32
# NG groups, N observations.
NG = 6; N = 1024;
# N tri-variate observations, one row per observation.
X = randn(T, N,3);
# Output buffer: one unnormalized probability per (observation, group) pair.
probabilities = Matrix{T}(undef, N, NG);
# Six packed lower-triangular entries per group column; random values here
# stand in for an actual inverse Cholesky factor for benchmarking purposes.
Li = rand(T, 6,NG);
# Degrees of freedom per group: scalar T(N/NG) + 4 broadcast-added to noise.
ν = T(N / NG) + 4 .+ rand(T, NG);
# Mixture weights, normalized to sum to one.
baseπ = rand(T, NG);
baseπ ./= sum(baseπ);

Benchmarking this version yields:

BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
---------------
minimum time:      118.813 μs (0.00% GC)
median time:       121.939 μs (0.00% GC)
mean time:         127.839 μs (0.00% GC)
maximum time:      195.095 μs (0.00% GC)
---------------
samples:         10000
evals/sample:        1

Surprisingly, the @simd macro provides little to no improvement in this scenario.

However, by leveraging the @restrict_simd macro from SLEEFwrap, which facilitates vectorization of loops including special functions via SLEEF, we can achieve significant speedup.

using SIMDPirates, SLEEFwrap
using SLEEFwrap: @restrict_simd

"""
    update_individual_probs!(mt, probabilities, baseπ, Li, ν, x, ::Val{NG})

Vectorized counterpart of `update_individual_probs_v1!`: same computation, but
the inner loop over observations is wrapped in SLEEFwrap's `@restrict_simd` so
that the `log`/`exp` calls vectorize via SLEEF. Implemented as a `@generated`
function so the element type `T` can be interpolated into the macro call.

NOTE(review): `mt` is accepted but unused, mirroring the v1 method.
"""
@generated function update_individual_probs!(mt::MersenneTwister, probabilities::AbstractMatrix{T}, baseπ::AbstractVector{T}, Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
    # The returned expression is the method body that gets compiled for each
    # combination of T and NG.
    quote
        @inbounds for g ∈ 1:NG
            # Packed lower-triangular entries of column g:
            # (1,1), (2,1), (3,1), (2,2), (3,2), (3,3).
            Li11 = Li[1,g]
            Li21 = Li[2,g]
            Li31 = Li[3,g]
            Li22 = Li[4,g]
            Li32 = Li[5,g]
            Li33 = Li[6,g]
            νfactor = (ν[g] - 2) / ν[g]
            # -(3 + ν[g]) / 2, the multivariate-T kernel power for 3 dims.
            exponent = T(-1.5) - T(0.5) * ν[g]
            # Log of group weight times normalizing constant (see v1 method).
            base = log(baseπ[g]) + log(Li11) + log(Li22) + log(Li33) + lgamma(-exponent) - lgamma(T(0.5)*ν[g]) - T(1.5)*log(ν[g])
            # $T splices the concrete element type into the macro invocation.
            @restrict_simd $T for i ∈ 1:size(probabilities,1)
                lx₁ = Li11 * x[i,1]
                lx₂ = Li21 * x[i,1] + Li22 * x[i,2]
                lx₃ = Li31 * x[i,1] + Li32 * x[i,2] + Li33 * x[i,3]
                probabilities[i,g] = exp(base + exponent * log(one(T) + νfactor * (lx₁*lx₁ + lx₂*lx₂ + lx₃*lx₃)))
            end
        end
    end
end

"""
    update_individual_probs!(probabilities, baseπ, Li, ν, x, ::Val{NG})

Convenience method: forward to the `MersenneTwister`-taking variant using the
global RNG.
"""
function update_individual_probs!(probabilities::AbstractMatrix{T}, baseπ, Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
    # Fix: `Li` was annotated `AbstractVector{T}`, but the method this forwards
    # to (and the v1 wrapper) take `AbstractMatrix{T}`, and `Li` is built as a
    # 6×NG Matrix — the old annotation made this method uncallable with the
    # article's own data (MethodError).
    return update_individual_probs!(Random.GLOBAL_RNG, probabilities, baseπ, Li, ν, x, Val(NG))
end

Benchmarking the vectorized version shows a dramatic improvement:

BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
---------------
minimum time:      7.363 μs (0.00% GC)
median time:       7.867 μs (0.00% GC)
mean time:         7.990 μs (0.00% GC)
maximum time:      16.006 μs (0.00% GC)
---------------
samples:         10000
evals/sample:        4

This example further emphasizes the significance of vectorization for CPU performance in Julia. With tools like SLEEFwrap, even complex computations involving special functions can be vectorized, yielding substantial speedups. Julia’s ability to generate vectorized code, as demonstrated in both the dot-product and T-distribution examples, lets developers leverage the full power of modern CPUs for computationally intensive tasks.

Comments

No comments yet. Why don’t you start the discussion?

Leave a Reply

Your email address will not be published. Required fields are marked *