Often, achieving significant performance gains in Julia requires minimal extra effort. Let’s explore how vectorization can drastically improve the speed of your Julia code on CPUs, effectively showcasing a form of Cgpu Compare – not GPU comparison directly, but rather comparing different levels of CPU code optimization.
Consider a simple dot product function:
function dot_product(x, y)
out = zero(promote_type(eltype(x), eltype(y)))
@inbounds for i in eachindex(x,y)
out += x[i] * y[i]
end
out
end
x, y = randn(256), randn(256);
On a computer equipped with AVX512, examining the native assembly code generated by Julia reveals the power of vectorization:
julia> @code_native dot_product(x, y)
.text
; Function dot_product {
; Location: REPL[4]:3
; Function eachindex; {
; Location: abstractarray.jl:207
; Function eachindex; {
; Location: abstractarray.jl:217
; Function eachindex; {
; Location: abstractarray.jl:214
; Function axes1; {
; Location: abstractarray.jl:93
; Function axes; {
; Location: abstractarray.jl:75
; Function size; {
; Location: REPL[4]:2
subq $40, %rsp
movq 24(%rdi), %rax ;}
; Function map; {
; Location: tuple.jl:165
; Function Type; {
; Location: range.jl:314
; Function Type; {
; Location: range.jl:305
; Function max; {
; Location: promotion.jl:414
movq %rax, %rcx
sarq $63, %rcx
andnq %rax, %rcx, %r8 ;}}}}}}}
; Location: abstractarray.jl:218
; Function _all_match_first; {
; Location: abstractarray.jl:224
; Function #89; {
; Location: abstractarray.jl:218
; Function eachindex; {
; Location: abstractarray.jl:214
; Function axes1; {
; Location: abstractarray.jl:93
; Function axes; {
; Location: abstractarray.jl:75
; Function size; {
; Location: array.jl:155
movq 24(%rsi), %rcx ;}
; Function map; {
; Location: tuple.jl:165
; Function Type; {
; Location: range.jl:314
; Function Type; {
; Location: range.jl:305
; Function max; {
; Location: promotion.jl:414
movq %rcx, %rdx
sarq $63, %rdx
andnq %rcx, %rdx, %rcx ;}}}}}}}}}
; Function _all_match_first; {
; Location: promotion.jl:403
cmpq %rcx, %r8 ;}
jne L300
; Location: abstractarray.jl:217
; Function eachindex; {
; Location: abstractarray.jl:214
; Function axes1; {
; Location: abstractarray.jl:93
; Function axes; {
; Location: abstractarray.jl:75
; Function map; {
; Location: tuple.jl:165
; Function Type; {
; Location: range.jl:314
; Function Type; {
; Location: range.jl:305
; Function max; {
; Location: promotion.jl:414
testq %rax, %rax ;}}}}}}}}}
jle L76
movq (%rdi), %rcx
movq (%rsi), %rdx
; Location: REPL[4]:3
cmpq $32, %r8
jae L88
vxorpd %xmm0, %xmm0, %xmm0
movl $1, %esi
jmp L259
L76:
vxorps %xmm0, %xmm0, %xmm0
; Location: REPL[4]:6
addq $40, %rsp
vzeroupper
retq
; Location: REPL[4]:3
L88:
movabsq $9223372036854775776, %rdi # imm = 0x7FFFFFFFFFFFFFE0
andq %r8, %rdi
leaq 1(%rdi), %rsi
vxorpd %xmm0, %xmm0, %xmm0
xorl %eax, %eax
vxorpd %xmm1, %xmm1, %xmm1
vxorpd %xmm2, %xmm2, %xmm2
vxorpd %xmm3, %xmm3, %xmm3
nopl (%rax,%rax)
; Location: REPL[4]:4
; Function getindex; {
; Location: array.jl:739
L128:
vmovupd (%rdx,%rax,8), %zmm4
vmovupd 64(%rdx,%rax,8), %zmm5
vmovupd 128(%rdx,%rax,8), %zmm6
vmovupd 192(%rdx,%rax,8), %zmm7 ;}
; Function +; {
; Location: float.jl:395
vfmadd231pd (%rcx,%rax,8), %zmm4, %zmm0
vfmadd231pd 64(%rcx,%rax,8), %zmm5, %zmm1
vfmadd231pd 128(%rcx,%rax,8), %zmm6, %zmm2
vfmadd231pd 192(%rcx,%rax,8), %zmm7, %zmm3
addq $32, %rax
cmpq %rax, %rdi
jne L128
vaddpd %zmm0, %zmm1, %zmm0
vaddpd %zmm0, %zmm2, %zmm0
vaddpd %zmm0, %zmm3, %zmm0
vextractf64x4 $1, %zmm0, %ymm1
vaddpd %zmm1, %zmm0, %zmm0
vextractf128 $1, %ymm0, %xmm1
vaddpd %zmm1, %zmm0, %zmm0
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
vaddpd %zmm1, %zmm0, %zmm0
cmpq %rdi, %r8 ;}
; Location: REPL[4]:3
je L292
L259:
addq $-1, %rsi
nopw (%rax,%rax)
; Location: REPL[4]:4
; Function getindex; {
; Location: array.jl:739
L272:
vmovsd (%rdx,%rsi,8), %xmm1 # xmm1 = mem[0],zero ;}
; Function +; {
; Location: float.jl:395
vfmadd231sd (%rcx,%rsi,8), %xmm1, %xmm0 ;}
; Function iterate; {
; Location: range.jl:591
; Function ==; {
; Location: promotion.jl:403
addq $1, %rsi
cmpq %rsi, %r8 ;}}
jne L272
; Location: REPL[4]:6
L292:
addq $40, %rsp
vzeroupper
retq
; Location: REPL[4]:3
; Function eachindex; {
; Location: abstractarray.jl:207
; Function eachindex; {
; Location: abstractarray.jl:218
L300:
movabsq $jl_system_image_data, %rax
movq %rax, 8(%rsp)
movabsq $jl_system_image_data, %rax
movq %rax, 16(%rsp)
movq %rdi, 24(%rsp)
movq %rsi, 32(%rsp)
movabsq $jl_invoke, %rax
movabsq $140450899053840, %rdi # imm = 0x7FBD45F24D10
leaq 8(%rsp), %rsi
movl $4, %edx
callq *%rax
ud2
nopw %cs:(%rax,%rax)
;}}}
Looking at the assembly code, particularly the L128
loop, we see the utilization of zmm
registers. These are 512-bit registers, alongside ymm
(256-bit) and xmm
(128-bit). The core loop efficiently processes data in wide vectors:
L128:
vmovupd (%rdx,%rax,8), %zmm4
vmovupd 64(%rdx,%rax,8), %zmm5
vmovupd 128(%rdx,%rax,8), %zmm6
vmovupd 192(%rdx,%rax,8), %zmm7
;} ; Function +; {
vfmadd231pd (%rcx,%rax,8), %zmm4, %zmm0
vfmadd231pd 64(%rcx,%rax,8), %zmm5, %zmm1
vfmadd231pd 128(%rcx,%rax,8), %zmm6, %zmm2
vfmadd231pd 192(%rcx,%rax,8), %zmm7, %zmm3
addq $32, %rax
cmpq %rax, %rdi
jne L128
This loop loads 4 registers (32 doubles in total) from one array and utilizes fused multiply-add instructions (vfmadd
) to perform element-wise multiplication with corresponding elements from the other array, accumulating the results. This vectorized approach not only accelerates the dot product calculation but also enhances accuracy, resembling pairwise summation.
The performance gain is evident in benchmarks:
julia> using BenchmarkTools
julia> @btime dot_product($x, $y)
12.979 ns (0 allocations: 0 bytes)
4.738430453861962
julia> @btime $x' * $y
25.089 ns (0 allocations: 0 bytes)
4.738430453861962
For comparison, on a similarly clocked Ryzen CPU, the minimum time for the same operation is around 34ns, highlighting the cgpu compare in terms of different CPU architectures and their performance capabilities when running the same Julia code.
Let’s consider a more complex example involving a mixture of zero-mean tri-variate T distributions. A Gibbs sampler for this problem requires calculating unnormalized conditional probabilities of group membership. A straightforward implementation is as follows:
using Random, BenchmarkTools, SpecialFunctions
function update_individual_probs_v1!(mt::MersenneTwister, probabilities::AbstractMatrix{T}, baseπ, Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
@inbounds for g ∈ 1:NG
Li11 = Li[1,g]
Li21 = Li[2,g]
Li31 = Li[3,g]
Li22 = Li[4,g]
Li32 = Li[5,g]
Li33 = Li[6,g]
νfactor = (ν[g] - 2) / ν[g]
exponent = T(-1.5) - T(0.5) * ν[g]
base = log(baseπ[g]) + log(Li11) + log(Li22) + log(Li33) + lgamma(-exponent) - lgamma(T(0.5)*ν[g]) - T(1.5)*log(ν[g])
for i ∈ 1:size(probabilities,1)
lx₁ = Li11 * x[i,1]
lx₂ = Li21 * x[i,1] + Li22 * x[i,2]
lx₃ = Li31 * x[i,1] + Li32 * x[i,2] + Li33 * x[i,3]
probabilities[i,g] = exp(base + exponent * log(one(T) + νfactor * (lx₁*lx₁ + lx₂*lx₂ + lx₃*lx₃)))
end
end
end
function update_individual_probs_v1!(probabilities::AbstractMatrix{T}, baseπ, Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
update_individual_probs_v1!(Random.GLOBAL_RNG, probabilities, baseπ, Li, ν, x, Val(NG))
end
Here, Li
represents the inverse of the Cholesky factor of the covariance matrix. We are essentially calculating (Li * x)'(Li * x) for each of NG
groups. Let’s create some sample data:
T = Float32
NG = 6; N = 1024;
X = randn(T, N,3);
probabilities = Matrix{T}(undef, N, NG);
Li = rand(T, 6,NG);
ν = T(N / NG) + 4 .+ rand(T, NG);
baseπ = rand(T, NG);
baseπ ./= sum(baseπ);
Benchmarking this version yields:
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
---------------
minimum time: 118.813 μs (0.00% GC)
median time: 121.939 μs (0.00% GC)
mean time: 127.839 μs (0.00% GC)
maximum time: 195.095 μs (0.00% GC)
---------------
samples: 10000
evals/sample: 1
Surprisingly, the @simd
macro provides little to no improvement in this scenario.
However, by leveraging the @restrict_simd
macro from SLEEFwrap, which facilitates vectorization of loops including special functions via SLEEF, we can achieve significant speedup.
using SIMDPirates, SLEEFwrap
using SLEEFwrap: @restrict_simd
@generated function update_individual_probs!(mt::MersenneTwister, probabilities::AbstractMatrix{T}, baseπ::AbstractVector{T}, Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
quote
@inbounds for g ∈ 1:NG
Li11 = Li[1,g]
Li21 = Li[2,g]
Li31 = Li[3,g]
Li22 = Li[4,g]
Li32 = Li[5,g]
Li33 = Li[6,g]
νfactor = (ν[g] - 2) / ν[g]
exponent = T(-1.5) - T(0.5) * ν[g]
base = log(baseπ[g]) + log(Li11) + log(Li22) + log(Li33) + lgamma(-exponent) - lgamma(T(0.5)*ν[g]) - T(1.5)*log(ν[g])
@restrict_simd $T for i ∈ 1:size(probabilities,1)
lx₁ = Li11 * x[i,1]
lx₂ = Li21 * x[i,1] + Li22 * x[i,2]
lx₃ = Li31 * x[i,1] + Li32 * x[i,2] + Li33 * x[i,3]
probabilities[i,g] = exp(base + exponent * log(one(T) + νfactor * (lx₁*lx₁ + lx₂*lx₂ + lx₃*lx₃)))
end
end
end
end
function update_individual_probs!(probabilities::AbstractMatrix{T}, baseπ, Li::AbstractVector{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
update_individual_probs!(Random.GLOBAL_RNG, probabilities, baseπ, Li, ν, x, Val(NG))
end
Benchmarking the vectorized version shows a dramatic improvement:
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
---------------
minimum time: 7.363 μs (0.00% GC)
median time: 7.867 μs (0.00% GC)
mean time: 7.990 μs (0.00% GC)
maximum time: 16.006 μs (0.00% GC)
---------------
samples: 10000
evals/sample: 4
This example further emphasizes the significance of vectorization in Julia for CPU performance optimization. By using tools like SLEEFwrap, even complex computations involving special functions can be vectorized, leading to substantial speed improvements and showcasing the potential for efficient cgpu compare scenarios where CPU code is highly optimized. Julia’s ability to generate vectorized code, as demonstrated in both dot product and T-distribution examples, allows developers to leverage the full power of modern CPUs for computationally intensive tasks.