In the realm of high-performance computing, squeezing every ounce of performance from your code is paramount. Significant speedups often come not from radical algorithm changes but from using the underlying hardware more effectively. This article explores vectorization, a powerful technique for CPU optimization, and places it within the broader context of performance enhancement methods, including GPU computing.
Let’s start with a simple example: calculating the dot product of two vectors in Julia. A naive implementation might look like this:
function dot_product(x, y)
    out = zero(promote_type(eltype(x), eltype(y)))
    @inbounds for i in eachindex(x, y)
        out += x[i] * y[i]
    end
    out
end
x, y = randn(256), randn(256);
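A quick sanity check confirms that the hand-written loop agrees with Julia's built-in dot product (up to floating-point rounding) before we look at what the compiler does with it:

julia> dot_product(x, y) ≈ x' * y
true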
On modern CPUs equipped with instruction sets like AVX512, this seemingly straightforward code can be significantly accelerated through vectorization: the CPU performs the same operation on multiple data elements simultaneously, drastically increasing throughput. Let's examine the assembly generated for the dot_product function on a machine with AVX512 capabilities:
julia> @code_native dot_product(x, y)
.text
; Function dot_product {
; Location: REPL[4]:3
; Function eachindex; {
; Location: abstractarray.jl:207
; Function eachindex; {
; Location: abstractarray.jl:217
; Function eachindex; {
; Location: abstractarray.jl:214
; Function axes1; {
; Location: abstractarray.jl:93
; Function axes; {
; Location: abstractarray.jl:75
; Function size; {
; Location: REPL[4]:2
subq $40, %rsp
movq 24(%rdi), %rax
;} ; Function map; {
; Location: tuple.jl:165
; Function Type; {
; Location: range.jl:314
; Function Type; {
; Location: range.jl:305
; Function max; {
; Location: promotion.jl:414
movq %rax, %rcx
sarq $63, %rcx
andnq %rax, %rcx, %r8
;}}}}}}}
; Location: abstractarray.jl:218
; Function _all_match_first; {
; Location: abstractarray.jl:224
; Function #89; {
; Location: abstractarray.jl:218
; Function eachindex; {
; Location: abstractarray.jl:214
; Function axes1; {
; Location: abstractarray.jl:93
; Function axes; {
; Location: abstractarray.jl:75
; Function size; {
; Location: array.jl:155
movq 24(%rsi), %rcx
;} ; Function map; {
; Location: tuple.jl:165
; Function Type; {
; Location: range.jl:314
; Function Type; {
; Location: range.jl:305
; Function max; {
; Location: promotion.jl:414
movq %rcx, %rdx
sarq $63, %rdx
andnq %rcx, %rdx, %rcx
;}}}}}}}}}
; Function _all_match_first; {
; Location: promotion.jl:403
cmpq %rcx, %r8
;}
jne L300
; Location: abstractarray.jl:217
; Function eachindex; {
; Location: abstractarray.jl:214
; Function axes1; {
; Location: abstractarray.jl:93
; Function axes; {
; Location: abstractarray.jl:75
; Function map; {
; Location: tuple.jl:165
; Function Type; {
; Location: range.jl:314
; Function Type; {
; Location: range.jl:305
; Function max; {
; Location: promotion.jl:414
testq %rax, %rax
;}}}}}}}}}
jle L76
movq (%rdi), %rcx
movq (%rsi), %rdx
; Location: REPL[4]:3
cmpq $32, %r8
jae L88
vxorpd %xmm0, %xmm0, %xmm0
movl $1, %esi
jmp L259
L76:
vxorps %xmm0, %xmm0, %xmm0
; Location: REPL[4]:6
addq $40, %rsp
vzeroupper
retq
; Location: REPL[4]:3
L88:
movabsq $9223372036854775776, %rdi # imm = 0x7FFFFFFFFFFFFFE0
andq %r8, %rdi
leaq 1(%rdi), %rsi
vxorpd %xmm0, %xmm0, %xmm0
xorl %eax, %eax
vxorpd %xmm1, %xmm1, %xmm1
vxorpd %xmm2, %xmm2, %xmm2
vxorpd %xmm3, %xmm3, %xmm3
nopl (%rax,%rax)
; Location: REPL[4]:4
; Function getindex; {
; Location: array.jl:739
L128:
vmovupd (%rdx,%rax,8), %zmm4
vmovupd 64(%rdx,%rax,8), %zmm5
vmovupd 128(%rdx,%rax,8), %zmm6
vmovupd 192(%rdx,%rax,8), %zmm7
;} ; Function +; {
; Location: float.jl:395
vfmadd231pd (%rcx,%rax,8), %zmm4, %zmm0
vfmadd231pd 64(%rcx,%rax,8), %zmm5, %zmm1
vfmadd231pd 128(%rcx,%rax,8), %zmm6, %zmm2
vfmadd231pd 192(%rcx,%rax,8), %zmm7, %zmm3
addq $32, %rax
cmpq %rax, %rdi
jne L128
vaddpd %zmm0, %zmm1, %zmm0
vaddpd %zmm0, %zmm2, %zmm0
vaddpd %zmm0, %zmm3, %zmm0
vextractf64x4 $1, %zmm0, %ymm1
vaddpd %zmm1, %zmm0, %zmm0
vextractf128 $1, %ymm0, %xmm1
vaddpd %zmm1, %zmm0, %zmm0
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
vaddpd %zmm1, %zmm0, %zmm0
cmpq %rdi, %r8
;} ; Location: REPL[4]:3
je L292
L259:
addq $-1, %rsi
nopw (%rax,%rax)
; Location: REPL[4]:4
; Function getindex; {
; Location: array.jl:739
L272:
vmovsd (%rdx,%rsi,8), %xmm1 # xmm1 = mem[0],zero
;} ; Function +; {
; Location: float.jl:395
vfmadd231sd (%rcx,%rsi,8), %xmm1, %xmm0
;} ; Function iterate; {
; Location: range.jl:591
; Function ==; {
; Location: promotion.jl:403
addq $1, %rsi
cmpq %rsi, %r8
;}}
jne L272
; Location: REPL[4]:6
L292:
addq $40, %rsp
vzeroupper
retq
; Location: REPL[4]:3
; Function eachindex; {
; Location: abstractarray.jl:207
; Function eachindex; {
; Location: abstractarray.jl:218
L300:
movabsq $jl_system_image_data, %rax
movq %rax, 8(%rsp)
movabsq $jl_system_image_data, %rax
movq %rax, 16(%rsp)
movq %rdi, 24(%rsp)
movq %rsi, 32(%rsp)
movabsq $jl_invoke, %rax
movabsq $140450899053840, %rdi # imm = 0x7FBD45F24D10
leaq 8(%rsp), %rsi
movl $4, %edx
callq *%rax
ud2
nopw %cs:(%rax,%rax)
;}}}
Looking at the assembly, particularly the loop starting at L128, we see instructions like vmovupd and vfmadd231pd operating on zmm registers. These zmm registers are 512 bits wide (ymm registers are 256-bit and xmm are 128-bit), indicating that the compiler is indeed leveraging AVX512 to process 512 bits of data at a time. Each loop iteration loads 4 registers' worth of data (32 doubles in total) from memory and uses fused multiply-add instructions to accumulate the dot product in a highly parallel manner. This approach not only accelerates the computation but, as a side benefit, also tends to be more numerically accurate than naive sequential accumulation, similar in spirit to pairwise summation.
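The effect of those four independent accumulators can be mimicked in plain Julia. The sketch below is purely illustrative (it is not the compiler's output, and the helper name is mine): it unrolls the loop by four so each partial sum forms its own dependency chain, which both keeps several fused multiply-adds in flight and changes the summation order in an accuracy-friendly way.

# Illustrative sketch of the multiple-accumulator idea; assumes length(x) == length(y)
# and that the length is a multiple of 4 (true for the 256-element vectors above).
function dot_product_unrolled4(x, y)
    T = promote_type(eltype(x), eltype(y))
    a1 = a2 = a3 = a4 = zero(T)
    @inbounds for i in 1:4:length(x)
        a1 += x[i]   * y[i]      # four independent partial sums,
        a2 += x[i+1] * y[i+1]    # analogous to zmm0-zmm3 in the assembly above
        a3 += x[i+2] * y[i+2]
        a4 += x[i+3] * y[i+3]
    end
    (a1 + a2) + (a3 + a4)        # final reduction, like the vaddpd/vextract sequence
end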
The performance gains are evident:
julia> using BenchmarkTools
julia> @btime dot_product($x, $y)
12.979 ns (0 allocations: 0 bytes)
4.738430453861962
julia> @btime $x' * $y
25.089 ns (0 allocations: 0 bytes)
4.738430453861962
This vectorized dot_product function is significantly faster than even the optimized built-in dot product (x' * y) in Julia on this AVX512-capable machine. On a similarly clocked Ryzen CPU (without AVX512), the minimum time for the same operation might be around 34 ns, further highlighting the benefits of vectorization with wider registers.
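If you are unsure what your own machine supports, one quick check is the LLVM CPU name Julia is targeting; a name like "skylake-avx512" indicates AVX512 support (this is a coarse check, not a full feature query):

julia> Sys.CPU_NAME   # e.g. "skylake-avx512"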
While GPUs are often the go-to solution for massive parallelism and achieving high performance, CPU vectorization remains a crucial optimization technique. For workloads that are not inherently suited for GPU architectures or when dealing with smaller datasets where GPU overhead might be significant, vectorized CPU code can offer an excellent balance of performance and efficiency. Furthermore, vectorization is often “free” in the sense that with modern compilers, well-written code can automatically be vectorized without requiring explicit GPU programming or specialized libraries.
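A practical way to verify that a given loop auto-vectorized, without reading the full assembly dump, is to skim the LLVM IR for vector types such as <8 x double>, which correspond to the 512-bit registers seen above:

julia> @code_llvm dot_product(x, y)   # look for <8 x double> loads and fused multiply-adds in the vector loop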
Vectorizing More Complex Operations: An Example with SLEEFwrap
For more intricate computations, vectorization can become more challenging. Consider a problem involving a mixture of zero-mean tri-variate T distributions. A Gibbs sampler, often used in Bayesian statistics, requires calculating unnormalized conditional probabilities. A straightforward Julia implementation for this might be:
using Random, BenchmarkTools, SpecialFunctions
function update_individual_probs_v1!(mt::MersenneTwister, probabilities::AbstractMatrix{T}, baseπ, Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
    @inbounds for g ∈ 1:NG
        Li11 = Li[1,g]
        Li21 = Li[2,g]
        Li31 = Li[3,g]
        Li22 = Li[4,g]
        Li32 = Li[5,g]
        Li33 = Li[6,g]
        νfactor = (ν[g] - 2) / ν[g]
        exponent = T(-1.5) - T(0.5) * ν[g]
        base = log(baseπ[g]) + log(Li11) + log(Li22) + log(Li33) + lgamma(-exponent) - lgamma(T(0.5)*ν[g]) - T(1.5)*log(ν[g])
        for i ∈ 1:size(probabilities,1)
            lx₁ = Li11 * x[i,1]
            lx₂ = Li21 * x[i,1] + Li22 * x[i,2]
            lx₃ = Li31 * x[i,1] + Li32 * x[i,2] + Li33 * x[i,3]
            probabilities[i,g] = exp(base + exponent * log(one(T) + νfactor * (lx₁*lx₁ + lx₂*lx₂ + lx₃*lx₃)))
        end
    end
end
function update_individual_probs_v1!(probabilities::AbstractMatrix{T}, baseπ, Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
    update_individual_probs_v1!(Random.GLOBAL_RNG, probabilities, baseπ, Li, ν, x, Val(NG))
end
In this code, Li holds the inverse of the Cholesky factor of each group's covariance matrix, packed as six elements per column. We are essentially calculating (Li * x)'(Li * x) for each of the NG groups. Let's create some dummy data to benchmark:
T = Float32
NG = 6; N = 1024;
X = randn(T, N,3);
probabilities = Matrix{T}(undef, N, NG);
Li = rand(T, 6,NG);
ν = T(N / NG) + 4 .+ rand(T, NG);
baseπ = rand(T, NG);
baseπ ./= sum(baseπ);
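As an aside, the packed Li columns are easy to sanity-check. The snippet below is purely illustrative (the names g, i, L, and lx are mine, not part of the benchmark): it unpacks one column into the lower-triangular matrix it represents and reproduces the quadratic form computed by the lx₁, lx₂, lx₃ lines inside the loop.

# Hypothetical check: unpack column g of Li and reproduce (Li*x)'(Li*x) for one observation.
g, i = 1, 1
L = [Li[1,g] 0       0;
     Li[2,g] Li[4,g] 0;
     Li[3,g] Li[5,g] Li[6,g]]
lx = L * X[i, :]       # matches lx₁, lx₂, lx₃ in the loop body
quadform = lx' * lx    # the term multiplied by νfactor inside the log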
Benchmarking update_individual_probs_v1! yields:
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
---------------
minimum time: 118.813 μs (0.00% GC)
median time: 121.939 μs (0.00% GC)
mean time: 127.839 μs (0.00% GC)
maximum time: 195.095 μs (0.00% GC)
---------------
samples: 10000
evals/sample: 1
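The exact invocation is not shown above; a trial like this can be reproduced with a @benchmark call along the following lines (the argument order is simply taken from the method signature, so treat the specifics as an assumption):

julia> @benchmark update_individual_probs_v1!($probabilities, $baseπ, $Li, $ν, $X, $(Val(NG)))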
Simply adding the @simd macro is often insufficient here to achieve significant vectorization. However, packages like SIMDPirates and SLEEFwrap.jl provide more advanced tools for explicit loop vectorization, even when the loop body contains special functions. SLEEFwrap in particular allows vectorizing such loops by leveraging the SLEEF vectorized math library.
Using the @restrict_simd macro from SLEEFwrap, we can rewrite the function to encourage vectorization:
using SIMDPirates, SLEEFwrap
using SLEEFwrap: @restrict_simd
@generated function update_individual_probs!(mt::MersenneTwister, probabilities::AbstractMatrix{T}, baseπ::AbstractVector{T}, Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
    quote
        @inbounds for g ∈ 1:NG
            Li11 = Li[1,g]
            Li21 = Li[2,g]
            Li31 = Li[3,g]
            Li22 = Li[4,g]
            Li32 = Li[5,g]
            Li33 = Li[6,g]
            νfactor = (ν[g] - 2) / ν[g]
            exponent = T(-1.5) - T(0.5) * ν[g]
            base = log(baseπ[g]) + log(Li11) + log(Li22) + log(Li33) + lgamma(-exponent) - lgamma(T(0.5)*ν[g]) - T(1.5)*log(ν[g])
            @restrict_simd $T for i ∈ 1:size(probabilities,1)
                lx₁ = Li11 * x[i,1]
                lx₂ = Li21 * x[i,1] + Li22 * x[i,2]
                lx₃ = Li31 * x[i,1] + Li32 * x[i,2] + Li33 * x[i,3]
                probabilities[i,g] = exp(base + exponent * log(one(T) + νfactor * (lx₁*lx₁ + lx₂*lx₂ + lx₃*lx₃)))
            end
        end
    end
end
function update_individual_probs!(probabilities::AbstractMatrix{T}, baseπ, Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
    update_individual_probs!(Random.GLOBAL_RNG, probabilities, baseπ, Li, ν, x, Val(NG))
end
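Because SLEEF's vectorized exp and log may not be bit-identical to Base's, it is worth confirming that the two implementations agree to within floating-point tolerance before comparing timings. A minimal sketch, assuming the data defined earlier (the names p1 and p2 are mine):

# Hypothetical consistency check between the scalar and @restrict_simd versions.
p1 = Matrix{T}(undef, N, NG); p2 = Matrix{T}(undef, N, NG)
update_individual_probs_v1!(p1, baseπ, Li, ν, X, Val(NG))
update_individual_probs!(p2, baseπ, Li, ν, X, Val(NG))
isapprox(p1, p2; rtol = 1f-4)   # expect true, up to SLEEF vs. Base rounding differences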
Benchmarking the vectorized update_individual_probs! function demonstrates a dramatic speedup:
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
---------------
minimum time: 7.363 μs (0.00% GC)
median time: 7.867 μs (0.00% GC)
mean time: 7.990 μs (0.00% GC)
maximum time: 16.006 μs (0.00% GC)
---------------
samples: 10000
evals/sample: 4
The vectorized version is roughly 16× faster at the minimum (7.4 μs versus 118.8 μs), showcasing the power of tools like SLEEFwrap for optimizing more complex numerical code on CPUs. While GPUs excel in massively parallel scenarios, achieving gains of this size on the CPU through vectorization can be remarkably efficient and sometimes more practical, especially when the workload is not dominated by highly parallelizable operations or when GPU programming would introduce excessive complexity.
Conclusion
CPU vectorization, especially with advanced instruction sets like AVX512 and libraries like SLEEFwrap, offers a powerful avenue for optimizing Julia code performance. While GPUs provide incredible computational power for suitable tasks, vectorization allows developers to unlock significant performance gains directly on the CPU. Understanding and leveraging these CPU-centric techniques is crucial for writing efficient Julia code, offering a valuable alternative or complement to GPU-based acceleration strategies depending on the specific application and hardware constraints. By carefully considering the nature of your computations and the available tools, you can choose the optimal path – be it CPU vectorization, GPU computing, or a combination of both – to achieve peak performance in your Julia applications.