Achieving significant performance gains often doesn't require reaching for different hardware, such as a GPU. Sometimes the most impactful improvements come from optimizing the code you already have to fully exploit your CPU's capabilities, and in many cases this takes surprisingly little extra effort.
Consider a simple yet fundamental operation in numerical computing: the dot product. Here’s a Julia function to compute the dot product of two vectors:
function dot_product(x, y)
    out = zero(promote_type(eltype(x), eltype(y)))
    @inbounds for i in eachindex(x, y)
        out += x[i] * y[i]
    end
    out
end
x, y = randn(256), randn(256);
On a modern computer with the AVX512 instruction set, let's examine the assembly code Julia generates for this seemingly simple function:
julia> @code_native dot_product(x, y)
.text
; Function dot_product {
; Location: REPL[4]:3
; Function eachindex; {
; Location: abstractarray.jl:207
; Function eachindex; {
; Location: abstractarray.jl:217
; Function eachindex; {
; Location: abstractarray.jl:214
; Function axes1; {
; Location: abstractarray.jl:93
; Function axes; {
; Location: abstractarray.jl:75
; Function size; {
; Location: REPL[4]:2
subq $40, %rsp
movq 24(%rdi), %rax
;} ; Function map; {
; Location: tuple.jl:165
; Function Type; {
; Location: range.jl:314
; Function Type; {
; Location: range.jl:305
; Function max; {
; Location: promotion.jl:414
movq %rax, %rcx
sarq $63, %rcx
andnq %rax, %rcx, %r8
;}}}}}}}
; Location: abstractarray.jl:218
; Function _all_match_first; {
; Location: abstractarray.jl:224
; Function #89; {
; Location: abstractarray.jl:218
; Function eachindex; {
; Location: abstractarray.jl:214
; Function axes1; {
; Location: abstractarray.jl:93
; Function axes; {
; Location: abstractarray.jl:75
; Function size; {
; Location: array.jl:155
movq 24(%rsi), %rcx
;} ; Function map; {
; Location: tuple.jl:165
; Function Type; {
; Location: range.jl:314
; Function Type; {
; Location: range.jl:305
; Function max; {
; Location: promotion.jl:414
movq %rcx, %rdx
sarq $63, %rdx
andnq %rcx, %rdx, %rcx
;}}}}}}}}}
; Function _all_match_first; {
; Location: promotion.jl:403
cmpq %rcx, %r8
;}
jne L300
; Location: abstractarray.jl:217
; Function eachindex; {
; Location: abstractarray.jl:214
; Function axes1; {
; Location: abstractarray.jl:93
; Function axes; {
; Location: abstractarray.jl:75
; Function map; {
; Location: tuple.jl:165
; Function Type; {
; Location: range.jl:314
; Function Type; {
; Location: range.jl:305
; Function max; {
; Location: promotion.jl:414
testq %rax, %rax
;}}}}}}}}}
jle L76
movq (%rdi), %rcx
movq (%rsi), %rdx
; Location: REPL[4]:3
cmpq $32, %r8
jae L88
vxorpd %xmm0, %xmm0, %xmm0
movl $1, %esi
jmp L259
L76:
vxorps %xmm0, %xmm0, %xmm0
; Location: REPL[4]:6
addq $40, %rsp
vzeroupper
retq
; Location: REPL[4]:3
L88:
movabsq $9223372036854775776, %rdi # imm = 0x7FFFFFFFFFFFFFE0
andq %r8, %rdi
leaq 1(%rdi), %rsi
vxorpd %xmm0, %xmm0, %xmm0
xorl %eax, %eax
vxorpd %xmm1, %xmm1, %xmm1
vxorpd %xmm2, %xmm2, %xmm2
vxorpd %xmm3, %xmm3, %xmm3
nopl (%rax,%rax)
; Location: REPL[4]:4
; Function getindex; {
; Location: array.jl:739
L128:
vmovupd (%rdx,%rax,8), %zmm4
vmovupd 64(%rdx,%rax,8), %zmm5
vmovupd 128(%rdx,%rax,8), %zmm6
vmovupd 192(%rdx,%rax,8), %zmm7
;} ; Function +; {
; Location: float.jl:395
vfmadd231pd (%rcx,%rax,8), %zmm4, %zmm0
vfmadd231pd 64(%rcx,%rax,8), %zmm5, %zmm1
vfmadd231pd 128(%rcx,%rax,8), %zmm6, %zmm2
vfmadd231pd 192(%rcx,%rax,8), %zmm7, %zmm3
addq $32, %rax
cmpq %rax, %rdi
jne L128
vaddpd %zmm0, %zmm1, %zmm0
vaddpd %zmm0, %zmm2, %zmm0
vaddpd %zmm0, %zmm3, %zmm0
vextractf64x4 $1, %zmm0, %ymm1
vaddpd %zmm1, %zmm0, %zmm0
vextractf128 $1, %ymm0, %xmm1
vaddpd %zmm1, %zmm0, %zmm0
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
vaddpd %zmm1, %zmm0, %zmm0
cmpq %rdi, %r8
;} ; Location: REPL[4]:3
je L292
L259:
addq $-1, %rsi
nopw (%rax,%rax)
; Location: REPL[4]:4
; Function getindex; {
; Location: array.jl:739
L272:
vmovsd (%rdx,%rsi,8), %xmm1 # xmm1 = mem[0],zero
;} ; Function +; {
; Location: float.jl:395
vfmadd231sd (%rcx,%rsi,8), %xmm1, %xmm0
;} ; Function iterate; {
; Location: range.jl:591
; Function ==; {
; Location: promotion.jl:403
addq $1, %rsi
cmpq %rsi, %r8
;}}
jne L272
; Location: REPL[4]:6
L292:
addq $40, %rsp
vzeroupper
retq
; Location: REPL[4]:3
; Function eachindex; {
; Location: abstractarray.jl:207
; Function eachindex; {
; Location: abstractarray.jl:218
L300:
movabsq $jl_system_image_data, %rax
movq %rax, 8(%rsp)
movabsq $jl_system_image_data, %rax
movq %rax, 16(%rsp)
movq %rdi, 24(%rsp)
movq %rsi, 32(%rsp)
movabsq $jl_invoke, %rax
movabsq $140450899053840, %rdi # imm = 0x7FBD45F24D10
leaq 8(%rsp), %rsi
movl $4, %edx
callq *%rax
ud2
nopw %cs:(%rax,%rax)
;}}}
Looking at the assembly code, particularly the loop labeled L128, we can observe the use of zmm registers. These are 512-bit registers, part of the AVX512 instruction set, allowing the processor to operate on 512 bits of data at once. Alongside zmm (512-bit) there are also ymm (256-bit) and xmm (128-bit) registers, offering different widths of Single Instruction, Multiple Data (SIMD) parallelism.
L128:
vmovupd (%rdx,%rax,8), %zmm4
vmovupd 64(%rdx,%rax,8), %zmm5
vmovupd 128(%rdx,%rax,8), %zmm6
vmovupd 192(%rdx,%rax,8), %zmm7
vfmadd231pd (%rcx,%rax,8), %zmm4, %zmm0
vfmadd231pd 64(%rcx,%rax,8), %zmm5, %zmm1
vfmadd231pd 128(%rcx,%rax,8), %zmm6, %zmm2
vfmadd231pd 192(%rcx,%rax,8), %zmm7, %zmm3
addq $32, %rax
cmpq %rax, %rdi
jne L128
This loop loads 4 registers' worth of data (32 double-precision floating-point numbers in total) from one array, then uses fused multiply-add instructions (vfmadd231pd) to multiply those values with the corresponding numbers from the other array and accumulate the products into four separate vector accumulators. Besides dramatically accelerating the dot product, accumulating into many partial sums also improves accuracy compared to naive sequential summation, similar in spirit to pairwise summation.
The performance gains are evident:
julia> using BenchmarkTools
julia> @btime dot_product($x, $y)
12.979 ns (0 allocations: 0 bytes)
4.738430453861962
julia> @btime $x' * $y
25.089 ns (0 allocations: 0 bytes)
4.738430453861962
For comparison, on a similarly clocked Ryzen CPU without AVX512, the minimum execution time is around 34 nanoseconds, highlighting the performance boost from AVX512.
While GPUs often come to mind for heavy computational tasks, optimizing CPU code to leverage SIMD instructions is a crucial first step. Modern CPUs are powerful, and efficient code can extract remarkable performance from them, sometimes closing the gap with, or offering a viable alternative to, GPU computing for certain workloads.
To illustrate a more complex scenario where CPU optimization through vectorization shines, consider a problem involving a mixture of zero-mean tri-variate T distributions. A Gibbs sampler for this problem requires calculating unnormalized conditional probabilities of group membership. Here’s an initial, straightforward Julia implementation:
using Random, BenchmarkTools, SpecialFunctions
function update_individual_probs_v1!(mt::MersenneTwister, probabilities::AbstractMatrix{T}, baseπ, Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
    @inbounds for g ∈ 1:NG
        Li11 = Li[1,g]
        Li21 = Li[2,g]
        Li31 = Li[3,g]
        Li22 = Li[4,g]
        Li32 = Li[5,g]
        Li33 = Li[6,g]
        νfactor = (ν[g] - 2) / ν[g]
        exponent = T(-1.5) - T(0.5) * ν[g]
        base = log(baseπ[g]) + log(Li11) + log(Li22) + log(Li33) + lgamma(-exponent) - lgamma(T(0.5)*ν[g]) - T(1.5)*log(ν[g])
        for i ∈ 1:size(probabilities,1)
            lx₁ = Li11 * x[i,1]
            lx₂ = Li21 * x[i,1] + Li22 * x[i,2]
            lx₃ = Li31 * x[i,1] + Li32 * x[i,2] + Li33 * x[i,3]
            probabilities[i,g] = exp(base + exponent * log(one(T) + νfactor * (lx₁*lx₁ + lx₂*lx₂ + lx₃*lx₃)))
        end
    end
end

function update_individual_probs_v1!(probabilities::AbstractMatrix{T}, baseπ, Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
    update_individual_probs_v1!(Random.GLOBAL_RNG, probabilities, baseπ, Li, ν, x, Val(NG))
end
In this code, Li represents the inverse of the Cholesky factor of the covariance matrix; we are essentially calculating (Li x)'(Li x) for each of the NG groups. Let's create some dummy data for benchmarking:
T = Float32
NG = 6; N = 1024;
X = randn(T, N,3);
probabilities = Matrix{T}(undef, N, NG);
Li = rand(T, 6,NG);
ν = T(N / NG) + 4 .+ rand(T, NG);
baseπ = rand(T, NG);
baseπ ./= sum(baseπ);
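The exact @benchmark invocation isn't shown in the original, but it was presumably something along these lines (treat this call as an assumed reconstruction):
@benchmark update_individual_probs_v1!($probabilities, $baseπ, $Li, $ν, $X, $(Val(NG)))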
Benchmarking the initial version yields:
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
---------------
minimum time: 118.813 μs (0.00% GC)
median time: 121.939 μs (0.00% GC)
mean time: 127.839 μs (0.00% GC)
maximum time: 195.095 μs (0.00% GC)
---------------
samples: 10000
evals/sample: 1
Interestingly, simply adding the @simd macro for loop vectorization doesn't provide a significant improvement in this case. However, by using a more advanced macro from SLEEFwrap, which facilitates vectorization even for loops containing special functions (via SLEEF), we can achieve a substantial speedup.
using SIMDPirates, SLEEFwrap
using SLEEFwrap: @restrict_simd
@generated function update_individual_probs!(mt::MersenneTwister, probabilities::AbstractMatrix{T}, baseπ::AbstractVector{T}, Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
    quote
        @inbounds for g ∈ 1:NG
            Li11 = Li[1,g]
            Li21 = Li[2,g]
            Li31 = Li[3,g]
            Li22 = Li[4,g]
            Li32 = Li[5,g]
            Li33 = Li[6,g]
            νfactor = (ν[g] - 2) / ν[g]
            exponent = T(-1.5) - T(0.5) * ν[g]
            base = log(baseπ[g]) + log(Li11) + log(Li22) + log(Li33) + lgamma(-exponent) - lgamma(T(0.5)*ν[g]) - T(1.5)*log(ν[g])
            @restrict_simd $T for i ∈ 1:size(probabilities,1)
                lx₁ = Li11 * x[i,1]
                lx₂ = Li21 * x[i,1] + Li22 * x[i,2]
                lx₃ = Li31 * x[i,1] + Li32 * x[i,2] + Li33 * x[i,3]
                probabilities[i,g] = exp(base + exponent * log(one(T) + νfactor * (lx₁*lx₁ + lx₂*lx₂ + lx₃*lx₃)))
            end
        end
    end
end
function update_individual_probs!(probabilities::AbstractMatrix{T}, baseπ, Li::AbstractMatrix{T}, ν, x::AbstractMatrix{T}, ::Val{NG}) where {T,NG}
    update_individual_probs!(Random.GLOBAL_RNG, probabilities, baseπ, Li, ν, x, Val(NG))
end
Benchmarking the vectorized version:
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
---------------
minimum time: 7.363 μs (0.00% GC)
median time: 7.867 μs (0.00% GC)
mean time: 7.990 μs (0.00% GC)
maximum time: 16.006 μs (0.00% GC)
---------------
samples: 10000
evals/sample: 4
The vectorized version using @restrict_simd achieves a significant performance improvement, reducing the execution time from around 120 microseconds to under 8 microseconds. This demonstrates that even for complex computations involving special functions, effective CPU vectorization can yield substantial speedups.
Before reaching for a GPU, always consider the untapped potential of your CPU code. Languages like Julia make it increasingly accessible to write high-performance code that leverages modern CPU features, and understanding and exploiting these capabilities can often provide remarkable performance gains, making CPU optimization a vital step even when GPUs are under consideration for demanding computational tasks.