SIMD
Targeting SIMD Vector Instructions
Simple Example: Intel AVX 4-way Double Precision
opts := SIMDGlobals.getOpts(AVX_4x64f); # default: real vectorization
t := TRC(DFT(16)).withTags(opts.tags);
rt := RandomRuleTree(t, opts);
c := CodeRuleTree(rt, opts);
PrintCode("AVX_DFT16", c, opts);
Stepwise Code Generation
opts.tags; # what are the tags
opts.tags[1].v; # vector length
opts.tags[1].isa; # targeted ISA
opts.vector; # check out the options used
spl := SPLRuleTree(rt); # There are SIMD SPL objects
s := SumsRuleTree(rt, opts); # and a SMP ISum
InfinityNormMat(MatSPL(s) - MatSPL(t)); # correctness check
Complex Vectorization Example
optsc := SIMDGlobals.getOpts(AVX_4x64f, # __m256d = (re,im,re,im)
rec(realVect := false, cplxVect := true));
rtc := RandomRuleTree(t, optsc); # complex vectorized DFT(16)
sc := SumsRuleTree(rtc, optsc);
cc := CodeRuleTree(rtc, optsc); # far fewer shuffle operations
PrintCode("AVXcplx_DFT16", cc, optsc);
Vectorization Rules
Simple Vectorization Rule Example
# spiral-core\namespaces\spiral\paradigms\vector\breakdown.gi
NewRulesFor(TTensorI, rec(
AxI_vec := rec(
forTransposition := false,
applicable := nt -> nt.hasTags() and IsVecVec(nt.params) and
(nt.isTag(1,AVecReg) or nt.isTag(1,AVecRegCx)) and
IsInt(nt.params[2]/nt.firstTag().v),
children := nt -> let(r := nt.params[2] / nt.firstTag().v,
isa := nt.firstTag().isa,
[[ When(r = 1,
When(nt.numTags() = 1,
nt.params[1].setWrap(VWrap(isa)),
nt.params[1].setWrap(
Drop(nt.getTags(), 1)).setWrap(VWrap(isa))
),
TTensorI(nt.params[1].setWrap(VWrap(isa)), r,
AVec, AVec).withTags(Drop(nt.getTags(), 1))
)]]
),
apply := (nt, c, cnt) -> VTensor(c[1], nt.firstTag().v)
)
));
Formula Rewrite
Kronecker Commute
# spiral-core\namespaces\spiral\paradigms\vector\breakdown.gi
NewRulesFor(TTensorI, rec(
IxA_vec := rec(forTransposition := false,
applicable := nt -> IsParPar(nt.params) and nt.hasTags() and
(nt.isTag(1,AVecReg) or nt.isTag(1,AVecRegCx)) and
IsInt(nt.params[2]/nt.firstTag().v),
children := nt -> let(pv := nt.getTags(), v := pv[1].v,
isa := pv[1].isa, d := nt.params[1].dims(),
[[
TL(d[1]*v, v, 1, 1).withTags(pv).setWrap(VWrapId),
When(Length(pv)=1, nt.params[1].setWrap(VWrap(isa)),
nt.params[1].setpv(Drop(pv, 1)).setWrap(VWrap(isa))),
TL(d[2]*v, d[2], 1, 1).withTags(pv).setWrap(VWrapId)
]]),
apply := (nt, c, cnt) -> let(
l := nt.params[2] / nt.firstTag().v,
A := c[1] * VTensor(c[2], nt.firstTag().v) * c[3],
NoDiagPullin(When(l=1, A , Tensor(I(l), A))))
)
));
Wrapping
VWrap Transformation
# spiral-core\namespaces\spiral\paradigms\vector\vwrap.gi
Class(VWrap, VWrapBase, rec(
__call__ := (self,isa) >> Checked(IsSIMD_ISA(isa),
WithBases(self, rec(operations:=PrintOps, isa:=isa))),
wrap := (self,r,t,opts) >> let(
isa := self.isa, v := isa.v,
tt := When(t.isReal(),
@_Base(paradigms.vector.sigmaspl.VTensor(r.node, v), r),
paradigms.vector.breakdown.AxI_vec(
TTensorI(TRC(t).withTags([AVecReg(isa)]), v,
AVec, AVec).withTags([AVecReg(isa)]),
paradigms.vector.breakdown.TRC_VRCLR(
TRC(t).withTags([AVecReg(isa)]), r))),
print := self >> Print(self.name, "(", self.isa, ")"),
));
VWrap in Context of Search
Used in DP to time sub-trees.
opts := SIMDGlobals.getOpts(AVX_4x64f);
t := DFT(2).setWrap(VWrap(AVX_4x64f));
rt := RandomRuleTree(t, opts);
wrt := t.wrap.wrap(rt, t, opts);
SPLRuleTree(wrt);
Special Role of Stride Permutations
TL: Lift Stride Permutation to Non-Terminal Level
# spiral-core\namespaces\spiral\paradigms\common\nonterms.gi
Class(TL, Tagged_tSPL_Container, rec(
abbrevs := [ (size, stride) -> Checked(ForAll([size, stride],
IsPosIntSym), [size, stride, 1, 1]),
(size, stride, left, right) ->
Checked(ForAll([size, stride, left, right], IsPosIntSym),
[size, stride, left, right]) ],
__call__ := arg >> let(self := arg[1], args := Drop(arg, 1),
Cond(args=[1,1,1,1], I(1), ApplyFunc(Inherited,args))),
dims := self >>
Replicate(2, self.params[1]*self.params[3]*self.params[4]),
terminate := self >> Tensor(I(self.params[3]),
L(self.params[1], self.params[2]), I(self.params[4])),
transpose := self >> TL(self.params[1],
self.params[1]/self.params[2], self.params[3],
self.params[4]).withTags(self.getTags()),
));
TL in Context of Search
Used in DP to time sub-trees.
t := TL(8,2,2,4);
t.terminate();
Architecture Specific Permutations
Looking up vectorized Implementations for TL
Import(paradigms.vector.sigmaspl);
opts := SIMDGlobals.getOpts(AVX_4x64f);
opts.breakdownRules.TL;
t := TL(16,4,1,1).withTags(opts.tags); # a in-register perm
rt := RandomRuleTree(t, opts);
HashLookup(opts.baseHashes[1], t); # the implementation is cached
s := SPLRuleTree(rt);
vp := Collect(s, VPerm)[1]; # SPL object carries its code
vp.code; # code generator refers to ISA
AVX_4x64f.rules; # ISA carries implementations
PrintCode("", vp.code(Y, X), opts); # for in-register perms
SIMD ISA Database and Bootstrapping a Vector Architecture
SIMD_ISA_DB; # central SIMD data base
SIMD_ISA_DB.installed(); # all the ISAs supported
Doc(AVX_4x64f); # The ISA carries all the relevant info
Print(SIMD_ISA_DB.buildBases); # How the base cases are built
AVX_4x64f.buildRules; # bootstrapping function
SIMD_ISA_DB.getBases(AVX_4x64f); # all the base cases needed
SIMD_ISA_DB.lookupBases(AVX_4x64f); # and how they are implemented
ISA Database and Hashed Breakdowns
Generic Breakdown Rules to Look Up Architecture Specific Code
# spiral-core\namespaces\spiral\paradigms\vector\bases\tl_bases.gi
NewRulesFor(TL, rec(
SIMD_ISA_Bases1 := rec(
forTransposition := false,
applicable := (self, t) >> t.isTag(1, AVecReg) and
let(isa := t.firstTag().isa, P:=t.params,
isa.active and ForAny(isa.supportedTL(),
e -> _TL_applicable(e, P[1], P[2], P[3], P[4]))),
apply := function(nt,C,cnt)
local isa, tl, ll, vprm, P;
P:=nt.params;
isa := nt.firstTag().isa;
tl := isa.getTL(P);
ll := P[3] / tl.perm.l;
vprm := tl.vperm;
return When(ll = 1, vprm,
BlockVPerm(ll, isa.v, vprm, tl.perm.spl));
end,
)
));
Stride Permutatiopn Identities as Rules
Generic Breakdown Rules to Look Up Architecture Specific Code
# spiral-core\namespaces\spiral\paradigms\common\breakdown.gi
NewRulesFor(TL, rec(
IxLxI_kmn_n := rec (forTransposition := false,
applicable := nt ->
Length(DivisorsIntDrop(nt.params[1]/nt.params[2])) > 0,
children := nt -> let(
N := nt.params[1], n := nt.params[2],
km := N/n, ml := DivisorsIntDrop(km),
l := nt.params[3], r := nt.params[4],
List(ml, m -> let( k := km/m, [
TL(k*n, n, l, r*m).withTags(nt.getTags()),
TL(m*n, n, k*l, r).withTags(nt.getTags())
]))
),
apply := (nt, c, cnt) -> let(
spl := c[1] * c[2],
When(nt.params[1] = nt.params[2]^2,
SymSPL(spl),
spl
)
)
));
SPL And Σ-SPL Vector Objects
Generate The Example
Import(paradigms.vector.sigmaspl);
opts := SIMDGlobals.getOpts(AVX_4x64f);
rt := RandomRuleTree(TRC(DFT(16)).withTags(opts.tags), opts);
s := SPLRuleTree(rt); # SPL Objects
ss := SumsRuleTree(rt, opts); # Sigma-SPL Objects
Inspecting the Vector Objects
Collect(s, VTensor)[1]; # Vectorized Tensor(., I(v))
Collect(ss, VTensor)[1]; # Vectorized Tensor(., I(v))
Collect(s, VPerm)[1]; # Vectorized Prm(.)
Collect(s, BlockVPerm)[1]; # Vectorized Tensor(I(.), Prm(.))
Collect(s, VContainer)[1]; # Provides context for rewriting
Collect(s, VRC)[1]; # Carries interleaved complex format
Collect(ss, VGath)[1]; # Tensor(Gath(.), I(v))
Collect(ss, VScat)[1]; # Tensor(Scat(.), I(v))
Collect(ss, VRCDiag)[1]; # Vectorized Diag(.)
Vector SPL Objects
Vector Tensor SPL Object
# spiral-core\namespaces\spiral\paradigms\vector\sigmaspl\vtensor.gi
Class(VTensor, Tensor, rec(
new := (self, L) >> SPL(WithBases(self, rec(
_children := [L[1]],
dimensions := When(IsBound(L[1].dims), L[1].dims(),
L[1].dimensions) * L[2], vlen := L[2]))),
from_rChildren := (self, rch) >> ObjId(self)(rch[1], self.vlen),
print := (self,i,is) >> Print(self.name, "(",
self.child(1).print(i+is,is), ", ", self.vlen, ")"),
toAMat := self >> Tensor(self.child(1), I(self.vlen)).toAMat(),
sums := self >> Inherit(self, rec(_children :=
[self.child(1).sums()])),
isPermutation := False,
dims := self >> self.child(1).dims() * self.vlen,
needInterleavedLeft := False,
needInterleavedRight := False,
transpose := self >> VTensor(self.child(1).transpose(), self.vlen),
isBlockTransitive := true,
cannotChangeDataFormat := False,
));
Vector Σ-SPL Objects
Vector Gather
# spiral-core\namespaces\spiral\paradigms\vector\sigmaspl\gather.gi
Class(VGath, BaseVGath, SumsBase, rec(
rChildren := self >> [self.func],
rSetChild := rSetChildFields("func"),
from_rChildren := (self, rch) >> ObjId(self)(rch[1], self.v),
new := (self, func, v) >> SPL(WithBases(self,
rec(func := func, v := v))).setDims(),
dims := self >> [self.v*self.func.domain(),
self.v*self.func.range()],
transpose := self >> VScat(self.func, self.v),
print := (self,i,is) >> Print(self.name, "(", self.func, ", ",
self.v,")", self.printA()),
toAMat := self >> let(v:=self.v, n :=
EvalScalar(v*self.func.domain()),
N := EvalScalar(v*self.func.range()),
func := fTensor(self.func, fId(v)).lambda(),
AMatMat(List([0..n-1], row -> BasisVec(N,
EvalScalar(func.at(row).ev()))))),
));
Vector RC Objects: Manage Data Layout
Vector RC Class
Class(VRC, RC, rec(
toAMat := (self) >> AMatMat(RCMatCyc(MatSPL(self.child(1)))),
new := meth(self, spl, v)
local res;
res := SPL(WithBases(self, rec(_children:=[spl], v:=v,
dimensions := spl.dimensions)));
res.dimensions := res.dims();
return res;
end,
print := (self, i, is) >> Print(self.__name__,
"(\n", Blanks(i+is), self.child(1).print(i+is,is), ", ",
#"\n", Blanks(i+is),
self.v,
#"\n", Blanks(i),
")", self.printA()),
unroll := self >> self,
transpose := self >> VRC(self.child(1).conjTranspose(), self.v),
vcost := self >> self.child(1).vcost(),
from_rChildren := (self, rch) >> ObjId(self)(rch[1], self.v)
));
VRC, VRCL, VRCR, VRCLR
v1 := VRC(Tensor(I(2), F(2)), 4); # Implicit data reorganization
MatSPL(v1);
v2 := VRCL(Tensor(I(2), F(2)), 4); # The L and R encodes
MatSPL(v2);
v3 := VRCR(Tensor(I(2), F(2)), 4); # which side goes from
MatSPL(v3);
v4 := VRCLR(Tensor(I(2), F(2)), 4); # interleaved complex format to
MatSPL(v4); # block split complex format
Terminate VRC, VRCL, VRCR, VRCLR
Import(paradigms.vector.rewrite);
opts := SIMDGlobals.getOpts(AVX_4x64f);
# see how the format gets propagated down the tree
v5 := VRC(Tensor(I(2), F(2)) * Tensor(F(2), I(2)), 4);
RulesVRC(v5);
# when propagated to the leftmost/rightmost tree leaves, terminate
v6 := VRCL(VTensor(F(2), 4), 4);
v7 := Rewrite(v6, [RulesVRCTerm], opts);
# termnination inserts VPerms to implement local data format change
v8 := VRCR(VTensor(F(2), 4), 4);
v9 := Rewrite(v9, [RulesVRCTerm], opts);
Vector SumsGen and Rewriting
Default Sumsgen Handles Vector SPL to Σ-SPL
opts := SIMDGlobals.getOpts(AVX_4x64f);
opts.sumsgen;
opts.sumsgen.VTensor;
opts.sumsgen.VPerm;
Rewrite Rule Strategies
opts.formulaStrategies;
opts.formulaStrategies.sigmaSpl;
opts.formulaStrategies.preRC;
opts.formulaStrategies.postProcess;
Compile Strategies
opts.compileStrategy;
Print(opts.vector.isa.fixProblems);
Vector CodeGen
Default Sumsgen handles SPL to Σ-SPL.
# spiral-core\namespaces\spiral\paradigms\vector\sigmaspl\codegen.gi
Class(VectorCodegen, DefaultCodegen, rec(
VContainer := (self, o, y, x, opts) >>
self(o.child(1), y, x, CopyFields(opts, rec(
vector := rec(
isa := o.isa,
SIMD := LocalConfig.cpuinfo.SIMDname )))),
VPrm_x_I := (self, o, y, x, opts) >>
self(VTensor(Prm(o.func), o.v), y, x, opts),
VPerm := (self, o, y, x, opts) >> o.code(y, x),
VTensor := (self, o, y, x, opts) >> let(
CastToVect := p -> StripList(List(Flat([p]), e ->
tcast(TPtr(TVect(opts.vector.isa.t.t, o.vlen)), e))),
self(o.child(1), CastToVect(y), CastToVect(x), opts)),
VGath := (self, o, y, x, opts) >> Cond(IsUnalignedPtrT(x.t),
self(VGath_u(fTensor(o.func, fBase(o.v, 0)), o.v), y, x, opts),
self(VTensor(Gath(o.func), o.v), y, x, opts)),
VScat := (self, o, y, x, opts) >> Cond(IsUnalignedPtrT(y.t),
self(VScat_u(fTensor(o.func, fBase(o.v, 0)), o.v), y, x, opts),
self(VTensor(Scat(o.func), o.v), y, x, opts))
));
Vector Instructions
Every ISA defines ISA specific instructions and polymorphic add,…
# spiral-core\namespaces\spiral\platforms\avx\code.gi
# __m256d _mm256_insertf128_pd(__m256d a, __m128d b, int offset);
Class(vinsert_2l_4x64f, VecExp_4.binary(), rec(
ev := self >> let(
a := _unwrap(self.args[1].ev()),
b := _unwrap(self.args[2].ev()),
When( self.args[3].p[1] = 0,
b :: a{[3 .. 4]}, a{[1 .. 2]} :: b )),
computeType := self >> self.args[1].t,
));
# __m256d _mm256_unpackhi_pd(__m256d a, __m256d b);
Class(vunpackhi_4x64f, VecExp_4.binary(), rec(
semantic := (in1, in2, p) -> [in1[2], in2[2], in1[4], in2[4]],
ev := _evpack
));
#__m256 _mm256_blend_ps(__m256 m1, __m256 m2, const int mask);
Class(vblend_8x32f, VecExp_8.binary(), rec(
semantic := (in1, in2, p) ->
List( Zip2(TransposedMat([in1, in2]), p), e -> e[1][e[2]]),
params := self >> Replicate(8, [1,2]), ev := _evshuf2
));
Vector Strength Reduction and Fixup Rules
# spiral-core\namespaces\spiral\platforms\avx\sreduce.gi
RewriteRules(FixCodeAVX, rec(
fix_noneExp := Rule( noneExp, e -> e.t.zero()),
vpermf128_8x32f_to_vextract := Rule( [assign, [deref, @(1)],
[vpermf128_8x32f, @(2), @(3), @(4)]],
e -> let( p := @(4).val.p,
a := [[@(2).val, [0]], [@(2).val, [1]], [@(3).val,
[0]], [@(3).val, [1]]],
dst := tcast(TPtr(TVect(T_Real(32), 4)), @(1).val),
chain(
assign(deref(dst),
ApplyFunc(vextract_4l_8x32f, a[p[1]])),
assign( deref(dst+1),
ApplyFunc(vextract_4l_8x32f, a[p[2]])))
)),
addsub_4x64f_to_mul := Rule( [addsub_4x64f, _0, @(1)],
e -> mul(e.t.value([-1,1,-1,1]), @(1).val)),
addsub_8x32f_to_mul := Rule( [addsub_8x32f, _0, @(1)],
e -> mul(e.t.value([-1,1,-1,1,-1,1,-1,1]), @(1).val)),
avx_add_addsub_vzero := Rule([add, @(1), [@(2, [addsub_4x64f,
addsub_8x32f]), _0, @(3)]],
e -> ObjId(@(2).val)(@(1).val, @(3).val)),
));
Vector Unparser
Polymorphic Unparsing for standard icode, adds special instructions
# spiral-core\namespaces\spiral\platforms\avx\unparse.gi
Class(AVXUnparser, SSEUnparser, rec(
TVect := (self, t, vars, i, is) >> let(
ctype := self.ctype(t, _isa(self)),
Print(ctype, " ", self.infix(vars, ", "))),
vpack := (self, o, i, is) >> Cond( _avxT(o.t, self.opts),
Print("_mm256_set_", self.ctype_suffix(o.t, _isa(self)),
"(", self.infix(Reversed(o.args), ", "), ")"),
Inherited(o, i, is)),
sub := (self, o, i, is) >> Cond( _avxT(o.t, self.opts), let(
isa := _isa(self),
sfx := self.ctype_suffix(o.t, isa),
saturated := When(isa.isFixedPoint and
isa.saturatedArithmetic, "s", ""),
self.printf("_mm256_sub$1_$2($3, $4)", [saturated, sfx,
o.args[1], o.args[2]])),
Inherited(o, i, is)),
vextract_2l_4x64f := (self, o, i, is) >>
self.prefix("_mm256_extractf128_pd", o.args),
vstore_2l_4x64f := (self, o, i, is) >>
self.prefix("_mm256_extractf128_pd", o.args),
));
ISA Definition
ISA Definition file ties everyting together
# spiral-core\namespaces\spiral\platforms\avx\isa.gi
Class(AVX_4x64f, AVX_Intel, rec(
includes := () -> ["<include/omega64.h>"] :: _AVXINTRIN(),
v := 4,
t := TVect(T_Real(64), 4),
ctype := "double",
instr := [ vunpacklo_4x64f, vunpackhi_4x64f, vshuffle_4x64f,
vperm2_4x64f, vpermf128_4x64f, vperm_4x64f, vblend_4x64f ],
mul_cx := (self, opts) >>
((y, x, c) -> let( u1 := self.freshU(), u2 := self.freshU(),
u3 := self.freshU(),
decl([u1, u2, u3], chain(
assign(u1, mul(x, vunpacklo_4x64f(c, c))),
assign(u2, vshuffle_4x64f(x, x, [2,1,2,1])),
assign(u3, mul(u2, vunpackhi_4x64f(c, c))),
assign(y, addsub_4x64f(u1, u3)))))),
svload_init := (vt) -> [
[ y,x,opts) -> let(u1 := var.fresh_t("U", TVect(vt.t, 2)),
decl([u1], chain(
assign(u1, vload1sd_2x64f(x[1].toPtr(vt.t))),
assign(y, vinsert_2l_4x64f(vt.zero(), u1, [0]))))),
...
));