.. _simd: SIMD ==== .. only:: html .. contents:: Targeting SIMD Vector Instructions ++++++++++++++++++++++++++++++++++ Simple Example: Intel AVX 4-way Double Precision ------------------------------------------------ .. code-block:: none opts := SIMDGlobals.getOpts(AVX_4x64f); # default: real vectorization t := TRC(DFT(16)).withTags(opts.tags); rt := RandomRuleTree(t, opts); c := CodeRuleTree(rt, opts); PrintCode("AVX_DFT16", c, opts); Stepwise Code Generation ------------------------ .. code-block:: none opts.tags; # what are the tags opts.tags[1].v; # vector length opts.tags[1].isa; # targeted ISA opts.vector; # check out the options used spl := SPLRuleTree(rt); # There are SIMD SPL objects s := SumsRuleTree(rt, opts); # and a SMP ISum InfinityNormMat(MatSPL(s) - MatSPL(t)); # correctness check Complex Vectorization Example ----------------------------- .. code-block:: none optsc := SIMDGlobals.getOpts(AVX_4x64f, # __m256d = (re,im,re,im) rec(realVect := false, cplxVect := true)); rtc := RandomRuleTree(t, optsc); # complex vectorized DFT(16) sc := SumsRuleTree(rtc, optsc); cc := CodeRuleTree(rtc, optsc); # far fewer shuffle operations PrintCode("AVXcplx_DFT16", cc, optsc); Tags ++++ Real Vectorization ------------------ .. code-block:: none # spiral-core\namespaces\spiral\paradigms\vector\tags.gi Class(AVecReg, AGenericTag, rec( isReg := true, isRegCx := false, isVec := true, updateParams := meth(self) Checked(IsSIMD_ISA(self.params[1])); Checked(Length(self.params)=1); self.v := self.params[1].v; self.isa := self.params[1]; end, container := (self, spl) >> paradigms.vector.sigmaspl.VContainer(spl, self.isa) )); Complex Vectorization --------------------- .. code-block:: none Class(AVecRegCx, AVecReg, rec( updateParams := meth(self) Checked(IsSIMD_ISA(self.params[1])); Checked(Length(self.params)=1); self.v := self.params[1].v/2; self.isa := self.params[1]; end, container := (self, spl) >> paradigms.vector.sigmaspl.VContainer(spl, self.isa.cplx()), isRegCx := true )); Vectorization Rules +++++++++++++++++++ Simple Vectorization Rule Example --------------------------------- .. code-block:: none # spiral-core\namespaces\spiral\paradigms\vector\breakdown.gi NewRulesFor(TTensorI, rec( AxI_vec := rec( forTransposition := false, applicable := nt -> nt.hasTags() and IsVecVec(nt.params) and (nt.isTag(1,AVecReg) or nt.isTag(1,AVecRegCx)) and IsInt(nt.params[2]/nt.firstTag().v), children := nt -> let(r := nt.params[2] / nt.firstTag().v, isa := nt.firstTag().isa, [[ When(r = 1, When(nt.numTags() = 1, nt.params[1].setWrap(VWrap(isa)), nt.params[1].setWrap( Drop(nt.getTags(), 1)).setWrap(VWrap(isa)) ), TTensorI(nt.params[1].setWrap(VWrap(isa)), r, AVec, AVec).withTags(Drop(nt.getTags(), 1)) )]] ), apply := (nt, c, cnt) -> VTensor(c[1], nt.firstTag().v) ) )); Formula Rewrite --------------- Kronecker Commute ################# .. code-block:: none # spiral-core\namespaces\spiral\paradigms\vector\breakdown.gi NewRulesFor(TTensorI, rec( IxA_vec := rec(forTransposition := false, applicable := nt -> IsParPar(nt.params) and nt.hasTags() and (nt.isTag(1,AVecReg) or nt.isTag(1,AVecRegCx)) and IsInt(nt.params[2]/nt.firstTag().v), children := nt -> let(pv := nt.getTags(), v := pv[1].v, isa := pv[1].isa, d := nt.params[1].dims(), [[ TL(d[1]*v, v, 1, 1).withTags(pv).setWrap(VWrapId), When(Length(pv)=1, nt.params[1].setWrap(VWrap(isa)), nt.params[1].setpv(Drop(pv, 1)).setWrap(VWrap(isa))), TL(d[2]*v, d[2], 1, 1).withTags(pv).setWrap(VWrapId) ]]), apply := (nt, c, cnt) -> let( l := nt.params[2] / nt.firstTag().v, A := c[1] * VTensor(c[2], nt.firstTag().v) * c[3], NoDiagPullin(When(l=1, A , Tensor(I(l), A)))) ) )); Wrapping ++++++++ VWrap Transformation -------------------- .. code-block:: none # spiral-core\namespaces\spiral\paradigms\vector\vwrap.gi Class(VWrap, VWrapBase, rec( __call__ := (self,isa) >> Checked(IsSIMD_ISA(isa), WithBases(self, rec(operations:=PrintOps, isa:=isa))), wrap := (self,r,t,opts) >> let( isa := self.isa, v := isa.v, tt := When(t.isReal(), @_Base(paradigms.vector.sigmaspl.VTensor(r.node, v), r), paradigms.vector.breakdown.AxI_vec( TTensorI(TRC(t).withTags([AVecReg(isa)]), v, AVec, AVec).withTags([AVecReg(isa)]), paradigms.vector.breakdown.TRC_VRCLR( TRC(t).withTags([AVecReg(isa)]), r))), print := self >> Print(self.name, "(", self.isa, ")"), )); VWrap in Context of Search -------------------------- Used in DP to time sub-trees. .. code-block:: none opts := SIMDGlobals.getOpts(AVX_4x64f); t := DFT(2).setWrap(VWrap(AVX_4x64f)); rt := RandomRuleTree(t, opts); wrt := t.wrap.wrap(rt, t, opts); SPLRuleTree(wrt); Special Role of Stride Permutations +++++++++++++++++++++++++++++++++++ TL: Lift Stride Permutation to Non-Terminal Level ------------------------------------------------- .. code-block:: none # spiral-core\namespaces\spiral\paradigms\common\nonterms.gi Class(TL, Tagged_tSPL_Container, rec( abbrevs := [ (size, stride) -> Checked(ForAll([size, stride], IsPosIntSym), [size, stride, 1, 1]), (size, stride, left, right) -> Checked(ForAll([size, stride, left, right], IsPosIntSym), [size, stride, left, right]) ], __call__ := arg >> let(self := arg[1], args := Drop(arg, 1), Cond(args=[1,1,1,1], I(1), ApplyFunc(Inherited,args))), dims := self >> Replicate(2, self.params[1]*self.params[3]*self.params[4]), terminate := self >> Tensor(I(self.params[3]), L(self.params[1], self.params[2]), I(self.params[4])), transpose := self >> TL(self.params[1], self.params[1]/self.params[2], self.params[3], self.params[4]).withTags(self.getTags()), )); TL in Context of Search ----------------------- Used in DP to time sub-trees. .. code-block:: none t := TL(8,2,2,4); t.terminate(); Architecture Specific Permutations ++++++++++++++++++++++++++++++++++ Looking up vectorized Implementations for TL -------------------------------------------- .. code-block:: none Import(paradigms.vector.sigmaspl); opts := SIMDGlobals.getOpts(AVX_4x64f); opts.breakdownRules.TL; t := TL(16,4,1,1).withTags(opts.tags); # a in-register perm rt := RandomRuleTree(t, opts); HashLookup(opts.baseHashes[1], t); # the implementation is cached s := SPLRuleTree(rt); vp := Collect(s, VPerm)[1]; # SPL object carries its code vp.code; # code generator refers to ISA AVX_4x64f.rules; # ISA carries implementations PrintCode("", vp.code(Y, X), opts); # for in-register perms SIMD ISA Database and Bootstrapping a Vector Architecture --------------------------------------------------------- .. code-block:: none SIMD_ISA_DB; # central SIMD data base SIMD_ISA_DB.installed(); # all the ISAs supported Doc(AVX_4x64f); # The ISA carries all the relevant info Print(SIMD_ISA_DB.buildBases); # How the base cases are built AVX_4x64f.buildRules; # bootstrapping function SIMD_ISA_DB.getBases(AVX_4x64f); # all the base cases needed SIMD_ISA_DB.lookupBases(AVX_4x64f); # and how they are implemented ISA Database and Hashed Breakdowns ++++++++++++++++++++++++++++++++++ Generic Breakdown Rules to Look Up Architecture Specific Code ------------------------------------------------------------- .. code-block:: none # spiral-core\namespaces\spiral\paradigms\vector\bases\tl_bases.gi NewRulesFor(TL, rec( SIMD_ISA_Bases1 := rec( forTransposition := false, applicable := (self, t) >> t.isTag(1, AVecReg) and let(isa := t.firstTag().isa, P:=t.params, isa.active and ForAny(isa.supportedTL(), e -> _TL_applicable(e, P[1], P[2], P[3], P[4]))), apply := function(nt,C,cnt) local isa, tl, ll, vprm, P; P:=nt.params; isa := nt.firstTag().isa; tl := isa.getTL(P); ll := P[3] / tl.perm.l; vprm := tl.vperm; return When(ll = 1, vprm, BlockVPerm(ll, isa.v, vprm, tl.perm.spl)); end, ) )); Stride Permutatiopn Identities as Rules +++++++++++++++++++++++++++++++++++++++ Generic Breakdown Rules to Look Up Architecture Specific Code ------------------------------------------------------------- .. code-block:: none # spiral-core\namespaces\spiral\paradigms\common\breakdown.gi NewRulesFor(TL, rec( IxLxI_kmn_n := rec (forTransposition := false, applicable := nt -> Length(DivisorsIntDrop(nt.params[1]/nt.params[2])) > 0, children := nt -> let( N := nt.params[1], n := nt.params[2], km := N/n, ml := DivisorsIntDrop(km), l := nt.params[3], r := nt.params[4], List(ml, m -> let( k := km/m, [ TL(k*n, n, l, r*m).withTags(nt.getTags()), TL(m*n, n, k*l, r).withTags(nt.getTags()) ])) ), apply := (nt, c, cnt) -> let( spl := c[1] * c[2], When(nt.params[1] = nt.params[2]^2, SymSPL(spl), spl ) ) )); SPL And Σ-SPL Vector Objects ++++++++++++++++++++++++++++ Generate The Example -------------------- .. code-block:: none Import(paradigms.vector.sigmaspl); opts := SIMDGlobals.getOpts(AVX_4x64f); rt := RandomRuleTree(TRC(DFT(16)).withTags(opts.tags), opts); s := SPLRuleTree(rt); # SPL Objects ss := SumsRuleTree(rt, opts); # Sigma-SPL Objects Inspecting the Vector Objects ----------------------------- .. code-block:: none Collect(s, VTensor)[1]; # Vectorized Tensor(., I(v)) Collect(ss, VTensor)[1]; # Vectorized Tensor(., I(v)) Collect(s, VPerm)[1]; # Vectorized Prm(.) Collect(s, BlockVPerm)[1]; # Vectorized Tensor(I(.), Prm(.)) Collect(s, VContainer)[1]; # Provides context for rewriting Collect(s, VRC)[1]; # Carries interleaved complex format Collect(ss, VGath)[1]; # Tensor(Gath(.), I(v)) Collect(ss, VScat)[1]; # Tensor(Scat(.), I(v)) Collect(ss, VRCDiag)[1]; # Vectorized Diag(.) Vector SPL Objects ++++++++++++++++++ Vector Tensor SPL Object ------------------------ .. code-block:: none # spiral-core\namespaces\spiral\paradigms\vector\sigmaspl\vtensor.gi Class(VTensor, Tensor, rec( new := (self, L) >> SPL(WithBases(self, rec( _children := [L[1]], dimensions := When(IsBound(L[1].dims), L[1].dims(), L[1].dimensions) * L[2], vlen := L[2]))), from_rChildren := (self, rch) >> ObjId(self)(rch[1], self.vlen), print := (self,i,is) >> Print(self.name, "(", self.child(1).print(i+is,is), ", ", self.vlen, ")"), toAMat := self >> Tensor(self.child(1), I(self.vlen)).toAMat(), sums := self >> Inherit(self, rec(_children := [self.child(1).sums()])), isPermutation := False, dims := self >> self.child(1).dims() * self.vlen, needInterleavedLeft := False, needInterleavedRight := False, transpose := self >> VTensor(self.child(1).transpose(), self.vlen), isBlockTransitive := true, cannotChangeDataFormat := False, )); Vector Σ-SPL Objects ++++++++++++++++++++ Vector Gather ------------- .. code-block:: none # spiral-core\namespaces\spiral\paradigms\vector\sigmaspl\gather.gi Class(VGath, BaseVGath, SumsBase, rec( rChildren := self >> [self.func], rSetChild := rSetChildFields("func"), from_rChildren := (self, rch) >> ObjId(self)(rch[1], self.v), new := (self, func, v) >> SPL(WithBases(self, rec(func := func, v := v))).setDims(), dims := self >> [self.v*self.func.domain(), self.v*self.func.range()], transpose := self >> VScat(self.func, self.v), print := (self,i,is) >> Print(self.name, "(", self.func, ", ", self.v,")", self.printA()), toAMat := self >> let(v:=self.v, n := EvalScalar(v*self.func.domain()), N := EvalScalar(v*self.func.range()), func := fTensor(self.func, fId(v)).lambda(), AMatMat(List([0..n-1], row -> BasisVec(N, EvalScalar(func.at(row).ev()))))), )); Vector RC Objects: Manage Data Layout +++++++++++++++++++++++++++++++++++++ Vector RC Class --------------- .. code-block:: none Class(VRC, RC, rec( toAMat := (self) >> AMatMat(RCMatCyc(MatSPL(self.child(1)))), new := meth(self, spl, v) local res; res := SPL(WithBases(self, rec(_children:=[spl], v:=v, dimensions := spl.dimensions))); res.dimensions := res.dims(); return res; end, print := (self, i, is) >> Print(self.__name__, "(\n", Blanks(i+is), self.child(1).print(i+is,is), ", ", #"\n", Blanks(i+is), self.v, #"\n", Blanks(i), ")", self.printA()), unroll := self >> self, transpose := self >> VRC(self.child(1).conjTranspose(), self.v), vcost := self >> self.child(1).vcost(), from_rChildren := (self, rch) >> ObjId(self)(rch[1], self.v) )); VRC, VRCL, VRCR, VRCLR ---------------------- .. code-block:: none v1 := VRC(Tensor(I(2), F(2)), 4); # Implicit data reorganization MatSPL(v1); v2 := VRCL(Tensor(I(2), F(2)), 4); # The L and R encodes MatSPL(v2); v3 := VRCR(Tensor(I(2), F(2)), 4); # which side goes from MatSPL(v3); v4 := VRCLR(Tensor(I(2), F(2)), 4); # interleaved complex format to MatSPL(v4); # block split complex format Terminate VRC, VRCL, VRCR, VRCLR -------------------------------- .. code-block:: none Import(paradigms.vector.rewrite); opts := SIMDGlobals.getOpts(AVX_4x64f); # see how the format gets propagated down the tree v5 := VRC(Tensor(I(2), F(2)) * Tensor(F(2), I(2)), 4); RulesVRC(v5); # when propagated to the leftmost/rightmost tree leaves, terminate v6 := VRCL(VTensor(F(2), 4), 4); v7 := Rewrite(v6, [RulesVRCTerm], opts); # termnination inserts VPerms to implement local data format change v8 := VRCR(VTensor(F(2), 4), 4); v9 := Rewrite(v9, [RulesVRCTerm], opts); Vector SumsGen and Rewriting ++++++++++++++++++++++++++++ Default Sumsgen Handles Vector SPL to Σ-SPL ------------------------------------------- .. code-block:: none opts := SIMDGlobals.getOpts(AVX_4x64f); opts.sumsgen; opts.sumsgen.VTensor; opts.sumsgen.VPerm; Rewrite Rule Strategies ----------------------- .. code-block:: none opts.formulaStrategies; opts.formulaStrategies.sigmaSpl; opts.formulaStrategies.preRC; opts.formulaStrategies.postProcess; Compile Strategies ------------------ .. code-block:: none opts.compileStrategy; Print(opts.vector.isa.fixProblems); Vector CodeGen ++++++++++++++ Default Sumsgen handles SPL to Σ-SPL. .. code-block:: none # spiral-core\namespaces\spiral\paradigms\vector\sigmaspl\codegen.gi Class(VectorCodegen, DefaultCodegen, rec( VContainer := (self, o, y, x, opts) >> self(o.child(1), y, x, CopyFields(opts, rec( vector := rec( isa := o.isa, SIMD := LocalConfig.cpuinfo.SIMDname )))), VPrm_x_I := (self, o, y, x, opts) >> self(VTensor(Prm(o.func), o.v), y, x, opts), VPerm := (self, o, y, x, opts) >> o.code(y, x), VTensor := (self, o, y, x, opts) >> let( CastToVect := p -> StripList(List(Flat([p]), e -> tcast(TPtr(TVect(opts.vector.isa.t.t, o.vlen)), e))), self(o.child(1), CastToVect(y), CastToVect(x), opts)), VGath := (self, o, y, x, opts) >> Cond(IsUnalignedPtrT(x.t), self(VGath_u(fTensor(o.func, fBase(o.v, 0)), o.v), y, x, opts), self(VTensor(Gath(o.func), o.v), y, x, opts)), VScat := (self, o, y, x, opts) >> Cond(IsUnalignedPtrT(y.t), self(VScat_u(fTensor(o.func, fBase(o.v, 0)), o.v), y, x, opts), self(VTensor(Scat(o.func), o.v), y, x, opts)) )); Vector Instructions +++++++++++++++++++ Every ISA defines ISA specific instructions and polymorphic add,… .. code-block:: none # spiral-core\namespaces\spiral\platforms\avx\code.gi # __m256d _mm256_insertf128_pd(__m256d a, __m128d b, int offset); Class(vinsert_2l_4x64f, VecExp_4.binary(), rec( ev := self >> let( a := _unwrap(self.args[1].ev()), b := _unwrap(self.args[2].ev()), When( self.args[3].p[1] = 0, b :: a{[3 .. 4]}, a{[1 .. 2]} :: b )), computeType := self >> self.args[1].t, )); # __m256d _mm256_unpackhi_pd(__m256d a, __m256d b); Class(vunpackhi_4x64f, VecExp_4.binary(), rec( semantic := (in1, in2, p) -> [in1[2], in2[2], in1[4], in2[4]], ev := _evpack )); #__m256 _mm256_blend_ps(__m256 m1, __m256 m2, const int mask); Class(vblend_8x32f, VecExp_8.binary(), rec( semantic := (in1, in2, p) -> List( Zip2(TransposedMat([in1, in2]), p), e -> e[1][e[2]]), params := self >> Replicate(8, [1,2]), ev := _evshuf2 )); Vector Strength Reduction and Fixup Rules +++++++++++++++++++++++++++++++++++++++++ .. code-block:: none # spiral-core\namespaces\spiral\platforms\avx\sreduce.gi RewriteRules(FixCodeAVX, rec( fix_noneExp := Rule( noneExp, e -> e.t.zero()), vpermf128_8x32f_to_vextract := Rule( [assign, [deref, @(1)], [vpermf128_8x32f, @(2), @(3), @(4)]], e -> let( p := @(4).val.p, a := [[@(2).val, [0]], [@(2).val, [1]], [@(3).val, [0]], [@(3).val, [1]]], dst := tcast(TPtr(TVect(T_Real(32), 4)), @(1).val), chain( assign(deref(dst), ApplyFunc(vextract_4l_8x32f, a[p[1]])), assign( deref(dst+1), ApplyFunc(vextract_4l_8x32f, a[p[2]]))) )), addsub_4x64f_to_mul := Rule( [addsub_4x64f, _0, @(1)], e -> mul(e.t.value([-1,1,-1,1]), @(1).val)), addsub_8x32f_to_mul := Rule( [addsub_8x32f, _0, @(1)], e -> mul(e.t.value([-1,1,-1,1,-1,1,-1,1]), @(1).val)), avx_add_addsub_vzero := Rule([add, @(1), [@(2, [addsub_4x64f, addsub_8x32f]), _0, @(3)]], e -> ObjId(@(2).val)(@(1).val, @(3).val)), )); Vector Unparser +++++++++++++++ Polymorphic Unparsing for standard icode, adds special instructions .. code-block:: none # spiral-core\namespaces\spiral\platforms\avx\unparse.gi Class(AVXUnparser, SSEUnparser, rec( TVect := (self, t, vars, i, is) >> let( ctype := self.ctype(t, _isa(self)), Print(ctype, " ", self.infix(vars, ", "))), vpack := (self, o, i, is) >> Cond( _avxT(o.t, self.opts), Print("_mm256_set_", self.ctype_suffix(o.t, _isa(self)), "(", self.infix(Reversed(o.args), ", "), ")"), Inherited(o, i, is)), sub := (self, o, i, is) >> Cond( _avxT(o.t, self.opts), let( isa := _isa(self), sfx := self.ctype_suffix(o.t, isa), saturated := When(isa.isFixedPoint and isa.saturatedArithmetic, "s", ""), self.printf("_mm256_sub$1_$2($3, $4)", [saturated, sfx, o.args[1], o.args[2]])), Inherited(o, i, is)), vextract_2l_4x64f := (self, o, i, is) >> self.prefix("_mm256_extractf128_pd", o.args), vstore_2l_4x64f := (self, o, i, is) >> self.prefix("_mm256_extractf128_pd", o.args), )); ISA Definition ++++++++++++++ ISA Definition file ties everyting together .. code-block:: none # spiral-core\namespaces\spiral\platforms\avx\isa.gi Class(AVX_4x64f, AVX_Intel, rec( includes := () -> [""] :: _AVXINTRIN(), v := 4, t := TVect(T_Real(64), 4), ctype := "double", instr := [ vunpacklo_4x64f, vunpackhi_4x64f, vshuffle_4x64f, vperm2_4x64f, vpermf128_4x64f, vperm_4x64f, vblend_4x64f ], mul_cx := (self, opts) >> ((y, x, c) -> let( u1 := self.freshU(), u2 := self.freshU(), u3 := self.freshU(), decl([u1, u2, u3], chain( assign(u1, mul(x, vunpacklo_4x64f(c, c))), assign(u2, vshuffle_4x64f(x, x, [2,1,2,1])), assign(u3, mul(u2, vunpackhi_4x64f(c, c))), assign(y, addsub_4x64f(u1, u3)))))), svload_init := (vt) -> [ [ y,x,opts) -> let(u1 := var.fresh_t("U", TVect(vt.t, 2)), decl([u1], chain( assign(u1, vload1sd_2x64f(x[1].toPtr(vt.t))), assign(y, vinsert_2l_4x64f(vt.zero(), u1, [0]))))), ... ));