| ; RUN: llc < %s -mcpu=tensorcore-pf -asm-verbose=false -disable-cgp | FileCheck %s |
| ; REQUIRES: tpu |
| |
| ; Test code generation for the TPU EUP intrinsics and for fdiv lowered through vrcp. |
| |
| target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64" |
| target triple = "googletpu" |
| |
| ; EUP operation intrinsics. Each takes a <1024 x float> operand and returns an |
| ; i32 that the tests below pass directly to llvm.tpu.eup.pop. |
| declare i32 @llvm.tpu.rsqrt.v1024f32(<1024 x float>) |
| declare i32 @llvm.tpu.pow2.v1024f32(<1024 x float>) |
| declare i32 @llvm.tpu.log2.v1024f32(<1024 x float>) |
| declare i32 @llvm.tpu.tanh.v1024f32(<1024 x float>) |
| declare i32 @llvm.tpu.rcp.v1024f32(<1024 x float>) |
| declare i32 @llvm.tpu.eup.push.v1024f32(<1024 x float>) |
| |
| ; Pop intrinsic: consumes the i32 produced by one of the calls above and yields |
| ; the <1024 x float> result (printed as "vpop (erf)" in the expected output). |
| declare <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32) |
| |
| ; rsqrt intrinsic followed immediately by its pop: expect the vrsqrt.f32 issue |
| ; into (erf), a vdelay of $0x6, and then the vpop of the result. |
| ; CHECK-LABEL: rsqrt: |
| ; CHECK: (erf) = vrsqrt.f32 v0 |
| ; CHECK: _ = vdelay $0x6 |
| ; CHECK: v0 = vpop (erf) |
| define <1024 x float> @rsqrt(<1024 x float> %v) { |
| %f = call i32 @llvm.tpu.rsqrt.v1024f32(<1024 x float> %v) |
| %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f) |
| ret <1024 x float> %res |
| } |
| |
| ; pow2 intrinsic followed immediately by its pop: expect the vpow2.f32 issue |
| ; into (erf), a vdelay of $0x6, and then the vpop of the result. |
| ; CHECK-LABEL: pow2: |
| ; CHECK: (erf) = vpow2.f32 v0 |
| ; CHECK: _ = vdelay $0x6 |
| ; CHECK: v0 = vpop (erf) |
| define <1024 x float> @pow2(<1024 x float> %v) { |
| %f = call i32 @llvm.tpu.pow2.v1024f32(<1024 x float> %v) |
| %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f) |
| ret <1024 x float> %res |
| } |
| |
| ; log2 intrinsic followed immediately by its pop: expect the vlog2.f32 issue |
| ; into (erf), a vdelay of $0x6, and then the vpop of the result. |
| ; CHECK-LABEL: log2: |
| ; CHECK: (erf) = vlog2.f32 v0 |
| ; CHECK: _ = vdelay $0x6 |
| ; CHECK: v0 = vpop (erf) |
| define <1024 x float> @log2(<1024 x float> %v) { |
| %f = call i32 @llvm.tpu.log2.v1024f32(<1024 x float> %v) |
| %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f) |
| ret <1024 x float> %res |
| } |
| |
| ; tanh intrinsic followed immediately by its pop: expect the vtanh.f32 issue |
| ; into (erf), a vdelay of $0x6, and then the vpop of the result. |
| ; CHECK-LABEL: tanh: |
| ; CHECK: (erf) = vtanh.f32 v0 |
| ; CHECK: _ = vdelay $0x6 |
| ; CHECK: v0 = vpop (erf) |
| define <1024 x float> @tanh(<1024 x float> %v) { |
| %f = call i32 @llvm.tpu.tanh.v1024f32(<1024 x float> %v) |
| %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f) |
| ret <1024 x float> %res |
| } |
| |
| ; rcp intrinsic followed immediately by its pop: expect the vrcp.f32 issue |
| ; into (erf), a vdelay of $0x6, and then the vpop of the result. |
| ; CHECK-LABEL: rcp: |
| ; CHECK: (erf) = vrcp.f32 v0 |
| ; CHECK: _ = vdelay $0x6 |
| ; CHECK: v0 = vpop (erf) |
| define <1024 x float> @rcp(<1024 x float> %v) { |
| %f = call i32 @llvm.tpu.rcp.v1024f32(<1024 x float> %v) |
| %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f) |
| ret <1024 x float> %res |
| } |
| |
| ; fdiv of a splatted constant 1.0 by a vector: expected to lower to a vrcp of |
| ; the divisor followed by a vmul with the immediate. |
| ; CHECK-LABEL: fdivrcpimm: |
| ; CHECK: (erf) = vrcp.f32 v0 |
| ; CHECK: _ = vdelay $0x6 |
| ; CHECK: v0 = vpop (erf) |
| ; CHECK: v0 = vmul.f32 $1.0, v0 |
| define <1024 x float> @fdivrcpimm(<1024 x float> %v) { |
| %one.lane0 = insertelement <1024 x float> undef, float 1.0, i32 0 |
| %one.bcast = shufflevector <1024 x float> %one.lane0, <1024 x float> undef, <1024 x i32> zeroinitializer |
| %quot = fdiv <1024 x float> %one.bcast, %v |
| ret <1024 x float> %quot |
| } |
| |
| ; fdiv of a splatted scalar argument by a vector: expected to lower to a vrcp |
| ; of the divisor followed by a vmul with the scalar register. |
| ; CHECK-LABEL: fdivrcpvar: |
| ; CHECK: (erf) = vrcp.f32 v0 |
| ; CHECK: _ = vdelay $0x6 |
| ; CHECK: v0 = vpop (erf) |
| ; CHECK: v0 = vmul.f32 s0, v0 |
| define <1024 x float> @fdivrcpvar(float %a, <1024 x float> %v) { |
| %a.lane0 = insertelement <1024 x float> undef, float %a, i32 0 |
| %a.bcast = shufflevector <1024 x float> %a.lane0, <1024 x float> undef, <1024 x i32> zeroinitializer |
| %quot = fdiv <1024 x float> %a.bcast, %v |
| ret <1024 x float> %quot |
| } |
| |
| ; fdiv of two vectors: expected to lower to a vrcp of the divisor (v1) |
| ; followed by a vmul of the dividend with the popped reciprocal. |
| ; CHECK-LABEL: fdivrcpvec: |
| ; CHECK: (erf) = vrcp.f32 v1 |
| ; CHECK: _ = vdelay $0x6 |
| ; CHECK: v1 = vpop (erf) |
| ; CHECK: v0 = vmul.f32 v0, v1 |
| define <1024 x float> @fdivrcpvec(<1024 x float> %vx, <1024 x float> %vy) { |
| %quot = fdiv <1024 x float> %vx, %vy |
| ret <1024 x float> %quot |
| } |
| |
| ; eup.push intrinsic followed immediately by its pop: expect the vpush issue |
| ; into (erf), a vdelay of $0x6, and then the vpop of the result. |
| ; CHECK-LABEL: push: |
| ; CHECK: (erf) = vpush v0 |
| ; CHECK: _ = vdelay $0x6 |
| ; CHECK: v0 = vpop (erf) |
| define <1024 x float> @push(<1024 x float> %v) { |
| %f = call i32 @llvm.tpu.eup.push.v1024f32(<1024 x float> %v) |
| %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f) |
| ret <1024 x float> %res |
| } |
| |
| ; Four independent EUP operations on the same input whose results are summed. |
| ; The expected output issues all four operations back to back, separated only |
| ; by short vdelays, and performs the four vpops afterwards. |
| ; CHECK-LABEL: eup_scheduling: |
| ; CHECK: (erf) = vrcp.f32 |
| ; CHECK: _ = vdelay $0x1 |
| ; CHECK: (erf) = vtanh.f32 |
| ; CHECK: _ = vdelay $0x1 |
| ; CHECK: (erf) = vlog2.f32 |
| ; CHECK: _ = vdelay $0x1 |
| ; CHECK: (erf) = vpow2.f32 |
| ; CHECK: _ = vdelay $0x3 |
| ; CHECK: v{{[0-9]+}} = vpop (erf) |
| ; CHECK: v{{[0-9]+}} = vpop (erf) |
| ; CHECK: v{{[0-9]+}} = vpop (erf) |
| ; CHECK: v{{[0-9]+}} = vpop (erf) |
| define <1024 x float> @eup_scheduling(<1024 x float> %v) { |
| %rcp.in = call i32 @llvm.tpu.rcp.v1024f32(<1024 x float> %v) |
| %rcp.out = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %rcp.in) |
| %tanh.in = call i32 @llvm.tpu.tanh.v1024f32(<1024 x float> %v) |
| %tanh.out = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %tanh.in) |
| %log2.in = call i32 @llvm.tpu.log2.v1024f32(<1024 x float> %v) |
| %log2.out = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %log2.in) |
| %pow2.in = call i32 @llvm.tpu.pow2.v1024f32(<1024 x float> %v) |
| %pow2.out = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %pow2.in) |
| %sum.lo = fadd <1024 x float> %rcp.out, %tanh.out |
| %sum.hi = fadd <1024 x float> %log2.out, %pow2.out |
| %sum.all = fadd <1024 x float> %sum.hi, %sum.lo |
| ret <1024 x float> %sum.all |
| } |