blob: cc404de2e19ea38148b1d2568e981a222b92b8f6 [file] [log] [blame]
; RUN: llc < %s -mcpu=tensorcore-pf -asm-verbose=false -disable-cgp | FileCheck %s
; REQUIRES: tpu
; Test EUP intrinsics code generation
;
; Every function in this file is compiled by llc and the resulting assembly is
; matched by FileCheck against the directives written directly above it.
target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
target triple = "googletpu"
; Intrinsics taking a <1024 x float> operand and returning an i32. In this file
; that i32 is only ever passed on to @llvm.tpu.eup.pop.v1024f32.
declare i32 @llvm.tpu.rsqrt.v1024f32(<1024 x float>)
declare i32 @llvm.tpu.pow2.v1024f32(<1024 x float>)
declare i32 @llvm.tpu.log2.v1024f32(<1024 x float>)
declare i32 @llvm.tpu.tanh.v1024f32(<1024 x float>)
declare i32 @llvm.tpu.rcp.v1024f32(<1024 x float>)
declare i32 @llvm.tpu.eup.push.v1024f32(<1024 x float>)
; Takes the i32 produced by one of the intrinsics above and yields the
; <1024 x float> result.
declare <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32)
; rsqrt: the i32 returned by the rsqrt intrinsic is handed straight to eup.pop.
; Expected assembly: vrsqrt into (erf), a vdelay, then the vpop of the result.
; CHECK-LABEL: rsqrt:
; CHECK: (erf) = vrsqrt.f32 v0
; CHECK: _ = vdelay $0x6
; CHECK: v0 = vpop (erf)
define <1024 x float> @rsqrt(<1024 x float> %v) {
  %f = call i32 @llvm.tpu.rsqrt.v1024f32(<1024 x float> %v)
  %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
  ret <1024 x float> %res
}
; pow2: the i32 returned by the pow2 intrinsic is handed straight to eup.pop.
; Expected assembly: vpow2 into (erf), a vdelay, then the vpop of the result.
; CHECK-LABEL: pow2:
; CHECK: (erf) = vpow2.f32 v0
; CHECK: _ = vdelay $0x6
; CHECK: v0 = vpop (erf)
define <1024 x float> @pow2(<1024 x float> %v) {
  %f = call i32 @llvm.tpu.pow2.v1024f32(<1024 x float> %v)
  %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
  ret <1024 x float> %res
}
; log2: the i32 returned by the log2 intrinsic is handed straight to eup.pop.
; Expected assembly: vlog2 into (erf), a vdelay, then the vpop of the result.
; CHECK-LABEL: log2:
; CHECK: (erf) = vlog2.f32 v0
; CHECK: _ = vdelay $0x6
; CHECK: v0 = vpop (erf)
define <1024 x float> @log2(<1024 x float> %v) {
  %f = call i32 @llvm.tpu.log2.v1024f32(<1024 x float> %v)
  %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
  ret <1024 x float> %res
}
; tanh: the i32 returned by the tanh intrinsic is handed straight to eup.pop.
; Expected assembly: vtanh into (erf), a vdelay, then the vpop of the result.
; CHECK-LABEL: tanh:
; CHECK: (erf) = vtanh.f32 v0
; CHECK: _ = vdelay $0x6
; CHECK: v0 = vpop (erf)
define <1024 x float> @tanh(<1024 x float> %v) {
  %f = call i32 @llvm.tpu.tanh.v1024f32(<1024 x float> %v)
  %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
  ret <1024 x float> %res
}
; rcp: the i32 returned by the rcp intrinsic is handed straight to eup.pop.
; Expected assembly: vrcp into (erf), a vdelay, then the vpop of the result.
; CHECK-LABEL: rcp:
; CHECK: (erf) = vrcp.f32 v0
; CHECK: _ = vdelay $0x6
; CHECK: v0 = vpop (erf)
define <1024 x float> @rcp(<1024 x float> %v) {
  %f = call i32 @llvm.tpu.rcp.v1024f32(<1024 x float> %v)
  %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
  ret <1024 x float> %res
}
; fdivrcpimm: divides a splat of the constant 1.0 by the vector argument. The
; expected assembly is a vrcp/vpop pair followed by a vmul with the immediate.
; CHECK-LABEL: fdivrcpimm:
; CHECK: (erf) = vrcp.f32 v0
; CHECK: _ = vdelay $0x6
; CHECK: v0 = vpop (erf)
; CHECK: v0 = vmul.f32 $1.0, v0
define <1024 x float> @fdivrcpimm(<1024 x float> %v) {
  ; Broadcast lane 0 (holding 1.0) across all lanes.
  %one_lane0 = insertelement <1024 x float> undef, float 1.0, i32 0
  %ones = shufflevector <1024 x float> %one_lane0, <1024 x float> undef, <1024 x i32> zeroinitializer
  %quot = fdiv <1024 x float> %ones, %v
  ret <1024 x float> %quot
}
; fdivrcpvar: divides a splat of the scalar argument %a by the vector argument.
; The expected assembly is a vrcp/vpop pair followed by a vmul with s0.
; CHECK-LABEL: fdivrcpvar:
; CHECK: (erf) = vrcp.f32 v0
; CHECK: _ = vdelay $0x6
; CHECK: v0 = vpop (erf)
; CHECK: v0 = vmul.f32 s0, v0
define <1024 x float> @fdivrcpvar(float %a, <1024 x float> %v) {
  ; Broadcast lane 0 (holding %a) across all lanes.
  %a_lane0 = insertelement <1024 x float> undef, float %a, i32 0
  %a_bcast = shufflevector <1024 x float> %a_lane0, <1024 x float> undef, <1024 x i32> zeroinitializer
  %quot = fdiv <1024 x float> %a_bcast, %v
  ret <1024 x float> %quot
}
; fdivrcpvec: vector-by-vector division. The expected assembly takes the
; reciprocal of the second operand (v1) and multiplies it with the first (v0).
; CHECK-LABEL: fdivrcpvec:
; CHECK: (erf) = vrcp.f32 v1
; CHECK: _ = vdelay $0x6
; CHECK: v1 = vpop (erf)
; CHECK: v0 = vmul.f32 v0, v1
define <1024 x float> @fdivrcpvec(<1024 x float> %vx, <1024 x float> %vy) {
  %quot = fdiv <1024 x float> %vx, %vy
  ret <1024 x float> %quot
}
; push: the i32 returned by the eup.push intrinsic is handed straight to
; eup.pop. Expected assembly: vpush into (erf), a vdelay, then the vpop.
; CHECK-LABEL: push:
; CHECK: (erf) = vpush v0
; CHECK: _ = vdelay $0x6
; CHECK: v0 = vpop (erf)
define <1024 x float> @push(<1024 x float> %v) {
  %f = call i32 @llvm.tpu.eup.push.v1024f32(<1024 x float> %v)
  %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
  ret <1024 x float> %res
}
; eup_scheduling: four intrinsic/pop pairs on the same input, summed together.
; The expected assembly issues all four operations into (erf) first, separated
; by short vdelays, and only then performs the four vpops.
; CHECK-LABEL: eup_scheduling:
; CHECK: (erf) = vrcp.f32
; CHECK: _ = vdelay $0x1
; CHECK: (erf) = vtanh.f32
; CHECK: _ = vdelay $0x1
; CHECK: (erf) = vlog2.f32
; CHECK: _ = vdelay $0x1
; CHECK: (erf) = vpow2.f32
; CHECK: _ = vdelay $0x3
; CHECK: v{{[0-9]+}} = vpop (erf)
; CHECK: v{{[0-9]+}} = vpop (erf)
; CHECK: v{{[0-9]+}} = vpop (erf)
; CHECK: v{{[0-9]+}} = vpop (erf)
define <1024 x float> @eup_scheduling(<1024 x float> %v) {
  ; The instruction order below is part of the test input; keep it as is.
  %rcp_ref = call i32 @llvm.tpu.rcp.v1024f32(<1024 x float> %v)
  %rcp_val = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %rcp_ref)
  %tanh_ref = call i32 @llvm.tpu.tanh.v1024f32(<1024 x float> %v)
  %tanh_val = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %tanh_ref)
  %log2_ref = call i32 @llvm.tpu.log2.v1024f32(<1024 x float> %v)
  %log2_val = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %log2_ref)
  %pow2_ref = call i32 @llvm.tpu.pow2.v1024f32(<1024 x float> %v)
  %pow2_val = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %pow2_ref)
  ; Use every popped value so none of the pairs is dead.
  %sum_rcp_tanh = fadd <1024 x float> %rcp_val, %tanh_val
  %sum_log2_pow2 = fadd <1024 x float> %log2_val, %pow2_val
  %total = fadd <1024 x float> %sum_log2_pow2, %sum_rcp_tanh
  ret <1024 x float> %total
}