//===--------- TPUISelLowering.cpp - TPU DAG Lowering Implementation ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the TPUTargetLowering class.
//
//===----------------------------------------------------------------------===//
#include "TPUISelLowering.h"
#include "MCTargetDesc/TPUBaseInfo.h"
#include "MCTargetDesc/TPUMCTargetDesc.h"
#include "TPU.h"
#include "TPUCallingConv.h"
#include "TPUIRUtils.h"
#include "TPUMachineFunctionInfo.h"
#include "TPURegisterInfo.h"
#include "TPUSubtarget.h"
#include "TPUTargetMachine.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IntrinsicsTPU.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <utility>
#define DEBUG_TYPE "tpu-lower"
using namespace llvm;
static cl::opt<bool> PropagateTpuEmbeddedMasks(
"tpu-enable-embedded-masks", cl::Hidden, cl::init(true),
cl::desc("Enables propagating embedded hardware masks "
"into special mask registers."));
static cl::opt<bool>
GenerateTpuVCMasks("tpu-enable-vcmasks", cl::Hidden, cl::init(true),
cl::desc("Enables generation of vcmask instructions to "
"create mask immediates whenever possible."));
static cl::opt<bool>
EmulateSignedDivRem("tpu-emulate-signed-divrem", cl::Hidden,
cl::init(false),
cl::desc("Enables emulation of signed div/rem via the "
"unsigned div/rem instructions"));
extern cl::opt<bool> TPUVerifierStrictIntoPtr;
bool TPUTargetLowering::functionArgumentNeedsConsecutiveRegisters(
Type *Ty, CallingConv::ID CallConv, bool isVarArg,
const DataLayout &DL) const {
// All aggregates on BarnaCore are allocated consecutive registers.
return IsBC && (Ty->isArrayTy() || Ty->isStructTy());
}
TPUTargetLowering::TPUTargetLowering(const TargetMachine &TM,
const TPUSubtarget &STI)
: TargetLowering(TM) {
ST = &STI;
TII = ST->getInstrInfo();
IsBC = ST->isPxcBarnaCore();
IsSC = ST->isSparseCore();
IsVFTC = ST->hasVfcTensorCore();
HasLPVF = ST->hasLPVF();
HasLPGL = ST->hasLPGL();
HasVMinMax = ST->hasVMinMax();
if (ST->hasV1024()) {
HasVPU = true;
VNI32 = MVT::v1024i32;
VNF32 = MVT::v1024f32;
// TODO(thomasraoux): Mask can be 2 bits per element on PFC:
// https://g3doc.corp.google.com/platforms/deepsea/logic/pfc/g3doc/isa/tensorcore.md#create-sublane-mask-instruction
VMNI1 = MVT::v1024i1;
} else if (ST->hasV16()) {
HasVPU = true;
VNI32 = MVT::v16i32;
VNF32 = MVT::v16f32;
VMNI1 = MVT::v16i1;
} else if (ST->hasV8()) {
HasVPU = true;
VNI32 = MVT::v8i32;
VNF32 = MVT::v8f32;
VMNI1 = MVT::v8i1;
} else {
// No vector support.
VNI32 = MVT::i32;
VNF32 = MVT::f32;
VMNI1 = MVT::i1;
}
VNBF16 = MVT::INVALID_SIMPLE_VALUE_TYPE;
VNF16 = MVT::INVALID_SIMPLE_VALUE_TYPE;
VNI16 = MVT::INVALID_SIMPLE_VALUE_TYPE;
VNI4 = MVT::INVALID_SIMPLE_VALUE_TYPE;
VNI2 = MVT::INVALID_SIMPLE_VALUE_TYPE;
VNI8 = MVT::INVALID_SIMPLE_VALUE_TYPE;
VMNBF16I1 = MVT::INVALID_SIMPLE_VALUE_TYPE;
VNI8I1 = MVT::INVALID_SIMPLE_VALUE_TYPE;
VMN16I1 = MVT::INVALID_SIMPLE_VALUE_TYPE;
VMN32I1 = MVT::INVALID_SIMPLE_VALUE_TYPE;
VMN64I1 = MVT::INVALID_SIMPLE_VALUE_TYPE;
if (HasLPVF || HasLPGL) {
if (ST->hasV8()) {
VNBF16 = MVT::v16bf16;
VNF16 = MVT::v16f16;
VNI16 = MVT::v16i16;
VNI8 = MVT::v32i8;
VNI4 = MVT::v64i4;
VNI2 = MVT::v128i2;
VMNBF16I1 = MVT::v16i1;
VNI8I1 = MVT::v32i1;
VNI1 = MVT::v256i1;
VMN16I1 = MVT::v16i1;
VMN32I1 = MVT::v32i1;
} else if (ST->hasV16()) {
VNBF16 = MVT::v32bf16;
VNF16 = MVT::v32f16;
VNI16 = MVT::v32i16;
VNI8 = MVT::v64i8;
VNI1 = MVT::v512i1;
VNI4 = MVT::v128i4;
VNI2 = MVT::v256i2;
VMNBF16I1 = MVT::v32i1;
VNI8I1 = MVT::v64i1;
VMN32I1 = MVT::v32i1;
VMN64I1 = MVT::v32i1;
} else {
llvm_unreachable("Unexpected VPU size.");
}
}
// Set up the register classes.
addRegisterClass(MVT::i32, &TPU::GPRRegClass);
addRegisterClass(MVT::f32, &TPU::GPRRegClass);
addRegisterClass(MVT::bf16, &TPU::GPRRegClass);
addRegisterClass(MVT::i1, &TPU::PPRRegClass);
// MVT::i8 is not legal in GPR.
if (IsSC) {
// SparseCore is hijacking the mmx data type for cbreg.
addRegisterClass(MVT::x86mmx, &TPU::CBRRegClass);
}
if (HasVPU) {
if (IsBC) {
// BarnaCore has Vregs and Vaggregs that both have the same type, so
// use VPR_AGGRegClass which is the superclass of both. Restricting a
// regclass to a strict subset is trivial.
addRegisterClass(VNI32, &TPU::VPR_AGGRegClass);
addRegisterClass(VNF32, &TPU::VPR_AGGRegClass);
} else {
addRegisterClass(VNI32, &TPU::VPRRegClass);
addRegisterClass(VNF32, &TPU::VPRRegClass);
}
addRegisterClass(VMNI1, &TPU::MPRRegClass);
}
if (HasLPVF || HasLPGL) {
addRegisterClass(VNBF16, &TPU::VPRRegClass);
addRegisterClass(VNF16, &TPU::VPRRegClass);
addRegisterClass(VNI16, &TPU::VPRRegClass);
addRegisterClass(VNI8, &TPU::VPRRegClass);
addRegisterClass(VNI4, &TPU::VPRRegClass);
addRegisterClass(VNI2, &TPU::VPRRegClass);
addRegisterClass(VNI1, &TPU::VPRRegClass);
if (ST->hasV8())
addRegisterClass(VMN16I1, &TPU::MPRRegClass);
addRegisterClass(VMN32I1, &TPU::MPRRegClass);
if (ST->hasV16())
addRegisterClass(VMN64I1, &TPU::MPRRegClass);
}
// Compute derived properties from the register classes
TRI = ST->getRegisterInfo();
computeRegisterProperties(TRI);
setStackPointerRegisterToSaveRestore(TPU::SPS);
setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, MVT::i8, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i8, Expand);
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::BRCOND, MVT::Other, Legal);
setOperationAction(ISD::SETCC, MVT::i32, Legal);
setOperationAction(ISD::SETCC, MVT::i1, Promote);
setOperationAction(ISD::SELECT, MVT::i32, Legal);
setOperationAction(ISD::SELECT, MVT::f32, Legal);
setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
setOperationAction(ISD::SELECT_CC, VNF32, Expand);
setOperationAction(ISD::SELECT_CC, VNI32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
setOperationAction(ISD::VASTART, MVT::Other, Expand);
setOperationAction(ISD::VAARG, MVT::Other, Expand);
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
if (IsSC) {
setOperationAction(ISD::SDIVREM, MVT::i32, Legal);
setOperationAction(ISD::UDIVREM, MVT::i32, Legal);
setOperationAction(ISD::FDIV, MVT::f32, Custom);
} else {
setOperationAction(ISD::SDIVREM, VNI32, Expand);
setOperationAction(ISD::UDIVREM, VNI32, Expand);
}
// We rely on the combiner to expand into DIVREM.
auto SDivRemAction = EmulateSignedDivRem ? Custom : Expand;
setOperationAction(ISD::SDIV, MVT::i32, SDivRemAction);
setOperationAction(ISD::SREM, MVT::i32, SDivRemAction);
setOperationAction(ISD::UDIV, MVT::i32, Expand);
setOperationAction(ISD::UREM, MVT::i32, Expand);
// We do not currently support vector i32 div/rem.
setOperationAction(ISD::SDIV, VNI32, Expand);
setOperationAction(ISD::UDIV, VNI32, Expand);
setOperationAction(ISD::SREM, VNI32, Expand);
setOperationAction(ISD::UREM, VNI32, Expand);
for (const auto &VT : {MVT::i32, VNI32}) {
setOperationAction(ISD::MUL, VT, Legal);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::SHL_PARTS, VT, Expand);
setOperationAction(ISD::SRL_PARTS, VT, Expand);
setOperationAction(ISD::SRA_PARTS, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::CTPOP, VT, Legal);
setOperationAction(ISD::CTLZ, VT, Legal);
setOperationAction(ISD::CTTZ, VT, Expand);
}
// If VMul i32 is not natively supported, we need to emulate it.
if (!IsSC && !IsVFTC)
setOperationAction(ISD::MUL, VNI32, Custom);
// For Jellyfish, do a custom lowering of i32 MUL.
if (!ST->hasSMul32())
setOperationAction(ISD::MUL, MVT::i32, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
if (IsVFTC || IsSC) {
setOperationAction(ISD::UMAX, MVT::i32, Legal);
setOperationAction(ISD::UMIN, MVT::i32, Legal);
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FCEIL, MVT::f32, Legal);
} // else we will fail lowering.
setOperationAction(ISD::FNEG, MVT::f32, Legal);
setOperationAction(ISD::FNEG, VNF32, Legal);
// Extended load operations for i1 types must be promoted
for (MVT VT : MVT::integer_valuetypes()) {
setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
}
setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
setOperationAction(ISD::FMAXIMUM, VNF32, Legal);
setOperationAction(ISD::FMINIMUM, VNF32, Legal);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);
setOperationAction(ISD::FCOPYSIGN, VNF32, Legal);
if (HasLPGL) {
setOperationAction(ISD::FMAXIMUM, VNBF16, Legal);
setOperationAction(ISD::FMINIMUM, VNBF16, Legal);
}
// Unordered comparisons not supported.
setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
setCondCodeAction(ISD::SETO, MVT::f32, Expand);
setCondCodeAction(ISD::SETUEQ, VNF32, Expand);
setCondCodeAction(ISD::SETUGT, VNF32, Expand);
setCondCodeAction(ISD::SETUGE, VNF32, Expand);
setCondCodeAction(ISD::SETULT, VNF32, Expand);
setCondCodeAction(ISD::SETULE, VNF32, Expand);
setCondCodeAction(ISD::SETONE, VNF32, Expand);
setCondCodeAction(ISD::SETUO, VNF32, Expand);
setCondCodeAction(ISD::SETO, VNF32, Expand);
if (HasLPGL) {
setCondCodeAction(ISD::SETUEQ, VNBF16, Expand);
setCondCodeAction(ISD::SETUGT, VNBF16, Expand);
setCondCodeAction(ISD::SETUGE, VNBF16, Expand);
setCondCodeAction(ISD::SETULT, VNBF16, Expand);
setCondCodeAction(ISD::SETULE, VNBF16, Expand);
setCondCodeAction(ISD::SETONE, VNBF16, Expand);
setCondCodeAction(ISD::SETUO, VNBF16, Expand);
setCondCodeAction(ISD::SETO, VNBF16, Expand);
}
if (HasVMinMax) {
setOperationAction(ISD::UMAX, VNI32, Legal);
setOperationAction(ISD::UMIN, VNI32, Legal);
if (HasLPGL) {
setOperationAction(ISD::UMAX, VNI16, Legal);
setOperationAction(ISD::UMIN, VNI16, Legal);
}
}
// Unsigned scalar comparisons supported for VF and SC subtargets.
LegalizeAction UnsignedCmpLegalizeAction = Custom;
if (ST->hasUnsignedScalarCompare()) {
UnsignedCmpLegalizeAction = Legal;
}
setCondCodeAction(ISD::SETUGT, MVT::i32, UnsignedCmpLegalizeAction);
setCondCodeAction(ISD::SETUGE, MVT::i32, UnsignedCmpLegalizeAction);
setCondCodeAction(ISD::SETULT, MVT::i32, UnsignedCmpLegalizeAction);
setCondCodeAction(ISD::SETULE, MVT::i32, UnsignedCmpLegalizeAction);
// Unsigned vector comparisons supported for SC subtargets.
UnsignedCmpLegalizeAction = Custom;
if (ST->hasUnsignedVectorCompare()) {
UnsignedCmpLegalizeAction = Legal;
}
setCondCodeAction(ISD::SETUGT, VNI32, UnsignedCmpLegalizeAction);
setCondCodeAction(ISD::SETUGE, VNI32, UnsignedCmpLegalizeAction);
setCondCodeAction(ISD::SETULT, VNI32, UnsignedCmpLegalizeAction);
setCondCodeAction(ISD::SETULE, VNI32, UnsignedCmpLegalizeAction);
setTargetDAGCombine(ISD::BUILD_VECTOR);
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
// We could match this during isel in tablegen, but we want a bit more
// control.
setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
// Function alignments.
setMinFunctionAlignment(Align(2));
setPrefFunctionAlignment(Align(2));
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setJumpIsExpensive(false);
// TODO(jmolloy): This is a hangover from Lanai. Evaluate if jumptables are
// needed or useful.
setMinimumJumpTableEntries(100);
// We'd run into trouble with pointer word sizes if we let native selection
// DAG lower these.
MaxStoresPerMemset = 0; // For @llvm.memset -> sequence of stores
MaxStoresPerMemsetOptSize = 0;
MaxStoresPerMemcpy = 0; // For @llvm.memcpy -> sequence of stores
MaxStoresPerMemcpyOptSize = 0;
MaxStoresPerMemmove = 0; // For @llvm.memmove -> sequence of stores
MaxStoresPerMemmoveOptSize = 0;
// Booleans always contain 0 or 1.
setBooleanContents(ZeroOrOneBooleanContent);
}
SDValue TPUTargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
SDValue TargetAddr = DAG.getTargetGlobalAddress(
GV, DL, getPointerTy(DAG.getDataLayout()), Offset);
return DAG.getNode(TPUISD::WRAPPER, DL, MVT::i32, TargetAddr);
}
SDValue TPUTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
CondCodeSDNode *Cond = cast<CondCodeSDNode>(Op.getOperand(2).getNode());
assert(!Op.getOperand(0).getValueType().isFloatingPoint() &&
ISD::isUnsignedIntSetCC(Cond->get()) &&
"Comparisons involving floating-point and signed-int types should not "
"be custom lowered as they are either expanded or legal.");
ISD::CondCode SignedCond;
switch (Cond->get()) {
default:
llvm_unreachable("Unknown signed condcode?");
case ISD::CondCode::SETULT:
SignedCond = ISD::CondCode::SETLT;
break;
case ISD::CondCode::SETULE:
SignedCond = ISD::CondCode::SETLE;
break;
case ISD::CondCode::SETUGT:
SignedCond = ISD::CondCode::SETGT;
break;
case ISD::CondCode::SETUGE:
SignedCond = ISD::CondCode::SETGE;
break;
}
SDLoc DL(Op);
// Generate unsigned setcc as:
// %x = setcc signed %a, %b
// %y = xor %a, %b // one if bitwise different.
// %z = setcc slt %y, 0 // sign bit different?
// xor %x, %z
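//
// Rationale: when the sign bits of %a and %b agree, the signed and unsigned
// orderings coincide and %z is 0. When they differ, the unsigned ordering is
// the opposite of the signed one, and %z = 1 flips the result. For example,
// SETULT with %a = 1, %b = 0x80000000: the signed compare gives false, the
// sign bits differ, so the final xor yields true, the correct unsigned
// result.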
EVT VT = Op.getValueType();
EVT InputVT = Op.getOperand(0).getValueType();
SDValue X =
DAG.getSetCC(DL, VT, Op.getOperand(0), Op.getOperand(1), SignedCond);
SDValue Y = DAG.getNode(ISD::XOR, DL, InputVT, Op.getOperand(0),
Op.getOperand(1));
SDValue Z = DAG.getSetCC(DL, VT, Y, DAG.getConstant(0, DL, InputVT),
ISD::CondCode::SETLT);
return DAG.getNode(ISD::XOR, DL, VT, X, Z);
}
SDValue TPUTargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
assert(ST->isSparseCore());
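// FDIV is emulated as x * (1/y): splat y into a vector, issue VRCP (pushing
// the reciprocal onto the EUP result FIFO), pop it with VRES_EUP, read lane 0
// back into a scalar register, and multiply by x.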
SDValue X = Op.getOperand(0);
SDValue Y = Op.getOperand(1);
SDValue Splat = DAG.getNode(TPUISD::SPLAT, DL, VNF32, Y);
auto PredReg = DAG.getRegister(TPU::Palways, MVT::i1);
auto PredInvert = DAG.getTargetConstant(APInt(1, 0), DL, MVT::i1);
MachineSDNode *VRcpPush =
DAG.getMachineNode(TPU::VRCP, DL, MVT::i32, {Splat, PredReg, PredInvert});
addTPUMemOperand(DAG, VRcpPush, /*IsPush=*/true, &TPU::ERFPRRegClass);
MachineSDNode *VRcpPop = DAG.getMachineNode(
TPU::VRES_EUP, DL, VNF32, {SDValue(VRcpPush, 0), PredReg, PredInvert});
addTPUMemOperand(DAG, VRcpPop, /*IsPush=*/false, &TPU::ERFPRRegClass);
SDValue Srcp =
SDValue(DAG.getMachineNode(TPU::scVREADr, SDLoc(Op), MVT::f32,
{SDValue(VRcpPop, 0),
DAG.getTargetConstant(0, DL, MVT::i32),
PredReg, PredInvert}),
0);
SDValue FDivRes = DAG.getNode(ISD::FMUL, DL, MVT::f32, X, Srcp);
return FDivRes;
}
SDValue TPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
SDValue X = Op.getOperand(0);
SDValue Y = Op.getOperand(1);
assert(Op.getValueType() == MVT::i32 && "Expected i32 when Lowering SDIV32");
// To emulate signed division, we:
// 1. Take the absolute value of the operands
// 2. Perform an unsigned divide of the operands
// 3. Possibly negate the result of (2.).
unsigned UnsignedOpCode;
switch (Op.getOpcode()) {
default:
llvm_unreachable("Unknown signed divrem opcode");
case ISD::SDIV:
UnsignedOpCode = ISD::UDIV;
break;
case ISD::SREM:
UnsignedOpCode = ISD::UREM;
break;
}
EVT VT = Op.getValueType();
// 1. Compute abs(x), abs(y): abs(x) = x ^ (x >> 31) - (x >> 31)
//
// Note: we do this slightly differently than LLO, which uses
// compares+selects, but we end up with the same number of instructions.
// http://google3/platforms/xla/service/jellyfish/llo_region_builder.cc?l=950&rcl=378412916
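//
// For example, x = -5: x >> 31 = -1 (all ones), x ^ -1 = ~x = 4, and
// 4 - (-1) = 5 = abs(-5). For non-negative x the mask is 0 and x is
// unchanged.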
SDValue XMask = DAG.getNode(ISD::SRA, DL, VT, X, DAG.getConstant(31, DL, VT));
SDValue YMask = DAG.getNode(ISD::SRA, DL, VT, Y, DAG.getConstant(31, DL, VT));
SDValue XInv = DAG.getNode(ISD::XOR, DL, VT, X, XMask);
SDValue YInv = DAG.getNode(ISD::XOR, DL, VT, Y, YMask);
SDValue XAbs = DAG.getNode(ISD::SUB, DL, VT, XInv, XMask);
SDValue YAbs = DAG.getNode(ISD::SUB, DL, VT, YInv, YMask);
// 2. Compute unsigned div/rem.
SDValue AbsResult = DAG.getNode(UnsignedOpCode, DL, VT, XAbs, YAbs);
// 3. Possibly negate the result of the unsigned div/rem.
SDValue SignMask;
if (Op.getOpcode() == ISD::SDIV) {
SignMask = DAG.getNode(ISD::XOR, DL, VT, XMask, YMask);
} else {
// For rem, the sign is determined by the dividend (X), defined the same way
// as the remainder operator % in C:
// (a % b) == a - (a / b) * b
SignMask = XMask;
}
// SignMask is either all zeros or all ones (in which case the result should
// be negative). When it is all ones, we can use this mask to negate the two's
// complement result similar to finding abs(x):
// result = abs_result ^ mask - mask
SDValue AbsResultInv = DAG.getNode(ISD::XOR, DL, VT, AbsResult, SignMask);
SDValue SignedResult = DAG.getNode(ISD::SUB, DL, VT, AbsResultInv, SignMask);
return SignedResult;
}
SDValue TPUTargetLowering::LowerMUL32(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
SDValue X = Op.getOperand(0);
SDValue Y = Op.getOperand(1);
assert(Op.getValueType() == MVT::i32 && "Expected i32 when Lowering MUL32");
// Expand a MUL i32 operation using the UMUL24 node for Jellyfish.
// The decomposition looks like:
// c = mul i32 a, b
// -->
// ll = umul24 i32 a, b
// al = srl i32 a, 24
// bl = srl i32 b, 24
// lh = umul24 i32 al, b
// hl = umul24 i32 a, bl
// sum = add i32 lh, hl
// shiftsum = shl i32 sum, 24
// c = add i32 shiftsum, ll
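//
// The partial product of the two upper parts (al * bl) would be shifted left
// by 48 bits, so it cannot affect the low 32 bits of the result and is
// omitted.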
EVT VT = Op.getValueType();
KnownBits KBX = DAG.computeKnownBits(X);
KnownBits KBY = DAG.computeKnownBits(Y);
bool X_is_24bit = (KBX.Zero & 0xFF000000U) == 0xFF000000U;
bool Y_is_24bit = (KBY.Zero & 0xFF000000U) == 0xFF000000U;
// We rely on the fact that the smul.u24 instruction automatically zeroes out
// the upper bits of its operands, which saves us from doing it ourselves.
SDValue Low_Low = DAG.getNode(TPUISD::UMUL24, DL, VT, X, Y);
SDValue High_Low, Low_High;
if (!X_is_24bit) {
SDValue HighX =
DAG.getNode(ISD::SRL, DL, VT, X, DAG.getConstant(24, DL, VT));
High_Low = DAG.getNode(TPUISD::UMUL24, DL, VT, HighX, Y);
}
if (!Y_is_24bit) {
SDValue HighY =
DAG.getNode(ISD::SRL, DL, VT, Y, DAG.getConstant(24, DL, VT));
Low_High = DAG.getNode(TPUISD::UMUL24, DL, VT, X, HighY);
}
SDValue MixedSum;
if (High_Low && Low_High) {
MixedSum = DAG.getNode(ISD::ADD, DL, VT, High_Low, Low_High);
} else if (High_Low) {
MixedSum = High_Low;
} else if (Low_High) {
MixedSum = Low_High;
} else {
return Low_Low;
}
SDValue ShiftedSum =
DAG.getNode(ISD::SHL, DL, VT, MixedSum, DAG.getConstant(24, DL, VT));
return DAG.getNode(ISD::ADD, DL, VT, Low_Low, ShiftedSum);
}
// Handle the lowering of the simple cases where one operand is a constant.
// This uses non-adjacent form (NAF).
SDValue TPUTargetLowering::SimpleEmulVMUL32(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
SDValue X = Op.getOperand(0);
SDValue Y = Op.getOperand(1);
if (Y.getOpcode() != TPUISD::SPLAT)
return SDValue();
ConstantSDNode *C = isConstOrConstSplat(Y.getOperand(0));
if (C == nullptr)
return SDValue();
int M = C->getZExtValue();
int HighestOne = -1;
int NonZeroEntries = 0;
std::array<int, 32> SignedDigit;
SignedDigit.fill(0);
// The following algorithm is taken from:
// https://en.wikipedia.org/wiki/Non-adjacent_form
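// For example, M = 7 has NAF digits (least significant first) [-1, 0, 0, 1],
// so the loop below builds X*7 as (X << 3) - X, which needs fewer shift/add
// operations than the plain binary expansion X + (X << 1) + (X << 2).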
int64_t e = std::abs(M);
const int s = M < 0 ? -1 : 1;
int i = 0;
while (e > 0) {
int zi = 0;
if (e % 2 == 1) {
zi = 2 - (e % 4);
if (zi != 0) {
++NonZeroEntries;
}
}
SignedDigit[i] = s * zi;
if (SignedDigit[i] == 1) {
HighestOne = i;
}
e = (e - zi) / 2;
++i;
}
// Initialize the running sum. Set the running sum to the maximal
// shifted positive value (i.e., the largest i such that zi == 1 and MulAmt
// has V<<i as a term in its NAF).
SDValue Res;
if (HighestOne == -1) {
Res =
DAG.getNode(TPUISD::SPLAT, DL, VNI32, DAG.getConstant(0, DL, MVT::i32));
} else {
Res = DAG.getNode(TPUISD::SPLAT, DL, VNI32,
DAG.getConstant(HighestOne, DL, MVT::i32));
Res = DAG.getNode(ISD::SHL, DL, VNI32, X, Res);
SignedDigit[HighestOne] = 0;
}
// Assemble multiplication from shift, add, sub using NAF form and
// running sum.
for (size_t i = 0; i < SignedDigit.size(); ++i) {
if (SignedDigit[i] == 0) {
continue;
}
SDValue op = X;
// Shifted multiplicand (v<<i).
if (i > 0) {
SDValue I = DAG.getNode(TPUISD::SPLAT, DL, VNI32,
DAG.getConstant(i, DL, MVT::i32));
op = DAG.getNode(ISD::SHL, DL, VNI32, X, I);
}
if (SignedDigit[i] == 1) {
Res = DAG.getNode(ISD::ADD, DL, VNI32, Res, op);
} else if (SignedDigit[i] == -1) {
Res = DAG.getNode(ISD::SUB, DL, VNI32, Res, op);
}
}
return Res;
}
// Logic to lower VMUL32, copied from the LLO region builder.
SDValue TPUTargetLowering::LowerVMUL32(SDValue Op, SelectionDAG &DAG) const {
if (SDValue V = SimpleEmulVMUL32(Op, DAG))
return V;
SDLoc DL(Op);
SDValue lhs = Op.getOperand(0);
SDValue rhs = Op.getOperand(1);
// Multiword multiplication. Splits each input into three 11-bit words and
// uses VmulU11 (built on fmul) to form and accumulate their partial
// products.
// Generates code:
// uint32 u0 = u & 0x7FF;
// uint32 u1 = (u >> 11) & 0x7FF;
// uint32 u2 = u >> 22;
// uint32 v0 = v & 0x7FF;
// uint32 v1 = (v >> 11) & 0x7FF;
// uint32 v2 = v >> 22;
// return u0 * v0 + ((u1 * v0 + u0 * v1) << 11) +
// ((u0 * v2 + u1 * v1 + u2 * v0) << 22);
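//
// The omitted cross terms (u1 * v2, u2 * v1, u2 * v2) are shifted left by at
// least 33 bits, so they cannot affect the low 32 bits of the result. Each
// VmulU11 product is at most 22 bits wide and therefore fits exactly in an
// f32 significand, which is why the float round trip loses no precision.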
auto VSplatImm32 = [&](int I) {
return DAG.getNode(TPUISD::SPLAT, DL, VNI32,
DAG.getConstant(I, DL, MVT::i32));
};
auto VandU32 = [&](SDValue X, SDValue Y) {
return DAG.getNode(ISD::AND, DL, VNI32, X, Y);
};
auto VaddS32 = [&](SDValue X, SDValue Y) {
return DAG.getNode(ISD::ADD, DL, VNI32, X, Y);
};
auto VshrlU32 = [&](SDValue X, SDValue Y) {
return DAG.getNode(ISD::SRL, DL, VNI32, X, Y);
};
auto VshllU32 = [&](SDValue X, SDValue Y) {
return DAG.getNode(ISD::SHL, DL, VNI32, X, Y);
};
auto VcvtS32ToF32 = [&](SDValue X) {
return DAG.getNode(ISD::SINT_TO_FP, DL, VNF32, X);
};
// Computes int32(x * y). We use this as an 11-bit x 11-bit -> 22-bit integer
// multiplication primitive without losing precision.
auto VmulU11 = [&](SDValue Lhs, SDValue Rhs) {
auto Product =
DAG.getNode(ISD::FMUL, DL, VNF32, VcvtS32ToF32(Lhs), VcvtS32ToF32(Rhs));
return DAG.getNode(ISD::FP_TO_SINT, DL, VNI32, Product);
};
auto mask = VSplatImm32(0x7FF);
auto k11 = VSplatImm32(11);
auto k22 = VSplatImm32(22);
auto u0 = VandU32(lhs, mask);
auto u1 = VandU32(VshrlU32(lhs, k11), mask);
auto u2 = VshrlU32(lhs, k22);
auto v0 = VandU32(rhs, mask);
auto v1 = VandU32(VshrlU32(rhs, k11), mask);
auto v2 = VshrlU32(rhs, k22);
auto w0 = VmulU11(u0, v0);
auto w1 = VmulU11(u1, v0);
w1 = VaddS32(w1, VmulU11(u0, v1));
w1 = VshllU32(w1, k11);
auto w2 = VmulU11(u0, v2);
w2 = VaddS32(w2, VmulU11(u1, v1));
w2 = VaddS32(w2, VmulU11(u2, v0));
w2 = VshllU32(w2, k22);
return VaddS32(VaddS32(w0, w1), w2);
}
SDValue TPUTargetLowering::LowerADDRSPACECAST(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
unsigned SrcAS = ASC->getSrcAddressSpace();
unsigned DestAS = ASC->getDestAddressSpace();
if ((SrcAS == TPUAS_Smem && DestAS == TPUAS_SmemAny) ||
(SrcAS == TPUAS_Hbm && DestAS == TPUAS_HbmAny) ||
(SrcAS == TPUAS_Sflag && DestAS == TPUAS_SflagAny) ||
(SrcAS == TPUAS_Sflag && DestAS == TPUAS_SflagTile)) {
return DAG.getNode(ISD::BITCAST, dl, MVT::i32, ASC->getOperand(0));
}
if (!TPUVerifierStrictIntoPtr)
return DAG.getNode(ISD::BITCAST, dl, MVT::i32, ASC->getOperand(0));
report_fatal_error("Unsupported addrspace cast " + Twine(SrcAS) + "->" +
Twine(DestAS) + ".\n");
}
SDValue TPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default:
llvm_unreachable("unimplemented operand");
case ISD::SETCC:
return LowerSETCC(Op, DAG);
case ISD::GlobalAddress:
return LowerGlobalAddress(Op, DAG);
case ISD::SDIV:
case ISD::SREM:
return LowerSDIV32(Op, DAG);
case ISD::FDIV:
if (!ST->hasVPU())
llvm_unreachable("fdiv on scalar core is not supported.");
return LowerFDIV32(Op, DAG);
case ISD::MUL: {
if (Op.getValueType() == MVT::i32)
return LowerMUL32(Op, DAG);
return LowerVMUL32(Op, DAG);
}
case ISD::ADDRSPACECAST:
return LowerADDRSPACECAST(Op, DAG);
case TPUISD::SPLAT:
// We're doing some specific type checking, because this is a special case
// for MVT::v32i8 when the DAG legalizer tries to promote MVT::i8.
if (isTypeLegal(Op->getOperand(0).getValueType()))
llvm_unreachable(
"This should only happen if the splat element isn't legal.");
EVT VT = Op->getOperand(0).getValueType();
if (!VT.isSimple() || !VT.isInteger() || VT != MVT::i8)
llvm_unreachable("This should only happen on scalar type MVT::i8, "
"which is being promoted.");
// We're promoting the MVT::i8 Splat element and match it later.
return DAG.getNode(
TPUISD::SPLAT, SDLoc(Op), Op->getSimpleValueType(0),
DAG.getTargetConstant(Op->getConstantOperandAPInt(0).zext(32),
SDLoc(Op), MVT::i32));
}
}
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
static bool isMaskVT(MVT VT, const TPUSubtarget &ST) {
return VT.getScalarType() == MVT::i1 &&
/* This check is for real low precision i1 types */
VT.getSizeInBits() != 8 * ST.vectorSizeInBytes();
}
// Custom version of CCInfo.AnalyzeFormalArguments, supporting scalar and
// vector stacks. Hacks the memory offsets, split across the two stacks, into
// the ArgLocs.
static void analyzeFormalArguments(const TPUTargetLowering &TLI,
const TPUSubtarget *ST,
const SmallVectorImpl<ISD::InputArg> &Ins,
CCState &CCInfo,
SmallVector<CCValAssign, 16> &ArgLocs) {
int NumBytesScalar = 0;
int NumBytesVector = 0;
unsigned NumArgs = Ins.size();
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ArgVT = Ins[i].VT;
ISD::ArgFlagsTy ArgFlags = Ins[i].Flags;
int PrevNumBytes = CCInfo.getNextStackOffset();
if (CC_TPU(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo))
report_fatal_error("unable to allocate function argument #" + Twine(i));
if (!ST->isTPUABIEnabled() || CCInfo.getCallingConv() == CallingConv::Fast)
continue;
CCValAssign &CCV = ArgLocs[i];
if (int StackOffsetDelta = CCInfo.getNextStackOffset() - PrevNumBytes) {
if (ArgVT.isVector()) {
assert(ST->hasVPU());
// This is a trick using the API in order to adjust the LocMemOffset,
// because we have two separate stacks for scalar and vector.
if (isMaskVT(ArgVT, *ST)) {
int AlignedStackOffsetDelta =
alignTo(StackOffsetDelta, ST->vectorSizeInBytes());
StackOffsetDelta = AlignedStackOffsetDelta;
}
assert(StackOffsetDelta == ST->vectorSizeInBytes());
CCV.convertToMem(NumBytesVector);
NumBytesVector += StackOffsetDelta;
} else {
// Same comment as above.
CCV.convertToMem(NumBytesScalar);
NumBytesScalar += StackOffsetDelta;
}
}
}
}
// Transform physical registers into virtual registers and
// generate load operations for arguments placed on the stack.
SDValue TPUTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
switch (CallConv) {
case CallingConv::Fast:
case CallingConv::C:
break;
default:
report_fatal_error("Unsupported calling convention");
}
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
if (ST->isTPUABIEnabled())
RegInfo.addLiveIn(TPU::LR);
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
analyzeFormalArguments(*this, ST, Ins, CCInfo, ArgLocs);
DenseMap<unsigned, SmallVector<Register, 4>> OrigArgToRegLoc;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
assert(!MF.getFunction().hasStructRetAttr());
assert(!IsVarArg);
assert(VA.getLocInfo() == CCValAssign::Full);
EVT VT = VA.getLocVT();
Register VirtReg;
switch (VT.getSimpleVT().SimpleTy) {
default:
llvm_unreachable("Unhandled type in call lowering!");
case MVT::i8:
case MVT::i16:
case MVT::i32:
case MVT::f32:
VirtReg = RegInfo.createVirtualRegister(&TPU::GPRRegClass);
break;
case MVT::i1:
assert(!ST->isTPUABIEnabled());
VirtReg = RegInfo.createVirtualRegister(&TPU::PPRRegClass);
break;
case MVT::x86mmx:
assert(ST->hasVPU());
VirtReg = RegInfo.createVirtualRegister(&TPU::CBRRegClass);
break;
case MVT::v8i32:
case MVT::v8f32:
case MVT::v16bf16:
case MVT::v16f16:
case MVT::v16i16:
case MVT::v32i8:
case MVT::v64i4:
case MVT::v128i2:
case MVT::v256i1:
case MVT::v16i32:
case MVT::v16f32:
case MVT::v32bf16:
case MVT::v32f16:
case MVT::v32i16:
case MVT::v64i8:
case MVT::v128i4:
case MVT::v256i2:
case MVT::v512i1:
case MVT::v1024i32:
case MVT::v1024f32:
assert(ST->hasVPU());
if (IsBC && TPU::VAGGRegClass.contains(VA.getLocReg())) {
assert(!ST->isTPUABIEnabled());
VirtReg = RegInfo.createVirtualRegister(&TPU::VAGGRegClass);
} else {
VirtReg = RegInfo.createVirtualRegister(&TPU::VPRRegClass);
}
break;
case MVT::v64i1:
assert(ST->hasVPU());
if (ST->hasV8())
llvm_unreachable("Unexpected mask type.");
VirtReg = RegInfo.createVirtualRegister(&TPU::MPRRegClass);
break;
case MVT::v16i1:
assert(ST->hasVPU());
VirtReg = RegInfo.createVirtualRegister(&TPU::MPRRegClass);
break;
case MVT::v32i1:
assert(ST->hasVPU());
if (ST->hasV8() && !HasLPGL)
llvm_unreachable("Needs +lp.");
VirtReg = RegInfo.createVirtualRegister(&TPU::MPRRegClass);
break;
case MVT::v8i1:
case MVT::v1024i1:
assert(ST->hasVPU());
VirtReg = RegInfo.createVirtualRegister(&TPU::MPRRegClass);
break;
}
if (VA.isRegLoc()) {
OrigArgToRegLoc[Ins[i].getOrigArgIndex()].push_back(VA.getLocReg());
RegInfo.addLiveIn(VA.getLocReg(), VirtReg);
InVals.push_back(DAG.getCopyFromReg(Chain, DL, VirtReg, VT));
} else { // VA.isRegLoc()
assert(VA.isMemLoc());
assert(!VA.needsCustom());
MachineFunction &MF = DAG.getMachineFunction();
unsigned LocMemOffset = VA.getLocMemOffset();
// In order to make it easier for the callee, the stack pointer in the
// caller is incremented such that it points to a free slot in the callee
// for the return address. Adjusting the argument offsets here.
if (!VA.getValVT().isVector())
LocMemOffset += ST->scalarSizeInBytes();
unsigned AdjustedLocMemOffset =
TPU::adjustForWordSize(
APInt(32, LocMemOffset),
VA.getValVT().isVector() ? TPUAS_TileSpmem : TPUAS_Smem, *ST)
.getZExtValue();
MachineFrameInfo &MFI = MF.getFrameInfo();
EVT ArgVT = Ins[i].ArgVT;
int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), AdjustedLocMemOffset,
/*IsImmutable=*/false);
auto PtrVT = getPointerTy(DAG.getDataLayout());
unsigned Opcode;
SDValue StackPtr;
if (isMaskVT(VA.getValVT(), *ST)) {
assert(ST->hasVPU());
Opcode = TPU::RESTORE_MPRs;
StackPtr = DAG.getRegister(TPU::FPV, MVT::i32);
} else if (VA.getValVT().isVector()) {
assert(ST->hasVPU());
Opcode = TPU::RESTORE_VPRs;
StackPtr = DAG.getRegister(TPU::FPV, MVT::i32);
} else {
Opcode = TPU::RESTORE_GPRs;
StackPtr = DAG.getRegister(TPU::FPS, MVT::i32);
}
SmallVector<SDValue, 8> Ops;
SDValue TFI = DAG.getTargetFrameIndex(FI, PtrVT);
auto PredReg = DAG.getRegister(TPU::Palways, MVT::i1);
auto PredInvert = DAG.getTargetConstant(APInt(1, 0), DL, MVT::i1);
Ops.push_back(StackPtr);
Ops.push_back(TFI);
Ops.push_back(PredReg);
Ops.push_back(PredInvert);
MVT ValVT = VA.getValVT();
MachineSDNode *MN = DAG.getMachineNode(
Opcode, DL, isMaskVT(ValVT, *ST) ? VMNI1 : ValVT, Ops);
auto *MemRef = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
MachineMemOperand::MOLoad, /*s=*/4,
/*base_alignment=*/llvm::Align(4));
DAG.setNodeMemRefs(MN, {MemRef});
SDValue Arg = SDValue(MN, 0);
InVals.push_back(Arg);
}
}
if (IsBC) {
// On BarnaCore, we obtain aggregates as function inputs and refer to them
// by their base register throughout the function. We need to block the
// register allocator from clobbering them. Aggregates are identified by
// multiple registers having the same input argument index.
TPUMachineFunctionInfo &MFInfo = *MF.getInfo<TPUMachineFunctionInfo>();
for (auto &Range : OrigArgToRegLoc) {
if (Range.second.size() == 1)
continue;
// Note that we rely on the range already being sorted from above.
MFInfo.addBarnaCoreAggregateRange(Range.second.front() - TPU::VAGG0,
Range.second.back() - TPU::VAGG0 + 1);
}
}
return Chain;
}
SDValue
TPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &DL, SelectionDAG &DAG) const {
// CCValAssign - represent the assignment of the return value to a location
SmallVector<CCValAssign, 16> RVLocs;
MachineFunction &MF = DAG.getMachineFunction();
// CCState - Info about the registers and stack slot.
CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
// Analyze return values.
CCInfo.AnalyzeReturn(Outs, RetCC_TPU);
SmallVector<SDValue, 4> RetOps(1, Chain);
// Copy the result values into the output registers.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign &VA = RVLocs[i];
// FIXME(b/237788792): Finalize return ABI.
assert(VA.isRegLoc() && "Can only return in registers!");
assert(!VA.needsCustom());
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVals[i], Chain);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
RetOps[0] = Chain; // Update chain
// We're checking the call graph here and setting whether or not a function is
// an entry function. At least on our system, this is good enough.
TPUMachineFunctionInfo &MFInfo = *MF.getInfo<TPUMachineFunctionInfo>();
// Ugly cast, CallGraph should really take a const Module. FIXME(hgreving):
// maybe try to change upstream. The cast here is safe because nobody will
// change the Module.
CallGraph CG(*const_cast<Module *>(MF.getMMI().getModule()));
const CallGraphNode *CGN = CG[&MF.getFunction()];
// There's always at least one null node referencing the function.
if (CGN->getNumReferences() == 1)
MFInfo.setIsTopLevel(true);
else
MFInfo.setIsTopLevel(false);
if (!ST->isTPUABIEnabled() || MFInfo.isTopLevel())
return DAG.getNode(TPUISD::HALT, DL, MVT::Other,
ArrayRef<SDValue>(&RetOps[0], RetOps.size()));
return DAG.getNode(TPUISD::RET, DL, MVT::Other,
ArrayRef<SDValue>(&RetOps[0], RetOps.size()));
}
//===----------------------------------------------------------------------===//
// Custom Lowerings
//===----------------------------------------------------------------------===//
SDValue TPUTargetLowering::PerformSCALAR_TO_VECTORCombine(
SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
const SDValue &Val = N->getOperand(0);
MVT VecVT = N->getSimpleValueType(0);
return DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, Val);
}
SDValue TPUTargetLowering::PerformINSERT_VECTOR_ELTCombine(
SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
const SDValue &Vec = N->getOperand(0);
const SDValue &Val = N->getOperand(1);
auto PredReg = DCI.DAG.getRegister(TPU::Palways, MVT::i1);
auto PredInvert = DCI.DAG.getTargetConstant(APInt(1, 0), SDLoc(N), MVT::i1);
MVT VecVT = N->getSimpleValueType(0);
SDValue SplatVal = DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, Val);
SmallVector<SDValue, 8> Ops;
SDValue Vseq = SDValue(DCI.DAG.getMachineNode(TPU::VLANESEQ, SDLoc(N), VNI32,
PredReg, PredInvert),
0);
Ops.push_back(Vseq);
SDValue Mask;
if (const ConstantSDNode *Idx =
dyn_cast<ConstantSDNode>(N->getOperand(2).getNode())) {
Ops.push_back(DCI.DAG.getTargetConstant(*Idx->getConstantIntValue(),
SDLoc(N), MVT::i32));
Ops.push_back(PredReg);
Ops.push_back(PredInvert);
Mask =
SDValue(DCI.DAG.getMachineNode(TPU::VMLANEi, SDLoc(N), VMNI1, Ops), 0);
} else {
Ops.push_back(SDValue(cast<SDNode>(N->getOperand(2).getNode()), 0));
Ops.push_back(PredReg);
Ops.push_back(PredInvert);
Mask =
SDValue(DCI.DAG.getMachineNode(TPU::VMLANEr, SDLoc(N), VMNI1, Ops), 0);
}
return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), VecVT, Mask, SplatVal, Vec);
}
bool TPUTargetLowering::isNonNaNFPConstSplat(SDValue N) const {
if (N->getOpcode() == TPUISD::SPLAT) {
if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
return !CN->isNaN();
}
return false;
}
EVT TPUTargetLowering::getOptimalMemOpType(
const MemOp &Op, const AttributeList &FuncAttributes) const {
// We're returning something that makes sense, though it is useless since we
// neither know the memory space, nor can we let selection DAG do the LLVM
// MemOp lowering. See the header file for an explanation.
return VNI32;
}
SDValue TPUTargetLowering::PerformSETCCCombine(
SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
// We help the DAG combiner by recognizing ordered setcc of splats that can't
// be NaN. LLVM can do that for BUILD_VECTOR, but we combine early into SPLAT,
// hence this code.
if (!isNonNaNFPConstSplat(N->getOperand(0)) ||
!isNonNaNFPConstSplat(N->getOperand(1)))
return SDValue();
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
// TODO(hgreving): what about SETO?
ISD::CondCode NoNaNCC = getFCmpCodeWithoutNaN(CC);
if (NoNaNCC != CC)
return DCI.DAG.getSetCC(SDLoc(N), N->getSimpleValueType(0),
N->getOperand(0), N->getOperand(1), NoNaNCC);
return SDValue();
}
SDValue TPUTargetLowering::getSupportedVCMask(SelectionDAG &DAG, int VectorMask,
SDLoc Loc) const {
if (!ST->hasVCMasks() || !GenerateTpuVCMasks)
return SDValue();
int MaskSizeInBits = EVT(VMNI1).getSizeInBits();
int FullMask = (1 << MaskSizeInBits) - 1;
// Technically `< MaskSizeInBits` would be enough because a full mask should
// be covered by embedded masks.
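// Try every contiguous run of i ones, rotated left by j lanes (with
// wrap-around). If VectorMask matches one of them, it can be materialized
// with a single VCMASKi whose immediate encodes the end and start of the run
// as (E << 8 | S).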
for (int i = 1; i <= MaskSizeInBits; i++) {
int CompareMask = (1 << i) - 1;
for (int j = 0; j < MaskSizeInBits; j++) {
int RotCompareMask =
(CompareMask << j | CompareMask >> (MaskSizeInBits - j)) & FullMask;
if (VectorMask == RotCompareMask) {
int S = j * 4;
int E = ((i + j - 1) % MaskSizeInBits) * 4 + 3;
assert(S < EVT(VMNI1).getSizeInBits() * 4);
assert(E < EVT(VMNI1).getSizeInBits() * 4);
auto PredReg = DAG.getRegister(TPU::Palways, MVT::i1);
auto PredInvert = DAG.getTargetConstant(APInt(1, 0), Loc, MVT::i1);
return SDValue(
DAG.getMachineNode(
TPU::VCMASKi, Loc, VMNI1,
DAG.getTargetConstant(APInt(32, E << 8 | S), Loc, MVT::i32),
PredReg, PredInvert),
0);
}
}
}
return SDValue();
}
SDValue TPUTargetLowering::getSupportedVCMask(SelectionDAG &DAG,
SDNode *N) const {
if (!ST->hasVCMasks() || !GenerateTpuVCMasks)
return SDValue();
int MaskSizeInBits = EVT(VMNI1).getSizeInBits();
if (N->getNumOperands() != MaskSizeInBits)
return SDValue();
int BuildVectorMask = 0;
for (int i = 0; i < MaskSizeInBits; i++) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i));
if (C == nullptr)
return SDValue();
BuildVectorMask |= C->getZExtValue() << i;
}
return getSupportedVCMask(DAG, BuildVectorMask, SDLoc(N));
}
SDValue TPUTargetLowering::PerformBUILD_VECTORCombine(
SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
// Combine a BUILD_VECTOR(42, 42, 42, 42, ...) -> SPLAT(42)
MVT VecVT = N->getSimpleValueType(0);
if (VecVT != VNI32 && VecVT != VNF32 && VecVT != VMNI1) {
if (!HasLPGL)
return SDValue();
if (VecVT != VNBF16 && VecVT != VNI8)
return SDValue();
}
MVT ScalarVT;
if (VecVT == VNI32)
ScalarVT = MVT::i32;
else if (VecVT == VNF32)
ScalarVT = MVT::f32;
else if (VecVT == VNBF16)
ScalarVT = MVT::bf16;
else if (VecVT == VNI8)
ScalarVT = MVT::i8;
else if (VecVT == VMNI1)
// Low precision build_vector masks are currently not supported.
ScalarVT = MVT::i1;
else
llvm_unreachable("Bad vector ty!");
// Checking for supported embedded hardware masks. I would have preferred to
// do this in tablegen, and this would be possible with something like this:
//
// def tpuvm17 : PatLeaf<(build_vector), [{
// return isMask7f(N);
// }]>;
//
// let Predicates = [HasV8,NotBC] in {
// def : Pat<(vNi1 (Splat -1)), (COPY !cast<TPUReg>("M16"))>;
// def : Pat<(vNi1 (tpuvm17)), (COPY !cast<TPUReg>("M17"))>;
//
// However, since we already combine BUILD_VECTOR here, we would have to check
// for the embedded masks here anyway and potentially bail out of the combine.
// Additionally, it is harder to turn on/off the feature in tablegen. Lastly,
// we may run into cases with instructions not supporting the special mask, in
// which case we probably want to legalize them, and this will be easier if we
// combine the hardware mask here. All of the above is the reason why the code
// is here, and not in tablegen.
//
if (ScalarVT == MVT::i1) {
Register EmbeddedMask = getSupportedEmbeddedMask(N);
if (EmbeddedMask != TPU::NoRegister)
return DCI.DAG.getCopyFromReg(DCI.DAG.getEntryNode(), SDLoc(N),
EmbeddedMask, VMNI1);
SDValue VMCreate = getSupportedVCMask(DCI.DAG, N);
if (VMCreate.getNode())
return VMCreate;
}
unsigned VecSize = MVT(VecVT).getVectorNumElements();
bool IsSplat = true;
bool IsVlaneSeq = true;
assert(N->getNumOperands() == VecSize);
SDValue Val0 = N->getOperand(0);
int IC = -1;
if (Val0.getSimpleValueType() != ScalarVT)
return SDValue();
for (unsigned I = 0; I < VecSize; ++I) {
if (N->getOperand(I) != Val0 && !N->getOperand(I).isUndef())
IsSplat = false;
ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(N->getOperand(I));
if (!ValC) {
IsVlaneSeq = false;
continue;
}
if (ValC->getZExtValue() != IC++ + 1)
IsVlaneSeq = false;
if (!IsVlaneSeq && !IsSplat)
break;
}
if (IsSplat)
return DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, Val0);
auto PredReg = DCI.DAG.getRegister(TPU::Palways, MVT::i1);
auto PredInvert = DCI.DAG.getTargetConstant(APInt(1, 0), SDLoc(N), MVT::i1);
if (IsVlaneSeq)
return SDValue(DCI.DAG.getMachineNode(TPU::VLANESEQ, SDLoc(N), VNI32,
PredReg, PredInvert),
0);
// BUILD_VECTOR(a, b, c, d, ...) -> VSEL(Splat(a), ...)
// This is really ugly but is the only way :(
// Pick an initial splat value.
SDValue InitialSplatted = N->getOperand(VecSize - 1);
SDValue V = DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, InitialSplatted);
for (unsigned I = 0; I < VecSize; ++I) {
if (N->getOperand(I)->isUndef() || N->getOperand(I) == InitialSplatted)
continue;
SDValue SplatVal =
DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, N->getOperand(I));
SDValue VMCreate = getSupportedVCMask(DCI.DAG, 1 << I, SDLoc(N));
SDValue Mask;
if (VMCreate.getNode()) {
Mask = VMCreate;
} else {
SmallVector<SDValue, 8> Ops;
SDValue Vseq = SDValue(DCI.DAG.getMachineNode(TPU::VLANESEQ, SDLoc(N),
VNI32, PredReg, PredInvert),
0);
Ops.push_back(Vseq);
Ops.push_back(DCI.DAG.getTargetConstant(I, SDLoc(N), MVT::i32));
Ops.push_back(PredReg);
Ops.push_back(PredInvert);
Mask = SDValue(DCI.DAG.getMachineNode(TPU::VMLANEi, SDLoc(N), VMNI1, Ops),
0);
}
// And use that mask to select-in this value.
V = DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), VecVT, Mask, SplatVal, V);
}
return V;
}
SDValue TPUTargetLowering::PerformVECTOR_SHUFFLECombine(
ShuffleVectorSDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
// Combine a VECTOR_SHUFFLE(1, 2, 3, 4, 5, 6, 7, 0) -> VROTDOWN()
// or VECTOR_SHUFFLE(VECTOR_INSERT(x,y, n), n, n, ...) -> VSPLAT(y)
// or VECTOR_SHUFFLE(x, x, x, x, x, x, x, x) -> VSPLAT(VROTDOWN())
MVT VecVT = N->getSimpleValueType(0);
if (VecVT != VNI32 && VecVT != VNF32 && VecVT != VMNI1)
return SDValue();
assert(N->getNumOperands() == 2);
SDValue Val = N->getOperand(0);
unsigned VecSize = MVT(VecVT).getVectorNumElements();
bool IsSequence = true;
bool IsSame = true;
unsigned Offset = N->getMaskElt(0);
for (unsigned I = 0; I < VecSize; ++I) {
if (N->getMaskElt(I) != (I + Offset) % VecSize)
IsSequence = false;
if (N->getMaskElt(I) != Offset)
IsSame = false;
}
bool NeedsTrunc = false;
if (VecVT == VMNI1) {
Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VNI32, Val);
VecVT = VNI32;
NeedsTrunc = true;
}
// Helper function to truncate the result if we extended the operation
// from i1.
auto TruncateReturnIfNeed = [&](SDValue V) {
if (NeedsTrunc)
return DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), VMNI1, V);
return V;
};
if (IsSequence && ST->isSparseCore())
return TruncateReturnIfNeed(
DCI.DAG.getNode(TPUISD::VROTDOWN, SDLoc(N), VecVT, Val,
DCI.DAG.getConstant(Offset, SDLoc(N), MVT::i32)));
if (!IsSame && ST->isSparseCore()) {
// SparseCore has a vector permute that permutes the elements into all lanes
// of a vector based on a vector mask.
SmallVector<SDValue, 8> MaskElements;
for (int El : N->getMask())
MaskElements.push_back(DCI.DAG.getConstant(El, SDLoc(N), MVT::i32));
SDValue VMask =
DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VNI32, MaskElements);
return TruncateReturnIfNeed(
DCI.DAG.getNode(TPUISD::VPERMUTE, SDLoc(N), VecVT, Val, VMask));
}
if (!IsSame)
return SDValue();
// On tensorcore we cannot use rotdown to move any element into lane 0.
if (!ST->isSparseCore() && Offset != 0)
return SDValue();
MVT ScalarVT = VecVT == VNI32 ? MVT::i32 : MVT::f32;
// If the replicated value comes from an insert, splat the original value
// directly.
if (N->getOperand(0).getOpcode() == ISD::INSERT_VECTOR_ELT) {
SDNode *InsertElt = cast<SDNode>(N->getOperand(0));
const ConstantSDNode *Idx =
cast<ConstantSDNode>(InsertElt->getOperand(2).getNode());
if (Idx->getConstantIntValue()->getZExtValue() == Offset) {
SDValue ExtractedVal = InsertElt->getOperand(1);
MVT ExtractedSplatVT = NeedsTrunc ? VMNI1 : VecVT;
return DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), ExtractedSplatVT,
ExtractedVal);
}
}
if (ST->hasBroadcast()) {
// SparseCore has a vector broadcast that broadcasts the element at Offset
// into all lanes of a vector without traversing the scalar side.
return TruncateReturnIfNeed(
DCI.DAG.getNode(TPUISD::VBROADCAST, SDLoc(N), VecVT, Val,
DCI.DAG.getConstant(Offset, SDLoc(N), MVT::i32)));
}
// Extract the splatted value from the vector and re-splat it.
// Rotate the vector if the offset is not zero.
if (Offset != 0) {
Val = DCI.DAG.getNode(TPUISD::VROTDOWN, SDLoc(N), VecVT, Val,
DCI.DAG.getConstant(Offset, SDLoc(N), MVT::i32));
}
Val = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ScalarVT, Val,
DCI.DAG.getConstant(0, SDLoc(N), MVT::i32));
Val = DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, Val);
return TruncateReturnIfNeed(Val);
}
SDValue TPUTargetLowering::PerformVSELECTCombine(
SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
if (N->getValueType(0) != VMNI1)
return SDValue();
SDValue Cond = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDValue Op2 = N->getOperand(2);
if (Op1.getOpcode() == TPUISD::SPLAT && Op2.getOpcode() == TPUISD::SPLAT &&
isa<ConstantSDNode>(Op1->getOperand(0)) &&
isa<ConstantSDNode>(Op2->getOperand(0))) {
bool TrueVal = cast<ConstantSDNode>(Op1->getOperand(0))->getLimitedValue();
bool FalseVal = cast<ConstantSDNode>(Op2->getOperand(0))->getLimitedValue();
if (TrueVal == FalseVal)
// select(C, X, X) -> X
return Op1;
if (TrueVal == true && FalseVal == false)
// select(C, 1, 0) -> C
return Cond;
assert(TrueVal == false && FalseVal == true);
// select(C, 0, 1) -> !C === C xor -1
return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VMNI1, Cond, Op2);
}
// select(C, X, Y) -> (C & X) | (~C & Y)
SDValue CAndX = DCI.DAG.getNode(ISD::AND, SDLoc(N), VMNI1, Cond, Op1);
SDValue NotC = DCI.DAG.getNode(
ISD::XOR, SDLoc(N), VMNI1, Cond,
DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VMNI1,
DCI.DAG.getConstant(-1, SDLoc(N), MVT::i1)));
SDValue NotCAndY = DCI.DAG.getNode(ISD::AND, SDLoc(N), VMNI1, NotC, Op2);
return DCI.DAG.getNode(ISD::OR, SDLoc(N), VMNI1, CAndX, NotCAndY);
}
SDValue TPUTargetLowering::PerformBcInsertValueCombine(
SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
// Combine llvm.tpu.bc.insertvalue.loopindex -> BC_INSERTVALUE.
// The intrinsic takes an array and returns an array. This is lowered to
// %a = merge_values a0,a1,a2,...,an-1
// %b1,b2,...,bn-1 = @llvm.tpu.bc.insertvalue.loopindex %a, %c
//
// We don't care about the values of any physical registers. We've already
// reserved a block of registers for this aggregate, all we need to do is
// keep the zeroth register to plumb through as the base value.
//
// Here we replace the intrinsic with a BC_INSERTVALUE of the base register
// and a MERGE_VALUES result, with the base register in value 0 and the rest
// UNDEF. The optimizer will then clean things up.
SDLoc DL(N);
SDValue BaseReg = N->getOperand(1);
SDValue InsertedValue = N->getOperand(2);
EVT VT = BaseReg.getValueType();
SDValue NewN =
DCI.DAG.getNode(TPUISD::BC_INSERTVALUE, DL, VT, BaseReg, InsertedValue);
SmallVector<SDValue, 4> Vs(N->getNumValues(), DCI.DAG.getUNDEF(VT));
Vs[0] = NewN;
return DCI.DAG.getMergeValues(Vs, DL);
}
SDValue TPUTargetLowering::PerformBcExtractValueCombine(
SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
// Combine llvm.tpu.bc.extractvalue.loopindex -> BC_EXTRACTVALUE.
// The intrinsic takes an array and returns a vector. This is lowered to
// %a = merge_values a0,a1,a2,...,an-1
// %b:v8f32 = @llvm.tpu.bc.extractvalue.loopindex %a
//
// We don't care about the values of any physical registers. We've already
// reserved a block of registers for this aggregate, all we need to do is
// keep the zeroth register to plumb through as the base value.
//
// We're already accessing MERGE_VALUES:0, so just rewrite in place.
SDLoc DL(N);
SDValue BaseReg = N->getOperand(1);
EVT VT = BaseReg.getValueType();
return DCI.DAG.getNode(TPUISD::BC_EXTRACTVALUE, DL, VT, BaseReg);
}
SDValue TPUTargetLowering::PerformPtrToIntCombine(SDNode *N) const {
return N->getOperand(1);
}
const char *TPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch (Opcode) {
default:
return "<TPU unknown opcode>";
case TPUISD::HALT:
return "TPUISD::HALT";
case TPUISD::VROTDOWN:
return "TPUISD::VROTDOWN";
case TPUISD::VBROADCAST:
return "TPUISD::VBROADCAST";
case TPUISD::VPERMUTE:
return "TPUISD::VPERMUTE";
case TPUISD::SPLAT:
return "TPUISD::SPLAT";
case TPUISD::WRAPPER:
return "TPUISD::WRAPPER";
case TPUISD::BC_INSERTVALUE:
return "TPUISD::BC_INSERTVALUE";
case TPUISD::BC_EXTRACTVALUE:
return "TPUISD::BC_EXTRACTVALUE";
case TPUISD::UMUL24:
return "TPUISD::UMUL24";
case TPUISD::CALL:
return "TPUISD::CALL";
}
}
SDValue TPUTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
case ISD::BUILD_VECTOR:
return PerformBUILD_VECTORCombine(N, DCI);
case ISD::VECTOR_SHUFFLE:
return PerformVECTOR_SHUFFLECombine(cast<ShuffleVectorSDNode>(N), DCI);
case ISD::INSERT_VECTOR_ELT:
return PerformINSERT_VECTOR_ELTCombine(N, DCI);
case ISD::SCALAR_TO_VECTOR:
return PerformSCALAR_TO_VECTORCombine(N, DCI);
case ISD::VSELECT:
return PerformVSELECTCombine(N, DCI);
case ISD::INTRINSIC_WO_CHAIN:
switch (N->getConstantOperandVal(0)) {
default:
return SDValue();
case Intrinsic::tpu_bc_insertvalue_loopindex:
return PerformBcInsertValueCombine(N, DCI);
case Intrinsic::tpu_bc_extractvalue_loopindex:
return PerformBcExtractValueCombine(N, DCI);
case Intrinsic::tpu_inttoptr:
case Intrinsic::tpu_ptrtoint:
return PerformPtrToIntCombine(N);
}
case ISD::SETCC:
return PerformSETCCCombine(N, DCI);
default:
break;
}
return SDValue();
}
std::optional<bool>
TPUTargetLowering::IsFifoAccess(MachineInstr &MI,
const TargetRegisterClass *RegClass) const {
const MCInstrDesc &MCID = TII->get(MI.getOpcode());
for (auto I = MCID.opInfo_begin(), IE = MCID.opInfo_end(); I != IE; I++) {
if (I->RegClass == RegClass->getID()) {
// For a push instruction, the destination register matches the given reg
// class; for a pop instruction, one of the source operands matches the
// given reg class.
return I != MCID.opInfo_begin();
}
}
return std::nullopt;
}
bool TPUTargetLowering::UsesSpecialReg(
MachineInstr &MI, const TargetRegisterClass *RegClass) const {
const MCInstrDesc &MCID = TII->get(MI.getOpcode());
for (auto I = MCID.opInfo_begin(), IE = MCID.opInfo_end(); I != IE; I++) {
if (I->RegClass == RegClass->getID()) {
return true;
}
}
return false;
}
Register TPUTargetLowering::getSupportedEmbeddedMask(SDNode *N) const {
if (!ST->hasEmbeddedMasks() || !PropagateTpuEmbeddedMasks)
return TPU::NoRegister;
assert(N->getOpcode() == ISD::BUILD_VECTOR);
// See e.g. go/vfc-sc-isa#vector-modify-mask-instructions.
DenseMap<int, Register> SupportedEmbeddedMasks{
{0xff, TPU::M16}, {0x7f, TPU::M17}, {0x3f, TPU::M18}, {0x1f, TPU::M19},
{0xf, TPU::M20}, {0x7, TPU::M21}, {0x3, TPU::M22}, {0x1, TPU::M23},
};
int MaskSizeInBits = EVT(VMNI1).getSizeInBits();
if (N->getNumOperands() != MaskSizeInBits)
return TPU::NoRegister;
auto MatchesBitMask = [MaskSizeInBits, N](int BitMask) {
for (int i = 0; i < MaskSizeInBits; i++) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i));
if (C == nullptr)
return false;
if (C->getZExtValue() != ((BitMask >> i) & 0x1))
return false;
}
return true;
};
for (auto &KV : SupportedEmbeddedMasks) {
if (MatchesBitMask(KV.first))
return KV.second;
}
return TPU::NoRegister;
}
void TPUTargetLowering::SetDependency(MachineInstr &MI, MachineBasicBlock *MBB,
const TargetRegisterClass *RegClass,
bool IsPush) const {
const TPUTargetMachine &TM =
static_cast<const TPUTargetMachine &>(MBB->getParent()->getTarget());
MachinePointerInfo MPI(TM.getFifoPSV(IsPush, RegClass));
auto *MemRef = MBB->getParent()->getMachineMemOperand(
MPI, MachineMemOperand::MOLoad | MachineMemOperand::MOStore, /*s=*/4,
/*base_alignment=*/llvm::Align(4));
MI.addMemOperand(*MBB->getParent(), MemRef);
}
// DWG needs dependencies with all matmuls.
// The first matmul after a DWG needs dependencies with all matpushes.
// DWG can be re-ordered across matpush instructions.
// This function adds the memory operands to enforce this ordering.
MachineBasicBlock *TPUTargetLowering::SetDWGDep(MachineInstr &MI,
MachineBasicBlock *MBB) const {
MachineRegisterInfo &RegInfo = MBB->getParent()->getRegInfo();
SmallDenseSet<MachineInstr *, 32> DWGUses;
Register Dst = MI.getOperand(0).getReg();
for (MachineInstr &MIUser : RegInfo.use_instructions(Dst)) {
assert(MIUser.getParent() == MBB &&
"matmul uses DWG from a different block, this case is currently not "
"supported");
DWGUses.insert(&MIUser);
}
if (DWGUses.empty())
return MBB;
auto E = MBB->end();
MachineInstr *FirstMatMul = nullptr;
for (auto I = MI.getIterator(); I != E; I++) {
if (DWGUses.count(&(*I)) > 0) {
FirstMatMul = &(*I);
break;
}
}
assert(FirstMatMul != nullptr && "didn't find any matmul");
// The first MatMul needs to have an explicit dependency with gsfn as it
// triggers the copy from gsfn/gsft to gmr. This means the following push
// cannot be re-ordered across the first matmul.
const TargetRegisterClass *GSFNRegClass =
RegInfo.getRegClass(MI.getOperand(1).getReg());
SetDependency(*FirstMatMul, MBB, GSFNRegClass);
// DWG cannot be re-ordered across any matmul instruction, so add a push
// dependency on the MRF to represent that.
const TargetRegisterClass *MRFRegClass =
RegInfo.getRegClass(FirstMatMul->getOperand(0).getReg());
  SetDependency(MI, MBB, MRFRegClass, /*IsPush=*/true);
return MBB;
}
MachineBasicBlock *
TPUTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const {
  // Generic handling of instructions that need dependencies to be set.
if (static_cast<const TPUInstrInfo *>(TII)->isDWGInst(MI)) {
return SetDWGDep(MI, MBB);
}
bool IsSpecialRegAccess = false;
for (auto Fifo : FifoClasses) {
if (auto IsPop = IsFifoAccess(MI, Fifo)) {
SetDependency(MI, MBB, Fifo, !*IsPop);
IsSpecialRegAccess = true;
}
}
for (auto ImplicitReg : SpecialStagingReg) {
if (UsesSpecialReg(MI, ImplicitReg)) {
SetDependency(MI, MBB, ImplicitReg);
IsSpecialRegAccess = true;
}
}
  // Instructions with special register accesses only need to be modified to
  // have an extra pseudo source.
if (IsSpecialRegAccess)
return MBB;
auto &ST = MI.getMF()->getSubtarget<TPUSubtarget>();
unsigned PopOpcode = TPU::SPOP_V2SF;
const TargetRegisterClass *RegClass = &TPU::V2SFPRRegClass;
if (ST.hasVfcTensorCore()) {
PopOpcode = TPU::SPOP_SFRF;
RegClass = &TPU::SFRFPRRegClass;
}
switch (MI.getOpcode()) {
default:
llvm_unreachable("Unknown instruction for custom emission!");
case TPU::VROTDOWNri:
return EmitVROTDOWN(MI, MBB);
case TPU::VFREADi:
return EmitVecOrSFlagToScalar(
MI, MBB, ST.hasVfcTensorCore() ? TPU::tcvfVSYNCMOVEi : TPU::VSYNCMOVEi,
1, PopOpcode, RegClass);
case TPU::VFREADr:
return EmitVecOrSFlagToScalar(
MI, MBB, ST.hasVfcTensorCore() ? TPU::tcvfVSYNCMOVEr : TPU::VSYNCMOVEr,
1, PopOpcode, RegClass);
case TPU::VFREADDONEi:
return EmitVecOrSFlagToScalar(
MI, MBB,
ST.hasVfcTensorCore() ? TPU::tcvfVSYNCMOVEDONEi : TPU::VSYNCMOVEDONEi,
1, PopOpcode, RegClass);
case TPU::VFREADDONEr:
return EmitVecOrSFlagToScalar(
MI, MBB,
ST.hasVfcTensorCore() ? TPU::tcvfVSYNCMOVEDONEr : TPU::VSYNCMOVEDONEr,
1, PopOpcode, RegClass);
case TPU::VFREADPAi:
return EmitVecOrSFlagToScalar(MI, MBB, TPU::tcvfVSYNCMOVEPAi, 1, PopOpcode,
RegClass);
case TPU::VFREADPAr:
return EmitVecOrSFlagToScalar(MI, MBB, TPU::tcvfVSYNCMOVEPAr, 1, PopOpcode,
RegClass);
case TPU::VREAD:
assert(!IsSC);
return EmitVecOrSFlagToScalar(MI, MBB, TPU::VPUSH, 1, TPU::SPOP_V2SF,
&TPU::V2SFPRRegClass);
case TPU::scVREADi:
assert(IsSC);
return EmitVecOrSFlagToScalar(MI, MBB, TPU::scVPUSHi, 2, PopOpcode,
RegClass);
case TPU::scVREADr:
assert(IsSC);
return EmitVecOrSFlagToScalar(MI, MBB, TPU::scVPUSHr, 2, PopOpcode,
RegClass);
case TPU::VMREAD:
return EmitVmread(MI, MBB);
}
}
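// Expands a "read vector or sflag into scalar" pseudo into an explicit
// push/pop pair through the given FIFO register class. A sketch of the
// expansion (predicates and operand details elided):
//   %fifo = <PushOpcode> <inputs>     ; e.g. VSYNCMOVEi pushes into the FIFO
//   %dst  = <PopOpcode> killed %fifo  ; e.g. SPOP_V2SF pops into the scalar
// Both instructions get a FIFO pseudo-source-value memory operand so that
// later passes keep them ordered with other accesses to the same FIFO.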
MachineBasicBlock *TPUTargetLowering::EmitVecOrSFlagToScalar(
MachineInstr &MI, MachineBasicBlock *MBB, unsigned PushOpcode,
int NumOfInputs, unsigned PopOpcode,
const TargetRegisterClass *RegClass) const {
auto &MRI = MBB->getParent()->getRegInfo();
auto InsertPt = MI.getIterator();
const unsigned FifoReg = MRI.createVirtualRegister(RegClass);
MachineInstrBuilder MIB =
BuildMI(*MBB, InsertPt, MI.getDebugLoc(), TII->get(PushOpcode), FifoReg);
for (int i = 1; i <= NumOfInputs; i++)
MIB.add(MI.getOperand(i));
MachineInstr *Push = AddDefaultPred(MIB);
MachineInstr *Pop =
AddDefaultPred(BuildMI(*MBB, InsertPt, MI.getDebugLoc(),
TII->get(PopOpcode), MI.getOperand(0).getReg())
.addReg(FifoReg, getKillRegState(true)));
MI.eraseFromParent();
for (auto &I : {Push, Pop}) {
const TPUTargetMachine &TM =
static_cast<const TPUTargetMachine &>(MBB->getParent()->getTarget());
MachinePointerInfo MPI(TM.getFifoPSV(I == Push, RegClass));
auto *MemRef = MBB->getParent()->getMachineMemOperand(
MPI, MachineMemOperand::MOLoad | MachineMemOperand::MOStore, /*s=*/4,
/*base_alignment=*/llvm::Align(4));
I->addMemOperand(*MBB->getParent(), MemRef);
}
return MBB;
}
MachineBasicBlock *TPUTargetLowering::EmitVmread(MachineInstr &MI,
MachineBasicBlock *MBB) const {
auto &MRI = MBB->getParent()->getRegInfo();
auto InsertPt = MI.getIterator();
unsigned ZeroReg = MRI.createVirtualRegister(&TPU::VPRRegClass);
AddDefaultPred(BuildMI(*MBB, InsertPt, MI.getDebugLoc(),
TII->get(TPU::VIMMI), ZeroReg)
.addImm(0));
AddDefaultPred(BuildMI(*MBB, InsertPt, MI.getDebugLoc(),
TII->get(TPU::VSELir),
MI.getOperand(0).getReg())
.add(MI.getOperand(1))
.addImm(1)
.addReg(ZeroReg));
MI.eraseFromParent();
return MBB;
}
MachineBasicBlock *
TPUTargetLowering::EmitVROTDOWN(MachineInstr &MI,
MachineBasicBlock *MBB) const {
// Emit VROTDOWNri as a sequence of N VROTDOWNr's.
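  // For example (a sketch, predicates elided), VROTDOWNri with Imm == 2 is
  // expanded into:
  //   %tmp = VROTDOWNr %src
  //   %dst = VROTDOWNr %tmp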
auto &MRI = MBB->getParent()->getRegInfo();
unsigned Imm = MI.getOperand(2).getImm();
auto OpReg = MI.getOperand(1).getReg();
auto FinalReg = MI.getOperand(0).getReg();
auto InsertPt = MI.getIterator();
if (Imm == 0) {
BuildMI(*MBB, InsertPt, MI.getDebugLoc(), TII->get(TPU::COPY), FinalReg)
.addReg(OpReg);
MI.eraseFromParent();
return MBB;
}
// TODO(hgreving): Sparsecore and Viperfish should be able to use
// one xlane instruction.
MachineInstr *TheMI = &MI;
for (unsigned I = 0; I < Imm; ++I) {
unsigned OutReg = (I == (Imm - 1))
? FinalReg
: MRI.createVirtualRegister(&TPU::VPRRegClass);
TheMI = AddDefaultPred(
BuildMI(*MBB, InsertPt, MI.getDebugLoc(),
TII->get(TPU::VROTDOWNr), OutReg)
.addReg(OpReg, getKillRegState(true)));
OpReg = OutReg;
}
MI.eraseFromParent();
return MBB;
}
bool TPUTargetLowering::allowsMemoryAccess(LLVMContext &Context,
const DataLayout &DL, EVT VT,
unsigned AddrSpace, Align Alignment,
MachineMemOperand::Flags Flags,
unsigned *Fast) const {
  // Disallow loads/stores we don't support natively.
if (VT != MVT::i32 && VT != MVT::f32 && VT != VNF32 && VT != VNI32)
return false;
bool Allows = TargetLowering::allowsMemoryAccess(Context, DL, VT, AddrSpace,
Alignment, Flags, Fast);
  if (Allows && Fast)
    *Fast = 1;
return Allows;
}
bool TPUTargetLowering::allowsMisalignedMemoryAccesses(
EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
unsigned *Fast) const {
// No memory access on TPU requires alignment > 4 bytes.
  return Alignment >= Align(4);
}
bool TPUTargetLowering::allowsMisalignedMemoryAccesses(
LLT LT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
unsigned *Fast) const {
// No memory access on TPU requires alignment > 4 bytes.
  return Alignment >= Align(4);
}
void TPUTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const {
MachineBasicBlock *MBB = MI.getParent();
const TPUTargetMachine &TM =
static_cast<const TPUTargetMachine &>(MBB->getParent()->getTarget());
if (MI.getOpcode() == TPU::INIT_STACK) {
// Move stack initialization to the very top of the function.
assert(ST->isTPUABIEnabled());
MI.setFlags(MachineInstr::FrameSetup);
MI.moveBefore(&*MBB->instr_begin());
return;
}
if (MI.getOpcode() == TPU::bcVST_concat ||
MI.getOpcode() == TPU::bcVST_concat_aliaddr) {
MachinePointerInfo MPI(
TM.getPSV(TPUTargetMachine::PSV_BarnaCore_ConcatReg));
auto *MemRef = MBB->getParent()->getMachineMemOperand(
MPI, MachineMemOperand::MOStore, /*s=*/4, /*base_alignment=*/llvm::Align(4));
MI.addMemOperand(*MBB->getParent(), MemRef);
return;
}
if (MI.getOpcode() == TPU::bcVSHIFT ||
MI.getOpcode() == TPU::bcVSHIFT_aliaddr) {
{
MachinePointerInfo MPI(
TM.getPSV(TPUTargetMachine::PSV_BarnaCore_ShiftReg));
auto *MemRef = MBB->getParent()->getMachineMemOperand(
MPI, MachineMemOperand::MOStore, /*s=*/4, /*base_alignment=*/llvm::Align(4));
MI.addMemOperand(*MBB->getParent(), MemRef);
}
{
MachinePointerInfo MPI(
TM.getPSV(TPUTargetMachine::PSV_BarnaCore_ConcatReg));
auto *MemRef = MBB->getParent()->getMachineMemOperand(
MPI, MachineMemOperand::MOLoad, /*s=*/4, /*base_alignment=*/llvm::Align(4));
MI.addMemOperand(*MBB->getParent(), MemRef);
}
return;
}
  // We rely on the convention of brcond ordering to match bcLOOP_END
  // correctly. Ensure we actually matched correctly here. bcLOOP_END should
  // point back to its own block (only single-block loops are allowed).
assert(MI.getOpcode() == TPU::bcLOOP_END);
assert(MI.getOperand(0).getMBB() == MI.getParent() &&
"bcLOOP_END does not point to its parent!");
MI.getParent()->setMachineBlockAddressTaken();
}
// Custom version of CCInfo.AnalyzeCallOperands that supports separate scalar
// and vector stacks. It patches the per-stack memory offsets into the ArgLocs
// and returns the scalar and vector stack sizes used for call parameters, as
// well as the extra bytes used to align masks on the vector stack.
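// An illustrative sketch (assuming a 4-byte scalar word, a 32-byte vector and
// that the arguments below all end up in memory): for outgoing arguments
// (i32, <8 x i32>, i32), the two scalars would be assigned offsets 0 and 4 on
// the scalar stack, the vector offset 0 on the vector stack, and a mask
// argument would first be padded up to a full vector slot, with the padding
// counted in ExtraAlignBytesVector.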
static void analyzeCallOperands(const TPUTargetLowering &TLI,
const TPUSubtarget *ST,
const TargetLowering::CallLoweringInfo &CLI,
CCState &CCInfo,
SmallVector<CCValAssign, 16> &ArgLocs,
int &NumBytesScalar, int &NumBytesVector,
int &ExtraAlignBytesVector) {
const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
unsigned NumOps = Outs.size();
for (unsigned i = 0; i != NumOps; ++i) {
MVT ArgVT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
int PrevNumBytes = CCInfo.getNextStackOffset();
if (CC_TPU(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo)) {
#ifndef NDEBUG
dbgs() << "Call operand #" << i << " has unhandled type "
<< EVT(ArgVT).getEVTString() << '\n';
#endif
llvm_unreachable(nullptr);
}
if (!ST->isTPUABIEnabled() || CCInfo.getCallingConv() == CallingConv::Fast)
continue;
assert(!ArgLocs[i].isMemLoc() ||
PrevNumBytes == ArgLocs[i].getLocMemOffset());
CCValAssign &CCV = ArgLocs[i];
if (int StackOffsetDelta = CCInfo.getNextStackOffset() - PrevNumBytes) {
if (ArgVT.isVector()) {
assert(ST->hasVPU());
// This is a trick using the API in order to adjust the LocMemOffset,
// because we have two separate stacks for scalar and vector.
if (isMaskVT(ArgVT, *ST)) {
int AlignedStackOffsetDelta =
alignTo(StackOffsetDelta, ST->vectorSizeInBytes());
ExtraAlignBytesVector += AlignedStackOffsetDelta - StackOffsetDelta;
StackOffsetDelta = AlignedStackOffsetDelta;
}
assert(StackOffsetDelta == ST->vectorSizeInBytes());
CCV.convertToMem(NumBytesVector);
NumBytesVector += StackOffsetDelta;
} else {
assert(StackOffsetDelta == ST->scalarSizeInBytes());
// Same comment as above.
CCV.convertToMem(NumBytesScalar);
NumBytesScalar += StackOffsetDelta;
}
}
}
assert(CCInfo.getCallingConv() == CallingConv::Fast ||
ArgLocs.size() == NumOps);
}
SDValue TPUTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc &DL = CLI.DL;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
bool &IsTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
// Not supported.
assert(!IsVarArg);
// FIXME(b/237788792): Support return values.
assert(CLI.RetTy->isVoidTy() &&
"Return values should be passed by reference");
// No support for tail calls right now.
IsTailCall = false;
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
// How many bytes are to be pushed on the scalar stack.
int NumBytesScalar = 0;
// How many bytes are to be pushed on the vector stack.
int NumBytesVector = 0;
// Extra bytes added for vector memory alignment, used for masks.
int ExtraAlignBytesVector = 0;
analyzeCallOperands(*this, ST, CLI, CCInfo, ArgLocs, NumBytesScalar,
NumBytesVector, ExtraAlignBytesVector);
assert(NumBytesScalar + NumBytesVector - ExtraAlignBytesVector ==
CCInfo.getNextStackOffset());
Chain = DAG.getCALLSEQ_START(Chain, NumBytesScalar, NumBytesVector, DL);
SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
SmallVector<SDValue, 12> MemOpChains;
// Walk the register assignments, inserting copies.
for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
CCValAssign &VA = ArgLocs[I];
assert(VA.getValVT() == VA.getLocVT());
SDValue Arg = OutVals[I];
if (VA.isRegLoc()) {
// Promote the value if needed.
switch (VA.getLocInfo()) {
case CCValAssign::Full:
break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::AExt:
Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
break;
default:
llvm_unreachable("Unknown loc info!");
}
      // Arguments that can be passed in registers must be kept in the
      // RegsToPass vector.
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else { // VA.isRegLoc()
assert(VA.isMemLoc());
assert(!VA.needsCustom());
auto PtrVT = getPointerTy(DAG.getDataLayout());
MachineFunction &MF = DAG.getMachineFunction();
unsigned LocMemOffset = VA.getLocMemOffset();
      // In order to make it easier for the callee, the stack pointer in the
      // caller is incremented such that it points to a free slot in the
      // callee for the return address. Adjust the argument offsets here
      // accordingly.
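      // For example (a sketch, assuming a 4-byte scalar word): a scalar
      // argument assigned LocMemOffset 0 by the calling convention is stored
      // at SPS plus one word, leaving the slot at SPS itself free for the
      // callee's return address.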
if (!VA.getValVT().isVector())
LocMemOffset += ST->scalarSizeInBytes();
else
assert(ST->hasVPU());
unsigned AdjustedLocMemOffset =
TPU::adjustForWordSize(
APInt(32, LocMemOffset),
VA.getValVT().isVector() ? TPUAS_TileSpmem : TPUAS_Smem, *ST)
.getZExtValue();
SDValue PtrOff = DAG.getIntPtrConstant(AdjustedLocMemOffset, DL);
      // The destination address is stack pointer (not frame pointer) based,
      // after the call stack adjustments.
SDValue DstAddr = DAG.getNode(
ISD::ADD, DL, PtrVT,
DAG.getRegister(VA.getValVT().isVector() ? TPU::SPV : TPU::SPS,
MVT::i32),
PtrOff);
MachinePointerInfo DstInfo =
VA.getValVT().isVector()
? MachinePointerInfo(TPUAS_TileSpmem, LocMemOffset)
: MachinePointerInfo::getStack(MF, LocMemOffset);
SDValue Store;
if (isMaskVT(VA.getValVT(), *ST)) {
SDValue Select =
DAG.getNode(ISD::VSELECT, DL, VNI32, Arg,
DAG.getNode(TPUISD::SPLAT, DL, VNI32,
DAG.getConstant(0xFFFFFFFF, DL, MVT::i32)),
DAG.getNode(TPUISD::SPLAT, DL, VNI32,
DAG.getConstant(0, DL, MVT::i32)));
Store = DAG.getStore(Chain, DL, Select, DstAddr, DstInfo);
} else {
Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
}
MemOpChains.push_back(Store);
}
}
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
SDValue InFlag;
// Build a sequence of copy-to-reg nodes chained together with token chain and
// flag operands which copy the outgoing args into registers. The InFlag is
// necessary since all emitted instructions must be stuck together.
for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) {
Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[I].first,
RegsToPass[I].second, InFlag);
InFlag = Chain.getValue(1);
}
// If the callee is a GlobalAddress node (quite common, every direct call is)
// turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
// Likewise ExternalSymbol -> TargetExternalSymbol.
  assert(G && "Only direct calls to global addresses are supported");
  Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL,
                                      getPointerTy(DAG.getDataLayout()), 0);
Callee = DAG.getNode(TPUISD::WRAPPER, DL, MVT::i32, Callee);
  // Functions always return void.
SDVTList NodeTys = DAG.getVTList(MVT::isVoid, MVT::Glue);
SmallVector<SDValue, 8> Ops;
Ops.push_back(Chain);
Ops.push_back(Callee);
// Add a register mask operand representing the call-preserved registers.
const uint32_t *Mask =
TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
// Add argument registers to the end of the list so that they are
// known live into the call.
for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I)
Ops.push_back(DAG.getRegister(RegsToPass[I].first,
RegsToPass[I].second.getValueType()));
if (InFlag.getNode())
Ops.push_back(InFlag);
Chain = DAG.getNode(CallConv == CallingConv::Fast ? TPUISD::CALL_FAST
: TPUISD::CALL,
DL, NodeTys, ArrayRef<SDValue>(&Ops[0], Ops.size()));
InFlag = Chain.getValue(1);
// Create the CALLSEQ_END node.
Chain = DAG.getCALLSEQ_END(
Chain,
DAG.getConstant(NumBytesScalar, DL, getPointerTy(DAG.getDataLayout()),
true),
DAG.getConstant(NumBytesVector, DL, getPointerTy(DAG.getDataLayout()),
true),
InFlag, DL);
InFlag = Chain.getValue(1);
return Chain;
}
bool TPUTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const {
const TPUTargetMachine &TM =
static_cast<const TPUTargetMachine &>(MF.getTarget());
switch (Intrinsic) {
case Intrinsic::tpu_syncadd:
case Intrinsic::tpu_syncadd_done:
case Intrinsic::tpu_syncadd_notdone:
case Intrinsic::tpu_syncadd_remote:
case Intrinsic::tpu_syncadd_remote_done:
case Intrinsic::tpu_syncadd_remote_doneinv:
case Intrinsic::tpu_syncadd_tile:
case Intrinsic::tpu_syncset_done:
case Intrinsic::tpu_syncset_notdone:
case Intrinsic::tpu_syncset_remote:
case Intrinsic::tpu_syncset_remote_doneinv:
case Intrinsic::tpu_syncdonemov:
Info.opc = (Intrinsic == Intrinsic::tpu_syncdonemov)
? ISD::INTRINSIC_W_CHAIN
: ISD::INTRINSIC_VOID;
Info.memVT = MVT::i32;
Info.ptrVal = I.getOperand(0);
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
case Intrinsic::tpu_vld_shuffle:
case Intrinsic::tpu_vld_strided:
case Intrinsic::tpu_vld_indexed:
case Intrinsic::tpu_vld_replicate_evenodd_sublanes:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(I.getCalledFunction()->getReturnType());
Info.ptrVal = I.getOperand(0);
Info.size = MemoryLocation::UnknownSize;
Info.flags = MachineMemOperand::MOLoad;
return true;
case Intrinsic::tpu_rdcbreg_smem_base:
case Intrinsic::tpu_rdcbreg_tilespmem_base:
case Intrinsic::tpu_rdcbreg_size:
case Intrinsic::tpu_rdcbreg_offset:
Info.opc = ISD::INTRINSIC_W_CHAIN;
    // FIXME(hgreving): re-visit memory operand strategy for this. The reason
    // this reads memory at all is the cb.upd semantics, which are not modeled
    // through register dependencies.
Info.memVT = MVT::getVT(I.getCalledFunction()->getArg(0)->getType());
Info.ptrVal = nullptr;
Info.size = MemoryLocation::UnknownSize;
Info.flags = MachineMemOperand::MOLoad;
return true;
case Intrinsic::tpu_sld_cb:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(I.getCalledFunction()->getArg(0)->getType());
// FIXME(hgreving): re-visit memory operand strategy for this. We don't
// have a pointer and PSV values also don't work well here (upstream bug:
// can't set address space).
Info.ptrVal = nullptr;
Info.size = MemoryLocation::UnknownSize;
Info.flags = MachineMemOperand::MOLoad;
return true;
case Intrinsic::tpu_sld_cb_upd:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(I.getCalledFunction()->getArg(0)->getType());
// FIXME(hgreving): re-visit memory operand strategy for this. We don't
// have a pointer and PSV values also don't work well here (upstream bug:
// can't set address space).
Info.ptrVal = nullptr;
Info.size = MemoryLocation::UnknownSize;
// upd modeled as store.
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
case Intrinsic::tpu_vld_msk:
case Intrinsic::tpu_vld_msk_strided:
case Intrinsic::tpu_vld_msk_idx_strided:
case Intrinsic::tpu_vld_msk_idx:
case Intrinsic::tpu_vld_msk_idx_np:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(I.getCalledFunction()->getReturnType());
Info.ptrVal = I.getOperand(1);
Info.size = MemoryLocation::UnknownSize;
Info.flags = MachineMemOperand::MOLoad;
return true;
case Intrinsic::tpu_vst_strided:
case Intrinsic::tpu_vst_indexed:
case Intrinsic::tpu_vst_evenodd_sublanes:
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::getVT(I.getOperand(0)->getType());
Info.size = MemoryLocation::UnknownSize;
Info.ptrVal = I.getOperand(1);
Info.flags = MachineMemOperand::MOStore;
return true;
case Intrinsic::tpu_sst_cb:
case Intrinsic::tpu_sst_cb_upd:
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::getVT(I.getOperand(0)->getType());
Info.size = MemoryLocation::UnknownSize;
// FIXME(hgreving): re-visit memory operand strategy for this. We don't
// have a pointer and PSV values also don't work well here (upstream bug:
// can't set address space).
Info.ptrVal = nullptr;
Info.flags = MachineMemOperand::MOStore;
return true;
case Intrinsic::tpu_vst_msk_idx_add:
case Intrinsic::tpu_vst_msk_idx_add_np:
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::getVT(I.getOperand(3)->getType());
Info.size = MemoryLocation::UnknownSize;
Info.ptrVal = I.getOperand(1);
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOLoad;
return true;
case Intrinsic::tpu_vst_msk_idx_ret_add_np:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(I.getOperand(3)->getType());
Info.size = MemoryLocation::UnknownSize;
Info.ptrVal = I.getOperand(1);
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOLoad;
return true;
case Intrinsic::tpu_vst_msk:
case Intrinsic::tpu_vst_msk_add:
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::getVT(I.getOperand(2)->getType());
Info.size = MemoryLocation::UnknownSize;
Info.ptrVal = I.getOperand(1);
Info.flags = MachineMemOperand::MOStore;
return true;
case Intrinsic::tpu_vst_cb_msk:
case Intrinsic::tpu_vst_cb_msk_add:
case Intrinsic::tpu_vst_cb_msk_add_strided:
case Intrinsic::tpu_vst_cb_msk_idx:
case Intrinsic::tpu_vst_cb_msk_idx_add:
case Intrinsic::tpu_vst_cb_msk_strided:
case Intrinsic::tpu_vst_cb_upd_msk:
case Intrinsic::tpu_vst_cb_upd_msk_add:
case Intrinsic::tpu_vst_cb_upd_msk_add_strided:
case Intrinsic::tpu_vst_cb_upd_msk_strided:
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::getVT(I.getOperand(3)->getType());
Info.size = MemoryLocation::UnknownSize;
// FIXME(hgreving): re-visit memory operand strategy for this. We don't
// have a pointer and PSV values also don't work well here (upstream bug:
// can't set address space).
Info.ptrVal = nullptr;
Info.flags = MachineMemOperand::MOStore;
return true;
case Intrinsic::tpu_vld_cb_msk:
case Intrinsic::tpu_vld_cb_msk_idx:
case Intrinsic::tpu_vld_cb_msk_idx_np:
case Intrinsic::tpu_vld_cb_msk_strided:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(I.getCalledFunction()->getReturnType());
Info.size = MemoryLocation::UnknownSize;
Info.ptrVal = nullptr;
Info.flags = MachineMemOperand::MOLoad;
return true;
case Intrinsic::tpu_vld_cb_upd_msk:
case Intrinsic::tpu_vld_cb_upd_msk_strided:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(I.getCalledFunction()->getReturnType());
Info.size = MemoryLocation::UnknownSize;
// FIXME(hgreving): re-visit memory operand strategy for this. We don't
// have a pointer and PSV values also don't work well here (upstream bug:
// can't set address space).
Info.ptrVal = nullptr;
// upd modeled as store
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
case Intrinsic::tpu_vst_msk_strided:
case Intrinsic::tpu_vst_msk_add_strided:
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::getVT(I.getOperand(3)->getType());
Info.size = MemoryLocation::UnknownSize;
Info.ptrVal = I.getOperand(1);
Info.flags = MachineMemOperand::MOStore;
return true;
case Intrinsic::tpu_vst_msk_idx:
case Intrinsic::tpu_vst_msk_idx_np:
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::getVT(I.getOperand(3)->getType());
Info.size = MemoryLocation::UnknownSize;
Info.ptrVal = I.getOperand(1);
Info.flags = MachineMemOperand::MOStore;
return true;
case Intrinsic::tpu_vst_msk_idx_strided:
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::getVT(I.getOperand(4)->getType());
Info.size = MemoryLocation::UnknownSize;
Info.ptrVal = I.getOperand(1);
Info.flags = MachineMemOperand::MOStore;
return true;
case Intrinsic::tpu_dma_hbm_to_smem:
case Intrinsic::tpu_dma_hbm_to_smem_sc_simple:
case Intrinsic::tpu_dma_hbm_to_vmem:
case Intrinsic::tpu_dma_hbm_to_spmem_sc_simple:
case Intrinsic::tpu_dma_hbm_to_hbm:
case Intrinsic::tpu_dma_hbm_to_hbm_sc_simple:
case Intrinsic::tpu_dma_hbm_to_hib:
case Intrinsic::tpu_dma_hbm_to_vmem_hib_update:
case Intrinsic::tpu_dma_smem_to_hbm:
case Intrinsic::tpu_dma_smem_to_hbm_sc_simple:
case Intrinsic::tpu_dma_vmem_to_hbm:
case Intrinsic::tpu_dma_spmem_to_hbm_sc_simple:
case Intrinsic::tpu_dma_spmem_to_spmem_sc_simple:
case Intrinsic::tpu_dma_timem_to_hbm:
case Intrinsic::tpu_dma_timem_to_hbm_sc_simple:
case Intrinsic::tpu_dma_hbm_to_simem_sc_simple:
case Intrinsic::tpu_dma_hbm_to_timem:
case Intrinsic::tpu_dma_hbm_to_timem_sc_simple:
case Intrinsic::tpu_dma_hbm_to_smem_single_strided:
case Intrinsic::tpu_dma_hbm_to_vmem_single_strided:
case Intrinsic::tpu_dma_smem_to_hbm_single_strided:
case Intrinsic::tpu_dma_vmem_to_hbm_single_strided:
case Intrinsic::tpu_dma_hbm_to_smem_general:
case Intrinsic::tpu_dma_hbm_to_vmem_general:
case Intrinsic::tpu_dma_smem_to_hbm_general:
case Intrinsic::tpu_dma_vmem_to_hbm_general:
case Intrinsic::tpu_dma_hbm_to_hbm_sc_general:
case Intrinsic::tpu_dma_smem_to_smem_sc_general:
case Intrinsic::tpu_dma_hbm_to_smem_sc_general:
case Intrinsic::tpu_dma_hbm_to_timem_sc_general:
case Intrinsic::tpu_dma_hbm_to_spmem_sc_general:
case Intrinsic::tpu_dma_smem_to_hbm_sc_general:
case Intrinsic::tpu_dma_timem_to_hbm_sc_general:
case Intrinsic::tpu_dma_spmem_to_spmem_sc_general:
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = VNI32;
    // These access multiple pointers, so set it to null so that alias
    // analysis doesn't make any assumptions.
    // TODO(thomasraoux): We could provide finer-grained aliasing information
    // by adding several memory operands and actually adding the pointers.
Info.ptrVal = nullptr;
Info.size = MemoryLocation::UnknownSize;
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOLoad;
return true;
case Intrinsic::tpu_dma_hbm_to_iova_sc_simple:
case Intrinsic::tpu_dma_iova_to_hbm_sc_simple:
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::v1024i32;
// Same comments as above.
Info.ptrVal = nullptr;
Info.size = MemoryLocation::UnknownSize;
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOLoad;
return true;
case Intrinsic::tpu_stream_indirect_gather_add_f32_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_add_f32_hbm_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_add_f32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_gather_add_f32_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_add_s32_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_add_s32_hbm_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_add_s32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_gather_add_s32_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_cb_add_f32_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_cb_add_f32_hbm_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_cb_add_f32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_gather_cb_add_f32_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_cb_add_s32_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_cb_add_s32_hbm_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_cb_add_s32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_gather_cb_add_s32_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_cb_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_cb_hbm_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_cb_spmem_to_smem:
case Intrinsic::tpu_stream_indirect_gather_cb_spmem_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_cb_tilespmem_tileN_to_smem:
case Intrinsic::tpu_stream_indirect_gather_cb_tilespmem_tileN_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_gather_cb_upd_add_f32_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_cb_upd_add_f32_hbm_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_gather_cb_upd_add_f32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_gather_cb_upd_add_f32_tilespmem_tileN_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_gather_cb_upd_add_s32_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_cb_upd_add_s32_hbm_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_gather_cb_upd_add_s32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_gather_cb_upd_add_s32_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_cb_upd_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_cb_upd_hbm_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_cb_upd_spmem_to_smem:
case Intrinsic::tpu_stream_indirect_gather_cb_upd_spmem_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_cb_upd_tilespmem_tileN_to_smem:
case Intrinsic::
tpu_stream_indirect_gather_cb_upd_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_hbm_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_spmem_to_smem:
case Intrinsic::tpu_stream_indirect_gather_spmem_to_tilespmem:
case Intrinsic::tpu_stream_indirect_gather_tilespmem_tileN_to_smem:
case Intrinsic::tpu_stream_indirect_gather_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_indirect_scatter_add_f32_smem_to_spmem:
case Intrinsic::tpu_stream_indirect_scatter_add_f32_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_scatter_add_f32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_indirect_scatter_add_f32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_scatter_add_s32_smem_to_spmem:
case Intrinsic::tpu_stream_indirect_scatter_add_s32_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_scatter_add_s32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_indirect_scatter_add_s32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_scatter_cb_add_f32_smem_to_spmem:
case Intrinsic::
tpu_stream_indirect_scatter_cb_add_f32_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_scatter_cb_add_f32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_indirect_scatter_cb_add_f32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_scatter_cb_add_s32_smem_to_spmem:
case Intrinsic::
tpu_stream_indirect_scatter_cb_add_s32_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_scatter_cb_add_s32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_indirect_scatter_cb_add_s32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_scatter_cb_smem_to_spmem:
case Intrinsic::tpu_stream_indirect_scatter_cb_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_scatter_cb_tilespmem_to_hbm:
case Intrinsic::tpu_stream_indirect_scatter_cb_tilespmem_to_hbm4b:
case Intrinsic::tpu_stream_indirect_scatter_cb_tilespmem_to_spmem:
case Intrinsic::tpu_stream_indirect_scatter_cb_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_scatter_cb_upd_add_f32_smem_to_spmem:
case Intrinsic::
tpu_stream_indirect_scatter_cb_upd_add_f32_smem_to_tilespmem_tileN:
case Intrinsic::
tpu_stream_indirect_scatter_cb_upd_add_f32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_indirect_scatter_cb_upd_add_f32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_scatter_cb_upd_add_s32_smem_to_spmem:
case Intrinsic::
tpu_stream_indirect_scatter_cb_upd_add_s32_smem_to_tilespmem_tileN:
case Intrinsic::
tpu_stream_indirect_scatter_cb_upd_add_s32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_indirect_scatter_cb_upd_add_s32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_scatter_cb_upd_smem_to_spmem:
case Intrinsic::tpu_stream_indirect_scatter_cb_upd_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_scatter_cb_upd_tilespmem_to_hbm:
case Intrinsic::tpu_stream_indirect_scatter_cb_upd_tilespmem_to_hbm4b:
case Intrinsic::tpu_stream_indirect_scatter_cb_upd_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_indirect_scatter_cb_upd_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_scatter_smem_to_spmem:
case Intrinsic::tpu_stream_indirect_scatter_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_scatter_tilespmem_to_hbm:
case Intrinsic::tpu_stream_indirect_scatter_tilespmem_to_hbm4b:
case Intrinsic::tpu_stream_indirect_scatter_tilespmem_to_spmem:
case Intrinsic::tpu_stream_indirect_scatter_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_vreg_gather_add_f32_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_indirect_vreg_gather_add_f32_hbm_to_tilespmem:
case Intrinsic::tpu_stream_indirect_vreg_gather_add_f32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_vreg_gather_add_f32_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_indirect_vreg_gather_add_s32_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_indirect_vreg_gather_add_s32_hbm_to_tilespmem:
case Intrinsic::tpu_stream_indirect_vreg_gather_add_s32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_vreg_gather_add_s32_tilespmem_tileN_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_vreg_gather_cb_add_f32_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_indirect_vreg_gather_cb_add_f32_hbm_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_vreg_gather_cb_add_f32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_vreg_gather_cb_add_f32_tilespmem_tileN_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_vreg_gather_cb_add_s32_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_indirect_vreg_gather_cb_add_s32_hbm_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_vreg_gather_cb_add_s32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_vreg_gather_cb_add_s32_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_indirect_vreg_gather_cb_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_indirect_vreg_gather_cb_hbm_to_tilespmem:
case Intrinsic::tpu_stream_indirect_vreg_gather_cb_spmem_to_smem:
case Intrinsic::tpu_stream_indirect_vreg_gather_cb_spmem_to_tilespmem:
case Intrinsic::tpu_stream_indirect_vreg_gather_cb_tilespmem_tileN_to_smem:
case Intrinsic::
tpu_stream_indirect_vreg_gather_cb_tilespmem_tileN_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_vreg_gather_cb_upd_add_f32_hbm4b_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_vreg_gather_cb_upd_add_f32_hbm_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_vreg_gather_cb_upd_add_f32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_vreg_gather_cb_upd_add_f32_tilespmem_tileN_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_vreg_gather_cb_upd_add_s32_hbm4b_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_vreg_gather_cb_upd_add_s32_hbm_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_vreg_gather_cb_upd_add_s32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_vreg_gather_cb_upd_add_s32_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_indirect_vreg_gather_cb_upd_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_indirect_vreg_gather_cb_upd_hbm_to_tilespmem:
case Intrinsic::tpu_stream_indirect_vreg_gather_cb_upd_spmem_to_smem:
case Intrinsic::tpu_stream_indirect_vreg_gather_cb_upd_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_indirect_vreg_gather_cb_upd_tilespmem_tileN_to_smem:
case Intrinsic::
tpu_stream_indirect_vreg_gather_cb_upd_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_indirect_vreg_gather_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_indirect_vreg_gather_hbm_to_tilespmem:
case Intrinsic::tpu_stream_indirect_vreg_gather_spmem_to_smem:
case Intrinsic::tpu_stream_indirect_vreg_gather_spmem_to_tilespmem:
case Intrinsic::tpu_stream_indirect_vreg_gather_tilespmem_tileN_to_smem:
case Intrinsic::
tpu_stream_indirect_vreg_gather_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_indirect_vreg_scatter_add_f32_smem_to_spmem:
case Intrinsic::
tpu_stream_indirect_vreg_scatter_add_f32_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_vreg_scatter_add_f32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_indirect_vreg_scatter_add_f32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_vreg_scatter_add_s32_smem_to_spmem:
case Intrinsic::
tpu_stream_indirect_vreg_scatter_add_s32_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_vreg_scatter_add_s32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_indirect_vreg_scatter_add_s32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_add_f32_smem_to_spmem:
case Intrinsic::
tpu_stream_indirect_vreg_scatter_cb_add_f32_smem_to_tilespmem_tileN:
case Intrinsic::
tpu_stream_indirect_vreg_scatter_cb_add_f32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_indirect_vreg_scatter_cb_add_f32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_add_s32_smem_to_spmem:
case Intrinsic::
tpu_stream_indirect_vreg_scatter_cb_add_s32_smem_to_tilespmem_tileN:
case Intrinsic::
tpu_stream_indirect_vreg_scatter_cb_add_s32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_indirect_vreg_scatter_cb_add_s32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_smem_to_spmem:
case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_tilespmem_to_hbm:
case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_tilespmem_to_hbm4b:
case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_indirect_vreg_scatter_cb_tilespmem_to_tilespmem_tileN:
case Intrinsic::
tpu_stream_indirect_vreg_scatter_cb_upd_add_f32_smem_to_spmem:
case Intrinsic::
tpu_stream_indirect_vreg_scatter_cb_upd_add_f32_smem_to_tilespmem_tileN:
case Intrinsic::
tpu_stream_indirect_vreg_scatter_cb_upd_add_f32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_indirect_vreg_scatter_cb_upd_add_f32_tilespmem_to_tilespmem_tileN:
case Intrinsic::
tpu_stream_indirect_vreg_scatter_cb_upd_add_s32_smem_to_spmem:
case Intrinsic::
tpu_stream_indirect_vreg_scatter_cb_upd_add_s32_smem_to_tilespmem_tileN:
case Intrinsic::
tpu_stream_indirect_vreg_scatter_cb_upd_add_s32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_indirect_vreg_scatter_cb_upd_add_s32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_upd_smem_to_spmem:
case Intrinsic::
tpu_stream_indirect_vreg_scatter_cb_upd_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_upd_tilespmem_to_hbm:
case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_upd_tilespmem_to_hbm4b:
case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_upd_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_indirect_vreg_scatter_cb_upd_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_vreg_scatter_smem_to_spmem:
case Intrinsic::tpu_stream_indirect_vreg_scatter_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_indirect_vreg_scatter_tilespmem_to_hbm:
case Intrinsic::tpu_stream_indirect_vreg_scatter_tilespmem_to_hbm4b:
case Intrinsic::tpu_stream_indirect_vreg_scatter_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_indirect_vreg_scatter_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_linear_gather_add_f32_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_add_f32_hbm_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_add_f32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_linear_gather_add_f32_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_add_s32_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_add_s32_hbm_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_add_s32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_linear_gather_add_s32_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_cb_add_f32_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_cb_add_f32_hbm_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_cb_add_f32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_linear_gather_cb_add_f32_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_cb_add_s32_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_cb_add_s32_hbm_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_cb_add_s32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_linear_gather_cb_add_s32_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_cb_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_cb_hbm_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_cb_spmem_to_smem:
case Intrinsic::tpu_stream_linear_gather_cb_spmem_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_cb_tilespmem_tileN_to_smem:
case Intrinsic::tpu_stream_linear_gather_cb_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_cb_upd_add_f32_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_cb_upd_add_f32_hbm_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_cb_upd_add_f32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_linear_gather_cb_upd_add_f32_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_cb_upd_add_s32_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_cb_upd_add_s32_hbm_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_cb_upd_add_s32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_linear_gather_cb_upd_add_s32_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_cb_upd_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_cb_upd_hbm_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_cb_upd_spmem_to_smem:
case Intrinsic::tpu_stream_linear_gather_cb_upd_spmem_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_cb_upd_tilespmem_tileN_to_smem:
case Intrinsic::
tpu_stream_linear_gather_cb_upd_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_hbm_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_spmem_to_smem:
case Intrinsic::tpu_stream_linear_gather_spmem_to_tilespmem:
case Intrinsic::tpu_stream_linear_gather_tilespmem_tileN_to_smem:
case Intrinsic::tpu_stream_linear_gather_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_linear_scatter_add_f32_smem_to_spmem:
case Intrinsic::tpu_stream_linear_scatter_add_f32_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_linear_scatter_add_f32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_linear_scatter_add_f32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_linear_scatter_add_s32_smem_to_spmem:
case Intrinsic::tpu_stream_linear_scatter_add_s32_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_linear_scatter_add_s32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_linear_scatter_add_s32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_linear_scatter_cb_add_f32_smem_to_spmem:
case Intrinsic::
tpu_stream_linear_scatter_cb_add_f32_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_linear_scatter_cb_add_f32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_linear_scatter_cb_add_f32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_linear_scatter_cb_add_s32_smem_to_spmem:
case Intrinsic::
tpu_stream_linear_scatter_cb_add_s32_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_linear_scatter_cb_add_s32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_linear_scatter_cb_add_s32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_linear_scatter_cb_smem_to_spmem:
case Intrinsic::tpu_stream_linear_scatter_cb_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_linear_scatter_cb_tilespmem_to_hbm:
case Intrinsic::tpu_stream_linear_scatter_cb_tilespmem_to_hbm4b:
case Intrinsic::tpu_stream_linear_scatter_cb_tilespmem_to_spmem:
case Intrinsic::tpu_stream_linear_scatter_cb_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_linear_scatter_cb_upd_add_f32_smem_to_spmem:
case Intrinsic::
tpu_stream_linear_scatter_cb_upd_add_f32_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_linear_scatter_cb_upd_add_f32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_linear_scatter_cb_upd_add_f32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_linear_scatter_cb_upd_add_s32_smem_to_spmem:
case Intrinsic::
tpu_stream_linear_scatter_cb_upd_add_s32_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_linear_scatter_cb_upd_add_s32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_linear_scatter_cb_upd_add_s32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_linear_scatter_cb_upd_smem_to_spmem:
case Intrinsic::tpu_stream_linear_scatter_cb_upd_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_linear_scatter_cb_upd_tilespmem_to_hbm:
case Intrinsic::tpu_stream_linear_scatter_cb_upd_tilespmem_to_hbm4b:
case Intrinsic::tpu_stream_linear_scatter_cb_upd_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_linear_scatter_cb_upd_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_linear_scatter_smem_to_spmem:
case Intrinsic::tpu_stream_linear_scatter_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_linear_scatter_tilespmem_to_hbm:
case Intrinsic::tpu_stream_linear_scatter_tilespmem_to_hbm4b:
case Intrinsic::tpu_stream_linear_scatter_tilespmem_to_spmem:
case Intrinsic::tpu_stream_linear_scatter_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_strided_gather_add_f32_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_add_f32_hbm_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_add_f32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_strided_gather_add_f32_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_add_s32_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_add_s32_hbm_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_add_s32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_strided_gather_add_s32_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_cb_add_f32_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_cb_add_f32_hbm_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_cb_add_f32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_strided_gather_cb_add_f32_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_cb_add_s32_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_cb_add_s32_hbm_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_cb_add_s32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_strided_gather_cb_add_s32_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_cb_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_cb_hbm_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_cb_spmem_to_smem:
case Intrinsic::tpu_stream_strided_gather_cb_spmem_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_cb_tilespmem_tileN_to_smem:
case Intrinsic::tpu_stream_strided_gather_cb_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_cb_upd_add_f32_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_cb_upd_add_f32_hbm_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_cb_upd_add_f32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_strided_gather_cb_upd_add_f32_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_cb_upd_add_s32_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_cb_upd_add_s32_hbm_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_cb_upd_add_s32_spmem_to_tilespmem:
case Intrinsic::
tpu_stream_strided_gather_cb_upd_add_s32_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_cb_upd_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_cb_upd_hbm_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_cb_upd_spmem_to_smem:
case Intrinsic::tpu_stream_strided_gather_cb_upd_spmem_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_cb_upd_tilespmem_tileN_to_smem:
case Intrinsic::
tpu_stream_strided_gather_cb_upd_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_hbm4b_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_hbm_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_spmem_to_smem:
case Intrinsic::tpu_stream_strided_gather_spmem_to_tilespmem:
case Intrinsic::tpu_stream_strided_gather_tilespmem_tileN_to_smem:
case Intrinsic::tpu_stream_strided_gather_tilespmem_tileN_to_tilespmem:
case Intrinsic::tpu_stream_strided_scatter_add_f32_smem_to_spmem:
case Intrinsic::tpu_stream_strided_scatter_add_f32_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_strided_scatter_add_f32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_strided_scatter_add_f32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_strided_scatter_add_s32_smem_to_spmem:
case Intrinsic::tpu_stream_strided_scatter_add_s32_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_strided_scatter_add_s32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_strided_scatter_add_s32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_strided_scatter_cb_add_f32_smem_to_spmem:
case Intrinsic::
tpu_stream_strided_scatter_cb_add_f32_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_strided_scatter_cb_add_f32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_strided_scatter_cb_add_f32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_strided_scatter_cb_add_s32_smem_to_spmem:
case Intrinsic::
tpu_stream_strided_scatter_cb_add_s32_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_strided_scatter_cb_add_s32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_strided_scatter_cb_add_s32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_strided_scatter_cb_smem_to_spmem:
case Intrinsic::tpu_stream_strided_scatter_cb_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_strided_scatter_cb_tilespmem_to_hbm:
case Intrinsic::tpu_stream_strided_scatter_cb_tilespmem_to_hbm4b:
case Intrinsic::tpu_stream_strided_scatter_cb_tilespmem_to_spmem:
case Intrinsic::tpu_stream_strided_scatter_cb_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_strided_scatter_cb_upd_add_f32_smem_to_spmem:
case Intrinsic::
tpu_stream_strided_scatter_cb_upd_add_f32_smem_to_tilespmem_tileN:
case Intrinsic::
tpu_stream_strided_scatter_cb_upd_add_f32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_strided_scatter_cb_upd_add_f32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_strided_scatter_cb_upd_add_s32_smem_to_spmem:
case Intrinsic::
tpu_stream_strided_scatter_cb_upd_add_s32_smem_to_tilespmem_tileN:
case Intrinsic::
tpu_stream_strided_scatter_cb_upd_add_s32_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_strided_scatter_cb_upd_add_s32_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_strided_scatter_cb_upd_smem_to_spmem:
case Intrinsic::tpu_stream_strided_scatter_cb_upd_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_strided_scatter_cb_upd_tilespmem_to_hbm:
case Intrinsic::tpu_stream_strided_scatter_cb_upd_tilespmem_to_hbm4b:
case Intrinsic::tpu_stream_strided_scatter_cb_upd_tilespmem_to_spmem:
case Intrinsic::
tpu_stream_strided_scatter_cb_upd_tilespmem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_strided_scatter_smem_to_spmem:
case Intrinsic::tpu_stream_strided_scatter_smem_to_tilespmem_tileN:
case Intrinsic::tpu_stream_strided_scatter_tilespmem_to_hbm:
case Intrinsic::tpu_stream_strided_scatter_tilespmem_to_hbm4b:
case Intrinsic::tpu_stream_strided_scatter_tilespmem_to_spmem:
case Intrinsic::tpu_stream_strided_scatter_tilespmem_to_tilespmem_tileN:
    // We don't actually need to add memory operands for streams. We'd get
    // regular barriers from the DAG builder otherwise, but we're doing it
    // right and sticking to memory edges here.
assert(IsSC);
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = VNI32;
    // TODO(hgreving): We could provide finer-grained aliasing information by
    // adding several memory operands. We currently only attach the TileSpmem
    // memory operand, because that's all we currently consider when analyzing
    // the DAG's edges later. We also don't want to hard-code the operand
    // number, because there are too many stream intrinsics. Instead, we just
    // search the operands.
Info.ptrVal = nullptr;
for (auto &Op : I.operands()) {
if (!Op->getType()->isPointerTy())
continue;
if (Op->getType()->getPointerAddressSpace() != TPUAS_TileSpmem)
continue;
Info.ptrVal = Op;
break;
}
Info.size = MemoryLocation::UnknownSize;
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOLoad;
return true;
case Intrinsic::tpu_bc_load_aliaddr:
case Intrinsic::tpu_bc_load_aliaddr_flm:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(I.getType());
Info.ptrVal = I.getOperand(0);
Info.size = MemoryLocation::UnknownSize;
Info.flags = MachineMemOperand::MOLoad;
return true;
case Intrinsic::tpu_bc_store_aliaddr:
case Intrinsic::tpu_bc_store_aliaddr_flm:
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::getVT(I.getOperand(0)->getType());
Info.ptrVal = I.getOperand(1);
Info.size = MemoryLocation::UnknownSize;
Info.flags = MachineMemOperand::MOStore;
return true;
case Intrinsic::tpu_bc_loop_end: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::i1;
Info.ptrVal = TM.getPSV(TPUTargetMachine::PSV_BarnaCoreChannel_LoopEnd);
Info.size = 1;
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
}
default:
return false;
}
}
void TPUTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
KnownBits Known2;
Known.resetAll();
switch (Op.getOpcode()) {
default:
break;
case TPUISD::UMUL24:
unsigned BitWidth = 32;
Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
    // The instruction zeroes out the top 8 bits.
Known.Zero.setHighBits(8);
Known2.Zero.setHighBits(8);
// If low bits are zero in either operand, output low known-0 bits.
// Also compute a conservative estimate for high known-0 bits.
unsigned TrailZ =
Known.countMinTrailingZeros() + Known2.countMinTrailingZeros();
unsigned LeadZ =
std::max(Known.countMinLeadingZeros() + Known2.countMinLeadingZeros(),
BitWidth) -
BitWidth;
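    // For example (a sketch): if both operands are known to fit in 12 bits,
    // i.e. each has at least 20 leading zero bits after the masking above,
    // the 24x24 product fits in 24 bits, giving at least
    // 20 + 20 - 32 = 8 leading zero bits.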
Known.resetAll();
Known.Zero.setLowBits(std::min(TrailZ, BitWidth));
Known.Zero.setHighBits(std::min(LeadZ, BitWidth));
break;
}
}
void TPUTargetLowering::addTPUMemOperand(SelectionDAG &DAG, SDNode *N,
bool IsPush,
const TargetRegisterClass *RC) const {
// Add a MachineMemOperand to N, marking it as a push or pop of the given
// register class.
MachineSDNode *MN = cast<MachineSDNode>(N);
MachinePointerInfo MPI(
static_cast<const TPUTargetMachine &>(getTargetMachine())
.getFifoPSV(IsPush, RC));
auto *MemRef = DAG.getMachineFunction().getMachineMemOperand(
MPI, MachineMemOperand::MOLoad | MachineMemOperand::MOStore, /*s=*/4,
/*base_alignment=*/llvm::Align(4));
DAG.setNodeMemRefs(MN, {MemRef});
}