| //===--------- TPUISelLowering.cpp - TPU DAG Lowering Implementation ------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements the TPUTargetLowering class. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "TPUISelLowering.h" |
| #include "MCTargetDesc/TPUBaseInfo.h" |
| #include "MCTargetDesc/TPUMCTargetDesc.h" |
| #include "TPU.h" |
| #include "TPUCallingConv.h" |
| #include "TPUIRUtils.h" |
| #include "TPUMachineFunctionInfo.h" |
| #include "TPURegisterInfo.h" |
| #include "TPUSubtarget.h" |
| #include "TPUTargetMachine.h" |
| #include "llvm/ADT/APInt.h" |
| #include "llvm/ADT/ArrayRef.h" |
| #include "llvm/ADT/DenseMap.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/ADT/StringRef.h" |
| #include "llvm/ADT/StringSwitch.h" |
| #include "llvm/Analysis/CallGraph.h" |
| #include "llvm/Analysis/MemoryLocation.h" |
| #include "llvm/CodeGen/Analysis.h" |
| #include "llvm/CodeGen/CallingConvLower.h" |
| #include "llvm/CodeGen/ISDOpcodes.h" |
| #include "llvm/CodeGen/MachineBasicBlock.h" |
| #include "llvm/CodeGen/MachineFrameInfo.h" |
| #include "llvm/CodeGen/MachineFunction.h" |
| #include "llvm/CodeGen/MachineInstr.h" |
| #include "llvm/CodeGen/MachineInstrBuilder.h" |
| #include "llvm/CodeGen/MachineMemOperand.h" |
| #include "llvm/CodeGen/MachineModuleInfo.h" |
| #include "llvm/CodeGen/MachineRegisterInfo.h" |
| #include "llvm/CodeGen/RuntimeLibcalls.h" |
| #include "llvm/CodeGen/SelectionDAG.h" |
| #include "llvm/CodeGen/SelectionDAGNodes.h" |
| #include "llvm/CodeGen/TargetCallingConv.h" |
| #include "llvm/CodeGen/ValueTypes.h" |
| #include "llvm/IR/CallingConv.h" |
| #include "llvm/IR/DerivedTypes.h" |
| #include "llvm/IR/Function.h" |
| #include "llvm/IR/GlobalValue.h" |
| #include "llvm/IR/IntrinsicsTPU.h" |
| #include "llvm/Support/Casting.h" |
| #include "llvm/Support/CodeGen.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/ErrorHandling.h" |
| #include "llvm/Support/KnownBits.h" |
| #include "llvm/Support/MachineValueType.h" |
| #include "llvm/Support/MathExtras.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include "llvm/Target/TargetMachine.h" |
| #include <cassert> |
| #include <cmath> |
| #include <cstddef> |
| #include <cstdint> |
| #include <cstdlib> |
| #include <utility> |
| |
| #define DEBUG_TYPE "tpu-lower" |
| |
| using namespace llvm; |
| |
| static cl::opt<bool> PropagateTpuEmbeddedMasks( |
| "tpu-enable-embedded-masks", cl::Hidden, cl::init(true), |
| cl::desc("Enables propagating embedded hardware masks " |
| "into special mask registers.")); |
| |
| static cl::opt<bool> |
| GenerateTpuVCMasks("tpu-enable-vcmasks", cl::Hidden, cl::init(true), |
| cl::desc("Enables generation of vcmask instructions to " |
| "create mask immediates whenever possible.")); |
| |
| static cl::opt<bool> |
| EmulateSignedDivRem("tpu-emulate-signed-divrem", cl::Hidden, |
| cl::init(false), |
| cl::desc("Enables emulation of signed div/rem via the " |
| "unsigned div/rem instructions")); |
| |
| extern cl::opt<bool> TPUVerifierStrictIntoPtr; |
| |
| bool TPUTargetLowering::functionArgumentNeedsConsecutiveRegisters( |
| Type *Ty, CallingConv::ID CallConv, bool isVarArg, |
| const DataLayout &DL) const { |
| // All aggregates on BarnaCore are allocated consecutive registers. |
| return IsBC && (Ty->isArrayTy() || Ty->isStructTy()); |
| } |
| |
| TPUTargetLowering::TPUTargetLowering(const TargetMachine &TM, |
| const TPUSubtarget &STI) |
| : TargetLowering(TM) { |
| ST = &STI; |
| TII = ST->getInstrInfo(); |
| |
| IsBC = ST->isPxcBarnaCore(); |
| IsSC = ST->isSparseCore(); |
| IsVFTC = ST->hasVfcTensorCore(); |
| HasLPVF = ST->hasLPVF(); |
| HasLPGL = ST->hasLPGL(); |
| HasVMinMax = ST->hasVMinMax(); |
| |
| if (ST->hasV1024()) { |
| HasVPU = true; |
| VNI32 = MVT::v1024i32; |
| VNF32 = MVT::v1024f32; |
| // TODO(thomasraoux): Mask can be 2 bits per element on PFC: |
| // https://g3doc.corp.google.com/platforms/deepsea/logic/pfc/g3doc/isa/tensorcore.md#create-sublane-mask-instruction |
| VMNI1 = MVT::v1024i1; |
| } else if (ST->hasV16()) { |
| HasVPU = true; |
| VNI32 = MVT::v16i32; |
| VNF32 = MVT::v16f32; |
| VMNI1 = MVT::v16i1; |
| } else if (ST->hasV8()) { |
| HasVPU = true; |
| VNI32 = MVT::v8i32; |
| VNF32 = MVT::v8f32; |
| VMNI1 = MVT::v8i1; |
| } else { |
| // No vector support. |
| VNI32 = MVT::i32; |
| VNF32 = MVT::f32; |
| VMNI1 = MVT::i1; |
| } |
| |
| VNBF16 = MVT::INVALID_SIMPLE_VALUE_TYPE; |
| VNF16 = MVT::INVALID_SIMPLE_VALUE_TYPE; |
| VNI16 = MVT::INVALID_SIMPLE_VALUE_TYPE; |
| VNI4 = MVT::INVALID_SIMPLE_VALUE_TYPE; |
| VNI2 = MVT::INVALID_SIMPLE_VALUE_TYPE; |
| VNI8 = MVT::INVALID_SIMPLE_VALUE_TYPE; |
| VMNBF16I1 = MVT::INVALID_SIMPLE_VALUE_TYPE; |
| VNI8I1 = MVT::INVALID_SIMPLE_VALUE_TYPE; |
| VMN16I1 = MVT::INVALID_SIMPLE_VALUE_TYPE; |
| VMN32I1 = MVT::INVALID_SIMPLE_VALUE_TYPE; |
| VMN64I1 = MVT::INVALID_SIMPLE_VALUE_TYPE; |
| if (HasLPVF || HasLPGL) { |
| if (ST->hasV8()) { |
| VNBF16 = MVT::v16bf16; |
| VNF16 = MVT::v16f16; |
| VNI16 = MVT::v16i16; |
| VNI8 = MVT::v32i8; |
| VNI4 = MVT::v64i4; |
| VNI2 = MVT::v128i2; |
| VMNBF16I1 = MVT::v16i1; |
| VNI8I1 = MVT::v32i1; |
| VNI1 = MVT::v256i1; |
| VMN16I1 = MVT::v16i1; |
| VMN32I1 = MVT::v32i1; |
| } else if (ST->hasV16()) { |
| VNBF16 = MVT::v32bf16; |
| VNF16 = MVT::v32f16; |
| VNI16 = MVT::v32i16; |
| VNI8 = MVT::v64i8; |
| VNI1 = MVT::v512i1; |
| VNI4 = MVT::v128i4; |
| VNI2 = MVT::v256i2; |
| VMNBF16I1 = MVT::v32i1; |
| VNI8I1 = MVT::v64i1; |
| VMN32I1 = MVT::v32i1; |
| VMN64I1 = MVT::v64i1; |
| } else { |
| llvm_unreachable("Unexpected VPU size."); |
| } |
| } |
| |
| // Set up the register classes. |
| addRegisterClass(MVT::i32, &TPU::GPRRegClass); |
| addRegisterClass(MVT::f32, &TPU::GPRRegClass); |
| addRegisterClass(MVT::bf16, &TPU::GPRRegClass); |
| addRegisterClass(MVT::i1, &TPU::PPRRegClass); |
| // MVT::i8 is not legal in GPR. |
| |
| if (IsSC) { |
| // SparseCore is hijacking the mmx data type for cbreg. |
| addRegisterClass(MVT::x86mmx, &TPU::CBRRegClass); |
| } |
| |
| if (HasVPU) { |
| if (IsBC) { |
| // BarnaCore has Vregs and Vaggregs that both have the same type, so |
| // use VPR_AGGRegClass which is the superclass of both. Restricting a |
| // regclass to a strict subset is trivial. |
| addRegisterClass(VNI32, &TPU::VPR_AGGRegClass); |
| addRegisterClass(VNF32, &TPU::VPR_AGGRegClass); |
| } else { |
| addRegisterClass(VNI32, &TPU::VPRRegClass); |
| addRegisterClass(VNF32, &TPU::VPRRegClass); |
| } |
| addRegisterClass(VMNI1, &TPU::MPRRegClass); |
| } |
| if (HasLPVF || HasLPGL) { |
| addRegisterClass(VNBF16, &TPU::VPRRegClass); |
| addRegisterClass(VNF16, &TPU::VPRRegClass); |
| addRegisterClass(VNI16, &TPU::VPRRegClass); |
| addRegisterClass(VNI8, &TPU::VPRRegClass); |
| addRegisterClass(VNI4, &TPU::VPRRegClass); |
| addRegisterClass(VNI2, &TPU::VPRRegClass); |
| addRegisterClass(VNI1, &TPU::VPRRegClass); |
| if (ST->hasV8()) |
| addRegisterClass(VMN16I1, &TPU::MPRRegClass); |
| addRegisterClass(VMN32I1, &TPU::MPRRegClass); |
| if (ST->hasV16()) |
| addRegisterClass(VMN64I1, &TPU::MPRRegClass); |
| } |
| |
| // Compute derived properties from the register classes |
| TRI = ST->getRegisterInfo(); |
| computeRegisterProperties(TRI); |
| |
| setStackPointerRegisterToSaveRestore(TPU::SPS); |
| |
| setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, MVT::i8, Expand); |
| setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i8, Expand); |
| |
| setOperationAction(ISD::BR_CC, MVT::i32, Expand); |
| setOperationAction(ISD::BR_CC, MVT::f32, Expand); |
| setOperationAction(ISD::BR_CC, MVT::i1, Expand); |
| setOperationAction(ISD::BR_JT, MVT::Other, Expand); |
| setOperationAction(ISD::BRCOND, MVT::Other, Legal); |
| setOperationAction(ISD::SETCC, MVT::i32, Legal); |
| setOperationAction(ISD::SETCC, MVT::i1, Promote); |
| setOperationAction(ISD::SELECT, MVT::i32, Legal); |
| setOperationAction(ISD::SELECT, MVT::f32, Legal); |
| setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); |
| setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); |
| setOperationAction(ISD::SELECT_CC, VNF32, Expand); |
| setOperationAction(ISD::SELECT_CC, VNI32, Expand); |
| setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand); |
| |
| setOperationAction(ISD::VASTART, MVT::Other, Expand); |
| setOperationAction(ISD::VAARG, MVT::Other, Expand); |
| setOperationAction(ISD::VACOPY, MVT::Other, Expand); |
| setOperationAction(ISD::VAEND, MVT::Other, Expand); |
| |
| if (IsSC) { |
| setOperationAction(ISD::SDIVREM, MVT::i32, Legal); |
| setOperationAction(ISD::UDIVREM, MVT::i32, Legal); |
| setOperationAction(ISD::FDIV, MVT::f32, Custom); |
| } else { |
| setOperationAction(ISD::SDIVREM, VNI32, Expand); |
| setOperationAction(ISD::UDIVREM, VNI32, Expand); |
| } |
| |
| // We rely on the combiner to expand into DIVREM. |
| auto SDivRemAction = EmulateSignedDivRem ? Custom : Expand; |
| setOperationAction(ISD::SDIV, MVT::i32, SDivRemAction); |
| setOperationAction(ISD::SREM, MVT::i32, SDivRemAction); |
| setOperationAction(ISD::UDIV, MVT::i32, Expand); |
| setOperationAction(ISD::UREM, MVT::i32, Expand); |
| |
| // We do not currently support vector i32 div/rem. |
| setOperationAction(ISD::SDIV, VNI32, Expand); |
| setOperationAction(ISD::UDIV, VNI32, Expand); |
| setOperationAction(ISD::SREM, VNI32, Expand); |
| setOperationAction(ISD::UREM, VNI32, Expand); |
| |
| for (const auto &VT : {MVT::i32, VNI32}) { |
| setOperationAction(ISD::MUL, VT, Legal); |
| setOperationAction(ISD::MULHU, VT, Expand); |
| setOperationAction(ISD::MULHS, VT, Expand); |
| setOperationAction(ISD::UMUL_LOHI, VT, Expand); |
| setOperationAction(ISD::SMUL_LOHI, VT, Expand); |
| |
| setOperationAction(ISD::ROTR, VT, Expand); |
| setOperationAction(ISD::ROTL, VT, Expand); |
| setOperationAction(ISD::SHL_PARTS, VT, Expand); |
| setOperationAction(ISD::SRL_PARTS, VT, Expand); |
| setOperationAction(ISD::SRA_PARTS, VT, Expand); |
| |
| setOperationAction(ISD::BSWAP, VT, Expand); |
| setOperationAction(ISD::CTPOP, VT, Legal); |
| setOperationAction(ISD::CTLZ, VT, Legal); |
| setOperationAction(ISD::CTTZ, VT, Expand); |
| } |
| |
| // If VMul i32 is not natively supported, we need to emulate it. |
| if (!IsSC && !IsVFTC) |
| setOperationAction(ISD::MUL, VNI32, Custom); |
| // For Jellyfish, do a custom lowering of i32 MUL. |
| if (!ST->hasSMul32()) |
| setOperationAction(ISD::MUL, MVT::i32, Custom); |
| setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); |
| setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); |
| setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); |
| |
| setOperationAction(ISD::ConstantFP, MVT::f32, Legal); |
| if (IsVFTC || IsSC) { |
| setOperationAction(ISD::UMAX, MVT::i32, Legal); |
| setOperationAction(ISD::UMIN, MVT::i32, Legal); |
| setOperationAction(ISD::FFLOOR, MVT::f32, Legal); |
| setOperationAction(ISD::FCEIL, MVT::f32, Legal); |
| } // else we will fail lowering. |
| setOperationAction(ISD::FNEG, MVT::f32, Legal); |
| setOperationAction(ISD::FNEG, VNF32, Legal); |
| |
| // Extended load operations for i1 types must be promoted |
| for (MVT VT : MVT::integer_valuetypes()) { |
| setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); |
| setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); |
| setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); |
| } |
| setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal); |
| setOperationAction(ISD::FMINIMUM, MVT::f32, Legal); |
| setOperationAction(ISD::FMAXIMUM, VNF32, Legal); |
| setOperationAction(ISD::FMINIMUM, VNF32, Legal); |
| setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal); |
| setOperationAction(ISD::FCOPYSIGN, VNF32, Legal); |
| if (HasLPGL) { |
| setOperationAction(ISD::FMAXIMUM, VNBF16, Legal); |
| setOperationAction(ISD::FMINIMUM, VNBF16, Legal); |
| } |
| |
| // Unordered comparisons not supported. |
| setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); |
| setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); |
| setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); |
| setCondCodeAction(ISD::SETULT, MVT::f32, Expand); |
| setCondCodeAction(ISD::SETULE, MVT::f32, Expand); |
| setCondCodeAction(ISD::SETONE, MVT::f32, Expand); |
| setCondCodeAction(ISD::SETUO, MVT::f32, Expand); |
| setCondCodeAction(ISD::SETO, MVT::f32, Expand); |
| setCondCodeAction(ISD::SETUEQ, VNF32, Expand); |
| setCondCodeAction(ISD::SETUGT, VNF32, Expand); |
| setCondCodeAction(ISD::SETUGE, VNF32, Expand); |
| setCondCodeAction(ISD::SETULT, VNF32, Expand); |
| setCondCodeAction(ISD::SETULE, VNF32, Expand); |
| setCondCodeAction(ISD::SETONE, VNF32, Expand); |
| setCondCodeAction(ISD::SETUO, VNF32, Expand); |
| setCondCodeAction(ISD::SETO, VNF32, Expand); |
| if (HasLPGL) { |
| setCondCodeAction(ISD::SETUEQ, VNBF16, Expand); |
| setCondCodeAction(ISD::SETUGT, VNBF16, Expand); |
| setCondCodeAction(ISD::SETUGE, VNBF16, Expand); |
| setCondCodeAction(ISD::SETULT, VNBF16, Expand); |
| setCondCodeAction(ISD::SETULE, VNBF16, Expand); |
| setCondCodeAction(ISD::SETONE, VNBF16, Expand); |
| setCondCodeAction(ISD::SETUO, VNBF16, Expand); |
| setCondCodeAction(ISD::SETO, VNBF16, Expand); |
| } |
| if (HasVMinMax) { |
| setOperationAction(ISD::UMAX, VNI32, Legal); |
| setOperationAction(ISD::UMIN, VNI32, Legal); |
| if (HasLPGL) { |
| setOperationAction(ISD::UMAX, VNI16, Legal); |
| setOperationAction(ISD::UMIN, VNI16, Legal); |
| } |
| } |
| |
| // Unsigned scalar comparisons supported for VF and SC subtargets. |
| LegalizeAction UnsignedCmpLegalizeAction = Custom; |
| if (ST->hasUnsignedScalarCompare()) { |
| UnsignedCmpLegalizeAction = Legal; |
| } |
| setCondCodeAction(ISD::SETUGT, MVT::i32, UnsignedCmpLegalizeAction); |
| setCondCodeAction(ISD::SETUGE, MVT::i32, UnsignedCmpLegalizeAction); |
| setCondCodeAction(ISD::SETULT, MVT::i32, UnsignedCmpLegalizeAction); |
| setCondCodeAction(ISD::SETULE, MVT::i32, UnsignedCmpLegalizeAction); |
| |
| // Unsigned vector comparisons supported for SC subtargets. |
| UnsignedCmpLegalizeAction = Custom; |
| if (ST->hasUnsignedVectorCompare()) { |
| UnsignedCmpLegalizeAction = Legal; |
| } |
| setCondCodeAction(ISD::SETUGT, VNI32, UnsignedCmpLegalizeAction); |
| setCondCodeAction(ISD::SETUGE, VNI32, UnsignedCmpLegalizeAction); |
| setCondCodeAction(ISD::SETULT, VNI32, UnsignedCmpLegalizeAction); |
| setCondCodeAction(ISD::SETULE, VNI32, UnsignedCmpLegalizeAction); |
| |
| setTargetDAGCombine(ISD::BUILD_VECTOR); |
| setTargetDAGCombine(ISD::VECTOR_SHUFFLE); |
| setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); |
| setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); |
| setTargetDAGCombine(ISD::VSELECT); |
| setTargetDAGCombine(ISD::SETCC); |
| setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); |
| |
| // We could match this during isel in tablegen, but we want a bit more |
| // control. |
| setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); |
| |
| // Function alignments. |
| setMinFunctionAlignment(Align(2)); |
| setPrefFunctionAlignment(Align(2)); |
| |
| setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); |
| |
| setJumpIsExpensive(false); |
| |
| // TODO(jmolloy): This is a hangover from Lanai. Evaluate if jumptables are |
| // needed or useful. |
| setMinimumJumpTableEntries(100); |
| |
| // We'd run into trouble with pointer word sizes if we let native selection |
| // DAG lower these. |
| MaxStoresPerMemset = 0; // For @llvm.memset -> sequence of stores |
| MaxStoresPerMemsetOptSize = 0; |
| MaxStoresPerMemcpy = 0; // For @llvm.memcpy -> sequence of stores |
| MaxStoresPerMemcpyOptSize = 0; |
| MaxStoresPerMemmove = 0; // For @llvm.memmove -> sequence of stores |
| MaxStoresPerMemmoveOptSize = 0; |
| |
| // Booleans always contain 0 or 1. |
| setBooleanContents(ZeroOrOneBooleanContent); |
| } |
| |
| SDValue TPUTargetLowering::LowerGlobalAddress(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDLoc DL(Op); |
| const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); |
| int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); |
| |
| SDValue TargetAddr = DAG.getTargetGlobalAddress( |
| GV, DL, getPointerTy(DAG.getDataLayout()), Offset); |
| return DAG.getNode(TPUISD::WRAPPER, DL, MVT::i32, TargetAddr); |
| } |
| |
| SDValue TPUTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { |
| CondCodeSDNode *Cond = cast<CondCodeSDNode>(Op.getOperand(2).getNode()); |
| assert(!Op.getOperand(0).getValueType().isFloatingPoint() && |
| ISD::isUnsignedIntSetCC(Cond->get()) && |
| "Comparisons involving floating-point and signed-int types should not " |
| "be custom lowered as they are either expanded or legal."); |
| |
| ISD::CondCode SignedCond; |
| switch (Cond->get()) { |
| default: |
| llvm_unreachable("Unknown signed condcode?"); |
| case ISD::CondCode::SETULT: |
| SignedCond = ISD::CondCode::SETLT; |
| break; |
| case ISD::CondCode::SETULE: |
| SignedCond = ISD::CondCode::SETLE; |
| break; |
| case ISD::CondCode::SETUGT: |
| SignedCond = ISD::CondCode::SETGT; |
| break; |
| case ISD::CondCode::SETUGE: |
| SignedCond = ISD::CondCode::SETGE; |
| break; |
| } |
| SDLoc DL(Op); |
| |
| // Generate unsigned setcc as: |
| // %x = setcc signed %a, %b |
| // %y = xor %a, %b // one if bitwise different. |
| // %z = setcc slt %y, 0 // sign bit different? |
| // xor %x, %z |
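| // Worked example (illustrative only): for %a = 0x80000000, %b = 1 and |
| // SETULT, the signed compare yields %x = 1 (INT_MIN < 1), the sign bits |
| // differ so %z = 1, and %x xor %z = 0, which matches the unsigned result. |
| // When the sign bits agree, %z = 0 and the signed result is already right. |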
| EVT VT = Op.getValueType(); |
| EVT InputVT = Op.getOperand(0).getValueType(); |
| SDValue X = |
| DAG.getSetCC(DL, VT, Op.getOperand(0), Op.getOperand(1), SignedCond); |
| SDValue Y = DAG.getNode(ISD::XOR, DL, InputVT, Op.getOperand(0), |
| Op.getOperand(1)); |
| SDValue Z = DAG.getSetCC(DL, VT, Y, DAG.getConstant(0, DL, InputVT), |
| ISD::CondCode::SETLT); |
| return DAG.getNode(ISD::XOR, DL, VT, X, Z); |
| } |
| |
| SDValue TPUTargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { |
| SDLoc DL(Op); |
| assert(ST->isSparseCore()); |
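| // Lower f32 fdiv on SparseCore as x * (1/y): splat y into a vector, push it |
| // through the EUP reciprocal fifo (VRCP push, VRES_EUP pop), read lane 0 of |
| // the result back into a scalar, and multiply it with x. |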
| SDValue X = Op.getOperand(0); |
| SDValue Y = Op.getOperand(1); |
| SDValue Splat = DAG.getNode(TPUISD::SPLAT, DL, VNF32, Y); |
| auto PredReg = DAG.getRegister(TPU::Palways, MVT::i1); |
| auto PredInvert = DAG.getTargetConstant(APInt(1, 0), DL, MVT::i1); |
| MachineSDNode *VRcpPush = |
| DAG.getMachineNode(TPU::VRCP, DL, MVT::i32, {Splat, PredReg, PredInvert}); |
| addTPUMemOperand(DAG, VRcpPush, /*IsPush=*/true, &TPU::ERFPRRegClass); |
| MachineSDNode *VRcpPop = DAG.getMachineNode( |
| TPU::VRES_EUP, DL, VNF32, {SDValue(VRcpPush, 0), PredReg, PredInvert}); |
| addTPUMemOperand(DAG, VRcpPop, /*IsPush=*/false, &TPU::ERFPRRegClass); |
| SDValue Srcp = |
| SDValue(DAG.getMachineNode(TPU::scVREADr, SDLoc(Op), MVT::f32, |
| {SDValue(VRcpPop, 0), |
| DAG.getTargetConstant(0, DL, MVT::i32), |
| PredReg, PredInvert}), |
| 0); |
| SDValue FDivRes = DAG.getNode(ISD::FMUL, DL, MVT::f32, X, Srcp); |
| return FDivRes; |
| } |
| |
| SDValue TPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const { |
| SDLoc DL(Op); |
| SDValue X = Op.getOperand(0); |
| SDValue Y = Op.getOperand(1); |
| assert(Op.getValueType() == MVT::i32 && "Expected i32 when Lowering SDIV32"); |
| |
| // To emulate signed division, we: |
| // 1. Take the absolute value of the operands |
| // 2. Perform an unsigned divide of the operands |
| // 3. Possibly negate the result of (2.). |
| unsigned UnsignedOpCode; |
| switch (Op.getOpcode()) { |
| default: |
| llvm_unreachable("Unknown signed divrem opcode"); |
| case ISD::SDIV: |
| UnsignedOpCode = ISD::UDIV; |
| break; |
| case ISD::SREM: |
| UnsignedOpCode = ISD::UREM; |
| break; |
| } |
| |
| EVT VT = Op.getValueType(); |
| |
| // 1. Compute abs(x), abs(y): abs(x) = (x ^ (x >> 31)) - (x >> 31) |
| // |
| // Note: we do this slightly differently than LLO, which uses |
| // compares+selects, but we end up with the same number of instructions. |
| // http://google3/platforms/xla/service/jellyfish/llo_region_builder.cc?l=950&rcl=378412916 |
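| // Worked example (illustrative only): x = -5 gives x >> 31 = -1 (all ones), |
| // so x ^ -1 = 4 and 4 - (-1) = 5 = abs(-5); for x = 5, x >> 31 = 0 and the |
| // value passes through unchanged. |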
| SDValue XMask = DAG.getNode(ISD::SRA, DL, VT, X, DAG.getConstant(31, DL, VT)); |
| SDValue YMask = DAG.getNode(ISD::SRA, DL, VT, Y, DAG.getConstant(31, DL, VT)); |
| |
| SDValue XInv = DAG.getNode(ISD::XOR, DL, VT, X, XMask); |
| SDValue YInv = DAG.getNode(ISD::XOR, DL, VT, Y, YMask); |
| |
| SDValue XAbs = DAG.getNode(ISD::SUB, DL, VT, XInv, XMask); |
| SDValue YAbs = DAG.getNode(ISD::SUB, DL, VT, YInv, YMask); |
| |
| // 2. Compute unsigned div/rem. |
| SDValue AbsResult = DAG.getNode(UnsignedOpCode, DL, VT, XAbs, YAbs); |
| |
| // 3. Possibly negate the result of the unsigned div/rem. |
| SDValue SignMask; |
| if (Op.getOpcode() == ISD::SDIV) { |
| SignMask = DAG.getNode(ISD::XOR, DL, VT, XMask, YMask); |
| } else { |
| // For rem, the sign is determined by the dividend (X), defined the same way |
| // as the remainder operator % in C: |
| // (a % b) == a - (a / b) * b |
| SignMask = XMask; |
| } |
| // SignMask is either all zeros or all ones (in which case the result should |
| // be negative). When it is all ones, we can use this mask to negate the two's |
| // complement result similar to finding abs(x): |
| // result = (abs_result ^ mask) - mask |
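| // Worked example (illustrative only): -7 / 2 computes 7 / 2 = 3 with |
| // SignMask = -1, and (3 ^ -1) - (-1) = -4 + 1 = -3, matching C's truncation |
| // toward zero; likewise -7 % 2 yields (1 ^ -1) - (-1) = -1. |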
| SDValue AbsResultInv = DAG.getNode(ISD::XOR, DL, VT, AbsResult, SignMask); |
| SDValue SignedResult = DAG.getNode(ISD::SUB, DL, VT, AbsResultInv, SignMask); |
| |
| return SignedResult; |
| } |
| |
| SDValue TPUTargetLowering::LowerMUL32(SDValue Op, SelectionDAG &DAG) const { |
| SDLoc DL(Op); |
| SDValue X = Op.getOperand(0); |
| SDValue Y = Op.getOperand(1); |
| assert(Op.getValueType() == MVT::i32 && "Expected i32 when Lowering MUL32"); |
| // Expand a MUL i32 operation using the UMUL24 node for Jellyfish. |
| // The decomposition looks like: |
| // c = mul i32 a, b |
| // --> |
| // ll = umul24 i32 a, b |
| // al = srl i32 a, 24 |
| // bl = srl i32 b, 24 |
| // lh = umul24 i32 al, b |
| // hl = umul24 i32 a, bl |
| // sum = add i32 lh, hl |
| // shiftsum = shl i32 sum, 24 |
| // c = add i32 shiftsum, ll |
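| // The high x high partial product (a >> 24) * (b >> 24) is omitted because |
| // it only contributes to bits 48 and above, which cannot affect a 32-bit |
| // result; the remaining terms are sufficient modulo 2^32. |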
| EVT VT = Op.getValueType(); |
| KnownBits KBX = DAG.computeKnownBits(X); |
| KnownBits KBY = DAG.computeKnownBits(Y); |
| bool X_is_24bit = (KBX.Zero & 0xFF000000U) == 0xFF000000U; |
| bool Y_is_24bit = (KBY.Zero & 0xFF000000U) == 0xFF000000U; |
| // The smul.u24 instruction automatically zeroes out the upper bits of its |
| // operands, which saves us the need to do it ourselves. |
| SDValue Low_Low = DAG.getNode(TPUISD::UMUL24, DL, VT, X, Y); |
| SDValue High_Low, Low_High; |
| if (!X_is_24bit) { |
| SDValue HighX = |
| DAG.getNode(ISD::SRL, DL, VT, X, DAG.getConstant(24, DL, VT)); |
| High_Low = DAG.getNode(TPUISD::UMUL24, DL, VT, HighX, Y); |
| } |
| if (!Y_is_24bit) { |
| SDValue HighY = |
| DAG.getNode(ISD::SRL, DL, VT, Y, DAG.getConstant(24, DL, VT)); |
| Low_High = DAG.getNode(TPUISD::UMUL24, DL, VT, X, HighY); |
| } |
| SDValue MixedSum; |
| if (High_Low && Low_High) { |
| MixedSum = DAG.getNode(ISD::ADD, DL, VT, High_Low, Low_High); |
| } else if (High_Low) { |
| MixedSum = High_Low; |
| } else if (Low_High) { |
| MixedSum = Low_High; |
| } else { |
| return Low_Low; |
| } |
| SDValue ShiftedSum = |
| DAG.getNode(ISD::SHL, DL, VT, MixedSum, DAG.getConstant(24, DL, VT)); |
| return DAG.getNode(ISD::ADD, DL, VT, Low_Low, ShiftedSum); |
| } |
| |
| // Handle the lowering of the simple cases where one operand is a constant. |
| // This uses non-adjacent form (NAF). |
| SDValue TPUTargetLowering::SimpleEmulVMUL32(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDLoc DL(Op); |
| SDValue X = Op.getOperand(0); |
| SDValue Y = Op.getOperand(1); |
| if (Y.getOpcode() != TPUISD::SPLAT) |
| return SDValue(); |
| ConstantSDNode *C = isConstOrConstSplat(Y.getOperand(0)); |
| if (C == nullptr) |
| return SDValue(); |
| int M = C->getZExtValue(); |
| int HighestOne = -1; |
| int NonZeroEntries = 0; |
| std::array<int, 32> SignedDigit; |
| SignedDigit.fill(0); |
| |
| // The following algorithm is taken from: |
| // https://en.wikipedia.org/wiki/Non-adjacent_form |
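| // Worked example (illustrative only): M = 7 produces the NAF digits |
| // {-1, 0, 0, 1} (7 = 8 - 1), so the multiply is emitted as (X << 3) - X. |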
| int64_t e = std::abs(M); |
| const int s = M < 0 ? -1 : 1; |
| int i = 0; |
| while (e > 0) { |
| int zi = 0; |
| if (e % 2 == 1) { |
| zi = 2 - (e % 4); |
| if (zi != 0) { |
| ++NonZeroEntries; |
| } |
| } |
| SignedDigit[i] = s * zi; |
| if (SignedDigit[i] == 1) { |
| HighestOne = i; |
| } |
| e = (e - zi) / 2; |
| ++i; |
| } |
| |
| // Initialize the running sum to the maximal shifted positive term, i.e. |
| // X << i for the largest i such that SignedDigit[i] == 1 in the NAF of the |
| // multiplier. |
| SDValue Res; |
| if (HighestOne == -1) { |
| Res = |
| DAG.getNode(TPUISD::SPLAT, DL, VNI32, DAG.getConstant(0, DL, MVT::i32)); |
| } else { |
| Res = DAG.getNode(TPUISD::SPLAT, DL, VNI32, |
| DAG.getConstant(HighestOne, DL, MVT::i32)); |
| Res = DAG.getNode(ISD::SHL, DL, VNI32, X, Res); |
| SignedDigit[HighestOne] = 0; |
| } |
| |
| // Assemble multiplication from shift, add, sub using NAF form and |
| // running sum. |
| for (size_t i = 0; i < SignedDigit.size(); ++i) { |
| if (SignedDigit[i] == 0) { |
| continue; |
| } |
| |
| SDValue op = X; |
| // Shifted multiplicand (v<<i). |
| if (i > 0) { |
| SDValue I = DAG.getNode(TPUISD::SPLAT, DL, VNI32, |
| DAG.getConstant(i, DL, MVT::i32)); |
| op = DAG.getNode(ISD::SHL, DL, VNI32, X, I); |
| } |
| if (SignedDigit[i] == 1) { |
| Res = DAG.getNode(ISD::ADD, DL, VNI32, Res, op); |
| } else if (SignedDigit[i] == -1) { |
| Res = DAG.getNode(ISD::SUB, DL, VNI32, Res, op); |
| } |
| } |
| return Res; |
| } |
| |
| // Logic to lower VMUL32, copied from the LLO region builder. |
| SDValue TPUTargetLowering::LowerVMUL32(SDValue Op, SelectionDAG &DAG) const { |
| if (SDValue V = SimpleEmulVMUL32(Op, DAG)) |
| return V; |
| SDLoc DL(Op); |
| SDValue lhs = Op.getOperand(0); |
| SDValue rhs = Op.getOperand(1); |
| // Multiword multiplication. Splits each input into three words of at most |
| // 11 bits and uses VmulU11, an fmul-based 11 bit x 11 bit -> 22 bit integer |
| // multiply primitive, to form the partial products without losing precision. |
| // Generates code: |
| // uint32 u0 = u & 0x7FF; |
| // uint32 u1 = (u >> 11) & 0x7FF; |
| // uint32 u2 = u >> 22; |
| // uint32 v0 = v & 0x7FF; |
| // uint32 v1 = (v >> 11) & 0x7FF; |
| // uint32 v2 = v >> 22; |
| // return u0 * v0 + ((u1 * v0 + u0 * v1) << 11) + |
| // ((u0 * v2 + u1 * v1 + u2 * v0) << 22); |
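| // This is exact: every partial product is at most 22 bits and f32 carries |
| // 24 bits of significand, so the float multiplies inside VmulU11 never |
| // round. Terms at 2^33 and above (u1 * v2, u2 * v1, u2 * v2) are dropped |
| // because they cannot affect the low 32 bits of the result. |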
| auto VSplatImm32 = [&](int I) { |
| return DAG.getNode(TPUISD::SPLAT, DL, VNI32, |
| DAG.getConstant(I, DL, MVT::i32)); |
| }; |
| auto VandU32 = [&](SDValue X, SDValue Y) { |
| return DAG.getNode(ISD::AND, DL, VNI32, X, Y); |
| }; |
| auto VaddS32 = [&](SDValue X, SDValue Y) { |
| return DAG.getNode(ISD::ADD, DL, VNI32, X, Y); |
| }; |
| auto VshrlU32 = [&](SDValue X, SDValue Y) { |
| return DAG.getNode(ISD::SRL, DL, VNI32, X, Y); |
| }; |
| auto VshllU32 = [&](SDValue X, SDValue Y) { |
| return DAG.getNode(ISD::SHL, DL, VNI32, X, Y); |
| }; |
| auto VcvtS32ToF32 = [&](SDValue X) { |
| return DAG.getNode(ISD::SINT_TO_FP, DL, VNF32, X); |
| }; |
| |
| // Computes int32(x * y). We use this as an 11 bit x 11 bit -> 22 bit integer |
| // multiplication primitive without losing precision. |
| auto VmulU11 = [&](SDValue Lhs, SDValue Rhs) { |
| auto Product = |
| DAG.getNode(ISD::FMUL, DL, VNF32, VcvtS32ToF32(Lhs), VcvtS32ToF32(Rhs)); |
| return DAG.getNode(ISD::FP_TO_SINT, DL, VNI32, Product); |
| }; |
| |
| auto mask = VSplatImm32(0x7FF); |
| auto k11 = VSplatImm32(11); |
| auto k22 = VSplatImm32(22); |
| |
| auto u0 = VandU32(lhs, mask); |
| auto u1 = VandU32(VshrlU32(lhs, k11), mask); |
| auto u2 = VshrlU32(lhs, k22); |
| |
| auto v0 = VandU32(rhs, mask); |
| auto v1 = VandU32(VshrlU32(rhs, k11), mask); |
| auto v2 = VshrlU32(rhs, k22); |
| |
| auto w0 = VmulU11(u0, v0); |
| |
| auto w1 = VmulU11(u1, v0); |
| w1 = VaddS32(w1, VmulU11(u0, v1)); |
| w1 = VshllU32(w1, k11); |
| |
| auto w2 = VmulU11(u0, v2); |
| w2 = VaddS32(w2, VmulU11(u1, v1)); |
| w2 = VaddS32(w2, VmulU11(u2, v0)); |
| w2 = VshllU32(w2, k22); |
| |
| return VaddS32(VaddS32(w0, w1), w2); |
| } |
| |
| SDValue TPUTargetLowering::LowerADDRSPACECAST(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDLoc dl(Op); |
| const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op); |
| unsigned SrcAS = ASC->getSrcAddressSpace(); |
| unsigned DestAS = ASC->getDestAddressSpace(); |
| if ((SrcAS == TPUAS_Smem && DestAS == TPUAS_SmemAny) || |
| (SrcAS == TPUAS_Hbm && DestAS == TPUAS_HbmAny) || |
| (SrcAS == TPUAS_Sflag && DestAS == TPUAS_SflagAny) || |
| (SrcAS == TPUAS_Sflag && DestAS == TPUAS_SflagTile)) { |
| return DAG.getNode(ISD::BITCAST, dl, MVT::i32, ASC->getOperand(0)); |
| } |
| if (!TPUVerifierStrictIntoPtr) |
| return DAG.getNode(ISD::BITCAST, dl, MVT::i32, ASC->getOperand(0)); |
| report_fatal_error("Unsupported addrspace cast " + Twine(SrcAS) + "->" + |
| Twine(DestAS) + ".\n"); |
| } |
| |
| SDValue TPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { |
| switch (Op.getOpcode()) { |
| default: |
| llvm_unreachable("unimplemented operand"); |
| case ISD::SETCC: |
| return LowerSETCC(Op, DAG); |
| case ISD::GlobalAddress: |
| return LowerGlobalAddress(Op, DAG); |
| case ISD::SDIV: |
| case ISD::SREM: |
| return LowerSDIV32(Op, DAG); |
| case ISD::FDIV: |
| if (!ST->hasVPU()) |
| llvm_unreachable("fdiv on scalar core is not supported."); |
| return LowerFDIV32(Op, DAG); |
| case ISD::MUL: { |
| if (Op.getValueType() == MVT::i32) |
| return LowerMUL32(Op, DAG); |
| return LowerVMUL32(Op, DAG); |
| } |
| case ISD::ADDRSPACECAST: |
| return LowerADDRSPACECAST(Op, DAG); |
| case TPUISD::SPLAT: |
| // We're doing some specific type checking, because this is a special case |
| // for MVT::v32i8 when the DAG legalizer tries to promote MVT::i8. |
| if (isTypeLegal(Op->getOperand(0).getValueType())) |
| llvm_unreachable( |
| "This should only happen if the splat element isn't legal."); |
| EVT VT = Op->getOperand(0).getValueType(); |
| if (!VT.isSimple() || !VT.isInteger() || VT != MVT::i8) |
| llvm_unreachable("This should only happen on scalar type MVT::i8, " |
| "which is being promoted."); |
| // We're promoting the MVT::i8 Splat element and will match it later. |
| return DAG.getNode( |
| TPUISD::SPLAT, SDLoc(Op), Op->getSimpleValueType(0), |
| DAG.getTargetConstant(Op->getConstantOperandAPInt(0).zext(32), |
| SDLoc(Op), MVT::i32)); |
| } |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Calling Convention Implementation |
| //===----------------------------------------------------------------------===// |
| |
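| // Returns true if VT is an i1 vector held in a mask register, i.e. its |
| // total bit count differs from the full vector register width (unlike the |
| // packed low-precision i1 vector types, which occupy a whole VPR). |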
| static bool isMaskVT(MVT VT, const TPUSubtarget &ST) { |
| return VT.getScalarType() == MVT::i1 && |
| /* This check is for real low precision i1 types */ |
| VT.getSizeInBits() != 8 * ST.vectorSizeInBytes(); |
| } |
| |
| // Custom version of CCInfo.AnalyzeFormalArguments that supports separate |
| // scalar and vector stacks. It rewrites the memory offsets in ArgLocs so |
| // that each offset is relative to its own stack. |
| static void analyzeFormalArguments(const TPUTargetLowering &TLI, |
| const TPUSubtarget *ST, |
| const SmallVectorImpl<ISD::InputArg> &Ins, |
| CCState &CCInfo, |
| SmallVector<CCValAssign, 16> &ArgLocs) { |
| int NumBytesScalar = 0; |
| int NumBytesVector = 0; |
| unsigned NumArgs = Ins.size(); |
| for (unsigned i = 0; i != NumArgs; ++i) { |
| MVT ArgVT = Ins[i].VT; |
| ISD::ArgFlagsTy ArgFlags = Ins[i].Flags; |
| int PrevNumBytes = CCInfo.getNextStackOffset(); |
| if (CC_TPU(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo)) |
| report_fatal_error("unable to allocate function argument #" + Twine(i)); |
| if (!ST->isTPUABIEnabled() || CCInfo.getCallingConv() == CallingConv::Fast) |
| continue; |
| CCValAssign &CCV = ArgLocs[i]; |
| if (int StackOffsetDelta = CCInfo.getNextStackOffset() - PrevNumBytes) { |
| if (ArgVT.isVector()) { |
| assert(ST->hasVPU()); |
| // This is a trick using the API in order to adjust the LocMemOffset, |
| // because we have two separate stacks for scalar and vector. |
| if (isMaskVT(ArgVT, *ST)) { |
| int AlignedStackOffsetDelta = |
| alignTo(StackOffsetDelta, ST->vectorSizeInBytes()); |
| StackOffsetDelta = AlignedStackOffsetDelta; |
| } |
| assert(StackOffsetDelta == ST->vectorSizeInBytes()); |
| CCV.convertToMem(NumBytesVector); |
| NumBytesVector += StackOffsetDelta; |
| } else { |
| // Same comment as above. |
| CCV.convertToMem(NumBytesScalar); |
| NumBytesScalar += StackOffsetDelta; |
| } |
| } |
| } |
| } |
| |
| // Transform physical registers into virtual registers and |
| // generate load operations for arguments placed on the stack. |
| SDValue TPUTargetLowering::LowerFormalArguments( |
| SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, |
| const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, |
| SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { |
| switch (CallConv) { |
| case CallingConv::Fast: |
| case CallingConv::C: |
| break; |
| default: |
| report_fatal_error("Unsupported calling convention"); |
| } |
| |
| MachineFunction &MF = DAG.getMachineFunction(); |
| MachineRegisterInfo &RegInfo = MF.getRegInfo(); |
| |
| if (ST->isTPUABIEnabled()) |
| RegInfo.addLiveIn(TPU::LR); |
| |
| // Assign locations to all of the incoming arguments. |
| SmallVector<CCValAssign, 16> ArgLocs; |
| CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, |
| *DAG.getContext()); |
| analyzeFormalArguments(*this, ST, Ins, CCInfo, ArgLocs); |
| |
| DenseMap<unsigned, SmallVector<Register, 4>> OrigArgToRegLoc; |
| for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { |
| CCValAssign &VA = ArgLocs[i]; |
| assert(!MF.getFunction().hasStructRetAttr()); |
| assert(!IsVarArg); |
| assert(VA.getLocInfo() == CCValAssign::Full); |
| EVT VT = VA.getLocVT(); |
| Register VirtReg; |
| switch (VT.getSimpleVT().SimpleTy) { |
| default: |
| llvm_unreachable("Unhandled type in call lowering!"); |
| case MVT::i8: |
| case MVT::i16: |
| case MVT::i32: |
| case MVT::f32: |
| VirtReg = RegInfo.createVirtualRegister(&TPU::GPRRegClass); |
| break; |
| case MVT::i1: |
| assert(!ST->isTPUABIEnabled()); |
| VirtReg = RegInfo.createVirtualRegister(&TPU::PPRRegClass); |
| break; |
| case MVT::x86mmx: |
| assert(ST->hasVPU()); |
| VirtReg = RegInfo.createVirtualRegister(&TPU::CBRRegClass); |
| break; |
| case MVT::v8i32: |
| case MVT::v8f32: |
| case MVT::v16bf16: |
| case MVT::v16f16: |
| case MVT::v16i16: |
| case MVT::v32i8: |
| case MVT::v64i4: |
| case MVT::v128i2: |
| case MVT::v256i1: |
| case MVT::v16i32: |
| case MVT::v16f32: |
| case MVT::v32bf16: |
| case MVT::v32f16: |
| case MVT::v32i16: |
| case MVT::v64i8: |
| case MVT::v128i4: |
| case MVT::v256i2: |
| case MVT::v512i1: |
| case MVT::v1024i32: |
| case MVT::v1024f32: |
| assert(ST->hasVPU()); |
| if (IsBC && TPU::VAGGRegClass.contains(VA.getLocReg())) { |
| assert(!ST->isTPUABIEnabled()); |
| VirtReg = RegInfo.createVirtualRegister(&TPU::VAGGRegClass); |
| } else { |
| VirtReg = RegInfo.createVirtualRegister(&TPU::VPRRegClass); |
| } |
| break; |
| case MVT::v64i1: |
| assert(ST->hasVPU()); |
| if (ST->hasV8()) |
| llvm_unreachable("Unexpected mask type."); |
| VirtReg = RegInfo.createVirtualRegister(&TPU::MPRRegClass); |
| break; |
| case MVT::v16i1: |
| assert(ST->hasVPU()); |
| VirtReg = RegInfo.createVirtualRegister(&TPU::MPRRegClass); |
| break; |
| case MVT::v32i1: |
| assert(ST->hasVPU()); |
| if (ST->hasV8() && !HasLPGL) |
| llvm_unreachable("Needs +lp."); |
| VirtReg = RegInfo.createVirtualRegister(&TPU::MPRRegClass); |
| break; |
| case MVT::v8i1: |
| case MVT::v1024i1: |
| assert(ST->hasVPU()); |
| VirtReg = RegInfo.createVirtualRegister(&TPU::MPRRegClass); |
| break; |
| } |
| if (VA.isRegLoc()) { |
| OrigArgToRegLoc[Ins[i].getOrigArgIndex()].push_back(VA.getLocReg()); |
| RegInfo.addLiveIn(VA.getLocReg(), VirtReg); |
| InVals.push_back(DAG.getCopyFromReg(Chain, DL, VirtReg, VT)); |
| } else { // VA.isRegLoc() |
| assert(VA.isMemLoc()); |
| assert(!VA.needsCustom()); |
| MachineFunction &MF = DAG.getMachineFunction(); |
| unsigned LocMemOffset = VA.getLocMemOffset(); |
| // In order to make it easier for the callee, the stack pointer in the |
| // caller is incremented such that it points to a free slot in the callee |
| // for the return address. Adjust the argument offsets here accordingly. |
| if (!VA.getValVT().isVector()) |
| LocMemOffset += ST->scalarSizeInBytes(); |
| unsigned AdjustedLocMemOffset = |
| TPU::adjustForWordSize( |
| APInt(32, LocMemOffset), |
| VA.getValVT().isVector() ? TPUAS_TileSpmem : TPUAS_Smem, *ST) |
| .getZExtValue(); |
| MachineFrameInfo &MFI = MF.getFrameInfo(); |
| EVT ArgVT = Ins[i].ArgVT; |
| int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), AdjustedLocMemOffset, |
| /*IsImmutable=*/false); |
| auto PtrVT = getPointerTy(DAG.getDataLayout()); |
| unsigned Opcode; |
| SDValue StackPtr; |
| if (isMaskVT(VA.getValVT(), *ST)) { |
| assert(ST->hasVPU()); |
| Opcode = TPU::RESTORE_MPRs; |
| StackPtr = DAG.getRegister(TPU::FPV, MVT::i32); |
| } else if (VA.getValVT().isVector()) { |
| assert(ST->hasVPU()); |
| Opcode = TPU::RESTORE_VPRs; |
| StackPtr = DAG.getRegister(TPU::FPV, MVT::i32); |
| } else { |
| Opcode = TPU::RESTORE_GPRs; |
| StackPtr = DAG.getRegister(TPU::FPS, MVT::i32); |
| } |
| SmallVector<SDValue, 8> Ops; |
| SDValue TFI = DAG.getTargetFrameIndex(FI, PtrVT); |
| auto PredReg = DAG.getRegister(TPU::Palways, MVT::i1); |
| auto PredInvert = DAG.getTargetConstant(APInt(1, 0), DL, MVT::i1); |
| Ops.push_back(StackPtr); |
| Ops.push_back(TFI); |
| Ops.push_back(PredReg); |
| Ops.push_back(PredInvert); |
| MVT ValVT = VA.getValVT(); |
| MachineSDNode *MN = DAG.getMachineNode( |
| Opcode, DL, isMaskVT(ValVT, *ST) ? VMNI1 : ValVT, Ops); |
| auto *MemRef = DAG.getMachineFunction().getMachineMemOperand( |
| MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), |
| MachineMemOperand::MOLoad, /*s=*/4, |
| /*base_alignment=*/llvm::Align(4)); |
| DAG.setNodeMemRefs(MN, {MemRef}); |
| SDValue Arg = SDValue(MN, 0); |
| InVals.push_back(Arg); |
| } |
| } |
| |
| if (IsBC) { |
| // On BarnaCore, we obtain aggregates as function inputs and refer to them |
| // by their base register throughout the function. We need to block the |
| // register allocator from clobbering them. Aggregates are identified by |
| // multiple registers having the same input argument index. |
| TPUMachineFunctionInfo &MFInfo = *MF.getInfo<TPUMachineFunctionInfo>(); |
| for (auto &Range : OrigArgToRegLoc) { |
| if (Range.second.size() == 1) |
| continue; |
| // Note that we rely on the range already being sorted from above. |
| MFInfo.addBarnaCoreAggregateRange(Range.second.front() - TPU::VAGG0, |
| Range.second.back() - TPU::VAGG0 + 1); |
| } |
| } |
| return Chain; |
| } |
| |
| SDValue |
| TPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, |
| bool IsVarArg, |
| const SmallVectorImpl<ISD::OutputArg> &Outs, |
| const SmallVectorImpl<SDValue> &OutVals, |
| const SDLoc &DL, SelectionDAG &DAG) const { |
| // CCValAssign - represent the assignment of the return value to a location |
| SmallVector<CCValAssign, 16> RVLocs; |
| MachineFunction &MF = DAG.getMachineFunction(); |
| |
| // CCState - Info about the registers and stack slot. |
| CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext()); |
| |
| // Analyze return values. |
| CCInfo.AnalyzeReturn(Outs, RetCC_TPU); |
| SmallVector<SDValue, 4> RetOps(1, Chain); |
| |
| // Copy the result values into the output registers. |
| for (unsigned i = 0; i != RVLocs.size(); ++i) { |
| CCValAssign &VA = RVLocs[i]; |
| // FIXME(b/237788792): Finalize return ABI. |
| assert(VA.isRegLoc() && "Can only return in registers!"); |
| assert(!VA.needsCustom()); |
| Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVals[i], Chain); |
| RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); |
| } |
| |
| RetOps[0] = Chain; // Update chain |
| |
| // We're checking the call graph here and setting whether or not a function is |
| // an entry function. At least on our system, this is good enough. |
| TPUMachineFunctionInfo &MFInfo = *MF.getInfo<TPUMachineFunctionInfo>(); |
| // Ugly cast, CallGraph should really take a const Module. FIXME(hgreving): |
| // maybe try to change upstream. The cast here is safe because nobody will |
| // change the Module. |
| CallGraph CG(*const_cast<Module *>(MF.getMMI().getModule())); |
| const CallGraphNode *CGN = CG[&MF.getFunction()]; |
| // There's always at least one null node referencing the function. |
| if (CGN->getNumReferences() == 1) |
| MFInfo.setIsTopLevel(true); |
| else |
| MFInfo.setIsTopLevel(false); |
| |
| if (!ST->isTPUABIEnabled() || MFInfo.isTopLevel()) |
| return DAG.getNode(TPUISD::HALT, DL, MVT::Other, |
| ArrayRef<SDValue>(&RetOps[0], RetOps.size())); |
| return DAG.getNode(TPUISD::RET, DL, MVT::Other, |
| ArrayRef<SDValue>(&RetOps[0], RetOps.size())); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Custom Lowerings |
| //===----------------------------------------------------------------------===// |
| |
| SDValue TPUTargetLowering::PerformSCALAR_TO_VECTORCombine( |
| SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { |
| const SDValue &Val = N->getOperand(0); |
| MVT VecVT = N->getSimpleValueType(0); |
| |
| return DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, Val); |
| } |
| |
| SDValue TPUTargetLowering::PerformINSERT_VECTOR_ELTCombine( |
| SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { |
| const SDValue &Vec = N->getOperand(0); |
| const SDValue &Val = N->getOperand(1); |
| |
| auto PredReg = DCI.DAG.getRegister(TPU::Palways, MVT::i1); |
| auto PredInvert = DCI.DAG.getTargetConstant(APInt(1, 0), SDLoc(N), MVT::i1); |
| MVT VecVT = N->getSimpleValueType(0); |
| |
| SDValue SplatVal = DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, Val); |
| |
| SmallVector<SDValue, 8> Ops; |
| SDValue Vseq = SDValue(DCI.DAG.getMachineNode(TPU::VLANESEQ, SDLoc(N), VNI32, |
| PredReg, PredInvert), |
| 0); |
| Ops.push_back(Vseq); |
| |
| SDValue Mask; |
| if (const ConstantSDNode *Idx = |
| dyn_cast<ConstantSDNode>(N->getOperand(2).getNode())) { |
| Ops.push_back(DCI.DAG.getTargetConstant(*Idx->getConstantIntValue(), |
| SDLoc(N), MVT::i32)); |
| Ops.push_back(PredReg); |
| Ops.push_back(PredInvert); |
| Mask = |
| SDValue(DCI.DAG.getMachineNode(TPU::VMLANEi, SDLoc(N), VMNI1, Ops), 0); |
| } else { |
| Ops.push_back(SDValue(cast<SDNode>(N->getOperand(2).getNode()), 0)); |
| Ops.push_back(PredReg); |
| Ops.push_back(PredInvert); |
| Mask = |
| SDValue(DCI.DAG.getMachineNode(TPU::VMLANEr, SDLoc(N), VMNI1, Ops), 0); |
| } |
| return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), VecVT, Mask, SplatVal, Vec); |
| } |
| |
| bool TPUTargetLowering::isNonNaNFPConstSplat(SDValue N) const { |
| if (N->getOpcode() == TPUISD::SPLAT) { |
| if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) |
| return !CN->isNaN(); |
| } |
| return false; |
| } |
| |
| EVT TPUTargetLowering::getOptimalMemOpType( |
| const MemOp &Op, const AttributeList &FuncAttributes) const { |
| // We're returning something that makes sense, though it is useless since we |
| // neither know the memory space, nor can we let SelectionDAG do the LLVM |
| // MemOp lowering. See header file for explanation. |
| return VNI32; |
| } |
| |
| SDValue TPUTargetLowering::PerformSETCCCombine( |
| SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { |
| // We help the DAG combiner by recognizing ordered setcc of splats that can't |
| // be NaN. LLVM can do that for BUILD_VECTOR, but we combine early into SPLAT, |
| // hence this code. |
| if (!isNonNaNFPConstSplat(N->getOperand(0)) || |
| !isNonNaNFPConstSplat(N->getOperand(1))) |
| return SDValue(); |
| ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); |
| // TODO(hgreving): what about SETO? |
| ISD::CondCode NoNaNCC = getFCmpCodeWithoutNaN(CC); |
| if (NoNaNCC != CC) |
| return DCI.DAG.getSetCC(SDLoc(N), N->getSimpleValueType(0), |
| N->getOperand(0), N->getOperand(1), NoNaNCC); |
| return SDValue(); |
| } |
| |
| SDValue TPUTargetLowering::getSupportedVCMask(SelectionDAG &DAG, int VectorMask, |
| SDLoc Loc) const { |
| if (!ST->hasVCMasks() || !GenerateTpuVCMasks) |
| return SDValue(); |
| int MaskSizeInBits = EVT(VMNI1).getSizeInBits(); |
| int FullMask = (1 << MaskSizeInBits) - 1; |
| // Technically `< MaskSizeInBits` would be enough because a full mask should |
| // be covered by embedded masks. |
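| // The loop below matches VectorMask against every contiguous run of i ones |
| // rotated left by j lanes. For example, with an 8-lane mask, 0b00111100 |
| // matches i = 4, j = 2 and is encoded as start S = 8, end E = 23 in the |
| // VCMASKi immediate (E << 8 | S). |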
| for (int i = 1; i <= MaskSizeInBits; i++) { |
| int CompareMask = (1 << i) - 1; |
| for (int j = 0; j < MaskSizeInBits; j++) { |
| int RotCompareMask = |
| (CompareMask << j | CompareMask >> (MaskSizeInBits - j)) & FullMask; |
| if (VectorMask == RotCompareMask) { |
| int S = j * 4; |
| int E = ((i + j - 1) % MaskSizeInBits) * 4 + 3; |
| assert(S < EVT(VMNI1).getSizeInBits() * 4); |
| assert(E < EVT(VMNI1).getSizeInBits() * 4); |
| auto PredReg = DAG.getRegister(TPU::Palways, MVT::i1); |
| auto PredInvert = DAG.getTargetConstant(APInt(1, 0), Loc, MVT::i1); |
| return SDValue( |
| DAG.getMachineNode( |
| TPU::VCMASKi, Loc, VMNI1, |
| DAG.getTargetConstant(APInt(32, E << 8 | S), Loc, MVT::i32), |
| PredReg, PredInvert), |
| 0); |
| } |
| } |
| } |
| return SDValue(); |
| } |
| |
| SDValue TPUTargetLowering::getSupportedVCMask(SelectionDAG &DAG, |
| SDNode *N) const { |
| if (!ST->hasVCMasks() || !GenerateTpuVCMasks) |
| return SDValue(); |
| int MaskSizeInBits = EVT(VMNI1).getSizeInBits(); |
| if (N->getNumOperands() != MaskSizeInBits) |
| return SDValue(); |
| int BuildVectorMask = 0; |
| for (int i = 0; i < MaskSizeInBits; i++) { |
| ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i)); |
| if (C == nullptr) |
| return SDValue(); |
| BuildVectorMask |= C->getZExtValue() << i; |
| } |
| return getSupportedVCMask(DAG, BuildVectorMask, SDLoc(N)); |
| } |
| |
| SDValue TPUTargetLowering::PerformBUILD_VECTORCombine( |
| SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { |
| // Combine a BUILD_VECTOR(42, 42, 42, 42, ...) -> SPLAT(42) |
| MVT VecVT = N->getSimpleValueType(0); |
| if (VecVT != VNI32 && VecVT != VNF32 && VecVT != VMNI1) { |
| if (!HasLPGL) |
| return SDValue(); |
| if (VecVT != VNBF16 && VecVT != VNI8) |
| return SDValue(); |
| } |
| MVT ScalarVT; |
| if (VecVT == VNI32) |
| ScalarVT = MVT::i32; |
| else if (VecVT == VNF32) |
| ScalarVT = MVT::f32; |
| else if (VecVT == VNBF16) |
| ScalarVT = MVT::bf16; |
| else if (VecVT == VNI8) |
| ScalarVT = MVT::i8; |
| else if (VecVT == VMNI1) |
| // Low precision build_vector masks are currently not supported. |
| ScalarVT = MVT::i1; |
| else |
| llvm_unreachable("Bad vector ty!"); |
| |
| // Checking for supported embedded hardware masks. I would have preferred to |
| // do this in tablegen, and this would be possible with something like this: |
| // |
| // def tpuvm17 : PatLeaf<(build_vector), [{ |
| // return isMask7f(N); |
| // }]>; |
| // |
| // let Predicates = [HasV8,NotBC] in { |
| // def : Pat<(vNi1 (Splat -1)), (COPY !cast<TPUReg>("M16"))>; |
| // def : Pat<(vNi1 (tpuvm17)), (COPY !cast<TPUReg>("M17"))>; |
| // |
| // However, since we already combine BUILD_VECTOR here, we would have to check |
| // for the embedded masks here anyway and potentially bail out of the combine. |
| // Additionally, it is harder to turn on/off the feature in tablegen. Lastly, |
| // we may run into cases with instructions not supporting the special mask, in |
| // which case we probably want to legalize them, and this will be easier if we |
| // combine the hardware mask here. All of the above is the reason why the code |
| // is here, and not in tablegen. |
| // |
| if (ScalarVT == MVT::i1) { |
| Register EmbeddedMask = getSupportedEmbeddedMask(N); |
| if (EmbeddedMask != TPU::NoRegister) |
| return DCI.DAG.getCopyFromReg(DCI.DAG.getEntryNode(), SDLoc(N), |
| EmbeddedMask, VMNI1); |
| SDValue VMCreate = getSupportedVCMask(DCI.DAG, N); |
| if (VMCreate.getNode()) |
| return VMCreate; |
| } |
| |
| unsigned VecSize = MVT(VecVT).getVectorNumElements(); |
| bool IsSplat = true; |
| bool IsVlaneSeq = true; |
| assert(N->getNumOperands() == VecSize); |
| SDValue Val0 = N->getOperand(0); |
| int IC = -1; |
| if (Val0.getSimpleValueType() != ScalarVT) |
| return SDValue(); |
| for (unsigned I = 0; I < VecSize; ++I) { |
| if (N->getOperand(I) != Val0 && !N->getOperand(I).isUndef()) |
| IsSplat = false; |
| ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(N->getOperand(I)); |
| if (!ValC) { |
| IsVlaneSeq = false; |
| continue; |
| } |
| if (ValC->getZExtValue() != IC++ + 1) |
| IsVlaneSeq = false; |
| if (!IsVlaneSeq && !IsSplat) |
| break; |
| } |
| |
| if (IsSplat) |
| return DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, Val0); |
| |
| auto PredReg = DCI.DAG.getRegister(TPU::Palways, MVT::i1); |
| auto PredInvert = DCI.DAG.getTargetConstant(APInt(1, 0), SDLoc(N), MVT::i1); |
| |
| if (IsVlaneSeq) |
| return SDValue(DCI.DAG.getMachineNode(TPU::VLANESEQ, SDLoc(N), VNI32, |
| PredReg, PredInvert), |
| 0); |
| |
| // BUILD_VECTOR(a, b, c, d, ...) -> VSEL(Splat(a), ...) |
| // This is really ugly but is the only way :( |
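| // Illustrative sketch: BUILD_VECTOR(a, b, c, d) on a hypothetical 4-lane |
| // vector becomes V = splat(d), then each remaining lane is selected in via |
| // a single-lane mask: V = vselect(lane == 0, splat(a), V), and so on for b |
| // and c. |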
| |
| // Pick an initial splat value. |
| SDValue InitialSplatted = N->getOperand(VecSize - 1); |
| SDValue V = DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, InitialSplatted); |
| for (unsigned I = 0; I < VecSize; ++I) { |
| if (N->getOperand(I)->isUndef() || N->getOperand(I) == InitialSplatted) |
| continue; |
| SDValue SplatVal = |
| DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, N->getOperand(I)); |
| |
| SDValue VMCreate = getSupportedVCMask(DCI.DAG, 1 << I, SDLoc(N)); |
| SDValue Mask; |
| if (VMCreate.getNode()) { |
| Mask = VMCreate; |
| } else { |
| SmallVector<SDValue, 8> Ops; |
| SDValue Vseq = SDValue(DCI.DAG.getMachineNode(TPU::VLANESEQ, SDLoc(N), |
| VNI32, PredReg, PredInvert), |
| 0); |
| Ops.push_back(Vseq); |
| Ops.push_back(DCI.DAG.getTargetConstant(I, SDLoc(N), MVT::i32)); |
| Ops.push_back(PredReg); |
| Ops.push_back(PredInvert); |
| Mask = SDValue(DCI.DAG.getMachineNode(TPU::VMLANEi, SDLoc(N), VMNI1, Ops), |
| 0); |
| } |
| |
| // And use that mask to select-in this value. |
| V = DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), VecVT, Mask, SplatVal, V); |
| } |
| return V; |
| } |
| |
| SDValue TPUTargetLowering::PerformVECTOR_SHUFFLECombine( |
| ShuffleVectorSDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { |
| // Combine a VECTOR_SHUFFLE(1, 2, 3, 4, 5, 6, 7, 0) -> VROTDOWN() |
| // or VECTOR_SHUFFLE(VECTOR_INSERT(x,y, n), n, n, ...) -> VSPLAT(y) |
| // or VECTOR_SHUFFLE(x, x, x, x, x, x, x, x) -> VSPLAT(VROTDOWN()) |
| |
| MVT VecVT = N->getSimpleValueType(0); |
| if (VecVT != VNI32 && VecVT != VNF32 && VecVT != VMNI1) |
| return SDValue(); |
| assert(N->getNumOperands() == 2); |
| SDValue Val = N->getOperand(0); |
| |
| unsigned VecSize = MVT(VecVT).getVectorNumElements(); |
| bool IsSequence = true; |
| bool IsSame = true; |
| unsigned Offset = N->getMaskElt(0); |
| for (unsigned I = 0; I < VecSize; ++I) { |
| if (N->getMaskElt(I) != (I + Offset) % VecSize) |
| IsSequence = false; |
| if (N->getMaskElt(I) != Offset) |
| IsSame = false; |
| } |
| |
| bool NeedsTrunc = false; |
| if (VecVT == VMNI1) { |
| Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VNI32, Val); |
| VecVT = VNI32; |
| NeedsTrunc = true; |
| } |
| |
| // Helper function to truncate the result if we performed extension of the |
| // operation from i1. |
| auto TruncateReturnIfNeed = [&](SDValue V) { |
| if (NeedsTrunc) |
| return DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), VMNI1, V); |
| return V; |
| }; |
| |
| if (IsSequence && ST->isSparseCore()) |
| return TruncateReturnIfNeed( |
| DCI.DAG.getNode(TPUISD::VROTDOWN, SDLoc(N), VecVT, Val, |
| DCI.DAG.getConstant(Offset, SDLoc(N), MVT::i32))); |
| |
| if (!IsSame && ST->isSparseCore()) { |
| // SparseCore has a vector permute that permutes the elements into all lanes |
| // of a vector based on a vector mask. |
| SmallVector<SDValue, 8> MaskElements; |
| for (int El : N->getMask()) |
| MaskElements.push_back(DCI.DAG.getConstant(El, SDLoc(N), MVT::i32)); |
| SDValue VMask = |
| DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VNI32, MaskElements); |
| return TruncateReturnIfNeed( |
| DCI.DAG.getNode(TPUISD::VPERMUTE, SDLoc(N), VecVT, Val, VMask)); |
| } |
| |
| if (!IsSame) |
| return SDValue(); |
| |
| // On tensorcore we cannot use rotdown to move any element into lane 0. |
| if (!ST->isSparseCore() && Offset != 0) |
| return SDValue(); |
| |
| MVT ScalarVT = VecVT == VNI32 ? MVT::i32 : MVT::f32; |
| // If the replicated value comes from an insert into the same lane, splat |
| // the originally inserted value directly. |
| if (N->getOperand(0).getOpcode() == ISD::INSERT_VECTOR_ELT) { |
| SDNode *InsertElt = cast<SDNode>(N->getOperand(0)); |
| const ConstantSDNode *Idx = |
| cast<ConstantSDNode>(InsertElt->getOperand(2).getNode()); |
| if (Idx->getConstantIntValue()->getZExtValue() == Offset) { |
| SDValue ExtractedVal = InsertElt->getOperand(1); |
| MVT ExtractedSplatVT = NeedsTrunc ? VMNI1 : VecVT; |
| return DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), ExtractedSplatVT, |
| ExtractedVal); |
| } |
| } |
| if (ST->hasBroadcast()) { |
| // SparseCore has a vector broadcast that broadcasts the element at Offset |
| // into all lanes of a vector without traversing the scalar side. |
| return TruncateReturnIfNeed( |
| DCI.DAG.getNode(TPUISD::VBROADCAST, SDLoc(N), VecVT, Val, |
| DCI.DAG.getConstant(Offset, SDLoc(N), MVT::i32))); |
| } |
| // Extract the splatted value from the vector and re-splat it. |
| // Rotate the vector if the offset is not zero. |
| if (Offset != 0) { |
| Val = DCI.DAG.getNode(TPUISD::VROTDOWN, SDLoc(N), VecVT, Val, |
| DCI.DAG.getConstant(Offset, SDLoc(N), MVT::i32)); |
| } |
| Val = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ScalarVT, Val, |
| DCI.DAG.getConstant(0, SDLoc(N), MVT::i32)); |
| Val = DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, Val); |
| return TruncateReturnIfNeed(Val); |
| } |
| |
| SDValue TPUTargetLowering::PerformVSELECTCombine( |
| SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { |
| if (N->getValueType(0) != VMNI1) |
| return SDValue(); |
| SDValue Cond = N->getOperand(0); |
| SDValue Op1 = N->getOperand(1); |
| SDValue Op2 = N->getOperand(2); |
| if (Op1.getOpcode() == TPUISD::SPLAT && Op2.getOpcode() == TPUISD::SPLAT && |
| isa<ConstantSDNode>(Op1->getOperand(0)) && |
| isa<ConstantSDNode>(Op2->getOperand(0))) { |
| bool TrueVal = cast<ConstantSDNode>(Op1->getOperand(0))->getLimitedValue(); |
| bool FalseVal = cast<ConstantSDNode>(Op2->getOperand(0))->getLimitedValue(); |
| |
| if (TrueVal == FalseVal) |
| // select(C, X, X) -> X |
| return Op1; |
| if (TrueVal == true && FalseVal == false) |
| // select(C, 1, 0) -> C |
| return Cond; |
| assert(TrueVal == false && FalseVal == true); |
| // select(C, 0, 1) -> !C === C xor -1 |
| return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VMNI1, Cond, Op2); |
| } |
| |
| // select(C, X, Y) -> (C & X) | (~C & Y) |
| SDValue CAndX = DCI.DAG.getNode(ISD::AND, SDLoc(N), VMNI1, Cond, Op1); |
| SDValue NotC = DCI.DAG.getNode( |
| ISD::XOR, SDLoc(N), VMNI1, Cond, |
| DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VMNI1, |
| DCI.DAG.getConstant(-1, SDLoc(N), MVT::i1))); |
| SDValue NotCAndY = DCI.DAG.getNode(ISD::AND, SDLoc(N), VMNI1, NotC, Op2); |
| return DCI.DAG.getNode(ISD::OR, SDLoc(N), VMNI1, CAndX, NotCAndY); |
| } |
| |
| SDValue TPUTargetLowering::PerformBcInsertValueCombine( |
| SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { |
| // Combine llvm.tpu.bc.insertvalue.loopindex -> BC_INSERTVALUE. |
| // The intrinsic takes an array and returns an array. This is lowered to |
| // %a = merge_values a0,a1,a2,...,an-1 |
| // %b1,b2,...,bn-1 = @llvm.tpu.bc.insertvalue.loopindex %a, %c |
| // |
| // We don't care about the values of any physical registers. We've already |
| // reserved a block of registers for this aggregate, all we need to do is |
| // keep the zeroth register to plumb through as the base value. |
| // |
| // Here we replace the intrinsic with a BC_INSERTVALUE of the base register |
| // and a MERGE_VALUES result, with the base register in value 0 and the rest |
| // UNDEF. The optimizer will then clean things up. |
| |
| SDLoc DL(N); |
| SDValue BaseReg = N->getOperand(1); |
| SDValue InsertedValue = N->getOperand(2); |
| EVT VT = BaseReg.getValueType(); |
| SDValue NewN = |
| DCI.DAG.getNode(TPUISD::BC_INSERTVALUE, DL, VT, BaseReg, InsertedValue); |
| SmallVector<SDValue, 4> Vs(N->getNumValues(), DCI.DAG.getUNDEF(VT)); |
| Vs[0] = NewN; |
| return DCI.DAG.getMergeValues(Vs, DL); |
| } |
| |
| SDValue TPUTargetLowering::PerformBcExtractValueCombine( |
| SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { |
| // Combine llvm.tpu.bc.extractvalue.loopindex -> BC_EXTRACTVALUE. |
| // The intrinsic takes an array and returns a vector. This is lowered to |
| // %a = merge_values a0,a1,a2,...,an-1 |
| // %b:v8f32 = @llvm.tpu.bc.extractvalue.loopindex %a |
| // |
| // We don't care about the values of any physical registers. We've already |
| // reserved a block of registers for this aggregate; all we need to do is |
| // keep the zeroth register to plumb through as the base value. |
| // |
| // We're already accessing MERGE_VALUES:0, so just rewrite in place. |
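| // Operand 0 of the intrinsic node is the intrinsic ID; operand 1 is value 0 |
| // of the MERGE_VALUES, i.e. the base register of the reserved block. |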
| |
| SDLoc DL(N); |
| SDValue BaseReg = N->getOperand(1); |
| EVT VT = BaseReg.getValueType(); |
| return DCI.DAG.getNode(TPUISD::BC_EXTRACTVALUE, DL, VT, BaseReg); |
| } |
| |
| SDValue TPUTargetLowering::PerformPtrToIntCombine(SDNode *N) const { |
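| // Pointers and integers share the same representation on TPU, so |
| // llvm.tpu.inttoptr / llvm.tpu.ptrtoint are no-ops at this level; simply |
| // forward the value operand (operand 0 is the intrinsic ID). |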
| return N->getOperand(1); |
| } |
| |
| const char *TPUTargetLowering::getTargetNodeName(unsigned Opcode) const { |
| switch (Opcode) { |
| default: |
| return "<TPU unknown opcode>"; |
| case TPUISD::HALT: |
| return "TPUISD::HALT"; |
| case TPUISD::VROTDOWN: |
| return "TPUISD::VROTDOWN"; |
| case TPUISD::VBROADCAST: |
| return "TPUISD::VBROADCAST"; |
| case TPUISD::VPERMUTE: |
| return "TPUISD::VPERMUTE"; |
| case TPUISD::SPLAT: |
| return "TPUISD::SPLAT"; |
| case TPUISD::WRAPPER: |
| return "TPUISD::WRAPPER"; |
| case TPUISD::BC_INSERTVALUE: |
| return "TPUISD::BC_INSERTVALUE"; |
| case TPUISD::BC_EXTRACTVALUE: |
| return "TPUISD::BC_EXTRACTVALUE"; |
| case TPUISD::UMUL24: |
| return "TPUISD::UMUL24"; |
| case TPUISD::CALL: |
| return "TPUISD::CALL"; |
| } |
| } |
| |
| SDValue TPUTargetLowering::PerformDAGCombine(SDNode *N, |
| DAGCombinerInfo &DCI) const { |
| switch (N->getOpcode()) { |
| case ISD::BUILD_VECTOR: |
| return PerformBUILD_VECTORCombine(N, DCI); |
| case ISD::VECTOR_SHUFFLE: |
| return PerformVECTOR_SHUFFLECombine(cast<ShuffleVectorSDNode>(N), DCI); |
| case ISD::INSERT_VECTOR_ELT: |
| return PerformINSERT_VECTOR_ELTCombine(N, DCI); |
| case ISD::SCALAR_TO_VECTOR: |
| return PerformSCALAR_TO_VECTORCombine(N, DCI); |
| case ISD::VSELECT: |
| return PerformVSELECTCombine(N, DCI); |
| case ISD::INTRINSIC_WO_CHAIN: |
| switch (N->getConstantOperandVal(0)) { |
| default: |
| return SDValue(); |
| case Intrinsic::tpu_bc_insertvalue_loopindex: |
| return PerformBcInsertValueCombine(N, DCI); |
| case Intrinsic::tpu_bc_extractvalue_loopindex: |
| return PerformBcExtractValueCombine(N, DCI); |
| case Intrinsic::tpu_inttoptr: |
| case Intrinsic::tpu_ptrtoint: |
| return PerformPtrToIntCombine(N); |
| } |
| case ISD::SETCC: |
| return PerformSETCCCombine(N, DCI); |
| default: |
| break; |
| } |
| |
| return SDValue(); |
| } |
| |
| std::optional<bool> |
| TPUTargetLowering::IsFifoAccess(MachineInstr &MI, |
| const TargetRegisterClass *RegClass) const { |
| const MCInstrDesc &MCID = TII->get(MI.getOpcode()); |
| for (auto I = MCID.opInfo_begin(), IE = MCID.opInfo_end(); I != IE; I++) { |
| if (I->RegClass == RegClass->getID()) { |
| // For a push instruction the destination register (the first operand info) |
| // matches the given reg class; for a pop instruction one of the source |
| // operands matches it. Return true for pops and false for pushes. |
| return I != MCID.opInfo_begin(); |
| } |
| } |
| return std::nullopt; |
| } |
| |
| bool TPUTargetLowering::UsesSpecialReg( |
| MachineInstr &MI, const TargetRegisterClass *RegClass) const { |
| const MCInstrDesc &MCID = TII->get(MI.getOpcode()); |
| for (auto I = MCID.opInfo_begin(), IE = MCID.opInfo_end(); I != IE; I++) { |
| if (I->RegClass == RegClass->getID()) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| Register TPUTargetLowering::getSupportedEmbeddedMask(SDNode *N) const { |
| if (!ST->hasEmbeddedMasks() || !PropagateTpuEmbeddedMasks) |
| return TPU::NoRegister; |
| assert(N->getOpcode() == ISD::BUILD_VECTOR); |
| // See e.g. go/vfc-sc-isa#vector-modify-mask-instructions. |
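| // For example, a BUILD_VECTOR of i1 constants whose three low lanes are 1 |
| // and whose remaining lanes are 0 matches bitmask 0x7 and can be read |
| // directly from M21. |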
| DenseMap<int, Register> SupportedEmbeddedMasks{ |
| {0xff, TPU::M16}, {0x7f, TPU::M17}, {0x3f, TPU::M18}, {0x1f, TPU::M19}, |
| {0xf, TPU::M20}, {0x7, TPU::M21}, {0x3, TPU::M22}, {0x1, TPU::M23}, |
| }; |
| int MaskSizeInBits = EVT(VMNI1).getSizeInBits(); |
| if (N->getNumOperands() != MaskSizeInBits) |
| return TPU::NoRegister; |
| auto MatchesBitMask = [MaskSizeInBits, N](int BitMask) { |
| for (int i = 0; i < MaskSizeInBits; i++) { |
| ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i)); |
| if (C == nullptr) |
| return false; |
| if (C->getZExtValue() != ((BitMask >> i) & 0x1)) |
| return false; |
| } |
| return true; |
| }; |
| for (auto &KV : SupportedEmbeddedMasks) { |
| if (MatchesBitMask(KV.first)) |
| return KV.second; |
| } |
| return TPU::NoRegister; |
| } |
| |
| void TPUTargetLowering::SetDependency(MachineInstr &MI, MachineBasicBlock *MBB, |
| const TargetRegisterClass *RegClass, |
| bool IsPush) const { |
| const TPUTargetMachine &TM = |
| static_cast<const TPUTargetMachine &>(MBB->getParent()->getTarget()); |
| MachinePointerInfo MPI(TM.getFifoPSV(IsPush, RegClass)); |
| auto *MemRef = MBB->getParent()->getMachineMemOperand( |
| MPI, MachineMemOperand::MOLoad | MachineMemOperand::MOStore, /*s=*/4, |
| /*base_alignment=*/llvm::Align(4)); |
| MI.addMemOperand(*MBB->getParent(), MemRef); |
| } |
| |
| // A DWG needs dependencies with all matmuls. |
| // The first matmul after a DWG needs dependencies with all matpushes. |
| // A DWG itself can be re-ordered across matpush instructions. |
| // This function adds the memory operands that enforce this ordering. |
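| // Schematically (illustrative only): |
| //   dwg    ...  ; gets a push dependency on the MRF fifo, so it cannot move |
| //               ; across any matmul |
| //   matmul ...  ; the first matmul gets a dependency on the gsfn fifo, so |
| //               ; matpush instructions cannot move across it |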
| MachineBasicBlock *TPUTargetLowering::SetDWGDep(MachineInstr &MI, |
| MachineBasicBlock *MBB) const { |
| MachineRegisterInfo &RegInfo = MBB->getParent()->getRegInfo(); |
| SmallDenseSet<MachineInstr *, 32> DWGUses; |
| Register Dst = MI.getOperand(0).getReg(); |
| for (MachineInstr &MIUser : RegInfo.use_instructions(Dst)) { |
| assert(MIUser.getParent() == MBB && |
| "matmul uses DWG from a different block; this case is currently not " |
| "supported"); |
| DWGUses.insert(&MIUser); |
| } |
| if (DWGUses.empty()) |
| return MBB; |
| auto E = MBB->end(); |
| MachineInstr *FirstMatMul = nullptr; |
| for (auto I = MI.getIterator(); I != E; I++) { |
| if (DWGUses.count(&(*I)) > 0) { |
| FirstMatMul = &(*I); |
| break; |
| } |
| } |
| assert(FirstMatMul != nullptr && "didn't find any matmul"); |
| // The first MatMul needs to have an explicit dependency with gsfn as it |
| // triggers the copy from gsfn/gsft to gmr. This means the following push |
| // cannot be re-ordered across the first matmul. |
| const TargetRegisterClass *GSFNRegClass = |
| RegInfo.getRegClass(MI.getOperand(1).getReg()); |
| SetDependency(*FirstMatMul, MBB, GSFNRegClass); |
| // DWG cannot be re-ordered across any matmul instruction so add a dependency |
| // to push MRF to represent that. |
| const TargetRegisterClass *MRFRegClass = |
| RegInfo.getRegClass(FirstMatMul->getOperand(0).getReg()); |
| SetDependency(MI, MBB, MRFRegClass, /*IsPush=*/true); |
| return MBB; |
| } |
| |
| MachineBasicBlock * |
| TPUTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, |
| MachineBasicBlock *MBB) const { |
| // Generic handling of instructions that need dependencies to be set. |
| if (static_cast<const TPUInstrInfo *>(TII)->isDWGInst(MI)) { |
| return SetDWGDep(MI, MBB); |
| } |
| bool IsSpecialRegAccess = false; |
| for (auto Fifo : FifoClasses) { |
| if (auto IsPop = IsFifoAccess(MI, Fifo)) { |
| SetDependency(MI, MBB, Fifo, !*IsPop); |
| IsSpecialRegAccess = true; |
| } |
| } |
| for (auto ImplicitReg : SpecialStagingReg) { |
| if (UsesSpecialReg(MI, ImplicitReg)) { |
| SetDependency(MI, MBB, ImplicitReg); |
| IsSpecialRegAccess = true; |
| } |
| } |
| // Instructions with special register accesses only need to be modified to |
| // have an extra pseudo source. |
| if (IsSpecialRegAccess) |
| return MBB; |
| |
| auto &ST = MI.getMF()->getSubtarget<TPUSubtarget>(); |
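| // Reads that come back through a fifo are popped from the V2SF register |
| // class by default, or from SFRF on VFC tensor cores. |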
| unsigned PopOpcode = TPU::SPOP_V2SF; |
| const TargetRegisterClass *RegClass = &TPU::V2SFPRRegClass; |
| if (ST.hasVfcTensorCore()) { |
| PopOpcode = TPU::SPOP_SFRF; |
| RegClass = &TPU::SFRFPRRegClass; |
| } |
| |
| switch (MI.getOpcode()) { |
| default: |
| llvm_unreachable("Unknown instruction for custom emission!"); |
| case TPU::VROTDOWNri: |
| return EmitVROTDOWN(MI, MBB); |
| case TPU::VFREADi: |
| return EmitVecOrSFlagToScalar( |
| MI, MBB, ST.hasVfcTensorCore() ? TPU::tcvfVSYNCMOVEi : TPU::VSYNCMOVEi, |
| 1, PopOpcode, RegClass); |
| case TPU::VFREADr: |
| return EmitVecOrSFlagToScalar( |
| MI, MBB, ST.hasVfcTensorCore() ? TPU::tcvfVSYNCMOVEr : TPU::VSYNCMOVEr, |
| 1, PopOpcode, RegClass); |
| case TPU::VFREADDONEi: |
| return EmitVecOrSFlagToScalar( |
| MI, MBB, |
| ST.hasVfcTensorCore() ? TPU::tcvfVSYNCMOVEDONEi : TPU::VSYNCMOVEDONEi, |
| 1, PopOpcode, RegClass); |
| case TPU::VFREADDONEr: |
| return EmitVecOrSFlagToScalar( |
| MI, MBB, |
| ST.hasVfcTensorCore() ? TPU::tcvfVSYNCMOVEDONEr : TPU::VSYNCMOVEDONEr, |
| 1, PopOpcode, RegClass); |
| case TPU::VFREADPAi: |
| return EmitVecOrSFlagToScalar(MI, MBB, TPU::tcvfVSYNCMOVEPAi, 1, PopOpcode, |
| RegClass); |
| case TPU::VFREADPAr: |
| return EmitVecOrSFlagToScalar(MI, MBB, TPU::tcvfVSYNCMOVEPAr, 1, PopOpcode, |
| RegClass); |
| case TPU::VREAD: |
| assert(!IsSC); |
| return EmitVecOrSFlagToScalar(MI, MBB, TPU::VPUSH, 1, TPU::SPOP_V2SF, |
| &TPU::V2SFPRRegClass); |
| case TPU::scVREADi: |
| assert(IsSC); |
| return EmitVecOrSFlagToScalar(MI, MBB, TPU::scVPUSHi, 2, PopOpcode, |
| RegClass); |
| case TPU::scVREADr: |
| assert(IsSC); |
| return EmitVecOrSFlagToScalar(MI, MBB, TPU::scVPUSHr, 2, PopOpcode, |
| RegClass); |
| case TPU::VMREAD: |
| return EmitVmread(MI, MBB); |
| } |
| } |
| |
| MachineBasicBlock *TPUTargetLowering::EmitVecOrSFlagToScalar( |
| MachineInstr &MI, MachineBasicBlock *MBB, unsigned PushOpcode, |
| int NumOfInputs, unsigned PopOpcode, |
| const TargetRegisterClass *RegClass) const { |
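| // Expand a vector/sflag read into a push onto the fifo (PushOpcode) followed |
| // by a pop (PopOpcode) into the scalar destination. Both instructions get a |
| // memory operand on the fifo pseudo-source value so later passes keep them |
| // ordered with respect to other fifo accesses. |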
| auto &MRI = MBB->getParent()->getRegInfo(); |
| auto InsertPt = MI.getIterator(); |
| |
| const unsigned FifoReg = MRI.createVirtualRegister(RegClass); |
| MachineInstrBuilder MIB = |
| BuildMI(*MBB, InsertPt, MI.getDebugLoc(), TII->get(PushOpcode), FifoReg); |
| for (int i = 1; i <= NumOfInputs; i++) |
| MIB.add(MI.getOperand(i)); |
| MachineInstr *Push = AddDefaultPred(MIB); |
| MachineInstr *Pop = |
| AddDefaultPred(BuildMI(*MBB, InsertPt, MI.getDebugLoc(), |
| TII->get(PopOpcode), MI.getOperand(0).getReg()) |
| .addReg(FifoReg, getKillRegState(true))); |
| MI.eraseFromParent(); |
| |
| for (auto &I : {Push, Pop}) { |
| const TPUTargetMachine &TM = |
| static_cast<const TPUTargetMachine &>(MBB->getParent()->getTarget()); |
| MachinePointerInfo MPI(TM.getFifoPSV(I == Push, RegClass)); |
| auto *MemRef = MBB->getParent()->getMachineMemOperand( |
| MPI, MachineMemOperand::MOLoad | MachineMemOperand::MOStore, /*s=*/4, |
| /*base_alignment=*/llvm::Align(4)); |
| I->addMemOperand(*MBB->getParent(), MemRef); |
| } |
| return MBB; |
| } |
| |
| MachineBasicBlock *TPUTargetLowering::EmitVmread(MachineInstr &MI, |
| MachineBasicBlock *MBB) const { |
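| // Expand VMREAD: materialize the mask operand as a vector of 1s and 0s by |
| // selecting the immediate 1 against a zero vector under the mask. |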
| auto &MRI = MBB->getParent()->getRegInfo(); |
| auto InsertPt = MI.getIterator(); |
| |
| unsigned ZeroReg = MRI.createVirtualRegister(&TPU::VPRRegClass); |
| AddDefaultPred(BuildMI(*MBB, InsertPt, MI.getDebugLoc(), |
| TII->get(TPU::VIMMI), ZeroReg) |
| .addImm(0)); |
| AddDefaultPred(BuildMI(*MBB, InsertPt, MI.getDebugLoc(), |
| TII->get(TPU::VSELir), |
| MI.getOperand(0).getReg()) |
| .add(MI.getOperand(1)) |
| .addImm(1) |
| .addReg(ZeroReg)); |
| MI.eraseFromParent(); |
| return MBB; |
| } |
| |
| MachineBasicBlock * |
| TPUTargetLowering::EmitVROTDOWN(MachineInstr &MI, |
| MachineBasicBlock *MBB) const { |
| // Emit VROTDOWNri as a sequence of N VROTDOWNr's. |
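| // For example, a rotate-down by 3 becomes three chained VROTDOWNr |
| // instructions, and a rotate by 0 degenerates into a plain COPY. |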
| auto &MRI = MBB->getParent()->getRegInfo(); |
| |
| unsigned Imm = MI.getOperand(2).getImm(); |
| auto OpReg = MI.getOperand(1).getReg(); |
| auto FinalReg = MI.getOperand(0).getReg(); |
| auto InsertPt = MI.getIterator(); |
| if (Imm == 0) { |
| BuildMI(*MBB, InsertPt, MI.getDebugLoc(), TII->get(TPU::COPY), FinalReg) |
| .addReg(OpReg); |
| MI.eraseFromParent(); |
| return MBB; |
| } |
| |
| // TODO(hgreving): Sparsecore and Viperfish should be able to use |
| // one xlane instruction. |
| MachineInstr *TheMI = &MI; |
| for (unsigned I = 0; I < Imm; ++I) { |
| unsigned OutReg = (I == (Imm - 1)) |
| ? FinalReg |
| : MRI.createVirtualRegister(&TPU::VPRRegClass); |
| TheMI = AddDefaultPred( |
| BuildMI(*MBB, InsertPt, MI.getDebugLoc(), |
| TII->get(TPU::VROTDOWNr), OutReg) |
| .addReg(OpReg, getKillRegState(true))); |
| OpReg = OutReg; |
| } |
| MI.eraseFromParent(); |
| |
| return MBB; |
| } |
| |
| bool TPUTargetLowering::allowsMemoryAccess(LLVMContext &Context, |
| const DataLayout &DL, EVT VT, |
| unsigned AddrSpace, Align Alignment, |
| MachineMemOperand::Flags Flags, |
| unsigned *Fast) const { |
| // Disallow load/store we don't support natively. |
| if (VT != MVT::i32 && VT != MVT::f32 && VT != VNF32 && VT != VNI32) |
| return false; |
| bool Allows = TargetLowering::allowsMemoryAccess(Context, DL, VT, AddrSpace, |
| Alignment, Flags, Fast); |
| if (Allows) |
| *Fast = 1; |
| return Allows; |
| } |
| |
| bool TPUTargetLowering::allowsMisalignedMemoryAccesses( |
| EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, |
| unsigned *Fast) const { |
| // No memory access on TPU requires alignment > 4 bytes. |
| return Alignment >= Align(4); |
| } |
| |
| bool TPUTargetLowering::allowsMisalignedMemoryAccesses( |
| LLT LT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, |
| unsigned *Fast) const { |
| // No memory access on TPU requires alignment > 4 bytes. |
| return Alignment >= Align(4); |
| } |
| |
| void TPUTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, |
| SDNode *Node) const { |
| MachineBasicBlock *MBB = MI.getParent(); |
| const TPUTargetMachine &TM = |
| static_cast<const TPUTargetMachine &>(MBB->getParent()->getTarget()); |
| if (MI.getOpcode() == TPU::INIT_STACK) { |
| // Move stack initialization to the very top of the function. |
| assert(ST->isTPUABIEnabled()); |
| MI.setFlags(MachineInstr::FrameSetup); |
| MI.moveBefore(&*MBB->instr_begin()); |
| return; |
| } |
| if (MI.getOpcode() == TPU::bcVST_concat || |
| MI.getOpcode() == TPU::bcVST_concat_aliaddr) { |
| MachinePointerInfo MPI( |
| TM.getPSV(TPUTargetMachine::PSV_BarnaCore_ConcatReg)); |
| auto *MemRef = MBB->getParent()->getMachineMemOperand( |
| MPI, MachineMemOperand::MOStore, /*s=*/4, /*base_alignment=*/llvm::Align(4)); |
| MI.addMemOperand(*MBB->getParent(), MemRef); |
| return; |
| } |
| if (MI.getOpcode() == TPU::bcVSHIFT || |
| MI.getOpcode() == TPU::bcVSHIFT_aliaddr) { |
| { |
| MachinePointerInfo MPI( |
| TM.getPSV(TPUTargetMachine::PSV_BarnaCore_ShiftReg)); |
| auto *MemRef = MBB->getParent()->getMachineMemOperand( |
| MPI, MachineMemOperand::MOStore, /*s=*/4, /*base_alignment=*/llvm::Align(4)); |
| MI.addMemOperand(*MBB->getParent(), MemRef); |
| } |
| { |
| MachinePointerInfo MPI( |
| TM.getPSV(TPUTargetMachine::PSV_BarnaCore_ConcatReg)); |
| auto *MemRef = MBB->getParent()->getMachineMemOperand( |
| MPI, MachineMemOperand::MOLoad, /*s=*/4, /*base_alignment=*/llvm::Align(4)); |
| MI.addMemOperand(*MBB->getParent(), MemRef); |
| } |
| return; |
| } |
| // We rely on the brcond ordering convention to match bcLOOP_END correctly. |
| // Ensure we actually matched correctly here: bcLOOP_END should point back |
| // to its own block (only single-block loops are allowed). |
| assert(MI.getOpcode() == TPU::bcLOOP_END); |
| assert(MI.getOperand(0).getMBB() == MI.getParent() && |
| "bcLOOP_END does not point to its parent!"); |
| MI.getParent()->setMachineBlockAddressTaken(); |
| } |
| |
| // Custom version of CCInfo.AnalyzeCallOperands that supports separate scalar |
| // and vector stacks. It rewrites the memory offsets in ArgLocs so that they |
| // are relative to the appropriate stack, and returns the scalar and vector |
| // stack sizes used by the call parameters, as well as the extra bytes used |
| // to align masks on the vector stack. |
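| // For example (illustrative): a scalar i32 argument assigned to memory lands |
| // on the scalar stack and bumps NumBytesScalar, while a vector argument |
| // lands on the vector stack and bumps NumBytesVector; mask arguments are |
| // padded up to the vector size and the padding is reported in |
| // ExtraAlignBytesVector. |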
| static void analyzeCallOperands(const TPUTargetLowering &TLI, |
| const TPUSubtarget *ST, |
| const TargetLowering::CallLoweringInfo &CLI, |
| CCState &CCInfo, |
| SmallVector<CCValAssign, 16> &ArgLocs, |
| int &NumBytesScalar, int &NumBytesVector, |
| int &ExtraAlignBytesVector) { |
| const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; |
| unsigned NumOps = Outs.size(); |
| for (unsigned i = 0; i != NumOps; ++i) { |
| MVT ArgVT = Outs[i].VT; |
| ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; |
| int PrevNumBytes = CCInfo.getNextStackOffset(); |
| if (CC_TPU(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo)) { |
| #ifndef NDEBUG |
| dbgs() << "Call operand #" << i << " has unhandled type " |
| << EVT(ArgVT).getEVTString() << '\n'; |
| #endif |
| llvm_unreachable(nullptr); |
| } |
| if (!ST->isTPUABIEnabled() || CCInfo.getCallingConv() == CallingConv::Fast) |
| continue; |
| assert(!ArgLocs[i].isMemLoc() || |
| PrevNumBytes == ArgLocs[i].getLocMemOffset()); |
| CCValAssign &CCV = ArgLocs[i]; |
| if (int StackOffsetDelta = CCInfo.getNextStackOffset() - PrevNumBytes) { |
| if (ArgVT.isVector()) { |
| assert(ST->hasVPU()); |
| // This is a trick using the API in order to adjust the LocMemOffset, |
| // because we have two separate stacks for scalar and vector. |
| if (isMaskVT(ArgVT, *ST)) { |
| int AlignedStackOffsetDelta = |
| alignTo(StackOffsetDelta, ST->vectorSizeInBytes()); |
| ExtraAlignBytesVector += AlignedStackOffsetDelta - StackOffsetDelta; |
| StackOffsetDelta = AlignedStackOffsetDelta; |
| } |
| assert(StackOffsetDelta == ST->vectorSizeInBytes()); |
| CCV.convertToMem(NumBytesVector); |
| NumBytesVector += StackOffsetDelta; |
| } else { |
| assert(StackOffsetDelta == ST->scalarSizeInBytes()); |
| // Same comment as above. |
| CCV.convertToMem(NumBytesScalar); |
| NumBytesScalar += StackOffsetDelta; |
| } |
| } |
| } |
| assert(CCInfo.getCallingConv() == CallingConv::Fast || |
| ArgLocs.size() == NumOps); |
| } |
| |
| SDValue TPUTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, |
| SmallVectorImpl<SDValue> &InVals) const { |
| SelectionDAG &DAG = CLI.DAG; |
| SDLoc &DL = CLI.DL; |
| SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; |
| SDValue Chain = CLI.Chain; |
| SDValue Callee = CLI.Callee; |
| bool &IsTailCall = CLI.IsTailCall; |
| CallingConv::ID CallConv = CLI.CallConv; |
| bool IsVarArg = CLI.IsVarArg; |
| // Not supported. |
| assert(!IsVarArg); |
| // FIXME(b/237788792): Support return values. |
| assert(CLI.RetTy->isVoidTy() && |
| "Return values should be passed by reference"); |
| // No support for tail calls right now. |
| IsTailCall = false; |
| |
| // Analyze operands of the call, assigning locations to each operand. |
| SmallVector<CCValAssign, 16> ArgLocs; |
| CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, |
| *DAG.getContext()); |
| GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); |
| // How many bytes are to be pushed on the scalar stack. |
| int NumBytesScalar = 0; |
| // How many bytes are to be pushed on the vector stack. |
| int NumBytesVector = 0; |
| // Extra bytes added for vector memory alignment, used for masks. |
| int ExtraAlignBytesVector = 0; |
| analyzeCallOperands(*this, ST, CLI, CCInfo, ArgLocs, NumBytesScalar, |
| NumBytesVector, ExtraAlignBytesVector); |
| assert(NumBytesScalar + NumBytesVector - ExtraAlignBytesVector == |
| CCInfo.getNextStackOffset()); |
| |
| Chain = DAG.getCALLSEQ_START(Chain, NumBytesScalar, NumBytesVector, DL); |
| |
| SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass; |
| SmallVector<SDValue, 12> MemOpChains; |
| |
| // Walk the register assignments, inserting copies. |
| for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) { |
| CCValAssign &VA = ArgLocs[I]; |
| assert(VA.getValVT() == VA.getLocVT()); |
| SDValue Arg = OutVals[I]; |
| if (VA.isRegLoc()) { |
| // Promote the value if needed. |
| switch (VA.getLocInfo()) { |
| case CCValAssign::Full: |
| break; |
| case CCValAssign::SExt: |
| Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); |
| break; |
| case CCValAssign::ZExt: |
| Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); |
| break; |
| case CCValAssign::AExt: |
| Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); |
| break; |
| default: |
| llvm_unreachable("Unknown loc info!"); |
| } |
| |
| // Arguments that are passed in registers are collected in the RegsToPass |
| // vector. |
| RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); |
| } else { // VA.isRegLoc() |
| assert(VA.isMemLoc()); |
| assert(!VA.needsCustom()); |
| auto PtrVT = getPointerTy(DAG.getDataLayout()); |
| MachineFunction &MF = DAG.getMachineFunction(); |
| unsigned LocMemOffset = VA.getLocMemOffset(); |
| // In order to make it easier for the callee, the stack pointer in the |
| // caller is incremented such that it points to a free slot in the callee |
| // for the return address. Adjust the argument offsets here accordingly. |
| if (!VA.getValVT().isVector()) |
| LocMemOffset += ST->scalarSizeInBytes(); |
| else |
| assert(ST->hasVPU()); |
| unsigned AdjustedLocMemOffset = |
| TPU::adjustForWordSize( |
| APInt(32, LocMemOffset), |
| VA.getValVT().isVector() ? TPUAS_TileSpmem : TPUAS_Smem, *ST) |
| .getZExtValue(); |
| SDValue PtrOff = DAG.getIntPtrConstant(AdjustedLocMemOffset, DL); |
| // The address is stack-pointer (not frame-pointer) based, after the call |
| // stack adjustments. |
| SDValue DstAddr = DAG.getNode( |
| ISD::ADD, DL, PtrVT, |
| DAG.getRegister(VA.getValVT().isVector() ? TPU::SPV : TPU::SPS, |
| MVT::i32), |
| PtrOff); |
| MachinePointerInfo DstInfo = |
| VA.getValVT().isVector() |
| ? MachinePointerInfo(TPUAS_TileSpmem, LocMemOffset) |
| : MachinePointerInfo::getStack(MF, LocMemOffset); |
| SDValue Store; |
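| // Mask arguments are widened to full 32-bit lanes (all-ones / zero) before |
| // being stored, since the vector stack slot for a mask is vector sized. |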
| if (isMaskVT(VA.getValVT(), *ST)) { |
| SDValue Select = |
| DAG.getNode(ISD::VSELECT, DL, VNI32, Arg, |
| DAG.getNode(TPUISD::SPLAT, DL, VNI32, |
| DAG.getConstant(0xFFFFFFFF, DL, MVT::i32)), |
| DAG.getNode(TPUISD::SPLAT, DL, VNI32, |
| DAG.getConstant(0, DL, MVT::i32))); |
| Store = DAG.getStore(Chain, DL, Select, DstAddr, DstInfo); |
| } else { |
| Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo); |
| } |
| MemOpChains.push_back(Store); |
| } |
| } |
| |
| if (!MemOpChains.empty()) |
| Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); |
| |
| SDValue InFlag; |
| |
| // Build a sequence of copy-to-reg nodes chained together with token chain and |
| // flag operands which copy the outgoing args into registers. The InFlag is |
| // necessary since all emitted instructions must be stuck together. |
| for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) { |
| Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[I].first, |
| RegsToPass[I].second, InFlag); |
| InFlag = Chain.getValue(1); |
| } |
| |
| // If the callee is a GlobalAddress node (quite common, every direct call |
| // is), turn it into a TargetGlobalAddress node so that legalize doesn't |
| // hack it. Only direct calls to global addresses are supported here. |
| assert(G && "expected a direct call to a global address"); |
| Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, |
| getPointerTy(DAG.getDataLayout()), 0); |
| Callee = DAG.getNode(TPUISD::WRAPPER, DL, MVT::i32, Callee); |
| |
| // Functions always return void. |
| SDVTList NodeTys = DAG.getVTList(MVT::isVoid, MVT::Glue); |
| SmallVector<SDValue, 8> Ops; |
| Ops.push_back(Chain); |
| Ops.push_back(Callee); |
| |
| // Add a register mask operand representing the call-preserved registers. |
| const uint32_t *Mask = |
| TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv); |
| assert(Mask && "Missing call preserved mask for calling convention"); |
| Ops.push_back(DAG.getRegisterMask(Mask)); |
| |
| // Add argument registers to the end of the list so that they are |
| // known live into the call. |
| for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) |
| Ops.push_back(DAG.getRegister(RegsToPass[I].first, |
| RegsToPass[I].second.getValueType())); |
| |
| if (InFlag.getNode()) |
| Ops.push_back(InFlag); |
| |
| Chain = DAG.getNode(CallConv == CallingConv::Fast ? TPUISD::CALL_FAST |
| : TPUISD::CALL, |
| DL, NodeTys, ArrayRef<SDValue>(&Ops[0], Ops.size())); |
| InFlag = Chain.getValue(1); |
| |
| // Create the CALLSEQ_END node. |
| Chain = DAG.getCALLSEQ_END( |
| Chain, |
| DAG.getConstant(NumBytesScalar, DL, getPointerTy(DAG.getDataLayout()), |
| true), |
| DAG.getConstant(NumBytesVector, DL, getPointerTy(DAG.getDataLayout()), |
| true), |
| InFlag, DL); |
| InFlag = Chain.getValue(1); |
| return Chain; |
| } |
| |
| bool TPUTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, |
| const CallInst &I, |
| MachineFunction &MF, |
| unsigned Intrinsic) const { |
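| // Describe which TPU intrinsics touch memory and how, so that the DAG |
| // builder attaches the right MachineMemOperand (value type, pointer, and |
| // load/store flags) to the resulting node. |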
| const TPUTargetMachine &TM = |
| static_cast<const TPUTargetMachine &>(MF.getTarget()); |
| switch (Intrinsic) { |
| case Intrinsic::tpu_syncadd: |
| case Intrinsic::tpu_syncadd_done: |
| case Intrinsic::tpu_syncadd_notdone: |
| case Intrinsic::tpu_syncadd_remote: |
| case Intrinsic::tpu_syncadd_remote_done: |
| case Intrinsic::tpu_syncadd_remote_doneinv: |
| case Intrinsic::tpu_syncadd_tile: |
| case Intrinsic::tpu_syncset_done: |
| case Intrinsic::tpu_syncset_notdone: |
| case Intrinsic::tpu_syncset_remote: |
| case Intrinsic::tpu_syncset_remote_doneinv: |
| case Intrinsic::tpu_syncdonemov: |
| Info.opc = (Intrinsic == Intrinsic::tpu_syncdonemov) |
| ? ISD::INTRINSIC_W_CHAIN |
| : ISD::INTRINSIC_VOID; |
| Info.memVT = MVT::i32; |
| Info.ptrVal = I.getOperand(0); |
| Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; |
| return true; |
| case Intrinsic::tpu_vld_shuffle: |
| case Intrinsic::tpu_vld_strided: |
| case Intrinsic::tpu_vld_indexed: |
| case Intrinsic::tpu_vld_replicate_evenodd_sublanes: |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(I.getCalledFunction()->getReturnType()); |
| Info.ptrVal = I.getOperand(0); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.flags = MachineMemOperand::MOLoad; |
| return true; |
| case Intrinsic::tpu_rdcbreg_smem_base: |
| case Intrinsic::tpu_rdcbreg_tilespmem_base: |
| case Intrinsic::tpu_rdcbreg_size: |
| case Intrinsic::tpu_rdcbreg_offset: |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| // FIXME(hgreving): re-visit memory operand strategy for this. The reason |
| // this reads memory at all is the cb.upd semantics, which are not |
| // modeled through register dependencies. |
| Info.memVT = MVT::getVT(I.getCalledFunction()->getArg(0)->getType()); |
| Info.ptrVal = nullptr; |
| Info.size = MemoryLocation::UnknownSize; |
| Info.flags = MachineMemOperand::MOLoad; |
| return true; |
| case Intrinsic::tpu_sld_cb: |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(I.getCalledFunction()->getArg(0)->getType()); |
| // FIXME(hgreving): re-visit memory operand strategy for this. We don't |
| // have a pointer and PSV values also don't work well here (upstream bug: |
| // can't set address space). |
| Info.ptrVal = nullptr; |
| Info.size = MemoryLocation::UnknownSize; |
| Info.flags = MachineMemOperand::MOLoad; |
| return true; |
| case Intrinsic::tpu_sld_cb_upd: |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(I.getCalledFunction()->getArg(0)->getType()); |
| // FIXME(hgreving): re-visit memory operand strategy for this. We don't |
| // have a pointer and PSV values also don't work well here (upstream bug: |
| // can't set address space). |
| Info.ptrVal = nullptr; |
| Info.size = MemoryLocation::UnknownSize; |
| // upd modeled as store. |
| Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; |
| return true; |
| case Intrinsic::tpu_vld_msk: |
| case Intrinsic::tpu_vld_msk_strided: |
| case Intrinsic::tpu_vld_msk_idx_strided: |
| case Intrinsic::tpu_vld_msk_idx: |
| case Intrinsic::tpu_vld_msk_idx_np: |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(I.getCalledFunction()->getReturnType()); |
| Info.ptrVal = I.getOperand(1); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.flags = MachineMemOperand::MOLoad; |
| return true; |
| case Intrinsic::tpu_vst_strided: |
| case Intrinsic::tpu_vst_indexed: |
| case Intrinsic::tpu_vst_evenodd_sublanes: |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = MVT::getVT(I.getOperand(0)->getType()); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.ptrVal = I.getOperand(1); |
| Info.flags = MachineMemOperand::MOStore; |
| return true; |
| case Intrinsic::tpu_sst_cb: |
| case Intrinsic::tpu_sst_cb_upd: |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = MVT::getVT(I.getOperand(0)->getType()); |
| Info.size = MemoryLocation::UnknownSize; |
| // FIXME(hgreving): re-visit memory operand strategy for this. We don't |
| // have a pointer and PSV values also don't work well here (upstream bug: |
| // can't set address space). |
| Info.ptrVal = nullptr; |
| Info.flags = MachineMemOperand::MOStore; |
| return true; |
| case Intrinsic::tpu_vst_msk_idx_add: |
| case Intrinsic::tpu_vst_msk_idx_add_np: |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = MVT::getVT(I.getOperand(3)->getType()); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.ptrVal = I.getOperand(1); |
| Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOLoad; |
| return true; |
| case Intrinsic::tpu_vst_msk_idx_ret_add_np: |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(I.getOperand(3)->getType()); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.ptrVal = I.getOperand(1); |
| Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOLoad; |
| return true; |
| case Intrinsic::tpu_vst_msk: |
| case Intrinsic::tpu_vst_msk_add: |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = MVT::getVT(I.getOperand(2)->getType()); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.ptrVal = I.getOperand(1); |
| Info.flags = MachineMemOperand::MOStore; |
| return true; |
| case Intrinsic::tpu_vst_cb_msk: |
| case Intrinsic::tpu_vst_cb_msk_add: |
| case Intrinsic::tpu_vst_cb_msk_add_strided: |
| case Intrinsic::tpu_vst_cb_msk_idx: |
| case Intrinsic::tpu_vst_cb_msk_idx_add: |
| case Intrinsic::tpu_vst_cb_msk_strided: |
| case Intrinsic::tpu_vst_cb_upd_msk: |
| case Intrinsic::tpu_vst_cb_upd_msk_add: |
| case Intrinsic::tpu_vst_cb_upd_msk_add_strided: |
| case Intrinsic::tpu_vst_cb_upd_msk_strided: |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = MVT::getVT(I.getOperand(3)->getType()); |
| Info.size = MemoryLocation::UnknownSize; |
| // FIXME(hgreving): re-visit memory operand strategy for this. We don't |
| // have a pointer and PSV values also don't work well here (upstream bug: |
| // can't set address space). |
| Info.ptrVal = nullptr; |
| Info.flags = MachineMemOperand::MOStore; |
| return true; |
| case Intrinsic::tpu_vld_cb_msk: |
| case Intrinsic::tpu_vld_cb_msk_idx: |
| case Intrinsic::tpu_vld_cb_msk_idx_np: |
| case Intrinsic::tpu_vld_cb_msk_strided: |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(I.getCalledFunction()->getReturnType()); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.ptrVal = nullptr; |
| Info.flags = MachineMemOperand::MOLoad; |
| return true; |
| case Intrinsic::tpu_vld_cb_upd_msk: |
| case Intrinsic::tpu_vld_cb_upd_msk_strided: |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(I.getCalledFunction()->getReturnType()); |
| Info.size = MemoryLocation::UnknownSize; |
| // FIXME(hgreving): re-visit memory operand strategy for this. We don't |
| // have a pointer and PSV values also don't work well here (upstream bug: |
| // can't set address space). |
| Info.ptrVal = nullptr; |
| // upd modeled as store |
| Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; |
| return true; |
| case Intrinsic::tpu_vst_msk_strided: |
| case Intrinsic::tpu_vst_msk_add_strided: |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = MVT::getVT(I.getOperand(3)->getType()); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.ptrVal = I.getOperand(1); |
| Info.flags = MachineMemOperand::MOStore; |
| return true; |
| case Intrinsic::tpu_vst_msk_idx: |
| case Intrinsic::tpu_vst_msk_idx_np: |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = MVT::getVT(I.getOperand(3)->getType()); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.ptrVal = I.getOperand(1); |
| Info.flags = MachineMemOperand::MOStore; |
| return true; |
| case Intrinsic::tpu_vst_msk_idx_strided: |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = MVT::getVT(I.getOperand(4)->getType()); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.ptrVal = I.getOperand(1); |
| Info.flags = MachineMemOperand::MOStore; |
| return true; |
| case Intrinsic::tpu_dma_hbm_to_smem: |
| case Intrinsic::tpu_dma_hbm_to_smem_sc_simple: |
| case Intrinsic::tpu_dma_hbm_to_vmem: |
| case Intrinsic::tpu_dma_hbm_to_spmem_sc_simple: |
| case Intrinsic::tpu_dma_hbm_to_hbm: |
| case Intrinsic::tpu_dma_hbm_to_hbm_sc_simple: |
| case Intrinsic::tpu_dma_hbm_to_hib: |
| case Intrinsic::tpu_dma_hbm_to_vmem_hib_update: |
| case Intrinsic::tpu_dma_smem_to_hbm: |
| case Intrinsic::tpu_dma_smem_to_hbm_sc_simple: |
| case Intrinsic::tpu_dma_vmem_to_hbm: |
| case Intrinsic::tpu_dma_spmem_to_hbm_sc_simple: |
| case Intrinsic::tpu_dma_spmem_to_spmem_sc_simple: |
| case Intrinsic::tpu_dma_timem_to_hbm: |
| case Intrinsic::tpu_dma_timem_to_hbm_sc_simple: |
| case Intrinsic::tpu_dma_hbm_to_simem_sc_simple: |
| case Intrinsic::tpu_dma_hbm_to_timem: |
| case Intrinsic::tpu_dma_hbm_to_timem_sc_simple: |
| case Intrinsic::tpu_dma_hbm_to_smem_single_strided: |
| case Intrinsic::tpu_dma_hbm_to_vmem_single_strided: |
| case Intrinsic::tpu_dma_smem_to_hbm_single_strided: |
| case Intrinsic::tpu_dma_vmem_to_hbm_single_strided: |
| case Intrinsic::tpu_dma_hbm_to_smem_general: |
| case Intrinsic::tpu_dma_hbm_to_vmem_general: |
| case Intrinsic::tpu_dma_smem_to_hbm_general: |
| case Intrinsic::tpu_dma_vmem_to_hbm_general: |
| case Intrinsic::tpu_dma_hbm_to_hbm_sc_general: |
| case Intrinsic::tpu_dma_smem_to_smem_sc_general: |
| case Intrinsic::tpu_dma_hbm_to_smem_sc_general: |
| case Intrinsic::tpu_dma_hbm_to_timem_sc_general: |
| case Intrinsic::tpu_dma_hbm_to_spmem_sc_general: |
| case Intrinsic::tpu_dma_smem_to_hbm_sc_general: |
| case Intrinsic::tpu_dma_timem_to_hbm_sc_general: |
| case Intrinsic::tpu_dma_spmem_to_spmem_sc_general: |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = VNI32; |
| // These access multiple pointers, so set the pointer to null so that alias |
| // analysis doesn't make any assumptions. |
| // TODO(thomasraoux): We could provide finer-grained aliasing information by |
| // adding several memory operands and actually adding the pointers. |
| Info.ptrVal = nullptr; |
| Info.size = MemoryLocation::UnknownSize; |
| Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOLoad; |
| return true; |
| case Intrinsic::tpu_dma_hbm_to_iova_sc_simple: |
| case Intrinsic::tpu_dma_iova_to_hbm_sc_simple: |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = MVT::v1024i32; |
| // Same comments as above. |
| Info.ptrVal = nullptr; |
| Info.size = MemoryLocation::UnknownSize; |
| Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOLoad; |
| return true; |
| case Intrinsic::tpu_stream_indirect_gather_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_add_f32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_gather_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_add_s32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_gather_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_add_f32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_gather_cb_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_add_s32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_gather_cb_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_spmem_to_smem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_spmem_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_tilespmem_tileN_to_smem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_tilespmem_tileN_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_gather_cb_upd_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_upd_add_f32_hbm_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_gather_cb_upd_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_gather_cb_upd_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_gather_cb_upd_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_upd_add_s32_hbm_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_gather_cb_upd_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_gather_cb_upd_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_upd_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_upd_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_upd_spmem_to_smem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_upd_spmem_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_upd_tilespmem_tileN_to_smem: |
| case Intrinsic:: |
| tpu_stream_indirect_gather_cb_upd_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_spmem_to_smem: |
| case Intrinsic::tpu_stream_indirect_gather_spmem_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_tilespmem_tileN_to_smem: |
| case Intrinsic::tpu_stream_indirect_gather_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_scatter_add_f32_smem_to_spmem: |
| case Intrinsic::tpu_stream_indirect_scatter_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_add_s32_smem_to_spmem: |
| case Intrinsic::tpu_stream_indirect_scatter_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_add_f32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_cb_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_cb_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_add_s32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_cb_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_cb_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_smem_to_spmem: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_tilespmem_to_spmem: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_upd_add_f32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_cb_upd_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_cb_upd_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_cb_upd_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_upd_add_s32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_cb_upd_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_cb_upd_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_cb_upd_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_upd_smem_to_spmem: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_upd_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_upd_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_upd_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_upd_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_cb_upd_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_smem_to_spmem: |
| case Intrinsic::tpu_stream_indirect_scatter_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_indirect_scatter_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_indirect_scatter_tilespmem_to_spmem: |
| case Intrinsic::tpu_stream_indirect_scatter_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_add_f32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_add_s32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_cb_add_f32_hbm_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_cb_add_s32_hbm_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_cb_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_cb_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_cb_spmem_to_smem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_cb_spmem_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_cb_tilespmem_tileN_to_smem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_tilespmem_tileN_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_upd_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_upd_add_f32_hbm_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_upd_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_upd_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_upd_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_upd_add_s32_hbm_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_upd_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_upd_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_cb_upd_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_cb_upd_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_cb_upd_spmem_to_smem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_cb_upd_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_upd_tilespmem_tileN_to_smem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_upd_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_spmem_to_smem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_spmem_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_tilespmem_tileN_to_smem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_add_f32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_add_s32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_add_f32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_add_s32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_smem_to_spmem: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_tilespmem_to_tilespmem_tileN: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_upd_add_f32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_upd_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_upd_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_upd_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_upd_add_s32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_upd_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_upd_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_upd_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_upd_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_upd_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_upd_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_upd_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_upd_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_upd_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_smem_to_spmem: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_gather_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_add_f32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_linear_gather_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_add_s32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_linear_gather_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_add_f32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_linear_gather_cb_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_add_s32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_linear_gather_cb_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_spmem_to_smem: |
| case Intrinsic::tpu_stream_linear_gather_cb_spmem_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_tilespmem_tileN_to_smem: |
| case Intrinsic::tpu_stream_linear_gather_cb_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_upd_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_upd_add_f32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_upd_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_linear_gather_cb_upd_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_upd_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_upd_add_s32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_upd_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_linear_gather_cb_upd_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_upd_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_upd_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_upd_spmem_to_smem: |
| case Intrinsic::tpu_stream_linear_gather_cb_upd_spmem_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_upd_tilespmem_tileN_to_smem: |
| case Intrinsic:: |
| tpu_stream_linear_gather_cb_upd_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_spmem_to_smem: |
| case Intrinsic::tpu_stream_linear_gather_spmem_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_tilespmem_tileN_to_smem: |
| case Intrinsic::tpu_stream_linear_gather_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_scatter_add_f32_smem_to_spmem: |
| case Intrinsic::tpu_stream_linear_scatter_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_linear_scatter_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_add_s32_smem_to_spmem: |
| case Intrinsic::tpu_stream_linear_scatter_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_linear_scatter_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_add_f32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_linear_scatter_cb_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_linear_scatter_cb_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_add_s32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_linear_scatter_cb_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_linear_scatter_cb_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_smem_to_spmem: |
| case Intrinsic::tpu_stream_linear_scatter_cb_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_linear_scatter_cb_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_linear_scatter_cb_tilespmem_to_spmem: |
| case Intrinsic::tpu_stream_linear_scatter_cb_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_upd_add_f32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_linear_scatter_cb_upd_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_upd_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_linear_scatter_cb_upd_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_upd_add_s32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_linear_scatter_cb_upd_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_upd_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_linear_scatter_cb_upd_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_upd_smem_to_spmem: |
| case Intrinsic::tpu_stream_linear_scatter_cb_upd_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_upd_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_linear_scatter_cb_upd_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_linear_scatter_cb_upd_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_linear_scatter_cb_upd_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_smem_to_spmem: |
| case Intrinsic::tpu_stream_linear_scatter_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_linear_scatter_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_linear_scatter_tilespmem_to_spmem: |
| case Intrinsic::tpu_stream_linear_scatter_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_gather_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_add_f32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_strided_gather_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_add_s32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_strided_gather_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_add_f32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_strided_gather_cb_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_add_s32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_strided_gather_cb_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_spmem_to_smem: |
| case Intrinsic::tpu_stream_strided_gather_cb_spmem_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_tilespmem_tileN_to_smem: |
| case Intrinsic::tpu_stream_strided_gather_cb_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_upd_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_upd_add_f32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_upd_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_strided_gather_cb_upd_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_upd_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_upd_add_s32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_upd_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_strided_gather_cb_upd_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_upd_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_upd_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_upd_spmem_to_smem: |
| case Intrinsic::tpu_stream_strided_gather_cb_upd_spmem_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_upd_tilespmem_tileN_to_smem: |
| case Intrinsic:: |
| tpu_stream_strided_gather_cb_upd_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_spmem_to_smem: |
| case Intrinsic::tpu_stream_strided_gather_spmem_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_tilespmem_tileN_to_smem: |
| case Intrinsic::tpu_stream_strided_gather_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_scatter_add_f32_smem_to_spmem: |
| case Intrinsic::tpu_stream_strided_scatter_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_add_s32_smem_to_spmem: |
| case Intrinsic::tpu_stream_strided_scatter_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_cb_add_f32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_cb_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_cb_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_cb_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_cb_add_s32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_cb_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_cb_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_cb_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_cb_smem_to_spmem: |
| case Intrinsic::tpu_stream_strided_scatter_cb_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_cb_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_strided_scatter_cb_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_strided_scatter_cb_tilespmem_to_spmem: |
| case Intrinsic::tpu_stream_strided_scatter_cb_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_cb_upd_add_f32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_cb_upd_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_cb_upd_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_cb_upd_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_cb_upd_add_s32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_cb_upd_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_cb_upd_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_cb_upd_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_cb_upd_smem_to_spmem: |
| case Intrinsic::tpu_stream_strided_scatter_cb_upd_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_cb_upd_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_strided_scatter_cb_upd_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_strided_scatter_cb_upd_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_cb_upd_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_smem_to_spmem: |
| case Intrinsic::tpu_stream_strided_scatter_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_strided_scatter_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_strided_scatter_tilespmem_to_spmem: |
| case Intrinsic::tpu_stream_strided_scatter_tilespmem_to_tilespmem_tileN: |
| // We don't strictly need to add memory operands for the stream intrinsics;
| // without them the DAG builder would fall back to regular chain barriers.
| // We do it properly here and rely on memory edges instead.
| assert(IsSC); |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = VNI32; |
| // TODO(hgreving): We could provide finer-grained aliasing information by
| // adding several memory operands. We currently attach only the TileSpmem
| // memory operand, because that is all we consider when analyzing the DAG's
| // edges later. We also don't want to hard-code the operand number, since
| // there are too many stream intrinsics; instead, we simply search the
| // operands for a TileSpmem pointer.
| Info.ptrVal = nullptr; |
| for (auto &Op : I.operands()) { |
| if (!Op->getType()->isPointerTy()) |
| continue; |
| if (Op->getType()->getPointerAddressSpace() != TPUAS_TileSpmem) |
| continue; |
| Info.ptrVal = Op; |
| break; |
| } |
| Info.size = MemoryLocation::UnknownSize; |
| Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOLoad; |
| return true; |
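| // The BarnaCore aliaddr load/store intrinsics below access memory through an
| // explicit pointer operand; model them as plain loads/stores of unknown size.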
| case Intrinsic::tpu_bc_load_aliaddr: |
| case Intrinsic::tpu_bc_load_aliaddr_flm: |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(I.getType()); |
| Info.ptrVal = I.getOperand(0); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.flags = MachineMemOperand::MOLoad; |
| return true; |
| case Intrinsic::tpu_bc_store_aliaddr: |
| case Intrinsic::tpu_bc_store_aliaddr_flm: |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = MVT::getVT(I.getOperand(0)->getType()); |
| Info.ptrVal = I.getOperand(1); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.flags = MachineMemOperand::MOStore; |
| return true; |
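| // bc_loop_end has no pointer operand; use the BarnaCore loop-end pseudo
| // source value and mark the operand as both a load and a store so the
| // access stays ordered relative to other accesses to that pseudo value.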
| case Intrinsic::tpu_bc_loop_end: { |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::i1; |
| Info.ptrVal = TM.getPSV(TPUTargetMachine::PSV_BarnaCoreChannel_LoopEnd); |
| Info.size = 1; |
| Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; |
| return true; |
| } |
| default: |
| return false; |
| } |
| } |
| |
| void TPUTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, |
| KnownBits &Known, |
| const APInt &DemandedElts, |
| const SelectionDAG &DAG, |
| unsigned Depth) const { |
| KnownBits Known2; |
| Known.resetAll(); |
| |
| switch (Op.getOpcode()) { |
| default: |
| break; |
| case TPUISD::UMUL24: {
| unsigned BitWidth = 32; |
| Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); |
| Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); |
| // The instruction zeroes out the top 8 bits of each operand.
| Known.Zero.setHighBits(8); |
| Known2.Zero.setHighBits(8); |
| // If low bits are zero in either operand, output low known-0 bits. |
| // Also compute a conservative estimate for high known-0 bits. |
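| // (An m-bit value times an n-bit value fits in m + n bits; e.g. a 10-bit
| // value times a 12-bit value needs at most 22 bits, leaving at least
| // 32 - 22 = 10 leading zeros, while trailing zeros simply add: a multiple
| // of 16 times a multiple of 4 is a multiple of 64.)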
| unsigned TrailZ = |
| Known.countMinTrailingZeros() + Known2.countMinTrailingZeros(); |
| unsigned LeadZ = |
| std::max(Known.countMinLeadingZeros() + Known2.countMinLeadingZeros(), |
| BitWidth) - |
| BitWidth; |
| |
| Known.resetAll(); |
| Known.Zero.setLowBits(std::min(TrailZ, BitWidth)); |
| Known.Zero.setHighBits(std::min(LeadZ, BitWidth)); |
| break;
| }
| }
| } |
| |
| void TPUTargetLowering::addTPUMemOperand(SelectionDAG &DAG, SDNode *N, |
| bool IsPush, |
| const TargetRegisterClass *RC) const { |
| // Add a MachineMemOperand to N, marking it as a push or pop of the given |
| // register class. |
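| // The pseudo source value from getFifoPSV is specific to the push/pop
| // direction and register class, presumably so that accesses to different
| // FIFOs can be told apart. The size and alignment of 4 appear to be
| // nominal; the operand mainly conveys ordering information.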
| MachineSDNode *MN = cast<MachineSDNode>(N); |
| MachinePointerInfo MPI( |
| static_cast<const TPUTargetMachine &>(getTargetMachine()) |
| .getFifoPSV(IsPush, RC)); |
| auto *MemRef = DAG.getMachineFunction().getMachineMemOperand( |
| MPI, MachineMemOperand::MOLoad | MachineMemOperand::MOStore, /*s=*/4, |
| /*base_alignment=*/llvm::Align(4)); |
| DAG.setNodeMemRefs(MN, {MemRef}); |
| } |