| //===--------- TPUISelLowering.cpp - TPU DAG Lowering Implementation ------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements the TPUTargetLowering class. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "TPUISelLowering.h" |
| #include "MCTargetDesc/TPUBaseInfo.h" |
| #include "MCTargetDesc/TPUMCTargetDesc.h" |
| #include "TPU.h" |
| #include "TPUCallingConv.h" |
| #include "TPUIRUtils.h" |
| #include "TPUMachineFunctionInfo.h" |
| #include "TPURegisterInfo.h" |
| #include "TPUSubtarget.h" |
| #include "TPUTargetMachine.h" |
| #include "llvm/ADT/APInt.h" |
| #include "llvm/ADT/ArrayRef.h" |
| #include "llvm/ADT/DenseMap.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/ADT/StringRef.h" |
| #include "llvm/ADT/StringSwitch.h" |
| #include "llvm/Analysis/CallGraph.h" |
| #include "llvm/Analysis/MemoryLocation.h" |
| #include "llvm/CodeGen/Analysis.h" |
| #include "llvm/CodeGen/CallingConvLower.h" |
| #include "llvm/CodeGen/ISDOpcodes.h" |
| #include "llvm/CodeGen/MachineBasicBlock.h" |
| #include "llvm/CodeGen/MachineFrameInfo.h" |
| #include "llvm/CodeGen/MachineFunction.h" |
| #include "llvm/CodeGen/MachineInstr.h" |
| #include "llvm/CodeGen/MachineInstrBuilder.h" |
| #include "llvm/CodeGen/MachineMemOperand.h" |
| #include "llvm/CodeGen/MachineModuleInfo.h" |
| #include "llvm/CodeGen/MachineRegisterInfo.h" |
| #include "llvm/CodeGen/RuntimeLibcalls.h" |
| #include "llvm/CodeGen/SelectionDAG.h" |
| #include "llvm/CodeGen/SelectionDAGNodes.h" |
| #include "llvm/CodeGen/TargetCallingConv.h" |
| #include "llvm/CodeGen/ValueTypes.h" |
| #include "llvm/IR/CallingConv.h" |
| #include "llvm/IR/DerivedTypes.h" |
| #include "llvm/IR/Function.h" |
| #include "llvm/IR/GlobalValue.h" |
| #include "llvm/IR/IntrinsicsTPU.h" |
| #include "llvm/Support/Casting.h" |
| #include "llvm/Support/CodeGen.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/ErrorHandling.h" |
| #include "llvm/Support/KnownBits.h" |
| #include "llvm/Support/MachineValueType.h" |
| #include "llvm/Support/MathExtras.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include "llvm/Target/TargetMachine.h" |
| #include <cassert> |
| #include <cmath> |
| #include <cstddef> |
| #include <cstdint> |
| #include <cstdlib> |
| #include <utility> |
| |
| #define DEBUG_TYPE "tpu-lower" |
| |
| using namespace llvm; |
| |
| static cl::opt<bool> PropagateTpuEmbeddedMasks( |
| "tpu-enable-embedded-masks", cl::Hidden, cl::init(true), |
| cl::desc("Enables propagating embedded hardware masks " |
| "into special mask registers.")); |
| |
| static cl::opt<bool> |
| GenerateTpuVCMasks("tpu-enable-vcmasks", cl::Hidden, cl::init(true), |
| cl::desc("Enables generation of vcmask instructions to " |
| "create mask immediates whenever possible.")); |
| |
| static cl::opt<bool> |
| EmulateSignedDivRem("tpu-emulate-signed-divrem", cl::Hidden, |
| cl::init(false), |
| cl::desc("Enables emulation of signed div/rem via the " |
| "unsigned div/rem instructions")); |
| |
| extern cl::opt<bool> TPUVerifierStrictIntoPtr; |
| |
| bool TPUTargetLowering::functionArgumentNeedsConsecutiveRegisters( |
| Type *Ty, CallingConv::ID CallConv, bool isVarArg, |
| const DataLayout &DL) const { |
| // All aggregates on BarnaCore are allocated consecutive registers. |
| return IsBC && (Ty->isArrayTy() || Ty->isStructTy()); |
| } |
| |
| TPUTargetLowering::TPUTargetLowering(const TargetMachine &TM, |
| const TPUSubtarget &STI) |
| : TargetLowering(TM) { |
| ST = &STI; |
| TII = ST->getInstrInfo(); |
| |
| IsBC = ST->isPxcBarnaCore(); |
| IsSC = ST->isSparseCore(); |
| IsVFTC = ST->hasVfcTensorCore(); |
| HasLPVF = ST->hasLPVF(); |
| HasLPGL = ST->hasLPGL(); |
| HasVMinMax = ST->hasVMinMax(); |
| |
| if (ST->hasV1024()) { |
| HasVPU = true; |
| VNI32 = MVT::v1024i32; |
| VNF32 = MVT::v1024f32; |
| // TODO(thomasraoux): Mask can be 2 bits per element on PFC: |
| // https://g3doc.corp.google.com/platforms/deepsea/logic/pfc/g3doc/isa/tensorcore.md#create-sublane-mask-instruction |
| VMNI1 = MVT::v1024i1; |
| } else if (ST->hasV16()) { |
| HasVPU = true; |
| VNI32 = MVT::v16i32; |
| VNF32 = MVT::v16f32; |
| VMNI1 = MVT::v16i1; |
| } else if (ST->hasV8()) { |
| HasVPU = true; |
| VNI32 = MVT::v8i32; |
| VNF32 = MVT::v8f32; |
| VMNI1 = MVT::v8i1; |
| } else { |
| // No vector support. |
| VNI32 = MVT::i32; |
| VNF32 = MVT::f32; |
| VMNI1 = MVT::i1; |
| } |
| |
| VNBF16 = MVT::INVALID_SIMPLE_VALUE_TYPE; |
| VNF16 = MVT::INVALID_SIMPLE_VALUE_TYPE; |
| VNI16 = MVT::INVALID_SIMPLE_VALUE_TYPE; |
| VNI4 = MVT::INVALID_SIMPLE_VALUE_TYPE; |
| VNI2 = MVT::INVALID_SIMPLE_VALUE_TYPE; |
| VNI8 = MVT::INVALID_SIMPLE_VALUE_TYPE; |
| VMNBF16I1 = MVT::INVALID_SIMPLE_VALUE_TYPE; |
| VNI8I1 = MVT::INVALID_SIMPLE_VALUE_TYPE; |
| VMN16I1 = MVT::INVALID_SIMPLE_VALUE_TYPE; |
| VMN32I1 = MVT::INVALID_SIMPLE_VALUE_TYPE; |
| VMN64I1 = MVT::INVALID_SIMPLE_VALUE_TYPE; |
| if (HasLPVF || HasLPGL) { |
| if (ST->hasV8()) { |
| VNBF16 = MVT::v16bf16; |
| VNF16 = MVT::v16f16; |
| VNI16 = MVT::v16i16; |
| VNI8 = MVT::v32i8; |
| VNI4 = MVT::v64i4; |
| VNI2 = MVT::v128i2; |
| VMNBF16I1 = MVT::v16i1; |
| VNI8I1 = MVT::v32i1; |
| VNI1 = MVT::v256i1; |
| VMN16I1 = MVT::v16i1; |
| VMN32I1 = MVT::v32i1; |
| } else if (ST->hasV16()) { |
| VNBF16 = MVT::v32bf16; |
| VNF16 = MVT::v32f16; |
| VNI16 = MVT::v32i16; |
| VNI8 = MVT::v64i8; |
| VNI1 = MVT::v512i1; |
| VNI4 = MVT::v128i4; |
| VNI2 = MVT::v256i2; |
| VMNBF16I1 = MVT::v32i1; |
| VNI8I1 = MVT::v64i1; |
| VMN32I1 = MVT::v32i1; |
| VMN64I1 = MVT::v64i1; |
| } else { |
| llvm_unreachable("Unexpected VPU size."); |
| } |
| } |
| |
| // Set up the register classes. |
| addRegisterClass(MVT::i32, &TPU::GPRRegClass); |
| addRegisterClass(MVT::f32, &TPU::GPRRegClass); |
| addRegisterClass(MVT::bf16, &TPU::GPRRegClass); |
| addRegisterClass(MVT::i1, &TPU::PPRRegClass); |
| // MVT::i8 is not legal in GPR. |
| |
| if (IsSC) { |
| // SparseCore is hijacking the mmx data type for cbreg. |
| addRegisterClass(MVT::x86mmx, &TPU::CBRRegClass); |
| } |
| |
| if (HasVPU) { |
| if (IsBC) { |
| // BarnaCore has Vregs and Vaggregs that both have the same type, so |
| // use VPR_AGGRegClass which is the superclass of both. Restricting a |
| // regclass to a strict subset is trivial. |
| addRegisterClass(VNI32, &TPU::VPR_AGGRegClass); |
| addRegisterClass(VNF32, &TPU::VPR_AGGRegClass); |
| } else { |
| addRegisterClass(VNI32, &TPU::VPRRegClass); |
| addRegisterClass(VNF32, &TPU::VPRRegClass); |
| } |
| addRegisterClass(VMNI1, &TPU::MPRRegClass); |
| } |
| if (HasLPVF || HasLPGL) { |
| addRegisterClass(VNBF16, &TPU::VPRRegClass); |
| addRegisterClass(VNF16, &TPU::VPRRegClass); |
| addRegisterClass(VNI16, &TPU::VPRRegClass); |
| addRegisterClass(VNI8, &TPU::VPRRegClass); |
| addRegisterClass(VNI4, &TPU::VPRRegClass); |
| addRegisterClass(VNI2, &TPU::VPRRegClass); |
| addRegisterClass(VNI1, &TPU::VPRRegClass); |
| if (ST->hasV8()) |
| addRegisterClass(VMN16I1, &TPU::MPRRegClass); |
| addRegisterClass(VMN32I1, &TPU::MPRRegClass); |
| if (ST->hasV16()) |
| addRegisterClass(VMN64I1, &TPU::MPRRegClass); |
| } |
| |
| // Compute derived properties from the register classes |
| TRI = ST->getRegisterInfo(); |
| computeRegisterProperties(TRI); |
| |
| setStackPointerRegisterToSaveRestore(TPU::SPS); |
| |
| setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, MVT::i8, Expand); |
| setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i8, Expand); |
| |
| setOperationAction(ISD::BR_CC, MVT::i32, Expand); |
| setOperationAction(ISD::BR_CC, MVT::f32, Expand); |
| setOperationAction(ISD::BR_CC, MVT::i1, Expand); |
| setOperationAction(ISD::BR_JT, MVT::Other, Expand); |
| setOperationAction(ISD::BRCOND, MVT::Other, Legal); |
| setOperationAction(ISD::SETCC, MVT::i32, Legal); |
| setOperationAction(ISD::SETCC, MVT::i1, Promote); |
| setOperationAction(ISD::SELECT, MVT::i32, Legal); |
| setOperationAction(ISD::SELECT, MVT::f32, Legal); |
| setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); |
| setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); |
| setOperationAction(ISD::SELECT_CC, VNF32, Expand); |
| setOperationAction(ISD::SELECT_CC, VNI32, Expand); |
| setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand); |
| |
| setOperationAction(ISD::VASTART, MVT::Other, Expand); |
| setOperationAction(ISD::VAARG, MVT::Other, Expand); |
| setOperationAction(ISD::VACOPY, MVT::Other, Expand); |
| setOperationAction(ISD::VAEND, MVT::Other, Expand); |
| |
| if (IsSC) { |
| setOperationAction(ISD::SDIVREM, MVT::i32, Legal); |
| setOperationAction(ISD::UDIVREM, MVT::i32, Legal); |
| setOperationAction(ISD::FDIV, MVT::f32, Custom); |
| } else { |
| setOperationAction(ISD::SDIVREM, VNI32, Expand); |
| setOperationAction(ISD::UDIVREM, VNI32, Expand); |
| } |
| |
| // We rely on the combiner to expand into DIVREM. |
| auto SDivRemAction = EmulateSignedDivRem ? Custom : Expand; |
| setOperationAction(ISD::SDIV, MVT::i32, SDivRemAction); |
| setOperationAction(ISD::SREM, MVT::i32, SDivRemAction); |
| setOperationAction(ISD::UDIV, MVT::i32, Expand); |
| setOperationAction(ISD::UREM, MVT::i32, Expand); |
| |
| // We do not currently support vector i32 div/rem. |
| setOperationAction(ISD::SDIV, VNI32, Expand); |
| setOperationAction(ISD::UDIV, VNI32, Expand); |
| setOperationAction(ISD::SREM, VNI32, Expand); |
| setOperationAction(ISD::UREM, VNI32, Expand); |
| |
| for (const auto &VT : {MVT::i32, VNI32}) { |
| setOperationAction(ISD::MUL, VT, Legal); |
| setOperationAction(ISD::MULHU, VT, Expand); |
| setOperationAction(ISD::MULHS, VT, Expand); |
| setOperationAction(ISD::UMUL_LOHI, VT, Expand); |
| setOperationAction(ISD::SMUL_LOHI, VT, Expand); |
| |
| setOperationAction(ISD::ROTR, VT, Expand); |
| setOperationAction(ISD::ROTL, VT, Expand); |
| setOperationAction(ISD::SHL_PARTS, VT, Expand); |
| setOperationAction(ISD::SRL_PARTS, VT, Expand); |
| setOperationAction(ISD::SRA_PARTS, VT, Expand); |
| |
| setOperationAction(ISD::BSWAP, VT, Expand); |
| setOperationAction(ISD::CTPOP, VT, Legal); |
| setOperationAction(ISD::CTLZ, VT, Legal); |
| setOperationAction(ISD::CTTZ, VT, Expand); |
| } |
| |
| // If VMul i32 is not natively supported, we need to emulate it. |
| if (!IsSC && !IsVFTC) |
| setOperationAction(ISD::MUL, VNI32, Custom); |
| // For Jellyfish, do a custom lowering of i32 MUL. |
| if (!ST->hasSMul32()) |
| setOperationAction(ISD::MUL, MVT::i32, Custom); |
| setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); |
| setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); |
| setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); |
| |
| setOperationAction(ISD::ConstantFP, MVT::f32, Legal); |
| if (IsVFTC || IsSC) { |
| setOperationAction(ISD::UMAX, MVT::i32, Legal); |
| setOperationAction(ISD::UMIN, MVT::i32, Legal); |
| setOperationAction(ISD::FFLOOR, MVT::f32, Legal); |
| setOperationAction(ISD::FCEIL, MVT::f32, Legal); |
| } // else we will fail lowering. |
| setOperationAction(ISD::FNEG, MVT::f32, Legal); |
| setOperationAction(ISD::FNEG, VNF32, Legal); |
| |
| // Extended load operations for i1 types must be promoted |
| for (MVT VT : MVT::integer_valuetypes()) { |
| setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); |
| setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); |
| setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); |
| } |
| setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal); |
| setOperationAction(ISD::FMINIMUM, MVT::f32, Legal); |
| setOperationAction(ISD::FMAXIMUM, VNF32, Legal); |
| setOperationAction(ISD::FMINIMUM, VNF32, Legal); |
| setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal); |
| setOperationAction(ISD::FCOPYSIGN, VNF32, Legal); |
| if (HasLPGL) { |
| setOperationAction(ISD::FMAXIMUM, VNBF16, Legal); |
| setOperationAction(ISD::FMINIMUM, VNBF16, Legal); |
| } |
| |
| // Unordered comparisons not supported. |
| setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); |
| setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); |
| setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); |
| setCondCodeAction(ISD::SETULT, MVT::f32, Expand); |
| setCondCodeAction(ISD::SETULE, MVT::f32, Expand); |
| setCondCodeAction(ISD::SETONE, MVT::f32, Expand); |
| setCondCodeAction(ISD::SETUO, MVT::f32, Expand); |
| setCondCodeAction(ISD::SETO, MVT::f32, Expand); |
| setCondCodeAction(ISD::SETUEQ, VNF32, Expand); |
| setCondCodeAction(ISD::SETUGT, VNF32, Expand); |
| setCondCodeAction(ISD::SETUGE, VNF32, Expand); |
| setCondCodeAction(ISD::SETULT, VNF32, Expand); |
| setCondCodeAction(ISD::SETULE, VNF32, Expand); |
| setCondCodeAction(ISD::SETONE, VNF32, Expand); |
| setCondCodeAction(ISD::SETUO, VNF32, Expand); |
| setCondCodeAction(ISD::SETO, VNF32, Expand); |
| if (HasLPGL) { |
| setCondCodeAction(ISD::SETUEQ, VNBF16, Expand); |
| setCondCodeAction(ISD::SETUGT, VNBF16, Expand); |
| setCondCodeAction(ISD::SETUGE, VNBF16, Expand); |
| setCondCodeAction(ISD::SETULT, VNBF16, Expand); |
| setCondCodeAction(ISD::SETULE, VNBF16, Expand); |
| setCondCodeAction(ISD::SETONE, VNBF16, Expand); |
| setCondCodeAction(ISD::SETUO, VNBF16, Expand); |
| setCondCodeAction(ISD::SETO, VNBF16, Expand); |
| } |
| if (HasVMinMax) { |
| setOperationAction(ISD::UMAX, VNI32, Legal); |
| setOperationAction(ISD::UMIN, VNI32, Legal); |
| if (HasLPGL) { |
| setOperationAction(ISD::UMAX, VNI16, Legal); |
| setOperationAction(ISD::UMIN, VNI16, Legal); |
| } |
| } |
| |
| // Unsigned scalar comparisons supported for VF and SC subtargets. |
| LegalizeAction UnsignedCmpLegalizeAction = Custom; |
| if (ST->hasUnsignedScalarCompare()) { |
| UnsignedCmpLegalizeAction = Legal; |
| } |
| setCondCodeAction(ISD::SETUGT, MVT::i32, UnsignedCmpLegalizeAction); |
| setCondCodeAction(ISD::SETUGE, MVT::i32, UnsignedCmpLegalizeAction); |
| setCondCodeAction(ISD::SETULT, MVT::i32, UnsignedCmpLegalizeAction); |
| setCondCodeAction(ISD::SETULE, MVT::i32, UnsignedCmpLegalizeAction); |
| |
| // Unsigned vector comparisons supported for SC subtargets. |
| UnsignedCmpLegalizeAction = Custom; |
| if (ST->hasUnsignedVectorCompare()) { |
| UnsignedCmpLegalizeAction = Legal; |
| } |
| setCondCodeAction(ISD::SETUGT, VNI32, UnsignedCmpLegalizeAction); |
| setCondCodeAction(ISD::SETUGE, VNI32, UnsignedCmpLegalizeAction); |
| setCondCodeAction(ISD::SETULT, VNI32, UnsignedCmpLegalizeAction); |
| setCondCodeAction(ISD::SETULE, VNI32, UnsignedCmpLegalizeAction); |
| |
| setTargetDAGCombine(ISD::BUILD_VECTOR); |
| setTargetDAGCombine(ISD::VECTOR_SHUFFLE); |
| setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); |
| setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); |
| setTargetDAGCombine(ISD::VSELECT); |
| setTargetDAGCombine(ISD::SETCC); |
| setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); |
| |
| // We could match this during isel in tablegen, but we want a bit more |
| // control. |
| setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); |
| |
| // Function alignments. |
| setMinFunctionAlignment(Align(2)); |
| setPrefFunctionAlignment(Align(2)); |
| |
| setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); |
| |
| setJumpIsExpensive(false); |
| |
| // TODO(jmolloy): This is a hangover from Lanai. Evaluate if jumptables are |
| // needed or useful. |
| setMinimumJumpTableEntries(100); |
| |
| // We'd run into trouble with pointer word sizes if we let native selection |
| // DAG lower these. |
| MaxStoresPerMemset = 0; // For @llvm.memset -> sequence of stores |
| MaxStoresPerMemsetOptSize = 0; |
| MaxStoresPerMemcpy = 0; // For @llvm.memcpy -> sequence of stores |
| MaxStoresPerMemcpyOptSize = 0; |
| MaxStoresPerMemmove = 0; // For @llvm.memmove -> sequence of stores |
| MaxStoresPerMemmoveOptSize = 0; |
| |
| // Booleans always contain 0 or 1. |
| setBooleanContents(ZeroOrOneBooleanContent); |
| } |
| |
| SDValue TPUTargetLowering::LowerGlobalAddress(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDLoc DL(Op); |
| const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); |
| int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); |
| |
| SDValue TargetAddr = DAG.getTargetGlobalAddress( |
| GV, DL, getPointerTy(DAG.getDataLayout()), Offset); |
| return DAG.getNode(TPUISD::WRAPPER, DL, MVT::i32, TargetAddr); |
| } |
| |
| SDValue TPUTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { |
| CondCodeSDNode *Cond = cast<CondCodeSDNode>(Op.getOperand(2).getNode()); |
| assert(!Op.getOperand(0).getValueType().isFloatingPoint() && |
| ISD::isUnsignedIntSetCC(Cond->get()) && |
| "Comparisons involving floating-point and signed-int types should not " |
| "be custom lowered as they are either expanded or legal."); |
| |
| ISD::CondCode SignedCond; |
| switch (Cond->get()) { |
| default: |
| llvm_unreachable("Unknown signed condcode?"); |
| case ISD::CondCode::SETULT: |
| SignedCond = ISD::CondCode::SETLT; |
| break; |
| case ISD::CondCode::SETULE: |
| SignedCond = ISD::CondCode::SETLE; |
| break; |
| case ISD::CondCode::SETUGT: |
| SignedCond = ISD::CondCode::SETGT; |
| break; |
| case ISD::CondCode::SETUGE: |
| SignedCond = ISD::CondCode::SETGE; |
| break; |
| } |
| SDLoc DL(Op); |
| |
| // Generate unsigned setcc as: |
| // %x = setcc signed %a, %b |
| // %y = xor %a, %b // one if bitwise different. |
| // %z = setcc slt %y, 0 // sign bit different? |
| // xor %x, %z |
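| // Worked example (illustrative only): for %a = 0x80000000, %b = 1 and |
| // SETULT, the signed compare yields %x = 1 (INT_MIN < 1), the sign bits |
| // differ so %z = 1, and %x xor %z = 0, which matches the unsigned result. |
| // When the sign bits agree, %z = 0 and the signed result is already right. |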
| EVT VT = Op.getValueType(); |
| EVT InputVT = Op.getOperand(0).getValueType(); |
| SDValue X = |
| DAG.getSetCC(DL, VT, Op.getOperand(0), Op.getOperand(1), SignedCond); |
| SDValue Y = DAG.getNode(ISD::XOR, DL, InputVT, Op.getOperand(0), |
| Op.getOperand(1)); |
| SDValue Z = DAG.getSetCC(DL, VT, Y, DAG.getConstant(0, DL, InputVT), |
| ISD::CondCode::SETLT); |
| return DAG.getNode(ISD::XOR, DL, VT, X, Z); |
| } |
| |
| SDValue TPUTargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { |
| SDLoc DL(Op); |
| assert(ST->isSparseCore()); |
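| // Lower f32 fdiv on SparseCore as x * (1/y): splat y into a vector, push it |
| // through the EUP reciprocal fifo (VRCP push, VRES_EUP pop), read lane 0 of |
| // the result back into a scalar, and multiply it with x. |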
| SDValue X = Op.getOperand(0); |
| SDValue Y = Op.getOperand(1); |
| SDValue Splat = DAG.getNode(TPUISD::SPLAT, DL, VNF32, Y); |
| auto PredReg = DAG.getRegister(TPU::Palways, MVT::i1); |
| auto PredInvert = DAG.getTargetConstant(APInt(1, 0), DL, MVT::i1); |
| MachineSDNode *VRcpPush = |
| DAG.getMachineNode(TPU::VRCP, DL, MVT::i32, {Splat, PredReg, PredInvert}); |
| addTPUMemOperand(DAG, VRcpPush, /*IsPush=*/true, &TPU::ERFPRRegClass); |
| MachineSDNode *VRcpPop = DAG.getMachineNode( |
| TPU::VRES_EUP, DL, VNF32, {SDValue(VRcpPush, 0), PredReg, PredInvert}); |
| addTPUMemOperand(DAG, VRcpPop, /*IsPush=*/false, &TPU::ERFPRRegClass); |
| SDValue Srcp = |
| SDValue(DAG.getMachineNode(TPU::scVREADr, SDLoc(Op), MVT::f32, |
| {SDValue(VRcpPop, 0), |
| DAG.getTargetConstant(0, DL, MVT::i32), |
| PredReg, PredInvert}), |
| 0); |
| SDValue FDivRes = DAG.getNode(ISD::FMUL, DL, MVT::f32, X, Srcp); |
| return FDivRes; |
| } |
| |
| SDValue TPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const { |
| SDLoc DL(Op); |
| SDValue X = Op.getOperand(0); |
| SDValue Y = Op.getOperand(1); |
| assert(Op.getValueType() == MVT::i32 && "Expected i32 when Lowering SDIV32"); |
| |
| // To emulate signed division, we: |
| // 1. Take the absolute value of the operands |
| // 2. Perform an unsigned divide of the operands |
| // 3. Possibly negate the result of (2.). |
| unsigned UnsignedOpCode; |
| switch (Op.getOpcode()) { |
| default: |
| llvm_unreachable("Unknown signed divrem opcode"); |
| case ISD::SDIV: |
| UnsignedOpCode = ISD::UDIV; |
| break; |
| case ISD::SREM: |
| UnsignedOpCode = ISD::UREM; |
| break; |
| } |
| |
| EVT VT = Op.getValueType(); |
| |
| // 1. Compute abs(x), abs(y): abs(x) = (x ^ (x >> 31)) - (x >> 31) |
| // |
| // Note: we do this slightly differently than LLO, which uses |
| // compares+selects, but we end up with the same number of instructions. |
| // http://google3/platforms/xla/service/jellyfish/llo_region_builder.cc?l=950&rcl=378412916 |
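| // Worked example (illustrative only): x = -5 gives x >> 31 = -1 (all ones), |
| // so x ^ -1 = 4 and 4 - (-1) = 5 = abs(-5); for x = 5, x >> 31 = 0 and the |
| // value passes through unchanged. |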
| SDValue XMask = DAG.getNode(ISD::SRA, DL, VT, X, DAG.getConstant(31, DL, VT)); |
| SDValue YMask = DAG.getNode(ISD::SRA, DL, VT, Y, DAG.getConstant(31, DL, VT)); |
| |
| SDValue XInv = DAG.getNode(ISD::XOR, DL, VT, X, XMask); |
| SDValue YInv = DAG.getNode(ISD::XOR, DL, VT, Y, YMask); |
| |
| SDValue XAbs = DAG.getNode(ISD::SUB, DL, VT, XInv, XMask); |
| SDValue YAbs = DAG.getNode(ISD::SUB, DL, VT, YInv, YMask); |
| |
| // 2. Compute unsigned div/rem. |
| SDValue AbsResult = DAG.getNode(UnsignedOpCode, DL, VT, XAbs, YAbs); |
| |
| // 3. Possibly negate the result of the unsigned div/rem. |
| SDValue SignMask; |
| if (Op.getOpcode() == ISD::SDIV) { |
| SignMask = DAG.getNode(ISD::XOR, DL, VT, XMask, YMask); |
| } else { |
| // For rem, the sign is determined by the dividend (X), defined the same way |
| // as the remainder operator % in C: |
| // (a % b) == a - (a / b) * b |
| SignMask = XMask; |
| } |
| // SignMask is either all zeros or all ones (in which case the result should |
| // be negative). When it is all ones, we can use this mask to negate the two's |
| // complement result similar to finding abs(x): |
| // result = (abs_result ^ mask) - mask |
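| // Worked example (illustrative only): -7 / 2 computes 7 / 2 = 3 with |
| // SignMask = -1, and (3 ^ -1) - (-1) = -4 + 1 = -3, matching C's truncation |
| // toward zero; likewise -7 % 2 yields (1 ^ -1) - (-1) = -1. |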
| SDValue AbsResultInv = DAG.getNode(ISD::XOR, DL, VT, AbsResult, SignMask); |
| SDValue SignedResult = DAG.getNode(ISD::SUB, DL, VT, AbsResultInv, SignMask); |
| |
| return SignedResult; |
| } |
| |
| SDValue TPUTargetLowering::LowerMUL32(SDValue Op, SelectionDAG &DAG) const { |
| SDLoc DL(Op); |
| SDValue X = Op.getOperand(0); |
| SDValue Y = Op.getOperand(1); |
| assert(Op.getValueType() == MVT::i32 && "Expected i32 when Lowering MUL32"); |
| // Expand a MUL i32 operation using the UMUL24 node for Jellyfish. |
| // The decomposition looks like: |
| // c = mul i32 a, b |
| // --> |
| // ll = umul24 i32 a, b |
| // al = srl i32 a, 24 |
| // bl = srl i32 b, 24 |
| // lh = umul24 i32 al, b |
| // hl = umul24 i32 a, bl |
| // sum = add i32 lh, hl |
| // shiftsum = shl i32 sum, 24 |
| // c = add i32 shiftsum, ll |
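| // The high x high partial product (a >> 24) * (b >> 24) is omitted because |
| // it only contributes to bits 48 and above, which cannot affect a 32-bit |
| // result; the remaining terms are sufficient modulo 2^32. |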
| EVT VT = Op.getValueType(); |
| KnownBits KBX = DAG.computeKnownBits(X); |
| KnownBits KBY = DAG.computeKnownBits(Y); |
| bool X_is_24bit = (KBX.Zero & 0xFF000000U) == 0xFF000000U; |
| bool Y_is_24bit = (KBY.Zero & 0xFF000000U) == 0xFF000000U; |
| // The smul.u24 instruction automatically zeroes out the upper bits of its |
| // operands, which saves us the need to do it ourselves. |
| SDValue Low_Low = DAG.getNode(TPUISD::UMUL24, DL, VT, X, Y); |
| SDValue High_Low, Low_High; |
| if (!X_is_24bit) { |
| SDValue HighX = |
| DAG.getNode(ISD::SRL, DL, VT, X, DAG.getConstant(24, DL, VT)); |
| High_Low = DAG.getNode(TPUISD::UMUL24, DL, VT, HighX, Y); |
| } |
| if (!Y_is_24bit) { |
| SDValue HighY = |
| DAG.getNode(ISD::SRL, DL, VT, Y, DAG.getConstant(24, DL, VT)); |
| Low_High = DAG.getNode(TPUISD::UMUL24, DL, VT, X, HighY); |
| } |
| SDValue MixedSum; |
| if (High_Low && Low_High) { |
| MixedSum = DAG.getNode(ISD::ADD, DL, VT, High_Low, Low_High); |
| } else if (High_Low) { |
| MixedSum = High_Low; |
| } else if (Low_High) { |
| MixedSum = Low_High; |
| } else { |
| return Low_Low; |
| } |
| SDValue ShiftedSum = |
| DAG.getNode(ISD::SHL, DL, VT, MixedSum, DAG.getConstant(24, DL, VT)); |
| return DAG.getNode(ISD::ADD, DL, VT, Low_Low, ShiftedSum); |
| } |
| |
| // Handle the lowering of the simple cases where one operand is a constant. |
| // This uses non-adjacent form (NAF). |
| SDValue TPUTargetLowering::SimpleEmulVMUL32(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDLoc DL(Op); |
| SDValue X = Op.getOperand(0); |
| SDValue Y = Op.getOperand(1); |
| if (Y.getOpcode() != TPUISD::SPLAT) |
| return SDValue(); |
| ConstantSDNode *C = isConstOrConstSplat(Y.getOperand(0)); |
| if (C == nullptr) |
| return SDValue(); |
| int M = C->getZExtValue(); |
| int HighestOne = -1; |
| int NonZeroEntries = 0; |
| std::array<int, 32> SignedDigit; |
| SignedDigit.fill(0); |
| |
| // The following algorithm is taken from: |
| // https://en.wikipedia.org/wiki/Non-adjacent_form |
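| // Worked example (illustrative only): M = 7 produces the NAF digits |
| // {-1, 0, 0, 1} (7 = 8 - 1), so the multiply is emitted as (X << 3) - X. |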
| int64_t e = std::abs(M); |
| const int s = M < 0 ? -1 : 1; |
| int i = 0; |
| while (e > 0) { |
| int zi = 0; |
| if (e % 2 == 1) { |
| zi = 2 - (e % 4); |
| if (zi != 0) { |
| ++NonZeroEntries; |
| } |
| } |
| SignedDigit[i] = s * zi; |
| if (SignedDigit[i] == 1) { |
| HighestOne = i; |
| } |
| e = (e - zi) / 2; |
| ++i; |
| } |
| |
| // Initialize the running sum to the maximal shifted positive term, i.e. |
| // X << i for the largest i such that SignedDigit[i] == 1 in the NAF of the |
| // multiplier. |
| SDValue Res; |
| if (HighestOne == -1) { |
| Res = |
| DAG.getNode(TPUISD::SPLAT, DL, VNI32, DAG.getConstant(0, DL, MVT::i32)); |
| } else { |
| Res = DAG.getNode(TPUISD::SPLAT, DL, VNI32, |
| DAG.getConstant(HighestOne, DL, MVT::i32)); |
| Res = DAG.getNode(ISD::SHL, DL, VNI32, X, Res); |
| SignedDigit[HighestOne] = 0; |
| } |
| |
| // Assemble multiplication from shift, add, sub using NAF form and |
| // running sum. |
| for (size_t i = 0; i < SignedDigit.size(); ++i) { |
| if (SignedDigit[i] == 0) { |
| continue; |
| } |
| |
| SDValue op = X; |
| // Shifted multiplicand (v<<i). |
| if (i > 0) { |
| SDValue I = DAG.getNode(TPUISD::SPLAT, DL, VNI32, |
| DAG.getConstant(i, DL, MVT::i32)); |
| op = DAG.getNode(ISD::SHL, DL, VNI32, X, I); |
| } |
| if (SignedDigit[i] == 1) { |
| Res = DAG.getNode(ISD::ADD, DL, VNI32, Res, op); |
| } else if (SignedDigit[i] == -1) { |
| Res = DAG.getNode(ISD::SUB, DL, VNI32, Res, op); |
| } |
| } |
| return Res; |
| } |
| |
| // Logic to lower VMUL32, copied from the LLO region builder. |
| SDValue TPUTargetLowering::LowerVMUL32(SDValue Op, SelectionDAG &DAG) const { |
| if (SDValue V = SimpleEmulVMUL32(Op, DAG)) |
| return V; |
| SDLoc DL(Op); |
| SDValue lhs = Op.getOperand(0); |
| SDValue rhs = Op.getOperand(1); |
| // Multiword multiplication. Splits each input into three words of at most |
| // 11 bits and uses VmulU11, an fmul-based 11 bit x 11 bit -> 22 bit integer |
| // multiply primitive, to form the partial products without losing precision. |
| // Generates code: |
| // uint32 u0 = u & 0x7FF; |
| // uint32 u1 = (u >> 11) & 0x7FF; |
| // uint32 u2 = u >> 22; |
| // uint32 v0 = v & 0x7FF; |
| // uint32 v1 = (v >> 11) & 0x7FF; |
| // uint32 v2 = v >> 22; |
| // return u0 * v0 + ((u1 * v0 + u0 * v1) << 11) + |
| // ((u0 * v2 + u1 * v1 + u2 * v0) << 22); |
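| // This is exact: every partial product is at most 22 bits and f32 carries |
| // 24 bits of significand, so the float multiplies inside VmulU11 never |
| // round. Terms at 2^33 and above (u1 * v2, u2 * v1, u2 * v2) are dropped |
| // because they cannot affect the low 32 bits of the result. |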
| auto VSplatImm32 = [&](int I) { |
| return DAG.getNode(TPUISD::SPLAT, DL, VNI32, |
| DAG.getConstant(I, DL, MVT::i32)); |
| }; |
| auto VandU32 = [&](SDValue X, SDValue Y) { |
| return DAG.getNode(ISD::AND, DL, VNI32, X, Y); |
| }; |
| auto VaddS32 = [&](SDValue X, SDValue Y) { |
| return DAG.getNode(ISD::ADD, DL, VNI32, X, Y); |
| }; |
| auto VshrlU32 = [&](SDValue X, SDValue Y) { |
| return DAG.getNode(ISD::SRL, DL, VNI32, X, Y); |
| }; |
| auto VshllU32 = [&](SDValue X, SDValue Y) { |
| return DAG.getNode(ISD::SHL, DL, VNI32, X, Y); |
| }; |
| auto VcvtS32ToF32 = [&](SDValue X) { |
| return DAG.getNode(ISD::SINT_TO_FP, DL, VNF32, X); |
| }; |
| |
| // Computes int32(x * y). We use this as an 11 bit x 11 bit -> 22 bit integer |
| // multiplication primitive without losing precision. |
| auto VmulU11 = [&](SDValue Lhs, SDValue Rhs) { |
| auto Product = |
| DAG.getNode(ISD::FMUL, DL, VNF32, VcvtS32ToF32(Lhs), VcvtS32ToF32(Rhs)); |
| return DAG.getNode(ISD::FP_TO_SINT, DL, VNI32, Product); |
| }; |
| |
| auto mask = VSplatImm32(0x7FF); |
| auto k11 = VSplatImm32(11); |
| auto k22 = VSplatImm32(22); |
| |
| auto u0 = VandU32(lhs, mask); |
| auto u1 = VandU32(VshrlU32(lhs, k11), mask); |
| auto u2 = VshrlU32(lhs, k22); |
| |
| auto v0 = VandU32(rhs, mask); |
| auto v1 = VandU32(VshrlU32(rhs, k11), mask); |
| auto v2 = VshrlU32(rhs, k22); |
| |
| auto w0 = VmulU11(u0, v0); |
| |
| auto w1 = VmulU11(u1, v0); |
| w1 = VaddS32(w1, VmulU11(u0, v1)); |
| w1 = VshllU32(w1, k11); |
| |
| auto w2 = VmulU11(u0, v2); |
| w2 = VaddS32(w2, VmulU11(u1, v1)); |
| w2 = VaddS32(w2, VmulU11(u2, v0)); |
| w2 = VshllU32(w2, k22); |
| |
| return VaddS32(VaddS32(w0, w1), w2); |
| } |
| |
| SDValue TPUTargetLowering::LowerADDRSPACECAST(SDValue Op, |
| SelectionDAG &DAG) const { |
| SDLoc dl(Op); |
| const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op); |
| unsigned SrcAS = ASC->getSrcAddressSpace(); |
| unsigned DestAS = ASC->getDestAddressSpace(); |
| if ((SrcAS == TPUAS_Smem && DestAS == TPUAS_SmemAny) || |
| (SrcAS == TPUAS_Hbm && DestAS == TPUAS_HbmAny) || |
| (SrcAS == TPUAS_Sflag && DestAS == TPUAS_SflagAny) || |
| (SrcAS == TPUAS_Sflag && DestAS == TPUAS_SflagTile)) { |
| return DAG.getNode(ISD::BITCAST, dl, MVT::i32, ASC->getOperand(0)); |
| } |
| if (!TPUVerifierStrictIntoPtr) |
| return DAG.getNode(ISD::BITCAST, dl, MVT::i32, ASC->getOperand(0)); |
| report_fatal_error("Unsupported addrspace cast " + Twine(SrcAS) + "->" + |
| Twine(DestAS) + ".\n"); |
| } |
| |
| SDValue TPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { |
| switch (Op.getOpcode()) { |
| default: |
| llvm_unreachable("unimplemented operand"); |
| case ISD::SETCC: |
| return LowerSETCC(Op, DAG); |
| case ISD::GlobalAddress: |
| return LowerGlobalAddress(Op, DAG); |
| case ISD::SDIV: |
| case ISD::SREM: |
| return LowerSDIV32(Op, DAG); |
| case ISD::FDIV: |
| if (!ST->hasVPU()) |
| llvm_unreachable("fdiv on scalar core is not supported."); |
| return LowerFDIV32(Op, DAG); |
| case ISD::MUL: { |
| if (Op.getValueType() == MVT::i32) |
| return LowerMUL32(Op, DAG); |
| return LowerVMUL32(Op, DAG); |
| } |
| case ISD::ADDRSPACECAST: |
| return LowerADDRSPACECAST(Op, DAG); |
| case TPUISD::SPLAT: |
| // We're doing some specific type checking, because this is a special case |
| // for MVT::v32i8 when the DAG legalizer tries to promote MVT::i8. |
| if (isTypeLegal(Op->getOperand(0).getValueType())) |
| llvm_unreachable( |
| "This should only happen if the splat element isn't legal."); |
| EVT VT = Op->getOperand(0).getValueType(); |
| if (!VT.isSimple() || !VT.isInteger() || VT != MVT::i8) |
| llvm_unreachable("This should only happen on scalar type MVT::i8, " |
| "which is being promoted."); |
| // We're promoting the MVT::i8 Splat element and will match it later. |
| return DAG.getNode( |
| TPUISD::SPLAT, SDLoc(Op), Op->getSimpleValueType(0), |
| DAG.getTargetConstant(Op->getConstantOperandAPInt(0).zext(32), |
| SDLoc(Op), MVT::i32)); |
| } |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Calling Convention Implementation |
| //===----------------------------------------------------------------------===// |
| |
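| // Returns true if VT is an i1 vector held in a mask register, i.e. its |
| // total bit count differs from the full vector register width (unlike the |
| // packed low-precision i1 vector types, which occupy a whole VPR). |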
| static bool isMaskVT(MVT VT, const TPUSubtarget &ST) { |
| return VT.getScalarType() == MVT::i1 && |
| /* This check is for real low precision i1 types */ |
| VT.getSizeInBits() != 8 * ST.vectorSizeInBytes(); |
| } |
| |
| // Custom version of CCInfo.AnalyzeFormalArguments that supports separate |
| // scalar and vector stacks. It rewrites the memory offsets in ArgLocs so |
| // that each offset is relative to its own stack. |
| static void analyzeFormalArguments(const TPUTargetLowering &TLI, |
| const TPUSubtarget *ST, |
| const SmallVectorImpl<ISD::InputArg> &Ins, |
| CCState &CCInfo, |
| SmallVector<CCValAssign, 16> &ArgLocs) { |
| int NumBytesScalar = 0; |
| int NumBytesVector = 0; |
| unsigned NumArgs = Ins.size(); |
| for (unsigned i = 0; i != NumArgs; ++i) { |
| MVT ArgVT = Ins[i].VT; |
| ISD::ArgFlagsTy ArgFlags = Ins[i].Flags; |
| int PrevNumBytes = CCInfo.getNextStackOffset(); |
| if (CC_TPU(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo)) |
| report_fatal_error("unable to allocate function argument #" + Twine(i)); |
| if (!ST->isTPUABIEnabled() || CCInfo.getCallingConv() == CallingConv::Fast) |
| continue; |
| CCValAssign &CCV = ArgLocs[i]; |
| if (int StackOffsetDelta = CCInfo.getNextStackOffset() - PrevNumBytes) { |
| if (ArgVT.isVector()) { |
| assert(ST->hasVPU()); |
| // This is a trick using the API in order to adjust the LocMemOffset, |
| // because we have two separate stacks for scalar and vector. |
| if (isMaskVT(ArgVT, *ST)) { |
| int AlignedStackOffsetDelta = |
| alignTo(StackOffsetDelta, ST->vectorSizeInBytes()); |
| StackOffsetDelta = AlignedStackOffsetDelta; |
| } |
| assert(StackOffsetDelta == ST->vectorSizeInBytes()); |
| CCV.convertToMem(NumBytesVector); |
| NumBytesVector += StackOffsetDelta; |
| } else { |
| // Same comment as above. |
| CCV.convertToMem(NumBytesScalar); |
| NumBytesScalar += StackOffsetDelta; |
| } |
| } |
| } |
| } |
| |
| // Transform physical registers into virtual registers and |
| // generate load operations for arguments placed on the stack. |
| SDValue TPUTargetLowering::LowerFormalArguments( |
| SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, |
| const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, |
| SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { |
| switch (CallConv) { |
| case CallingConv::Fast: |
| case CallingConv::C: |
| break; |
| default: |
| report_fatal_error("Unsupported calling convention"); |
| } |
| |
| MachineFunction &MF = DAG.getMachineFunction(); |
| MachineRegisterInfo &RegInfo = MF.getRegInfo(); |
| |
| if (ST->isTPUABIEnabled()) |
| RegInfo.addLiveIn(TPU::LR); |
| |
| // Assign locations to all of the incoming arguments. |
| SmallVector<CCValAssign, 16> ArgLocs; |
| CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, |
| *DAG.getContext()); |
| analyzeFormalArguments(*this, ST, Ins, CCInfo, ArgLocs); |
| |
| DenseMap<unsigned, SmallVector<Register, 4>> OrigArgToRegLoc; |
| for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { |
| CCValAssign &VA = ArgLocs[i]; |
| assert(!MF.getFunction().hasStructRetAttr()); |
| assert(!IsVarArg); |
| assert(VA.getLocInfo() == CCValAssign::Full); |
| EVT VT = VA.getLocVT(); |
| Register VirtReg; |
| switch (VT.getSimpleVT().SimpleTy) { |
| default: |
| llvm_unreachable("Unhandled type in call lowering!"); |
| case MVT::i8: |
| case MVT::i16: |
| case MVT::i32: |
| case MVT::f32: |
| VirtReg = RegInfo.createVirtualRegister(&TPU::GPRRegClass); |
| break; |
| case MVT::i1: |
| assert(!ST->isTPUABIEnabled()); |
| VirtReg = RegInfo.createVirtualRegister(&TPU::PPRRegClass); |
| break; |
| case MVT::x86mmx: |
| assert(ST->hasVPU()); |
| VirtReg = RegInfo.createVirtualRegister(&TPU::CBRRegClass); |
| break; |
| case MVT::v8i32: |
| case MVT::v8f32: |
| case MVT::v16bf16: |
| case MVT::v16f16: |
| case MVT::v16i16: |
| case MVT::v32i8: |
| case MVT::v64i4: |
| case MVT::v128i2: |
| case MVT::v256i1: |
| case MVT::v16i32: |
| case MVT::v16f32: |
| case MVT::v32bf16: |
| case MVT::v32f16: |
| case MVT::v32i16: |
| case MVT::v64i8: |
| case MVT::v128i4: |
| case MVT::v256i2: |
| case MVT::v512i1: |
| case MVT::v1024i32: |
| case MVT::v1024f32: |
| assert(ST->hasVPU()); |
| if (IsBC && TPU::VAGGRegClass.contains(VA.getLocReg())) { |
| assert(!ST->isTPUABIEnabled()); |
| VirtReg = RegInfo.createVirtualRegister(&TPU::VAGGRegClass); |
| } else { |
| VirtReg = RegInfo.createVirtualRegister(&TPU::VPRRegClass); |
| } |
| break; |
| case MVT::v64i1: |
| assert(ST->hasVPU()); |
| if (ST->hasV8()) |
| llvm_unreachable("Unexpected mask type."); |
| VirtReg = RegInfo.createVirtualRegister(&TPU::MPRRegClass); |
| break; |
| case MVT::v16i1: |
| assert(ST->hasVPU()); |
| VirtReg = RegInfo.createVirtualRegister(&TPU::MPRRegClass); |
| break; |
| case MVT::v32i1: |
| assert(ST->hasVPU()); |
| if (ST->hasV8() && !HasLPGL) |
| llvm_unreachable("Needs +lp."); |
| VirtReg = RegInfo.createVirtualRegister(&TPU::MPRRegClass); |
| break; |
| case MVT::v8i1: |
| case MVT::v1024i1: |
| assert(ST->hasVPU()); |
| VirtReg = RegInfo.createVirtualRegister(&TPU::MPRRegClass); |
| break; |
| } |
| if (VA.isRegLoc()) { |
| OrigArgToRegLoc[Ins[i].getOrigArgIndex()].push_back(VA.getLocReg()); |
| RegInfo.addLiveIn(VA.getLocReg(), VirtReg); |
| InVals.push_back(DAG.getCopyFromReg(Chain, DL, VirtReg, VT)); |
| } else { // VA.isRegLoc() |
| assert(VA.isMemLoc()); |
| assert(!VA.needsCustom()); |
| MachineFunction &MF = DAG.getMachineFunction(); |
| unsigned LocMemOffset = VA.getLocMemOffset(); |
| // In order to make it easier for the callee, the stack pointer in the |
| // caller is incremented such that it points to a free slot in the callee |
| // for the return address. Adjust the argument offsets here accordingly. |
| if (!VA.getValVT().isVector()) |
| LocMemOffset += ST->scalarSizeInBytes(); |
| unsigned AdjustedLocMemOffset = |
| TPU::adjustForWordSize( |
| APInt(32, LocMemOffset), |
| VA.getValVT().isVector() ? TPUAS_TileSpmem : TPUAS_Smem, *ST) |
| .getZExtValue(); |
| MachineFrameInfo &MFI = MF.getFrameInfo(); |
| EVT ArgVT = Ins[i].ArgVT; |
| int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), AdjustedLocMemOffset, |
| /*IsImmutable=*/false); |
| auto PtrVT = getPointerTy(DAG.getDataLayout()); |
| unsigned Opcode; |
| SDValue StackPtr; |
| if (isMaskVT(VA.getValVT(), *ST)) { |
| assert(ST->hasVPU()); |
| Opcode = TPU::RESTORE_MPRs; |
| StackPtr = DAG.getRegister(TPU::FPV, MVT::i32); |
| } else if (VA.getValVT().isVector()) { |
| assert(ST->hasVPU()); |
| Opcode = TPU::RESTORE_VPRs; |
| StackPtr = DAG.getRegister(TPU::FPV, MVT::i32); |
| } else { |
| Opcode = TPU::RESTORE_GPRs; |
| StackPtr = DAG.getRegister(TPU::FPS, MVT::i32); |
| } |
| SmallVector<SDValue, 8> Ops; |
| SDValue TFI = DAG.getTargetFrameIndex(FI, PtrVT); |
| auto PredReg = DAG.getRegister(TPU::Palways, MVT::i1); |
| auto PredInvert = DAG.getTargetConstant(APInt(1, 0), DL, MVT::i1); |
| Ops.push_back(StackPtr); |
| Ops.push_back(TFI); |
| Ops.push_back(PredReg); |
| Ops.push_back(PredInvert); |
| MVT ValVT = VA.getValVT(); |
| MachineSDNode *MN = DAG.getMachineNode( |
| Opcode, DL, isMaskVT(ValVT, *ST) ? VMNI1 : ValVT, Ops); |
| auto *MemRef = DAG.getMachineFunction().getMachineMemOperand( |
| MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), |
| MachineMemOperand::MOLoad, /*s=*/4, |
| /*base_alignment=*/llvm::Align(4)); |
| DAG.setNodeMemRefs(MN, {MemRef}); |
| SDValue Arg = SDValue(MN, 0); |
| InVals.push_back(Arg); |
| } |
| } |
| |
| if (IsBC) { |
| // On BarnaCore, we obtain aggregates as function inputs and refer to them |
| // by their base register throughout the function. We need to block the |
| // register allocator from clobbering them. Aggregates are identified by |
| // multiple registers having the same input argument index. |
| TPUMachineFunctionInfo &MFInfo = *MF.getInfo<TPUMachineFunctionInfo>(); |
| for (auto &Range : OrigArgToRegLoc) { |
| if (Range.second.size() == 1) |
| continue; |
| // Note that we rely on the range already being sorted from above. |
| MFInfo.addBarnaCoreAggregateRange(Range.second.front() - TPU::VAGG0, |
| Range.second.back() - TPU::VAGG0 + 1); |
| } |
| } |
| return Chain; |
| } |
| |
| SDValue |
| TPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, |
| bool IsVarArg, |
| const SmallVectorImpl<ISD::OutputArg> &Outs, |
| const SmallVectorImpl<SDValue> &OutVals, |
| const SDLoc &DL, SelectionDAG &DAG) const { |
| // CCValAssign - represent the assignment of the return value to a location |
| SmallVector<CCValAssign, 16> RVLocs; |
| MachineFunction &MF = DAG.getMachineFunction(); |
| |
| // CCState - Info about the registers and stack slot. |
| CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext()); |
| |
| // Analyze return values. |
| CCInfo.AnalyzeReturn(Outs, RetCC_TPU); |
| SmallVector<SDValue, 4> RetOps(1, Chain); |
| |
| // Copy the result values into the output registers. |
| for (unsigned i = 0; i != RVLocs.size(); ++i) { |
| CCValAssign &VA = RVLocs[i]; |
| // FIXME(b/237788792): Finalize return ABI. |
| assert(VA.isRegLoc() && "Can only return in registers!"); |
| assert(!VA.needsCustom()); |
| Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVals[i], Chain); |
| RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); |
| } |
| |
| RetOps[0] = Chain; // Update chain |
| |
| // We're checking the call graph here and setting whether or not a function is |
| // an entry function. At least on our system, this is good enough. |
| TPUMachineFunctionInfo &MFInfo = *MF.getInfo<TPUMachineFunctionInfo>(); |
| // Ugly cast, CallGraph should really take a const Module. FIXME(hgreving): |
| // maybe try to change upstream. The cast here is safe because nobody will |
| // change the Module. |
| CallGraph CG(*const_cast<Module *>(MF.getMMI().getModule())); |
| const CallGraphNode *CGN = CG[&MF.getFunction()]; |
| // There's always at least one null node referencing the function. |
| if (CGN->getNumReferences() == 1) |
| MFInfo.setIsTopLevel(true); |
| else |
| MFInfo.setIsTopLevel(false); |
| |
| if (!ST->isTPUABIEnabled() || MFInfo.isTopLevel()) |
| return DAG.getNode(TPUISD::HALT, DL, MVT::Other, |
| ArrayRef<SDValue>(&RetOps[0], RetOps.size())); |
| return DAG.getNode(TPUISD::RET, DL, MVT::Other, |
| ArrayRef<SDValue>(&RetOps[0], RetOps.size())); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Custom Lowerings |
| //===----------------------------------------------------------------------===// |
| |
| SDValue TPUTargetLowering::PerformSCALAR_TO_VECTORCombine( |
| SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { |
| const SDValue &Val = N->getOperand(0); |
| MVT VecVT = N->getSimpleValueType(0); |
| |
| return DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, Val); |
| } |
| |
| SDValue TPUTargetLowering::PerformINSERT_VECTOR_ELTCombine( |
| SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { |
| const SDValue &Vec = N->getOperand(0); |
| const SDValue &Val = N->getOperand(1); |
| |
| auto PredReg = DCI.DAG.getRegister(TPU::Palways, MVT::i1); |
| auto PredInvert = DCI.DAG.getTargetConstant(APInt(1, 0), SDLoc(N), MVT::i1); |
| MVT VecVT = N->getSimpleValueType(0); |
| |
| SDValue SplatVal = DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, Val); |
| |
| SmallVector<SDValue, 8> Ops; |
| SDValue Vseq = SDValue(DCI.DAG.getMachineNode(TPU::VLANESEQ, SDLoc(N), VNI32, |
| PredReg, PredInvert), |
| 0); |
| Ops.push_back(Vseq); |
| |
| SDValue Mask; |
| if (const ConstantSDNode *Idx = |
| dyn_cast<ConstantSDNode>(N->getOperand(2).getNode())) { |
| Ops.push_back(DCI.DAG.getTargetConstant(*Idx->getConstantIntValue(), |
| SDLoc(N), MVT::i32)); |
| Ops.push_back(PredReg); |
| Ops.push_back(PredInvert); |
| Mask = |
| SDValue(DCI.DAG.getMachineNode(TPU::VMLANEi, SDLoc(N), VMNI1, Ops), 0); |
| } else { |
| Ops.push_back(SDValue(cast<SDNode>(N->getOperand(2).getNode()), 0)); |
| Ops.push_back(PredReg); |
| Ops.push_back(PredInvert); |
| Mask = |
| SDValue(DCI.DAG.getMachineNode(TPU::VMLANEr, SDLoc(N), VMNI1, Ops), 0); |
| } |
| return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), VecVT, Mask, SplatVal, Vec); |
| } |
| |
| bool TPUTargetLowering::isNonNaNFPConstSplat(SDValue N) const { |
| if (N->getOpcode() == TPUISD::SPLAT) { |
| if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) |
| return !CN->isNaN(); |
| } |
| return false; |
| } |
| |
| EVT TPUTargetLowering::getOptimalMemOpType( |
| const MemOp &Op, const AttributeList &FuncAttributes) const { |
| // We're returning something that makes sense, though it is useless since we |
| // neither know the memory space, nor can we let SelectionDAG do the LLVM |
| // MemOp lowering. See header file for explanation. |
| return VNI32; |
| } |
| |
| SDValue TPUTargetLowering::PerformSETCCCombine( |
| SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { |
| // We help the DAG combiner by recognizing ordered setcc of splats that can't |
| // be NaN. LLVM can do that for BUILD_VECTOR, but we combine early into SPLAT, |
| // hence this code. |
| if (!isNonNaNFPConstSplat(N->getOperand(0)) || |
| !isNonNaNFPConstSplat(N->getOperand(1))) |
| return SDValue(); |
| ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); |
| // TODO(hgreving): what about SETO? |
| ISD::CondCode NoNaNCC = getFCmpCodeWithoutNaN(CC); |
| if (NoNaNCC != CC) |
| return DCI.DAG.getSetCC(SDLoc(N), N->getSimpleValueType(0), |
| N->getOperand(0), N->getOperand(1), NoNaNCC); |
| return SDValue(); |
| } |
| |
| SDValue TPUTargetLowering::getSupportedVCMask(SelectionDAG &DAG, int VectorMask, |
| SDLoc Loc) const { |
| if (!ST->hasVCMasks() || !GenerateTpuVCMasks) |
| return SDValue(); |
| int MaskSizeInBits = EVT(VMNI1).getSizeInBits(); |
| int FullMask = (1 << MaskSizeInBits) - 1; |
| // Technically `< MaskSizeInBits` would be enough because a full mask should |
| // be covered by embedded masks. |
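| // The loop below matches VectorMask against every contiguous run of i ones |
| // rotated left by j lanes. For example, with an 8-lane mask, 0b00111100 |
| // matches i = 4, j = 2 and is encoded as start S = 8, end E = 23 in the |
| // VCMASKi immediate (E << 8 | S). |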
| for (int i = 1; i <= MaskSizeInBits; i++) { |
| int CompareMask = (1 << i) - 1; |
| for (int j = 0; j < MaskSizeInBits; j++) { |
| int RotCompareMask = |
| (CompareMask << j | CompareMask >> (MaskSizeInBits - j)) & FullMask; |
| if (VectorMask == RotCompareMask) { |
| int S = j * 4; |
| int E = ((i + j - 1) % MaskSizeInBits) * 4 + 3; |
| assert(S < EVT(VMNI1).getSizeInBits() * 4); |
| assert(E < EVT(VMNI1).getSizeInBits() * 4); |
| auto PredReg = DAG.getRegister(TPU::Palways, MVT::i1); |
| auto PredInvert = DAG.getTargetConstant(APInt(1, 0), Loc, MVT::i1); |
| return SDValue( |
| DAG.getMachineNode( |
| TPU::VCMASKi, Loc, VMNI1, |
| DAG.getTargetConstant(APInt(32, E << 8 | S), Loc, MVT::i32), |
| PredReg, PredInvert), |
| 0); |
| } |
| } |
| } |
| return SDValue(); |
| } |
| |
| SDValue TPUTargetLowering::getSupportedVCMask(SelectionDAG &DAG, |
| SDNode *N) const { |
| if (!ST->hasVCMasks() || !GenerateTpuVCMasks) |
| return SDValue(); |
| int MaskSizeInBits = EVT(VMNI1).getSizeInBits(); |
| if (N->getNumOperands() != MaskSizeInBits) |
| return SDValue(); |
| int BuildVectorMask = 0; |
| for (int i = 0; i < MaskSizeInBits; i++) { |
| ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i)); |
| if (C == nullptr) |
| return SDValue(); |
| BuildVectorMask |= C->getZExtValue() << i; |
| } |
| return getSupportedVCMask(DAG, BuildVectorMask, SDLoc(N)); |
| } |
| |
| SDValue TPUTargetLowering::PerformBUILD_VECTORCombine( |
| SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { |
| // Combine a BUILD_VECTOR(42, 42, 42, 42, ...) -> SPLAT(42) |
| MVT VecVT = N->getSimpleValueType(0); |
| if (VecVT != VNI32 && VecVT != VNF32 && VecVT != VMNI1) { |
| if (!HasLPGL) |
| return SDValue(); |
| if (VecVT != VNBF16 && VecVT != VNI8) |
| return SDValue(); |
| } |
| MVT ScalarVT; |
| if (VecVT == VNI32) |
| ScalarVT = MVT::i32; |
| else if (VecVT == VNF32) |
| ScalarVT = MVT::f32; |
| else if (VecVT == VNBF16) |
| ScalarVT = MVT::bf16; |
| else if (VecVT == VNI8) |
| ScalarVT = MVT::i8; |
| else if (VecVT == VMNI1) |
| // Low precision build_vector masks are currently not supported. |
| ScalarVT = MVT::i1; |
| else |
| llvm_unreachable("Bad vector ty!"); |
| |
| // Checking for supported embedded hardware masks. I would have preferred to |
| // do this in tablegen, and this would be possible with something like this: |
| // |
| // def tpuvm17 : PatLeaf<(build_vector), [{ |
| // return isMask7f(N); |
| // }]>; |
| // |
| // let Predicates = [HasV8,NotBC] in { |
| // def : Pat<(vNi1 (Splat -1)), (COPY !cast<TPUReg>("M16"))>; |
| // def : Pat<(vNi1 (tpuvm17)), (COPY !cast<TPUReg>("M17"))>; |
| // |
| // However, since we already combine BUILD_VECTOR here, we would have to check |
| // for the embedded masks here anyway and potentially bail out of the combine. |
| // Additionally, it is harder to turn on/off the feature in tablegen. Lastly, |
| // we may run into cases with instructions not supporting the special mask, in |
| // which case we probably want to legalize them, and this will be easier if we |
| // combine the hardware mask here. All of the above is the reason why the code |
| // is here, and not in tablegen. |
| // |
| if (ScalarVT == MVT::i1) { |
| Register EmbeddedMask = getSupportedEmbeddedMask(N); |
| if (EmbeddedMask != TPU::NoRegister) |
| return DCI.DAG.getCopyFromReg(DCI.DAG.getEntryNode(), SDLoc(N), |
| EmbeddedMask, VMNI1); |
| SDValue VMCreate = getSupportedVCMask(DCI.DAG, N); |
| if (VMCreate.getNode()) |
| return VMCreate; |
| } |
| |
| unsigned VecSize = MVT(VecVT).getVectorNumElements(); |
| bool IsSplat = true; |
| bool IsVlaneSeq = true; |
| assert(N->getNumOperands() == VecSize); |
| SDValue Val0 = N->getOperand(0); |
| int IC = -1; |
| if (Val0.getSimpleValueType() != ScalarVT) |
| return SDValue(); |
| for (unsigned I = 0; I < VecSize; ++I) { |
| if (N->getOperand(I) != Val0 && !N->getOperand(I).isUndef()) |
| IsSplat = false; |
| ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(N->getOperand(I)); |
| if (!ValC) { |
| IsVlaneSeq = false; |
| continue; |
| } |
| if (ValC->getZExtValue() != IC++ + 1) |
| IsVlaneSeq = false; |
| if (!IsVlaneSeq && !IsSplat) |
| break; |
| } |
| |
| if (IsSplat) |
| return DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, Val0); |
| |
| auto PredReg = DCI.DAG.getRegister(TPU::Palways, MVT::i1); |
| auto PredInvert = DCI.DAG.getTargetConstant(APInt(1, 0), SDLoc(N), MVT::i1); |
| |
| if (IsVlaneSeq) |
| return SDValue(DCI.DAG.getMachineNode(TPU::VLANESEQ, SDLoc(N), VNI32, |
| PredReg, PredInvert), |
| 0); |
| |
| // BUILD_VECTOR(a, b, c, d, ...) -> VSEL(Splat(a), ...) |
| // This is really ugly but is the only way :( |
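| // Illustrative sketch: BUILD_VECTOR(a, b, c, d) on a hypothetical 4-lane |
| // vector becomes V = splat(d), then each remaining lane is selected in via |
| // a single-lane mask: V = vselect(lane == 0, splat(a), V), and so on for b |
| // and c. |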
| |
| // Pick an initial splat value. |
| SDValue InitialSplatted = N->getOperand(VecSize - 1); |
| SDValue V = DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, InitialSplatted); |
| for (unsigned I = 0; I < VecSize; ++I) { |
| if (N->getOperand(I)->isUndef() || N->getOperand(I) == InitialSplatted) |
| continue; |
| SDValue SplatVal = |
| DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, N->getOperand(I)); |
| |
| SDValue VMCreate = getSupportedVCMask(DCI.DAG, 1 << I, SDLoc(N)); |
| SDValue Mask; |
| if (VMCreate.getNode()) { |
| Mask = VMCreate; |
| } else { |
| SmallVector<SDValue, 8> Ops; |
| SDValue Vseq = SDValue(DCI.DAG.getMachineNode(TPU::VLANESEQ, SDLoc(N), |
| VNI32, PredReg, PredInvert), |
| 0); |
| Ops.push_back(Vseq); |
| Ops.push_back(DCI.DAG.getTargetConstant(I, SDLoc(N), MVT::i32)); |
| Ops.push_back(PredReg); |
| Ops.push_back(PredInvert); |
| Mask = SDValue(DCI.DAG.getMachineNode(TPU::VMLANEi, SDLoc(N), VMNI1, Ops), |
| 0); |
| } |
| |
| // And use that mask to select-in this value. |
| V = DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), VecVT, Mask, SplatVal, V); |
| } |
| return V; |
| } |
| |
| SDValue TPUTargetLowering::PerformVECTOR_SHUFFLECombine( |
| ShuffleVectorSDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { |
| // Combine a VECTOR_SHUFFLE(1, 2, 3, 4, 5, 6, 7, 0) -> VROTDOWN() |
| // or VECTOR_SHUFFLE(VECTOR_INSERT(x,y, n), n, n, ...) -> VSPLAT(y) |
| // or VECTOR_SHUFFLE(x, x, x, x, x, x, x, x) -> VSPLAT(VROTDOWN()) |
| |
| MVT VecVT = N->getSimpleValueType(0); |
| if (VecVT != VNI32 && VecVT != VNF32 && VecVT != VMNI1) |
| return SDValue(); |
| assert(N->getNumOperands() == 2); |
| SDValue Val = N->getOperand(0); |
| |
| unsigned VecSize = MVT(VecVT).getVectorNumElements(); |
| bool IsSequence = true; |
| bool IsSame = true; |
| unsigned Offset = N->getMaskElt(0); |
| for (unsigned I = 0; I < VecSize; ++I) { |
| if (N->getMaskElt(I) != (I + Offset) % VecSize) |
| IsSequence = false; |
| if (N->getMaskElt(I) != Offset) |
| IsSame = false; |
| } |
| |
| bool NeedsTrunc = false; |
| if (VecVT == VMNI1) { |
| Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VNI32, Val); |
| VecVT = VNI32; |
| NeedsTrunc = true; |
| } |
| |
| // Helper function to truncate the result if we performed extension of the |
| // operation from i1. |
| auto TruncateReturnIfNeed = [&](SDValue V) { |
| if (NeedsTrunc) |
| return DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), VMNI1, V); |
| return V; |
| }; |
| |
| if (IsSequence && ST->isSparseCore()) |
| return TruncateReturnIfNeed( |
| DCI.DAG.getNode(TPUISD::VROTDOWN, SDLoc(N), VecVT, Val, |
| DCI.DAG.getConstant(Offset, SDLoc(N), MVT::i32))); |
| |
| if (!IsSame && ST->isSparseCore()) { |
| // SparseCore has a vector permute that permutes the elements into all lanes |
| // of a vector based on a vector mask. |
| SmallVector<SDValue, 8> MaskElements; |
| for (int El : N->getMask()) |
| MaskElements.push_back(DCI.DAG.getConstant(El, SDLoc(N), MVT::i32)); |
| SDValue VMask = |
| DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VNI32, MaskElements); |
| return TruncateReturnIfNeed( |
| DCI.DAG.getNode(TPUISD::VPERMUTE, SDLoc(N), VecVT, Val, VMask)); |
| } |
| |
| if (!IsSame) |
| return SDValue(); |
| |
| // On tensorcore we cannot use rotdown to move any element into lane 0. |
| if (!ST->isSparseCore() && Offset != 0) |
| return SDValue(); |
| |
| MVT ScalarVT = VecVT == VNI32 ? MVT::i32 : MVT::f32; |
| // If the replicated value comes from an insert into the same lane, splat |
| // the originally inserted value directly. |
| if (N->getOperand(0).getOpcode() == ISD::INSERT_VECTOR_ELT) { |
| SDNode *InsertElt = cast<SDNode>(N->getOperand(0)); |
| const ConstantSDNode *Idx = |
| cast<ConstantSDNode>(InsertElt->getOperand(2).getNode()); |
| if (Idx->getConstantIntValue()->getZExtValue() == Offset) { |
| SDValue ExtractedVal = InsertElt->getOperand(1); |
| MVT ExtractedSplatVT = NeedsTrunc ? VMNI1 : VecVT; |
| return DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), ExtractedSplatVT, |
| ExtractedVal); |
| } |
| } |
| if (ST->hasBroadcast()) { |
| // SparseCore has a vector broadcast that broadcasts the element at Offset |
| // into all lanes of a vector without traversing the scalar side. |
| return TruncateReturnIfNeed( |
| DCI.DAG.getNode(TPUISD::VBROADCAST, SDLoc(N), VecVT, Val, |
| DCI.DAG.getConstant(Offset, SDLoc(N), MVT::i32))); |
| } |
| // Extract the splatted value from the vector and re-splat it. |
| // Rotate the vector if the offset is not zero. |
| if (Offset != 0) { |
| Val = DCI.DAG.getNode(TPUISD::VROTDOWN, SDLoc(N), VecVT, Val, |
| DCI.DAG.getConstant(Offset, SDLoc(N), MVT::i32)); |
| } |
| Val = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ScalarVT, Val, |
| DCI.DAG.getConstant(0, SDLoc(N), MVT::i32)); |
| Val = DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, Val); |
| return TruncateReturnIfNeed(Val); |
| } |
| |
| SDValue TPUTargetLowering::PerformVSELECTCombine( |
| SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { |
| if (N->getValueType(0) != VMNI1) |
| return SDValue(); |
| SDValue Cond = N->getOperand(0); |
| SDValue Op1 = N->getOperand(1); |
| SDValue Op2 = N->getOperand(2); |
| if (Op1.getOpcode() == TPUISD::SPLAT && Op2.getOpcode() == TPUISD::SPLAT && |
| isa<ConstantSDNode>(Op1->getOperand(0)) && |
| isa<ConstantSDNode>(Op2->getOperand(0))) { |
| bool TrueVal = cast<ConstantSDNode>(Op1->getOperand(0))->getLimitedValue(); |
| bool FalseVal = cast<ConstantSDNode>(Op2->getOperand(0))->getLimitedValue(); |
| |
| if (TrueVal == FalseVal) |
| // select(C, X, X) -> X |
| return Op1; |
| if (TrueVal == true && FalseVal == false) |
| // select(C, 1, 0) -> C |
| return Cond; |
| assert(TrueVal == false && FalseVal == true); |
| // select(C, 0, 1) -> !C === C xor -1 |
| return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VMNI1, Cond, Op2); |
| } |
| |
| // select(C, X, Y) -> (C & X) | (~C & Y) |
| SDValue CAndX = DCI.DAG.getNode(ISD::AND, SDLoc(N), VMNI1, Cond, Op1); |
| SDValue NotC = DCI.DAG.getNode( |
| ISD::XOR, SDLoc(N), VMNI1, Cond, |
| DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VMNI1, |
| DCI.DAG.getConstant(-1, SDLoc(N), MVT::i1))); |
| SDValue NotCAndY = DCI.DAG.getNode(ISD::AND, SDLoc(N), VMNI1, NotC, Op2); |
| return DCI.DAG.getNode(ISD::OR, SDLoc(N), VMNI1, CAndX, NotCAndY); |
| } |
| |
| SDValue TPUTargetLowering::PerformBcInsertValueCombine( |
| SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { |
| // Combine llvm.tpu.bc.insertvalue.loopindex -> BC_INSERTVALUE. |
| // The intrinsic takes an array and returns an array. This is lowered to |
| // %a = merge_values a0,a1,a2,...,an-1 |
| // %b1,b2,...,bn-1 = @llvm.tpu.bc.insertvalue.loopindex %a, %c |
| // |
| // We don't care about the values of any physical registers. We've already |
| // reserved a block of registers for this aggregate, all we need to do is |
| // keep the zeroth register to plumb through as the base value. |
| // |
| // Here we replace the intrinsic with a BC_INSERTVALUE of the base register |
| // and a MERGE_VALUES result, with the base register in value 0 and the rest |
| // UNDEF. The optimizer will then clean things up. |
| |
| SDLoc DL(N); |
| SDValue BaseReg = N->getOperand(1); |
| SDValue InsertedValue = N->getOperand(2); |
| EVT VT = BaseReg.getValueType(); |
| SDValue NewN = |
| DCI.DAG.getNode(TPUISD::BC_INSERTVALUE, DL, VT, BaseReg, InsertedValue); |
| SmallVector<SDValue, 4> Vs(N->getNumValues(), DCI.DAG.getUNDEF(VT)); |
| Vs[0] = NewN; |
| return DCI.DAG.getMergeValues(Vs, DL); |
| } |
| |
| SDValue TPUTargetLowering::PerformBcExtractValueCombine( |
| SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { |
| // Combine llvm.tpu.bc.extractvalue.loopindex -> BC_EXTRACTVALUE. |
| // The intrinsic takes an array and returns a vector. This is lowered to |
| // %a = merge_values a0,a1,a2,...,an-1 |
| // %b:v8f32 = @llvm.tpu.bc.extractvalue.loopindex %a |
| // |
| // We don't care about the values of any physical registers. We've already |
| // reserved a block of registers for this aggregate; all we need to do is |
| // keep the zeroth register to plumb through as the base value. |
| // |
| // We're already accessing MERGE_VALUES:0, so just rewrite in place. |
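| // Operand 0 of the intrinsic node is the intrinsic ID; operand 1 is value 0 |
| // of the MERGE_VALUES, i.e. the base register of the reserved block. |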
| |
| SDLoc DL(N); |
| SDValue BaseReg = N->getOperand(1); |
| EVT VT = BaseReg.getValueType(); |
| return DCI.DAG.getNode(TPUISD::BC_EXTRACTVALUE, DL, VT, BaseReg); |
| } |
| |
| SDValue TPUTargetLowering::PerformPtrToIntCombine(SDNode *N) const { |
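| // Pointers and integers share the same representation on TPU, so |
| // llvm.tpu.inttoptr / llvm.tpu.ptrtoint are no-ops at this level; simply |
| // forward the value operand (operand 0 is the intrinsic ID). |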
| return N->getOperand(1); |
| } |
| |
| const char *TPUTargetLowering::getTargetNodeName(unsigned Opcode) const { |
| switch (Opcode) { |
| default: |
| return "<TPU unknown opcode>"; |
| case TPUISD::HALT: |
| return "TPUISD::HALT"; |
| case TPUISD::VROTDOWN: |
| return "TPUISD::VROTDOWN"; |
| case TPUISD::VBROADCAST: |
| return "TPUISD::VBROADCAST"; |
| case TPUISD::VPERMUTE: |
| return "TPUISD::VPERMUTE"; |
| case TPUISD::SPLAT: |
| return "TPUISD::SPLAT"; |
| case TPUISD::WRAPPER: |
| return "TPUISD::WRAPPER"; |
| case TPUISD::BC_INSERTVALUE: |
| return "TPUISD::BC_INSERTVALUE"; |
| case TPUISD::BC_EXTRACTVALUE: |
| return "TPUISD::BC_EXTRACTVALUE"; |
| case TPUISD::UMUL24: |
| return "TPUISD::UMUL24"; |
| case TPUISD::CALL: |
| return "TPUISD::CALL"; |
| } |
| } |
| |
| SDValue TPUTargetLowering::PerformDAGCombine(SDNode *N, |
| DAGCombinerInfo &DCI) const { |
| switch (N->getOpcode()) { |
| case ISD::BUILD_VECTOR: |
| return PerformBUILD_VECTORCombine(N, DCI); |
| case ISD::VECTOR_SHUFFLE: |
| return PerformVECTOR_SHUFFLECombine(cast<ShuffleVectorSDNode>(N), DCI); |
| case ISD::INSERT_VECTOR_ELT: |
| return PerformINSERT_VECTOR_ELTCombine(N, DCI); |
| case ISD::SCALAR_TO_VECTOR: |
| return PerformSCALAR_TO_VECTORCombine(N, DCI); |
| case ISD::VSELECT: |
| return PerformVSELECTCombine(N, DCI); |
| case ISD::INTRINSIC_WO_CHAIN: |
| switch (N->getConstantOperandVal(0)) { |
| default: |
| return SDValue(); |
| case Intrinsic::tpu_bc_insertvalue_loopindex: |
| return PerformBcInsertValueCombine(N, DCI); |
| case Intrinsic::tpu_bc_extractvalue_loopindex: |
| return PerformBcExtractValueCombine(N, DCI); |
| case Intrinsic::tpu_inttoptr: |
| case Intrinsic::tpu_ptrtoint: |
| return PerformPtrToIntCombine(N); |
| } |
| case ISD::SETCC: |
| return PerformSETCCCombine(N, DCI); |
| default: |
| break; |
| } |
| |
| return SDValue(); |
| } |
| |
| std::optional<bool> |
| TPUTargetLowering::IsFifoAccess(MachineInstr &MI, |
| const TargetRegisterClass *RegClass) const { |
| const MCInstrDesc &MCID = TII->get(MI.getOpcode()); |
| for (auto I = MCID.opInfo_begin(), IE = MCID.opInfo_end(); I != IE; I++) { |
| if (I->RegClass == RegClass->getID()) { |
| // For a push instruction the destination register (the first operand info) |
| // matches the given reg class; for a pop instruction one of the source |
| // operands matches it. Return true for pops and false for pushes. |
| return I != MCID.opInfo_begin(); |
| } |
| } |
| return std::nullopt; |
| } |
| |
| bool TPUTargetLowering::UsesSpecialReg( |
| MachineInstr &MI, const TargetRegisterClass *RegClass) const { |
| const MCInstrDesc &MCID = TII->get(MI.getOpcode()); |
| for (auto I = MCID.opInfo_begin(), IE = MCID.opInfo_end(); I != IE; I++) { |
| if (I->RegClass == RegClass->getID()) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| Register TPUTargetLowering::getSupportedEmbeddedMask(SDNode *N) const { |
| if (!ST->hasEmbeddedMasks() || !PropagateTpuEmbeddedMasks) |
| return TPU::NoRegister; |
| assert(N->getOpcode() == ISD::BUILD_VECTOR); |
| // See e.g. go/vfc-sc-isa#vector-modify-mask-instructions. |
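| // For example, a BUILD_VECTOR of i1 constants whose three low lanes are 1 |
| // and whose remaining lanes are 0 matches bitmask 0x7 and can be read |
| // directly from M21. |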
| DenseMap<int, Register> SupportedEmbeddedMasks{ |
| {0xff, TPU::M16}, {0x7f, TPU::M17}, {0x3f, TPU::M18}, {0x1f, TPU::M19}, |
| {0xf, TPU::M20}, {0x7, TPU::M21}, {0x3, TPU::M22}, {0x1, TPU::M23}, |
| }; |
| int MaskSizeInBits = EVT(VMNI1).getSizeInBits(); |
| if (N->getNumOperands() != MaskSizeInBits) |
| return TPU::NoRegister; |
| auto MatchesBitMask = [MaskSizeInBits, N](int BitMask) { |
| for (int i = 0; i < MaskSizeInBits; i++) { |
| ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i)); |
| if (C == nullptr) |
| return false; |
| if (C->getZExtValue() != ((BitMask >> i) & 0x1)) |
| return false; |
| } |
| return true; |
| }; |
| for (auto &KV : SupportedEmbeddedMasks) { |
| if (MatchesBitMask(KV.first)) |
| return KV.second; |
| } |
| return TPU::NoRegister; |
| } |
| |
| void TPUTargetLowering::SetDependency(MachineInstr &MI, MachineBasicBlock *MBB, |
| const TargetRegisterClass *RegClass, |
| bool IsPush) const { |
| const TPUTargetMachine &TM = |
| static_cast<const TPUTargetMachine &>(MBB->getParent()->getTarget()); |
| MachinePointerInfo MPI(TM.getFifoPSV(IsPush, RegClass)); |
| auto *MemRef = MBB->getParent()->getMachineMemOperand( |
| MPI, MachineMemOperand::MOLoad | MachineMemOperand::MOStore, /*s=*/4, |
| /*base_alignment=*/llvm::Align(4)); |
| MI.addMemOperand(*MBB->getParent(), MemRef); |
| } |
| |
| // A DWG needs dependencies with all matmuls. |
| // The first matmul after a DWG needs dependencies with all matpushes. |
| // A DWG itself can be re-ordered across matpush instructions. |
| // This function adds the memory operands that enforce this ordering. |
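| // Schematically (illustrative only): |
| //   dwg    ...  ; gets a push dependency on the MRF fifo, so it cannot move |
| //               ; across any matmul |
| //   matmul ...  ; the first matmul gets a dependency on the gsfn fifo, so |
| //               ; matpush instructions cannot move across it |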
| MachineBasicBlock *TPUTargetLowering::SetDWGDep(MachineInstr &MI, |
| MachineBasicBlock *MBB) const { |
| MachineRegisterInfo &RegInfo = MBB->getParent()->getRegInfo(); |
| SmallDenseSet<MachineInstr *, 32> DWGUses; |
| Register Dst = MI.getOperand(0).getReg(); |
| for (MachineInstr &MIUser : RegInfo.use_instructions(Dst)) { |
| assert(MIUser.getParent() == MBB && |
| "matmul uses DWG from a different block; this case is currently not " |
| "supported"); |
| DWGUses.insert(&MIUser); |
| } |
| if (DWGUses.empty()) |
| return MBB; |
| auto E = MBB->end(); |
| MachineInstr *FirstMatMul = nullptr; |
| for (auto I = MI.getIterator(); I != E; I++) { |
| if (DWGUses.count(&(*I)) > 0) { |
| FirstMatMul = &(*I); |
| break; |
| } |
| } |
| assert(FirstMatMul != nullptr && "didn't find any matmul"); |
| // The first MatMul needs to have an explicit dependency with gsfn as it |
| // triggers the copy from gsfn/gsft to gmr. This means the following push |
| // cannot be re-ordered across the first matmul. |
| const TargetRegisterClass *GSFNRegClass = |
| RegInfo.getRegClass(MI.getOperand(1).getReg()); |
| SetDependency(*FirstMatMul, MBB, GSFNRegClass); |
| // DWG cannot be re-ordered across any matmul instruction so add a dependency |
| // to push MRF to represent that. |
| const TargetRegisterClass *MRFRegClass = |
| RegInfo.getRegClass(FirstMatMul->getOperand(0).getReg()); |
| SetDependency(MI, MBB, MRFRegClass, /*IsPush=*/true); |
| return MBB; |
| } |
| |
| MachineBasicBlock * |
| TPUTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, |
| MachineBasicBlock *MBB) const { |
| // Generic handling of instructions that need dependencies to be set. |
| if (static_cast<const TPUInstrInfo *>(TII)->isDWGInst(MI)) { |
| return SetDWGDep(MI, MBB); |
| } |
| bool IsSpecialRegAccess = false; |
| for (auto Fifo : FifoClasses) { |
| if (auto IsPop = IsFifoAccess(MI, Fifo)) { |
| SetDependency(MI, MBB, Fifo, !*IsPop); |
| IsSpecialRegAccess = true; |
| } |
| } |
| for (auto ImplicitReg : SpecialStagingReg) { |
| if (UsesSpecialReg(MI, ImplicitReg)) { |
| SetDependency(MI, MBB, ImplicitReg); |
| IsSpecialRegAccess = true; |
| } |
| } |
| // Instructions with special register accesses only need to be modified to |
| // have an extra pseudo source. |
| if (IsSpecialRegAccess) |
| return MBB; |
| |
| auto &ST = MI.getMF()->getSubtarget<TPUSubtarget>(); |
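| // Reads that come back through a fifo are popped from the V2SF register |
| // class by default, or from SFRF on VFC tensor cores. |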
| unsigned PopOpcode = TPU::SPOP_V2SF; |
| const TargetRegisterClass *RegClass = &TPU::V2SFPRRegClass; |
| if (ST.hasVfcTensorCore()) { |
| PopOpcode = TPU::SPOP_SFRF; |
| RegClass = &TPU::SFRFPRRegClass; |
| } |
| |
| switch (MI.getOpcode()) { |
| default: |
| llvm_unreachable("Unknown instruction for custom emission!"); |
| case TPU::VROTDOWNri: |
| return EmitVROTDOWN(MI, MBB); |
| case TPU::VFREADi: |
| return EmitVecOrSFlagToScalar( |
| MI, MBB, ST.hasVfcTensorCore() ? TPU::tcvfVSYNCMOVEi : TPU::VSYNCMOVEi, |
| 1, PopOpcode, RegClass); |
| case TPU::VFREADr: |
| return EmitVecOrSFlagToScalar( |
| MI, MBB, ST.hasVfcTensorCore() ? TPU::tcvfVSYNCMOVEr : TPU::VSYNCMOVEr, |
| 1, PopOpcode, RegClass); |
| case TPU::VFREADDONEi: |
| return EmitVecOrSFlagToScalar( |
| MI, MBB, |
| ST.hasVfcTensorCore() ? TPU::tcvfVSYNCMOVEDONEi : TPU::VSYNCMOVEDONEi, |
| 1, PopOpcode, RegClass); |
| case TPU::VFREADDONEr: |
| return EmitVecOrSFlagToScalar( |
| MI, MBB, |
| ST.hasVfcTensorCore() ? TPU::tcvfVSYNCMOVEDONEr : TPU::VSYNCMOVEDONEr, |
| 1, PopOpcode, RegClass); |
| case TPU::VFREADPAi: |
| return EmitVecOrSFlagToScalar(MI, MBB, TPU::tcvfVSYNCMOVEPAi, 1, PopOpcode, |
| RegClass); |
| case TPU::VFREADPAr: |
| return EmitVecOrSFlagToScalar(MI, MBB, TPU::tcvfVSYNCMOVEPAr, 1, PopOpcode, |
| RegClass); |
| case TPU::VREAD: |
| assert(!IsSC); |
| return EmitVecOrSFlagToScalar(MI, MBB, TPU::VPUSH, 1, TPU::SPOP_V2SF, |
| &TPU::V2SFPRRegClass); |
| case TPU::scVREADi: |
| assert(IsSC); |
| return EmitVecOrSFlagToScalar(MI, MBB, TPU::scVPUSHi, 2, PopOpcode, |
| RegClass); |
| case TPU::scVREADr: |
| assert(IsSC); |
| return EmitVecOrSFlagToScalar(MI, MBB, TPU::scVPUSHr, 2, PopOpcode, |
| RegClass); |
| case TPU::VMREAD: |
| return EmitVmread(MI, MBB); |
| } |
| } |
| |
| MachineBasicBlock *TPUTargetLowering::EmitVecOrSFlagToScalar( |
| MachineInstr &MI, MachineBasicBlock *MBB, unsigned PushOpcode, |
| int NumOfInputs, unsigned PopOpcode, |
| const TargetRegisterClass *RegClass) const { |
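| // Expand a vector/sflag read into a push onto the fifo (PushOpcode) followed |
| // by a pop (PopOpcode) into the scalar destination. Both instructions get a |
| // memory operand on the fifo pseudo-source value so later passes keep them |
| // ordered with respect to other fifo accesses. |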
| auto &MRI = MBB->getParent()->getRegInfo(); |
| auto InsertPt = MI.getIterator(); |
| |
| const unsigned FifoReg = MRI.createVirtualRegister(RegClass); |
| MachineInstrBuilder MIB = |
| BuildMI(*MBB, InsertPt, MI.getDebugLoc(), TII->get(PushOpcode), FifoReg); |
| for (int i = 1; i <= NumOfInputs; i++) |
| MIB.add(MI.getOperand(i)); |
| MachineInstr *Push = AddDefaultPred(MIB); |
| MachineInstr *Pop = |
| AddDefaultPred(BuildMI(*MBB, InsertPt, MI.getDebugLoc(), |
| TII->get(PopOpcode), MI.getOperand(0).getReg()) |
| .addReg(FifoReg, getKillRegState(true))); |
| MI.eraseFromParent(); |
| |
| for (auto &I : {Push, Pop}) { |
| const TPUTargetMachine &TM = |
| static_cast<const TPUTargetMachine &>(MBB->getParent()->getTarget()); |
| MachinePointerInfo MPI(TM.getFifoPSV(I == Push, RegClass)); |
| auto *MemRef = MBB->getParent()->getMachineMemOperand( |
| MPI, MachineMemOperand::MOLoad | MachineMemOperand::MOStore, /*s=*/4, |
| /*base_alignment=*/llvm::Align(4)); |
| I->addMemOperand(*MBB->getParent(), MemRef); |
| } |
| return MBB; |
| } |
| |
| MachineBasicBlock *TPUTargetLowering::EmitVmread(MachineInstr &MI, |
| MachineBasicBlock *MBB) const { |
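| // Expand VMREAD: materialize the mask operand as a vector of 1s and 0s by |
| // selecting the immediate 1 against a zero vector under the mask. |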
| auto &MRI = MBB->getParent()->getRegInfo(); |
| auto InsertPt = MI.getIterator(); |
| |
| unsigned ZeroReg = MRI.createVirtualRegister(&TPU::VPRRegClass); |
| AddDefaultPred(BuildMI(*MBB, InsertPt, MI.getDebugLoc(), |
| TII->get(TPU::VIMMI), ZeroReg) |
| .addImm(0)); |
| AddDefaultPred(BuildMI(*MBB, InsertPt, MI.getDebugLoc(), |
| TII->get(TPU::VSELir), |
| MI.getOperand(0).getReg()) |
| .add(MI.getOperand(1)) |
| .addImm(1) |
| .addReg(ZeroReg)); |
| MI.eraseFromParent(); |
| return MBB; |
| } |
| |
| MachineBasicBlock * |
| TPUTargetLowering::EmitVROTDOWN(MachineInstr &MI, |
| MachineBasicBlock *MBB) const { |
| // Emit VROTDOWNri as a sequence of N VROTDOWNr's. |
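| // For example, a rotate-down by 3 becomes three chained VROTDOWNr |
| // instructions, and a rotate by 0 degenerates into a plain COPY. |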
| auto &MRI = MBB->getParent()->getRegInfo(); |
| |
| unsigned Imm = MI.getOperand(2).getImm(); |
| auto OpReg = MI.getOperand(1).getReg(); |
| auto FinalReg = MI.getOperand(0).getReg(); |
| auto InsertPt = MI.getIterator(); |
| if (Imm == 0) { |
| BuildMI(*MBB, InsertPt, MI.getDebugLoc(), TII->get(TPU::COPY), FinalReg) |
| .addReg(OpReg); |
| MI.eraseFromParent(); |
| return MBB; |
| } |
| |
| // TODO(hgreving): Sparsecore and Viperfish should be able to use |
| // one xlane instruction. |
| MachineInstr *TheMI = &MI; |
| for (unsigned I = 0; I < Imm; ++I) { |
| unsigned OutReg = (I == (Imm - 1)) |
| ? FinalReg |
| : MRI.createVirtualRegister(&TPU::VPRRegClass); |
| TheMI = AddDefaultPred( |
| BuildMI(*MBB, InsertPt, MI.getDebugLoc(), |
| TII->get(TPU::VROTDOWNr), OutReg) |
| .addReg(OpReg, getKillRegState(true))); |
| OpReg = OutReg; |
| } |
| MI.eraseFromParent(); |
| |
| return MBB; |
| } |
| |
| bool TPUTargetLowering::allowsMemoryAccess(LLVMContext &Context, |
| const DataLayout &DL, EVT VT, |
| unsigned AddrSpace, Align Alignment, |
| MachineMemOperand::Flags Flags, |
| unsigned *Fast) const { |
| // Disallow load/store we don't support natively. |
| if (VT != MVT::i32 && VT != MVT::f32 && VT != VNF32 && VT != VNI32) |
| return false; |
| bool Allows = TargetLowering::allowsMemoryAccess(Context, DL, VT, AddrSpace, |
| Alignment, Flags, Fast); |
| if (Allows) |
| *Fast = 1; |
| return Allows; |
| } |
| |
| bool TPUTargetLowering::allowsMisalignedMemoryAccesses( |
| EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, |
| unsigned *Fast) const { |
| // No memory access on TPU requires alignment > 4 bytes. |
| return Alignment >= Align(4); |
| } |
| |
| bool TPUTargetLowering::allowsMisalignedMemoryAccesses( |
| LLT LT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, |
| unsigned *Fast) const { |
| // No memory access on TPU requires alignment > 4 bytes. |
| return Alignment >= Align(4); |
| } |
| |
| void TPUTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, |
| SDNode *Node) const { |
| MachineBasicBlock *MBB = MI.getParent(); |
| const TPUTargetMachine &TM = |
| static_cast<const TPUTargetMachine &>(MBB->getParent()->getTarget()); |
| if (MI.getOpcode() == TPU::INIT_STACK) { |
| // Move stack initialization to the very top of the function. |
| assert(ST->isTPUABIEnabled()); |
| MI.setFlags(MachineInstr::FrameSetup); |
| MI.moveBefore(&*MBB->instr_begin()); |
| return; |
| } |
| if (MI.getOpcode() == TPU::bcVST_concat || |
| MI.getOpcode() == TPU::bcVST_concat_aliaddr) { |
| MachinePointerInfo MPI( |
| TM.getPSV(TPUTargetMachine::PSV_BarnaCore_ConcatReg)); |
| auto *MemRef = MBB->getParent()->getMachineMemOperand( |
| MPI, MachineMemOperand::MOStore, /*s=*/4, /*base_alignment=*/llvm::Align(4)); |
| MI.addMemOperand(*MBB->getParent(), MemRef); |
| return; |
| } |
| if (MI.getOpcode() == TPU::bcVSHIFT || |
| MI.getOpcode() == TPU::bcVSHIFT_aliaddr) { |
| { |
| MachinePointerInfo MPI( |
| TM.getPSV(TPUTargetMachine::PSV_BarnaCore_ShiftReg)); |
| auto *MemRef = MBB->getParent()->getMachineMemOperand( |
| MPI, MachineMemOperand::MOStore, /*s=*/4, /*base_alignment=*/llvm::Align(4)); |
| MI.addMemOperand(*MBB->getParent(), MemRef); |
| } |
| { |
| MachinePointerInfo MPI( |
| TM.getPSV(TPUTargetMachine::PSV_BarnaCore_ConcatReg)); |
| auto *MemRef = MBB->getParent()->getMachineMemOperand( |
| MPI, MachineMemOperand::MOLoad, /*s=*/4, /*base_alignment=*/llvm::Align(4)); |
| MI.addMemOperand(*MBB->getParent(), MemRef); |
| } |
| return; |
| } |
| // We rely on the brcond ordering convention to match bcLOOP_END correctly. |
| // Ensure we actually matched correctly here: bcLOOP_END should point back |
| // to its own block (only single-block loops are allowed). |
| assert(MI.getOpcode() == TPU::bcLOOP_END); |
| assert(MI.getOperand(0).getMBB() == MI.getParent() && |
| "bcLOOP_END does not point to its parent!"); |
| MI.getParent()->setMachineBlockAddressTaken(); |
| } |
| |
| // Custom version of CCInfo.AnalyzeCallOperands that supports separate scalar |
| // and vector stacks. It rewrites the memory offsets in ArgLocs so that they |
| // are relative to the appropriate stack, and returns the scalar and vector |
| // stack sizes used by the call parameters, as well as the extra bytes used |
| // to align masks on the vector stack. |
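| // For example (illustrative): a scalar i32 argument assigned to memory lands |
| // on the scalar stack and bumps NumBytesScalar, while a vector argument |
| // lands on the vector stack and bumps NumBytesVector; mask arguments are |
| // padded up to the vector size and the padding is reported in |
| // ExtraAlignBytesVector. |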
| static void analyzeCallOperands(const TPUTargetLowering &TLI, |
| const TPUSubtarget *ST, |
| const TargetLowering::CallLoweringInfo &CLI, |
| CCState &CCInfo, |
| SmallVector<CCValAssign, 16> &ArgLocs, |
| int &NumBytesScalar, int &NumBytesVector, |
| int &ExtraAlignBytesVector) { |
| const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; |
| unsigned NumOps = Outs.size(); |
| for (unsigned i = 0; i != NumOps; ++i) { |
| MVT ArgVT = Outs[i].VT; |
| ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; |
| int PrevNumBytes = CCInfo.getNextStackOffset(); |
| if (CC_TPU(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo)) { |
| #ifndef NDEBUG |
| dbgs() << "Call operand #" << i << " has unhandled type " |
| << EVT(ArgVT).getEVTString() << '\n'; |
| #endif |
| llvm_unreachable(nullptr); |
| } |
| if (!ST->isTPUABIEnabled() || CCInfo.getCallingConv() == CallingConv::Fast) |
| continue; |
| assert(!ArgLocs[i].isMemLoc() || |
| PrevNumBytes == ArgLocs[i].getLocMemOffset()); |
| CCValAssign &CCV = ArgLocs[i]; |
| if (int StackOffsetDelta = CCInfo.getNextStackOffset() - PrevNumBytes) { |
| if (ArgVT.isVector()) { |
| assert(ST->hasVPU()); |
| // This is a trick using the API in order to adjust the LocMemOffset, |
| // because we have two separate stacks for scalar and vector. |
| if (isMaskVT(ArgVT, *ST)) { |
| int AlignedStackOffsetDelta = |
| alignTo(StackOffsetDelta, ST->vectorSizeInBytes()); |
| ExtraAlignBytesVector += AlignedStackOffsetDelta - StackOffsetDelta; |
| StackOffsetDelta = AlignedStackOffsetDelta; |
| } |
| assert(StackOffsetDelta == ST->vectorSizeInBytes()); |
| CCV.convertToMem(NumBytesVector); |
| NumBytesVector += StackOffsetDelta; |
| } else { |
| assert(StackOffsetDelta == ST->scalarSizeInBytes()); |
| // Same comment as above. |
| CCV.convertToMem(NumBytesScalar); |
| NumBytesScalar += StackOffsetDelta; |
| } |
| } |
| } |
| assert(CCInfo.getCallingConv() == CallingConv::Fast || |
| ArgLocs.size() == NumOps); |
| } |
| |
| SDValue TPUTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, |
| SmallVectorImpl<SDValue> &InVals) const { |
| SelectionDAG &DAG = CLI.DAG; |
| SDLoc &DL = CLI.DL; |
| SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; |
| SDValue Chain = CLI.Chain; |
| SDValue Callee = CLI.Callee; |
| bool &IsTailCall = CLI.IsTailCall; |
| CallingConv::ID CallConv = CLI.CallConv; |
| bool IsVarArg = CLI.IsVarArg; |
| // Not supported. |
| assert(!IsVarArg); |
| // FIXME(b/237788792): Support return values. |
| assert(CLI.RetTy->isVoidTy() && |
| "Return values should be passed by reference"); |
| // No support for tail calls right now. |
| IsTailCall = false; |
| |
| // Analyze operands of the call, assigning locations to each operand. |
| SmallVector<CCValAssign, 16> ArgLocs; |
| CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, |
| *DAG.getContext()); |
| GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); |
| // How many bytes are to be pushed on the scalar stack. |
| int NumBytesScalar = 0; |
| // How many bytes are to be pushed on the vector stack. |
| int NumBytesVector = 0; |
| // Extra bytes added for vector memory alignment, used for masks. |
| int ExtraAlignBytesVector = 0; |
| analyzeCallOperands(*this, ST, CLI, CCInfo, ArgLocs, NumBytesScalar, |
| NumBytesVector, ExtraAlignBytesVector); |
| assert(NumBytesScalar + NumBytesVector - ExtraAlignBytesVector == |
| CCInfo.getNextStackOffset()); |
| |
| Chain = DAG.getCALLSEQ_START(Chain, NumBytesScalar, NumBytesVector, DL); |
| |
| SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass; |
| SmallVector<SDValue, 12> MemOpChains; |
| |
| // Walk the register assignments, inserting copies. |
| for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) { |
| CCValAssign &VA = ArgLocs[I]; |
| assert(VA.getValVT() == VA.getLocVT()); |
| SDValue Arg = OutVals[I]; |
| if (VA.isRegLoc()) { |
| // Promote the value if needed. |
| switch (VA.getLocInfo()) { |
| case CCValAssign::Full: |
| break; |
| case CCValAssign::SExt: |
| Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); |
| break; |
| case CCValAssign::ZExt: |
| Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); |
| break; |
| case CCValAssign::AExt: |
| Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); |
| break; |
| default: |
| llvm_unreachable("Unknown loc info!"); |
| } |
| |
| // Arguments that are passed in registers are collected in the RegsToPass |
| // vector. |
| RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); |
| } else { // VA.isRegLoc() |
| assert(VA.isMemLoc()); |
| assert(!VA.needsCustom()); |
| auto PtrVT = getPointerTy(DAG.getDataLayout()); |
| MachineFunction &MF = DAG.getMachineFunction(); |
| unsigned LocMemOffset = VA.getLocMemOffset(); |
| // In order to make it easier for the callee, the stack pointer in the |
| // caller is incremented such that it points to a free slot in the callee |
| // for the return address. Adjust the argument offsets here accordingly. |
| if (!VA.getValVT().isVector()) |
| LocMemOffset += ST->scalarSizeInBytes(); |
| else |
| assert(ST->hasVPU()); |
| unsigned AdjustedLocMemOffset = |
| TPU::adjustForWordSize( |
| APInt(32, LocMemOffset), |
| VA.getValVT().isVector() ? TPUAS_TileSpmem : TPUAS_Smem, *ST) |
| .getZExtValue(); |
| SDValue PtrOff = DAG.getIntPtrConstant(AdjustedLocMemOffset, DL); |
| // The address is stack-pointer (not frame-pointer) based, after the call |
| // stack adjustments. |
| SDValue DstAddr = DAG.getNode( |
| ISD::ADD, DL, PtrVT, |
| DAG.getRegister(VA.getValVT().isVector() ? TPU::SPV : TPU::SPS, |
| MVT::i32), |
| PtrOff); |
| MachinePointerInfo DstInfo = |
| VA.getValVT().isVector() |
| ? MachinePointerInfo(TPUAS_TileSpmem, LocMemOffset) |
| : MachinePointerInfo::getStack(MF, LocMemOffset); |
| SDValue Store; |
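| // Mask arguments are widened to full 32-bit lanes (all-ones / zero) before |
| // being stored, since the vector stack slot for a mask is vector sized. |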
| if (isMaskVT(VA.getValVT(), *ST)) { |
| SDValue Select = |
| DAG.getNode(ISD::VSELECT, DL, VNI32, Arg, |
| DAG.getNode(TPUISD::SPLAT, DL, VNI32, |
| DAG.getConstant(0xFFFFFFFF, DL, MVT::i32)), |
| DAG.getNode(TPUISD::SPLAT, DL, VNI32, |
| DAG.getConstant(0, DL, MVT::i32))); |
| Store = DAG.getStore(Chain, DL, Select, DstAddr, DstInfo); |
| } else { |
| Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo); |
| } |
| MemOpChains.push_back(Store); |
| } |
| } |
| |
| if (!MemOpChains.empty()) |
| Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); |
| |
| SDValue InFlag; |
| |
| // Build a sequence of copy-to-reg nodes chained together with token chain and |
| // flag operands which copy the outgoing args into registers. The InFlag is |
| // necessary since all emitted instructions must be stuck together. |
| for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) { |
| Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[I].first, |
| RegsToPass[I].second, InFlag); |
| InFlag = Chain.getValue(1); |
| } |
| |
| // If the callee is a GlobalAddress node (quite common, every direct call |
| // is), turn it into a TargetGlobalAddress node so that legalize doesn't |
| // hack it. Only direct calls to global addresses are supported here. |
| assert(G && "expected a direct call to a global address"); |
| Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, |
| getPointerTy(DAG.getDataLayout()), 0); |
| Callee = DAG.getNode(TPUISD::WRAPPER, DL, MVT::i32, Callee); |
| |
| // Functions always return void. |
| SDVTList NodeTys = DAG.getVTList(MVT::isVoid, MVT::Glue); |
| SmallVector<SDValue, 8> Ops; |
| Ops.push_back(Chain); |
| Ops.push_back(Callee); |
| |
| // Add a register mask operand representing the call-preserved registers. |
| const uint32_t *Mask = |
| TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv); |
| assert(Mask && "Missing call preserved mask for calling convention"); |
| Ops.push_back(DAG.getRegisterMask(Mask)); |
| |
| // Add argument registers to the end of the list so that they are |
| // known live into the call. |
| for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) |
| Ops.push_back(DAG.getRegister(RegsToPass[I].first, |
| RegsToPass[I].second.getValueType())); |
| |
| if (InFlag.getNode()) |
| Ops.push_back(InFlag); |
| |
| Chain = DAG.getNode(CallConv == CallingConv::Fast ? TPUISD::CALL_FAST |
| : TPUISD::CALL, |
| DL, NodeTys, ArrayRef<SDValue>(&Ops[0], Ops.size())); |
| InFlag = Chain.getValue(1); |
| |
| // Create the CALLSEQ_END node. |
| Chain = DAG.getCALLSEQ_END( |
| Chain, |
| DAG.getConstant(NumBytesScalar, DL, getPointerTy(DAG.getDataLayout()), |
| true), |
| DAG.getConstant(NumBytesVector, DL, getPointerTy(DAG.getDataLayout()), |
| true), |
| InFlag, DL); |
| InFlag = Chain.getValue(1); |
| return Chain; |
| } |
| |
| bool TPUTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, |
| const CallInst &I, |
| MachineFunction &MF, |
| unsigned Intrinsic) const { |
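| // Describe which TPU intrinsics touch memory and how, so that the DAG |
| // builder attaches the right MachineMemOperand (value type, pointer, and |
| // load/store flags) to the resulting node. |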
| const TPUTargetMachine &TM = |
| static_cast<const TPUTargetMachine &>(MF.getTarget()); |
| switch (Intrinsic) { |
| case Intrinsic::tpu_syncadd: |
| case Intrinsic::tpu_syncadd_done: |
| case Intrinsic::tpu_syncadd_notdone: |
| case Intrinsic::tpu_syncadd_remote: |
| case Intrinsic::tpu_syncadd_remote_done: |
| case Intrinsic::tpu_syncadd_remote_doneinv: |
| case Intrinsic::tpu_syncadd_tile: |
| case Intrinsic::tpu_syncset_done: |
| case Intrinsic::tpu_syncset_notdone: |
| case Intrinsic::tpu_syncset_remote: |
| case Intrinsic::tpu_syncset_remote_doneinv: |
| case Intrinsic::tpu_syncdonemov: |
| Info.opc = (Intrinsic == Intrinsic::tpu_syncdonemov) |
| ? ISD::INTRINSIC_W_CHAIN |
| : ISD::INTRINSIC_VOID; |
| Info.memVT = MVT::i32; |
| Info.ptrVal = I.getOperand(0); |
| Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; |
| return true; |
| case Intrinsic::tpu_vld_shuffle: |
| case Intrinsic::tpu_vld_strided: |
| case Intrinsic::tpu_vld_indexed: |
| case Intrinsic::tpu_vld_replicate_evenodd_sublanes: |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(I.getCalledFunction()->getReturnType()); |
| Info.ptrVal = I.getOperand(0); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.flags = MachineMemOperand::MOLoad; |
| return true; |
| case Intrinsic::tpu_rdcbreg_smem_base: |
| case Intrinsic::tpu_rdcbreg_tilespmem_base: |
| case Intrinsic::tpu_rdcbreg_size: |
| case Intrinsic::tpu_rdcbreg_offset: |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| // FIXME(hgreving): re-visit memory operand strategy for this. The reason |
| // this reads memory at all is the cb.upd semantics, which are not |
| // modeled through register dependencies. |
| Info.memVT = MVT::getVT(I.getCalledFunction()->getArg(0)->getType()); |
| Info.ptrVal = nullptr; |
| Info.size = MemoryLocation::UnknownSize; |
| Info.flags = MachineMemOperand::MOLoad; |
| return true; |
| case Intrinsic::tpu_sld_cb: |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(I.getCalledFunction()->getArg(0)->getType()); |
| // FIXME(hgreving): re-visit memory operand strategy for this. We don't |
| // have a pointer and PSV values also don't work well here (upstream bug: |
| // can't set address space). |
| Info.ptrVal = nullptr; |
| Info.size = MemoryLocation::UnknownSize; |
| Info.flags = MachineMemOperand::MOLoad; |
| return true; |
| case Intrinsic::tpu_sld_cb_upd: |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(I.getCalledFunction()->getArg(0)->getType()); |
| // FIXME(hgreving): re-visit memory operand strategy for this. We don't |
| // have a pointer and PSV values also don't work well here (upstream bug: |
| // can't set address space). |
| Info.ptrVal = nullptr; |
| Info.size = MemoryLocation::UnknownSize; |
| // upd modeled as store. |
| Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; |
| return true; |
| case Intrinsic::tpu_vld_msk: |
| case Intrinsic::tpu_vld_msk_strided: |
| case Intrinsic::tpu_vld_msk_idx_strided: |
| case Intrinsic::tpu_vld_msk_idx: |
| case Intrinsic::tpu_vld_msk_idx_np: |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(I.getCalledFunction()->getReturnType()); |
| Info.ptrVal = I.getOperand(1); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.flags = MachineMemOperand::MOLoad; |
| return true; |
| case Intrinsic::tpu_vst_strided: |
| case Intrinsic::tpu_vst_indexed: |
| case Intrinsic::tpu_vst_evenodd_sublanes: |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = MVT::getVT(I.getOperand(0)->getType()); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.ptrVal = I.getOperand(1); |
| Info.flags = MachineMemOperand::MOStore; |
| return true; |
| case Intrinsic::tpu_sst_cb: |
| case Intrinsic::tpu_sst_cb_upd: |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = MVT::getVT(I.getOperand(0)->getType()); |
| Info.size = MemoryLocation::UnknownSize; |
| // FIXME(hgreving): re-visit memory operand strategy for this. We don't |
| // have a pointer and PSV values also don't work well here (upstream bug: |
| // can't set address space). |
| Info.ptrVal = nullptr; |
| Info.flags = MachineMemOperand::MOStore; |
| return true; |
| case Intrinsic::tpu_vst_msk_idx_add: |
| case Intrinsic::tpu_vst_msk_idx_add_np: |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = MVT::getVT(I.getOperand(3)->getType()); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.ptrVal = I.getOperand(1); |
| Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOLoad; |
| return true; |
| case Intrinsic::tpu_vst_msk_idx_ret_add_np: |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(I.getOperand(3)->getType()); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.ptrVal = I.getOperand(1); |
| Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOLoad; |
| return true; |
| case Intrinsic::tpu_vst_msk: |
| case Intrinsic::tpu_vst_msk_add: |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = MVT::getVT(I.getOperand(2)->getType()); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.ptrVal = I.getOperand(1); |
| Info.flags = MachineMemOperand::MOStore; |
| return true; |
| case Intrinsic::tpu_vst_cb_msk: |
| case Intrinsic::tpu_vst_cb_msk_add: |
| case Intrinsic::tpu_vst_cb_msk_add_strided: |
| case Intrinsic::tpu_vst_cb_msk_idx: |
| case Intrinsic::tpu_vst_cb_msk_idx_add: |
| case Intrinsic::tpu_vst_cb_msk_strided: |
| case Intrinsic::tpu_vst_cb_upd_msk: |
| case Intrinsic::tpu_vst_cb_upd_msk_add: |
| case Intrinsic::tpu_vst_cb_upd_msk_add_strided: |
| case Intrinsic::tpu_vst_cb_upd_msk_strided: |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = MVT::getVT(I.getOperand(3)->getType()); |
| Info.size = MemoryLocation::UnknownSize; |
| // FIXME(hgreving): re-visit memory operand strategy for this. We don't |
| // have a pointer and PSV values also don't work well here (upstream bug: |
| // can't set address space). |
| Info.ptrVal = nullptr; |
| Info.flags = MachineMemOperand::MOStore; |
| return true; |
| case Intrinsic::tpu_vld_cb_msk: |
| case Intrinsic::tpu_vld_cb_msk_idx: |
| case Intrinsic::tpu_vld_cb_msk_idx_np: |
| case Intrinsic::tpu_vld_cb_msk_strided: |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(I.getCalledFunction()->getReturnType()); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.ptrVal = nullptr; |
| Info.flags = MachineMemOperand::MOLoad; |
| return true; |
| case Intrinsic::tpu_vld_cb_upd_msk: |
| case Intrinsic::tpu_vld_cb_upd_msk_strided: |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(I.getCalledFunction()->getReturnType()); |
| Info.size = MemoryLocation::UnknownSize; |
| // FIXME(hgreving): re-visit memory operand strategy for this. We don't |
| // have a pointer and PSV values also don't work well here (upstream bug: |
| // can't set address space). |
| Info.ptrVal = nullptr; |
| // upd modeled as store |
| Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; |
| return true; |
| case Intrinsic::tpu_vst_msk_strided: |
| case Intrinsic::tpu_vst_msk_add_strided: |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = MVT::getVT(I.getOperand(3)->getType()); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.ptrVal = I.getOperand(1); |
| Info.flags = MachineMemOperand::MOStore; |
| return true; |
| case Intrinsic::tpu_vst_msk_idx: |
| case Intrinsic::tpu_vst_msk_idx_np: |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = MVT::getVT(I.getOperand(3)->getType()); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.ptrVal = I.getOperand(1); |
| Info.flags = MachineMemOperand::MOStore; |
| return true; |
| case Intrinsic::tpu_vst_msk_idx_strided: |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = MVT::getVT(I.getOperand(4)->getType()); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.ptrVal = I.getOperand(1); |
| Info.flags = MachineMemOperand::MOStore; |
| return true; |
| case Intrinsic::tpu_dma_hbm_to_smem: |
| case Intrinsic::tpu_dma_hbm_to_smem_sc_simple: |
| case Intrinsic::tpu_dma_hbm_to_vmem: |
| case Intrinsic::tpu_dma_hbm_to_spmem_sc_simple: |
| case Intrinsic::tpu_dma_hbm_to_hbm: |
| case Intrinsic::tpu_dma_hbm_to_hbm_sc_simple: |
| case Intrinsic::tpu_dma_hbm_to_hib: |
| case Intrinsic::tpu_dma_hbm_to_vmem_hib_update: |
| case Intrinsic::tpu_dma_smem_to_hbm: |
| case Intrinsic::tpu_dma_smem_to_hbm_sc_simple: |
| case Intrinsic::tpu_dma_vmem_to_hbm: |
| case Intrinsic::tpu_dma_spmem_to_hbm_sc_simple: |
| case Intrinsic::tpu_dma_spmem_to_spmem_sc_simple: |
| case Intrinsic::tpu_dma_timem_to_hbm: |
| case Intrinsic::tpu_dma_timem_to_hbm_sc_simple: |
| case Intrinsic::tpu_dma_hbm_to_simem_sc_simple: |
| case Intrinsic::tpu_dma_hbm_to_timem: |
| case Intrinsic::tpu_dma_hbm_to_timem_sc_simple: |
| case Intrinsic::tpu_dma_hbm_to_smem_single_strided: |
| case Intrinsic::tpu_dma_hbm_to_vmem_single_strided: |
| case Intrinsic::tpu_dma_smem_to_hbm_single_strided: |
| case Intrinsic::tpu_dma_vmem_to_hbm_single_strided: |
| case Intrinsic::tpu_dma_hbm_to_smem_general: |
| case Intrinsic::tpu_dma_hbm_to_vmem_general: |
| case Intrinsic::tpu_dma_smem_to_hbm_general: |
| case Intrinsic::tpu_dma_vmem_to_hbm_general: |
| case Intrinsic::tpu_dma_hbm_to_hbm_sc_general: |
| case Intrinsic::tpu_dma_smem_to_smem_sc_general: |
| case Intrinsic::tpu_dma_hbm_to_smem_sc_general: |
| case Intrinsic::tpu_dma_hbm_to_timem_sc_general: |
| case Intrinsic::tpu_dma_hbm_to_spmem_sc_general: |
| case Intrinsic::tpu_dma_smem_to_hbm_sc_general: |
| case Intrinsic::tpu_dma_timem_to_hbm_sc_general: |
| case Intrinsic::tpu_dma_spmem_to_spmem_sc_general: |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = VNI32; |
| // These access multiple pointers, so set the pointer to null so that alias |
| // analysis doesn't make any assumptions. |
| // TODO(thomasraoux): We could provide finer-grained aliasing information by |
| // adding several memory operands and actually adding the pointers. |
| Info.ptrVal = nullptr; |
| Info.size = MemoryLocation::UnknownSize; |
| Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOLoad; |
| return true; |
| case Intrinsic::tpu_dma_hbm_to_iova_sc_simple: |
| case Intrinsic::tpu_dma_iova_to_hbm_sc_simple: |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = MVT::v1024i32; |
| // Same comments as above. |
| Info.ptrVal = nullptr; |
| Info.size = MemoryLocation::UnknownSize; |
| Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOLoad; |
| return true; |
| case Intrinsic::tpu_stream_indirect_gather_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_add_f32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_gather_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_add_s32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_gather_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_add_f32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_gather_cb_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_add_s32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_gather_cb_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_spmem_to_smem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_spmem_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_tilespmem_tileN_to_smem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_tilespmem_tileN_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_gather_cb_upd_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_upd_add_f32_hbm_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_gather_cb_upd_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_gather_cb_upd_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_gather_cb_upd_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_upd_add_s32_hbm_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_gather_cb_upd_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_gather_cb_upd_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_upd_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_upd_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_upd_spmem_to_smem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_upd_spmem_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_cb_upd_tilespmem_tileN_to_smem: |
| case Intrinsic:: |
| tpu_stream_indirect_gather_cb_upd_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_spmem_to_smem: |
| case Intrinsic::tpu_stream_indirect_gather_spmem_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_gather_tilespmem_tileN_to_smem: |
| case Intrinsic::tpu_stream_indirect_gather_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_scatter_add_f32_smem_to_spmem: |
| case Intrinsic::tpu_stream_indirect_scatter_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_add_s32_smem_to_spmem: |
| case Intrinsic::tpu_stream_indirect_scatter_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_add_f32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_cb_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_cb_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_add_s32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_cb_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_cb_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_smem_to_spmem: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_tilespmem_to_spmem: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_upd_add_f32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_cb_upd_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_cb_upd_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_cb_upd_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_upd_add_s32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_cb_upd_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_cb_upd_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_cb_upd_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_upd_smem_to_spmem: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_upd_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_upd_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_upd_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_indirect_scatter_cb_upd_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_scatter_cb_upd_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_smem_to_spmem: |
| case Intrinsic::tpu_stream_indirect_scatter_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_scatter_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_indirect_scatter_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_indirect_scatter_tilespmem_to_spmem: |
| case Intrinsic::tpu_stream_indirect_scatter_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_add_f32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_add_s32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_cb_add_f32_hbm_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_cb_add_s32_hbm_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_cb_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_cb_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_cb_spmem_to_smem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_cb_spmem_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_cb_tilespmem_tileN_to_smem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_tilespmem_tileN_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_upd_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_upd_add_f32_hbm_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_upd_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_upd_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_upd_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_upd_add_s32_hbm_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_upd_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_upd_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_cb_upd_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_cb_upd_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_cb_upd_spmem_to_smem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_cb_upd_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_upd_tilespmem_tileN_to_smem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_cb_upd_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_spmem_to_smem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_spmem_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_gather_tilespmem_tileN_to_smem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_gather_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_add_f32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_add_s32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_add_f32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_add_s32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_smem_to_spmem: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_tilespmem_to_tilespmem_tileN: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_upd_add_f32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_upd_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_upd_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_upd_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_upd_add_s32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_upd_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_upd_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_upd_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_upd_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_upd_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_upd_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_upd_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_upd_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_cb_upd_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_smem_to_spmem: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_indirect_vreg_scatter_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_indirect_vreg_scatter_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_gather_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_add_f32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_linear_gather_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_add_s32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_linear_gather_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_add_f32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_linear_gather_cb_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_add_s32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_linear_gather_cb_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_spmem_to_smem: |
| case Intrinsic::tpu_stream_linear_gather_cb_spmem_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_tilespmem_tileN_to_smem: |
| case Intrinsic::tpu_stream_linear_gather_cb_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_upd_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_upd_add_f32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_upd_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_linear_gather_cb_upd_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_upd_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_upd_add_s32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_upd_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_linear_gather_cb_upd_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_upd_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_upd_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_upd_spmem_to_smem: |
| case Intrinsic::tpu_stream_linear_gather_cb_upd_spmem_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_cb_upd_tilespmem_tileN_to_smem: |
| case Intrinsic:: |
| tpu_stream_linear_gather_cb_upd_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_spmem_to_smem: |
| case Intrinsic::tpu_stream_linear_gather_spmem_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_gather_tilespmem_tileN_to_smem: |
| case Intrinsic::tpu_stream_linear_gather_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_linear_scatter_add_f32_smem_to_spmem: |
| case Intrinsic::tpu_stream_linear_scatter_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_linear_scatter_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_add_s32_smem_to_spmem: |
| case Intrinsic::tpu_stream_linear_scatter_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_linear_scatter_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_add_f32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_linear_scatter_cb_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_linear_scatter_cb_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_add_s32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_linear_scatter_cb_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_linear_scatter_cb_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_smem_to_spmem: |
| case Intrinsic::tpu_stream_linear_scatter_cb_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_linear_scatter_cb_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_linear_scatter_cb_tilespmem_to_spmem: |
| case Intrinsic::tpu_stream_linear_scatter_cb_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_upd_add_f32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_linear_scatter_cb_upd_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_upd_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_linear_scatter_cb_upd_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_upd_add_s32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_linear_scatter_cb_upd_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_upd_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_linear_scatter_cb_upd_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_upd_smem_to_spmem: |
| case Intrinsic::tpu_stream_linear_scatter_cb_upd_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_cb_upd_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_linear_scatter_cb_upd_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_linear_scatter_cb_upd_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_linear_scatter_cb_upd_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_smem_to_spmem: |
| case Intrinsic::tpu_stream_linear_scatter_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_linear_scatter_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_linear_scatter_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_linear_scatter_tilespmem_to_spmem: |
| case Intrinsic::tpu_stream_linear_scatter_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_gather_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_add_f32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_strided_gather_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_add_s32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_strided_gather_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_add_f32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_strided_gather_cb_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_add_s32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_strided_gather_cb_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_spmem_to_smem: |
| case Intrinsic::tpu_stream_strided_gather_cb_spmem_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_tilespmem_tileN_to_smem: |
| case Intrinsic::tpu_stream_strided_gather_cb_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_upd_add_f32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_upd_add_f32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_upd_add_f32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_strided_gather_cb_upd_add_f32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_upd_add_s32_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_upd_add_s32_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_upd_add_s32_spmem_to_tilespmem: |
| case Intrinsic:: |
| tpu_stream_strided_gather_cb_upd_add_s32_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_upd_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_upd_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_upd_spmem_to_smem: |
| case Intrinsic::tpu_stream_strided_gather_cb_upd_spmem_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_cb_upd_tilespmem_tileN_to_smem: |
| case Intrinsic:: |
| tpu_stream_strided_gather_cb_upd_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_hbm4b_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_hbm_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_spmem_to_smem: |
| case Intrinsic::tpu_stream_strided_gather_spmem_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_gather_tilespmem_tileN_to_smem: |
| case Intrinsic::tpu_stream_strided_gather_tilespmem_tileN_to_tilespmem: |
| case Intrinsic::tpu_stream_strided_scatter_add_f32_smem_to_spmem: |
| case Intrinsic::tpu_stream_strided_scatter_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_add_s32_smem_to_spmem: |
| case Intrinsic::tpu_stream_strided_scatter_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_cb_add_f32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_cb_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_cb_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_cb_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_cb_add_s32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_cb_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_cb_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_cb_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_cb_smem_to_spmem: |
| case Intrinsic::tpu_stream_strided_scatter_cb_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_cb_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_strided_scatter_cb_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_strided_scatter_cb_tilespmem_to_spmem: |
| case Intrinsic::tpu_stream_strided_scatter_cb_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_cb_upd_add_f32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_cb_upd_add_f32_smem_to_tilespmem_tileN: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_cb_upd_add_f32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_cb_upd_add_f32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_cb_upd_add_s32_smem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_cb_upd_add_s32_smem_to_tilespmem_tileN: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_cb_upd_add_s32_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_cb_upd_add_s32_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_cb_upd_smem_to_spmem: |
| case Intrinsic::tpu_stream_strided_scatter_cb_upd_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_cb_upd_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_strided_scatter_cb_upd_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_strided_scatter_cb_upd_tilespmem_to_spmem: |
| case Intrinsic:: |
| tpu_stream_strided_scatter_cb_upd_tilespmem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_smem_to_spmem: |
| case Intrinsic::tpu_stream_strided_scatter_smem_to_tilespmem_tileN: |
| case Intrinsic::tpu_stream_strided_scatter_tilespmem_to_hbm: |
| case Intrinsic::tpu_stream_strided_scatter_tilespmem_to_hbm4b: |
| case Intrinsic::tpu_stream_strided_scatter_tilespmem_to_spmem: |
| case Intrinsic::tpu_stream_strided_scatter_tilespmem_to_tilespmem_tileN: |
| // We don't strictly need to add memory operands for the stream intrinsics;
| // without them the DAG builder would fall back to regular chain barriers.
| // We do it properly here and rely on memory edges instead.
| assert(IsSC); |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = VNI32; |
| // TODO(hgreving): We could provide finer-grained aliasing information by
| // adding several memory operands. We currently attach only the TileSpmem
| // memory operand, because that is all we consider when analyzing the DAG's
| // edges later. We also don't want to hard-code the operand number, since
| // there are too many stream intrinsics; instead, we simply search the
| // operands for a TileSpmem pointer.
| Info.ptrVal = nullptr; |
| for (auto &Op : I.operands()) { |
| if (!Op->getType()->isPointerTy()) |
| continue; |
| if (Op->getType()->getPointerAddressSpace() != TPUAS_TileSpmem) |
| continue; |
| Info.ptrVal = Op; |
| break; |
| } |
| Info.size = MemoryLocation::UnknownSize; |
| Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOLoad; |
| return true; |
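| // The BarnaCore aliaddr load/store intrinsics below access memory through an
| // explicit pointer operand; model them as plain loads/stores of unknown size.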
| case Intrinsic::tpu_bc_load_aliaddr: |
| case Intrinsic::tpu_bc_load_aliaddr_flm: |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::getVT(I.getType()); |
| Info.ptrVal = I.getOperand(0); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.flags = MachineMemOperand::MOLoad; |
| return true; |
| case Intrinsic::tpu_bc_store_aliaddr: |
| case Intrinsic::tpu_bc_store_aliaddr_flm: |
| Info.opc = ISD::INTRINSIC_VOID; |
| Info.memVT = MVT::getVT(I.getOperand(0)->getType()); |
| Info.ptrVal = I.getOperand(1); |
| Info.size = MemoryLocation::UnknownSize; |
| Info.flags = MachineMemOperand::MOStore; |
| return true; |
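| // bc_loop_end has no pointer operand; use the BarnaCore loop-end pseudo
| // source value and mark the operand as both a load and a store so the
| // access stays ordered relative to other accesses to that pseudo value.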
| case Intrinsic::tpu_bc_loop_end: { |
| Info.opc = ISD::INTRINSIC_W_CHAIN; |
| Info.memVT = MVT::i1; |
| Info.ptrVal = TM.getPSV(TPUTargetMachine::PSV_BarnaCoreChannel_LoopEnd); |
| Info.size = 1; |
| Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; |
| return true; |
| } |
| default: |
| return false; |
| } |
| } |
| |
| void TPUTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, |
| KnownBits &Known, |
| const APInt &DemandedElts, |
| const SelectionDAG &DAG, |
| unsigned Depth) const { |
| KnownBits Known2; |
| Known.resetAll(); |
| |
| switch (Op.getOpcode()) { |
| default: |
| break; |
| case TPUISD::UMUL24: {
| unsigned BitWidth = 32; |
| Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); |
| Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); |
| // The instruction zeroes out the top 8 bits of each operand.
| Known.Zero.setHighBits(8); |
| Known2.Zero.setHighBits(8); |
| // If low bits are zero in either operand, output low known-0 bits. |
| // Also compute a conservative estimate for high known-0 bits. |
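| // (An m-bit value times an n-bit value fits in m + n bits; e.g. a 10-bit
| // value times a 12-bit value needs at most 22 bits, leaving at least
| // 32 - 22 = 10 leading zeros, while trailing zeros simply add: a multiple
| // of 16 times a multiple of 4 is a multiple of 64.)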
| unsigned TrailZ = |
| Known.countMinTrailingZeros() + Known2.countMinTrailingZeros(); |
| unsigned LeadZ = |
| std::max(Known.countMinLeadingZeros() + Known2.countMinLeadingZeros(), |
| BitWidth) - |
| BitWidth; |
| |
| Known.resetAll(); |
| Known.Zero.setLowBits(std::min(TrailZ, BitWidth)); |
| Known.Zero.setHighBits(std::min(LeadZ, BitWidth)); |
| break;
| }
| }
| } |
| |
| void TPUTargetLowering::addTPUMemOperand(SelectionDAG &DAG, SDNode *N, |
| bool IsPush, |
| const TargetRegisterClass *RC) const { |
| // Add a MachineMemOperand to N, marking it as a push or pop of the given |
| // register class. |
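| // The pseudo source value from getFifoPSV is specific to the push/pop
| // direction and register class, presumably so that accesses to different
| // FIFOs can be told apart. The size and alignment of 4 appear to be
| // nominal; the operand mainly conveys ordering information.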
| MachineSDNode *MN = cast<MachineSDNode>(N); |
| MachinePointerInfo MPI( |
| static_cast<const TPUTargetMachine &>(getTargetMachine()) |
| .getFifoPSV(IsPush, RC)); |
| auto *MemRef = DAG.getMachineFunction().getMachineMemOperand( |
| MPI, MachineMemOperand::MOLoad | MachineMemOperand::MOStore, /*s=*/4, |
| /*base_alignment=*/llvm::Align(4)); |
| DAG.setNodeMemRefs(MN, {MemRef}); |
| } |